Install anaconda
python -V # get python version
type(person) # Determine the type of an object
# not None test
if not (val is None):
# ...
str() # convert variable to string
int()
float()
bool()
#convert a bool (list) to 0 or 1
=B*1
B
# check if a object is None
if foo is None:
print('is None')
range(10) # range 0 to 9
list(range(11, 17)) # range to a list: [11, 12, 13, 14, 15, 16]
reversed(range(10)) # reversed range 9 to 0
list(reversed(range(10))) # to print out
Lists are mutable and can contain any data type (including other lists).
= [3, 4, 5]
li
= [None]*5 # [None, None, None, None, None]
li
# check if it is empty
if not li:
print("List is empty")
# check if it is in the list
if 'b' in ['a', 'b', 'c'] :
print('Yes, b found in List')
# sub list
0] # 3
li[0:2] # [incluse, excluse] 3,4
li[-1] # get the last element
li[-2] = 3 # Set the second last element
li[list(li[i] for i in [0, 2]) # first and third element
2] # every second element from default start and stop index. s[start:stop:step]
li[::
# last n elements
a = [1, 2, 3, 4, 5, 6, 7, 8]
a[-3:]  # [6, 7, 8]
# add or remove element
= li + [6]
li = del li[5]
li 4) # removes the first matching value, not a specific index:
li.remove(1) # removes the item at a specific index and returns it.
li.pop(= 5
n = li[n:] # remove first n element in the list
li
# append an element - append is a destructive operation (it modifies the list in place instead of of returning a new list)
'bar')
li.append(
# combine two lists
= [1,2,3]
l1 = [4,5,6]
l2 = l1 + l2
mergedlist
# concatenate element-wise two lists in Python
= [0, 1, 5, 6, 10, 11]
a = ['asp1', 'asp1', 'asp1', 'asp1', 'asp2', 'asp2']
b +str(n) for m,n in zip(b,a)]
[m
# copy: reference only
= li
li2 # copy: explicit copy
= li[:] # or li2 = list(li)
li2
float(n)] # convert a float to list
[
len(li) # length of list
3 in li # check if the element is in the list
= [ [1, 2, 3, 4], [5, 6, 7, 8], ] # nested list
matrix
# if all elements of a list are False
li = [False, False, False]
not any(li)  # any() returns True if at least one element of the iterable is truthy.
# >> True
# loop in the list, replacing each element in place with its stripped form
li = ['a', ' b', 'c ', ' d ']
for i, s in enumerate(li):
    li[i] = s.strip()
print(li)  # -> ['a', 'b', 'c', 'd']
# loop for the list
= [(1, 2), (1, 3), (2, 3)]
li sum(tup) for tup in li]
[# [3, 4, 5]
# list to string
= ["This" , "is", "a", "sample", "program"]
li ' '.join(li) # 'This is a sample program'
','.join(li) # 'This,is,a,sample,program'
# Check if all values in list are greater than or equal to a certain number
l = [29, 500, 43]
all(i >= 30 for i in l)
# False  (29 is below the threshold)
# Sort list by given order of indices
=["A","B","C","D","E"]
li = [3,2,1,0,4]
sortedIndex for i in sortedIndex]
[li[i] # ['D', 'C', 'B', 'A', 'E']
# concatenate element-wise two lists
= [0, 1, 5]
a = ['as', 'ap', 'asp']
b +str(n) for m,n in zip(b,a)]
[m
# Convert a list to a string and back
import json
# json.dumps returns str, so the file must be opened in text mode ('wb' raises TypeError).
with open(data_file, 'w', encoding='utf-8') as dump:
    dump.write(json.dumps(arbitrary_data))
# Read inside a context manager so the handle is closed deterministically.
with open(data_file, 'r', encoding='utf-8') as source_file:
    source = source_file.read()
data = json.loads(source)
Tuples are immutable sequences
= 12345, 54321, 'hello!'
t 0] # 12345 t[
A set is an unordered collection with no duplicate elements
= {'apple', 'orange', 'apple', 'pear', 'orange', 'banana'}
basket
# check if a element is in the set
'apple' in basket
Dictionaries store key-value pairs, kind of like JSON objects.
# Creating an empty dictionary
= {} # OR data = dict()
d
# Creating a dictionary with initial values
= {'a':2,'b':21,'c':3} # OR data = dict(a=2, b=1, c=3)
d
# Inserting/Updating a single value
'a']=1 # Updates if 'a' exists, else adds 'a'
d[# OR data.update(a=1)
# Inserting/Updating multiple values
'c':3,'d':4}) # Updates 'c' and adds 'd'
d.update({
# delete a element based on key
del d['a']
# find index (insertion position) of a key
index = list(d).index('c')  # dict has no .index(); search the list of keys instead
# find max based on the key
max(d.items(), key=lambda x: x[0])
# ('c', 3)
# find max based on the value
max(d.items(), key=lambda x: x[1])  # x[1] is the value; x[0] would compare keys again
# ('b', 21)
# Get list of values for list of keys
= ["a", "c"]
mykeys for x in mykeys]
[d[x] # [2, 3]
# return a key list
= list(d.keys())
key_list # ['a', 'b', 'c']
# return sorted key list
sorted(d, key=(lambda key:d[key]), reverse=True)
# ['b', 'c', 'a']
# return sorted tuple(key, value) list
sorted(d.items(), key=lambda kv: kv[1])
# [('a', 2), ('c', 3), ('b', 21)]
# create a dictionary
= {
person "name": "Amos",
"age": 23,
"hobbies": ["Travelling", "Swimming", "Coding", "Music"]
}
# iterate through the dict and print the keys
for key in person:
print(key)
# iterate through the dict's keys and print their values
for key in person:
print(person[key])
# Convert two lists into a dictionary
keys = ['a', 'b', 'c']
values = [1, 2, 3]
dictionary = dict(zip(keys, values))  # pairs are matched by position
print(dictionary)
# {'a': 1, 'b': 2, 'c': 3}
A NumPy array holds elements of a single data type.
# convert list to array
= [3, 4, 5]
li = np.array(li)
ay
# initial an array
3, 5), 7)
np.full((# array([[ 7, 7, 7, 7., 7],
# [ 7, 7, 7, 7., 7],
# [ 7, 7, ., 7., 7]])
# to contain multiple data type. assign datatype as object
= np.array([["a", 0]] * 5, dtype='O')
ay # array([['a', 0],
# ['a', 0],
# ['a', 0],
# ['a', 0],
# ['a', 0]], dtype=object)
# get second value of each element in the array
1]
ay[:, # array([0, 0, 0, 0, 0], dtype=object)
# get second value of first element in the array
0, 1]
ay[# 0
# set the value
0, 1] = 15
ay[
# find the index of element which second value > 3
1]>3)
np.where(ay[:,# (array([0], dtype=int64),)
# check if has an element which second value = 5
if len(np.where(ay[:,1]==5)[0]) >0
# a range value
0, 5, 0.5)
np.arange(# Out[32]: array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5])
# a random value
10, size=2) # integer value, add replace=False for non duplicated values
np.random.randint(5) # random float between 0 and 1
np.random.rand(
# convert np array to list
ay.tolist()
= np.array([[1,2], [3,4]])
ay
# shape of array
ay.shape
# Return a copy of the array collapsed into one dimension
ay.flatten() # array([1, 2, 3, 4])
4)
np.reshape(ay, # array([1, 2, 3, 4])
# append an array
= np.append(ay, ay1)
ay
# delete an element
np.delete(a, index)
# count the occurrence of each item in an ndarray
a = np.array([0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4])
unique, counts = np.unique(a, return_counts=True)  # counts[i] is how often unique[i] occurs
# unique -> array([0, 1, 2, 3, 4]); counts -> array([7, 4, 1, 2, 1])
= np.array([[1, 2], [3, 4]])
a = np.array([[5, 6]])
b =0)
np.concatenate((a, b), axis# array([[1, 2],
# [3, 4],
# [5, 6]])
=1)
np.concatenate((a, b.T), axis# array([[1, 2, 5],
# [3, 4, 6]])
=None)
np.concatenate((a, b), axis# array([1, 2, 3, 4, 5, 6])
# subset by index
= np.array([10, 20, 30, 40, 50, 60])
arr 1,4,5]]
arr[[# array([20, 50, 60])
# subset by index for multidimensional arrays:
= np.arange(9).reshape(3,3)
arr
arr# array([[0, 1, 2],
# [3, 4, 5],
# [6, 7, 8]])
0, 1, 1], [1, 0, 2]]
arr[[# array([1, 3, 5])
# the ith column of a np multidimensional array
= np.array([[1, 2], [3, 4], [5, 6]])
test 0]
test[:,# array([1, 3, 5])
= np.array([2,3,4,6,7])
ay1 = np.array([1,4,5,5,6,8,8,9])
ay2
np.in1d(ay1, ay2)# array([False, False, True, True, False])
np.setdiff1d(ay1, ay2)# array([2, 3, 7])
np.isin(ay1, ay2)# array([False, False, True, True, False])
np.where(np.isin(ay1, ay2))# (array([2, 3], dtype=int64),)
Initialize data frame
mat = np.array([[1, 2, 3], [1, 2, 3], [1, 2, 3]])
print(mat)
# [[1 2 3]
#  [1 2 3]
#  [1 2 3]]
np.sum(mat, axis=1)  # axis=1 adds across the columns, giving one total per row: array([6, 6, 6])
http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/
A Series is a one-dimensional labeled array capable of holding any data type.
# initialized a series from a list
= pd.Series(['string1', 'string2', 'string3'])
s
s# 0 string
# 1 string2
# 2 string3
# dtype: object
= pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
s
# get the last element
-1]
s.iloc[-1]
s[
# convert to an numpy.ndarray
1, 2, 3]).values
pd.Series([# array([1, 2, 3])
# convert to a list
1, 2, 3]).tolist()
pd.Series([
# reset the index
=True, inplace=True) # drop: delete the index entirely; inplace: directly modify and overwrite your original DataFrame
s.reset_index(drop
# regex
'f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
pd.Series([
# -- Factorize ---
= pd.Series(['a','b','c','d'])
s_class
= s_class.factorize()[0] # convert string to numbers
s_class_id
s_class_id# array([0, 1, 2, 3], dtype=int64)
= s_class.factorize() # keep mapping class
s_class_id, mapping = mapping.take(pd.Series([3,2,3,1]))
mapped_back_to_class
mapped_back_to_class# Index(['d', 'c', 'd', 'b'], dtype='object')
# -- check if ALL values are NaN in Series
all()
se.isnull().
# -- apply
1, 2, 3]).apply(lambda x :x*2)
pd.Series([# 0 2
# 1 4
# 2 6
initialize data frame
= pd.DataFrame(
df "a":[11 ,12, 13], "b":[21, 22, 23], "c":[31, 32, 33]},
{= [1, 2, 3])
index # a b c
# 1 11 21 31
# 2 12 22 32
# 3 13 23 33
# d is your list of dicts to dataframe
= [{"a":1, "b":1}, {"a":2, "b":2}]
l = pd.DataFrame(l)
df # a b
# 0 1 1
# 1 2 2
# numpy array to data frame
= np.array([0, 3, 0, 1])
X = pd.DataFrame(X, index=np.arange(0,len(X)))
df
# Out[8]:
# 0
# 0 0
# 1 3
# 2 0
# 3 1
= np.array([[0, 3, 0], [2, 2, 2], [7, 7, 7]])
X = pd.DataFrame(X, index=np.arange(0,len(X)))
df
# Out[10]:
# 0 1 2
# 0 0 3 0
# 1 2 2 2
# 2 7 7 7
# from dict to dataframe
dict)
pd.DataFrame.from_dict(
# mean of each column
df.mean() # (rows,columns)
df.shape # column names
df.columns # Describe index
df.index # Info on DataFrame
df.info() # Number of non-NA values
df.count()
# column rename
= pd.DataFrame({'aa':[1,2], 'bb': [10,20]})
df = ['a', 'b']
df.columns
df# a b
# 0 1 10
# 1 2 20
= df.rename({'a': 'axx'}, axis='columns') # or axis=1
df
3].value_counts() # frequency table of a column, e.g. column 3
df[
= df.values #convert a pandas dataframe (df) to a numpy ndarray,
df
= df.rename(columns={'old_name':'new_name'}) # rename a column name
df
all() # check if ALL values are NaN in DataFrame df.isnull().
# Remove duplicate rows (only considers columns).
df.drop_duplicates()
= df3[~df3.index.duplicated(keep='first')] #Remove rows with duplicate indices, using the duplicated method on the Pandas Index itself
df3
# keep only the last entry from duplicate values
df = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x1'], 'B': [85, 70, 80, 85]})
df
#     A   B
# 0  x1  85
# 1  x2  70
# 2  x3  80
# 3  x1  85
# Rows 0 and 3 are identical on (A, B); keep='last' drops row 0, then the index is renumbered.
df2 = df.drop_duplicates(subset=['A', 'B'], keep='last').reset_index(drop=True)
df2
#     A   B
# 0  x2  70
# 1  x3  80
# 2  x1  85
0) # Replace all the NaN values with Zero
df.fillna(
= df.replace(np.nan, '', regex=True) # Replace all the NaN with blank/empty string
df1
=1, how='all') # Drop the columns where all elements are NaN df.dropna(axis
= pd.DataFrame({'P': [1, 2], 'Q': [0.7, 0.85]},
df =['p', 'q'])
index# P Q
# p 1 0.70
# q 2 0.85
# to structure numpy array. it will keep column name and data type information
df.to_records()# rec.array([('p', 1, 0.7 ), ('q', 2, 0.85)],
# dtype=[('index', 'O'), ('P', '<i8'), ('Q', '<f8')])
# convert back to dataframe
pd.DataFrame(df.to_records())# index P Q
# 0 p 1 0.70
# 1 q 2 0.85
'index')
pd.DataFrame(df.to_records()).set_index(# P Q
# index
# p 1 0.70
# q 2 0.85
# Order rows by values of a column
'mpg') # (low to high).
df.sort_values('mpg', ascending=False) # (high to low).
df.sort_values(
#sort by two or more columns?
= df1.sort_values(['a', 'b'], ascending=[True, False])
df1
# Shuffle DataFrame rows
=1) # The frac specifies the fraction of rows to return in the random sample, frac=1 means return all rows.
df.sample(frac
# random n row selection
df.sample(n)
# reset index
= df.reset_index(drop=True) df
# change value of a cell
'C', 'x'] = 10
df.at[
#--------- row selection ---------------
# Select first n rows.
df.head(n) # Select last n rows.
df.tail(n)
# based on position
0:3] # select the first 3 rows (rows 0,1,2)
df[5] # select the first 5 rows (rows 0,1,2,3,4)
df[:-1:] # select the last row
df[
#iloc: Select Column & Row by Positions (it only takes integers).
# rows:
0] # select the first row
df.iloc[2:] # select every row beginning with the third row
df.iloc[-1] # last row of data frame
df.iloc[# Columns:
0] # first column of data frame
data.iloc[:,1] # second column of data frame
data.iloc[:,-1] # last column of data frame
data.iloc[:,# rows and columns
0:3, 1:4] # select the first 3 rows (rows 0,1,2) and (columns 1, 2 and 3)
df.iloc[
# return type
3] # return a pandas series when one row is selected
df.iloc[3]] # return a pandas dataframe. pass a single valued list to return a dataframe
df.iloc[[2:3] # return a pandas dataframe when multiple rows are selected
df.iloc[
=pd.DataFrame(
df'age':[30,24,32], 'name':['john','jack','luck'], 'gender':['M','M','F']},
{=['a', 'b', 'c'])
index# age name gender
# a 30 john M
# b 24 jack M
# c 32 luck F
#loc: Select Column & Rows by name/index_label
# rows:
3] # slice up to and including label 3 (in the case of label is integer)
df.loc[:'b'] # select rows by b
df.loc[:'b':'c',:] # select rows from b to c
df.loc['b','c'],:] # select rows by multiple names/labels
df.loc[['column_name'] == some_value] # select rows whose column value equals a scalar
df.loc[df[~df['column_name'].isin(some_values)] # select rows whose column value is in an iterable
df.loc['column_name'] == some_value) & df['other_column'].isin(some_values)]
df.loc[(df[
>7] # select rows that meet logical criteria.
df[df.Length
# loop for each row; `row` is a Series indexed by column name
for index, row in df.iterrows():
    print(row['column_name_1'], row['column_name_2'])  # print() call: the statement form is Python 2 only
# 10 100
# 11 110
# 12 120
# copy as a new df, not only reference
= df[df['B'] == 'b.2'].copy()
df2
#--------- column selection ---------------
# Select columns by names
'name']
df['age', 'name']]
df[["column_name")
df.columns.get_loc(
'age', 'name']] # = df[['age', 'name']]
df.loc[:, [# select col_name1, col_name2 from table where column_name = some_value.
'column_name'] == some_value][[col_name1, col_name2]]
df.loc[df[
# Select columns by regular expression
filter(regex='name|address')
df.
# select columns by a list
= ['name','age','weight']
lst
df[np.intersect1d(df.columns, lst)]# age name
#0 30 john
#1 24 jack
#2 32 luc
# select Rows by integers and Columns by names
1], 'name'] # second index by df.index[1]:
df.loc[df.index[# or
1, df.columns.get_loc('name')] # get position of column name by get_loc
df.iloc[
# --------- Cell value selection ---------------
# if subset is one row dataframe, use .values[0] to get the value
'name']=='jack'] # subset result has only one row
df.loc[df[# age name gender
# b 24 jack M
'name']=='jack']['age'] # it return a series
df.loc[df[# b 24
# Name: age, dtype: int64
'name']=='jack']['age'].values[0]
df.loc[df[# 24
# append a column
'new_col'] = pd.Series(np.arange(0, 100), index=df.index)
df[
# append a list as a row
df = pd.DataFrame([[1, 2], [3, 4]], columns=["a", "b"])
#    a  b
# 0  1  2
# 1  3  4
# first approach: assign to the next integer label (relies on the default 0..n-1 index)
li = [5, 6]
df.loc[len(df)] = li
#    a  b
# 0  1  2
# 1  3  4
# 2  5  6
# second approach
a_series = pd.Series(li, index=df.columns)
# DataFrame.append was removed in pandas 2.0; turn the Series into a one-row frame and concat.
df = pd.concat([df, a_series.to_frame().T], ignore_index=True)  # do not use the index labels of a_series
# drop last n rows
df.drop(df.tail(n).index) # drop first n rows
df.drop(df.head(n).index)
# delete rows based on index
= df.drop(df.index[1: 4])
df
# delete column on i index
= df.drop(df.columns[i], axis=1) # note: it will remove both columns if there is duplicate names in columns,
df
# delete column with column name
= df.drop(['column_name_A', 'column_name_B'], axis=1)
df
# delete columns based on regular expression
= df[df.columns.drop(list(df.filter(regex='Test')))]
df
# drop columns which have same values in all rows
= df.apply(pd.Series.nunique)
nunique = nunique[nunique == 1].index
cols_to_drop =1) df.drop(cols_to_drop, axis
# sum each row of column
sum(axis=1) df.
# groupby dataframe by key
= df.groupby('gender')
grouped
# list keys
grouped.groups.keys()
# Iterating a GroupBy yields (key, sub-DataFrame) pairs.
for name, group in grouped:
    print('key:', name)
    print('value:', group)
= pd.DataFrame({'user_id': ['a1', 'a1', 'a1', 'a2', 'a2', 'a2', 'a3', 'a3', 'a3'],
data 'product_id': ['p1', 'p1', 'p2', 'p1', 'p1', 'p1', 'p2', 'p2', 'p3'],
'product_count': [1, 2, 2, 2, 2, 1, 4, 10, 30]})
= data.groupby(['user_id', 'product_id']).size()
count_series # has a hierarchical index
count_series # user_id product_id
# a1 p1 2
# p2 1
# a2 p1 3
# a3 p2 2
# p3 1
# Converting a GroupBy output from Series to DataFrame
= count_series.to_frame(name = '_size').reset_index()
count_df
count_df# user_id product_id _size
# 0 a1 p1 2
# 1 a1 p2 1
# 2 a2 p1 3
# 3 a3 p2 2
# 4 a3 p3 1
= data.groupby(['user_id', 'product_id'])['product_count'].sum() # Compute sum of group values.
sum_series
sum_series# user_id product_id
# a1 p1 3
# p2 2
# a2 p1 5
# a3 p2 14
# p3 30
# select top n row from each group after group by in pandas
'product_count', ascending=False).groupby(['user_id'], sort=False).head(3) data.sort_values(
# Append a dataframe
df1 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df1
#    A  B
# 0  1  2
# 1  3  4
df2 = pd.DataFrame([[1, 6], [7, 8]], columns=list('AB'))
# ----------- pd.concat -----------------
df = pd.concat([df1, df2])  # concatenate a list of dataframes along row
#    A  B
# 0  1  2
# 1  3  4
# 0  1  6
# 1  7  8
# reset index if necessary
df = df.reset_index(drop=True)
pd.concat([df1, df2], axis=1)  # concatenate a list of dataframes along column
#    A  B  A  B
# 0  1  2  1  6
# 1  3  4  7  8
# ----------- pd.merge, Database-style -----------------
df = pd.merge(df1, df2, on="A")  # how='inner' by default.
#    A  B_x  B_y
# 0  1    2    6
df = pd.merge(df1, df2, on="A", how="left")
#    A  B_x  B_y
# 0  1    2  6.0
# 1  3    4  NaN
import pandas as pd
table_name = "test"
df = pd.DataFrame(
    {"a": [11, 12, 13], "b": [21, 22, 23], "c": ["c1", "c2", "c3"]},
    index=[1, 2, 3])
sql = '''select a, c from test where b >=22 '''
# ----------- sqlite3 -----------------------
import sqlite3
cnx = sqlite3.connect(':memory:')  # create a db connection
df.to_sql(name=table_name, con=cnx)  # load data into db
output = pd.read_sql(sql, cnx)
cnx.close()  # release the in-memory database once the result is materialised
print(output)
#     a   c
# 0  12  c2
# 1  13  c3
# --------- sqlglot -------------------------
from sqlglot.executor import execute
= {table_name: df.to_dict("records")} # load data into a dict
tables # {'test': [{'a': 11, 'b': 21, 'c': 'c1'}, {'a': 12, 'b': 22, 'c': 'c2'}, {'a': 13, 'b': 23, 'c': 'c3'}]}
= execute(sql, tables=tables)
output = pd.DataFrame(output.rows, columns=output.columns)
sql_df print(sql_df)
# ------- duckdb -----------------
import duckdb
= globals()
g = df # load data in global variable
g[table_name] = duckdb.query(sql).to_df()
output print(output)
# --------- pandasql ------------
from pandasql import sqldf
= globals()
g = df # load data in global variable
g[table_name] = sqldf(sql)
output print(output)
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html
'new_col'] = data['column'].rolling(5).mean() # Rolling Mean on pandas on a specific column
df[
# Pandas Rolling Apply custom function
def my_test(vals):
    """Return the first element of a rolling window.

    ``vals`` is the window handed over by ``rolling(...).apply``: a Series
    (unwrapped through ``.values``) or, when it has no ``.values``
    attribute (e.g. an ndarray), the object itself.
    """
    values = getattr(vals, "values", vals)
    return values[0]
'new_col'] = data['column'].rolling(5).apply(my_test) df[
# type of each column
df.dtypes
# convert ALL columns to strings
= df.astype(str)
df # convert selected columns
"D", "E"]] = df[["D", "E"]].astype(int)
df[[
# To Factors
= pd.Series(['single', 'touching', 'nuclei', 'dusts', 'touching', 'single', 'nuclei'])
s = pd.factorize(s) s_enc
import datetime as dt
# date
dt.date.today()  # datetime.date(2021, 11, 19)
# datetime
dt.datetime.now()  # now
dt.datetime.now().year  # now.month or now.hour, etc
dt.datetime.now().month  # now.month
'%02d' % dt.datetime.now().month  # zero-padded month, e.g. '04'
dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S')  # format time stamp
# The module is imported as `dt`, so timedelta must be reached through `dt` as well.
dt.datetime.now() - dt.timedelta(days=2)  # Day before Yesterday
dt.datetime.now() - dt.timedelta(minutes=15)
dt.datetime.now().date()  # only keep date part of datetime
dt.datetime.today() + dt.timedelta(days=2)  # after 2 days
# combine date and time strings to single datetime object
mytime = dt.datetime.strptime('0130', '%H%M').time()  # parse 'HHMM' and keep only the time part
mydatetime = dt.datetime.combine(dt.date.today(), mytime)
# ---- unix time ------
# convert a Unix timestamp (the number of seconds since January 1st, 1970) to formatted local time
dt.datetime.fromtimestamp(1485714600).strftime("%A, %B %d, %Y %I:%M:%S")
# 'Sunday, January 29, 2017 08:30:00'  (exact value depends on the local timezone)
# DateTime to Unix timestamp with 13 digits (milliseconds)
presentDate = dt.datetime.now()
unix_timestamp = dt.datetime.timestamp(presentDate) * 1000
print(unix_timestamp)
# subtract 5 minutes from a millisecond Unix timestamp
five_minutes_before = unix_timestamp - 5 * 60 * 1000
Convert string to datetime
from datetime import datetime
# %I with %p parses a 12-hour clock: 1:33PM -> 13:33
datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p')  # datetime.datetime(2005, 6, 1, 13, 33)
https://stackoverflow.com/questions/53892450/get-the-format-in-dateutil-parse https://stackabuse.com/converting-strings-to-datetime-in-python/
= ['%Y-%m-%dT%H:%M:%S*%f%z','%Y %b %d %H:%M:%S.%f %Z','%b %d %H:%M:%S %z %Y','%d/%b/%Y:%H:%M:%S %z','%b %d, %Y %I:%M:%S %p','%b %d %Y %H:%M:%S','%b %d %H:%M:%S %Y','%b %d %H:%M:%S %z','%b %d %H:%M:%S','%Y-%m-%dT%H:%M:%S%z','%Y-%m-%dT%H:%M:%S.%f%z','%Y-%m-%d %H:%M:%S %z','%Y-%m-%d %H:%M:%S%z','%Y-%m-%d %H:%M:%S,%f','%Y/%m/%d*%H:%M:%S','%Y %b %d %H:%M:%S.%f*%Z','%Y %b %d %H:%M:%S.%f','%Y-%m-%d %H:%M:%S,%f%z','%Y-%m-%d %H:%M:%S.%f','%Y-%m-%d %H:%M:%S.%f%z','%Y-%m-%dT%H:%M:%S.%f','%Y-%m-%dT%H:%M:%S','%Y-%m-%dT%H:%M:%S%Z','%Y-%m-%dT%H:%M:%S.%f','%Y-%m-%dT%H:%M:%S','%Y-%m-%d*%H:%M:%S:%f','%Y-%m-%d*%H:%M:%S','%y-%m-%d %H:%M:%S,%f %z','%y-%m-%d %H:%M:%S,%f','%y-%m-%d %H:%M:%S','%y/%m/%d %H:%M:%S','%y%m%d %H:%M:%S','%Y%m%d %H:%M:%S.%f','%m/%d/%y*%H:%M:%S','%m/%d/%Y*%H:%M:%S','%m/%d/%Y*%H:%M:%S*%f','%m/%d/%y %H:%M:%S %z','%m/%d/%Y %H:%M:%S %z','%H:%M:%S','%H:%M:%S.%f','%H:%M:%S,%f','%d/%b %H:%M:%S,%f','%d/%b/%Y:%H:%M:%S','%d/%b/%Y %H:%M:%S','%d-%b-%Y %H:%M:%S','%d-%b-%Y %H:%M:%S.%f','%d %b %Y %H:%M:%S','%d %b %Y %H:%M:%S*%f','%m%d_%H:%M:%S','%m%d_%H:%M:%S.%f','%m/%d/%Y %I:%M:%S %p:%f','%m/%d/%Y %H:%M:%S %p']
formats
= ['2018-08-20T13:20:10*633+0000','2017 Mar 03 05:12:41.211 PDT','Jan 21 18:20:11 +0000 2017','19/Apr/2017:06:36:15 -0700','Dec 2, 2017 2:39:58 AM','Jun 09 2018 15:28:14','Apr 20 00:00:35 2010','Sep 28 19:00:00 +0000','Mar 16 08:12:04','2017-10-14T22:11:20+0000','2017-07-01T14:59:55.711+0000','2017-08-19 12:17:55 -0400','2017-08-19 12:17:55-0400','2017-06-26 02:31:29,573','2017/04/12*19:37:50','2018 Apr 13 22:08:13.211*PDT','2017 Mar 10 01:44:20.392','2017-03-10 14:30:12,655+0000','2018-02-27 15:35:20.311','2017-03-12 13:11:34.222-0700','2017-07-22T16:28:55.444','2017-09-08T03:13:10','2017-03-12T17:56:22-0700','2017-11-22T10:10:15.455','2017-02-11T18:31:44','2017-10-30*02:47:33:899','2017-07-04*13:23:55','11-02-11 16:47:35,985 +0000','10-06-26 02:31:29,573','10-04-19 12:00:17','06/01/22 04:11:05','150423 11:42:35','20150423 11:42:35.173','08/10/11*13:33:56','11/22/2017*05:13:11','05/09/2017*08:22:14*612','04/23/17 04:34:22 +0000','10/03/2017 07:29:46 -0700','11:42:35','11:42:35.173','11:42:35,173','23/Apr 11:42:35,173','23/Apr/2017:11:42:35','23/Apr/2017 11:42:35','23-Apr-2017 11:42:35','23-Apr-2017 11:42:35.883','23 Apr 2017 11:42:35','23 Apr 2017 10:32:35*311','0423_11:42:35','0423_11:42:35.883','8/5/2011 3:31:18 AM:234','9/28/2011 2:23:15 PM']
datestrings
from datetime import datetime
def parse_timestamp(datestring, formats):
    """Parse ``datestring`` with the first matching strptime format.

    Formats are tried in the order given.

    Returns:
        ``(parsed_datetime, matching_format)`` on success, otherwise
        ``(datestring, 'Unable to parse format')``.
    """
    for f in formats:
        try:
            d = datetime.strptime(datestring, f)
        except (ValueError, TypeError):
            # ValueError: the format does not match; TypeError: non-string
            # input. Either way this candidate failed, so try the next one.
            continue
        return (d, f)
    return (datestring, 'Unable to parse format')
# --- convert string to panda datatime
= pd.DataFrame({'timestamp_col':['2005-05-23 00:00:00', '2005-05-24 00:00:00']})
df # dt_col
#0 2005-05-23 00:00:00
#1 2005-05-24 00:00:00
= pd.to_datetime(df.timestamp_col, format="%Y-%m-%d %H:%M:%S")
df.timestamp_col type(df.timestamp_col[0])
# <class 'pandas._libs.tslibs.timestamps.Timestamp'>
# --- Group datetime column (timestamp_col) into hour and minute aggregations for values (value_col)
sum()
df.groupby([df.timestamp_col.dt.hour, df.timestamp_col.dt.minute]).value_col.
# remove rows whose datetime value in column 3 is missing (NaT)
# notna() is the public API for this; pd._libs.tslib.NaTType is a private
# path that newer pandas versions no longer expose.
# NOTE(review): notna() also drops None/NaN entries, not only NaT — confirm that is acceptable.
df = df[df[3].notna()]
='2017-01-01', end='2017-10-01', freq='M')
pd.period_range(start# PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
# '2017-07', '2017-08', '2017-09', '2017-10'],
# dtype='period[M]', freq='M')
-time1).total_seconds() #Convert timedelta to total seconds
(time2
# --- Converting between python_datetime and Pandas_Timestamp objects
= pd.Timestamp('2014-01-23 00:00:00', tz=None)
ts # Timestamp('2014-01-23 00:00:00', tz=None)
ts.to_pydatetime()# datetime.datetime(2014, 1, 23, 0, 0)
# --- Convert unix time to panda.datetime
df
date price# 0 1349720105 12.08
# 1 1349806505 12.35
'date'] = pd.to_datetime(df['date'],unit='s')
df[
df
date price# 0 2012-10-08 18:15:05 12.08
# 1 2012-10-09 18:15:05 12.35
=pd.to_datetime('1493530261000', unit='ms')
result_msstr(result_ms) # -> '2017-04-30 05:31:01'
# --- round time with dt.round ---
= pd.DataFrame(
df 'Date' : pd.date_range('1/1/2011', periods=5, freq='3675S'),
{'Num' : np.random.rand(5)})
# Date Num
#0 2011-01-01 00:00:00 0.580997
#1 2011-01-01 01:01:15 0.407332
#2 2011-01-01 02:02:30 0.786035
#3 2011-01-01 03:03:45 0.821792
# hourly round
=df.Date.dt.round('H'))
df.assign(Date# Date Num
#0 2011-01-01 00:00:00 0.577957
#1 2011-01-01 01:00:00 0.995748
#2 2011-01-01 02:00:00 0.864013
#3 2011-01-01 03:00:00 0.468762
# --- Use Pandas rolling method, 3600s = 1 Hour -----
'occurrences_in_last_hour'] = df_sample['occurrences'].rolling('3600s').sum()
df_sample[15)
df_sample.head(# occurrences occurrences_in_last_hour
#Run time
#2020-01-01 00:00:00 1 1.0
#2020-01-01 00:04:10 1 2.0
#2020-01-01 00:08:20 1 3.0
#2020-01-01 00:12:30 1 4.0
#2020-01-01 00:16:40 1 5.0
import re
# search: the lookbehind (?<=abc) requires 'abc' before the match without including it
m = re.search('(?<=abc)def', 'abcdef')
m.group(0)  # 'def'
# sub
re.sub(regex_search,regex_replace,contents)
import pathlib  # no pathlib import is visible earlier in the file

# Paths are resolved relative to the directory containing this file.
CODE_PATH = pathlib.Path(__file__).parent
DATA_PATH = CODE_PATH.joinpath('data')
LOG_PATH = CODE_PATH.joinpath('log.txt')
df = pd.read_csv('file.csv', header=None, nrows=5)  # read only the first 5 rows, no header row
# to_dense() is no longer available in pandas; write the frame directly.
df.to_csv("myDataFrame.csv", index=False, sep=',', encoding='utf-8')
pd.read_excel('file.xlsx')
# to_excel is a DataFrame method; there is no pandas-level pd.to_excel function.
df.to_excel('dir/myDataFrame.xlsx', sheet_name='Sheet1')
= "C:\\Users\\xxxxx\\Desktop\\2014101709-12.txt"
file_name
with open(file_name) as myfile:
= [next(myfile) for x in range(20)]
head print(head)
import os as os
= [f for f in os.listdir('.') if os.path.isfile(f)]
files for f in files:
print(f)
def save_object(obj, filename):
    """Pickle ``obj`` to ``filename`` using the highest available protocol.

    Any existing file at ``filename`` is overwritten.
    """
    import pickle  # local import: no pickle import is visible at file level

    with open(filename, 'wb') as output:  # Overwrites any existing file.
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
def read_object(filename):
    """Load and return the pickled object stored at ``filename``.

    Returns ``None`` when ``filename`` is not an existing regular file.
    """
    import pickle  # local imports: neither name is imported at file level
    from pathlib import Path

    if not Path(filename).is_file():
        return None
    # SECURITY: unpickling can execute arbitrary code; only load trusted files.
    with open(filename, 'rb') as source:  # avoid shadowing the builtin `input`
        return pickle.load(source)
numbers = [1, 2, 3]
letters = ['a', 'b', 'c']
zipped = zip(numbers, letters)
zipped  # Holds an iterator object
# <zip object at 0x7fa4831153c8>
type(zipped)
# <class 'zip'>
list(zipped)  # consumes the iterator; a second list(zipped) would be empty
# [(1, 'a'), (2, 'b'), (3, 'c')]
import logging
try:
    print(5 / 0)
except ZeroDivisionError as e:
    # Log the exception itself as the message; exc_info=True appends the
    # traceback (passing e.__traceback__ as the message only logs its repr).
    logging.warning("division failed: %s", e, exc_info=True)
# --------- lambda -----------
lambda arguments: expression # Anonymous function objects
# - function -
def add(x, y):
    """Return the sum of x and y."""
    return x + y


add(2, 3)  # Output: 5
# - lambda arguments : expression -
add = lambda x, y: x + y  # bound to a name only to mirror the def version of add
add(2, 3)  # Output: 5
# -------- map ----------
map(function, iterable(s)) # The map() iterates through all items in the given iterable and executes the function
li = [1, 3, 7, 8]
map_object = map(lambda x: x * 2, li)
print(list(map_object))  # map() returns a lazy map object; list() materialises it -> [2, 6, 14, 16]
# -------- filter ----------
filter(function, iterable(s)) # filter() forms a new list that contains only elements that satisfy a certain condition, i.e. the function we passed returns True.
= [1,3,7,8]
li= filter(lambda x: x%2==0, li)
filter_object print(list(filter_object))
# --------- reduce ----------
reduce(function, sequence[, initial]) # reduce() works by calling the function we passed for the first two items in the sequence. The result returned by the function is used in another call to function alongside with the next (third in this case), element
from functools import reduce
li = [1, 3, 7, 8]
reduce_object = reduce(lambda x, y: x + y, li)  # ((1 + 3) + 7) + 8
print(reduce_object)  # 19
# storing the value of last expression in interpreter
>>> 10
10
>>> _ * 3
30
# For ignoring values while unpacking
x, _, y = (1, 2, 3)  # x = 1, y = 3
# _single_leading_underscore:
# is used for declaring private variables, functions, methods and classes in a module
class _Base: # private class
= 2 # private variable
_hidden_factor
# __double_leading_underscore:
# for mangling
class A:
    """Demonstrates name mangling of double-leading-underscore attributes."""

    def __double_method(self):  # the name will be mangled in "_ClassName__method" form.
        pass
# __double_leading_and_trailing_underscore__ :
# for special variables or methods
__file__ # indicates the location of Python file
__init__ # will be executed at first when a instance of class is created
import importlib
import foo #import the module here, so that it can be reloaded.
importlib.reload(foo)  # re-executes the already-imported module object
The yield statement suspends a function's execution and sends a value back to the caller. After the caller finishes processing the value, the function resumes execution immediately after the yield that produced it.
import time
# the following two generator functions are same
def simpleGeneratorFun():
    """Yield 1, 2 and 3 in turn, printing a message before each value."""
    print("sending 1")
    yield 1
    print("sending 2")
    yield 2
    print("sending 3")
    yield 3
def simpleGeneratorFun():
    """Yield 1, 2 and 3 in turn, printing a message before each value."""
    for x in [1, 2, 3]:
        print(f"sending {x}")
        yield x
# caller code to check above generator function
# The generator body only advances when the loop asks for the next value,
# so each "sending" message appears after the previous sleep has finished.
for value in simpleGeneratorFun():
    print(f"received {value}")
    time.sleep(5)
    print("finished sleep")
# - output -
# sending 1
# received 1
# finished sleep
# sending 2
# received 2
# finished sleep
# sending 3
# received 3
# finished sleep
from abc import ABC, abstractmethod
# abc is a builtin module, we have to import ABC and abstractmethod
class Animal(ABC):  # Inherit from ABC (Abstract Base Class)
    """Abstract animal that only accepts food listed in its ``diet``.

    Subclasses must provide the ``diet`` property and the ``feed`` method.
    ``_food`` is only created by the ``food_eaten`` setter, so reading
    ``food_eaten`` before assigning it raises AttributeError.
    """

    @property
    def food_eaten(self):
        """Return the food most recently assigned through the setter."""
        return self._food

    @food_eaten.setter
    def food_eaten(self, food):
        """Store ``food`` if it is in ``self.diet``; otherwise raise ValueError."""
        if food in self.diet:
            self._food = food
        else:
            raise ValueError(f"You can't feed this animal with {food}.")

    @property
    @abstractmethod  # combination of @property and @abstractmethod in order to define an abstract property.
    def diet(self):
        """Collection of foods this animal accepts (defined by subclasses)."""
        pass

    # Decorator to define an abstract method.
    # If a class inherits from an ABC, it must implement all its abstract methods!
    @abstractmethod
    def feed(self, time):
        """Feed the animal at ``time`` (defined by subclasses)."""
        pass
class Lion(Animal):
    """Concrete Animal with a fixed three-item diet."""

    @property
    def diet(self):
        """Foods a lion accepts."""
        return ["antelope", "cheetah", "buffaloe"]

    def feed(self, time):
        """Print a feeding message using the stored food and ``time``."""
        print(f"Feeding a lion with {self._food} meat! At {time}")
class Snake(Animal):
    """Concrete Animal with a fixed two-item diet."""

    @property
    def diet(self):
        """Foods a snake accepts."""
        return ["frog", "rabbit"]

    def feed(self, time):
        """Print a feeding message using the stored food and ``time``."""
        print(f"Feeding a snake with {self._food} meat! At {time}")
leo = Lion()
print(leo.diet)
leo.food_eaten = "antelope"  # accepted: "antelope" is in Lion.diet
leo.feed("10:10 AM")

adam = Snake()
adam.food_eaten = "frog"  # accepted: "frog" is in Snake.diet
adam.feed("10:20 AM")
# ['antelope', 'cheetah', 'buffaloe']
# Feeding a lion with antelope meat! At 10:10 AM
# Feeding a snake with frog meat! At 10:20 AM
import numpy as np
import matplotlib.pyplot as plt
plt.axis([0, 10, 0, 1])  # [xmin, xmax, ymin, ymax]
plt.ion()  # interactive mode: draw without blocking
for i in range(100):
    y = np.random.random()
    plt.scatter(i, y)
    plt.pause(0.5)  # let the GUI event loop redraw between points

# Keep the window responsive after the last point; runs until interrupted.
while True:
    plt.pause(0.5)
import random
values = [random.randint(0, 10) for _ in range(50)]  # 50 integers in 0..10 inclusive
import matplotlib.pyplot as plt
plt.hist(values, bins=11)
# plot a vertical line
plt.axvline(x=3)