Install anaconda
python -V # get python versionPython Object-Oriented Programming (OOP): https://pynative.com/python-polymorphism/
type(person) # Determine the type of an object
# not None test
if not (val is None):
# ...str() # convert variable to string
int()
float()
bool()
#convert a bool (list) to 0 or 1
B=B*1
# check if a object is None
if foo is None:
print('is None')range(10) # range 0 to 9
list(range(11, 17)) # range to a list: [11, 12, 13, 14, 15, 16]
reversed(range(10)) # reversed range 9 to 0
list(reversed(range(10))) # to print out Lists can contain any data types (including list). mutable
li = [3, 4, 5]
li = [None]*5 # [None, None, None, None, None]
# to a string and back
s = json.dumps(li)
li = json.loads(s)
# check if it is empty
if not li:
print("List is empty")
# check if it is in the list
if 'b' in ['a', 'b', 'c'] :
print('Yes, b found in List')
# sub list
li[0] # 3
li[0:2] # [incluse, excluse] 3,4
li[-1] # get the last element
li[-2] = 3 # Set the second last element
list(li[i] for i in [0, 2]) # first and third element
li[::2] # every second element from default start and stop index. s[start:stop:step]
# last n elements
a = [1, 2, 3, 4, 5, 6, 7, 8]
a[-3:]
# [6, 7, 8]
# add or remove element
li = li + [6]
li = del li[5]
li.remove(4) # removes the first matching value, not a specific index:
li.pop(1) # removes the item at a specific index and returns it.
n = 5
li = li[n:] # remove first n element in the list
# append an element - append is a destructive operation (it modifies the list in place instead of of returning a new list)
li.append('bar')
# combine two lists
l1 = [1,2,3]
l2 = [4,5,6]
mergedlist = l1 + l2
# concatenate element-wise two lists in Python
a = [0, 1, 5, 6, 10, 11]
b = ['asp1', 'asp1', 'asp1', 'asp1', 'asp2', 'asp2']
[m+str(n) for m,n in zip(b,a)]
# copy: reference only
li2 = li
# copy: explicit copy
li2 = li[:] # or li2 = list(li)
[float(n)] # convert a float to list
len(li) # length of list
3 in li # check if the element is in the list
matrix = [ [1, 2, 3, 4], [5, 6, 7, 8], ] # nested list
# if all elements of a list are False
li = [False, False, False]
not any(data) # any() will return True if there's any truth value in the iterable.
# >> True
# loop in the list
li = ['a',' b', 'c ', ' d ']
for i, s in enumerate(li):
li[i] = s.strip()
print(li) # -> ['a', 'b', 'c', 'd']
# loop for the list
li = [(1, 2), (1, 3), (2, 3)]
[sum(tup) for tup in li]
# [3, 4, 5]
# list to string
li = ["This" , "is", "a", "sample", "program"]
' '.join(li) # 'This is a sample program'
','.join(li) # 'This,is,a,sample,program'
# Check if all values in list are greater than a certain number
l = [29, 500, 43]
all(i >= 30 for i in l)
# True
# Sort list by given order of indices
li =["A","B","C","D","E"]
sortedIndex = [3,2,1,0,4]
[li[i] for i in sortedIndex]
# ['D', 'C', 'B', 'A', 'E']
# concatenate element-wise two lists
a = [0, 1, 5]
b = ['as', 'ap', 'asp']
[m+str(n) for m,n in zip(b,a)]
# Convert a list to a string and back
import json
with open(data_file, 'wb') as dump:
dump.write(json.dumps(arbitrary_data))
source = open(data_file, 'rb').read()
data = json.loads(source)Tuples are immutable sequences
t = 12345, 54321, 'hello!'
t[0] # 12345A set is an unordered collection with no duplicate elements
basket = {'apple', 'orange', 'apple', 'pear', 'orange', 'banana'}
# check if a element is in the set
'apple' in basketDictionaries store key-value pairs, kind of like JSON objects.
# Creating an empty dictionary
d = {} # OR data = dict()
# Creating a dictionary with initial values
d = {'a':2,'b':21,'c':3} # OR data = dict(a=2, b=1, c=3)
# Inserting/Updating a single value
d['a']=1 # Updates if 'a' exists, else adds 'a'
# OR data.update(a=1)
# Inserting/Updating multiple values
d.update({'c':3,'d':4}) # Updates 'c' and adds 'd'
# delete a element based on key
del d['a']
# find index of element
index = d.index('c')
# find max based on the key
max(d.items(), key = lambda x: x[0])
# ('c', 3)
# find max based on the value
max(d.items(), key = lambda x: x[0])
# ('b', 21)
# Get list of values for list of keys
mykeys = ["a", "c"]
[d[x] for x in mykeys]
# [2, 3]
# return a key list
key_list = list(d.keys())
# ['a', 'b', 'c']
# return sorted key list
sorted(d, key=(lambda key:d[key]), reverse=True)
# ['b', 'c', 'a']
# return sorted tuple(key, value) list
sorted(d.items(), key=lambda kv: kv[1])
# [('a', 2), ('c', 3), ('b', 21)]
# create a dictionary
person = {
"name": "Amos",
"age": 23,
"hobbies": ["Travelling", "Swimming", "Coding", "Music"]
}
# iterate through the dict and print the keys
for key in person:
print(key)
# iterate through the dict's keys and print their values
for key in person:
print(person[key])
# Convert two lists into a dictionary
keys = ['a', 'b', 'c']
values = [1, 2, 3]
dictionary = dict(zip(keys, values))
print(dictionary)
# {'a': 1, 'b': 2, 'c': 3}Array can contain one data type
# convert list to array
li = [3, 4, 5]
ay = np.array(li)
# initial an array
np.full((3, 5), 7)
# array([[ 7, 7, 7, 7., 7],
# [ 7, 7, 7, 7., 7],
# [ 7, 7, ., 7., 7]])
# to contain multiple data type. assign datatype as object
ay = np.array([["a", 0]] * 5, dtype='O')
# array([['a', 0],
# ['a', 0],
# ['a', 0],
# ['a', 0],
# ['a', 0]], dtype=object)
# initialize data frame
mat = np.array([[1,2,3], [1,2,3], [1,2,3]])
print(mat)
# [[1 2 3]
# [1 2 3]
# [1 2 3]]
np.sum(mat, axis=1) # axis=1 : get the sum of each the columns
# array([6, 6, 6])
# get second value of each element in the array
ay[:, 1]
# array([0, 0, 0, 0, 0], dtype=object)
# get second value of first element in the array
ay[0, 1]
# 0
# set the value
ay[0, 1] = 15
# find the index of element which second value > 3
np.where(ay[:,1]>3)
# (array([0], dtype=int64),)
# check if has an element which second value = 5
if len(np.where(ay[:,1]==5)[0]) >0
# a range value
np.arange(0, 5, 0.5)
# Out[32]: array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5])
# a random value
np.random.randint(10, size=2) # integer value, add replace=False for non duplicated values
np.random.rand(5) # random float between 0 and 1
# convert np array to list
ay.tolist()ay = np.array([[1,2], [3,4]])
# shape of array
ay.shape
# Return a copy of the array collapsed into one dimension
ay.flatten()
# array([1, 2, 3, 4])
np.reshape(ay, 4)
# array([1, 2, 3, 4])# append an array
ay = np.append(ay, ay1)
# delete an element
np.delete(a, index)
# count the occurrence of certain item in an ndarray in Python
a = np.array([0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4])
unique, counts = numpy.unique(a, return_counts=True)a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])
np.concatenate((a, b), axis=0)
# array([[1, 2],
# [3, 4],
# [5, 6]])
np.concatenate((a, b.T), axis=1)
# array([[1, 2, 5],
# [3, 4, 6]])
np.concatenate((a, b), axis=None)
# array([1, 2, 3, 4, 5, 6])# subset by index
arr = np.array([10, 20, 30, 40, 50, 60])
arr[[1,4,5]]
# array([20, 50, 60])
# subset by index for multidimensional arrays:
arr = np.arange(9).reshape(3,3)
arr
# array([[0, 1, 2],
# [3, 4, 5],
# [6, 7, 8]])
arr[[0, 1, 1], [1, 0, 2]]
# array([1, 3, 5])
# the ith column of a np multidimensional array
test = np.array([[1, 2], [3, 4], [5, 6]])
test[:,0]
# array([1, 3, 5])ay1 = np.array([2,3,4,6,7])
ay2 = np.array([1,4,5,5,6,8,8,9])
np.in1d(ay1, ay2)
# array([False, False, True, True, False])
np.setdiff1d(ay1, ay2)
# array([2, 3, 7])
np.isin(ay1, ay2)
# array([False, False, True, True, False])
np.where(np.isin(ay1, ay2))
# (array([2, 3], dtype=int64),) http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/
A one-dimensional labeled array A capable of holding any data type
# initialized a series from a list
s = pd.Series(['string1', 'string2', 'string3'])
s
# 0 string
# 1 string2
# 2 string3
# dtype: object
s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])
# get the last element
s.iloc[-1]
s[-1]
# convert to an numpy.ndarray
pd.Series([1, 2, 3]).values
# array([1, 2, 3])
# convert to a list
pd.Series([1, 2, 3]).tolist()
# reset the index
s.reset_index(drop=True, inplace=True) # drop: delete the index entirely; inplace: directly modify and overwrite your original DataFrame
# regex
pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)
# -- Factorize ---
s_class = pd.Series(['a','b','c','d'])
s_class_id = s_class.factorize()[0] # convert string to numbers
s_class_id
# array([0, 1, 2, 3], dtype=int64)
s_class_id, mapping = s_class.factorize() # keep mapping class
mapped_back_to_class = mapping.take(pd.Series([3,2,3,1]))
mapped_back_to_class
# Index(['d', 'c', 'd', 'b'], dtype='object')
# -- check if ALL values are NaN in Series
se.isnull().all()
# -- apply
pd.Series([1, 2, 3]).apply(lambda x :x*2)
# 0 2
# 1 4
# 2 6initialize data frame
df = pd.DataFrame(
{"a":[11 ,12, 13], "b":[21, 22, 23], "c":[31, 32, 33]},
index = [1, 2, 3])
# a b c
# 1 11 21 31
# 2 12 22 32
# 3 13 23 33
# d is your list of dicts to dataframe
l = [{"a":1, "b":1}, {"a":2, "b":2}]
df = pd.DataFrame(l)
# a b
# 0 1 1
# 1 2 2
# numpy array to data frame
X = np.array([0, 3, 0, 1])
df = pd.DataFrame(X, index=np.arange(0,len(X)))
# Out[8]:
# 0
# 0 0
# 1 3
# 2 0
# 3 1
X = np.array([[0, 3, 0], [2, 2, 2], [7, 7, 7]])
df = pd.DataFrame(X, index=np.arange(0,len(X)))
# Out[10]:
# 0 1 2
# 0 0 3 0
# 1 2 2 2
# 2 7 7 7
# from dict to dataframe
pd.DataFrame.from_dict(dict)
df.mean() # mean of each column
df.shape # (rows,columns)
df.columns # column names
df.index # Describe index
df.info() # Info on DataFrame
df.count() # Number of non-NA values
# column rename
df = pd.DataFrame({'aa':[1,2], 'bb': [10,20]})
df.columns = ['a', 'b']
df
# a b
# 0 1 10
# 1 2 20
df = df.rename({'a': 'axx'}, axis='columns') # or axis=1
df[3].value_counts() # frequency table of a column, e.g. column 3
df = df.values #convert a pandas dataframe (df) to a numpy ndarray,
df = df.rename(columns={'old_name':'new_name'}) # rename a column name
df.isnull().all() # check if ALL values are NaN in DataFramedf.drop_duplicates() # Remove duplicate rows (only considers columns).
df3 = df3[~df3.index.duplicated(keep='first')] #Remove rows with duplicate indices, using the duplicated method on the Pandas Index itself
# keep only the last entry from duplicate values
df = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x1',], 'B': [85, 70, 80, 85]})
# A B
#0 x1 85
#1 x2 70
#2 x3 80
#3 x1 85
df2 = df.drop_duplicates(subset = ['A','B'], keep ='last').reset_index(drop = True)
df2
# A B
#0 x2 70
#1 x3 80
#2 x1 85df.fillna(0) # Replace all the NaN values with Zero
df1 = df.replace(np.nan, '', regex=True) # Replace all the NaN with blank/empty string
df.dropna(axis=1, how='all') # Drop the columns where all elements are NaNdf = pd.DataFrame({'c1': [10, 11, 12], 'c2': [100, 110, 120]})
for index, row in df.iterrows():
print(index, row['c1'], row['c2'])
# 0 10 100
# 1 11 110
# 2 12 120df = pd.DataFrame({'P': [1, 2], 'Q': [0.7, 0.85]},
index=['p', 'q'])
# P Q
# p 1 0.70
# q 2 0.85
# to structure numpy array. it will keep column name and data type information
df.to_records()
# rec.array([('p', 1, 0.7 ), ('q', 2, 0.85)],
# dtype=[('index', 'O'), ('P', '<i8'), ('Q', '<f8')])
# convert back to dataframe
pd.DataFrame(df.to_records())
# index P Q
# 0 p 1 0.70
# 1 q 2 0.85
pd.DataFrame(df.to_records()).set_index('index')
# P Q
# index
# p 1 0.70
# q 2 0.85# Order rows by values of a column
df.sort_values('mpg') # (low to high).
df.sort_values('mpg', ascending=False) # (high to low).
#sort by two or more columns?
df1 = df1.sort_values(['a', 'b'], ascending=[True, False])
# Shuffle DataFrame rows
df.sample(frac=1) # The frac specifies the fraction of rows to return in the random sample, frac=1 means return all rows.
# random n row selection
df.sample(n)
# reset index
df = df.reset_index(drop=True)# change value of a cell
df.at['C', 'x'] = 10
#--------- row selection ---------------
df.head(n) # Select first n rows.
df.tail(n) # Select last n rows.
# based on position
df[0:3] # select the first 3 rows (rows 0,1,2)
df[:5] # select the first 5 rows (rows 0,1,2,3,4)
df[-1:] # select the last row
#iloc: Select Column & Row by Positions (it only takes integers).
# rows:
df.iloc[0] # select the first row
df.iloc[2:] # select every row beginning with the third row
df.iloc[-1] # last row of data frame
# Columns:
data.iloc[:,0] # first column of data frame
data.iloc[:,1] # second column of data frame
data.iloc[:,-1] # last column of data frame
# rows and columns
df.iloc[0:3, 1:4] # select the first 3 rows (rows 0,1,2) and (columns 1, 2 and 3)
# return type
df.iloc[3] # return a pandas series when one row is selected
df.iloc[[3]] # return a pandas dataframe. pass a single valued list to return a dataframe
df.iloc[2:3] # return a pandas dataframe when multiple rows are selected
df=pd.DataFrame(
{'age':[30,24,32], 'name':['john','jack','luck'], 'gender':['M','M','F']},
index=['a', 'b', 'c'])
# age name gender
# a 30 john M
# b 24 jack M
# c 32 luck F
#loc: Select Column & Rows by name/index_label
# rows:
df.loc[:3] # slice up to and including label 3 (in the case of label is integer)
df.loc[:'b'] # select rows by b
df.loc['b':'c',:] # select rows from b to c
df.loc[['b','c'],:] # select rows by multiple names/labels
df.loc[df['column_name'] == some_value] # select rows whose column value equals a scalar
df.loc[~df['column_name'].isin(some_values)] # select rows whose column value is in an iterable
df.loc[(df['column_name'] == some_value) & df['other_column'].isin(some_values)]
df[df.Length>7] # select rows that meet logical criteria.
# loop for each row
for index, row in df.iterrows():
print row['column_name_1'], row['column_name_2']
# 10 100
# 11 110
# 12 120
# copy as a new df, not only reference
df2 = df[df['B'] == 'b.2'].copy()
#--------- column selection ---------------
# Select columns by names
df['name']
df[['age', 'name']]
df.columns.get_loc("column_name")
df.loc[:, ['age', 'name']] # = df[['age', 'name']]
# select col_name1, col_name2 from table where column_name = some_value.
df.loc[df['column_name'] == some_value][[col_name1, col_name2]]
# Select columns by regular expression
df.filter(regex='name|address')
# select columns by a list
lst = ['name','age','weight']
df[np.intersect1d(df.columns, lst)]
# age name
#0 30 john
#1 24 jack
#2 32 luc
# select Rows by integers and Columns by names
df.loc[df.index[1], 'name'] # second index by df.index[1]:
# or
df.iloc[1, df.columns.get_loc('name')] # get position of column name by get_loc
# --------- Cell value selection ---------------
# if subset is one row dataframe, use .values[0] to get the value
df.loc[df['name']=='jack'] # subset result has only one row
# age name gender
# b 24 jack M
df.loc[df['name']=='jack']['age'] # it return a series
# b 24
# Name: age, dtype: int64
df.loc[df['name']=='jack']['age'].values[0]
# 24# append a column
df['new_col'] = pd.Series(np.arange(0, 100), index=df.index)
# append a list as a row
df = pd.DataFrame([[1, 2], [3, 4]], columns = ["a", "b"])
# a b
# 0 1 2
# 1 3 4
# fist approach
li = [5, 6]
df.loc[len(df)] = li
# a b
# 0 1 2
# 1 3 4
# 2 5 6
# second approach
a_series = pd.Series(li, index = df.columns)
df = df.append(a_series, ignore_index=True) # not use the index labels of a_seriesdf.drop(df.tail(n).index) # drop last n rows
df.drop(df.head(n).index) # drop first n rows
# delete rows based on index
df = df.drop(df.index[1: 4])
# delete column on i index
df = df.drop(df.columns[i], axis=1) # note: it will remove both columns if there is duplicate names in columns,
# delete column with column name
df = df.drop(['column_name_A', 'column_name_B'], axis=1)
# delete columns based on regular expression
df = df[df.columns.drop(list(df.filter(regex='Test')))]
# drop columns which have same values in all rows
nunique = df.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
df.drop(cols_to_drop, axis=1)# sum each row of column
df.sum(axis=1)# groupby dataframe by key
grouped = df.groupby('gender')
grouped.groups.keys() # list keys
for name, group in grouped:
print('key:' name)
print('value:' group)
data = pd.DataFrame({'user_id': ['a1', 'a1', 'a1', 'a2', 'a2', 'a2', 'a3', 'a3', 'a3'],
'product_id': ['p1', 'p1', 'p2', 'p1', 'p1', 'p1', 'p2', 'p2', 'p3'],
'product_count': [1, 2, 2, 2, 2, 1, 4, 10, 30]})
count_series = data.groupby(['user_id', 'product_id']).size()
count_series # has a hierarchical index
# user_id product_id
# a1 p1 2
# p2 1
# a2 p1 3
# a3 p2 2
# p3 1
# Converting a GroupBy output from Series to DataFrame
count_df = count_series.to_frame(name = '_size').reset_index()
count_df
# user_id product_id _size
# 0 a1 p1 2
# 1 a1 p2 1
# 2 a2 p1 3
# 3 a3 p2 2
# 4 a3 p3 1
sum_series = data.groupby(['user_id', 'product_id'])['product_count'].sum() # Compute sum of group values.
sum_series
# user_id product_id
# a1 p1 3
# p2 2
# a2 p1 5
# a3 p2 14
# p3 30
# select top n row from each group after group by in pandas
data.sort_values('product_count', ascending=False).groupby(['user_id'], sort=False).head(3)# Append a dataframe
df1 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df1
# A B
# 0 1 2
# 1 3 4
df2 = pd.DataFrame([[1, 6], [7, 8]], columns=list('AB'))
# ----------- pd.concat -----------------
df = pd.concat([df1, df2]) # concatenate a list of dataframes along row
# A B
#0 1 2
#1 3 4
#0 1 6
#1 7 8
# reset index if necessary
df = df.reset_index(drop=True)
pd.concat([df1, df2], axis=1) # concatenate a list of dataframes along column
# A B A B
#0 1 2 1 6
#1 3 4 7 8
# ----------- pd.merge, Database-style -----------------
df = pd.merge(df1, df2, on="A") # how='inner' by default.
# A B_x B_y
#0 1 2 6
df = pd.merge(df1, df2, on="A", how="left")
# A B_x B_y
#0 1 2 6.0
#1 3 4 NaNimport pandas as pd
table_name = "test"
df = pd.DataFrame(
{"a": [11, 12, 13], "b": [21, 22, 23], "c": ["c1", "c2", "c3"]},
index=[1, 2, 3])
sql = '''select a, c from test where b >=22 '''
# ----------- sqlite3 -----------------------
import sqlite3
cnx = sqlite3.connect(':memory:') # create a db connection
df.to_sql(name=table_name, con=cnx) # load data into db
output = pd.read_sql(sql, cnx)
print(output)
# a c
# 0 12 c2
# 1 13 c3
# --------- sqlglot -------------------------
from sqlglot.executor import execute
tables = {table_name: df.to_dict("records")} # load data into a dict
# {'test': [{'a': 11, 'b': 21, 'c': 'c1'}, {'a': 12, 'b': 22, 'c': 'c2'}, {'a': 13, 'b': 23, 'c': 'c3'}]}
output = execute(sql, tables=tables)
sql_df = pd.DataFrame(output.rows, columns=output.columns)
print(sql_df)
# ------- duckdb -----------------
import duckdb
g = globals()
g[table_name] = df # load data in global variable
output = duckdb.query(sql).to_df()
print(output)
# --------- pandasql ------------
from pandasql import sqldf
g = globals()
g[table_name] = df # load data in global variable
output = sqldf(sql)
print(output)https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html
df['new_col'] = data['column'].rolling(5).mean() # Rolling Mean on pandas on a specific column
# Pandas Rolling Apply custom function
def my_test(vals):
values = vals.values
return values[0]
df['new_col'] = data['column'].rolling(5).apply(my_test)# type of each column
df.dtypes
# convert ALL columns to strings
df = df.astype(str)
# convert selected columns
df[["D", "E"]] = df[["D", "E"]].astype(int)
# To Factors
s = pd.Series(['single', 'touching', 'nuclei', 'dusts', 'touching', 'single', 'nuclei'])
s_enc = pd.factorize(s) import datetime as dt
# date
dt.date.today() # datetime.date(2021, 11, 19)
# datetime
dt.datetime.now() # now
dt.datetime.now().year # now.month or now.hour, etc
dt.datetime.now().month # now.month
'%02d' % datetime.dt.now().month # ‘04’
dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # format time stamp
dt.datetime.now() - datetime.timedelta(days=2) # Day before Yesterday
dt.datetime.now() - datetime.timedelta(minutes=15)
dt.datetime.now().date() # only keep date part of datetime
dt.datetime.today() + dt.timedelta(days=2) # after 2 days
# combine date and time strings to single datetime object
mytime = dt.datetime.strptime('0130','%H%M').time()
mydatetime = dt.datetime.combine(dt.date.today(), mytime)
# ---- unix time ------
# convert (Unix) timestamp (the number of seconds since January 1st, 1970).
dt.datetime.fromtimestamp(1485714600).strftime("%A, %B %d, %Y %I:%M:%S")
'Sunday, January 29, 2017 08:30:00'
# DateTime to Unix timestamp with 13 digits
presentDate = dt.datetime.now()
unix_timestamp = dt.datetime.timestamp(presentDate)*1000
print(unix_timestamp)
# substract 5 minutes from a Unix timestamp
five_minutes_before=$((timestamp - 5 * 60 * 1000))Convert string to datetime
from datetime import datetime datetime.strptime(‘Jun 1 2005 1:33PM’, ‘%b %d %Y %I:%M%p’)
https://stackoverflow.com/questions/53892450/get-the-format-in-dateutil-parse https://stackabuse.com/converting-strings-to-datetime-in-python/
formats = ['%Y-%m-%dT%H:%M:%S*%f%z','%Y %b %d %H:%M:%S.%f %Z','%b %d %H:%M:%S %z %Y','%d/%b/%Y:%H:%M:%S %z','%b %d, %Y %I:%M:%S %p','%b %d %Y %H:%M:%S','%b %d %H:%M:%S %Y','%b %d %H:%M:%S %z','%b %d %H:%M:%S','%Y-%m-%dT%H:%M:%S%z','%Y-%m-%dT%H:%M:%S.%f%z','%Y-%m-%d %H:%M:%S %z','%Y-%m-%d %H:%M:%S%z','%Y-%m-%d %H:%M:%S,%f','%Y/%m/%d*%H:%M:%S','%Y %b %d %H:%M:%S.%f*%Z','%Y %b %d %H:%M:%S.%f','%Y-%m-%d %H:%M:%S,%f%z','%Y-%m-%d %H:%M:%S.%f','%Y-%m-%d %H:%M:%S.%f%z','%Y-%m-%dT%H:%M:%S.%f','%Y-%m-%dT%H:%M:%S','%Y-%m-%dT%H:%M:%S%Z','%Y-%m-%dT%H:%M:%S.%f','%Y-%m-%dT%H:%M:%S','%Y-%m-%d*%H:%M:%S:%f','%Y-%m-%d*%H:%M:%S','%y-%m-%d %H:%M:%S,%f %z','%y-%m-%d %H:%M:%S,%f','%y-%m-%d %H:%M:%S','%y/%m/%d %H:%M:%S','%y%m%d %H:%M:%S','%Y%m%d %H:%M:%S.%f','%m/%d/%y*%H:%M:%S','%m/%d/%Y*%H:%M:%S','%m/%d/%Y*%H:%M:%S*%f','%m/%d/%y %H:%M:%S %z','%m/%d/%Y %H:%M:%S %z','%H:%M:%S','%H:%M:%S.%f','%H:%M:%S,%f','%d/%b %H:%M:%S,%f','%d/%b/%Y:%H:%M:%S','%d/%b/%Y %H:%M:%S','%d-%b-%Y %H:%M:%S','%d-%b-%Y %H:%M:%S.%f','%d %b %Y %H:%M:%S','%d %b %Y %H:%M:%S*%f','%m%d_%H:%M:%S','%m%d_%H:%M:%S.%f','%m/%d/%Y %I:%M:%S %p:%f','%m/%d/%Y %H:%M:%S %p']
datestrings = ['2018-08-20T13:20:10*633+0000','2017 Mar 03 05:12:41.211 PDT','Jan 21 18:20:11 +0000 2017','19/Apr/2017:06:36:15 -0700','Dec 2, 2017 2:39:58 AM','Jun 09 2018 15:28:14','Apr 20 00:00:35 2010','Sep 28 19:00:00 +0000','Mar 16 08:12:04','2017-10-14T22:11:20+0000','2017-07-01T14:59:55.711+0000','2017-08-19 12:17:55 -0400','2017-08-19 12:17:55-0400','2017-06-26 02:31:29,573','2017/04/12*19:37:50','2018 Apr 13 22:08:13.211*PDT','2017 Mar 10 01:44:20.392','2017-03-10 14:30:12,655+0000','2018-02-27 15:35:20.311','2017-03-12 13:11:34.222-0700','2017-07-22T16:28:55.444','2017-09-08T03:13:10','2017-03-12T17:56:22-0700','2017-11-22T10:10:15.455','2017-02-11T18:31:44','2017-10-30*02:47:33:899','2017-07-04*13:23:55','11-02-11 16:47:35,985 +0000','10-06-26 02:31:29,573','10-04-19 12:00:17','06/01/22 04:11:05','150423 11:42:35','20150423 11:42:35.173','08/10/11*13:33:56','11/22/2017*05:13:11','05/09/2017*08:22:14*612','04/23/17 04:34:22 +0000','10/03/2017 07:29:46 -0700','11:42:35','11:42:35.173','11:42:35,173','23/Apr 11:42:35,173','23/Apr/2017:11:42:35','23/Apr/2017 11:42:35','23-Apr-2017 11:42:35','23-Apr-2017 11:42:35.883','23 Apr 2017 11:42:35','23 Apr 2017 10:32:35*311','0423_11:42:35','0423_11:42:35.883','8/5/2011 3:31:18 AM:234','9/28/2011 2:23:15 PM']
from datetime import datetime
def parse_timestamp(datestring, formats):
for f in formats:
try:
d = datetime.strptime(datestring, f)
except:
continue
return (d, f)
return (datestring, 'Unable to parse format')# --- convert string to panda datatime
df = pd.DataFrame({'timestamp_col':['2005-05-23 00:00:00', '2005-05-24 00:00:00']})
# dt_col
#0 2005-05-23 00:00:00
#1 2005-05-24 00:00:00
df.timestamp_col = pd.to_datetime(df.timestamp_col, format="%Y-%m-%d %H:%M:%S")
type(df.timestamp_col[0])
# <class 'pandas._libs.tslibs.timestamps.Timestamp'>
# --- Group datetime column (timestamp_col) into hour and minute aggregations for values (value_col)
df.groupby([df.timestamp_col.dt.hour, df.timestamp_col.dt.minute]).value_col.sum()
# remove data time instance is NaT in a dataframe
df = df[df[3].apply(lambda x: not isinstance(x, pd._libs.tslib.NaTType))]
pd.period_range(start='2017-01-01', end='2017-10-01', freq='M')
# PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
# '2017-07', '2017-08', '2017-09', '2017-10'],
# dtype='period[M]', freq='M')
(time2-time1).total_seconds() #Convert timedelta to total seconds
# --- Converting between python_datetime and Pandas_Timestamp objects
ts = pd.Timestamp('2014-01-23 00:00:00', tz=None)
# Timestamp('2014-01-23 00:00:00', tz=None)
ts.to_pydatetime()
# datetime.datetime(2014, 1, 23, 0, 0)
# --- Convert unix time to panda.datetime
df
date price
# 0 1349720105 12.08
# 1 1349806505 12.35
df['date'] = pd.to_datetime(df['date'],unit='s')
df
date price
# 0 2012-10-08 18:15:05 12.08
# 1 2012-10-09 18:15:05 12.35
result_ms=pd.to_datetime('1493530261000', unit='ms')
str(result_ms) # -> '2017-04-30 05:31:01'
# --- round time with dt.round ---
df = pd.DataFrame(
{'Date' : pd.date_range('1/1/2011', periods=5, freq='3675S'),
'Num' : np.random.rand(5)})
# Date Num
#0 2011-01-01 00:00:00 0.580997
#1 2011-01-01 01:01:15 0.407332
#2 2011-01-01 02:02:30 0.786035
#3 2011-01-01 03:03:45 0.821792
# hourly round
df.assign(Date=df.Date.dt.round('H'))
# Date Num
#0 2011-01-01 00:00:00 0.577957
#1 2011-01-01 01:00:00 0.995748
#2 2011-01-01 02:00:00 0.864013
#3 2011-01-01 03:00:00 0.468762
# --- Use Pandas rolling method, 3600s = 1 Hour -----
df_sample['occurrences_in_last_hour'] = df_sample['occurrences'].rolling('3600s').sum()
df_sample.head(15)
# occurrences occurrences_in_last_hour
#Run time
#2020-01-01 00:00:00 1 1.0
#2020-01-01 00:04:10 1 2.0
#2020-01-01 00:08:20 1 3.0
#2020-01-01 00:12:30 1 4.0
#2020-01-01 00:16:40 1 5.0import re
# search
m = re.search('(?<=abc)def', 'abcdef')
m.group(0)
# 'def'
# sub
re.sub(regex_search,regex_replace,contents)CODE_PATH = pathlib.Path(__file__).parent
DATA_PATH = CODE_PATH.joinpath('data')
LOG_PATH = CODE_PATH.joinpath('log.txt')df = pd.read_csv('file.csv', header=None, nrows=5)
df.to_dense().to_csv("myDataFrame.csv", index = False, sep=',', encoding='utf-8')
pd.read_excel('file.xlsx')
pd.to_excel('dir/myDataFrame.xlsx', sheet_name='Sheet1')file_name = "C:\\Users\\xxxxx\\Desktop\\2014101709-12.txt"
with open(file_name) as myfile:
head = [next(myfile) for x in range(20)]
print(head)import os as os
files = [f for f in os.listdir('.') if os.path.isfile(f)]
for f in files:
print(f)def save_object(obj, filename):
with open(filename, 'wb') as output: # Overwrites any existing file.
pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
def read_object(filename):
if Path(filename).is_file():
with open(filename, 'rb') as input:
o = pickle.load(input)
return o
else:
return None numbers = [1, 2, 3]
letters = ['a', 'b', 'c']
zipped = zip(numbers, letters)
zipped # Holds an iterator object
# <zip object at 0x7fa4831153c8>
type(zipped)
# <class 'zip'>
list(zipped)
# [(1, 'a'), (2, 'b'), (3, 'c')]import logging
try:
print(5 / 0)
except Exception as e:
logging.warning(e.__traceback__, exc_info=True)# --------- lambda -----------
lambda arguments: expression # Anonymous function objects
# - function -
def add(x, y):
return x + y
add(2, 3) # Output: 5
# - lambda arguments : expression -
add = lambda x, y : x + y
add(2, 3) # Output: 5
# -------- map ----------
map(function, iterable(s)) # The map() iterates through all items in the given iterable and executes the function
li= [1,3,7,8]
map_object = map(lambda x: x*2, li)
print(list(map_object)) # map() function returns the map_object type, can be convert to list
# -------- filter ----------
filter(function, iterable(s)) # filter() forms a new list that contains only elements that satisfy a certain condition, i.e. the function we passed returns True.
li= [1,3,7,8]
filter_object = filter(lambda x: x%2==0, li)
print(list(filter_object))
# --------- reduce ----------
reduce(function, sequence[, initial]) # reduce() works by calling the function we passed for the first two items in the sequence. The result returned by the function is used in another call to function alongside with the next (third in this case), element
from functools import reduce
li= [1,3,7,8]
reduce_object = reduce(lambda x,y: x+y, li)
print(reduce_object)# storing the value of last expression in interpreter
>>> 10
10
>>> _ * 3
30
# For Ignoring the values
x, _, y = (1, 2, 3) # x = 1, y = 3
# _single_leading_underscore:
# is used for declaring private variables, functions, methods and classes in a module
class _Base: # private class
_hidden_factor = 2 # private variable
# __double_leading_underscore:
# for mangling
class A:
def __double_method(self): # the name will be mangled in "_ClassName__method" form.
pass
# __double_leading_and_trailing_underscore__ :
# for special variables or methods
__file__ # indicates the location of Python file
__init__ # will be executed at first when a instance of class is createdimport importlib
import foo #import the module here, so that it can be reloaded.
importlib.reload(foo)The yield statement suspends a function’s execution and sends a value back to the caller. After the caller finish processing the value, the function continues execution immediately after the last yield run.
import time
# the following two generator functions are same
def simpleGeneratorFun():
print("sending 1")
yield 1
print("sending 2")
yield 2
print("sending 3")
yield 3
def simpleGeneratorFun():
for x in [1, 2, 3]:
print(f"sending {x}")
yield x
# caller code to check above generator function
for value in simpleGeneratorFun():
print(f"received {value}")
time.sleep(5)
print(f"finished sleep")
# - output -
# sending 1
# received 1
# finished sleep
# sending 2
# received 2
# finished sleep
# sending 3
# received 3
# finished sleepfrom abc import ABC, abstractmethod
# abc is a builtin module, we have to import ABC and abstractmethod
class Animal(ABC): # Inherit from ABC(Abstract base class)
@property
def food_eaten(self):
return self._food
@food_eaten.setter
def food_eaten(self, food):
if food in self.diet:
self._food = food
else:
raise ValueError(f"You can't feed this animal with {food}.")
@property
@abstractmethod # combination of @property and @abstractmethod in order to define an abstract property.
def diet(self):
pass
@abstractmethod # Decorator to define an abstract method. # If a class inherits from an ABC, it must implement all it's abstract methods!
def feed(self, time):
pass
class Lion(Animal):
@property
def diet(self):
return ["antelope", "cheetah", "buffaloe"]
def feed(self, time):
print(f"Feeding a lion with {self._food} meat! At {time}")
class Snake(Animal):
@property
def diet(self):
return ["frog", "rabbit"]
def feed(self, time):
print(f"Feeding a snake with {self._food} meat! At {time}")
leo = Lion()
print(leo.diet)
leo.food_eaten = "antelope"
leo.feed("10:10 AM")
adam = Snake()
adam.food_eaten = "frog"
adam.feed("10:20 AM")
# ['antelope', 'cheetah', 'buffaloe']
# Feeding a lion with antelope meat! At 10:10 AM
# Feeding a snake with frog meat! At 10:20 AMimport numpy as np
import matplotlib.pyplot as plt
plt.axis([0, 10, 0, 1])
plt.ion()
for i in range(100):
y = np.random.random()
plt.scatter(i, y)
plt.pause(0.5)
while True:
plt.pause(0.5)import random
values = [random.randint(0, 10) for _ in range(50)]
import matplotlib.pyplot as plt
plt.hist(values, bins=11)
# plot a vertical line
plt.axvline(x=3)