1 Basic

Install anaconda

python -V # get python version

2 Data types

type(person) # Determine the type of an object

# not None test (idiomatic form)
if val is not None:
    # ...

2.1 Atomic data types

str() # convert variable to string
int() 
float()
bool()

# convert a bool to 0 or 1 (works for a scalar bool or a numpy array;
# for a plain Python list use a comprehension: [int(b) for b in B])
B = B * 1

# check if an object is None
if foo is None: 
  print('is None')

2.2 Complex data types

2.2.1 Range

range(10) # range 0 to 9
list(range(11, 17)) # range to a list: [11, 12, 13, 14, 15, 16]

reversed(range(10)) # reversed range 9 to 0

list(reversed(range(10))) # materialize to a list: [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]
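
range() also accepts a step argument:

list(range(0, 10, 2)) # [0, 2, 4, 6, 8]
list(range(10, 0, -2)) # [10, 8, 6, 4, 2]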

2.2.2 List

Lists are mutable and can contain any data type (including other lists).
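
For example, both properties at once:

li = [1, 'two', [3, 4]] # mixed element types, including a nested list
li[0] = 99 # mutable: elements can be reassigned in place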

li = [3, 4, 5]

li = [None]*5 # [None, None, None, None, None]

# check if it is empty
if not li:
  print("List is empty")
  
# check if it is in the list
if 'b' in ['a', 'b', 'c'] :
    print('Yes, b found in List')

# sub list
li[0] # 3
li[0:2] # [inclusive, exclusive): [3, 4]
li[-1] # get the last element
li[-2] = 3 # Set the second last element
list(li[i] for i in [0, 2]) # first and third element 
li[::2] # every second element; general slice syntax is li[start:stop:step]

# last n elements
a = [1, 2, 3, 4, 5, 6, 7, 8]
a[-3:]
# [6, 7, 8]

# add or remove element
li = li + [6]
del li[5] # delete the element at index 5
li.remove(4) # removes the first matching value, not a specific index
li.pop(1) # removes the item at a specific index and returns it
n = 5
li = li[n:] # remove the first n elements of the list

# append an element - append is a destructive operation (it modifies the list in place instead of returning a new list)
li.append('bar')

# combine two lists
l1 = [1,2,3]
l2 = [4,5,6]
mergedlist = l1 + l2

# concatenate element-wise two lists in Python
a = [0, 1, 5, 6, 10, 11] 
b = ['asp1', 'asp1', 'asp1', 'asp1', 'asp2', 'asp2']
[m+str(n) for m,n in zip(b,a)]

# copy: reference only
li2 = li
# copy: explicit copy 
li2 = li[:] # or li2 = list(li)

[float(n)] # wrap a float in a single-element list

len(li) # length of list

3 in li # check if the element is in the list

matrix = [ [1, 2, 3, 4], [5, 6, 7, 8], ] # nested list

# check if all elements of a list are False
li = [False, False, False]
not any(li) # any() returns True if there is any truthy value in the iterable
# True

# loop over a list with indices
li = ['a', ' b', 'c ', ' d ']
for i, s in enumerate(li):
    li[i] = s.strip()
print(li) # ['a', 'b', 'c', 'd']

# list comprehension over a list of tuples
li = [(1, 2), (1, 3), (2, 3)]
[sum(tup) for tup in li]
# [3, 4, 5]

# list to string
li = ["This" , "is", "a", "sample", "program"]
' '.join(li) # 'This is a sample program'
','.join(li) # 'This,is,a,sample,program'

# Check if all values in a list are at least a certain number
l = [29, 500, 43]
all(i >= 30 for i in l)
# False (29 < 30)

# Sort list by given order of indices
li =["A","B","C","D","E"]
sortedIndex = [3,2,1,0,4]
[li[i] for i in sortedIndex]
# ['D', 'C', 'B', 'A', 'E']

# Serialize a list to a JSON string and back
import json
with open(data_file, 'w') as dump:
    dump.write(json.dumps(arbitrary_data))

with open(data_file) as source:
    data = json.loads(source.read())

2.2.3 Tuple

Tuples are immutable sequences

t = 12345, 54321, 'hello!'
t[0] # 12345
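
Tuples unpack like lists, but reassigning an element raises an error:

x, y, z = t # x = 12345, y = 54321, z = 'hello!'
t[0] = 1 # raises TypeError: 'tuple' object does not support item assignment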

2.2.4 Set

A set is an unordered collection with no duplicate elements

basket = {'apple', 'orange', 'apple', 'pear', 'orange', 'banana'}

# check if an element is in the set
'apple' in basket
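
Sets also support the usual set operations:

a = {1, 2, 3}
b = {2, 3, 4}
a | b # union: {1, 2, 3, 4}
a & b # intersection: {2, 3}
a - b # difference: {1}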

2.2.5 Dictionary

Dictionaries store key-value pairs, kind of like JSON objects.

# Creating an empty dictionary
d = {} # OR data = dict()

# Creating a dictionary with initial values
d = {'a':2,'b':21,'c':3} # OR data = dict(a=2, b=1, c=3)

# Inserting/Updating a single value
d['a']=1  # Updates if 'a' exists, else adds 'a'
# OR data.update(a=1)

# Inserting/Updating multiple values
d.update({'c':3,'d':4})  # Updates 'c' and adds 'd'

# delete an element by key
del d['a']

# find the position of a key (dicts preserve insertion order since Python 3.7)
index = list(d).index('c')

# find max based on the key (the outputs below assume d = {'a': 2, 'b': 21, 'c': 3})
max(d.items(), key=lambda x: x[0])
# ('c', 3)

# find max based on the value
max(d.items(), key=lambda x: x[1])
# ('b', 21)

# Get list of values for list of keys
mykeys = ["a", "c"]
[d[x] for x in mykeys]
# [2, 3]

# return a key list
key_list = list(d.keys())
# ['a', 'b', 'c']

# return sorted key list
sorted(d, key=(lambda key:d[key]), reverse=True)
# ['b', 'c', 'a']

# return sorted tuple(key, value) list
sorted(d.items(), key=lambda kv: kv[1]) 
# [('a', 2), ('c', 3), ('b', 21)]

# create a dictionary
person = {
            "name": "Amos",
            "age": 23,
            "hobbies": ["Travelling", "Swimming", "Coding", "Music"]
        }

# iterate through the dict and print the keys
for key in person:
    print(key)

# iterate through the dict's keys and print their values
for key in person:
    print(person[key])
    
# Convert two lists into a dictionary
keys = ['a', 'b', 'c']
values = [1, 2, 3]
dictionary = dict(zip(keys, values))
print(dictionary)
# {'a': 1, 'b': 2, 'c': 3}
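
Since dictionaries map naturally onto JSON objects, they round-trip through the json module:

import json
s = json.dumps(dictionary) # '{"a": 1, "b": 2, "c": 3}'
json.loads(s) # back to {'a': 1, 'b': 2, 'c': 3}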

3 Other Data Structures

3.1 Numpy::Array

Arrays can contain only one data type.
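
For example, mixed inputs are upcast to a single common type (exact dtypes may vary by platform):

import numpy as np
np.array([1, 2.5]).dtype # dtype('float64') - the int is upcast to float
np.array([1, 'a']) # array(['1', 'a'], dtype='<U21') - everything becomes a string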

3.1.1 initialization

# convert list to array
li = [3, 4, 5]
ay = np.array(li)

# initialize a constant array
np.full((3, 5), 7)
# array([[7, 7, 7, 7, 7],
#        [7, 7, 7, 7, 7],
#        [7, 7, 7, 7, 7]])

# to contain multiple data types, set the dtype to object
ay = np.array([["a", 0]] * 5, dtype='O')
# array([['a', 0],
#        ['a', 0],
#        ['a', 0],
#        ['a', 0],
#        ['a', 0]], dtype=object)

# get second value of each element in the array
ay[:, 1]
# array([0, 0, 0, 0, 0], dtype=object)

# get second value of first element in the array
ay[0, 1]
# 0

# set the value 
ay[0, 1] = 15

# find the index of element which second value > 3
np.where(ay[:,1]>3)
# (array([0], dtype=int64),)

# check whether any element's second value equals 5
if len(np.where(ay[:, 1] == 5)[0]) > 0:
    print('found')

# a range of values
np.arange(0, 5, 0.5)
# array([0. , 0.5, 1. , 1.5, 2. , 2.5, 3. , 3.5, 4. , 4.5])

# random values
np.random.randint(10, size=2) # random integers in [0, 10)
np.random.choice(10, size=2, replace=False) # random integers without duplicates
np.random.rand(5) # random floats between 0 and 1

# convert np array to list
ay.tolist()

3.1.2 shape

ay = np.array([[1,2], [3,4]])

# shape of array
ay.shape

# Return a copy of the array collapsed into one dimension
ay.flatten() 
# array([1, 2, 3, 4])

np.reshape(ay, 4)
# array([1, 2, 3, 4])

3.1.3 append / delete

# append an array (returns a new, flattened array)
ay = np.append(ay, ay1)

# delete an element (returns a new array)
ay = np.delete(ay, index)

# count the occurrences of each item in an ndarray
a = np.array([0, 3, 0, 1, 0, 1, 2, 1, 0, 0, 0, 0, 1, 3, 4])
unique, counts = np.unique(a, return_counts=True)

3.1.4 combine

a = np.array([[1, 2], [3, 4]])
b = np.array([[5, 6]])
np.concatenate((a, b), axis=0)
# array([[1, 2],
#        [3, 4],
#        [5, 6]])

np.concatenate((a, b.T), axis=1)
# array([[1, 2, 5],
#        [3, 4, 6]])

np.concatenate((a, b), axis=None)
# array([1, 2, 3, 4, 5, 6])

3.1.5 subset

# subset by index
arr = np.array([10, 20, 30, 40, 50, 60])
arr[[1,4,5]]
# array([20, 50, 60])

# subset by index for multidimensional arrays:
arr = np.arange(9).reshape(3,3)
arr
# array([[0, 1, 2],
#       [3, 4, 5],
#       [6, 7, 8]])
arr[[0, 1, 1], [1, 0, 2]]
# array([1, 3, 5])

# the ith column of a np multidimensional array 
test = np.array([[1, 2], [3, 4], [5, 6]])
test[:,0] 
# array([1, 3, 5])

3.1.6 compare

ay1 = np.array([2,3,4,6,7])
ay2 = np.array([1,4,5,5,6,8,8,9])

np.in1d(ay1, ay2) # older alias of np.isin
# array([False, False,  True,  True, False])

np.setdiff1d(ay1, ay2)
# array([2, 3, 7])

np.isin(ay1, ay2)
# array([False, False,  True,  True, False])

np.where(np.isin(ay1, ay2))
# (array([2, 3], dtype=int64),)   

3.1.7 sum

mat = np.array([[1,2,3], [1,2,3], [1,2,3]])
print(mat)
#     [[1 2 3]
#     [1 2 3]
#     [1 2 3]]

np.sum(mat, axis=1) # axis=1 sums across columns, giving one sum per row: [6 6 6]

3.2 Pandas::Series

http://manishamde.github.io/blog/2013/03/07/pandas-and-python-top-10/

A one-dimensional labeled array capable of holding any data type.

# initialize a series from a list
import pandas as pd
s = pd.Series(['string1', 'string2', 'string3'])
s
# 0    string1
# 1    string2
# 2    string3
# dtype: object

s = pd.Series([3, -5, 7, 4], index=['a', 'b', 'c', 'd'])

# get the last element
s.iloc[-1] # positional, always works
s[-1] # positional fallback; only works when the index has no integer labels (deprecated in newer pandas)

# convert to an numpy.ndarray
pd.Series([1, 2, 3]).values
# array([1, 2, 3])

# convert to a list
pd.Series([1, 2, 3]).tolist()

# reset the index
s.reset_index(drop=True, inplace=True) # drop: discard the old index entirely; inplace: modify the Series in place

# literal string replace (set regex=True to treat 'f.' as a regex pattern)
pd.Series(['f.o', 'fuz', np.nan]).str.replace('f.', 'ba', regex=False)

# -- Factorize ---
s_class = pd.Series(['a','b','c','d'])

s_class_id = s_class.factorize()[0] # convert string to numbers
s_class_id
# array([0, 1, 2, 3], dtype=int64)

s_class_id, mapping = s_class.factorize() # keep mapping class
mapped_back_to_class = mapping.take(pd.Series([3,2,3,1]))
mapped_back_to_class
# Index(['d', 'c', 'd', 'b'], dtype='object')

# -- check if ALL values in a Series are NaN
s.isnull().all()

# -- apply 
pd.Series([1, 2, 3]).apply(lambda x :x*2)
# 0    2
# 1    4
# 2    6

3.3 Pandas::DataFrame

Initialize a data frame

df = pd.DataFrame(
          {"a":[11 ,12, 13], "b":[21, 22, 23], "c":[31, 32, 33]},    
          index = [1, 2, 3])
#     a   b   c
# 1  11  21  31
# 2  12  22  32
# 3  13  23  33

# l is a list of dicts to convert to a dataframe
l = [{"a":1, "b":1}, {"a":2, "b":2}]
df = pd.DataFrame(l)
#    a  b
# 0  1  1
# 1  2  2

# numpy array to data frame

X = np.array([0, 3, 0, 1])
df = pd.DataFrame(X, index=np.arange(0,len(X)))

# Out[8]:
#    0
# 0  0
# 1  3
# 2  0
# 3  1

X = np.array([[0, 3, 0], [2, 2, 2], [7, 7, 7]])
df = pd.DataFrame(X, index=np.arange(0,len(X)))

# Out[10]: 
#    0  1  2
# 0  0  3  0
# 1  2  2  2
# 2  7  7  7

# from dict to dataframe
pd.DataFrame.from_dict(d) # d is a dict

df.mean()  # mean of each column
df.shape   # (rows,columns)
df.columns # column names
df.index   # Describe index
df.info()  # Info on DataFrame
df.count() # Number of non-NA values

# column rename
df = pd.DataFrame({'aa':[1,2], 'bb': [10,20]})
df.columns = ['a', 'b']
df
#    a   b
# 0  1  10
# 1  2  20
df = df.rename({'a': 'axx'}, axis='columns') # or axis=1


df[3].value_counts() # frequency table of a column, e.g. column 3

ay = df.values # convert a pandas dataframe to a numpy ndarray

df = df.rename(columns={'old_name':'new_name'}) # rename a column name

df.isnull().all() # per-column check; use df.isnull().all().all() to check if ALL values in the DataFrame are NaN

3.3.1 handle duplicates

df.drop_duplicates() # Remove duplicate rows (only considers columns).

df3 = df3[~df3.index.duplicated(keep='first')] #Remove rows with duplicate indices, using the duplicated method on the Pandas Index itself

# keep only the last entry from duplicate values
df = pd.DataFrame({'A': ['x1', 'x2', 'x3', 'x1',], 'B': [85, 70, 80, 85]})               
#    A   B
#0  x1  85
#1  x2  70
#2  x3  80
#3  x1  85
df2 = df.drop_duplicates(subset = ['A','B'], keep ='last').reset_index(drop = True)
df2
#    A   B
#0  x2  70
#1  x3  80
#2  x1  85

3.3.2 handle NaN

df.fillna(0) # Replace all the NaN values with Zero

df1 = df.replace(np.nan, '', regex=True) # Replace all the NaN with blank/empty string

df.dropna(axis=1, how='all') # Drop the columns where all elements are NaN

3.3.3 to numpy array

df = pd.DataFrame({'P': [1, 2], 'Q': [0.7, 0.85]},
                  index=['p', 'q'])
#    P     Q
# p  1  0.70
# q  2  0.85

# to a structured numpy array; it keeps column names and data type information
df.to_records()
# rec.array([('p', 1, 0.7 ), ('q', 2, 0.85)],
#           dtype=[('index', 'O'), ('P', '<i8'), ('Q', '<f8')])

# convert back to dataframe
pd.DataFrame(df.to_records())
#   index  P     Q
# 0     p  1  0.70
# 1     q  2  0.85

pd.DataFrame(df.to_records()).set_index('index')
#        P     Q
# index         
# p      1  0.70
# q      2  0.85

3.3.4 sort

# Order rows by values of a column 
df.sort_values('mpg') # (low to high).
df.sort_values('mpg', ascending=False) # (high to low).

# sort by two or more columns
df1 = df1.sort_values(['a', 'b'], ascending=[True, False])

# Shuffle DataFrame rows
df.sample(frac=1) # frac specifies the fraction of rows to return in the random sample; frac=1 means return all rows, shuffled

# random n row selection
df.sample(n)

# reset index
df = df.reset_index(drop=True)

3.3.5 subset

# change value of a cell
df.at['C', 'x'] = 10

#--------- row selection ---------------

df.head(n) # Select first n rows.
df.tail(n) # Select last n rows.

# based on position
df[0:3] # select the first 3 rows (rows 0,1,2)
df[:5]  # select the first 5 rows (rows 0,1,2,3,4)
df[-1:] # select the last row 

#iloc: Select Column & Row by Positions (it only takes integers).
# rows:
df.iloc[0] # select the first row
df.iloc[2:] # select every row beginning with the third row
df.iloc[-1] # last row of data frame 
# Columns:
df.iloc[:,0] # first column of data frame
df.iloc[:,1] # second column of data frame
df.iloc[:,-1] # last column of data frame
# rows and columns
df.iloc[0:3, 1:4] # select the first 3 rows (rows 0,1,2) and (columns 1, 2 and 3)

# return type
df.iloc[3]    # return a pandas series when one row is selected
df.iloc[[3]]  # return a pandas dataframe. pass a single valued list to return a dataframe
df.iloc[2:3]  # return a pandas dataframe when multiple rows are selected

df=pd.DataFrame(
          {'age':[30,24,32], 'name':['john','jack','luck'], 'gender':['M','M','F']},
          index=['a', 'b', 'c'])
#   age  name gender
# a   30  john      M
# b   24  jack      M
# c   32  luck      F

#loc: Select Column & Rows by name/index_label
# rows:
df.loc[:3] # slice up to and including label 3 (when the index labels are integers)
df.loc[:'b'] # slice up to and including label 'b'
df.loc['b':'c',:] # select rows from b to c
df.loc[['b','c'],:] # select rows by multiple names/labels
df.loc[df['column_name'] == some_value] # select rows whose column value equals a scalar
df.loc[~df['column_name'].isin(some_values)] # select rows whose column value is NOT in an iterable (~ negates)
df.loc[(df['column_name'] == some_value) & df['other_column'].isin(some_values)] 

df[df.Length>7] # select rows that meet logical criteria.

# loop over each row
for index, row in df.iterrows():
    print(row['column_name_1'], row['column_name_2'])
# 10 100
# 11 110
# 12 120

# copy as a new df, not only reference
df2 = df[df['B'] == 'b.2'].copy()

#--------- column selection ---------------

# Select columns by names
df['name'] 
df[['age', 'name']] 
df.columns.get_loc("column_name") # integer position of a column

df.loc[:, ['age', 'name']] # = df[['age', 'name']] 
# select col_name1, col_name2 from table where column_name = some_value.
df.loc[df['column_name'] == some_value][[col_name1, col_name2]]

# Select columns by regular expression
df.filter(regex='name|address')

# select columns by a list
lst = ['name','age','weight']
df[np.intersect1d(df.columns, lst)]
#    age  name
# a   30  john
# b   24  jack
# c   32  luck

# select Rows by integers and Columns by names
df.loc[df.index[1], 'name'] # second index by df.index[1]:
# or 
df.iloc[1, df.columns.get_loc('name')] # get position of column name by get_loc

# --------- Cell value selection ---------------

# if subset is one row dataframe, use .values[0] to get the value
df.loc[df['name']=='jack'] # subset result has only one row
#    age  name gender
# b   24  jack      M
df.loc[df['name']=='jack']['age'] # it return a series
# b    24
# Name: age, dtype: int64
df.loc[df['name']=='jack']['age'].values[0]
# 24

3.3.6 add

# append a column
df['new_col'] = pd.Series(np.arange(0, 100), index=df.index)

# append a list as a row
df = pd.DataFrame([[1, 2], [3, 4]], columns = ["a", "b"])
#    a  b
# 0  1  2
# 1  3  4

# first approach
li = [5, 6]
df.loc[len(df)] = li
#    a  b
# 0  1  2
# 1  3  4
# 2  5  6

# second approach (df.append was deprecated and removed in pandas 2.0; use pd.concat)
a_series = pd.Series(li, index = df.columns)
df = pd.concat([df, a_series.to_frame().T], ignore_index=True) # does not use the index labels of a_series

3.3.7 remove

df.drop(df.tail(n).index) # drop last n rows
df.drop(df.head(n).index) # drop first n rows

# delete rows based on index
df = df.drop(df.index[1: 4])

# delete column on i index
df = df.drop(df.columns[i], axis=1) # note: removes all columns with that name if column names are duplicated

# delete column with column name
df = df.drop(['column_name_A', 'column_name_B'], axis=1)

# delete columns based on regular expression
df = df[df.columns.drop(list(df.filter(regex='Test')))]

# drop columns which have same values in all rows
nunique = df.apply(pd.Series.nunique)
cols_to_drop = nunique[nunique == 1].index
df.drop(cols_to_drop, axis=1)

3.3.8 summary

# sum across columns (one total per row)
df.sum(axis=1)

3.3.9 group

# groupby dataframe by key
grouped = df.groupby('gender')

grouped.groups.keys() # list keys

for name, group in grouped:
    print('key:', name)
    print('value:', group)
    
data = pd.DataFrame({'user_id': ['a1', 'a1', 'a1', 'a2', 'a2', 'a2', 'a3', 'a3', 'a3'],
                     'product_id': ['p1', 'p1', 'p2', 'p1', 'p1', 'p1', 'p2', 'p2', 'p3'],
                     'product_count': [1, 2, 2, 2, 2, 1, 4, 10, 30]})
count_series = data.groupby(['user_id', 'product_id']).size()
count_series  # has a hierarchical index
# user_id  product_id
# a1       p1            2
#          p2            1
# a2       p1            3
# a3       p2            2
#          p3            1

# Converting a GroupBy output from Series to DataFrame
count_df = count_series.to_frame(name = '_size').reset_index()
count_df
# user_id product_id  _size
# 0      a1         p1     2
# 1      a1         p2     1
# 2      a2         p1     3
# 3      a3         p2     2
# 4      a3         p3     1
sum_series = data.groupby(['user_id', 'product_id'])['product_count'].sum() # Compute sum of group values.
sum_series
# user_id  product_id
# a1       p1             3
#          p2             2
# a2       p1             5
# a3       p2            14
#          p3            30

# select top n row from each group after group by in pandas
data.sort_values('product_count', ascending=False).groupby(['user_id'], sort=False).head(3)

3.3.10 combining DataFrames

# Append a dataframe
df1 = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'))
df1
#    A  B
# 0  1  2
# 1  3  4

df2 = pd.DataFrame([[1, 6], [7, 8]], columns=list('AB'))

# -----------  pd.concat  -----------------
df = pd.concat([df1, df2]) # concatenate a list of dataframes along row
#   A  B
#0  1  2
#1  3  4
#0  1  6
#1  7  8

# reset index if necessary
df = df.reset_index(drop=True) 

pd.concat([df1, df2], axis=1) # concatenate a list of dataframes along column
#   A  B  A  B  
#0  1  2  1  6
#1  3  4  7  8

# -----------  pd.merge, Database-style  -----------------
df = pd.merge(df1, df2, on="A") # how='inner' by default.
#   A  B_x  B_y
#0  1    2    6

df = pd.merge(df1, df2, on="A", how="left") 
#   A  B_x  B_y
#0  1    2  6.0
#1  3    4  NaN

3.3.11 SQL query

import pandas as pd
table_name = "test"
df = pd.DataFrame(
    {"a": [11, 12, 13], "b": [21, 22, 23], "c": ["c1", "c2", "c3"]},
    index=[1, 2, 3])
sql = '''select a, c from test where b >=22 '''

# ----------- sqlite3 -----------------------
import sqlite3
cnx = sqlite3.connect(':memory:')  # create a db connection
df.to_sql(name=table_name, con=cnx)  # load data into db
output = pd.read_sql(sql, cnx)
print(output)
#     a   c
# 0  12  c2
# 1  13  c3

# --------- sqlglot -------------------------
from sqlglot.executor import execute
tables = {table_name: df.to_dict("records")}  # load data into a dict
# {'test': [{'a': 11, 'b': 21, 'c': 'c1'}, {'a': 12, 'b': 22, 'c': 'c2'}, {'a': 13, 'b': 23, 'c': 'c3'}]}
output = execute(sql, tables=tables)
sql_df = pd.DataFrame(output.rows, columns=output.columns)
print(sql_df)

# ------- duckdb -----------------
import duckdb
g = globals()
g[table_name] = df  # load data in global variable
output = duckdb.query(sql).to_df()
print(output)

# --------- pandasql ------------
from pandasql import sqldf
g = globals()
g[table_name] = df  # load data in global variable
output = sqldf(sql)
print(output)

3.3.12 rolling

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.rolling.html

df['new_col'] = df['column'].rolling(5).mean() # rolling mean over a 5-row window of a specific column

# Pandas Rolling Apply custom function
def my_test(vals):
    values = vals.values
    return values[0]
    
df['new_col'] = df['column'].rolling(5).apply(my_test)

3.3.13 type converting

# type of each column
df.dtypes

# convert ALL columns to strings
df = df.astype(str)
# convert selected columns 
df[["D", "E"]] = df[["D", "E"]].astype(int) 

# To Factors
s = pd.Series(['single', 'touching', 'nuclei', 'dusts', 'touching', 'single', 'nuclei'])
s_enc = pd.factorize(s) 

4 Date time

4.1 datetime

import datetime as dt

# date 
dt.date.today()           # datetime.date(2021, 11, 19)

# datetime 
dt.datetime.now()         # current datetime
dt.datetime.now().year    # likewise .month, .hour, etc.
dt.datetime.now().month
'%02d' % dt.datetime.now().month  # '04'
dt.datetime.now().strftime('%Y-%m-%d %H:%M:%S') # format a timestamp
dt.datetime.now() - dt.timedelta(days=2) # day before yesterday
dt.datetime.now() - dt.timedelta(minutes=15)
dt.datetime.now().date() # keep only the date part of a datetime
dt.datetime.today() + dt.timedelta(days=2)  # 2 days from now

# combine date and time strings to single datetime object 
mytime = dt.datetime.strptime('0130','%H%M').time()
mydatetime = dt.datetime.combine(dt.date.today(), mytime)

# ---- unix time ------
# convert (Unix) timestamp (the number of seconds since January 1st, 1970). 
dt.datetime.fromtimestamp(1485714600).strftime("%A, %B %d, %Y %I:%M:%S")
# 'Sunday, January 29, 2017 08:30:00' (depends on the local timezone)

# DateTime to Unix timestamp with 13 digits
presentDate = dt.datetime.now()
unix_timestamp = dt.datetime.timestamp(presentDate)*1000
print(unix_timestamp)

# subtract 5 minutes from a Unix timestamp (in milliseconds)
five_minutes_before = timestamp - 5 * 60 * 1000

Convert string to datetime

from datetime import datetime
datetime.strptime('Jun 1 2005 1:33PM', '%b %d %Y %I:%M%p')
# datetime.datetime(2005, 6, 1, 13, 33)

https://stackoverflow.com/questions/53892450/get-the-format-in-dateutil-parse
https://stackabuse.com/converting-strings-to-datetime-in-python/

formats = ['%Y-%m-%dT%H:%M:%S*%f%z','%Y %b %d %H:%M:%S.%f %Z','%b %d %H:%M:%S %z %Y','%d/%b/%Y:%H:%M:%S %z','%b %d, %Y %I:%M:%S %p','%b %d %Y %H:%M:%S','%b %d %H:%M:%S %Y','%b %d %H:%M:%S %z','%b %d %H:%M:%S','%Y-%m-%dT%H:%M:%S%z','%Y-%m-%dT%H:%M:%S.%f%z','%Y-%m-%d %H:%M:%S %z','%Y-%m-%d %H:%M:%S%z','%Y-%m-%d %H:%M:%S,%f','%Y/%m/%d*%H:%M:%S','%Y %b %d %H:%M:%S.%f*%Z','%Y %b %d %H:%M:%S.%f','%Y-%m-%d %H:%M:%S,%f%z','%Y-%m-%d %H:%M:%S.%f','%Y-%m-%d %H:%M:%S.%f%z','%Y-%m-%dT%H:%M:%S.%f','%Y-%m-%dT%H:%M:%S','%Y-%m-%dT%H:%M:%S%Z','%Y-%m-%dT%H:%M:%S.%f','%Y-%m-%dT%H:%M:%S','%Y-%m-%d*%H:%M:%S:%f','%Y-%m-%d*%H:%M:%S','%y-%m-%d %H:%M:%S,%f %z','%y-%m-%d %H:%M:%S,%f','%y-%m-%d %H:%M:%S','%y/%m/%d %H:%M:%S','%y%m%d %H:%M:%S','%Y%m%d %H:%M:%S.%f','%m/%d/%y*%H:%M:%S','%m/%d/%Y*%H:%M:%S','%m/%d/%Y*%H:%M:%S*%f','%m/%d/%y %H:%M:%S %z','%m/%d/%Y %H:%M:%S %z','%H:%M:%S','%H:%M:%S.%f','%H:%M:%S,%f','%d/%b %H:%M:%S,%f','%d/%b/%Y:%H:%M:%S','%d/%b/%Y %H:%M:%S','%d-%b-%Y %H:%M:%S','%d-%b-%Y %H:%M:%S.%f','%d %b %Y %H:%M:%S','%d %b %Y %H:%M:%S*%f','%m%d_%H:%M:%S','%m%d_%H:%M:%S.%f','%m/%d/%Y %I:%M:%S %p:%f','%m/%d/%Y %H:%M:%S %p']

datestrings = ['2018-08-20T13:20:10*633+0000','2017 Mar 03 05:12:41.211 PDT','Jan 21 18:20:11 +0000 2017','19/Apr/2017:06:36:15 -0700','Dec 2, 2017 2:39:58 AM','Jun 09 2018 15:28:14','Apr 20 00:00:35 2010','Sep 28 19:00:00 +0000','Mar 16 08:12:04','2017-10-14T22:11:20+0000','2017-07-01T14:59:55.711+0000','2017-08-19 12:17:55 -0400','2017-08-19 12:17:55-0400','2017-06-26 02:31:29,573','2017/04/12*19:37:50','2018 Apr 13 22:08:13.211*PDT','2017 Mar 10 01:44:20.392','2017-03-10 14:30:12,655+0000','2018-02-27 15:35:20.311','2017-03-12 13:11:34.222-0700','2017-07-22T16:28:55.444','2017-09-08T03:13:10','2017-03-12T17:56:22-0700','2017-11-22T10:10:15.455','2017-02-11T18:31:44','2017-10-30*02:47:33:899','2017-07-04*13:23:55','11-02-11 16:47:35,985 +0000','10-06-26 02:31:29,573','10-04-19 12:00:17','06/01/22 04:11:05','150423 11:42:35','20150423 11:42:35.173','08/10/11*13:33:56','11/22/2017*05:13:11','05/09/2017*08:22:14*612','04/23/17 04:34:22 +0000','10/03/2017 07:29:46 -0700','11:42:35','11:42:35.173','11:42:35,173','23/Apr 11:42:35,173','23/Apr/2017:11:42:35','23/Apr/2017 11:42:35','23-Apr-2017 11:42:35','23-Apr-2017 11:42:35.883','23 Apr 2017 11:42:35','23 Apr 2017 10:32:35*311','0423_11:42:35','0423_11:42:35.883','8/5/2011 3:31:18 AM:234','9/28/2011 2:23:15 PM']

from datetime import datetime

def parse_timestamp(datestring, formats):
    for f in formats:
        try:
            return (datetime.strptime(datestring, f), f)
        except ValueError:
            continue
    return (datestring, 'Unable to parse format')
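
For example, scanning the format list for the first pattern that fits:

parse_timestamp('2017-09-08T03:13:10', formats)
# (datetime.datetime(2017, 9, 8, 3, 13, 10), '%Y-%m-%dT%H:%M:%S')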

4.2 Pandas Timestamp

# --- convert a string column to pandas datetime
df = pd.DataFrame({'timestamp_col':['2005-05-23 00:00:00', '2005-05-24 00:00:00']})
#          timestamp_col
#0   2005-05-23 00:00:00
#1   2005-05-24 00:00:00
df.timestamp_col = pd.to_datetime(df.timestamp_col, format="%Y-%m-%d %H:%M:%S")
type(df.timestamp_col[0])
# <class 'pandas._libs.tslibs.timestamps.Timestamp'>


# --- Group datetime column (timestamp_col) into hour and minute aggregations for values (value_col)
df.groupby([df.timestamp_col.dt.hour, df.timestamp_col.dt.minute]).value_col.sum()


# remove rows whose datetime value (column 3) is NaT
df = df[df[3].notna()]

pd.period_range(start='2017-01-01', end='2017-10-01', freq='M')
# PeriodIndex(['2017-01', '2017-02', '2017-03', '2017-04', '2017-05', '2017-06',
#              '2017-07', '2017-08', '2017-09', '2017-10'],
#             dtype='period[M]', freq='M')

(time2-time1).total_seconds() #Convert timedelta to total seconds

# --- Converting between python_datetime and Pandas_Timestamp objects
ts = pd.Timestamp('2014-01-23 00:00:00', tz=None)
# Timestamp('2014-01-23 00:00:00', tz=None)
ts.to_pydatetime()
# datetime.datetime(2014, 1, 23, 0, 0)

# --- Convert unix time to panda.datetime
df
#          date  price
# 0  1349720105  12.08
# 1  1349806505  12.35
df['date'] = pd.to_datetime(df['date'], unit='s')
df
#                  date  price
# 0 2012-10-08 18:15:05  12.08
# 1 2012-10-09 18:15:05  12.35


result_ms=pd.to_datetime('1493530261000', unit='ms')
str(result_ms) # -> '2017-04-30 05:31:01'

# --- round time with dt.round ---
df = pd.DataFrame(
  {'Date' : pd.date_range('1/1/2011', periods=5, freq='3675S'),
   'Num' : np.random.rand(5)})
#                 Date       Num
#0 2011-01-01 00:00:00  0.580997
#1 2011-01-01 01:01:15  0.407332
#2 2011-01-01 02:02:30  0.786035
#3 2011-01-01 03:03:45  0.821792

# hourly round (the Num values are random, so they differ between runs)
df.assign(Date=df.Date.dt.round('H'))
#                 Date       Num
#0 2011-01-01 00:00:00  0.577957
#1 2011-01-01 01:00:00  0.995748
#2 2011-01-01 02:00:00  0.864013
#3 2011-01-01 03:00:00  0.468762

# --- Use Pandas rolling method, 3600s = 1 Hour -----
df_sample['occurrences_in_last_hour'] = df_sample['occurrences'].rolling('3600s').sum()
df_sample.head(15)
#                     occurrences  occurrences_in_last_hour
#Run time                                                   
#2020-01-01 00:00:00            1                       1.0
#2020-01-01 00:04:10            1                       2.0
#2020-01-01 00:08:20            1                       3.0
#2020-01-01 00:12:30            1                       4.0
#2020-01-01 00:16:40            1                       5.0

5 Regular expression

import re
# search
m = re.search('(?<=abc)def', 'abcdef')
m.group(0)
# 'def'

# sub
re.sub(regex_search,regex_replace,contents)
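
For example, replacing every run of digits with a placeholder:

re.sub(r'\d+', '#', 'abc 123 def 456')
# 'abc # def #'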

6 IO

6.1 file path

import pathlib

CODE_PATH = pathlib.Path(__file__).parent
DATA_PATH = CODE_PATH.joinpath('data')
LOG_PATH = CODE_PATH.joinpath('log.txt')

6.2 Read write CSV

df = pd.read_csv('file.csv', header=None, nrows=5)

df.to_csv("myDataFrame.csv", index=False, sep=',', encoding='utf-8') # to_dense() was removed from pandas

pd.read_excel('file.xlsx')
df.to_excel('dir/myDataFrame.xlsx', sheet_name='Sheet1') # to_excel is a DataFrame method

6.3 Read head lines of a large text file

file_name = "C:\\Users\\xxxxx\\Desktop\\2014101709-12.txt"

with open(file_name) as myfile:
    head = [next(myfile) for x in range(20)]
print(head)

6.4 List files only in the current directory

import os
files = [f for f in os.listdir('.') if os.path.isfile(f)]
for f in files:
    print(f)

6.5 Read and write objects to pkl files

import pickle
from pathlib import Path

def save_object(obj, filename):
    with open(filename, 'wb') as output:  # overwrites any existing file
        pickle.dump(obj, output, pickle.HIGHEST_PROTOCOL)
        
def read_object(filename):
    if Path(filename).is_file():
        with open(filename, 'rb') as infile:
            return pickle.load(infile)
    return None
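
A quick usage sketch (the filename is illustrative):

save_object({'a': 1}, 'obj.pkl')
restored = read_object('obj.pkl') # {'a': 1}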

7 Others

7.1 zip

numbers = [1, 2, 3]
letters = ['a', 'b', 'c']
zipped = zip(numbers, letters)
zipped  # Holds an iterator object
# <zip object at 0x7fa4831153c8>
type(zipped)
# <class 'zip'>
list(zipped)
# [(1, 'a'), (2, 'b'), (3, 'c')]
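
The inverse operation, unzipping, uses the * operator:

pairs = [(1, 'a'), (2, 'b'), (3, 'c')]
numbers, letters = zip(*pairs)
# numbers = (1, 2, 3), letters = ('a', 'b', 'c')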

7.2 logging

import logging

try:
    print(5 / 0)
except Exception as e:
    logging.warning(e, exc_info=True) # exc_info=True appends the full traceback
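
To route log output to a file instead of stderr, a minimal sketch (the filename and format are illustrative; call basicConfig once, before any other logging call):

logging.basicConfig(filename='app.log', level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')
logging.info('pipeline started') # appended to app.log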

7.3 lambda, map, filter, reduce

# --------- lambda -----------
lambda arguments: expression # Anonymous function objects 

# - function -
def add(x, y): 
  return x + y
add(2, 3)  # Output: 5

# - lambda arguments : expression -
add = lambda x, y : x + y 
add(2, 3) # Output: 5

# -------- map ----------
map(function, iterable(s)) # map() applies the function to every item of the given iterable(s)

li= [1,3,7,8]
map_object = map(lambda x: x*2, li)
print(list(map_object)) # map() returns a map object, which can be converted to a list

# -------- filter ----------
filter(function, iterable(s)) # filter() keeps only the elements for which the function returns True

li= [1,3,7,8]
filter_object  = filter(lambda x: x%2==0, li)
print(list(filter_object))

# --------- reduce ----------
reduce(function, sequence[, initial]) # reduce() calls the function on the first two items, then on that result and the next item, and so on, reducing the sequence to a single value

from functools import reduce
li= [1,3,7,8]
reduce_object  = reduce(lambda x,y: x+y, li)
print(reduce_object)

7.4 underscore _

# storing the value of last expression in interpreter
>>> 10 
10 
>>> _ * 3  
30

# For Ignoring the values
x, _, y = (1, 2, 3) # x = 1, y = 3

# _single_leading_underscore:
# by convention marks variables, functions, methods and classes as internal/private to a module
class _Base: # private class
    _hidden_factor = 2 # private variable
    
# __double_leading_underscore:
# for mangling 
class A:
    def __double_method(self): # the name will be mangled in "_ClassName__method" form.
        pass
        
# __double_leading_and_trailing_underscore__ : 
#  for special variables or methods 
__file__ # the path of the current Python file
__init__ # runs first when an instance of a class is created

7.5 reload imported module

import importlib
import foo #import the module here, so that it can be reloaded.
importlib.reload(foo)

7.6 yield

The yield statement suspends a function's execution and sends a value back to the caller. When the caller requests the next value, the function resumes immediately after the last yield it ran.

import time
# the following two generator functions are equivalent
def simpleGeneratorFun():
    print("sending 1")
    yield 1
    print("sending 2")
    yield 2
    print("sending 3")
    yield 3

def simpleGeneratorFun():
    for x in [1, 2, 3]:
        print(f"sending {x}")
        yield x

# caller code to check above generator function
for value in simpleGeneratorFun():
    print(f"received {value}")
    time.sleep(5)
    print(f"finished sleep")

# - output -  
# sending 1
# received 1
# finished sleep
# sending 2
# received 2
# finished sleep
# sending 3
# received 3
# finished sleep

7.7 Abstract Classes

from abc import ABC, abstractmethod


# abc is a builtin module, we have to import ABC and abstractmethod
class Animal(ABC):  # Inherit from ABC(Abstract base class)
    @property
    def food_eaten(self):
        return self._food

    @food_eaten.setter
    def food_eaten(self, food):
        if food in self.diet:
            self._food = food
        else:
            raise ValueError(f"You can't feed this animal with {food}.")

    @property
    @abstractmethod  # combination of @property and @abstractmethod in order to define an abstract property.
    def diet(self):
        pass

    @abstractmethod  # decorator to define an abstract method; a class inheriting from an ABC must implement all its abstract methods
    def feed(self, time):
        pass


class Lion(Animal):
    @property
    def diet(self):
        return ["antelope", "cheetah", "buffaloe"]

    def feed(self, time):
        print(f"Feeding a lion with {self._food} meat! At {time}")


class Snake(Animal):
    @property
    def diet(self):
        return ["frog", "rabbit"]

    def feed(self, time):
        print(f"Feeding a snake with {self._food} meat! At {time}")


leo = Lion()
print(leo.diet)
leo.food_eaten = "antelope"
leo.feed("10:10 AM")

adam = Snake()
adam.food_eaten = "frog"
adam.feed("10:20 AM")

# ['antelope', 'cheetah', 'buffaloe']
# Feeding a lion with antelope meat! At 10:10 AM
# Feeding a snake with frog meat! At 10:20 AM

8 Real-time plot with Python

import numpy as np
import matplotlib.pyplot as plt
plt.axis([0, 10, 0, 1])
plt.ion()

for i in range(100):
    y = np.random.random()
    plt.scatter(i, y)
    plt.pause(0.5)
    
while True:
    plt.pause(0.5)

9 Plot a histogram

import random
values = [random.randint(0, 10) for _ in range(50)]
import matplotlib.pyplot as plt
plt.hist(values, bins=11) # 11 bins cover the integer values 0..10

# plot a vertical line
plt.axvline(x=3)