Pandas

df.head()       # first rows
df.tail()       # last rows
df.info()       # column dtypes, non-null counts, memory usage
df.describe()   # summary statistics for numeric columns
df.nunique()    # number of unique values per column

Documentation

Snippets

groupby()

# non-null counts of one column per group
df.groupby(['action']).count()['controller']
df.groupby(['context_type', 'controller', 'action']).count()['links.user']
# row counts per group (nulls included)
df.groupby(['action']).size()
# aggregate, then turn the group keys back into regular columns
df.groupby(['controller', 'action']).sum().reset_index()
# unstack() pivots the innermost group key ('action') into columns
df.groupby(['controller', 'action']).sum().unstack().reset_index()
df.groupby(['controller', 'action']).size().reset_index()
df.groupby(['controller', 'action']).size().unstack().reset_index()
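
count() and size() differ when a column has nulls; a quick sketch on made-up data:

import pandas as pd

toy = pd.DataFrame({
    'action': ['show', 'show', 'index'],
    'controller': ['users', None, 'files'],
})
toy.groupby('action').size()                  # rows per group: index -> 1, show -> 2
toy.groupby('action').count()['controller']   # non-null values: index -> 1, show -> 1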

Unique

# Count unique values for each column
df.nunique()
# Get unique values for a specific column (fill in the column name)
unique_value_in_a_column = df[''].unique()
# Print unique values for all columns
for col in df:
    print(col, df[col].unique())

Generate a sub-df

footprints = df[['created_at', 'context_type', 'action', 'controller', 'url']]

Drop nulls with dropna()

footprints = footprints.dropna()
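
dropna() with no arguments drops a row if any column is null; restricting it to specific columns is often safer (the column names here are just ones from the sub-df above):

# drop rows only when these columns are null
footprints = footprints.dropna(subset=['action', 'controller'])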

Convert a list of JSON records to a df

df = pd.DataFrame.from_records(list_of_json)
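
For example, with a made-up list of records:

list_of_json = [
    {'action': 'show', 'controller': 'users'},
    {'action': 'index', 'controller': 'files'},
]
df = pd.DataFrame.from_records(list_of_json)   # 2 rows, columns: action, controller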

df and csv

df = pd.read_csv('name.csv')
df.to_csv('name.csv')
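
to_csv() writes the index as an extra column by default; a common round-trip pattern that avoids it:

df.to_csv('name.csv', index=False)
df = pd.read_csv('name.csv')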

df and sql

# Configure sqlalchemy; fill in the {placeholders} with real credentials
import sqlalchemy
from sqlalchemy import create_engine
engine = create_engine("mysql+mysqlconnector://{username}:{password}@{server}/{database}")

# Read
sql = '''{};'''
df = pd.read_sql(sql, engine)
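
For large result sets, read_sql() can stream the query in chunks instead of loading everything at once; a minimal sketch (the chunk size is arbitrary):

chunks = pd.read_sql(sql, engine, chunksize=50000)   # iterator of DataFrames
df = pd.concat(chunks, ignore_index=True)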


# Map each column's pandas dtype to a sqlalchemy column type, for to_sql()'s dtype argument
def sqlcol(dfparam):
    dtypedict = {}
    for i, j in zip(dfparam.columns, dfparam.dtypes):
        if "object" in str(j):
            # urls can exceed 255 characters, so give that column extra width
            if i == 'url':
                dtypedict.update({i: sqlalchemy.types.NVARCHAR(length=2048)})
            else:
                dtypedict.update({i: sqlalchemy.types.NVARCHAR(length=255)})

        if "datetime" in str(j):
            dtypedict.update({i: sqlalchemy.types.DateTime()})

        if "float" in str(j):
            dtypedict.update({i: sqlalchemy.types.Float(precision=3, asdecimal=True)})

        if "int" in str(j):
            dtypedict.update({i: sqlalchemy.types.INT()})

    return dtypedict


# Write
outputdict = sqlcol(df)
df.to_sql(tbl_name, con=engine, if_exists='append', index=False, dtype=outputdict)

df.to_sql() and Redshift

# https://github.com/agawronski/pandas_redshift
import sqlalchemy
import pandas_redshift as pr
pr.connect_to_redshift(dbname = '',
                        host = '',
                        port = 5439,
                        user = '',
                        password = '')
                        
pr.connect_to_s3(
    aws_access_key_id = '',
    aws_secret_access_key = '',
    bucket = ''
)

# Build a list of sqlalchemy column types, one entry per column, in column order
# (passed to pandas_to_redshift as column_data_types below)
def list_sqlcol(df):
    dtypelist = []
    for i, j in zip(df.columns, df.dtypes):
        if "object" in str(j):
            # urls can exceed 255 characters, so give that column extra width
            if i == 'url':
                dtypelist.append(sqlalchemy.types.NVARCHAR(length=2048))
            else:
                dtypelist.append(sqlalchemy.types.NVARCHAR(length=255))

        if "datetime" in str(j):
            dtypelist.append(sqlalchemy.types.DateTime())

        if "float" in str(j):
            dtypelist.append(sqlalchemy.types.Float(precision=3, asdecimal=True))

        if "int" in str(j):
            dtypelist.append(sqlalchemy.types.INT())

    return dtypelist
    
outputdict = list_sqlcol(df)
pr.pandas_to_redshift(
    data_frame = df,
    redshift_table_name = '',
    index = False,
    column_data_types = outputdict
)
pr.close_up_shop()

Rename column headers

# method 1
df3.rename(columns=lambda x: x.replace('.', '_'), inplace=True)

# method 2: assign a full list of new names, one per column
df3.columns = []

# method 3
headers = df3.columns.values.tolist()
headers = list(map(lambda x: x.replace('.', '_') , headers))
df3.columns = headers
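
For example, on a made-up frame, the dot-to-underscore rename from method 1 behaves like this:

df3 = pd.DataFrame(columns=['links.user', 'context_type'])
df3.rename(columns=lambda x: x.replace('.', '_'), inplace=True)
list(df3.columns)   # ['links_user', 'context_type']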

Filter rows by string match in a field

# step_2_clean_users_pageviews
# keep rows whose controller contains none of '-', '%', or a space (the pattern is a regex)
df = df[~df['controller'].str.contains('-|%| ')].reset_index(drop=True)
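
If 'controller' can contain nulls, str.contains() returns NaN for those rows and the boolean mask fails; passing na= decides what happens to them (na=False treats nulls as non-matches, so they survive the negated filter; na=True drops them):

# same filter, but explicit about null handling
df = df[~df['controller'].str.contains('-|%| ', na=False)].reset_index(drop=True)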