Data Frame Manipulation -- Python pandas¶

In [1]:

            
                Copied!
                
import numpy as np
import pandas as pd
from pyprojroot import here

print(f"pandas=={pd.__version__}")
import numpy as np
import pandas as pd
from pyprojroot import here

print(f"pandas=={pd.__version__}")

pandas==1.2.2

Load csv¶

In [2]:

            
                Copied!
                
df = pd.read_csv(here() / ".data" / "titanic.csv")
df.head(3)
df = pd.read_csv(here() / ".data" / "titanic.csv")
df.head(3)

Out[2]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON

Summary¶

In [3]:

            
                Copied!
                
# Number of rows
len(df)
# Number of rows
len(df)

Out[3]:

In [4]:

            
                Copied!
                
# Number of columns
len(df.columns)
# Number of columns
len(df.columns)

Out[4]:

In [5]:

            
                Copied!
                
# Both dimensions
df.shape
# Both dimensions
df.shape

Out[5]:

(1309, 14)

In [6]:

            
                Copied!
                
# Inspect types
df.dtypes
# Inspect types
df.dtypes

Out[6]:

pclass         int64
survived       int64
name          object
sex           object
age          float64
sibsp          int64
parch          int64
ticket        object
fare         float64
cabin         object
embarked      object
boat          object
body         float64
home.dest     object
dtype: object

In [7]:

            
                Copied!
                
# Descriptive statistics
df.describe()
# Descriptive statistics
df.describe()

Out[7]:

	pclass	survived	age	sibsp	parch	fare	body
count	1309.000000	1309.000000	1046.000000	1309.000000	1309.000000	1308.000000	121.000000
mean	2.294882	0.381971	29.881135	0.498854	0.385027	33.295479	160.809917
std	0.837836	0.486055	14.413500	1.041658	0.865560	51.758668	97.696922
min	1.000000	0.000000	0.166700	0.000000	0.000000	0.000000	1.000000
25%	2.000000	0.000000	21.000000	0.000000	0.000000	7.895800	72.000000
50%	3.000000	0.000000	28.000000	0.000000	0.000000	14.454200	155.000000
75%	3.000000	1.000000	39.000000	1.000000	0.000000	31.275000	256.000000
max	3.000000	1.000000	80.000000	8.000000	9.000000	512.329200	328.000000

Slice data¶

Doc: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html

Rows¶

In [8]:

            
                Copied!
                
# Select rows by number
df.iloc[1:4,:]

# Note that Python is 0-indexed

# Note that Python treats 1:4 as a half-open interval
# i.e., row 4 is not included
# Select rows by number
df.iloc[1:4,:]

# Note that Python is 0-indexed

# Note that Python treats 1:4 as a half-open interval
# i.e., row 4 is not included

Out[8]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.55	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.55	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON
3	1	0	Allison, Mr. Hudson Joshua Creighton	male	30.0000	1	2	113781	151.55	C22 C26	S	NaN	135.0	Montreal, PQ / Chesterville, ON

In [9]:

            
                Copied!
                
# Select rows by index label (key)
df.loc[1:3]

# This works even for non-integer labels
# Note that label-slicing is end-inclusive
# Select rows by index label (key)
df.loc[1:3]

# This works even for non-integer labels
# Note that label-slicing is end-inclusive

Out[9]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.55	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.55	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON
3	1	0	Allison, Mr. Hudson Joshua Creighton	male	30.0000	1	2	113781	151.55	C22 C26	S	NaN	135.0	Montreal, PQ / Chesterville, ON

In [10]:

            
                Copied!
                
# Select row by condition
df.loc[df['survived'] == 1].head(3)

# df[df['Survived'] == 1] also works
# Select row by condition
df.loc[df['survived'] == 1].head(3)

# df[df['Survived'] == 1] also works

Out[10]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON
5	1	1	Anderson, Mr. Harry	male	48.0000	0	0	19952	26.5500	E12	S	3	NaN	New York, NY

In [11]:

            
                Copied!
                
# Select rows by multiple conditions
df.loc[(df['survived'] == 1) & (df['sex'] == 'female'),:].head(3)

# Note parentheses for order of operations
# Select rows by multiple conditions
df.loc[(df['survived'] == 1) & (df['sex'] == 'female'),:].head(3)

# Note parentheses for order of operations

Out[11]:

	pclass	survived	name	sex	age	sibsp	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
6	1	1	Andrews, Miss. Kornelia Theodosia	female	63.0	1	13502	77.9583	D7	S	10	NaN	Hudson, NY
8	1	1	Appleton, Mrs. Edward Dale (Charlotte Lamson)	female	53.0	2	11769	51.4792	C101	S	D	NaN	Bayside, Queens, NY

In [12]:

            
                Copied!
                
# Can also specify a "callable" function that takes the dataframe as input
df.loc[lambda dfx: (dfx['survived'] == 1) & (dfx['sex'] == 'female'), :].head(3)
# Can also specify a "callable" function that takes the dataframe as input
df.loc[lambda dfx: (dfx['survived'] == 1) & (dfx['sex'] == 'female'), :].head(3)

Out[12]:

	pclass	survived	name	sex	age	sibsp	ticket	fare	cabin	embarked	boat	body	home.dest
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO
6	1	1	Andrews, Miss. Kornelia Theodosia	female	63.0	1	13502	77.9583	D7	S	10	NaN	Hudson, NY
8	1	1	Appleton, Mrs. Edward Dale (Charlotte Lamson)	female	53.0	2	11769	51.4792	C101	S	D	NaN	Bayside, Queens, NY

In [13]:

            
                Copied!
                
# Select columns by name
df.loc[:, ['survived', 'pclass']].head(3)
# Select columns by name
df.loc[:, ['survived', 'pclass']].head(3)

Out[13]:

	survived	pclass
0	1	1
1	1	1
2	0	1

In [14]:

            
                Copied!
                
# Select column range by name
df.loc[:, 'survived':'name'].head(3)
# Select column range by name
df.loc[:, 'survived':'name'].head(3)

Out[14]:

	survived	name
0	1	Allen, Miss. Elisabeth Walton
1	1	Allison, Master. Hudson Trevor
2	0	Allison, Miss. Helen Loraine

Assign values¶

In [15]:

            
                Copied!
                
# Assign a (new) column
df.loc[:, 'sparkles'] = 8
df.head(3)
# Assign a (new) column
df.loc[:, 'sparkles'] = 8
df.head(3)

Out[15]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest	sparkles
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO	8
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON	8
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON	8

In [16]:

            
                Copied!
                
# Assign value to some rows
df.loc[df["survived"] == 1, "sparkles"] = 3
df.head(3)
# Assign value to some rows
df.loc[df["survived"] == 1, "sparkles"] = 3
df.head(3)

Out[16]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest	sparkles
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO	3
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON	3
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON	8

Group by¶

Named aggregation docs: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#named-aggregation

In [17]:

            
                Copied!
                
                    
                    
                
                

        
# This named aggregation syntax is new in version 0.25.0.
# 
df.groupby("pclass").agg(
    min_age = ("age", "min"),
    mean_fare = ("fare", np.mean),
    count = ("ticket", "size")
)
# This named aggregation syntax is new in version 0.25.0.
# 
df.groupby("pclass").agg(
    min_age = ("age", "min"),
    mean_fare = ("fare", np.mean),
    count = ("ticket", "size")
)

Out[17]:

	min_age	mean_fare	count
pclass
1	0.9167	87.508992	323
2	0.6667	21.179196	277
3	0.1667	13.302889	709

pandas defined aggregation functions

Function	Description
mean()	Compute mean of groups
sum()	Compute sum of group values
size()	Compute group sizes (include NaNs)
count()	Compute count of group (don't include NaNs)
std()	Standard deviation of groups
var()	Compute variance of groups
sem()	Standard error of the mean of groups
describe()	Generates descriptive statistics
first()	Compute first of group values
last()	Compute last of group values
nth()	Take nth value, or a subset if n is a list
min()	Compute min of group values
max()	Compute max of group values

Reshape¶

https://pandas.pydata.org/pandas-docs/stable/user_guide/reshaping.html

Spread / Cast / Pivot¶

In [18]:

            
                Copied!
                
# Count pivot table
pd.pivot_table(df, index="sex", columns="pclass", aggfunc="size")
# Count pivot table
pd.pivot_table(df, index="sex", columns="pclass", aggfunc="size")

Out[18]:

pclass	1	2	3
sex
female	144	106	216
male	179	171	493

In [19]:

            
                Copied!
                
# Aggregate function
df_wide = pd.pivot_table(df, values="age", index="sex", columns="pclass", aggfunc="median")
df_wide
# Aggregate function
df_wide = pd.pivot_table(df, values="age", index="sex", columns="pclass", aggfunc="median")
df_wide

Out[19]:

pclass	1	2	3
sex
female	36.0	28.0	22.0
male	42.0	29.5	25.0

Gather / Melt¶

In [20]:

            
                Copied!
                
df_wide.reset_index().melt(id_vars="sex", value_vars=[1, 2, 3], value_name="med_age")
df_wide.reset_index().melt(id_vars="sex", value_vars=[1, 2, 3], value_name="med_age")

Out[20]:

	sex	pclass	med_age
0	female	1	36.0
1	male	1	42.0
2	female	2	28.0
3	male	2	29.5
4	female	3	22.0
5	male	3	25.0

Join¶

In [21]:

            
                Copied!
                
                    
                    
                
                

        
left_df = df[['name', 'sex', 'age']].iloc[0:100]
right_df = df[['name', 'pclass', 'fare']].iloc[0:100]

left_df.merge(
    right_df, 
    on='name',    # left_on, right_on
    how='inner'   # 'left', 'right', 'outer'
).head()
left_df = df[['name', 'sex', 'age']].iloc[0:100]
right_df = df[['name', 'pclass', 'fare']].iloc[0:100]

left_df.merge(
    right_df, 
    on='name',    # left_on, right_on
    how='inner'   # 'left', 'right', 'outer'
).head()

Out[21]:

	name	sex	age	pclass	fare
0	Allen, Miss. Elisabeth Walton	female	29.0000	1	211.3375
1	Allison, Master. Hudson Trevor	male	0.9167	1	151.5500
2	Allison, Miss. Helen Loraine	female	2.0000	1	151.5500
3	Allison, Mr. Hudson Joshua Creighton	male	30.0000	1	151.5500
4	Allison, Mrs. Hudson J C (Bessie Waldo Daniels)	female	25.0000	1	151.5500

Rolling Join / As-of Join¶

https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.merge_asof.html

In [22]:

            
                Copied!
                
left_df = pd.DataFrame({'t': [1, 5, 10], 'left_value': ['a', 'b', 'c']})
left_df
left_df = pd.DataFrame({'t': [1, 5, 10], 'left_value': ['a', 'b', 'c']})
left_df

Out[22]:

	t	left_value
0	1	a
1	5	b
2	10	c

In [23]:

            
                Copied!
                
right_df = pd.DataFrame({'t': [1, 2, 3, 6, 7], 'right_value': [1, 2, 3, 6, 7]})
right_df
right_df = pd.DataFrame({'t': [1, 2, 3, 6, 7], 'right_value': [1, 2, 3, 6, 7]})
right_df

Out[23]:

	t	right_value
0	1	1
1	2	2
2	3	3
3	6	6
4	7	7

In [24]:

            
                Copied!
                
# Join in rows of right_df with latest value of t before
pd.merge_asof(left_df, right_df, on='t', direction='backward')
# Join in rows of right_df with latest value of t before
pd.merge_asof(left_df, right_df, on='t', direction='backward')

Out[24]:

	t	left_value	right_value
0	1	a	1
1	5	b	3
2	10	c	7

In [25]:

            
                Copied!
                
# Join in rows of right_df with next value of t after
pd.merge_asof(left_df, right_df, on='t', direction='forward')
# Join in rows of right_df with next value of t after
pd.merge_asof(left_df, right_df, on='t', direction='forward')

Out[25]:

	t	left_value	right_value
0	1	a	1.0
1	5	b	6.0
2	10	c	NaN

In [26]:

            
                Copied!
                
# Join in rows of right_df with nearest value of t
pd.merge_asof(left_df, right_df, on='t', direction='nearest')
# Join in rows of right_df with nearest value of t
pd.merge_asof(left_df, right_df, on='t', direction='nearest')

Out[26]:

	t	left_value	right_value
0	1	a	1
1	5	b	6
2	10	c	7

Row-bind¶

In [27]:

            
                Copied!
                
df1 = df.iloc[0:3]
df2 = df.iloc[3:6]
pd.concat([df1, df2])

# note that this matches column names by default
# controlled by `ignore_index` parameter
df1 = df.iloc[0:3]
df2 = df.iloc[3:6]
pd.concat([df1, df2])

# note that this matches column names by default
# controlled by `ignore_index` parameter

Out[27]:

	pclass	survived	name	sex	age	sibsp	parch	ticket	fare	cabin	embarked	boat	body	home.dest	sparkles
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0	0	24160	211.3375	B5	S	2	NaN	St Louis, MO	3
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1	2	113781	151.5500	C22 C26	S	11	NaN	Montreal, PQ / Chesterville, ON	3
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON	8
3	1	0	Allison, Mr. Hudson Joshua Creighton	male	30.0000	1	2	113781	151.5500	C22 C26	S	NaN	135.0	Montreal, PQ / Chesterville, ON	8
4	1	0	Allison, Mrs. Hudson J C (Bessie Waldo Daniels)	female	25.0000	1	2	113781	151.5500	C22 C26	S	NaN	NaN	Montreal, PQ / Chesterville, ON	8
5	1	1	Anderson, Mr. Harry	male	48.0000	0	0	19952	26.5500	E12	S	3	NaN	New York, NY	3

Column-bind¶

In [28]:

            
                Copied!
                
df_left = df.iloc[0:3, 0:3]
df_right = df.iloc[0:3, 3:6]
pd.concat([df_left, df_right], axis=1, ignore_index=True)

# note that default `ignore_index=False` will join on index
df_left = df.iloc[0:3, 0:3]
df_right = df.iloc[0:3, 3:6]
pd.concat([df_left, df_right], axis=1, ignore_index=True)

# note that default `ignore_index=False` will join on index

Out[28]:

	0	1	2	3	4	5
0	1	1	Allen, Miss. Elisabeth Walton	female	29.0000	0
1	1	1	Allison, Master. Hudson Trevor	male	0.9167	1
2	1	0	Allison, Miss. Helen Loraine	female	2.0000	1

In [ ]: