Data Frame Manipulation -- Python pandas¶
In [1]:
Copied!
import numpy as np
import pandas as pd
from pyprojroot import here
print(f"pandas=={pd.__version__}")
import numpy as np
import pandas as pd
from pyprojroot import here
print(f"pandas=={pd.__version__}")
pandas==1.2.2
Load csv¶
In [2]:
Copied!
df = pd.read_csv(here() / ".data" / "titanic.csv")
df.head(3)
df = pd.read_csv(here() / ".data" / "titanic.csv")
df.head(3)
Out[2]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
Summary¶
In [3]:
Copied!
# Number of rows
len(df)
# Number of rows
len(df)
Out[3]:
1309
In [4]:
Copied!
# Number of columns
len(df.columns)
# Number of columns
len(df.columns)
Out[4]:
14
In [5]:
Copied!
# Both dimensions
df.shape
# Both dimensions
df.shape
Out[5]:
(1309, 14)
In [6]:
Copied!
# Inspect types
df.dtypes
# Inspect types
df.dtypes
Out[6]:
pclass int64 survived int64 name object sex object age float64 sibsp int64 parch int64 ticket object fare float64 cabin object embarked object boat object body float64 home.dest object dtype: object
In [7]:
Copied!
# Descriptive statistics
df.describe()
# Descriptive statistics
df.describe()
Out[7]:
| pclass | survived | age | sibsp | parch | fare | body | |
|---|---|---|---|---|---|---|---|
| count | 1309.000000 | 1309.000000 | 1046.000000 | 1309.000000 | 1309.000000 | 1308.000000 | 121.000000 |
| mean | 2.294882 | 0.381971 | 29.881135 | 0.498854 | 0.385027 | 33.295479 | 160.809917 |
| std | 0.837836 | 0.486055 | 14.413500 | 1.041658 | 0.865560 | 51.758668 | 97.696922 |
| min | 1.000000 | 0.000000 | 0.166700 | 0.000000 | 0.000000 | 0.000000 | 1.000000 |
| 25% | 2.000000 | 0.000000 | 21.000000 | 0.000000 | 0.000000 | 7.895800 | 72.000000 |
| 50% | 3.000000 | 0.000000 | 28.000000 | 0.000000 | 0.000000 | 14.454200 | 155.000000 |
| 75% | 3.000000 | 1.000000 | 39.000000 | 1.000000 | 0.000000 | 31.275000 | 256.000000 |
| max | 3.000000 | 1.000000 | 80.000000 | 8.000000 | 9.000000 | 512.329200 | 328.000000 |
Slice data¶
Rows¶
In [8]:
Copied!
# Select rows by number
df.iloc[1:4,:]
# Note that Python is 0-indexed
# Note that Python treats 1:4 as a half-open interval
# i.e., row 4 is not included
# Select rows by number
df.iloc[1:4,:]
# Note that Python is 0-indexed
# Note that Python treats 1:4 as a half-open interval
# i.e., row 4 is not included
Out[8]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
In [9]:
Copied!
# Select rows by index label (key)
df.loc[1:3]
# This works even for non-integer labels
# Note that label-slicing is end-inclusive
# Select rows by index label (key)
df.loc[1:3]
# This works even for non-integer labels
# Note that label-slicing is end-inclusive
Out[9]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON |
| 3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.55 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON |
In [10]:
Copied!
# Select row by condition
df.loc[df['survived'] == 1].head(3)
# df[df['Survived'] == 1] also works
# Select row by condition
df.loc[df['survived'] == 1].head(3)
# df[df['Survived'] == 1] also works
Out[10]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON |
| 5 | 1 | 1 | Anderson, Mr. Harry | male | 48.0000 | 0 | 0 | 19952 | 26.5500 | E12 | S | 3 | NaN | New York, NY |
In [11]:
Copied!
# Select rows by multiple conditions
df.loc[(df['survived'] == 1) & (df['sex'] == 'female'),:].head(3)
# Note parentheses for order of operations
# Select rows by multiple conditions
df.loc[(df['survived'] == 1) & (df['sex'] == 'female'),:].head(3)
# Note parentheses for order of operations
Out[11]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 6 | 1 | 1 | Andrews, Miss. Kornelia Theodosia | female | 63.0 | 1 | 0 | 13502 | 77.9583 | D7 | S | 10 | NaN | Hudson, NY |
| 8 | 1 | 1 | Appleton, Mrs. Edward Dale (Charlotte Lamson) | female | 53.0 | 2 | 0 | 11769 | 51.4792 | C101 | S | D | NaN | Bayside, Queens, NY |
In [12]:
Copied!
# Can also specify a "callable" function that takes the dataframe as input
df.loc[lambda dfx: (dfx['survived'] == 1) & (dfx['sex'] == 'female'), :].head(3)
# Can also specify a "callable" function that takes the dataframe as input
df.loc[lambda dfx: (dfx['survived'] == 1) & (dfx['sex'] == 'female'), :].head(3)
Out[12]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO |
| 6 | 1 | 1 | Andrews, Miss. Kornelia Theodosia | female | 63.0 | 1 | 0 | 13502 | 77.9583 | D7 | S | 10 | NaN | Hudson, NY |
| 8 | 1 | 1 | Appleton, Mrs. Edward Dale (Charlotte Lamson) | female | 53.0 | 2 | 0 | 11769 | 51.4792 | C101 | S | D | NaN | Bayside, Queens, NY |
In [13]:
Copied!
# Select columns by name
df.loc[:, ['survived', 'pclass']].head(3)
# Select columns by name
df.loc[:, ['survived', 'pclass']].head(3)
Out[13]:
| survived | pclass | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 1 | 1 |
| 2 | 0 | 1 |
In [14]:
Copied!
# Select column range by name
df.loc[:, 'survived':'name'].head(3)
# Select column range by name
df.loc[:, 'survived':'name'].head(3)
Out[14]:
| survived | name | |
|---|---|---|
| 0 | 1 | Allen, Miss. Elisabeth Walton |
| 1 | 1 | Allison, Master. Hudson Trevor |
| 2 | 0 | Allison, Miss. Helen Loraine |
Assign values¶
In [15]:
Copied!
# Assign a (new) column
df.loc[:, 'sparkles'] = 8
df.head(3)
# Assign a (new) column
df.loc[:, 'sparkles'] = 8
df.head(3)
Out[15]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | sparkles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO | 8 |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON | 8 |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON | 8 |
In [16]:
Copied!
# Assign value to some rows
df.loc[df["survived"] == 1, "sparkles"] = 3
df.head(3)
# Assign value to some rows
df.loc[df["survived"] == 1, "sparkles"] = 3
df.head(3)
Out[16]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | sparkles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO | 3 |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON | 3 |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON | 8 |
Group by¶
Named aggregation docs: https://pandas.pydata.org/pandas-docs/stable/user_guide/groupby.html#named-aggregation
In [17]:
Copied!
# This named aggregation syntax is new in version 0.25.0.
#
df.groupby("pclass").agg(
min_age = ("age", "min"),
mean_fare = ("fare", np.mean),
count = ("ticket", "size")
)
# This named aggregation syntax is new in version 0.25.0.
#
df.groupby("pclass").agg(
min_age = ("age", "min"),
mean_fare = ("fare", np.mean),
count = ("ticket", "size")
)
Out[17]:
| min_age | mean_fare | count | |
|---|---|---|---|
| pclass | |||
| 1 | 0.9167 | 87.508992 | 323 |
| 2 | 0.6667 | 21.179196 | 277 |
| 3 | 0.1667 | 13.302889 | 709 |
pandas defined aggregation functions
| Function | Description |
|---|---|
| mean() | Compute mean of groups |
| sum() | Compute sum of group values |
| size() | Compute group sizes (include NaNs) |
| count() | Compute count of group (don't include NaNs) |
| std() | Standard deviation of groups |
| var() | Compute variance of groups |
| sem() | Standard error of the mean of groups |
| describe() | Generates descriptive statistics |
| first() | Compute first of group values |
| last() | Compute last of group values |
| nth() | Take nth value, or a subset if n is a list |
| min() | Compute min of group values |
| max() | Compute max of group values |
Reshape¶
Spread / Cast / Pivot¶
In [18]:
Copied!
# Count pivot table
pd.pivot_table(df, index="sex", columns="pclass", aggfunc="size")
# Count pivot table
pd.pivot_table(df, index="sex", columns="pclass", aggfunc="size")
Out[18]:
| pclass | 1 | 2 | 3 |
|---|---|---|---|
| sex | |||
| female | 144 | 106 | 216 |
| male | 179 | 171 | 493 |
In [19]:
Copied!
# Aggregate function
df_wide = pd.pivot_table(df, values="age", index="sex", columns="pclass", aggfunc="median")
df_wide
# Aggregate function
df_wide = pd.pivot_table(df, values="age", index="sex", columns="pclass", aggfunc="median")
df_wide
Out[19]:
| pclass | 1 | 2 | 3 |
|---|---|---|---|
| sex | |||
| female | 36.0 | 28.0 | 22.0 |
| male | 42.0 | 29.5 | 25.0 |
Gather / Melt¶
In [20]:
Copied!
df_wide.reset_index().melt(id_vars="sex", value_vars=[1, 2, 3], value_name="med_age")
df_wide.reset_index().melt(id_vars="sex", value_vars=[1, 2, 3], value_name="med_age")
Out[20]:
| sex | pclass | med_age | |
|---|---|---|---|
| 0 | female | 1 | 36.0 |
| 1 | male | 1 | 42.0 |
| 2 | female | 2 | 28.0 |
| 3 | male | 2 | 29.5 |
| 4 | female | 3 | 22.0 |
| 5 | male | 3 | 25.0 |
Join¶
In [21]:
Copied!
left_df = df[['name', 'sex', 'age']].iloc[0:100]
right_df = df[['name', 'pclass', 'fare']].iloc[0:100]
left_df.merge(
right_df,
on='name', # left_on, right_on
how='inner' # 'left', 'right', 'outer'
).head()
left_df = df[['name', 'sex', 'age']].iloc[0:100]
right_df = df[['name', 'pclass', 'fare']].iloc[0:100]
left_df.merge(
right_df,
on='name', # left_on, right_on
how='inner' # 'left', 'right', 'outer'
).head()
Out[21]:
| name | sex | age | pclass | fare | |
|---|---|---|---|---|---|
| 0 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 1 | 211.3375 |
| 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 151.5500 |
| 2 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 151.5500 |
| 3 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 151.5500 |
| 4 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 151.5500 |
Rolling Join / As-of Join¶
In [22]:
Copied!
left_df = pd.DataFrame({'t': [1, 5, 10], 'left_value': ['a', 'b', 'c']})
left_df
left_df = pd.DataFrame({'t': [1, 5, 10], 'left_value': ['a', 'b', 'c']})
left_df
Out[22]:
| t | left_value | |
|---|---|---|
| 0 | 1 | a |
| 1 | 5 | b |
| 2 | 10 | c |
In [23]:
Copied!
right_df = pd.DataFrame({'t': [1, 2, 3, 6, 7], 'right_value': [1, 2, 3, 6, 7]})
right_df
right_df = pd.DataFrame({'t': [1, 2, 3, 6, 7], 'right_value': [1, 2, 3, 6, 7]})
right_df
Out[23]:
| t | right_value | |
|---|---|---|
| 0 | 1 | 1 |
| 1 | 2 | 2 |
| 2 | 3 | 3 |
| 3 | 6 | 6 |
| 4 | 7 | 7 |
In [24]:
Copied!
# Join in rows of right_df with latest value of t before
pd.merge_asof(left_df, right_df, on='t', direction='backward')
# Join in rows of right_df with latest value of t before
pd.merge_asof(left_df, right_df, on='t', direction='backward')
Out[24]:
| t | left_value | right_value | |
|---|---|---|---|
| 0 | 1 | a | 1 |
| 1 | 5 | b | 3 |
| 2 | 10 | c | 7 |
In [25]:
Copied!
# Join in rows of right_df with next value of t after
pd.merge_asof(left_df, right_df, on='t', direction='forward')
# Join in rows of right_df with next value of t after
pd.merge_asof(left_df, right_df, on='t', direction='forward')
Out[25]:
| t | left_value | right_value | |
|---|---|---|---|
| 0 | 1 | a | 1.0 |
| 1 | 5 | b | 6.0 |
| 2 | 10 | c | NaN |
In [26]:
Copied!
# Join in rows of right_df with nearest value of t
pd.merge_asof(left_df, right_df, on='t', direction='nearest')
# Join in rows of right_df with nearest value of t
pd.merge_asof(left_df, right_df, on='t', direction='nearest')
Out[26]:
| t | left_value | right_value | |
|---|---|---|---|
| 0 | 1 | a | 1 |
| 1 | 5 | b | 6 |
| 2 | 10 | c | 7 |
Row-bind¶
In [27]:
Copied!
df1 = df.iloc[0:3]
df2 = df.iloc[3:6]
pd.concat([df1, df2])
# note that this matches column names by default
# controlled by `ignore_index` parameter
df1 = df.iloc[0:3]
df2 = df.iloc[3:6]
pd.concat([df1, df2])
# note that this matches column names by default
# controlled by `ignore_index` parameter
Out[27]:
| pclass | survived | name | sex | age | sibsp | parch | ticket | fare | cabin | embarked | boat | body | home.dest | sparkles | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 | 0 | 24160 | 211.3375 | B5 | S | 2 | NaN | St Louis, MO | 3 |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | 11 | NaN | Montreal, PQ / Chesterville, ON | 3 |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON | 8 |
| 3 | 1 | 0 | Allison, Mr. Hudson Joshua Creighton | male | 30.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | 135.0 | Montreal, PQ / Chesterville, ON | 8 |
| 4 | 1 | 0 | Allison, Mrs. Hudson J C (Bessie Waldo Daniels) | female | 25.0000 | 1 | 2 | 113781 | 151.5500 | C22 C26 | S | NaN | NaN | Montreal, PQ / Chesterville, ON | 8 |
| 5 | 1 | 1 | Anderson, Mr. Harry | male | 48.0000 | 0 | 0 | 19952 | 26.5500 | E12 | S | 3 | NaN | New York, NY | 3 |
Column-bind¶
In [28]:
Copied!
df_left = df.iloc[0:3, 0:3]
df_right = df.iloc[0:3, 3:6]
pd.concat([df_left, df_right], axis=1, ignore_index=True)
# note that default `ignore_index=False` will join on index
df_left = df.iloc[0:3, 0:3]
df_right = df.iloc[0:3, 3:6]
pd.concat([df_left, df_right], axis=1, ignore_index=True)
# note that default `ignore_index=False` will join on index
Out[28]:
| 0 | 1 | 2 | 3 | 4 | 5 | |
|---|---|---|---|---|---|---|
| 0 | 1 | 1 | Allen, Miss. Elisabeth Walton | female | 29.0000 | 0 |
| 1 | 1 | 1 | Allison, Master. Hudson Trevor | male | 0.9167 | 1 |
| 2 | 1 | 0 | Allison, Miss. Helen Loraine | female | 2.0000 | 1 |
In [ ]:
Copied!