In [93]:
# 1. Import pandas as pd and read_csv() simple.csv into a dataframe 'df' (optionally, auto-convert dates).
import pandas as pd
df = pd.read_csv('data/simple.csv', infer_datetime_format=True, parse_dates=['Date'])
df.head(5)
Out[93]:
In [94]:
# 2. Use df.dropna() to drop any sample that contains any Na/NaN values.
df.dropna().head(5)
Out[94]:
In [95]:
# 3. Use df.dropna() with the subset keyword argument to drop those rows without a Count.
df.dropna(subset=['Count']).head(5)
Out[95]:
In [96]:
# 4. Use df.fillna() to fill the NaN values in Count with 0.
df.fillna(0).head(5)
Out[96]:
In [97]:
# 5. Use pd.to_numeric() to convert "Weird Count" to numbers. After error, try with keyword errors='coerce'.
new_series = pd.to_numeric(df['Weird Count'], errors='coerce')
df['Weird Count'] = new_series
df.head(5)
Out[97]:
In [98]:
# 6. Use str.replace('\t', '') on the column Weird Date to delete any tabs.
df['Weird Date'] = df['Weird Date'].str.replace('\t', '')
df.head(5)
Out[98]:
In [99]:
# 7. Use .str.partition(',')[2] to chop WEEKDAY COMMA from Weird Date and make a new column.
df['Less Weird Date'] = df['Weird Date'].str.partition(',')[2]
df.head(5)
Out[99]:
In [100]:
# 8. Use .str.strip() to remove any whitespace in this new column.
df['Less Weird Date'] = df['Less Weird Date'].str.strip()
df.head(5)
Out[100]:
In [101]:
# 9. Use pd.to_datetime() to convert the weirdly formatted dates in Less Weird Dates to pandas datetimes.
df['Reconstituted Date'] = pd.to_datetime(df['Less Weird Date'])
df.head(5)
Out[101]:
In [102]:
# 10. Convert the nice pandas dates to month long period types using df[].dt.to_period().
df['Month'] = df['Reconstituted Date'].dt.to_period(freq='M')
df.head(5)
Out[102]:
In [103]:
# 11. Convert Count to an int using the column's "astype()" method.
df['Count'] = df['Count'].fillna(0)
df['Count'] = df['Count'].astype(int)
df.head(5)
Out[103]:
In [104]:
# 12. Import numpy as np. Run pd.isnull(np.NaN). Run None == np.NaN. Run np.NaN == np.NaN. What does that tell you?
import numpy as np
print(pd.isnull(np.NaN))
print(None == np.NaN)
print(np.NaN == np.NaN)
In [105]:
# 13. Do a database style inner join of the df and a copy (df.copy()) of the dataframe on Date using pd.merge()
copy = df.copy()
df.merge(copy, how='inner', on='Date').head(3)
Out[105]:
In [106]:
# 14. Combine the df with a copy of the df using concat, effectively stacking the df on top of itself.
pd.concat([df, copy]).head(5)
Out[106]:
In [107]:
# 15. Convert Count to string type using .astype(). If failure, use raise_on_error=False argument.
df['Count'].astype(str, raise_on_error=False).head(5)
Out[107]:
In [108]:
# 16. Bin the values in Count in groups of 10. 0-9, 10-19, 20-29, etc. using pd.cut.
df['Weird Count'] = pd.cut(df['Count'], bins=[-1, 9, 19, 29, 39, 49, 59])
df.head(5)
Out[108]:
In [109]:
# 17. Use pd.read_csv to read the CFPB CSB into dataframe 'df2'.
df2 = pd.read_csv('data/cfpb_complaints_with_fictitious_data.csv')
df2.head(1)
Out[109]:
In [110]:
# 18. Filter df2 down to ['Product', 'Sub-product', 'Complaint ID', and 'Date received']
df2 = df2[['Product', 'Sub-product', 'Complaint ID', 'Date received']]
df2.head(5)
Out[110]:
In [111]:
# 19. set_index() with ['Product', 'Sub-product] amd assign the result to df3.
df3 = df2.set_index(['Product', 'Sub-product'])