See also: Working with Text Data
See also: Pandas String Methods
See also: Time Series / Date Functionality
See also: Computational Tools
In [1]:
# 1. Import pandas as pd and load the complaints CSV
#    (data/cfpb_complaints_with_fictitious_data.csv) into the dataframe 'df'.
import pandas as pd
df = pd.read_csv(
    'data/cfpb_complaints_with_fictitious_data.csv',
)
# Preview the first rows (head() defaults to 5).
df.head()
Out[1]:
In [2]:
# 2. Upper-case every value of the 'Issue' column through its .str accessor.
upper_df = df['Issue'].str.upper()
# Show only the first three results.
upper_df.head(n=3)
Out[2]:
In [3]:
# 3. Split each 'Issue' string on whitespace with .str.split();
#    every element of the result is a list of tokens.
split_df = df['Issue'].str.split()
split_df.head(n=3)
Out[3]:
In [4]:
# 4. Call dir() on df['Issue'].str to list all the available string methods.
issue_str_accessor = df['Issue'].str
dir_list = dir(issue_str_accessor)
# dir() returns a sorted list; display just its last ten names.
dir_list[-10:]
Out[4]:
In [5]:
# 5. Use .str.replace() to replace the letters 'or' with '!' in 'Issue',
#    and then capitalize each resulting string with .str.capitalize().
# regex=False states explicitly that 'or' is a literal substring, so the result
# does not depend on the pandas version's default for the `regex` argument.
replaced_df = (
    df['Issue']
    .str.replace('or', '!', regex=False)
    .str.capitalize()
)
replaced_df.head(5)
Out[5]:
In [6]:
# 6. Use .str.extract() with regex r'(\b\S\S\S\b)' to get the first 3-character
#    token from the 'Consumer complaint narrative' column.
# \b (word boundary) is used instead of \s so that the captured text carries no
# surrounding whitespace and a token at the very start or end of the narrative
# can still match. expand=False returns a Series rather than a one-column frame.
# NOTE(review): \S matches any non-whitespace character, not only letters;
# use r'(\b\w{3}\b)' if strictly alphanumeric words are wanted.
extracted_df = df['Consumer complaint narrative'].str.extract(r'(\b\S\S\S\b)', expand=False)
extracted_df.head(5)
Out[6]:
In [7]:
# 7. Use .str.contains() with the 'lawyer' regex to select all rows whose
#    narrative mentions lawyer (boolean indexing).
# na=False maps missing narratives to False; without it the mask would hold NaN
# for those rows and boolean indexing with a NaN-containing mask raises.
mask = df['Consumer complaint narrative'].str.contains('lawyer', na=False)
lawyer_df = df[mask]
lawyer_df.head(3)
Out[7]:
In [8]:
# 8. The .str accessor of 'Issue' supports [] slicing directly; [0:3] keeps the
#    first three characters of each string.
df['Issue'].str[0:3]
Out[8]:
In [9]:
# 9. Build a daily DatetimeIndex covering 1/1/2000 through 12/31/2020 (both
#    endpoints included) with pd.date_range and bind it to 'dindex'.
dindex = pd.date_range(
    start='1/1/2000',
    end='12/31/2020',
    freq='D',
)
dindex
Out[9]:
In [10]:
# 10. Create hourly timestamps from 9am on 1/1/2000 to 9pm on 1/3/2000 using pd.date_range.
# The start and end carry explicit clock times; bare dates would mean midnight.
# The one-hour step is given as a Timedelta rather than the string alias 'H',
# which newer pandas releases deprecate in favour of 'h'.
daterange = pd.date_range(
    start='1/1/2000 09:00',
    end='1/3/2000 21:00',
    freq=pd.Timedelta(hours=1),
)
daterange[:10]
Out[10]:
In [11]:
# 11. Read data/simple.csv with pd.read_csv, passing parse_dates=['Date'] and
#     infer_datetime_format=True, and assign the result to 'df'.
# NOTE(review): this rebinds 'df'; whatever 'df' referred to before this cell
# is no longer reachable under that name.
# NOTE(review): infer_datetime_format is reported as deprecated in recent
# pandas releases — confirm against the pandas version in use.
df = pd.read_csv(
    'data/simple.csv',
    parse_dates=['Date'],
    infer_datetime_format=True,
)
df
Out[11]:
In [12]:
# 12. Use the dataframe's set_index() to index on 'Date' and assign the result to 'df'.
# set_index(..., inplace=True) returns None, so it cannot be combined with
# assignment; the returned frame is rebound instead, leaving 'df' indexed on Date.
df = df.set_index('Date')
In [13]:
# 13. Now use the dataframe's resample method with mean() to get a biweekly average.
# '2W' groups rows into two-week bins. The upper-case alias is the canonical
# spelling; NOTE(review): the lower-case 'w' form is reported as deprecated in
# newer pandas releases — confirm against the version in use.
df.resample('2W').mean()
Out[13]:
In [14]:
# 14. Take a rolling mean over a window of 3 with the dataframe's rolling()
#     method and assign it to 'roll_df'.
# NOTE(review): rolling(3) is a window of 3 rows; that equals 3 days only if
# the index has exactly one row per day — confirm against the data.
three_row_window = df.rolling(window=3)
roll_df = three_row_window.mean()
roll_df
Out[14]:
In [15]:
# 15. Import matplotlib, set %matplotlib inline, use the plot method of roll_df[['Count]]
import matplotlib
%matplotlib inline
roll_df[['Count']].plot()
Out[15]: