In [1]:
import pandas as pd
import matplotlib as mpl
import datetime as dt
import re
%matplotlib inline
In [2]:
# The columns and columns names differed slightly between the
# two Excel files, so we clean them up here.
name_replace_pat = re.compile(r"[^a-z]+")
def convert_col_names(names):
return [ re.sub(name_replace_pat, "_", name.lower().strip())
for name in names ]
In [3]:
def rename_cols(df):
return df.rename(columns=dict(zip(df.columns, convert_col_names(df.columns))))
In [4]:
early_data = rename_cols(pd.read_csv("../data/FTC_2011-43014.csv"))
later_data = rename_cols(pd.read_csv("../data/FTC_5114-71014.csv"))
In [5]:
df = pd.concat([early_data, later_data])
In [6]:
df["created_datetime"] = df["created_date"].apply(lambda x: dt.datetime.strptime(str(x), "%m/%d/%Y") if not pd.isnull(x) else None)
In [7]:
df = df.set_index("created_datetime")
In [8]:
df.head()
Out[8]:
In [9]:
def clean_state_codes(state_name):
if type(state_name) == str and "|" in state_name:
non_null = filter(lambda x: x != "", state_name.split("|"))
if len(non_null):
return non_null[0]
else:
return None
else:
return state_name
In [10]:
df["clean_state"] = df["consumer_address_state_code"].apply(clean_state_codes)
In [11]:
# Note: selecting only complaints through June 2014, the last full month with data
ax = df.ix[:"2014-06"]["company_name"].resample("M", how="count").plot()
ax.set_title("Complaints Per Month\n")
pass
In [12]:
df["complaint_info_initial_contact_method"].value_counts()
Out[12]:
In [13]:
# Coalesce "Phone" types
df["phone"] = df['complaint_info_initial_contact_method'].apply(lambda x: int("Phone" in str(x)))
In [14]:
ax = df["phone"].ix[:"2014-06"].resample("M", how="count").plot()
ax.set_title("Phone-Related Complaints By Month\n")
pass
In [15]:
complaints_by_month = df["reference_number"].resample("MS", how="count")
In [16]:
first_six_months_2011 = complaints_by_month.ix["2011-01": "2011-06"].sum()
first_six_months_2012 = complaints_by_month.ix["2012-01": "2012-06"].sum()
first_six_months_2013 = complaints_by_month.ix["2013-01": "2013-06"].sum()
first_six_months_2014 = complaints_by_month.ix["2014-01": "2014-06"].sum()
In [17]:
print "Pct. Change 2012 vs. 2011: {pct:,.2f}%".format(pct=(first_six_months_2012-first_six_months_2011)/float(first_six_months_2011)*100)
print "Pct. Change 2013 vs. 2012: {pct:,.2f}%".format(pct=(first_six_months_2013-first_six_months_2012)/float(first_six_months_2012)*100)
print "Pct. Change 2014 vs. 2013: {pct:,.2f}%".format(pct=(first_six_months_2014-first_six_months_2013)/float(first_six_months_2013)*100)
In [18]:
first_six_months_2014/first_six_months_2013
Out[18]:
Washington DC's main area code — 202 — accounts for far more phone calls than any other area code.
In [18]:
df['company_phone_area_code'].value_counts().head(20)
Out[18]:
In [19]:
top_phone_numbers = pd.DataFrame(df.groupby(['company_phone_area_code', 'company_phone_number']).size(), columns=["complaints"])
In [20]:
top_phone_numbers.sort("complaints", ascending=False).head(20)
Out[20]:
In [21]:
df_202 = df[df["company_phone_area_code"] == "202"]
In [22]:
ax = df_202.ix[:"2014-06"]["phone"].resample("M", how="sum").plot()
ax.set_title(u"Area Code 202–Related Complaints\n")
pass
In [23]:
df_202_by_month = df_202["reference_number"].resample("MS", how="count")
In [24]:
print "Area code 202 accounted for {0:,.1f}% of all complaints during the first six months of 2014"\
.format(100 * float(df_202_by_month.ix["2014-01":"2014-06"].sum()/float(first_six_months_2014)))
In [25]:
df['clean_state'].dropna().value_counts().head(20)
Out[25]:
In [26]:
df['consumer_address_city'].value_counts().head(20)
Out[26]:
In [27]:
state_abbreviations = pd.read_csv("../data/state_abbreviations.csv").set_index("name")
state_populations = pd.read_csv("../data/state_pop_2013.csv").set_index("name")
In [28]:
state_pop = state_populations.join(state_abbreviations).set_index("abbr")
In [29]:
df_2014 = df.ix["2014"]
states_2014 = pd.DataFrame(df_2014.groupby(['clean_state']).size(), columns=[ "complaints" ])\
.join(state_pop)\
.dropna()
In [30]:
assert(len(states_2014) == 52) # 50 States + DC and PR
In [31]:
states_2014["complaints_per_100k"] = states_2014.apply(lambda x: round(x["complaints"] / float(x["population"]) * 100000, 1), axis=1)
In [32]:
states_2014.sort("complaints_per_100k", ascending=False)
Out[32]: