In [1]:
import pandas as pd
import matplotlib as mpl
import datetime as dt
%matplotlib inline
In [2]:
f1 = "../data/FTC_2011-43014.csv"
f2 = "../data/FTC_5114-71014.csv"
In [3]:
column_names=["Reference Number", "Created Date", "Consumer Address, City", "Consumer Address, State Code", "Company Name",
"Company Phone Area Code", "Company Phone, Number", "Company Phone, Extension",
"Complaint Info Initial Contact Method", "Complaint Info Product Service Code"]
In [4]:
df_1 = pd.io.parsers.read_csv(f1, names=column_names, index_col=0, skiprows=1)
df_2 = pd.io.parsers.read_csv(f2, names=column_names, index_col=0, skiprows=1)
In [5]:
df = pd.concat([df_1, df_2])
In [6]:
df.head()
Out[6]:
Converting the created dates to datetime objects and also cleaning up the state names. A few of the columns in the data sometimes contain | to delimit multiple options selected on a complaint. Want to eliminate those. The state function assumes that the first state is the right one if available.
Converting to a datetime allows us to see the huge uptick in IRS scam complaints to the FTC over time.
In [7]:
def clean_state_names(state_name):
state_name = str(state_name)
if "|" in state_name:
pieces = state_name.split("|")
if pieces[0] != "":
state = pieces[0]
elif pieces[1] != "":
state = pieces[1]
else:
state = None
else:
state = state_name
return state
In [8]:
df["clean_state"] = df["Consumer Address, State Code"].apply(lambda x: clean_state_names(x))
In [9]:
df["created_date"] = df["Created Date"].apply(lambda x: dt.datetime.strptime(str(x), "%m/%d/%Y") if not pd.isnull(x) else None)
In [10]:
df.dropna(subset=["created_date"]).set_index("created_date")["Company Name"].resample("M", how="count").plot()
Out[10]:
In [12]:
df.dropna(subset=["created_date"])['Complaint Info Initial Contact Method'].value_counts()
Out[12]:
In [13]:
df["phone"] = df['Complaint Info Initial Contact Method'].apply(lambda x: 1 if ("Phone" in str(x)) else 0)
In [14]:
df.dropna(subset=["created_date"]).set_index("created_date")["phone"].resample("M", how="count").plot()
Out[14]:
In [15]:
df["month"] = df["created_date"].apply(lambda x: x.month if not pd.isnull(x) else None)
df["year"] = df["created_date"].apply(lambda x: x.year if not pd.isnull(x) else None)
In [16]:
complaints_by_month = pd.DataFrame(df.groupby(["year", "month"])["created_date"].count())
In [17]:
first_six_months_2011 = complaints_by_month.ix[2011].ix[1:6]["created_date"].sum()
first_six_months_2012 = complaints_by_month.ix[2012].ix[1:6]["created_date"].sum()
first_six_months_2013 = complaints_by_month.ix[2013].ix[1:6]["created_date"].sum()
first_six_months_2014 = complaints_by_month.ix[2014].ix[1:6]["created_date"].sum()
In [18]:
print "Pct. Change 2012 vs. 2011: {pct:,.2f}%".format(pct=(first_six_months_2012-first_six_months_2011)/float(first_six_months_2011)*100)
print "Pct. Change 2013 vs. 2012: {pct:,.2f}%".format(pct=(first_six_months_2013-first_six_months_2012)/float(first_six_months_2012)*100)
print "Pct. Change 2014 vs. 2013: {pct:,.2f}%".format(pct=(first_six_months_2014-first_six_months_2013)/float(first_six_months_2013)*100)
In [19]:
df.dropna(subset=["created_date"])['Company Phone Area Code'].value_counts()
Out[19]:
In [20]:
top_phone_numbers = pd.DataFrame(df.groupby(['Company Phone Area Code', 'Company Phone, Number'])\
.count()\
.sort("created_date", ascending=False))
In [25]:
top_phone_numbers[:25]
Out[25]:
The majority of the phone calls come from the 202 area code. This is the area code for Washington, DC. Digging further into just those phone numbers.
In [26]:
df_202 = df[df["Company Phone Area Code"] == "202"]
In [28]:
df_202.dropna(subset=["created_date"]).set_index("created_date")["phone"].resample("M", how="sum").plot()
Out[28]:
In [29]:
complaints_from_202_by_month = pd.DataFrame(df_202.groupby(["year", "month"])\
["created_date"]\
.count())
In [30]:
complaints_from_202_by_month.head()
Out[30]:
In [31]:
first_six_months_2014_from_202 = complaints_from_202_by_month.ix[2014].ix[1:6]["created_date"].sum()
In [35]:
print "The percentage of phone calls from 202 area during the first six months of 2014 was {0:,.2f}%".format(float(first_six_months_2014_from_202/float(first_six_months_2014)*100))
print "There was just {0} reported scam call from 202 area code in June".format(complaints_from_202_by_month.ix[2014].ix[6].size)
In [36]:
df.dropna(subset=["created_date"])['clean_state'].value_counts()
Out[36]:
In [37]:
df.dropna(subset=["created_date"])['Consumer Address, City'].value_counts()
Out[37]:
In [38]:
df_2014 = df[df["created_date"] >= dt.datetime(year=2014, month=1, day=1)]
states_2014 = df_2014.dropna(subset=["created_date"])\
.groupby(['clean_state'])
In [39]:
states_df = pd.DataFrame(states_2014.size().order(ascending=False))
In [40]:
states_df["state_abbrev"] = states_df.index
In [41]:
state_abbrev_to_name = {
'AK': 'Alaska',
'AL': 'Alabama',
'AR': 'Arkansas',
'AS': 'American Samoa',
'AZ': 'Arizona',
'CA': 'California',
'CO': 'Colorado',
'CT': 'Connecticut',
'DC': 'District of Columbia',
'DE': 'Delaware',
'FL': 'Florida',
'GA': 'Georgia',
'GU': 'Guam',
'HI': 'Hawaii',
'IA': 'Iowa',
'ID': 'Idaho',
'IL': 'Illinois',
'IN': 'Indiana',
'KS': 'Kansas',
'KY': 'Kentucky',
'LA': 'Louisiana',
'MA': 'Massachusetts',
'MD': 'Maryland',
'ME': 'Maine',
'MI': 'Michigan',
'MN': 'Minnesota',
'MO': 'Missouri',
'MP': 'Northern Mariana Islands',
'MS': 'Mississippi',
'MT': 'Montana',
'NA': 'National',
'NC': 'North Carolina',
'ND': 'North Dakota',
'NE': 'Nebraska',
'NH': 'New Hampshire',
'NJ': 'New Jersey',
'NM': 'New Mexico',
'NV': 'Nevada',
'NY': 'New York',
'OH': 'Ohio',
'OK': 'Oklahoma',
'OR': 'Oregon',
'PA': 'Pennsylvania',
'PR': 'Puerto Rico',
'RI': 'Rhode Island',
'SC': 'South Carolina',
'SD': 'South Dakota',
'TN': 'Tennessee',
'TX': 'Texas',
'UT': 'Utah',
'VA': 'Virginia',
'VI': 'Virgin Islands',
'VT': 'Vermont',
'WA': 'Washington',
'WI': 'Wisconsin',
'WV': 'West Virginia',
'WY': 'Wyoming',
}
In [42]:
import csv
state_pop_dict = {}
with open('../data/state_pop_2013.csv', 'rU') as f:
no_heads = f.readlines()[1:]
reader = csv.reader(no_heads)
for row in reader:
state_pop_dict[row[0].lstrip("\xca")] = int(row[1].replace(",", ""))
In [43]:
def find_state_pop(abbrev):
try:
full_name = state_abbrev_to_name[abbrev]
except KeyError:
full_name = None
if full_name is not None:
population = state_pop_dict[full_name]
else:
population = None
return population
In [44]:
states_df["state_pop"] = states_df["state_abbrev"].apply(lambda x: find_state_pop(x))
In [45]:
states_df["complaints_per_100000"] = states_df.apply(lambda x: round(x[0] / float(x["state_pop"]) * 100000, 1), axis=1)
In [46]:
states_df.sort("complaints_per_100000", ascending=False)
Out[46]:
In [ ]: