In [1]:
import pandas as pd
import matplotlib as mpl
import datetime as dt
import re
%matplotlib inline

Loading The Data

The data for this analysis was obtained via a Freedom of Information Act request to the Federal Trade Commission. Originally, the data came in two Excel files, which have been converted to CSVs for the purposes of this analysis.


In [2]:
# The column counts and column names differed slightly between the
# two Excel files, so we normalize them here.
name_replace_pat = re.compile(r"[^a-z]+")
def convert_col_names(names):
    # Lowercase/strip each name, then collapse every run of
    # non-letter characters into a single underscore.
    cleaned = []
    for name in names:
        cleaned.append(name_replace_pat.sub("_", name.lower().strip()))
    return cleaned

In [3]:
def rename_cols(df):
    # Return a frame whose columns have been passed through
    # convert_col_names; the original frame is left unmodified.
    mapping = dict(zip(df.columns, convert_col_names(df.columns)))
    return df.rename(columns=mapping)

In [4]:
# Load both FTC exports (paths are relative to the notebook directory)
# and normalize their column names so the frames can be concatenated.
early_data = rename_cols(pd.read_csv("../data/FTC_2011-43014.csv"))
later_data = rename_cols(pd.read_csv("../data/FTC_5114-71014.csv"))

In [5]:
df = pd.concat([early_data, later_data])

In [6]:
df["created_datetime"] = df["created_date"].apply(lambda x: dt.datetime.strptime(str(x), "%m/%d/%Y") if not pd.isnull(x) else None)

In [7]:
df = df.set_index("created_datetime")

In [8]:
df.head()


Out[8]:
company_name company_phone_area_code company_phone_extension company_phone_number complaint_info_initial_contact_method complaint_info_product_service_code consumer_address_city consumer_address_state_code created_date reference_number
created_datetime
2011-01-04 IRS NaN NaN NaN Phone 2143|9043 Waianae HI 01/04/2011 28741989
2011-01-10 Georgia State IRS 770 NaN 9448020 Phone 9043 Marietta GA 01/10/2011 28803726
2011-01-12 IRS 872 NaN 4796242 Phone 2142|9043 Fort Worth TX 01/12/2011 28827558
2011-01-13 IRS 800 NaN 8297650 Phone 9043 Orangeburg SC 01/13/2011 28842181
2011-01-19 IRS|Readers Digest 876| | 4355471| Phone 4190|9043 Morton MN 01/19/2011 28890200

Cleaning Up the "Created Date"s and State Codes

A few of the columns in the data sometimes contain | to delimit multiple options selected on a complaint. To simplify the analysis, we take just the first state in multi-state complaints.


In [9]:
def clean_state_codes(state_name):
    """Return the first non-empty state code from a pipe-delimited value.

    Some complaints list multiple states separated by "|", possibly with
    empty segments (e.g. "|CA|").  Returns None when every segment is
    empty; non-string values (e.g. NaN) pass through unchanged.
    """
    if isinstance(state_name, str) and "|" in state_name:
        # List comprehension instead of filter(): under Python 3,
        # filter() returns an iterator, so the original len(...) and
        # [0] calls on its result would raise TypeError.
        non_null = [code for code in state_name.split("|") if code != ""]
        if non_null:
            return non_null[0]
        return None
    return state_name

In [10]:
df["clean_state"] = df["consumer_address_state_code"].apply(clean_state_codes)

In [11]:
# Note: selecting only complaints through June 2014, the last full month with data
# NOTE(review): .ix and resample(..., how=...) are deprecated in later
# pandas; .loc and .resample("M").count() are the modern equivalents.
ax = df.ix[:"2014-06"]["company_name"].resample("M", how="count").plot()
ax.set_title("Complaints Per Month\n")
pass  # suppress the Axes repr in the cell output


What Types of Communication Are Scammers Using?


In [12]:
df["complaint_info_initial_contact_method"].value_counts()


Out[12]:
Phone Call: Landline       10044
Phone                       2769
Phone Call: Mobile/Cell     2543
Internet/E-mail              226
Mobile: Text/Email/IM         58
Mail                          57
I Initiated Contact           36
Unknown                       34
Internet Web Site             30
In Person                     16
Internet (Other)              11
Wireless                       7
Fax                            3
Print                          3
TV/Radio                       1
dtype: int64

In [13]:
# Coalesce "Phone" types
df["phone"] = df['complaint_info_initial_contact_method'].apply(lambda x: int("Phone" in str(x)))

In [14]:
# Phone-related complaints per month.  how="sum" adds up the 0/1 "phone"
# flags; the original how="count" counted every non-null row (i.e. ALL
# complaints), not just phone-related ones — the sibling area-code-202
# cell below already uses how="sum" for the same flag column.
ax = df["phone"].ix[:"2014-06"].resample("M", how="sum").plot()
ax.set_title("Phone-Related Complaints By Month\n")
pass


Finding the Change Over Time

The trend in the graph is obviously a huge spike. Let's look at all the months for which we have a full set of data available and see just how big the change has been over time.


In [15]:
complaints_by_month = df["reference_number"].resample("MS", how="count")

In [16]:
def first_half_sum(monthly_counts, year):
    # Total complaints for January–June of the given year.  .loc with
    # partial-string slices works on a DatetimeIndex and replaces the
    # deprecated .ix the original four copy-pasted lines used.
    return monthly_counts.loc["{0}-01".format(year): "{0}-06".format(year)].sum()

first_six_months_2011 = first_half_sum(complaints_by_month, 2011)
first_six_months_2012 = first_half_sum(complaints_by_month, 2012)
first_six_months_2013 = first_half_sum(complaints_by_month, 2013)
first_six_months_2014 = first_half_sum(complaints_by_month, 2014)

In [17]:
# Year-over-year growth in complaint volume.  Comparing first halves
# only keeps the partial 2014 data like-for-like with earlier years.
# (Python 2 print statements; float() forces true division.)
print "Pct. Change 2012 vs. 2011: {pct:,.2f}%".format(pct=(first_six_months_2012-first_six_months_2011)/float(first_six_months_2011)*100)
print "Pct. Change 2013 vs. 2012: {pct:,.2f}%".format(pct=(first_six_months_2013-first_six_months_2012)/float(first_six_months_2012)*100)
print "Pct. Change 2014 vs. 2013: {pct:,.2f}%".format(pct=(first_six_months_2014-first_six_months_2013)/float(first_six_months_2013)*100)


Pct. Change 2012 vs. 2011: 5.63%
Pct. Change 2013 vs. 2012: 133.33%
Pct. Change 2014 vs. 2013: 3,529.14%

In [18]:
first_six_months_2014/first_six_months_2013


Out[18]:
36

Where Are These Phone Calls Coming From?

Washington DC's main area code — 202 — accounts for far more phone calls than any other area code.


In [18]:
df['company_phone_area_code'].value_counts().head(20)


Out[18]:
202    6113
415    1549
585     818
888     674
206     416
530     401
347     368
800     332
866     320
716     305
877     150
844     145
571     134
213     124
917     107
876      96
516      89
425      82
646      79
253      79
dtype: int64

In [19]:
top_phone_numbers = pd.DataFrame(df.groupby(['company_phone_area_code', 'company_phone_number']).size(), columns=["complaints"])

In [20]:
top_phone_numbers.sort("complaints", ascending=False).head(20)


Out[20]:
complaints
company_phone_area_code company_phone_number
585 6331814 165
6331792 148
415 2514089 131
2511037 126
800 8291040 103
415 2510522 96
202 2413122 94
5069044 89
2412158 84
2411435 80
415 4848569 78
202 2411375 78
347 5597963 78
202 5069319 75
347 5590378 71
202 2411818 68
5068708 67
2807395 66
530 2385813 64
206 4144646 60

In [21]:
df_202 = df[df["company_phone_area_code"] == "202"]

In [22]:
# Monthly phone-related complaint totals for area code 202
# (how="sum" adds the 0/1 "phone" flags).
ax = df_202.ix[:"2014-06"]["phone"].resample("M", how="sum").plot()
ax.set_title(u"Area Code 202–Related Complaints\n")
pass  # suppress the Axes repr in the cell output



In [23]:
df_202_by_month = df_202["reference_number"].resample("MS", how="count")

In [24]:
# Share of first-half-2014 complaints attributable to area code 202.
# (The outer float() is redundant — the inner division is already
# float — but harmless; Python 2 print statement.)
print "Area code 202 accounted for {0:,.1f}% of all complaints during the first six months of 2014"\
    .format(100 * float(df_202_by_month.ix["2014-01":"2014-06"].sum()/float(first_six_months_2014)))


Area code 202 accounted for 38.9% of all complaints during the first six months of 2014

In [25]:
df['clean_state'].dropna().value_counts().head(20)


Out[25]:
CA    2471
NY    1189
FL     965
TX     942
NJ     663
PA     555
VA     545
IL     532
MA     503
GA     437
OH     414
AZ     413
WA     408
MD     360
MI     348
NC     339
IN     287
CO     280
OR     228
NV     223
dtype: int64

In [26]:
df['consumer_address_city'].value_counts().head(20)


Out[26]:
Las Vegas        130
Houston          112
New York         106
Brooklyn         103
San Jose         100
Los Angeles       95
Phoenix           86
San Diego         83
Chicago           75
San Francisco     71
Charlotte         66
Washington        66
Austin            65
Albuquerque       58
Denver            55
Tucson            53
Seattle           53
Portland          49
Indianapolis      49
Honolulu          48
dtype: int64

Normalizing the 2014 Complaints by State for 2013 Population


In [27]:
# Lookup tables: state name -> abbreviation, and state name -> 2013
# population estimate.  Both are keyed by full state name for the join.
state_abbreviations = pd.read_csv("../data/state_abbreviations.csv").set_index("name")
state_populations = pd.read_csv("../data/state_pop_2013.csv").set_index("name")

In [28]:
state_pop = state_populations.join(state_abbreviations).set_index("abbr")

In [29]:
# 2014 complaints per state, joined to population.  dropna() removes
# state codes with no population match (e.g. territories or bad codes).
# NOTE(review): .ix is deprecated in later pandas; .loc["2014"] is the
# modern equivalent for partial-string indexing on a DatetimeIndex.
df_2014 = df.ix["2014"]
states_2014 = pd.DataFrame(df_2014.groupby(['clean_state']).size(), columns=[ "complaints" ])\
    .join(state_pop)\
    .dropna()

In [30]:
assert(len(states_2014) == 52) # 50 States + DC and PR

In [31]:
states_2014["complaints_per_100k"] = states_2014.apply(lambda x: round(x["complaints"] / float(x["population"]) * 100000, 1), axis=1)

In [32]:
states_2014.sort("complaints_per_100k", ascending=False)


Out[32]:
complaints population complaints_per_100k
clean_state
DE 83 925749 9.0
DC 50 646449 7.7
NV 197 2790136 7.1
HI 97 1404054 6.9
MA 424 6692824 6.3
NJ 555 8899339 6.2
VA 469 8260405 5.7
CA 2150 38332521 5.6
AZ 353 6626624 5.3
NM 107 2085287 5.1
NY 999 19651127 5.1
WA 350 6971406 5.0
OR 197 3930065 5.0
CT 171 3596080 4.8
MD 287 5928814 4.8
CO 241 5268367 4.6
NH 55 1323459 4.2
FL 763 19552860 3.9
ID 61 1612136 3.8
GA 366 9992167 3.7
IN 234 6570902 3.6
IL 446 12882135 3.5
PA 436 12773801 3.4
MN 176 5420380 3.2
ME 43 1328302 3.2
MI 290 9895622 2.9
RI 30 1051511 2.9
TX 770 26448193 2.9
WV 53 1854304 2.9
NC 280 9848060 2.8
OH 320 11570808 2.8
WI 157 5742713 2.7
MT 26 1015165 2.6
MO 149 6044171 2.5
WY 14 582658 2.4
TN 143 6495978 2.2
SC 106 4774839 2.2
KY 97 4395295 2.2
UT 63 2900872 2.2
NE 40 1868516 2.1
AL 97 4833722 2.0
LA 88 4625470 1.9
KS 51 2893957 1.8
AK 13 735132 1.8
IA 54 3090416 1.7
MS 48 2991207 1.6
VT 10 626630 1.6
AR 48 2959373 1.6
SD 11 844877 1.3
OK 45 3850568 1.2
ND 7 723393 1.0
PR 2 3615086 0.1