In [1]:
!pip install matplotlib
In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import dateutil.parser
First, I made a mistake naming the data set! It's 2015 data, not 2014 data. But yes, still use
311-2014.csv. You can rename it.
Import your data, but only the first 200,000 rows. You'll also want to change the index to be a datetime based on the Created Date column - you'll want to check if it's already a datetime, and parse it if not.
In [3]:
#df = pd.read_csv("small-311-2015.csv")
df = pd.read_csv("311-2014.csv", nrows=200000)
df.head(2)
Out[3]:
In [4]:
df.info()
In [5]:
def parse_date (str_date):
return dateutil.parser.parse(str_date)
df['created_dt']= df['Created Date'].apply(parse_date)
df.head(3)
Out[5]:
In [6]:
df.info()
What was the most popular type of complaint, and how many times was it filed?
In [6]:
df["Complaint Type"].value_counts().head(1)
Out[6]:
Make a horizontal bar graph of the top 5 most frequent complaint types.
In [63]:
df["Complaint Type"].value_counts().head(5).sort_values().plot(kind='barh')
Out[63]:
Which borough has the most complaints per capita? Since it's only 5 boroughs, you can do the math manually.
In [8]:
df["Borough"].value_counts()
Out[8]:
In [9]:
people_bronx= 1438159
people_queens= 2321580
people_manhattan=1636268
people_brooklyn= 2621793
people_staten_island= 473279
In [10]:
complaints_per_capita_bronx= 29610/people_bronx
complaints_per_capita_bronx
Out[10]:
In [11]:
complaints_per_capita_queens=46824/people_queens
complaints_per_capita_queens
Out[11]:
In [12]:
complaints_per_capita_manhattan=42050/people_manhattan
complaints_per_capita_manhattan
Out[12]:
In [13]:
complaints_per_capita_staten_island=473279/people_staten_island
complaints_per_capita_staten_island
Out[13]:
In [14]:
complaints_per_capita_brooklyn=2621793/people_brooklyn
complaints_per_capita_brooklyn
Out[14]:
According to your selection of data, how many cases were filed in March? How about May?
In [15]:
df.index = df['created_dt']
#del df['Created Date']
df.head()
Out[15]:
In [16]:
print("There were", len(df['2015-03']), "cases filed in March")
In [17]:
print("There were", len(df['2015-05']), "cases filed in May")
I'd like to see all of the 311 complaints called in on April 1st.
Surprise! We couldn't do this in class, but it was just a limitation of our data set
In [18]:
df['2015-04-01']
Out[18]:
What was the most popular type of complaint on April 1st?
What were the most popular three types of complaint on April 1st
In [19]:
df['2015-04-01']['Complaint Type'].value_counts().head(3)
Out[19]:
In [20]:
df.info()
In [ ]:
What month has the most reports filed? How many? Graph it.
In [21]:
df.resample('M').count()
Out[21]:
In [22]:
df.resample('M').index[0]
Out[22]:
In [23]:
import numpy as np
np.__version__
Out[23]:
In [24]:
df.resample('M').count().plot(y="Unique Key")
Out[24]:
In [25]:
ax= df.groupby(df.index.month).count().plot(y='Unique Key', legend=False)
ax.set_xticks([1,2,3,4,5,6,7,8,9,10,11, 12])
ax.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
ax.set_ylabel("Number of Complaints")
ax.set_title("311 complains in 2015")
Out[25]:
What week of the year has the most reports filed? How many? Graph the weekly complaints.
In [64]:
#df.resample('W').count().head(5)
df.resample('W').count().plot(y="Unique Key", color= "purple")
Out[64]:
Noise complaints are a big deal. Use .str.contains to select noise complaints, and make an chart of when they show up annually. Then make a chart about when they show up every day (cyclic).
In [27]:
df[df['Complaint Type'].str.contains("Noise")].head()
Out[27]:
In [28]:
noise_df= df[df['Complaint Type'].str.contains("Noise")]
In [29]:
noise_graph= noise_df.groupby(noise_df.index.month).count().plot(y='Unique Key', legend=False)
noise_graph.set_xticks([1,2,3,4,5,6,7,8,9,10,11, 12])
noise_graph.set_xticklabels(['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
noise_graph.set_ylabel("Number of Noise Complaints")
noise_graph.set_title("311 noise complains in 2015")
Out[29]:
In [65]:
noise_df.groupby(by=noise_df.index.hour)['Unique Key'].count().plot()
Out[65]:
In [30]:
noise_graph= noise_df.groupby(noise_df.index.dayofweek).count().plot(y='Unique Key', legend=False)
noise_graph.set_xticks([1,2,3,4,5,6,7])
noise_graph.set_xticklabels(['Mon', 'Tues', 'Wed', 'Thur', 'Fri', 'Sat', 'Sun'])
noise_graph.set_ylabel("Number of Noise Complaints")
noise_graph.set_title("311 noise complains in 2015")
Out[30]:
Which were the top five days of the year for filing complaints? How many on each of those days? Graph it.
In [31]:
daily_count= df['Unique Key'].resample('D').count().sort_values(ascending=False)
top_5_days= daily_count.head(5)
top_5_days
Out[31]:
In [32]:
ax = top_5_days.plot(kind='bar') # I dont know how to put names to the labels
ax.set_title("Top 5 days")
ax.set_xlabel("Day")
ax.set_ylabel("Complaints")
Out[32]:
What hour of the day are the most complaints? Graph a day of complaints.
In [33]:
hour_graph= df.groupby(df.index.hour).count().plot(y='Unique Key', legend=False)
hour_graph.set_xticks([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23])
hour_graph.set_title("A day of complaints")
hour_graph.set_xlabel("Hours")
hour_graph.set_ylabel("Complaints")
Out[33]:
One of the hours has an odd number of complaints. What are the most common complaints at that hour, and what are the most common complaints the hour before and after?
In [34]:
twelve_am_complaints= df[df.index.hour <1]
In [35]:
twelve_am_complaints.head()
Out[35]:
In [36]:
twelve_am_complaints['Complaint Type'].value_counts().head(5)
Out[36]:
In [37]:
one_am_complaints= df[df.index.hour == 1]
In [38]:
one_am_complaints['Complaint Type'].value_counts().head(5)
Out[38]:
In [39]:
eleven_pm_complaints= df[df.index.hour == 23]
eleven_pm_complaints['Complaint Type'].value_counts().head(5)
Out[39]:
So odd. What's the per-minute breakdown of complaints between 12am and 1am? You don't need to include 1am.
In [40]:
twelve_am_complaints.groupby(twelve_am_complaints.index.minute).count()
Out[40]:
Looks like midnight is a little bit of an outlier. Why might that be? Take the 5 most common agencies and graph the times they file reports at (all day, not just midnight).
In [41]:
df['Agency'].value_counts().head(5)
Out[41]:
In [42]:
df_NYPD = df[df['Agency'] == 'NYPD']
In [43]:
df_HPD = df[df['Agency'] == 'HPD']
In [44]:
df_DOT = df[df['Agency'] == 'DOT']
In [45]:
df_DPR= df[df['Agency'] == 'DPR']
In [46]:
df_DOHMH= df[df['Agency'] == 'DOHMH']
In [47]:
all_graph = df_NYPD.groupby(by= df_NYPD.index.hour).count().plot(y='Unique Key', label='NYPD complaints')
all_graph.set_xticks([0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23])
all_graph.set_title("A day of complaints by the top 5 agencies")
all_graph.set_xlabel("Hours")
all_graph.set_ylabel("Complaints")
df_HPD.groupby(by= df_HPD.index.hour).count().plot(y='Unique Key', ax=all_graph , label='HPD complaints')
df_DOT.groupby(by= df_DOT.index.hour).count().plot(y='Unique Key', ax=all_graph , label='DOT complaints')
df_DPR.groupby(by= df_DPR.index.hour).count().plot(y='Unique Key', ax=all_graph , label='DPR complaints')
df_DOHMH.groupby(by= df_DOHMH.index.hour).count().plot(y='Unique Key', ax=all_graph , label='DOHMH complaints')
Out[47]:
Graph those same agencies on an annual basis - make it weekly. When do people like to complain? When does the NYPD have an odd number of complaints?
In [48]:
all_graph = df_NYPD.groupby(by= df_NYPD.index.weekofyear).count().plot(y='Unique Key', label='NYPD complaints')
#all_graph.set_xticks([1,50])
all_graph.set_title("A year of complaints by the top 5 agencies")
all_graph.set_xlabel("Weeks")
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
df_HPD.groupby(by= df_HPD.index.week).count().plot(y='Unique Key', ax=all_graph , label='HPD complaints')
df_DOT.groupby(by= df_DOT.index.hour).count().plot(y='Unique Key', ax=all_graph , label='DOT complaints')
df_DPR.groupby(by= df_DPR.index.hour).count().plot(y='Unique Key', ax=all_graph , label='DPR complaints')
df_DOHMH.groupby(by= df_DOHMH.index.hour).count().plot(y='Unique Key', ax=all_graph , label='DOHMH complaints')
plt.legend(bbox_to_anchor=(0, 1), loc='best', ncol=1)
print("""May and June are the months with more complaints, followed by October, November and December.
In May the NYPD and HPD have an odd number of complaints""")
Maybe the NYPD deals with different issues at different times? Check the most popular complaints in July and August vs the month of May. Also check the most common complaints for the Housing Preservation Bureau (HPD) in winter vs. summer.
In [49]:
August_July = df["2015-07":"2015-08"]
August_July_complaints = August_July['Complaint Type'].value_counts().head(5)
August_July_complaints
Out[49]:
In [50]:
May = df['2015-05']
May_complaints= May['Complaint Type'].value_counts().head(5)
May_complaints
Out[50]:
In [51]:
# August_July_vs_May= August_July_complaints.plot(y='Unique Key', label='August - July complaints')
# August_July_vs_May.set_ylabel("Number of Complaints")
# August_July_vs_May.set_title("August-July vs May Complaints")
# May['Complaint Type'].value_counts().head(5).plot(y='Unique Key', ax=August_July_vs_May, label='May complaints')
# August_July_vs_May.set_xticks([1,2,3,4,5])
# August_July_vs_May.set_xticklabels(['Illegal Parking', 'Blocked Driveway', 'Noise - Street/Sidewalk', 'Street Condition', 'Noise - Commercial'])
In [52]:
#Most popular complaints of the HPD
df_HPD['Complaint Type'].value_counts().head(5)
Out[52]:
In [79]:
summer_complaints= df_HPD["2015-06":"2015-08"]['Complaint Type'].value_counts().head(5)
summer_complaints
Out[79]:
In [80]:
winter_complaints= df_HPD["2015-01":"2015-02"]['Complaint Type'].value_counts().head(5)
winter_complaints
Out[80]:
In [82]:
winter_complaints_dec= df_HPD["2015-12"]['Complaint Type'].value_counts().head(5)
winter_complaints_dec
Out[82]:
In [83]:
winter_results= df_HPD["2015-12"]['Complaint Type'].value_counts() + df_HPD["2015-01":"2015-02"]['Complaint Type'].value_counts()
In [84]:
winter_results
Out[84]:
In [ ]: