"Data is messy"
In [1]:
# Load the libraries
import numpy as np
import pandas as pd
In [2]:
# Load the data again!
# Parse the "date" column by name rather than by position (parse_dates=[-1]
# silently breaks if the column order of the CSV ever changes; the column is
# referenced as df['date'] throughout the notebook).
df = pd.read_csv("data/Weed_Price.csv", parse_dates=["date"])
In [3]:
# Count the non-null values in every column — a quick way to see how much
# data is missing in each one
df.count()
Out[3]:
In [4]:
# Boolean mask: True where LowQ is missing (NaN)
df['LowQ'].isnull()
Out[4]:
In [5]:
# The bottom rows show the NaN values directly
df.tail()
Out[5]:
Pandas represents missing values as NaN.
What can we do with missing values?
.dropna(how='any')
.fillna(value=dummy)
.fillna(method='ffill')
.interpolate()
We use the inplace = True
argument to avoid making a copy of the dataframe; it modifies the dataframe itself instead
In [6]:
# Let's sort this data frame by State and date.
# DataFrame.sort() was deprecated in pandas 0.17 and removed in 0.20;
# sort_values() is the drop-in replacement.
df.sort_values(by=['State', 'date'], inplace=True)
In [7]:
# Fill each missing value with the last available (previous) value.
# fillna(method="ffill") is deprecated in modern pandas; ffill() is the
# equivalent, explicit form.
df.ffill(inplace=True)
In [8]:
# Re-count non-null values to confirm the fill worked
# (columns that start with NaN may still have a few missing rows)
df.count()
Out[8]:
In [ ]:
Fill the missing values with the mean for the column.
In [ ]:
In [10]:
# Number of rows recorded for each state
df["State"].value_counts()
Out[10]:
Let's check the dates and see whether they are all continuous
In [58]:
# Two independent copies of the California rows: df1 will be reindexed
# onto a continuous daily range; df2 keeps the original dates for comparison
df1 = df[df.State=='California'].copy()
df2 = df[df.State=='California'].copy()
In [59]:
# (rows, columns) of the California subset
df1.shape
Out[59]:
# Show the span of dates covered by the California data.
# (The original Python 2 print statement is a SyntaxError on Python 3;
# rewritten as print() calls, one line per fact.)
print("Earliest Date:", df1.date.min())
print("Latest Date:", df1.date.max())
print("Number of days in between them:", df1.date.max() - df1.date.min())
In [61]:
# Rows per date — a count > 1 would mean duplicate dates for California
df1.groupby(['date']).size()
Out[61]:
In [62]:
# Index both frames by date so they can be aligned/reindexed by day
df1.set_index("date", inplace=True)
df2.set_index("date", inplace=True)
In [63]:
# Continuous daily range spanning the observed dates
idx = pd.date_range(df1.index.min(), df1.index.max())
In [64]:
idx
Out[64]:
In [65]:
# Reindex onto the continuous range; dates that were absent get rows
# filled with 0 (NOTE(review): 0 is a misleading filler for price
# columns — leaving NaN and forward-filling may be preferable)
df1 = df1.reindex(idx, fill_value=0)
In [66]:
# Row count should now equal the number of days in the full range
df1.shape
Out[66]:
Exercise Show the list of dates that were missing. Hint Leverage df2. Both df1 and df2 have date
as index
In [ ]:
In [68]:
# Derive calendar features from the parsed date column via the Series .dt
# accessor (idiomatic; avoids building a fresh DatetimeIndex four times).
# DatetimeIndex/Series.dt.week was removed in pandas 2.0 —
# isocalendar().week is the replacement (ISO week numbers, same values,
# returned as a nullable UInt32 dtype).
df['year'] = df['date'].dt.year
df['month'] = df['date'].dt.month
df['week'] = df['date'].dt.isocalendar().week
df['weekday'] = df['date'].dt.weekday
In [69]:
# Confirm the dtypes of the newly added calendar columns
df.dtypes
Out[69]:
In [70]:
df.tail()
Out[70]:
In [71]:
# Rows per year
df['year'].value_counts()
Out[71]:
In [72]:
# Rows per month
df['month'].value_counts()
Out[72]:
In [73]:
# Rows per weekday (pandas convention: Monday = 0)
df["weekday"].value_counts()
Out[73]:
In [74]:
df_mean = df.groupby("State", as_index=False).mean()
In [75]:
# One row per state
df_mean.shape
Out[75]:
In [76]:
df_mean.head()
Out[76]:
Pivot Table
In [77]:
df.head()
Out[77]:
In [78]:
# Average HighQ price per state (rows) broken down by weekday (columns)
pd.pivot_table(df, values='HighQ', index=['State'], columns=["weekday"] )
Out[78]:
Exercise Get a cross tabulation: for each state, for each month, get the prices for the weekday as shown in the output
In [79]:
Out[79]:
In [81]:
# Load the state-level demographic data (one row per region/state)
df_demo = pd.read_csv("data/Demographics_State.csv")
In [82]:
df_demo.head()
Out[82]:
In [83]:
df_demo["percent_other"] = 100 - df_demo["percent_white"] - df_demo["percent_black"] - df_demo["percent_asian"] - df_demo["percent_hispanic"]
In [84]:
# Inspect the new percent_other column
df_demo.head()
Out[84]:
Exercise Express median rent as a proportion of California's median rent (compute it as a new column)
In [ ]:
In [23]:
# Rename the column "region" to "State" so it matches the price data
df_demo = df_demo.rename(columns={'region':'State'})
In [24]:
df_demo.head()
Out[24]:
In [26]:
# We can now merge Demographic and Price mean data into one single data frame
# (inner join on State — but note the case mismatch examined just below)
df_merge = pd.merge(df_mean, df_demo, how='inner', on='State')
df_merge.head()
Out[26]:
What happened? Why is there no data in the dataframe?
In [27]:
# Lower-case the State names in df_mean so the join keys actually match
# (df_demo uses lower-case state names — hence the empty merge above)
df_mean['State'] = df_mean['State'].str.lower()
In [28]:
df_mean.head()
Out[28]:
In [29]:
# Re-run the merge now that the keys agree
df_merge = pd.merge(df_mean, df_demo, how='inner', on='State')
In [30]:
df_merge.head()
Out[30]:
In [31]:
# Filter rows for the state of California
df_cal = df[df["State"] == "California"]
In [32]:
df_cal.shape
Out[32]:
In [33]:
# Filter rows for the year 2014
df_2014 = df[df["year"] == 2014]
In [34]:
df_2014.shape
Out[34]:
In [35]:
# Combine both conditions: California rows from 2014 only
df_cal_2014 = df[(df["year"] == 2014) & (df["State"] == "California")]
In [36]:
df_cal_2014.shape
Out[36]:
In [37]:
df_cal_2014.head()
Out[37]:
Exercise Find the % of hispanic population for the state with the max white population (use df_demo)
In [ ]:
In [38]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()
Out[38]:
We can also use convenience functions like sum(), count(), mean() etc. to calculate these
In [39]:
df.HighQ.mean()
Out[39]:
In [40]:
# Let's compute the mean the hard way: sum / count
df.HighQ.sum()
Out[40]:
In [41]:
df.HighQ.count()
Out[41]:
In [42]:
# Should match df.HighQ.mean() above
df.HighQ.sum()/df.HighQ.count()
Out[42]:
In [43]:
df.HighQ.median()
Out[43]:
In [46]:
# IPython help: display the docstring for DataFrame.sample
?df.sample
In [51]:
# 50 California rows sampled with replacement; random_state makes the
# sample reproducible across runs
df_ca_sample = df[df.State=='California'].sample(n = 50, replace = True, random_state=123)
In [57]:
# Sampling with replacement can pick the same row twice — flag duplicates
df_ca_sample.duplicated()
Out[57]:
In [58]:
# Inspect one sampled row by its original index label
# (NOTE(review): 8572 is a hard-coded label from this particular sample)
df_ca_sample.loc[8572]
Out[58]:
In [69]:
# Overwrite the first cell (row 0, column 0) of the sample in place
df_ca_sample.iat[0, 0] = "Cal"
In [71]:
df_ca_sample.head()
Out[71]:
But watch what happens when you assign the dataframe to a new name instead of copying it
In [74]:
# Plain assignment does NOT copy — df_ca_sample2 is just another name for
# the same dataframe object
df_ca_sample2 = df_ca_sample
In [77]:
# Mutating through the alias...
df_ca_sample2.iat[0, 0] = "CA"
df_ca_sample2.head()
Out[77]:
In [78]:
# ...also changes the original, since both names point at one object
df_ca_sample.head()
Out[78]:
Fix the issue
In [79]:
# .copy() creates an independent dataframe
df_ca_sample3 = df_ca_sample2.copy()
In [80]:
df_ca_sample3.head()
Out[80]:
In [81]:
# Mutating the copy...
df_ca_sample3.iat[0, 0] = "CALIFORNIA"
df_ca_sample3.head()
Out[81]:
In [82]:
# ...leaves the original untouched this time
df_ca_sample2.head()
Out[82]:
In [ ]: