"Data is messy"
In [1]:
# Load the libraries
import numpy as np
import pandas as pd
In [2]:
# Load the data again!
df = pd.read_csv("data/Weed_Price.csv", parse_dates=[-1])
In [3]:
# Lets start the count to seeing about missing data
df.count()
Out[3]:
In [4]:
df['LowQ'].isnull()
Out[4]:
In [5]:
# We can see the bottom rows which have NaN values
df.tail()
Out[5]:
Pandas will represent missing value by NaN
What can we do this with missing value?
.dropna(how='any')
.fillna(value=dummy)
.fillna(method='ffill')
.interpolate()
We use the inplace = True
operator to avoid making a copy of the dataframe and changing the dataframe itself
In [6]:
# Lets sort this data frame by State and Date
df.sort(columns=['State','date'], inplace=True)
In [7]:
# Lets fill the missing value with last available value
df.fillna(method = "ffill", inplace=True)
In [8]:
df.count()
Out[8]:
In [ ]:
Fill the missing values with the mean for the column.
In [ ]:
In [9]:
df["State"].value_counts()
Out[9]:
Lets check the dates and see if they are all continuous
In [10]:
df1 = df[df.State=='California'].copy()
df2 = df[df.State=='California'].copy()
In [11]:
df1.shape
Out[11]:
In [12]:
print "Earliest Date:",df1.date.min(), "\n", "Latest Date:",df1.date.max(), "\n", \
"Number of days in between them:", df1.date.max() - df1.date.min()
In [13]:
df1.groupby(['date']).size()
Out[13]:
In [14]:
df1.set_index("date", inplace=True)
df2.set_index("date", inplace=True)
In [15]:
idx = pd.date_range(df1.index.min(), df1.index.max())
In [16]:
idx
Out[16]:
In [17]:
df1 = df1.reindex(idx, fill_value=0)
In [18]:
df1.shape
Out[18]:
Exercise Show the list of dates that were missing. Hint Leverage df2. Both df1 and df2 have date
as index
In [ ]:
In [19]:
df['year'] = pd.DatetimeIndex(df['date']).year
df['month'] = pd.DatetimeIndex(df['date']).month
df['week'] = pd.DatetimeIndex(df['date']).week
df['weekday'] = pd.DatetimeIndex(df['date']).weekday
In [20]:
df.dtypes
Out[20]:
In [21]:
df.tail()
Out[21]:
In [22]:
df['year'].value_counts()
Out[22]:
In [23]:
df['month'].value_counts()
Out[23]:
In [24]:
df["weekday"].value_counts()
Out[24]:
In [25]:
df_mean = df.groupby("State", as_index=False).mean()
In [26]:
df_mean.shape
Out[26]:
In [27]:
df_mean.head()
Out[27]:
Pivot Table
In [28]:
df.head()
Out[28]:
In [29]:
pd.pivot_table(df, values='HighQ', index=['State'], columns=["weekday"] )
Out[29]:
Exercise Get a cross tabulation: for each state, for each month, get the prices for the weekday as shown in the output
In [ ]:
In [30]:
df_demo = pd.read_csv("data/Demographics_State.csv")
In [31]:
df_demo.head()
Out[31]:
In [32]:
df_demo["percent_other"] = 100 - df_demo["percent_white"] - df_demo["percent_black"] - df_demo["percent_asian"] - df_demo["percent_hispanic"]
In [ ]:
df_demo.head()
Exercise Express median rent as a proportion of california's median rent (Compute it as a new column)
In [ ]:
In [ ]:
# Let us change the column name region to State
df_demo = df_demo.rename(columns={'region':'State'})
In [ ]:
df_demo.head()
In [ ]:
# We can now merge Demographic and Price mean data into one single data frame
df_merge = pd.merge(df_mean, df_demo, how='inner', on='State')
df_merge.head()
What happened? Why is there no data in the dataframe?
In [ ]:
# Change the State in df_mean to lower case
df_mean['State'] = df_mean['State'].str.lower()
In [ ]:
df_mean.head()
In [ ]:
# We can now merge Demographic and Price mean data into one single data frame
df_merge = pd.merge(df_mean, df_demo, how='inner', on='State')
In [ ]:
df_merge.head()
In [ ]:
# Filter data for location California
df_cal = df[df["State"] == "California"]
In [ ]:
df_cal.shape
In [ ]:
# Filter data for year
df_2014 = df[df["year"] == 2014]
In [ ]:
df_2014.shape
In [ ]:
df_cal_2014 = df[(df["year"] == 2014) & (df["State"] == "California")]
In [ ]:
df_cal_2014.shape
In [ ]:
df_cal_2014.head()
Exercise Find the % of hispanic population for the state with max white population (use df_demo
)
In [ ]:
In [ ]:
df.describe()
We can also use convenience functions like sum(), count(), mean() etc. to calculate these
In [ ]:
df.HighQ.mean()
In [ ]:
# Lets do this the hard way
df.HighQ.sum()
In [ ]:
df.HighQ.count()
In [ ]:
df.HighQ.sum()/df.HighQ.count()
In [ ]:
df.HighQ.median()
In [ ]:
?df.sample
In [ ]:
df_ca_sample = df[df.State=='California'].sample(n = 50, replace = True, random_state=123)
In [ ]:
df_ca_sample.duplicated()
In [ ]:
df_ca_sample.loc[8572]
In [ ]:
df_ca_sample.iat[0, 0] = "Cal"
In [ ]:
df_ca_sample.head()
But changes when you copy
In [ ]:
df_ca_sample2 = df_ca_sample
In [ ]:
df_ca_sample2.iat[0, 0] = "CA"
df_ca_sample2.head()
In [ ]:
df_ca_sample.head()
Fix the issue
In [ ]:
df_ca_sample3 = df_ca_sample2.copy()
In [ ]:
df_ca_sample3.head()
In [ ]:
df_ca_sample3.iat[0, 0] = "CALIFORNIA"
df_ca_sample3.head()
In [ ]:
df_ca_sample2.head()
In [ ]: