In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split
from sklearn import linear_model
from sklearn.preprocessing import OneHotEncoder

Reading CSV


In [2]:
csv_df = pd.DataFrame.from_csv("marijuana-street-price-clean.csv",header=0,index_col=False,sep=',',parse_dates=[-1])

In [3]:
#csv_df.head

In [4]:
csv_df_sort = csv_df.sort(columns=['State','date'])

In [5]:
# let us count the number of non-null entries in each column

In [6]:
csv_df_sort.count()


Out[6]:
State     22899
HighQ     22899
HighQN    22899
MedQ      22899
MedQN     22899
LowQ      12342
LowQN     22899
date      22899
dtype: int64

In [7]:
csv_df_sort.dtypes


Out[7]:
State             object
HighQ            float64
HighQN             int64
MedQ             float64
MedQN              int64
LowQ             float64
LowQN              int64
date      datetime64[ns]
dtype: object

Preprocessing by filling in missing data


In [8]:
# we see that LowQ has fewer non-null entries than the other columns, a common problem in time series analysis
# let's fill the NaN values with the last known value (forward fill) so that we have a continuous series to work with

In [9]:
csv_df_sort_fillna_ffil = csv_df_sort.fillna(method='ffill')
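
Since the frame is sorted by State and then date, a plain forward fill can carry the last known LowQ price of one state into the first missing rows of the next state. A per-state forward fill avoids that; a minimal sketch (hypothetical, not used in the run below):

# fill LowQ gaps within each state only, so values never leak across state boundaries
csv_df_sort_ffill_by_state = csv_df_sort.copy()
csv_df_sort_ffill_by_state['LowQ'] = csv_df_sort_ffill_by_state.groupby('State')['LowQ'].transform(lambda s: s.fillna(method='ffill'))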

In [10]:
csv_df_sort_fillna_ffil.count()


Out[10]:
State     22899
HighQ     22899
HighQN    22899
MedQ      22899
MedQN     22899
LowQ      22899
LowQN     22899
date      22899
dtype: int64

In [11]:
# now we have a clean data set to work with

In [12]:
global_mean_HighQ = csv_df_sort_fillna_ffil['HighQ'].mean()

In [13]:
global_mean_HighQ


Out[13]:
329.75985414210226

In [14]:
# now that we have the global mean, we can see how far each state's mean deviates from it
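
A minimal sketch of that comparison (hypothetical, not part of the cells below):

# per-state mean HighQ price and its deviation from the global mean
state_mean_HighQ = csv_df_sort_fillna_ffil.groupby('State')['HighQ'].mean()
deviation_from_global = state_mean_HighQ - global_mean_HighQ
print deviation_from_global.head()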

In [15]:
print type(csv_df_sort_fillna_ffil.head())


<class 'pandas.core.frame.DataFrame'>

In [16]:
#group_by_state.get_group(('Alabama'))
# to check the values for a given group
# now let's use the agg function to generate the mean, median, mode, standard deviation, variance and covariance for every state
# (note: the covariance of a single column with itself is just its variance, which is why HighQ_Covar equals HighQ_Var below)

In [17]:
basic_measures_state_wise_df = pd.DataFrame()

In [18]:
# let's get all the unique states in the data frame and construct the measures for each state
# straightforward per-state loop, kept commented out for reference; the groupby approach below is used instead
'''
unique_states = pd.unique(csv_df_sort_fillna_ffil['State'].ravel())
for state in unique_states:
    state_df = csv_df_sort_fillna_ffil.loc[csv_df_sort_fillna_ffil['State'] == state]
    highq_df = state_df.groupby(['State'])['HighQ'].agg([np.mean,np.median,np.std,np.var,np.cov])
    highq_df.head()
    lowq_df = state_df.groupby(['State'],)['LowQ'].agg([np.mean,np.median,np.std,np.var,np.cov])
    highq_df = highq_df.rename(columns={'mean':'HighQ_Mean','median':'HighQ_Median','std':'HighQ_Std','var':'HighQ_Var','cov':'HighQ_Cov'})
    lowq_df = lowq_df.rename(columns={'mean':'LowQ_Mean','median':'LowQ_Median','std':'LowQ_Std','var':'LowQ_Var','cov':'LowQ_Cov'})
    highq_df['State'] = state
    lowq_df['State'] = state
    break
'''


Out[18]:
"\nunique_states = pd.unique(csv_df_sort_fillna_ffil['State'].ravel())\nfor state in unique_states:\n    state_df = csv_df_sort_fillna_ffil.loc[csv_df_sort_fillna_ffil['State'] == state]\n    highq_df = state_df.groupby(['State'])['HighQ'].agg([np.mean,np.median,np.std,np.var,np.cov])\n    highq_df.head()\n    lowq_df = state_df.groupby(['State'],)['LowQ'].agg([np.mean,np.median,np.std,np.var,np.cov])\n    highq_df = highq_df.rename(columns={'mean':'HighQ_Mean','median':'HighQ_Median','std':'HighQ_Std','var':'HighQ_Var','cov':'HighQ_Cov'})\n    lowq_df = lowq_df.rename(columns={'mean':'LowQ_Mean','median':'LowQ_Median','std':'LowQ_Std','var':'LowQ_Var','cov':'LowQ_Cov'})\n    highq_df['State'] = state\n    lowq_df['State'] = state\n    break\n"

In [19]:
csv_df_sort_fillna_ffil.head()


Out[19]:
State HighQ HighQN MedQ MedQN LowQ LowQN date
20094 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-27
20859 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-28
21573 Alabama 339.75 1036 198.26 929 149.49 123 2013-12-29
22287 Alabama 339.75 1036 198.81 930 149.49 123 2013-12-30
22797 Alabama 339.42 1040 198.68 932 149.49 123 2013-12-31

Generating statistical measures


In [20]:
statistics_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['State'],as_index=False).aggregate\
                             ({'HighQ': {'HighQ_Mean': np.mean,'HighQ_Median':np.median,'HighQ_Mode':stats.mode,'HighQ_Std':np.std,'HighQ_Var':np.var,'HighQ_Covar':np.cov},\
                               'LowQ':{'LowQ_Mean':np.mean,'LowQ_Median':np.median,'LowQ_Mode':stats.mode,'LowQ_Std':np.std,'LowQ_Var':np.var,'LowQ_Covar':np.cov}}))
print type(statistics_df)
statistics_df = statistics_df.rename(columns={'':'State'})
print type(statistics_df)
statistics_df.columns = statistics_df.columns.droplevel(0)
statistics_df.head()


<class 'pandas.core.frame.DataFrame'>
<class 'pandas.core.frame.DataFrame'>
Out[20]:
State HighQ_Std HighQ_Mode HighQ_Covar HighQ_Var HighQ_Mean HighQ_Median LowQ_Std LowQ_Covar LowQ_Median LowQ_Mode LowQ_Mean LowQ_Var
0 Alabama 1.539724 ([337.32], [10.0]) 2.370749 2.370749 339.561849 340.10 2.073186 4.298099 144.98 ([144.98], [216.0]) 145.978508 4.298099
1 Alaska 5.580105 ([285.65], [19.0]) 31.137567 31.137567 291.482004 289.81 13.188305 173.931381 403.33 ([403.33], [237.0]) 394.653964 173.931381
2 Arizona 2.015750 ([300.37], [10.0]) 4.063247 4.063247 300.667483 300.46 3.961371 15.692461 185.78 ([185.78], [213.0]) 188.500134 15.692461
3 Arkansas 6.866002 ([349.29], [11.0]) 47.141986 47.141986 348.056147 348.21 1.172681 1.375181 126.10 ([126.1], [219.0]) 126.771269 1.375181
4 California 1.727046 ([245.03], [8.0]) 2.982686 2.982686 245.376125 245.31 1.598252 2.554409 188.60 ([188.6], [211.0]) 189.783586 2.554409
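
The *_Mode columns above hold the raw return value of stats.mode, a pair of (modal values, counts); for Alabama, for instance, 337.32 is the most frequently reported HighQ price (10 occurrences). A small sketch of pulling the pieces apart:

alabama_highq = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == 'Alabama']['HighQ']
modes, counts = stats.mode(alabama_highq)
print modes[0], counts[0]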

Reading demographics and population data


In [21]:
#let us now read demographics data into a dataframe

In [22]:
demographics_df = pd.DataFrame.from_csv("Demographics_State.csv",header=0,index_col=False,sep=',')

In [23]:
demographics_df.head()


Out[23]:
region total_population percent_white percent_black percent_asian percent_hispanic per_capita_income median_rent median_age
0 alabama 4799277 67 26 1 4 23680 501 38.1
1 alaska 720316 63 3 5 6 32651 978 33.6
2 arizona 6479703 57 4 3 30 25358 747 36.3
3 arkansas 2933369 74 15 1 7 22170 480 37.5
4 california 37659181 40 6 13 38 29527 1119 35.4

In [24]:
demographics_df.shape


Out[24]:
(51, 9)

In [25]:
demographics_df.columns


Out[25]:
Index([u'region', u'total_population', u'percent_white', u'percent_black',
       u'percent_asian', u'percent_hispanic', u'per_capita_income',
       u'median_rent', u'median_age'],
      dtype='object')

In [26]:
#let us now read population data into a dataframe

In [27]:
population_df = pd.DataFrame.from_csv("Population_State.csv",header=0,index_col=False,sep=",")

In [28]:
population_df.head()


Out[28]:
region value
0 alabama 4777326
1 alaska 711139
2 arizona 6410979
3 arkansas 2916372
4 california 37325068

In [29]:
population_df.shape


Out[29]:
(51, 2)

In [30]:
population_df.describe()


Out[30]:
value
count 51.000000
mean 6061543.352941
std 6838088.582209
min 562803.000000
25% 1697554.500000
50% 4340167.000000
75% 6649654.500000
max 37325068.000000

In [31]:
population_df.columns


Out[31]:
Index([u'region', u'value'], dtype='object')

Merging statistics, demographic and population data


In [32]:
# we now merge the demographic and population data into a single data frame
# to do this, we use pandas' merge method, which lets us join two data frames much as we would in SQL
# we use an inner join on the column 'region'

In [33]:
population_demographic_merge_df = pd.merge(demographics_df,population_df,how='inner',on='region')

In [34]:
population_demographic_merge_df.head()


Out[34]:
region total_population percent_white percent_black percent_asian percent_hispanic per_capita_income median_rent median_age value
0 alabama 4799277 67 26 1 4 23680 501 38.1 4777326
1 alaska 720316 63 3 5 6 32651 978 33.6 711139
2 arizona 6479703 57 4 3 30 25358 747 36.3 6410979
3 arkansas 2933369 74 15 1 7 22170 480 37.5 2916372
4 california 37659181 40 6 13 38 29527 1119 35.4 37325068

In [35]:
population_demographic_merge_df.shape


Out[35]:
(51, 10)

In [36]:
population_demographic_merge_df = population_demographic_merge_df.rename(columns={'region':'State'})

In [37]:
population_demographic_merge_df.head()


Out[37]:
State total_population percent_white percent_black percent_asian percent_hispanic per_capita_income median_rent median_age value
0 alabama 4799277 67 26 1 4 23680 501 38.1 4777326
1 alaska 720316 63 3 5 6 32651 978 33.6 711139
2 arizona 6479703 57 4 3 30 25358 747 36.3 6410979
3 arkansas 2933369 74 15 1 7 22170 480 37.5 2916372
4 california 37659181 40 6 13 38 29527 1119 35.4 37325068

In [38]:
statistics_population_demographic_merge_df = pd.merge(population_demographic_merge_df,statistics_df,how='inner',on='State')

In [39]:
statistics_population_demographic_merge_df.head()


Out[39]:
State total_population percent_white percent_black percent_asian percent_hispanic per_capita_income median_rent median_age value ... HighQ_Covar HighQ_Var HighQ_Mean HighQ_Median LowQ_Std LowQ_Covar LowQ_Median LowQ_Mode LowQ_Mean LowQ_Var

0 rows × 22 columns


In [40]:
# the merge returned no rows because the state names in the population/demographic df are lower case
# let us now convert the state names in the statistics df to lower case too

In [41]:
statistics_df.head()


Out[41]:
State HighQ_Std HighQ_Mode HighQ_Covar HighQ_Var HighQ_Mean HighQ_Median LowQ_Std LowQ_Covar LowQ_Median LowQ_Mode LowQ_Mean LowQ_Var
0 Alabama 1.539724 ([337.32], [10.0]) 2.370749 2.370749 339.561849 340.10 2.073186 4.298099 144.98 ([144.98], [216.0]) 145.978508 4.298099
1 Alaska 5.580105 ([285.65], [19.0]) 31.137567 31.137567 291.482004 289.81 13.188305 173.931381 403.33 ([403.33], [237.0]) 394.653964 173.931381
2 Arizona 2.015750 ([300.37], [10.0]) 4.063247 4.063247 300.667483 300.46 3.961371 15.692461 185.78 ([185.78], [213.0]) 188.500134 15.692461
3 Arkansas 6.866002 ([349.29], [11.0]) 47.141986 47.141986 348.056147 348.21 1.172681 1.375181 126.10 ([126.1], [219.0]) 126.771269 1.375181
4 California 1.727046 ([245.03], [8.0]) 2.982686 2.982686 245.376125 245.31 1.598252 2.554409 188.60 ([188.6], [211.0]) 189.783586 2.554409

In [42]:
statistics_df['State'] = statistics_df['State'].str.lower()
statistics_df.head()


Out[42]:
State HighQ_Std HighQ_Mode HighQ_Covar HighQ_Var HighQ_Mean HighQ_Median LowQ_Std LowQ_Covar LowQ_Median LowQ_Mode LowQ_Mean LowQ_Var
0 alabama 1.539724 ([337.32], [10.0]) 2.370749 2.370749 339.561849 340.10 2.073186 4.298099 144.98 ([144.98], [216.0]) 145.978508 4.298099
1 alaska 5.580105 ([285.65], [19.0]) 31.137567 31.137567 291.482004 289.81 13.188305 173.931381 403.33 ([403.33], [237.0]) 394.653964 173.931381
2 arizona 2.015750 ([300.37], [10.0]) 4.063247 4.063247 300.667483 300.46 3.961371 15.692461 185.78 ([185.78], [213.0]) 188.500134 15.692461
3 arkansas 6.866002 ([349.29], [11.0]) 47.141986 47.141986 348.056147 348.21 1.172681 1.375181 126.10 ([126.1], [219.0]) 126.771269 1.375181
4 california 1.727046 ([245.03], [8.0]) 2.982686 2.982686 245.376125 245.31 1.598252 2.554409 188.60 ([188.6], [211.0]) 189.783586 2.554409

In [43]:
# now let's merge the data again; the reason for combining all of this is so we can look at each state on multiple levels

In [44]:
stats_pop_dem_merge_df = pd.merge(population_demographic_merge_df,statistics_df,how='inner',on='State')

In [45]:
stats_pop_dem_merge_df.head()


Out[45]:
State total_population percent_white percent_black percent_asian percent_hispanic per_capita_income median_rent median_age value ... HighQ_Covar HighQ_Var HighQ_Mean HighQ_Median LowQ_Std LowQ_Covar LowQ_Median LowQ_Mode LowQ_Mean LowQ_Var
0 alabama 4799277 67 26 1 4 23680 501 38.1 4777326 ... 2.370749 2.370749 339.561849 340.10 2.073186 4.298099 144.98 ([144.98], [216.0]) 145.978508 4.298099
1 alaska 720316 63 3 5 6 32651 978 33.6 711139 ... 31.137567 31.137567 291.482004 289.81 13.188305 173.931381 403.33 ([403.33], [237.0]) 394.653964 173.931381
2 arizona 6479703 57 4 3 30 25358 747 36.3 6410979 ... 4.063247 4.063247 300.667483 300.46 3.961371 15.692461 185.78 ([185.78], [213.0]) 188.500134 15.692461
3 arkansas 2933369 74 15 1 7 22170 480 37.5 2916372 ... 47.141986 47.141986 348.056147 348.21 1.172681 1.375181 126.10 ([126.1], [219.0]) 126.771269 1.375181
4 california 37659181 40 6 13 38 29527 1119 35.4 37325068 ... 2.982686 2.982686 245.376125 245.31 1.598252 2.554409 188.60 ([188.6], [211.0]) 189.783586 2.554409

5 rows × 22 columns


In [46]:
stats_pop_dem_merge_df.shape
stats_pop_dem_merge_df.columns


Out[46]:
Index([u'State', u'total_population', u'percent_white', u'percent_black',
       u'percent_asian', u'percent_hispanic', u'per_capita_income',
       u'median_rent', u'median_age', u'value', u'HighQ_Std', u'HighQ_Mode',
       u'HighQ_Covar', u'HighQ_Var', u'HighQ_Mean', u'HighQ_Median',
       u'LowQ_Std', u'LowQ_Covar', u'LowQ_Median', u'LowQ_Mode', u'LowQ_Mean',
       u'LowQ_Var'],
      dtype='object')

Generating insights using merged data


In [47]:
# so now we have one row per state: 51 rows with 21 feature columns besides State
# the data is ready

In [48]:
white_greater_black = stats_pop_dem_merge_df[stats_pop_dem_merge_df.percent_white > stats_pop_dem_merge_df.percent_black]
black_greater_white = stats_pop_dem_merge_df[stats_pop_dem_merge_df.percent_white < stats_pop_dem_merge_df.percent_black]

In [49]:
white_greater_black.shape,black_greater_white.shape


Out[49]:
((50, 22), (1, 22))

In [50]:
# given that the white population share is higher in almost every state, let's not do any comparison
# based on the percentages of whites and blacks

In [51]:
#let us see if there is a correlation between per capita income and highQ_mean/lowQ_mean
correlation_btw_per_capita_highQ = stats.pearsonr(white_greater_black.per_capita_income,white_greater_black.HighQ_Mean)[0]
correlation_btw_per_capita_lowQ = stats.pearsonr(white_greater_black.per_capita_income,white_greater_black.LowQ_Mean)[0]

In [52]:
correlation_btw_per_capita_highQ,correlation_btw_per_capita_lowQ


Out[52]:
(0.10372726217632158, 0.34288507101733851)
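
stats.pearsonr also returns a two-tailed p-value alongside r, which helps judge whether a correlation like the LowQ one is distinguishable from chance; a quick sketch:

r, p = stats.pearsonr(white_greater_black.per_capita_income, white_greater_black.LowQ_Mean)
print r, p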

In [53]:
# using the population data, let us see whether total population correlates with the mean high-quality price
population_correlation = stats.pearsonr(stats_pop_dem_merge_df.total_population,stats_pop_dem_merge_df.HighQ_Mean)[0]
population_correlation

# we see a mild inverse correlation, which suggests that the larger the population, the lower the price tends to be


Out[53]:
-0.17012234340734669

In [54]:
# let us see if there is a big difference in the price of high quality
# weed between a state selling it legally and one that doesn't
legal_states = ['colorado']
illegal_states = ['wyoming']

legal_states_mean = stats_pop_dem_merge_df[stats_pop_dem_merge_df['State'].isin(legal_states)]['HighQ_Mean']
illegal_states_mean = stats_pop_dem_merge_df[stats_pop_dem_merge_df['State'].isin(illegal_states)]['HighQ_Mean']
legal_states_mean = legal_states_mean.reset_index()
illegal_states_mean = illegal_states_mean.reset_index()
print legal_states_mean.head()
print illegal_states_mean.head()

# we see that the legal state (Colorado) sells high quality marijuana at a lower price than the illegal one (Wyoming)


   index  HighQ_Mean
0      5  238.918708
   index  HighQ_Mean
0     50   339.02078
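
With only one state on each side this is a single-point comparison; a more systematic check would compare a group of legal states against all the others with a two-sample t-test. A sketch (the legal list here is purely illustrative):

legal_list = ['colorado', 'washington']  # illustrative grouping, not a verified list of legal states
legal_prices = stats_pop_dem_merge_df[stats_pop_dem_merge_df['State'].isin(legal_list)]['HighQ_Mean']
other_prices = stats_pop_dem_merge_df[~stats_pop_dem_merge_df['State'].isin(legal_list)]['HighQ_Mean']
print stats.ttest_ind(legal_prices, other_prices, equal_var=False)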

In [55]:
# from the earlier correlations we see that the price of low quality weed is somewhat more related to per capita income
# than the price of high quality weed
# the same approach can be used to compute correlations amongst other attributes too

In [56]:
# before we go any further, let's look for interesting patterns in how weed prices move over time
csv_df_sort_fillna_ffil.head()


Out[56]:
State HighQ HighQN MedQ MedQN LowQ LowQN date
20094 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-27
20859 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-28
21573 Alabama 339.75 1036 198.26 929 149.49 123 2013-12-29
22287 Alabama 339.75 1036 198.81 930 149.49 123 2013-12-30
22797 Alabama 339.42 1040 198.68 932 149.49 123 2013-12-31

In [57]:
country_mean_highQ = csv_df_sort_fillna_ffil.groupby(['date'],as_index=False).aggregate({'HighQ':np.mean})

In [58]:
country_mean_highQ.head()


Out[58]:
date HighQ
0 2013-12-27 335.498039
1 2013-12-28 335.494902
2 2013-12-29 335.582941
3 2013-12-30 335.490196
4 2013-12-31 335.505098

In [59]:
country_mean_highQ.min()


Out[59]:
date     2013-12-27 00:00:00
HighQ               322.1294
dtype: object

In [60]:
# we only have dates; we need to generate the calendar week for each date

In [61]:
from datetime import datetime
# date format in %Y-%m-%d
def get_calendar_week(date):
    calendar_week = date.isocalendar()[1]
    return int(calendar_week)
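
Note that isocalendar() follows the ISO convention, so the last days of December can fall into week 1 of the following ISO year; 2013-12-30, for example, comes back as week 1 (which is why Calendar_Week jumps from 52 to 1 at the end of December below). A quick check:

print get_calendar_week(datetime.strptime("2013-12-30", "%Y-%m-%d"))   # -> 1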

In [62]:
# let's add the calendar week to the data frame
csv_df_sort_fillna_ffil['Calendar_Week'] = csv_df_sort_fillna_ffil['date'].apply(get_calendar_week)

In [63]:
# let's check that it looks fine
csv_df_sort_fillna_ffil.head()


Out[63]:
State HighQ HighQN MedQ MedQN LowQ LowQN date Calendar_Week
20094 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-27 52
20859 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-28 52
21573 Alabama 339.75 1036 198.26 929 149.49 123 2013-12-29 52
22287 Alabama 339.75 1036 198.81 930 149.49 123 2013-12-30 1
22797 Alabama 339.42 1040 198.68 932 149.49 123 2013-12-31 1

In [64]:
# now let us group at the calendar-week level and see during which week the country-wide mean price was at its lowest and highest
country_week_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_week_highQ_df.columns = country_week_highQ_df.columns.droplevel(0)
country_week_highQ_df = country_week_highQ_df.rename(columns={'':'Calendar_Week'})
country_week_highQ_df.head()
print country_week_highQ_df.min(),country_week_highQ_df.max()


Calendar_Week      1.000000
HighQ_Mean       326.614174
dtype: float64 Calendar_Week     52.000000
HighQ_Mean       333.778207
dtype: float64

In [65]:
# now let us check on which day it was the lowest across the country
country_day_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['date'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_day_highQ_df.columns = country_day_highQ_df.columns.droplevel(0)
country_day_highQ_df = country_day_highQ_df.rename(columns={'':'date'})
country_day_highQ_df.head()
print country_day_highQ_df.min(),country_day_highQ_df.max()


date          2013-12-27 00:00:00
HighQ_Mean               322.1294
dtype: object date          2015-06-11 00:00:00
HighQ_Mean               335.5829
dtype: object
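
Note that .min()/.max() on these frames report each column's own extreme independently; to find the actual day (or calendar week) on which the mean price was lowest, idxmin on the price column is more direct. A sketch:

cheapest_day = country_day_highQ_df.loc[country_day_highQ_df['HighQ_Mean'].idxmin()]
print cheapest_day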

In [66]:
# now let us check when the most purchases are reported across the country (using HighQN, the number of reported prices, as a proxy for volume)
country_week_highQN_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week'],as_index=False).aggregate({'HighQN':{'HighQN_Mean':np.mean}}))
country_week_highQN_df.columns = country_week_highQN_df.columns.droplevel(0)
country_week_highQN_df = country_week_highQN_df.rename(columns={'':'Calendar_Week'})
country_week_highQN_df.head()
print country_week_highQN_df.min(),country_week_highQN_df.max()


Calendar_Week       1.000000
HighQN_Mean      1951.098039
dtype: float64 Calendar_Week      52.000000
HighQN_Mean      2522.322129
dtype: float64

In [67]:
# The statements above work across the country by date and by calendar week of the year
# Now let us look at what happens within a month by generating a week-of-month for each date, and then see during which
# part of the month the price is at its highest/lowest

In [68]:
def get_calendar_week_month(date):
    calendar_week_year = get_calendar_week(date)
    month = date.month
    year = date.year
    day = date.day
    start_of_month = str(year)+"-"+str(month)+"-"+"01"
    calendar_week_start_of_month = date.isocalendar()[1]
    # this happens only for dec
    if(calendar_week_year > calendar_week_start_of_month):
        return calendar_week_year - calendar_week_start_of_month + 1
    else:
        return calendar_week_year
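
Note that, as written, calendar_week_start_of_month is computed from date itself rather than from start_of_month, so the new column ends up identical to Calendar_Week (visible in the head() output below). A minimal sketch of a variant that derives the month-start week, leaving aside the late-December wrap-around case:

def get_calendar_week_month_v2(date):
    # hypothetical variant, not used below: week-of-month as the ISO-week offset from the 1st of the month
    first_of_month = datetime(date.year, date.month, 1)
    return date.isocalendar()[1] - first_of_month.isocalendar()[1] + 1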

In [69]:
# quick check of get_calendar_week_month on a year-end date
get_calendar_week_month(datetime.strptime("2013-12-30","%Y-%m-%d"))


Out[69]:
1

In [70]:
csv_df_sort_fillna_ffil['Calendar_Week_Month'] = csv_df_sort_fillna_ffil['date'].apply(get_calendar_week_month)

In [71]:
csv_df_sort_fillna_ffil.head()


Out[71]:
State HighQ HighQN MedQ MedQN LowQ LowQN date Calendar_Week Calendar_Week_Month
20094 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-27 52 52
20859 Alabama 339.65 1033 198.04 926 147.15 122 2013-12-28 52 52
21573 Alabama 339.75 1036 198.26 929 149.49 123 2013-12-29 52 52
22287 Alabama 339.75 1036 198.81 930 149.49 123 2013-12-30 1 1
22797 Alabama 339.42 1040 198.68 932 149.49 123 2013-12-31 1 1

In [72]:
# now let us group by week-of-month and see during which part of the month the mean price was lowest and highest
country_month_week_highQ_df = pd.DataFrame(csv_df_sort_fillna_ffil.groupby(['Calendar_Week_Month'],as_index=False).aggregate({'HighQ':{'HighQ_Mean':np.mean}}))
country_month_week_highQ_df.columns = country_month_week_highQ_df.columns.droplevel(0)
country_month_week_highQ_df = country_month_week_highQ_df.rename(columns={'':'Calendar_Week_Month'})
country_month_week_highQ_df.head()
print  country_month_week_highQ_df.min()
print country_month_week_highQ_df.max()


Calendar_Week_Month      1.000000
HighQ_Mean             326.614174
dtype: float64
Calendar_Week_Month     52.000000
HighQ_Mean             333.778207
dtype: float64

In [73]:
csv_df_sort_fillna_ffil.columns


Out[73]:
Index([u'State', u'HighQ', u'HighQN', u'MedQ', u'MedQN', u'LowQ', u'LowQN',
       u'date', u'Calendar_Week', u'Calendar_Week_Month'],
      dtype='object')

Using linear regression to predict values


In [74]:
# now that we have some insights into the data, let us take the next step and use a linear regression model to predict the
# price of marijuana for the state of Alabama
columns_required = ['State','HighQ','date']
alabama_state = csv_df_sort_fillna_ffil[csv_df_sort_fillna_ffil['State'] == "Alabama"]
alabama_state_price_date = alabama_state.ix[:,columns_required]
alabama_state_price_date.dtypes


Out[74]:
State            object
HighQ           float64
date     datetime64[ns]
dtype: object

In [75]:
train,test = train_test_split(alabama_state_price_date,test_size = 0.2)
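
Note that train_test_split shuffles the rows at random; since this is a time series, an alternative is to hold out the most recent dates so the model is scored on genuinely unseen future prices. A sketch (not used below):

alabama_sorted = alabama_state_price_date.sort(columns=['date'])
cutoff = int(len(alabama_sorted) * 0.8)
train_ts, test_ts = alabama_sorted[:cutoff], alabama_sorted[cutoff:]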

In [76]:
def changeDateToOrdinal(date):
    return date.toordinal()

In [77]:
train_pd = pd.DataFrame(train,columns=['State','Price','Date'])
train_pd['State'] = 1
train_pd['Price'] = train_pd['Price'].astype(float)
train_pd['Date_ord'] = train_pd['Date'].apply(changeDateToOrdinal)

train_pd.head()


Out[77]:
State Price Date Date_ord
0 1 338.61 2014-10-26 00:00:00 735532
1 1 339.96 2014-01-19 00:00:00 735252
2 1 340.32 2014-08-08 00:00:00 735453
3 1 339.13 2014-11-16 00:00:00 735553
4 1 341.17 2014-03-24 00:00:00 735316

In [78]:
train_pd.head()


Out[78]:
State Price Date Date_ord
0 1 338.61 2014-10-26 00:00:00 735532
1 1 339.96 2014-01-19 00:00:00 735252
2 1 340.32 2014-08-08 00:00:00 735453
3 1 339.13 2014-11-16 00:00:00 735553
4 1 341.17 2014-03-24 00:00:00 735316

In [79]:
test_pd = pd.DataFrame(test,columns=['State','Price','Date'])
test_pd['State'] = 1
test_pd['Price'] = test_pd['Price'].astype(float)
test_pd['Date_ord'] = test_pd['Date'].apply(changeDateToOrdinal)

In [80]:
print train_pd.head()
train_pd.dtypes

print test_pd.head()
test_pd.dtypes


   State   Price                 Date  Date_ord
0      1  338.61  2014-10-26 00:00:00    735532
1      1  339.96  2014-01-19 00:00:00    735252
2      1  340.32  2014-08-08 00:00:00    735453
3      1  339.13  2014-11-16 00:00:00    735553
4      1  341.17  2014-03-24 00:00:00    735316
   State   Price                 Date  Date_ord
0      1  340.63  2014-06-04 00:00:00    735388
1      1  340.52  2014-06-01 00:00:00    735385
2      1  337.32  2015-06-10 00:00:00    735759
3      1  339.92  2014-02-14 00:00:00    735278
4      1  340.67  2014-09-08 00:00:00    735484
Out[80]:
State         int64
Price       float64
Date         object
Date_ord      int64
dtype: object

In [81]:
x_train = train_pd.ix[:,['State','Date_ord']]
y_train = train_pd.ix[:,['Price']]
print x_train.dtypes,y_train.dtypes
x_test = test_pd.ix[:,['State','Date_ord']]
y_test = test_pd.ix[:,['Price']]
x_train.head()


State       int64
Date_ord    int64
dtype: object Price    float64
dtype: object
Out[81]:
State Date_ord
0 1 735532
1 1 735252
2 1 735453
3 1 735553
4 1 735316

In [82]:
y_train.head()
y_train.dtypes


Out[82]:
Price    float64
dtype: object

In [83]:
print x_train.head()
x_train.dtypes


   State  Date_ord
0      1    735532
1      1    735252
2      1    735453
3      1    735553
4      1    735316
Out[83]:
State       int64
Date_ord    int64
dtype: object

In [84]:
ols = linear_model.LinearRegression(normalize=True, fit_intercept=True)
ols.fit(x_train, y_train, n_jobs=-1)
 
print ols.coef_
ols_predict = ols.predict(x_test)
 
#print ols_predict
# attach the predicted prices to the test frame so we can compare them with the actual prices
 
test_pd['Predicted_Price'] = ols_predict


[[ 0.        -0.0078273]]
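
The Date_ord coefficient of about -0.0078 means the fitted price falls by roughly 0.78 cents per day, i.e. about 0.0078273 * 365 ≈ 2.86 dollars per year for Alabama; the State column is constant (all 1s), so its coefficient is zero.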

In [85]:
test_pd.head()


Out[85]:
State Price Date Date_ord Predicted_Price
0 1 340.63 2014-06-04 00:00:00 735388 340.103428
1 1 340.52 2014-06-01 00:00:00 735385 340.126909
2 1 337.32 2015-06-10 00:00:00 735759 337.199499
3 1 339.92 2014-02-14 00:00:00 735278 340.964431
4 1 340.67 2014-09-08 00:00:00 735484 339.352007
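
A quick way to judge the fit is to score the predictions on the held-out rows, for example with mean squared error and R^2; a sketch, assuming sklearn.metrics is available in this environment:

from sklearn.metrics import mean_squared_error, r2_score
print mean_squared_error(test_pd['Price'], test_pd['Predicted_Price'])
print r2_score(test_pd['Price'], test_pd['Predicted_Price'])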

In [ ]: