In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import re
In [2]:
def clean(s):
    """Normalize free text to lowercase word tokens joined by single spaces.

    Parameters
    ----------
    s : str
        Raw text. Non-string values (for example a float NaN standing in for
        a missing essay) are tolerated.

    Returns
    -------
    str
        The ``\\w+`` tokens of ``s`` joined by one space and lowercased, or the
        placeholder ``"n"`` when ``s`` is not a string.
    """
    try:
        # re.LOCALE must not be combined with re.UNICODE: on Python 3 it raises
        # ValueError for str input, which the old bare `except` then re-raised
        # from the fallback branch.
        tokens = re.findall(r'\w+', s, flags=re.UNICODE)
    except TypeError:
        # Only "not a string" is treated as missing text; any other error
        # should surface instead of being silently replaced.
        return "n"
    return " ".join(tokens).lower()
def length(s):
    """Return the length of ``s`` bucketed into units of ten.

    Floor division keeps the result an integer bucket on both Python 2 and
    Python 3 (plain ``/`` yields a float such as 2.5 on Python 3).

    Parameters
    ----------
    s : sized object (here: the cleaned essay string)

    Returns
    -------
    int
        ``len(s) // 10``
    """
    return len(s) // 10
In [3]:
# Load the raw tables, each ordered by projectid.
# `DataFrame.sort` no longer exists in current pandas; `sort_values` is the
# replacement and produces the same ordering.
# The ordering of `projects` matters: the resource-indicator cell below uses
# LabelEncoder codes of projectid as row positions, which only lines up when
# `projects` is sorted by projectid (assumes projectid is unique per row --
# TODO confirm).
#donations = pd.read_csv('../data/donations.csv').sort_values('projectid')
projects = pd.read_csv('../data/projects.csv').sort_values('projectid')
outcomes = pd.read_csv('../data/outcomes.csv').sort_values('projectid')
resources = pd.read_csv('../data/resources.csv').sort_values('projectid')
sample = pd.read_csv('../data/sampleSubmission.csv').sort_values('projectid')
essays = pd.read_csv('../data/essays.csv').sort_values('projectid')
In [4]:
# Build one indicator column per resource type for every project:
# proj_type[r, c] == 1 when project r has at least one resource of type c.
resource_cols = ['resource_a', 'resource_b', 'resource_c', 'resource_d',
                 'resource_e', 'resource_f', 'resource_g']

le1 = LabelEncoder()
le2 = LabelEncoder()
le1.fit(projects.projectid)
le2.fit(resources.project_resource_type)

# The matrix width comes from the column-name list; fail with a readable
# message (instead of an IndexError) if the data has more types than names.
assert len(le2.classes_) <= len(resource_cols), \
    "more resource types than indicator columns"

# Encoded ids are used as positions: row = project code, column = type code.
# Kept in local arrays so `resources` itself is not modified.
project_rows = le1.transform(resources.projectid)
type_columns = le2.transform(resources.project_resource_type)

proj_type = np.zeros((projects.shape[0], len(resource_cols)))
proj_type[project_rows, type_columns] = 1

# Create the frame from the numeric matrix and add projectid afterwards.
# Stacking the string ids together with the matrix would turn every
# indicator column into object dtype.
temp_df = pd.DataFrame(proj_type, columns=resource_cols)
temp_df.insert(0, 'projectid', projects.projectid.values)

# Free the large intermediates; only temp_df is needed from here on.
del project_rows, type_columns
del proj_type
del resources
In [5]:
# Attach labels and resource indicators to the project table. No join key is
# passed, so pandas joins on the columns the two frames have in common.
projects = projects.merge(outcomes, how='inner').merge(temp_df, how='inner')
del temp_df

# Same for the essay text; the essay frame is dropped once merged.
projects = projects.merge(essays, how='inner')
del essays

# Date-based split on the posting date (compared as strings):
#   train  : 2010-01-01 <= date_posted < 2013-05-01
#   holdout: date_posted >= 2013-05-01
dates = np.array(projects.date_posted)
in_train_window = (dates >= '2010-01-01') & (dates < '2013-05-01')
in_holdout_window = dates >= '2013-05-01'
train_idx = np.where(in_train_window)[0]
test_idx = np.where(in_holdout_window)[0]

# From here on `outcomes` is the label array aligned with the merged rows.
outcomes = np.array(projects.is_exciting)
In [6]:
# --- Essay features ---------------------------------------------------------
# Normalize the essay text, then bucket its length (see clean / length above).
projects.essay = projects.essay.apply(clean)
essay_length = projects.essay.apply(length)

# --- Missing values ---------------------------------------------------------
# A missing secondary focus falls back to the primary one; everything else is
# forward-filled from the previous row (`ffill()` is the current spelling of
# `fillna(method='pad')`).
projects.secondary_focus_area = projects.secondary_focus_area.fillna(projects.primary_focus_area)
projects.secondary_focus_subject = projects.secondary_focus_subject.fillna(projects.primary_focus_subject)
projects = projects.ffill()
projects['essay_length'] = essay_length

# --- Derived categorical features --------------------------------------------
# All three are computed column-wise. The previous per-row loop with chained
# assignment (`projects.total_price[i] = ...`) is extremely slow and is not
# guaranteed to write into the frame on current pandas.

# Characters 5..6 of date_posted (the month in a YYYY-MM-DD string).
projects['month'] = projects.date_posted.str[5:7]

# Price bucket. np.select picks the first matching condition, so each test
# only needs its upper bound:
#   <250 -> 0, <400 -> 1, <600 -> 2, <10000 -> 3, <100000 -> 4, otherwise 5
# (NaN matches no condition and therefore lands in 5, like the old else-branch).
price = projects.total_price_excluding_optional_support.values
projects['total_price'] = np.select(
    [price < 250, price < 400, price < 600, price < 10000, price < 100000],
    [0, 1, 2, 3, 4],
    default=5,
)

# Students-reached bucket:
#   0 -> 0, below 100 -> (n // 5) + 1, up to 500 -> 100, above 500 -> 1000
# `//` keeps the bucket an integer regardless of Python version.
reached = projects.students_reached.astype(int).values
projects['student'] = np.select(
    [reached == 0, reached < 100, reached <= 500],
    [0, reached // 5 + 1, 100],
    default=1000,
)
In [7]:
# Feature columns handed to the models, grouped by origin. The concatenation
# order below defines the column order of the resulting matrix.
school_cols = [
    'school_city', 'school_state', 'school_zip', 'school_metro',
    'school_district', 'school_county', 'school_charter', 'school_magnet',
    'school_year_round', 'school_nlns', 'school_kipp',
    'school_charter_ready_promise',
]
teacher_cols = [
    'teacher_prefix', 'teacher_teach_for_america', 'teacher_ny_teaching_fellow',
]
project_cols = [
    'primary_focus_subject', 'primary_focus_area',
    'secondary_focus_subject', 'secondary_focus_area',
    'resource_type', 'poverty_level', 'grade_level', 'students_reached',
    'eligible_double_your_impact_match', 'eligible_almost_home_match',
]
engineered_cols = ['month', 'total_price', 'student', 'essay_length']
resource_flag_cols = [
    'resource_a', 'resource_b', 'resource_c', 'resource_d',
    'resource_e', 'resource_f', 'resource_g',
]

cols = school_cols + teacher_cols + project_cols + engineered_cols + resource_flag_cols

# From here on `projects` is a plain 2-D array holding only the selected columns.
projects = np.array(projects[cols])
In [8]:
# Replace the values of every column by integer codes, one freshly fitted
# encoder per column, then convert the whole matrix to float.
n_columns = projects.shape[1]
for col in range(n_columns):
    le = LabelEncoder()
    encoded = le.fit_transform(projects[:, col])
    projects[:, col] = encoded
projects = projects.astype(float)
In [9]:
# Split the label-encoded matrix and the labels with the date-based indices.
train, crossval = projects[train_idx], projects[test_idx]
outcomeTrain, outcomeCrossVal = outcomes[train_idx], outcomes[test_idx]

# Random forest on the label-encoded features; the target is True where the
# label equals 't'.
# random_state fixes the forest's random sampling so that a fresh
# Restart & Run All reproduces preds3 (and therefore the blended score).
model = RandomForestClassifier(n_estimators=100, criterion='entropy',
                               n_jobs=-1, random_state=0)
model.fit(train, outcomeTrain == 't')

# Column 1 of predict_proba is the probability of the True class.
preds3 = model.predict_proba(crossval)[:, 1]
In [10]:
# One-hot encode the label-encoded matrix for the linear models that follow.
# `projects` is rebound to the encoder's output.
ohe = OneHotEncoder()
ohe.fit(projects)
projects = ohe.transform(projects)
In [11]:
# Redo the train / hold-out split, this time on the one-hot encoded matrix.
train = projects[train_idx]
crossval = projects[test_idx]
outcomeTrain = outcomes[train_idx]
outcomeCrossVal = outcomes[test_idx]
In [12]:
# Logistic regression on the one-hot features; target is True where the label
# equals 't'. Column 1 of predict_proba is the probability of the True class.
model = LogisticRegression(C=.1)
model.fit(train, outcomeTrain == 't')
preds1 = model.predict_proba(crossval)[:, 1]

# Linear model trained with SGD (modified Huber loss, L2 penalty).
# random_state fixes the data shuffling so preds2 is reproducible across runs.
# NOTE(review): `n_iter` is kept because the notebook's
# `sklearn.cross_validation` import ties it to an older scikit-learn; newer
# releases spell this `max_iter` -- switch both together when upgrading.
model = SGDClassifier(alpha=0.001, loss='modified_huber', penalty='l2',
                      n_iter=1000, n_jobs=-1, random_state=0)
model.fit(train, outcomeTrain == 't')
preds2 = model.predict_proba(crossval)[:, 1]
In [17]:
# Weighted blend of the three models' hold-out probabilities:
# preds1 (logistic regression), preds2 (SGD), preds3 (random forest).
weight_lr, weight_sgd, weight_rf = 0.4, 0.3, 0.3
preds = weight_lr*preds1 + weight_sgd*preds2 + weight_rf*preds3
In [18]:
# ROC AUC of the blended predictions on the hold-out rows; the bare last
# expression makes the notebook display the score.
holdout_is_exciting = outcomeCrossVal == 't'
value = roc_auc_score(holdout_is_exciting, preds)
value
Out[18]:
In [ ]: