In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from datetime import datetime

In [2]:
startTime = datetime.now()

In [3]:
#donations = pd.read_csv('../data/donations.csv').sort('projectid')
projects = pd.read_csv('../data/projects.csv').sort('projectid')
outcomes = pd.read_csv('../data/outcomes.csv').sort('projectid')
#resources = pd.read_csv('../data/resources.csv').sort('projectid')
sample = pd.read_csv('../data/sampleSubmission.csv').sort('projectid')
#essays = pd.read_csv('../data/essays.csv').sort('projectid')

In [4]:
dates = np.array(projects.date_posted)
train_idx = np.where(dates < '2014-01-01')[0]
test_idx = np.where(dates >= '2014-01-01')[0]

In [5]:
projects = projects.fillna(method='pad')

In [6]:
outcomes = np.array(outcomes.is_exciting)

In [7]:
projectCatogorialColumns = ['school_city', 'school_state', 'school_zip', 'school_metro', 'school_district', 'school_county', 'school_charter', 'school_magnet',
 'school_year_round', 'school_nlns', 'school_kipp', 'school_charter_ready_promise', 'teacher_prefix', 'teacher_teach_for_america', 'teacher_ny_teaching_fellow', 'primary_focus_subject','primary_focus_area', 
'secondary_focus_subject', 'secondary_focus_area', 'resource_type', 'poverty_level', 'grade_level',
'students_reached', 'eligible_double_your_impact_match', 'eligible_almost_home_match' ]

In [8]:
projects = np.array(projects[projectCatogorialColumns])

In [9]:
for i in range(0, projects.shape[1]):
    le = LabelEncoder()
    projects[:,i] = le.fit_transform(projects[:,i])
projects = projects.astype(float)

In [10]:
train = projects[train_idx]
test = projects[test_idx]
del projects

In [11]:
model = RandomForestClassifier(criterion = 'entropy' , n_estimators = 100, n_jobs = -1)
model.fit(train, outcomes=='t')


Out[11]:
RandomForestClassifier(bootstrap=True, compute_importances=None,
            criterion='entropy', max_depth=None, max_features='auto',
            min_density=None, min_samples_leaf=1, min_samples_split=2,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0)

In [12]:
preds = model.predict_proba(test)[:,1]

In [13]:
endTime = datetime.now()

In [14]:
sample['is_exciting'] = preds
sample.to_csv('predictions.csv', index = False)

In [15]:
print endTime - startTime


0:04:03.186728