In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn import cross_validation
from sklearn.metrics import roc_auc_score
import re

In [2]:
# Load the CSV tables used below. Only `projects` and `resources` are read;
# the remaining loaders are kept commented out.
# NOTE: DataFrame.sort() was removed from pandas (deprecated 0.17, removed 0.20);
# sort_values() is the replacement and orders rows by the given column.
#donations = pd.read_csv('../data/donations.csv').sort_values('projectid')
projects = pd.read_csv('../data/projects.csv').sort_values('projectid')
#outcomes = pd.read_csv('../data/outcomes.csv').sort_values('projectid')
resources = pd.read_csv('../data/resources.csv')
#sample = pd.read_csv('../data/sampleSubmission.csv').sort_values('projectid')
#essays = pd.read_csv('../data/essays.csv').sort_values('projectid')

In [3]:
# Distinct values of the `project_resource_type` column in `resources`.
s = resources["project_resource_type"].unique()

In [4]:
# Number of distinct resource types collected in `s`.
len(s)


Out[4]:
7

In [5]:
# Zero-filled matrix: one row per row of `projects`, one column per distinct
# resource type. The column count is taken from `s` (the unique resource
# types) rather than a hard-coded 7, so it stays in sync with the data.
arr = np.zeros((len(projects), s.size))

In [6]:
# Sanity check: (rows, columns) of the freshly allocated matrix.
np.shape(arr)


Out[6]:
(664098, 7)

In [7]:
# Unfitted label encoder; it is fitted on projects.projectid in the next cell.
le = LabelEncoder()

In [8]:
# Fit the encoder on the project ids; fit() returns the encoder itself,
# which is what the cell displays.
le.fit(projects["projectid"])


Out[8]:
LabelEncoder()

In [ ]: