In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [22]:
# Load the daily bike-sharing table; the path is relative to the notebook's
# working directory.
day = pd.read_csv(filepath_or_buffer="data/day.csv")
In [2]:
# Columns removed before modelling: the date/row identifiers, `yr`, and the
# count columns. `cnt` is used as the regression target further down, so it
# must not stay among the features.
columns_to_drop = ["dteday", "instant", "casual", "registered", "cnt", "yr"]
data = day.drop(columns_to_drop, axis=1)
In [3]:
# Inspect which columns remain after the drop.
data.columns
Out[3]:
In [4]:
# Snapshot of the features with their original integer codes, taken before the
# categorical columns are relabelled in the next cell.
data_raw = data.copy(deep=True)
In [5]:
# Replace the integer category codes with readable labels so that the one-hot
# columns created by get_dummies get self-describing names.
# NOTE: this cell is not idempotent -- running it twice maps the already
# relabelled strings and yields NaN. Re-create `data` before re-running.
data["season"] = data["season"].map({1: "spring", 2: "summer", 3: "fall", 4: "winter"})
data["weathersit"] = data["weathersit"].map({
    1: "clear, partly cloudy",
    2: "mist, cloudy",
    3: "light snow, light rain",
    4: "heavy rain, snow and fog",
})
# Month numbers parse to the first day of that month, so "%b" yields the
# matching abbreviated month name.
data["mnth"] = pd.to_datetime(data["mnth"], format="%m").dt.strftime("%b")
# A weekday number cannot be parsed with format="%w": on its own it carries no
# date information, so every value parses to the default date 1900-01-01 (a
# Monday) and all rows end up labelled "Mon". Map the codes explicitly instead,
# following the %w convention the original code assumed (0 = Sunday).
data["weekday"] = data["weekday"].map(
    {0: "Sun", 1: "Mon", 2: "Tue", 3: "Wed", 4: "Thu", 5: "Fri", 6: "Sat"}
)
In [6]:
# One-hot encode the relabelled categorical columns; all other columns are
# passed through unchanged.
categorical_columns = ["season", "mnth", "weekday", "weathersit"]
data_dummies = pd.get_dummies(data, columns=categorical_columns)
In [20]:
# Preview the first rows of the one-hot encoded frame.
data_dummies.head()
Out[20]:
In [8]:
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Features are the integer-coded columns in `data_raw`; the target is the `cnt`
# column of the original table. random_state fixes the shuffle so the split is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_raw.values, day.cnt.values, random_state=0
)
In [9]:
from sklearn.linear_model import RidgeCV

# Ridge regression with its built-in cross-validated choice of the
# regularisation strength, fitted on the training split.
ridge = RidgeCV()
ridge = ridge.fit(X_train, y_train)
In [10]:
from sklearn.metrics import r2_score
In [11]:
# Score of the fitted ridge model on the training split (R^2 for scikit-learn regressors).
ridge.score(X_train, y_train)
Out[11]:
In [12]:
# Same score on the held-out split, for comparison with the training value.
ridge.score(X_test, y_test)
Out[12]:
In [13]:
from sklearn.tree import DecisionTreeRegressor
In [14]:
# Depth-limited regression tree; report the score on the training split first,
# then on the held-out split.
tree = DecisionTreeRegressor(max_depth=5)
tree = tree.fit(X_train, y_train)
for split_features, split_target in ((X_train, y_train), (X_test, y_test)):
    print(tree.score(split_features, split_target))
In [15]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic (bootstrap samples, random feature choice).
# Fixing random_state makes the printed scores reproducible across runs,
# matching the fixed seed already used for the train/test split.
forest = RandomForestRegressor(n_estimators=500, random_state=0).fit(X_train, y_train)
# Training score first, then the held-out score.
print(forest.score(X_train, y_train))
print(forest.score(X_test, y_test))
In [19]:
# Export both feature tables together with the target column `cnt`.
#
# `cnt` was dropped from these frames earlier, so `frame.cnt = ...` would only
# set a Python attribute on the object: pandas does not create new columns via
# attribute assignment, and the CSVs would be written without the target.
# `assign` returns a new frame with the column added, which also keeps
# `data_raw` / `data_dummies` free of the target if a modelling cell is
# re-run after this one.
data_raw.assign(cnt=day["cnt"]).to_csv("data/bike_day_raw.csv", index=False)
data_dummies.assign(cnt=day["cnt"]).to_csv("data/bike_day_dummies.csv", index=False)
In [6]:
# Load the loan table and keep every 23rd row (rows 0, 23, 46, ...) as a
# deterministic subsample. Note that this rebinds `data`, which held the bike
# features until now.
data = pd.read_csv("data/loan.csv").iloc[::23]
In [7]:
# (rows, columns) of the subsampled loan table.
data.shape
Out[7]:
In [8]:
# Preview the first rows of the subsampled loan table.
data.head()
Out[8]:
In [9]:
# Number of non-null entries per column, most complete columns first.
non_null_counts = data.notnull().sum(axis=0)
counts = non_null_counts.sort_values(ascending=False)
In [10]:
# Labels of the 52 most complete columns (`counts` is sorted descending).
# NOTE(review): the cutoff 52 is not explained in the notebook.
columns = counts.index[:52]
In [11]:
# Restrict the loan table to the selected columns, all rows kept.
data = data.loc[:, columns]
In [12]:
# Discard every row that still has a missing value in any remaining column.
data = data.dropna(axis=0, how="any")
In [13]:
# Preview the rows that survived the column selection and dropna().
data.head()
Out[13]:
In [14]:
# `loan_status` values that are treated as a bad outcome when building the
# binary target. Entries are compared by exact string equality (`isin`), so
# they must not carry stray whitespace: the former "Charged Off " (with a
# trailing space) could never match a "Charged Off" status.
bad_statuses = [
    "Charged Off",
    "Default",
    "Does not meet the credit policy. Status:Charged Off",
    "In Grace Period",
    "Default Receiver",
    "Late (16-30 days)",
    "Late (31-120 days)",
]
In [15]:
# Binary target: True when the loan's status is one of `bad_statuses`.
is_bad = data["loan_status"].isin(bad_statuses)
data["bad_status"] = is_bad
In [16]:
# Remove the columns that are not used as features. `loan_status` has to go
# because the `bad_status` target was derived directly from it.
unused_columns = ["url", "title", "id", "emp_title", "loan_status"]
data = data.drop(unused_columns, axis=1)
In [17]:
# Column labels left after adding `bad_status` and dropping the unused columns.
data.columns
Out[17]:
In [18]:
# Per-column dtypes; the float64 columns are picked out as model features below.
data.dtypes
Out[18]:
In [70]:
# Frequency of each distinct value in the `purpose` column.
data.purpose.value_counts()
Out[70]:
In [37]:
# Labels of the columns whose dtype is exactly float64.
column_dtypes = data.dtypes
is_float64 = column_dtypes == "float64"
float_columns = column_dtypes[is_float64].index
In [38]:
# Feature frame restricted to the float64 columns.
data_float = data.loc[:, float_columns]
In [39]:
# (rows, columns) of the float-only feature frame.
data_float.shape
Out[39]:
In [40]:
# Feature matrix from the float columns and the boolean target, both as
# NumPy arrays.
X, y = data_float.values, data["bad_status"].values
In [68]:
from sklearn.linear_model import LogisticRegression
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Reproducible split of the current X / y, then a default logistic regression
# fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out split.
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))
In [69]:
# Shape of the fitted coefficient array.
lr.coef_.shape
Out[69]:
In [71]:
# Horizontal bar chart of the fitted logistic-regression coefficients, one bar
# per feature.
#
# Labels come from `data_float`, the frame X was built from before the model
# was fitted. `data_float_hard` is only defined in a later cell, so using it
# here raises NameError on a fresh top-to-bottom run, and it has fewer columns
# than this model has coefficients.
coefficients = lr.coef_.ravel()
bar_positions = np.arange(len(coefficients))
plt.figure(figsize=(8, 8))
# Centre the bars on their positions explicitly so the tick labels line up
# with them regardless of matplotlib's default `align`.
plt.barh(bar_positions, coefficients, align="center")
plt.yticks(bar_positions, data_float.columns.tolist(), va="center");
In [72]:
# Reduced feature frame without the two listed columns.
# NOTE(review): presumably these are removed to make the task harder (hence
# "hard") -- the notebook does not say; confirm the intent.
excluded_features = ["total_rec_late_fee", "revol_util"]
data_float_hard = data_float.drop(excluded_features, axis=1)
In [67]:
# Rebind X to the reduced feature set. This only replaces the array: the
# train/test split and `lr` were built from the previous X, so the split/fit
# cell must be re-run for the model to use these features.
X = data_float_hard.values
In [114]:
# Load the shelter training and test tables from their CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/shelter_train.csv", "data/shelter_test.csv")
)
In [115]:
# Preview the first rows of the shelter training table.
train.head()
Out[115]:
In [116]:
# Load the bank-additional table; the file is semicolon-delimited. This rebinds
# `data`, which held the loan table until now.
data = pd.read_csv("data/bank-additional/bank-additional-full.csv", delimiter=";")
In [117]:
# Preview the first rows of the bank-additional table.
data.head()
Out[117]:
In [118]:
# Frequency of each distinct value in the `job` column.
data.job.value_counts()
Out[118]:
In [119]:
# Column labels of the bank table; the target column `y` is still included at this point.
data.columns
Out[119]:
In [120]:
# Per-column dtypes, checked before get_dummies is applied to the whole frame below.
data.dtypes
Out[120]:
In [121]:
# Split off the label column `y`, then one-hot encode what is left.
# NOTE: this cell is not re-runnable as is -- after the first run `data` no
# longer has a `y` column.
target = data["y"]
data = data.drop(["y"], axis=1)
bla = pd.get_dummies(data)
In [122]:
# Column labels after one-hot encoding.
bla.columns
Out[122]:
In [123]:
# Feature matrix from the encoded frame and the label vector, both as NumPy
# arrays.
X, y = bla.values, target.values
In [124]:
from sklearn.linear_model import LogisticRegression
# `sklearn.cross_validation` was removed from scikit-learn; train_test_split
# lives in `sklearn.model_selection`.
from sklearn.model_selection import train_test_split

# Reproducible split of the bank features, then a default logistic regression
# fitted on the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)
# Accuracy on the training split, then on the held-out split.
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))
In [130]:
# Horizontal bar chart of the fitted logistic-regression coefficients, one bar
# per one-hot encoded feature, labelled with the columns of `bla`.
coefficients = lr.coef_.ravel()
bar_positions = np.arange(len(coefficients))
plt.figure(figsize=(10, 12))
# Centre the bars on their positions explicitly and put the ticks on the same
# positions. The previous +0.5 tick offset only lined up with edge-aligned
# bars and is half a bar off when barh centres them.
plt.barh(bar_positions, coefficients, align="center")
plt.yticks(bar_positions, bla.columns.tolist(), va="center");
In [131]:
from sklearn.ensemble import RandomForestClassifier

# A random forest is stochastic (bootstrap samples, random feature choice).
# Fixing random_state makes the reported accuracies reproducible across runs,
# matching the fixed seed already used for the train/test split.
rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
# Accuracy on the training split (shown as the cell output).
rf.score(X_train, y_train)
Out[131]:
In [132]:
# Accuracy of the random forest on the held-out split.
rf.score(X_test, y_test)
Out[132]:
In [135]:
# Attach the label column to the encoded features and write the combined table
# to disk without the row index.
# NOTE: `bla` now contains the label, so rebuilding X from `bla` after this
# cell would leak the target into the features.
bla["target"] = target
bla.to_csv(path_or_buf="data/bank-campaign.csv", index=None)
In [ ]: