notebook.community

Edit and run



In [1]:

    
# coding: utf-8

# # Prelim Analysis

import numpy as np
from sklearn import preprocessing
from sklearn.metrics import roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
#import the data and rename the columns


d = pd.read_csv('file.csv', encoding='ISO-8859-1',low_memory=False)#,  usecols=[0, 1, 2, 3, 8, 11, 13, 14, 26, 29, 35, 37, 84, 100, 103])
 
d = d.rename(columns={'eventid':'id', 'iyear':'year', 'imonth':'month', 'iday':'day', 'country_txt':'country', 'provstate':'state', 'success':'success','targtype1_txt':'target', 'targsubtype1_txt' : 'targetsub','weaptype1_txt':'weapon', 'attacktype1_txt':'attack','nkill':'fatalities', 'nwound':'injuries'})


d = d.drop(['id'],axis=1)

df_num = d.select_dtypes(include=[np.number])
df_inf = df_num.replace([np.inf, -np.inf], np.nan)
df_inf.replace([np.inf, -np.inf], np.nan)
df_filled = df_inf.fillna(0)

df_filled = df_filled.drop(['success'],axis=1)


# In[70]:

df_filled.corr().abs()



from numpy import float32
#df_filled.head()
df_transformed = df_filled.astype(float32)
#df_transformed.info()









    



//anaconda/lib/python3.6/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)



In [2]:

    
lm = LinearRegression()
y = d['success']
X = df_filled[['month', 'day','region','property','propextent','attacktype1','weaptype1','nperps','specificity' ]]
X_train, X_test,y_train, y_test = train_test_split(X,y,random_state=2)
#print(X_train.head())
lm.fit(X_train, y_train)
r = lm.score(X_train, y_train)
print (r)
pred_train = lm.predict(X_train)
pred_test = lm.predict(X_test)
print(lm.coef_)

mean_squared_error(pred_test,y_test)
plt.scatter(lm.predict(X_train), lm.predict(X_train) - y_train, c='b', s=40, alpha=0.5)
plt.scatter(lm.predict(X_test), lm.predict(X_test) - y_test, c='g', s=40)
#plt.hlines(y = 0, xmin = -400000000, xmax = 1400000000)


plt.title('Residual Plot using training (blue) and test (green) data')
plt.ylabel('Residuals')
plt.show()

import statsmodels.api as sm
fig = sm.qqplot(lm.predict(X_test) - y_test)
plt.show()









    



0.0372317320982
[ -1.73923957e-04  -1.68483111e-05  -1.83871397e-03  -7.29810486e-03
   3.15071149e-02   1.20843515e-02  -4.14460997e-03   5.30998137e-05
   7.99300431e-03]



In [3]:

    
#Random Forest
y_random = df_filled['multiple']
X_random = df_filled[['month', 'day','region','property','propextent','attacktype1','weaptype1','nperps','specificity' ]]
features_train, features_test,target_train, target_test = train_test_split(X_random,y_random, test_size = 0.2,random_state=0)
#Random Forest
forest=RandomForestClassifier(n_estimators=10)
forest = forest.fit( features_train, target_train)
output = forest.predict(features_test).astype(int)
forest.score(features_train, target_train )
print(forest.predict_proba([1,1,1,1,1,1,1,1,1]))









    



[[ 1.  0.]]






    



//anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)



In [4]:

    
from patsy import dmatrices
from sklearn.linear_model import LogisticRegression
#Logistic Regression
y_log, X_log = dmatrices('multiple ~ month + day + region + property + propextent + attacktype1 + weaptype1+ nperps + specificity', df_filled, return_type="dataframe")
y_log = np.ravel(y_log)
# instantiate a logistic regression model, and fit with X and y
model = LogisticRegression()
model = model.fit(X_log, y_log)
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(X_log, y_log, test_size=0.3, random_state=0)
model2 = LogisticRegression()
model2.fit(X_train, y_train)
print(model2.predict_proba([1,1,1,1,1,1,1,1,1]))









    



[[ 0.08274304  0.91725696]]






    



//anaconda/lib/python3.6/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
  DeprecationWarning)



In [19]:

    
def plots(lm,rf,lr):
    import matplotlib.pyplot as plt
    results = []
    results.append(lm)
    results.append(lr)
    results.append(rf)
    N = len(results)
    x = range(N)
    #y=['Linear Regression', 'Logistic Regression', 'Random Forest']
    width = 1/1.5
    plt.bar(x,results,width,color = "green")
    plt.savefig("static/images/fig.jpeg")
    plt.close()



In [20]:

    
plots(1,2,3)



In [ ]:



In [ ]: