In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import statistics
from sklearn.linear_model import LinearRegression
In [1]:
# Load the raw exercise data and integer-encode the categorical columns.
#
# Alternate input file, kept for reference:
# df = pd.read_csv("/Users/sstrano/Downloads/SrSAExercise2018_inputs_with_results.csv")
p_file = "/Users/sstrano/Downloads/SrSAExercise2018_inputs_with_results_1.csv"

# Every column is read as text (dtype='unicode'); numeric parsing happens in a later cell.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 and removed in 2.0
# (replacement: on_bad_lines='skip'). It is kept because this notebook appears to
# target an older Python 2 / pandas stack -- confirm before upgrading.
df = pd.read_csv(p_file, sep=',', error_bad_lines=False, index_col=False, dtype='unicode')

# `d` holds an independent copy of the raw, un-encoded frame. Copying the frame that
# was just parsed gives the same content without reading the file a second time.
d = df.copy()

# Columns whose text labels are replaced by integer category codes.
categorical_cols = [
    'Primary Phone to Name',
    'Primary Phone Is Valid',
    'Primary Phone Line Type',
    'Primary Address Is Valid',
    'Primary Address to Name',
    'Email Is Valid',
    'Email to Name',
    'IP Is Proxy',
]
for col in categorical_cols:
    # .cat.codes maps each distinct label to an integer; missing values become -1.
    df[col] = df[col].astype('category').cat.codes
In [ ]:
# Converting data types: every column still stored as text (object dtype) is parsed
# as numeric in a single pass. Values that cannot be parsed become NaN (errors='coerce').
cols = df.columns[df.dtypes.eq(object)]
df[cols] = df[cols].apply(pd.to_numeric, errors='coerce', axis=0)

# Split on the fraud indicator: `d` keeps rows with fraud_ind != 1, `df` keeps rows
# with fraud_ind != 0.
# NOTE(review): rows whose fraud_ind was coerced to NaN satisfy both `!=` tests and
# therefore land in both frames -- confirm that is intended.
# NOTE(review): this cell rebinds `df`, so running it a second time would derive `d`
# from the already-filtered frame. Re-run from the loading cell instead.
d = df[df.fraud_ind != 1]
df = df[df.fraud_ind != 0]
def Average(lst):
    """Return the arithmetic mean of the values in ``lst``.

    Parameters
    ----------
    lst : iterable of numbers
        Any iterable (list, tuple, Series, generator, ...). It is consumed once.

    Returns
    -------
    The sum of the values divided by their count, using true division, so
    integer inputs are not floor-divided under Python 2.

    Raises
    ------
    ZeroDivisionError
        If ``lst`` contains no values.
    """
    # Local import keeps the function self-contained; truediv behaves like
    # Python 3's `/` on every interpreter version.
    from operator import truediv

    # Materialise once so iterators/generators can be both summed and counted.
    values = list(lst)
    if not values:
        raise ZeroDivisionError('Average requires at least one data point')
    return truediv(sum(values), len(values))
def stddev(data, ddof=0):
    """Return the standard deviation of ``data``.

    Calculates the population standard deviation by default; specify
    ddof=1 to compute the sample standard deviation.

    Parameters
    ----------
    data : iterable of numbers
        The observations. The iterable is consumed once.
    ddof : int, optional
        Delta degrees of freedom; the divisor is ``len(data) - ddof``.
        Must be smaller than the number of observations.

    Returns
    -------
    float
        Square root of the sum of squared deviations from the mean,
        divided by ``len(data) - ddof``.

    Raises
    ------
    ValueError
        If fewer than two data points are supplied.
    """
    values = list(data)
    n = len(values)
    if n < 2:
        raise ValueError('variance requires at least two data points')
    # Sum of squared deviations, computed inline so the function does not depend
    # on a separate `_ss` helper. float(n) forces true division on Python 2.
    center = sum(values) / float(n)
    ss = sum((value - center) ** 2 for value in values)
    pvar = ss / (n - ddof)
    return pvar ** 0.5
# Standard deviation and mean of 'Email First Seen Days' on the current `df`.
# Neither result is bound to a name and neither is the cell's last statement,
# so the values are computed and then discarded.
np.std(df['Email First Seen Days'])
np.mean(df['Email First Seen Days'])
# Defining feature matrix (X) and response vector (y) by column position.
# NOTE(review): positions 1, 7 and 17 depend on the CSV column order, which is
# not visible here -- confirm they select the intended columns.
y = df.iloc[:,1] # response: fraudster yes or no (column at position 1)
X = df.iloc[:,7:] # other variables: every column from position 7 onward
x = df.iloc[:,17] # single column at position 17
# Parse column 1 as numeric and write it back into `df`. This runs after `y`
# was extracted above, so `y` holds whatever column 1 contained before this line.
df.iloc[:,1]=pd.to_numeric(df.iloc[:,1])
In [ ]:
# Fit an ordinary least-squares line with 'fraud_ind' as the single predictor and
# 'Confidence Score' as the response, then print the fitted slope and intercept.
filtered_data = df
npMatrix = np.matrix(filtered_data)
# Column vectors taken by position (14 and 17). They are not used by the fit below.
# NOTE(review): positions depend on the CSV column order -- confirm what they select.
X, Y = npMatrix[:,14], npMatrix[:,17]
# NOTE(review): an earlier cell filters `df` with `fraud_ind != 0`, so the predictor
# may be (nearly) constant here, and any NaN left by errors='coerce' would make
# fit() raise -- confirm the inputs before relying on the coefficients.
mdl = LinearRegression().fit(filtered_data[['fraud_ind']],filtered_data['Confidence Score'])
m = mdl.coef_[0]
b = mdl.intercept_
# Call form of print: required on Python 3, and with a single argument it prints
# the same text on Python 2.
print("formula: y = {0}x + {1}".format(m, b))