In [2]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt 
import sklearn
import statistics
from sklearn.linear_model import LinearRegression


---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
<ipython-input-2-9167a3746a6c> in <module>
----> 1 import pandas as pd
      2 import numpy as np
      3 import matplotlib.pyplot as plt
      4 import sklearn
      5 import statistics

ModuleNotFoundError: No module named 'pandas'

In [1]:
# Load the exercise data. Every column is read as a string (dtype='unicode')
# so the conversion to category codes / numbers is done explicitly afterwards.
# Previous input file: "/Users/sstrano/Downloads/SrSAExercise2018_inputs_with_results.csv"
# NOTE(review): absolute, user-specific path — consider making it configurable.
p_file = "/Users/sstrano/Downloads/SrSAExercise2018_inputs_with_results_1.csv"

# `on_bad_lines='warn'` replaces `error_bad_lines=False`, which was deprecated in
# pandas 1.3 and removed in pandas 2.0. Like the old flag combined with its
# default `warn_bad_lines=True`, it skips malformed rows and reports each one.
df = pd.read_csv(p_file, sep=',', on_bad_lines='warn', index_col=False, dtype='unicode')

# Untouched copy of the raw string data, taken before any column is encoded.
# Copying avoids parsing the same file a second time.
d = df.copy()

# Label-valued columns that are replaced by their integer category codes.
categorical_cols = [
    'Primary Phone to Name',
    'Primary Phone Is Valid',
    'Primary Phone Line Type',
    'Primary Address Is Valid',
    'Primary Address to Name',
    'Email Is Valid',
    'Email to Name',
    'IP Is Proxy',
]
for col in categorical_cols:
    df[col] = df[col].astype('category').cat.codes


  File "<ipython-input-1-e0414d2c4dd3>", line 57
    print "formula: y = {0}x + {1}".format(m, b)
                                  ^
SyntaxError: invalid syntax

In [ ]:
# converting data types
# Every column still stored as strings (object dtype) is parsed to numbers;
# values that cannot be parsed become NaN (errors='coerce'). The conversion is
# done once, column by column — the original ran it twice (a bulk apply followed
# by this same loop) and also evaluated a bare `df.dtypes.eq(object)` whose
# result was discarded.
cols = df.columns[df.dtypes.eq(object)]
for c in cols:
    df[c] = pd.to_numeric(df[c], errors='coerce')

# Split on the fraud indicator. `d` has to be derived first, because `df` is
# rebound to its own subset on the following line.
# NOTE(review): with `!=`, rows whose fraud_ind is NaN satisfy both conditions
# and therefore end up in both `d` and `df` — confirm that this is intended.
d = df[df.fraud_ind != 1]
df = df[df.fraud_ind != 0]

def Average(lst):
    """Return the arithmetic mean of ``lst``: the sum of its items divided by their count."""
    total = sum(lst)
    count = len(lst)
    return total / count

def stddev(data, ddof=0):
    """Return the standard deviation of ``data``.

    Calculates the population standard deviation by default; specify
    ``ddof=1`` to compute the sample standard deviation.

    Args:
        data: Sized collection of numbers. ``len`` is taken and the items are
            iterated over twice, so a one-shot iterator is not suitable.
        ddof: Delta degrees of freedom; the sum of squared deviations is
            divided by ``n - ddof``.

    Returns:
        The square root of the variance.

    Raises:
        ValueError: If ``data`` holds fewer than two items.
    """
    n = len(data)
    if n < 2:
        raise ValueError('variance requires at least two data points')
    # Sum of squared deviations from the mean, computed inline. The original
    # delegated this to a `_ss` helper that is not defined in this notebook,
    # so every call failed with NameError.
    centre = sum(data) / n
    ss = sum((value - centre) ** 2 for value in data)
    pvar = ss / (n - ddof)
    return pvar ** 0.5

# Summary statistics of the 'Email First Seen Days' column. They are printed
# explicitly: a bare expression in the middle of a cell is evaluated and then
# thrown away, so the original two lines produced no visible result.
print("Email First Seen Days std:", np.std(df['Email First Seen Days']))
print("Email First Seen Days mean:", np.mean(df['Email First Seen Days']))


# defining feature matrix(X) and response vector(y)
# The label column is made numeric *before* `y` is taken from it. The original
# converted it afterwards, leaving it unclear whether `y` saw the converted
# values.
df.iloc[:,1]=pd.to_numeric(df.iloc[:,1])
y = df.iloc[:,1] # fraudster yes or no
X = df.iloc[:,7:] # other variables
x = df.iloc[:,17]

In [ ]:
filtered_data = df
npMatrix = np.matrix(filtered_data)
# NOTE(review): X and Y are not used by the fit below — confirm they are
# still needed.
X, Y = npMatrix[:,14], npMatrix[:,17]

# Fit 'Confidence Score' as a linear function of 'fraud_ind'.
# Rows missing either value are dropped first: the upstream coercion to numeric
# can introduce NaN, and LinearRegression.fit rejects input containing NaN.
# NOTE(review): if `df` was already narrowed to a single fraud_ind value
# upstream, the feature is constant here and the slope carries no information.
fit_data = filtered_data[['fraud_ind', 'Confidence Score']].dropna()
mdl = LinearRegression().fit(fit_data[['fraud_ind']], fit_data['Confidence Score'])
m = mdl.coef_[0]  # slope for the single feature
b = mdl.intercept_
# print is a function in Python 3; the Python 2 statement form used here
# before is a SyntaxError on this kernel.
print("formula: y = {0}x + {1}".format(m, b))