In [393]:
import operator
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats as st
import statsmodels.api as sm
import scipy.optimize as op
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC
import matplotlib.pyplot as plt
%matplotlib inline
# Absolute path of the turnstile/weather CSV file read below.
filename = '/Users/excalibur/py/nanodegree/intro_ds/final_project/improved-dataset/turnstile_weather_v2.csv'

# Read the CSV at `filename` into a pandas DataFrame bound to `data`.
data = pd.read_csv(filepath_or_buffer=filename)
After comparing several methods (Ordinary Least Squares [OLS] from \textit{StatsModels}, two different regression techniques from \textit{scikit-learn}, the Broyden–Fletcher–Goldfarb–Shanno [BFGS] optimization algorithm from \textit{scipy.optimize}, and an algebraic solution via the normal equations), OLS from \textit{StatsModels} was chosen because it yielded consistently higher $r$ and $R^{2}$ values across a range of test sample sizes.