In [5]:
import sys
sys.path.append("./scripts")
from helpers import *
from proj1_helpers import *
from feature_processing import *
import numpy as np
import csv

In [13]:
def prepare_data(dataPath):

    yb , input_data, ids = load_csv_data(dataPath) # load data
    tx =  process_X(input_data)
    
    return ids,yb,tx

In [14]:
ids_train,y_train,tx_train = prepare_data('../data/train.csv')
ids_test,y_test,tx_test =  prepare_data('../data/test.csv')
print(tx_train.shape)


100%|██████████| 5/5 [00:02<00:00,  2.05it/s]
100%|██████████| 5/5 [00:05<00:00,  1.10s/it]
(250000, 73)


In [15]:
def sigmoid(t):
    numerator = np.exp(t)
    denominator = np.add(1,numerator)
    return (numerator/denominator)

In [16]:
def calculate_loss(y, tx, w):
    
    A = (np.exp(tx.dot(w))) + 1 
    mult = np.dot((np.dot(y.T,tx)),w)
    loss_vector = (np.log(A)) - mult
    loss_component = np.sum(loss_vector)
    return loss_component

In [17]:
def calculate_gradient(y, tx, w):
    """compute the gradient of loss."""
   
    step1 = sigmoid(tx.dot(w)) - y
    step2 = tx.T
    gradient = np.dot(step2,step1)
    return gradient

In [18]:
def learning_by_gradient_descent(y, tx, w, gamma):
    """
    Do one step of gradient descen using logistic regression.
    Return the loss and the updated w.
    """
    w_s = w
   
    w_s = w_s - gamma *  calculate_gradient(y, tx, w_s)
    loss =  calculate_loss(y, tx, w_s)  
    
    return loss,w_s

In [19]:
def logistic_regression_gradient_descent_demo(y, x):
    # init parameters
    max_iter = 10000
    threshold = 1e-8
    gamma = 0.000001
    losses = []

    w = np.zeros((x.shape[1], 1))

    # start the logistic regression
    for iter in range(max_iter):
        # get loss and update w.
        loss, w = learning_by_gradient_descent(y, x, w, gamma)
        # log info
        if iter % 100 == 0 and len(losses) > 1:
            print("Current iteration={i}, loss={l}".format(i=iter, l=np.abs(losses[-1] - losses[-2])))
            
            
        # converge criterion
        losses.append(loss)
        if len(losses) > 1 and np.abs(losses[-1] - losses[-2]) < threshold:
            break
    
    return loss,w

In [20]:
w = np.zeros((tx_train.shape[1], 1))
y_train = y_train.reshape((len(y_train),1))
y_train[y_train == -1.0] = 0.0

loss,w = logistic_regression_gradient_descent_demo(y_train, tx_train)
y_pred = predict_labels(w, tx_test)
create_csv_submission(ids_test, y_pred,'../data/baseline_submission.csv')


Current iteration=100, loss=4659286.7527832985
Current iteration=200, loss=8335007.398172379
Current iteration=300, loss=5427126.829332352
Current iteration=400, loss=4022302.8496751785
Current iteration=500, loss=3162457.276881218
Current iteration=600, loss=2597581.432374954
Current iteration=700, loss=2201783.690522194
Current iteration=800, loss=1910784.9320440292
Current iteration=900, loss=1687828.387383461
Current iteration=1000, loss=1510899.0029821396
Current iteration=1100, loss=1366498.5466079712
Current iteration=1200, loss=1246081.0660581589
Current iteration=1300, loss=1143985.993675232
Current iteration=1400, loss=1056272.7476530075
Current iteration=1500, loss=980080.0049915314
Current iteration=1600, loss=913265.1281394958
Current iteration=1700, loss=854187.7504796982
Current iteration=1800, loss=801570.2799119949
Current iteration=1900, loss=754403.0010118484
Current iteration=2000, loss=711877.4132528305
Current iteration=2100, loss=673338.5121927261
Current iteration=2200, loss=638250.0721492767
Current iteration=2300, loss=606168.9189147949
Current iteration=2400, loss=576725.5102434158
Current iteration=2500, loss=549609.065823555
Current iteration=2600, loss=524556.0824842453
Current iteration=2700, loss=501341.4221096039
Current iteration=2800, loss=479771.3694572449
Current iteration=2900, loss=459678.1961364746
Current iteration=3000, loss=440915.8707752228
Current iteration=3100, loss=423356.6399154663
Current iteration=3200, loss=406888.27135276794
Current iteration=3300, loss=391411.80521965027
Current iteration=3400, loss=376839.69835472107
Current iteration=3500, loss=363094.2774372101
Current iteration=3600, loss=350106.4377403259
Current iteration=3700, loss=337814.54036712646
Current iteration=3800, loss=326163.47172927856
Current iteration=3900, loss=315103.8370246887
Current iteration=4000, loss=304591.265750885
Current iteration=4100, loss=294585.81145477295
Current iteration=4200, loss=285051.43120384216
Current iteration=4300, loss=275955.5330963135
Current iteration=4400, loss=267268.58197784424
Current iteration=4500, loss=258963.75503730774
Current iteration=4600, loss=251016.64065170288
Current iteration=4700, loss=243404.97418022156
Current iteration=4800, loss=236108.40641212463
Current iteration=4900, loss=229108.29972839355
Current iteration=5000, loss=222387.5487689972
Current iteration=5100, loss=215930.42236328125
Current iteration=5200, loss=209722.42391586304
Current iteration=5300, loss=203750.1681251526
Current iteration=5400, loss=198001.27174186707
Current iteration=5500, loss=192464.2568397522
Current iteration=5600, loss=187128.46501159668
Current iteration=5700, loss=181983.9809741974
Current iteration=5800, loss=177021.56469345093
Current iteration=5900, loss=172232.59091186523
Current iteration=6000, loss=167608.99510383606
Current iteration=6100, loss=163143.22519493103
Current iteration=6200, loss=158828.1983013153
Current iteration=6300, loss=154657.26203346252
Current iteration=6400, loss=150624.1595993042
Current iteration=6500, loss=146722.998544693
Current iteration=6600, loss=142948.22243881226
Current iteration=6700, loss=139294.58540344238
Current iteration=6800, loss=135757.12908935547
Current iteration=6900, loss=132331.1617488861
Current iteration=7000, loss=129012.23924255371
Current iteration=7100, loss=125796.14781188965
Current iteration=7200, loss=122678.88838005066
Current iteration=7300, loss=119656.66216278076
Current iteration=7400, loss=116725.85759544373
Current iteration=7500, loss=113883.03832054138
Current iteration=7600, loss=111124.93216133118
Current iteration=7700, loss=108448.42095947266
Current iteration=7800, loss=105850.53131103516
Current iteration=7900, loss=103328.42589378357
Current iteration=8000, loss=100879.39559364319
Current iteration=8100, loss=98500.85211753845
Current iteration=8200, loss=96190.32117271423
Current iteration=8300, loss=93945.436170578
Current iteration=8400, loss=91763.93233680725
Current iteration=8500, loss=89643.64122200012
Current iteration=8600, loss=87582.48566246033
Current iteration=8700, loss=85578.47497940063
Current iteration=8800, loss=83629.70051193237
Current iteration=8900, loss=81734.33152198792
Current iteration=9000, loss=79890.61123847961
Current iteration=9100, loss=78096.85322761536
Current iteration=9200, loss=76351.43795204163
Current iteration=9300, loss=74652.8095703125
Current iteration=9400, loss=72999.47286224365
Current iteration=9500, loss=71389.99039840698
Current iteration=9600, loss=69822.9798488617
Current iteration=9700, loss=68297.11137580872
Current iteration=9800, loss=66811.10538291931
Current iteration=9900, loss=65363.73006629944

In [ ]: