In [ ]:
from zipfile import ZipFile
import numpy as np
import pandas as pd

In [ ]:
## Initialize the zip container and load the training data into pandas
zf = ZipFile( "./train.csv.zip" )
train_csv = pd.read_csv( zf.open( 'train.csv' ) ) #, nrows = 1000 )

In [ ]:
## Extract the target from the dataset: skip the ID column.
train_X, train_y = train_csv[ train_csv.columns[1:-1] ], train_csv[ train_csv.columns[ -1 ] ]
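
A quick look at the shapes and the class balance is a useful sanity check before modelling; the class counts also preview the priors $\pi_k$ estimated below.

In [ ]:
## Sanity check: feature matrix shape and the class distribution of the target.
print( train_X.shape, train_y.shape )
print( train_y.value_counts() )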

In [ ]:
## Load the test sample
zf = ZipFile( './test.csv.zip' )
df_test = pd.read_csv( zf.open( 'test.csv' ) ) #, nrows = 100 )

## Get the independent variables
X_test = df_test[df_test.columns[1:]]

In [ ]:
from sklearn.neighbors import KDTree

In [ ]:
## Build a k-d tree over the training features for fast nearest-neighbour queries.
kdt = KDTree( train_X, leaf_size = 30 )

In [ ]:
## For each test point, get the indices of its 512 nearest training points (distances are not returned).
knn_index = kdt.query( X_test, k = 512, return_distance = False )
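
The neighbour indices are not used further in this section; one way to use them is a simple majority vote over the neighbours' labels. A minimal sketch (the names `neighbour_labels` and `knn_vote` are introduced here for illustration):

In [ ]:
## Illustrative: classify each test point by a majority vote over the labels
## of its 512 nearest training neighbours.
from collections import Counter
neighbour_labels = train_y.values[ knn_index ]
knn_vote = [ Counter( row ).most_common( 1 )[ 0 ][ 0 ] for row in neighbour_labels ]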

The sample data is $(x_i, t_i)_{i=1}^n$ with $x_i\in\mathbb{N}^d$ and $t_i\in\{0,1\}^K$, where $\sum_{k=1}^K t_{ik} = 1$ for every $i$.

Let $\theta_k = (\theta_{kj})_{j=1}^d\in [0,1]^d$ for $k=1,\ldots, K$ with $\sum_{j=1}^d \theta_{kj} = 1$.

Suppose each observation $(x,t)$ has the following joint distribution: $$p(x,t) = p(x|t)\,\pi_t\,,$$ where $\pi_t$ is the prior on the class $t$ and $$p(x | t) = \frac{\Gamma\bigl(1+\sum_{j=1}^d x_j\bigr)}{\prod_{j=1}^d \Gamma\bigl(1+x_j\bigr)}\prod_{j=1}^d \theta_{tj}^{x_j}\,.$$

The log-likelihood of the model is $$ l(x,t) = \sum_{i=1}^n \sum_{k=1}^K t_{ik} \log \pi_k \frac{\Gamma\bigl(1+\sum_{j=1}^d x_{ij}\bigr)}{\prod_{j=1}^d \Gamma\bigl(1+x_{ij}\bigr)}\prod_{j=1}^d \theta_{kj}^{x_{ij}} \,,$$ whence $$ l(x,t) = \sum_{i=1}^n \log\Gamma\bigl(1+\sum_{j=1}^d x_{ij}\bigr) - \sum_{i=1}^n \sum_{j=1}^d \log\Gamma\bigl(1+x_{ij}\bigr) + \sum_{i=1}^n \sum_{k=1}^K t_{ik} \sum_{j=1}^d x_{ij} \log \theta_{kj} + \sum_{i=1}^n \sum_{k=1}^K t_{ik} \log \pi_k\,. $$

Note that the $1$-of-$K$ coding notation for $t_i$ is used. The Lagrangian of the maximization problem subject to the constraints $\sum_{j=1}^d \theta_{kj}=1$ and $\sum_{k=1}^K \pi_k=1$ is, up to terms that do not depend on the parameters, $$\mathcal{L} = \ldots + \sum_{j=1}^d \sum_{k=1}^K (\log \theta_{kj}) \sum_{i=1}^n t_{ik} x_{ij} + \sum_{k=1}^K (\log \pi_k) \sum_{i=1}^n t_{ik} + \sum_{k=1}^K \lambda_k \bigl(1-\sum_{j=1}^d\theta_{kj}\bigr) + \mu \bigl(1-\sum_{k=1}^K\pi_k\bigr) \,. $$

Setting the derivatives with respect to $\theta_{kj}$ and $\pi_k$ to zero gives $$\hat{\theta}_{kj} = \frac{\sum_{i=1}^n t_{ik} x_{ij}}{\lambda_k}\,\text{ and }\,\hat{\pi}_k = \frac{\sum_{i=1}^n t_{ik}}{\mu} \,,$$ and enforcing the constraints yields $$ \hat{\theta}_{kj} = \frac{\sum_{i=1}^n t_{ik} x_{ij}}{\sum_{j=1}^d \sum_{i=1}^n t_{ik} x_{ij}}\,\text{ and }\,\hat{\pi}_k = \frac{\sum_{i=1}^n t_{ik}}{n}\,.$$
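
These estimators translate directly into a few lines of numpy. A minimal sketch on the training data (the names `T`, `classes`, `theta_hat` and `pi_hat` are introduced here for illustration):

In [ ]:
## Maximum-likelihood estimates from the formulas above:
##   theta_hat[k, j] = sum_i t_ik x_ij / sum_j sum_i t_ik x_ij ,  pi_hat[k] = sum_i t_ik / n .
T_dummies = pd.get_dummies( train_y )                 # n x K one-of-K coding of the targets
classes = T_dummies.columns
T = T_dummies.values.astype( float )
X = train_X.values.astype( float )
counts = T.T.dot( X )                                 # K x d matrix of sum_i t_ik x_ij
theta_hat = counts / counts.sum( axis = 1, keepdims = True )
pi_hat = T.mean( axis = 0 )                           # estimated class priors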

Classification is done with the maximum a posteriori rule: for $x=(x_j)_{j=1}^d \in\mathbb{N}^d$, $$k_x=\mathop{\text{argmax}}_{k=1,\ldots, K} p(x|t=k)\,\pi_k \,, $$ which is given by $$ k_x=\mathop{\text{argmax}}_{k=1,\ldots, K} \sum_{j=1}^d x_{j} \log \hat{\theta}_{kj} + \log \hat{\pi}_k \,.$$
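
Applying the MAP rule with the estimates just computed, as a sketch (the small constant guards against $\log 0$ and is a practical addition, not part of the derivation):

In [ ]:
## MAP classification: k_x = argmax_k  sum_j x_j log theta_hat[k, j] + log pi_hat[k].
log_post = X_test.values.astype( float ).dot( np.log( theta_hat + 1e-12 ).T ) + np.log( pi_hat )
map_class = classes[ log_post.argmax( axis = 1 ) ]

Up to the smoothing constant this is the decision rule of a multinomial naive Bayes model, so `sklearn.naive_bayes.MultinomialNB` gives an essentially equivalent classifier.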


In [ ]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier

In [ ]:
from sklearn.grid_search import GridSearchCV
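
`GridSearchCV` is not used further below; a minimal sketch of how it could be combined with the `RandomForestClassifier` imported above (the parameter grid and the 3-fold cross-validation are illustrative placeholders, not tuned choices):

In [ ]:
## Illustrative: tune a random forest over a small, arbitrary parameter grid.
param_grid = { 'n_estimators': [ 100, 300 ], 'max_depth': [ None, 10 ] }
grid = GridSearchCV( RandomForestClassifier( n_jobs = -1 ), param_grid, cv = 3 )
grid.fit( train_X, train_y )
print( grid.best_params_ )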

In [ ]:
import matplotlib.pyplot as plt
%matplotlib inline

In [ ]:
from sklearn.cluster import KMeans

In [ ]:
## Partition the training features into 9 clusters (unsupervised).
kmc = KMeans( n_clusters = 9, verbose = 10, n_jobs = -1 ).fit( train_X )

In [ ]:
print( kmc.predict( train_X ) )
print( train_y )
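
The raw arrays are hard to compare by eye; a cross-tabulation of cluster assignments against the class labels (a small pandas sketch) shows how the clusters line up with the classes:

In [ ]:
## Rows: cluster id, columns: true class label.
pd.crosstab( pd.Series( kmc.predict( train_X ), name = 'cluster' ), train_y )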

In [ ]:
from sklearn.linear_model import LogisticRegression

In [ ]:
lr = LogisticRegression()

In [ ]:
lr.fit( train_X, train_y )   ## fit before selecting: transform() needs a fitted model and a data matrix
lr.transform( train_X )      ## keep the features with large coefficients (older scikit-learn API)
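
The estimator-level `transform()` was deprecated and later removed from scikit-learn; a minimal sketch of the equivalent feature selection with `SelectFromModel`, assuming a version that provides it:

In [ ]:
## Keep the features whose coefficient magnitude in the fitted logistic regression
## exceeds the default threshold.
from sklearn.feature_selection import SelectFromModel
selector = SelectFromModel( lr, prefit = True )
train_X_selected = selector.transform( train_X )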