In [1]:
from __future__ import print_function
from __future__ import division

from tqdm import *
import csv
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn
import random
import re
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.decomposition import PCA, RandomizedPCA
from sklearn import svm
from sklearn.grid_search import GridSearchCV, ParameterGrid
from sklearn.pipeline import Pipeline

In [2]:
# Location of the full Reuters RCV1 CSV file; read_and_save opens this path.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path_to_csv_data = "/home/felipe/auto-tagger/data/RawRCV1/csv/reuters-rcv1-full.csv"

In [5]:
# documents = []
# tags = []

# Row ranges handed to read_and_save, as half-open [begin, end) intervals.
# read_and_save stops as soon as the row index reaches `end`, so `end` itself
# is NOT read; consecutive chunks must therefore share their boundary value
# (…, 100000], [100000, …) or one row per boundary is silently dropped.
CHUNK_SIZE = 100000
CHUNKS_ROW_LIMIT = 900000  # upper bound on the row indices covered by the chunks

chunks = [
    [chunk_start, chunk_start + CHUNK_SIZE]
    for chunk_start in range(0, CHUNKS_ROW_LIMIT, CHUNK_SIZE)
]
    
# http://stackoverflow.com/a/654046/436721
def read_and_save(begin,end,tags,path=None):
    """Append the labels field of CSV rows ``begin <= i < end`` to ``tags``.

    The file is read from the top on every call; rows before ``begin`` are
    skipped and reading stops once the row index reaches ``end``.

    Parameters
    ----------
    begin : int
        Index of the first row to read (0-based, inclusive).
    end : int
        Index at which to stop (exclusive).
    tags : list
        List that receives the raw labels string of each selected row.
        It is mutated in place.
    path : str, optional
        CSV file to read. Defaults to the notebook-level ``path_to_csv_data``.

    Returns
    -------
    list
        The same ``tags`` object, also when the file ends before ``end``.

    Raises
    ------
    ValueError
        If a selected row does not have exactly four fields.
    """
    if path is None:
        # Looked up at call time so the default follows the notebook constant.
        path = path_to_csv_data

    with open(path) as csv_file:
        reader = csv.reader(csv_file,escapechar='\\')

        for (i,line) in enumerate(reader):

            if i < begin:
                continue

            if i >= end:
                break

            # Rows are unpacked as (id, title, body, labels); only the labels
            # field is collected here.
            (_doc_id,_title,_body,labels) = line

            tags.append(labels)

    # Reached both on `break` and when the file runs out before `end`, so a
    # short final chunk still yields its rows instead of an implicit None.
    return tags

# Build and summarise a label indicator matrix for each chunk of rows.
for fromindex,toindex in chunks:
    # read_and_save appends into the list it is given, so hold our own
    # reference instead of depending on its return value.
    out_tags = []
    read_and_save(fromindex,toindex,out_tags)

    if not out_tags:
        # No rows fell inside this range (e.g. it lies past the end of the
        # file); there is nothing to encode.
        continue

    series = pd.Series(out_tags)

    # One 0/1 column per distinct comma-separated label, stored as uint8.
    partial_Y_df = series.str.get_dummies(sep=',').astype(np.uint8)

    # DataFrame.info() prints its report itself and returns None; wrapping it
    # in print() only added a stray "None" line after each summary.
    partial_Y_df.info()


---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-5-4018a7caf08d> in <module>()
     38     series = pd.Series(out_tags)
     39 
---> 40     partial_Y_df = series.str.get_dummies(sep=',').astype(np.uint8)
     41 
     42     print(partial_Y_df.info())

/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/strings.pyc in get_dummies(self, sep)
   1624         result, name = str_get_dummies(data, sep)
   1625         return self._wrap_result(result, use_codes=(not self._is_categorical),
-> 1626                                  name=name, expand=True)
   1627 
   1628     @copy(str_translate)

/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/strings.pyc in _wrap_result(self, result, use_codes, name, expand)
   1391             if expand:
   1392                 cons = self._orig._constructor_expanddim
-> 1393                 return cons(result, columns=name, index=index)
   1394             else:
   1395                 # Must be a Series

/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    303                     if is_named_tuple(data[0]) and columns is None:
    304                         columns = data[0]._fields
--> 305                     arrays, columns = _to_arrays(data, columns, dtype=dtype)
    306                     columns = _ensure_index(columns)
    307 

/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _to_arrays(data, columns, coerce_float, dtype)
   5550         data = lmap(tuple, data)
   5551         return _list_to_arrays(data, columns, coerce_float=coerce_float,
-> 5552                                dtype=dtype)
   5553 
   5554 

/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _list_to_arrays(data, columns, coerce_float, dtype)
   5607         content = list(lib.to_object_array(data).T)
   5608     return _convert_object_array(content, columns, dtype=dtype,
-> 5609                                  coerce_float=coerce_float)
   5610 
   5611 

/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _convert_object_array(content, columns, coerce_float, dtype)
   5675         return arr
   5676 
-> 5677     arrays = [convert(arr) for arr in content]
   5678 
   5679     return arrays, columns

/home/felipe/venv2-global/local/lib/python2.7/site-packages/pandas/core/frame.pyc in convert(arr)
   5671     def convert(arr):
   5672         if dtype != object and dtype != np.object:
-> 5673             arr = lib.maybe_convert_objects(arr, try_float=coerce_float)
   5674             arr = _possibly_cast_to_datetime(arr, dtype)
   5675         return arr

KeyboardInterrupt: 

In [ ]:
# Sanity check: number of documents vs. number of label strings.
# NOTE(review): the only visible definitions of `documents` and `tags` are
# commented out, so this raises NameError on a fresh kernel -- confirm where
# they are meant to be populated.
(len(documents),len(tags))

In [ ]:
# Peek at the first raw labels entry.
# NOTE(review): relies on a module-level `tags` that no visible cell defines.
tags[0]

In [ ]:
def preprocessor(string):
    """Strip markup remnants from ``string`` and lower-case it.

    First removes every literal ``&lt;`` entity, then removes anything that
    looks like a tag (``<`` up to the next ``>``), and finally lower-cases
    the result.

    Parameters
    ----------
    string : str
        Raw text to clean.

    Returns
    -------
    str
        The cleaned, lower-cased text.
    """
    repl = re.sub('&lt;','',string)
    # Chain on `repl`, not `string`: applying the second substitution to the
    # original input threw away the entity removal done on the line above.
    repl = re.sub('<[^>]+>','',repl)
    return repl.lower()

In [ ]:
# Wrap the label strings in a Series so the vectorised .str accessors apply.
# NOTE(review): depends on a module-level `tags` that no visible cell defines.
series = pd.Series(tags)

In [ ]:
# split = series.str.split(',');split

In [ ]:
# Label indicator matrix: one 0/1 column per distinct comma-separated label,
# downcast to uint8.
# NOTE(review): the same call on a single chunk appears in the recorded
# KeyboardInterrupt traceback of an earlier cell, so expect this to be slow
# on the full series.
Y_df = series.str.get_dummies(sep=',').astype(np.uint8)

In [ ]:
# Print a summary of the label matrix (columns, dtypes, memory usage).
Y_df.info()

In [ ]:
# Alternative encoding attempt; rebinds Y_df, replacing any earlier value.
# NOTE(review): series.str.split(',') produces list values, which
# pd.get_dummies presumably cannot hash -- confirm this cell actually runs,
# or drop it in favour of series.str.get_dummies(sep=',').
Y_df =pd.get_dummies(series.str.split(',')).astype(np.uint8)

In [ ]:
# First row of the label matrix as a raw NumPy array.
Y_df.head(1).iloc[0].values

In [ ]: