In [1]:
# Imports for the fingerprint-featurization pipeline below.
import rdkit
import pandas as pd
import numpy as np
# Fix: scipy.stats.stats is a deprecated private alias (removed in modern
# SciPy); pearsonr's public location is scipy.stats.
from scipy.stats import pearsonr
from sklearn import linear_model
from heapq import nlargest
from rdkit import Chem

In [2]:
# Chunk bounds: this notebook featurizes rows [start, end) of train.csv.
fileName = "train.csv"
start = 700000
end = 750000
# NOTE(review): nrows=end parses ALL 750,000 rows only for a later cell to
# discard the first 700,000 -- the traceback below shows this raising
# MemoryError.  Consider skipping the unwanted data rows at parse time,
# e.g. pd.read_csv(fileName, skiprows=range(1, start + 1), nrows=end - start)
# (row 0 is the header) -- but the later cells index by the original row
# numbers (train.smiles[i] for i in [start, end)), so the resulting frame's
# index would need to be offset to range(start, end) as well.
train = pd.read_csv(fileName,nrows=end)


---------------------------------------------------------------------------
MemoryError                               Traceback (most recent call last)
<ipython-input-2-9ffaa045f7bf> in <module>()
      2 start = 700000
      3 end = 750000
----> 4 train = pd.read_csv(fileName,nrows=end)

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/io/parsers.pyc in parser_f(filepath_or_buffer, sep, dialect, compression, doublequote, escapechar, quotechar, quoting, skipinitialspace, lineterminator, header, index_col, names, prefix, skiprows, skipfooter, skip_footer, na_values, na_fvalues, true_values, false_values, delimiter, converters, dtype, usecols, engine, delim_whitespace, as_recarray, na_filter, compact_ints, use_unsigned, low_memory, buffer_lines, warn_bad_lines, error_bad_lines, keep_default_na, thousands, comment, decimal, parse_dates, keep_date_col, dayfirst, date_parser, memory_map, float_precision, nrows, iterator, chunksize, verbose, encoding, squeeze, mangle_dupe_cols, tupleize_cols, infer_datetime_format, skip_blank_lines)
    463                     skip_blank_lines=skip_blank_lines)
    464 
--> 465         return _read(filepath_or_buffer, kwds)
    466 
    467     parser_f.__name__ = name

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/io/parsers.pyc in _read(filepath_or_buffer, kwds)
    245                                   " together yet.")
    246     elif nrows is not None:
--> 247         return parser.read(nrows)
    248     elif chunksize or iterator:
    249         return parser

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/io/parsers.pyc in read(self, nrows)
    716         index, columns, col_dict = self._create_index(ret)
    717 
--> 718         df = DataFrame(col_dict, columns=columns, index=index)
    719 
    720         if self.squeeze and len(df.columns) == 1:

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/core/frame.pyc in __init__(self, data, index, columns, dtype, copy)
    206                                  dtype=dtype, copy=copy)
    207         elif isinstance(data, dict):
--> 208             mgr = self._init_dict(data, index, columns, dtype=dtype)
    209         elif isinstance(data, ma.MaskedArray):
    210             import numpy.ma.mrecords as mrecords

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _init_dict(self, data, index, columns, dtype)
    334 
    335         return _arrays_to_mgr(arrays, data_names, index, columns,
--> 336                               dtype=dtype)
    337 
    338     def _init_ndarray(self, values, index, columns, dtype=None,

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/core/frame.pyc in _arrays_to_mgr(arrays, arr_names, index, columns, dtype)
   4625     axes = [_ensure_index(columns), _ensure_index(index)]
   4626 
-> 4627     return create_block_manager_from_arrays(arrays, arr_names, axes)
   4628 
   4629 

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/core/internals.pyc in create_block_manager_from_arrays(arrays, names, axes)
   3537 def create_block_manager_from_arrays(arrays, names, axes):
   3538     try:
-> 3539         blocks = form_blocks(arrays, names, axes)
   3540         mgr = BlockManager(blocks, axes)
   3541         mgr._consolidate_inplace()

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/core/internals.pyc in form_blocks(arrays, names, axes)
   3603     blocks = []
   3604     if len(float_items):
-> 3605         float_blocks = _multi_blockify(float_items)
   3606         blocks.extend(float_blocks)
   3607 

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/core/internals.pyc in _multi_blockify(tuples, dtype)
   3679 
   3680         values, placement = _stack_arrays(
-> 3681             list(tup_block), dtype)
   3682 
   3683         block = make_block(values, placement=placement)

/home/luis/.virtualenvs/cs186/local/lib/python2.7/site-packages/pandas/core/internals.pyc in _stack_arrays(tuples, dtype)
   3723     shape = (len(arrays),) + _shape_compat(first)
   3724 
-> 3725     stacked = np.empty(shape, dtype=dtype)
   3726     for i, arr in enumerate(arrays):
   3727         stacked[i] = _asarray_compat(arr)

MemoryError: 

In [5]:
# Keep only this chunk's rows: positions [start, end) of the loaded frame.
# .iloc makes the positional intent of the original `train[start:]` explicit.
train = train.iloc[start:]
# Parenthesized single-argument print works identically under Python 2.
print(len(train.smiles))


50000

In [6]:
from rdkit.Chem import AllChem

# Fingerprint the first molecule of the chunk to seed the feature vector:
# three feature-based Morgan fingerprints (radii 1-3), a hashed atom-pair
# fingerprint, and a hashed topological-torsion fingerprint, concatenated
# into one 512 + 512 + 1024 + 256 + 256 = 2560-bit row.
mol = Chem.MolFromSmiles(train.smiles[start])
morgan_r1 = AllChem.GetMorganFingerprintAsBitVect(mol, 1, nBits=512, useFeatures=True)
morgan_r2 = AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=512, useFeatures=True)
morgan_r3 = AllChem.GetMorganFingerprintAsBitVect(mol, 3, nBits=1024, useFeatures=True)
atom_pairs = AllChem.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=256)
torsions = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(mol, nBits=256)
features = morgan_r1 + morgan_r2 + morgan_r3 + atom_pairs + torsions

In [7]:
# Fingerprint the remaining molecules of the chunk.
# Fix: the original accumulated with `features = features + new_row`, which
# concatenates two bit vectors into a brand-new, ever-longer vector on every
# iteration -- accidental O(n^2) time and allocation churn at 128M bits.
# Accumulate the bits in a plain Python list instead (amortized O(1)
# extends); downstream cells (len(features), np.array(features), reshape)
# see the exact same flat sequence of (end - start) * 2560 bits.
feature_bits = list(features)
for i in range(start + 1, end):
    if i % 10000 == 0:
        print(i)  # coarse progress indicator (parens: Py2/Py3 compatible)
    m = Chem.MolFromSmiles(train.smiles[i])
    x1 = AllChem.GetMorganFingerprintAsBitVect(m, 1, nBits=512, useFeatures=True)
    x2 = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=512, useFeatures=True)
    x3 = AllChem.GetMorganFingerprintAsBitVect(m, 3, nBits=1024, useFeatures=True)
    x4 = AllChem.GetHashedAtomPairFingerprintAsBitVect(m, nBits=256)
    x5 = AllChem.GetHashedTopologicalTorsionFingerprintAsBitVect(m, nBits=256)
    # x1+...+x5 concatenates the five fingerprints into one 2560-bit row
    # (constant-size work per molecule), then its bits extend the list.
    feature_bits.extend(x1 + x2 + x3 + x4 + x5)
features = feature_bits


360000
370000
380000
390000

In [8]:
len(features)


Out[8]:
128000000

In [9]:
new_feats = np.array(features)

In [10]:
new_feats = new_feats.reshape(end-start,2560)

In [11]:
new_feats.shape


Out[11]:
(50000, 2560)

In [12]:
a = pd.DataFrame(new_feats)

In [13]:
a.to_csv("train"  + str(start))

In [ ]: