In [2]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
import re

os.chdir('/Users/zhouyu/Documents/Zhou_Yu/DS/kaggle_challenge/text processing')


/Users/zhouyu/Documents/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [3]:
bio_data = pd.read_csv("biology.csv")

In [4]:
bio_data.shape


Out[4]:
(13196, 4)

In [8]:
bio_data.describe(include =['object'])


Out[8]:
title content tags
count 13196 13196 13196
unique 13190 13195 8245
top What is the name of this flower? <p>Can anyone explain how contraction of the c... evolution
freq 2 2 169

In [6]:
bio_data.head(5)


Out[6]:
id title content tags
0 1 what is the criticality of the ribosome bindin... <p>in prokaryotic translation, how critical fo... ribosome binding-sites translation synthetic-b...
1 2 how is rnase contamination in rna based experi... <p>does anyone have any suggestions to prevent... rna biochemistry
2 3 are lymphocyte sizes clustered in two groups? <p>tortora writes in <em>principles of anatomy... immunology cell-biology hematology
3 4 how long does antibiotic-dosed lb maintain goo... <p>various people in our lab will prepare a li... cell-culture
4 5 is exon order always preserved in splicing? <p>are there any cases in which the splicing m... splicing mrna spliceosome introns exons

In [5]:
# pre-process those strings
import string
from nltk.corpus import stopwords
features = []
for i in bio_data.columns:
    if bio_data[i].dtype == 'object':
        features.append(i)
        bio_data[i] = bio_data[i].map(text2list)

In [30]:
bio_data.columns[bio_data.duplicated().sum()>0]


/Users/zhouyu/Documents/anaconda/lib/python2.7/site-packages/pandas/indexes/base.py:1264: VisibleDeprecationWarning: using a boolean instead of an integer will result in an error in the future
  return getitem(key)
Out[30]:
'id'

In [31]:
bio_data.describe()


Out[31]:
id
count 13196.000000
mean 26207.265080
std 15391.494086
min 1.000000
25% 13522.250000
50% 26332.500000
75% 40584.250000
max 51264.000000

In [7]:
import string
s = "how can I, 'understand##' this c++ quesions?"
#use standarad library
#regex = re.compile('[%s]' % re.escape(string.punctuation))
#k=re.sub(regex,'',s)
#k.split(' ')
# use custom set
regtest = re.compile('[^a-zA-Z0-9_\\+\\-/]')
#re.sub(regtest,'',s)
#wordset = [word.strip().lower() for word in regtest.split(s)]
wordset = [word for word in regtest.split(s) if word!='']

wordset


Out[7]:
['how', 'can', 'I', 'understand', 'this', 'c++', 'quesions']

In [8]:
a = "A dsg B"
a.lower()


Out[8]:
'a dsg b'

In [ ]: