notebook.community

Edit and run



In [2]:

    
%matplotlib inline

import pandas as pd
import numpy as np
import os
import re

os.chdir('/Users/zhouyu/Documents/Zhou_Yu/DS/kaggle_challenge/text processing')









    



/Users/zhouyu/Documents/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')



In [3]:

    
bio_data = pd.read_csv("biology.csv")



In [4]:

    
bio_data.shape









    Out[4]:





(13196, 4)



In [8]:

    
bio_data.describe(include =['object'])









    Out[8]:






  
    
      
      title
      content
      tags
    
  
  
    
      count
      13196
      13196
      13196
    
    
      unique
      13190
      13195
      8245
    
    
      top
      What is the name of this flower?
      <p>Can anyone explain how contraction of the c...
      evolution
    
    
      freq
      2
      2
      169



In [6]:

    
bio_data.head(5)









    Out[6]:






  
    
      
      id
      title
      content
      tags
    
  
  
    
      0
      1
      what is the criticality of the ribosome bindin...
      <p>in prokaryotic translation, how critical fo...
      ribosome binding-sites translation synthetic-b...
    
    
      1
      2
      how is rnase contamination in rna based experi...
      <p>does anyone have any suggestions to prevent...
      rna biochemistry
    
    
      2
      3
      are lymphocyte sizes clustered in two groups?
      <p>tortora writes in <em>principles of anatomy...
      immunology cell-biology hematology
    
    
      3
      4
      how long does antibiotic-dosed lb maintain goo...
      <p>various people in our lab will prepare a li...
      cell-culture
    
    
      4
      5
      is exon order always preserved in splicing?
      <p>are there any cases in which the splicing m...
      splicing mrna spliceosome introns exons



In [5]:

    
# pre-process those strings
import string
from nltk.corpus import stopwords
features = []
for i in bio_data.columns:
    if bio_data[i].dtype == 'object':
        features.append(i)
        bio_data[i] = bio_data[i].map(text2list)



In [30]:

    
bio_data.columns[bio_data.duplicated().sum()>0]









    



/Users/zhouyu/Documents/anaconda/lib/python2.7/site-packages/pandas/indexes/base.py:1264: VisibleDeprecationWarning: using a boolean instead of an integer will result in an error in the future
  return getitem(key)






    Out[30]:





'id'



In [31]:

    
bio_data.describe()









    Out[31]:






  
    
      
      id
    
  
  
    
      count
      13196.000000
    
    
      mean
      26207.265080
    
    
      std
      15391.494086
    
    
      min
      1.000000
    
    
      25%
      13522.250000
    
    
      50%
      26332.500000
    
    
      75%
      40584.250000
    
    
      max
      51264.000000



In [7]:

    
import string
s = "how can I, 'understand##' this c++ quesions?"
#use standarad library
#regex = re.compile('[%s]' % re.escape(string.punctuation))
#k=re.sub(regex,'',s)
#k.split(' ')
# use custom set
regtest = re.compile('[^a-zA-Z0-9_\\+\\-/]')
#re.sub(regtest,'',s)
#wordset = [word.strip().lower() for word in regtest.split(s)]
wordset = [word for word in regtest.split(s) if word!='']

wordset









    Out[7]:





['how', 'can', 'I', 'understand', 'this', 'c++', 'quesions']



In [8]:

    
a = "A dsg B"
a.lower()









    Out[8]:





'a dsg b'



In [ ]:

	title	content	tags
count	13196	13196	13196
unique	13190	13195	8245
top	What is the name of this flower?	<p>Can anyone explain how contraction of the c...	evolution
freq	2	2	169

	id	title	content	tags
0	1	what is the criticality of the ribosome bindin...	<p>in prokaryotic translation, how critical fo...	ribosome binding-sites translation synthetic-b...
1	2	how is rnase contamination in rna based experi...	<p>does anyone have any suggestions to prevent...	rna biochemistry
2	3	are lymphocyte sizes clustered in two groups?	<p>tortora writes in <em>principles of anatomy...	immunology cell-biology hematology
3	4	how long does antibiotic-dosed lb maintain goo...	<p>various people in our lab will prepare a li...	cell-culture
4	5	is exon order always preserved in splicing?	<p>are there any cases in which the splicing m...	splicing mrna spliceosome introns exons

	id
count	13196.000000
mean	26207.265080
std	15391.494086
min	1.000000
25%	13522.250000
50%	26332.500000
75%	40584.250000
max	51264.000000