In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import os
import re

os.chdir('/Users/zhouyu/Documents/Zhou_Yu/DS/kaggle_challenge/text processing')


/Users/zhouyu/Documents/anaconda/lib/python2.7/site-packages/matplotlib/font_manager.py:273: UserWarning: Matplotlib is building the font cache using fc-list. This may take a moment.
  warnings.warn('Matplotlib is building the font cache using fc-list. This may take a moment.')

In [6]:
raw_text = pd.read_csv("crypto.csv",index_col = None)
raw_text.head(1)


Out[6]:
id title content tags
0 3 What are the benefits of the two permutation t... <p>Why do we use a permutation table in the fi... block-cipher des permutation

In [5]:
raw_text['content'].iloc[0]


Out[5]:
'<p>Why do we use a permutation table in the first step of <a href="http://en.wikipedia.org/wiki/Data_Encryption_Standard" rel="nofollow">DES algorithm</a> and one at the end of the algorithm?</p>\n'

In [4]:
#from bs4 import BeautifulSoup4
import nltk
#nltk.download()
from nltk.corpus import stopwords

In [17]:
def text_to_words(rawtext):
    #split into individual words, remove HTML, only keep letters and number
    # convert letters to lower case
    reg_c = re.compile('[^a-zA-Z0-9_\\+\\-/]')
    words = [word for word in reg_c.split(rawtext.lower()) if word!='']
    stops = set(stopwords.words("english"))
    #take out stop words
    meaningful_words = [w for w in words if not w in stops]
    return(" ".join(meaningful_words))

cleaned_post = []
cleaned_target = []
for i in range(0,3):
    cleaned_post.append(text_to_words(raw_text['title'][i]+' '+raw_text['content'][i]))
    cleaned_target.append(text_to_words(raw_text['tags'][i]))
print cleaned_post


['criticality ribosome binding site relative start codon prokaryotic translation p prokaryotic translation critical efficient translation location ribosome binding site relative start codon /p p ideally supposed -7b away start -9 bases away even observable effect translation /p', 'rnase contamination rna based experiments prevented p anyone suggestions prevent rnase contamination working rna /p p tend issues degradation regardless whether use depc treated / rnase free water filtered pipette tips /p', 'lymphocyte sizes clustered two groups p tortora writes em principles anatomy physiology /em /p blockquote p lymphocytes may small 6 9 diameter large 10 14 diameter /p /blockquote p ranges quite close others taken mean lymphocytes sizes clustered two groups way saying lymphocytes 6-14 /p']

In [13]:



Out[13]:
id title content tags
0 1 What is the criticality of the ribosome bindin... <p>In prokaryotic translation, how critical fo... ribosome binding-sites translation synthetic-b...
1 2 How is RNAse contamination in RNA based experi... <p>Does anyone have any suggestions to prevent... rna biochemistry
2 3 Are lymphocyte sizes clustered in two groups? <p>Tortora writes in <em>Principles of Anatomy... immunology cell-biology hematology

In [ ]: