In [2]:
%matplotlib inline
import pandas as pd
import numpy as np
import os
import re
# Switch into the directory that holds the data files so that the relative
# "biology.csv" path passed to read_csv resolves.
# The location can be overridden with the BIO_DATA_DIR environment variable;
# when it is unset the original hard-coded path is used, so the behaviour on
# the author's machine is unchanged.
DATA_DIR = os.environ.get(
    'BIO_DATA_DIR',
    '/Users/zhouyu/Documents/Zhou_Yu/DS/kaggle_challenge/text processing',
)
os.chdir(DATA_DIR)
In [3]:
# Load the CSV into a DataFrame. The path is relative, so it is resolved
# against the current working directory.
bio_data = pd.read_csv("biology.csv")
In [4]:
# Dimensions of the loaded frame as (n_rows, n_columns).
bio_data.shape
Out[4]:
In [8]:
# Summary restricted to the object-dtype columns of the frame.
bio_data.describe(include =['object'])
Out[8]:
In [6]:
# Preview the first 5 rows.
bio_data.head(5)
Out[6]:
In [5]:
# pre-process those strings
import string
from nltk.corpus import stopwords
# Names of every object-dtype column in the frame, in column order.
features = [col for col in bio_data.columns if bio_data[col].dtype == 'object']

# Replace each of those columns with the result of running text2list over its
# values. NOTE(review): text2list is not defined in the visible cells; it must
# already exist in the kernel for this cell to run.
for col in features:
    bio_data[col] = bio_data[col].map(text2list)
In [30]:
# Names of the columns that contain at least one repeated value.
# The earlier form indexed `columns` with `bio_data.duplicated().sum() > 0`,
# which is one scalar boolean (are there any duplicate rows?) rather than a
# mask with one entry per column, so it could not select columns.
# NOTE(review): "columns with repeated values" is the assumed intent - confirm.
# Values are compared through their string form so that unhashable cell values
# (presumably the output of text2list - TODO confirm) do not break duplicated().
has_repeats = bio_data.apply(lambda col: col.astype(str).duplicated().any())
bio_data.columns[has_repeats.values]
Out[30]:
In [31]:
# Default summary of the frame (no include/exclude filter given).
bio_data.describe()
Out[31]:
In [7]:
import string
# Sample sentence used to try out tokenisation strategies.
s = "how can I, 'understand##' this c++ quesions?"

# Approach 1 (standard library, left disabled): build a character class from
# string.punctuation, strip those characters with re.sub, then split on ' '.

# Approach 2 (custom set, active): every character that is NOT a letter, a
# digit, '_', '+', '-' or '/' is treated as a separator, so a token such as
# "c++" survives intact.
regtest = re.compile('[^a-zA-Z0-9_\\+\\-/]')

# Consecutive separators make split() emit empty strings; filter(None, ...)
# drops them. A strip()/lower() variant of this step was also tried and is
# not applied here, so the original letter case is kept.
wordset = list(filter(None, regtest.split(s)))
wordset
Out[7]:
In [8]:
# Scratch check of str.lower() on a mixed-case sample string.
a = "A dsg B"
a.lower()
Out[8]:
In [ ]: