In [1]:
import pandas as pd
iris_filename = 'data/datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, sep=',', decimal='.', header=None,
                   names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'])
In [2]:
iris.describe()
Out[2]:
In [3]:
iris.fillna(1)
Out[3]:
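A variation (not part of the original notebook): instead of a constant, missing values can be filled with per-column statistics, for example the column means.
# Sketch: fill NaNs with each numeric column's mean instead of a constant
iris_filled = iris.fillna(iris.mean(numeric_only=True))
print iris_filled.describe()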
In [4]:
bad_dataset = pd.read_csv('data/loading_example_1.csv',error_bad_lines=False)
In [5]:
bad_dataset.fillna(-1)
Out[5]:
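Note that fillna returns a new DataFrame rather than modifying bad_dataset in place; a quick sketch of keeping the result:
# Sketch: assign the filled result (or use inplace=True) to keep it
bad_dataset_filled = bad_dataset.fillna(-1)
print bad_dataset_filled.describe()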
In [6]:
bad_dataset.describe()
Out[6]:
In [7]:
bad_dataset
Out[7]:
In [8]:
iris_chunks = pd.read_csv(iris_filename, header=None,
                          names=['c1', 'c2', 'c3', 'c4', 'c5'], chunksize=10)
In [9]:
iris_chunks
Out[9]:
In [10]:
for chunk in iris_chunks:
    print chunk.shape
    print chunk
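Since the loop above consumes the reader, a fresh one is created here; the chunks can then be reassembled into a single DataFrame with pd.concat (a minimal sketch, not part of the original notebook):
# Sketch: rebuild the full DataFrame from the chunk iterator
iris_chunks = pd.read_csv(iris_filename, header=None,
                          names=['c1', 'c2', 'c3', 'c4', 'c5'], chunksize=10)
iris_rebuilt = pd.concat(iris_chunks, ignore_index=True)
print iris_rebuilt.shape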
In [11]:
iris_iterator = pd.read_csv(iris_filename, header=None,
                            names=['c1', 'c2', 'c3', 'c4', 'c5'], iterator=True)
print iris_iterator.get_chunk(10)
print iris_iterator.get_chunk(20)
piece = iris_iterator.get_chunk(2)
piece
Out[11]:
In [12]:
import csv
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.DictReader(data_stream,
            fieldnames=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'],
            dialect='excel')):
        if n == 0:
            print n, row
        else:
            break
In [13]:
import csv
with open(iris_filename, 'rb') as data_stream:
    for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
        if n == 0:
            print n, row
        else:
            break
In [14]:
import pandas as pd
my_own_dataset = pd.DataFrame({'col1': range(5), 'col2': [1.0]*5, 'col3': 1.9, 'col4': 'Hello!'})
print my_own_dataset
print my_own_dataset.dtypes
In [15]:
my_own_dataset['col1'] = my_own_dataset['col1'].astype(float)
print my_own_dataset.dtypes
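In reasonably recent pandas versions, astype also accepts a column-to-type mapping, which casts several columns in one call (a small sketch):
# Sketch: cast more than one column at once via a dict mapping
my_own_dataset = my_own_dataset.astype({'col1': int, 'col2': int})
print my_own_dataset.dtypes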
In [16]:
mask_feature = iris['sepal_length'] > 6.0
print mask_feature.head(10)
In [17]:
mask_target=iris['target']=='Iris-virginica'
print mask_target.head(5)
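The two masks can be combined with boolean indexing to select the matching rows (a minimal sketch using standard pandas selection, not shown in the original cells):
# Sketch: rows that are Iris-virginica and have sepal_length > 6.0
print iris[mask_feature & mask_target].head()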
In [18]:
print iris['target'].unique()
In [19]:
grouped_targets_mean = iris.groupby(['target']).mean()
print grouped_targets_mean
In [20]:
iris.sort_values(by='sepal_length').head()
Out[20]:
In [21]:
dataset = pd.read_csv('data/loading_example_2.csv',index_col=0)
dataset
Out[21]:
In [22]:
print dataset['val3'][104]
print dataset.loc[104,'val3']
print dataset.ix[104,2]
print dataset.iloc[4,2]
print dataset.ix[range(100,102),['val2','val3']]
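Since .ix was later removed from pandas, the two .ix lookups above can also be written with .loc alone (a sketch, assuming rows labelled 100 and 101 exist, as the range call implies):
# Sketch: .ix replacements using purely label-based access
print dataset.loc[104, dataset.columns[2]]        # label for the row, column picked by position
print dataset.loc[[100, 101], ['val2', 'val3']]   # label-based fancy indexing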
In [23]:
import pandas as pd
categorical_feature = pd.Series(['sunny','cloudy','snowy','rainy','foggy'])
mapping = pd.get_dummies(categorical_feature)
mapping
Out[23]:
In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ohe = OneHotEncoder()
levels = ['sunny','cloudy','snowy','rainy','foggy']
fit_levs = le.fit_transform(levels)
ohe.fit([[level] for level in fit_levs])
print ohe.transform([le.transform(['sunny'])]).toarray()
print ohe.transform([le.transform(['cloudy'])]).toarray()
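A more compact alternative (a sketch, not in the original notebook) is scikit-learn's LabelBinarizer, which maps the string levels straight to one-hot rows:
from sklearn.preprocessing import LabelBinarizer
lb = LabelBinarizer()
lb.fit(levels)
print lb.transform(['sunny'])
print lb.transform(['cloudy'])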
In [25]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med','sci.space']
twenty_sci_news = fetch_20newsgroups(categories = categories)
print twenty_sci_news.data[0]
print twenty_sci_news.filenames
print twenty_sci_news.target[0]
print twenty_sci_news.target_names[twenty_sci_news.target[0]]
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
print word_count.shape
print word_count[0]
word_list = count_vect.get_feature_names()
for n in word_count[0].indices:
    print "word:", word_list[n], " appears", word_count[0, n], " times"
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vect = TfidfVectorizer(use_idf=False,norm='l1')
word_freq=tf_vect.fit_transform(twenty_sci_news.data)
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    print "word:", word_list[n], " has frequency ", word_freq[0, n]
In [26]:
text1='we love data science'
text2='data science is hard'
documents = [text1,text2]
print documents
count_vect_1_grams = CountVectorizer(ngram_range=(1,1),stop_words=[],min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print 'word list = ', word_list
print "text1 is described with", [word_list[n] + "("+ str(word_count[0,n]) + ")" for n in word_count[0].indices]
In [27]:
count_vect_2_grams = CountVectorizer(ngram_range=(2, 2), stop_words=[], min_df=1)
word_count = count_vect_2_grams.fit_transform(documents)
word_list = count_vect_2_grams.get_feature_names()
print 'word list = ', word_list
print "text1 is described with", [word_list[n] + "(" + str(word_count[0, n]) + ")" for n in word_count[0].indices]
In [28]:
count_vect_1_2_grams = CountVectorizer(ngram_range=(1, 2), stop_words=[], min_df=1)
word_count = count_vect_1_2_grams.fit_transform(documents)
word_list = count_vect_1_2_grams.get_feature_names()
print 'word list = ', word_list
print "text1 is described with", [word_list[n] + "(" + str(word_count[0, n]) + ")" for n in word_count[0].indices]