In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
# Apply the 'ggplot' matplotlib style sheet to the plots drawn below.
plt.style.use('ggplot')
In [3]:
# Series of the integers 5 through 9 with the default 0..4 index.
x = pd.Series(range(5, 10))
In [4]:
# Display the Series.
x
Out[4]:
In [5]:
# Sum, mean and standard deviation of x, shown together as a tuple.
x.sum(), x.mean(), x.std()
Out[5]:
In [6]:
# Element-wise square of every value in x.
x**2
Out[6]:
In [7]:
# Boolean-mask selection: keep only the elements that are >= 8.
x[x >= 8]
Out[7]:
In [8]:
# Positional slice: the elements at positions 1, 2 and 3.
x.iloc[1:4]
Out[8]:
In [9]:
# Replace the index of x with the letter labels 'a'..'e', then display it.
x.index = list('abcde')
x
Out[9]:
In [10]:
# Label-based slice from 'a' through 'c'.
x.loc['a':'c']
Out[10]:
In [11]:
# Positional slice still works after relabelling: positions 1, 2 and 3.
x.iloc[1:4]
Out[11]:
In [12]:
# Look up individual elements by their labels.
x['a'], x['c'], x['e']
Out[12]:
In [13]:
# Series with two missing values (NaN) in the middle.
y = pd.Series([10, np.nan, np.nan, 13, 14])
y
Out[13]:
In [14]:
# Stack x and y into a single Series and display it.
z = pd.concat([x, y])
z
Out[14]:
In [15]:
# Discard the existing index labels (drop=True) in favour of a fresh default index.
z = z.reset_index(drop=True)
z
Out[15]:
In [16]:
# Sum, mean and standard deviation of z, shown together as a tuple.
# NOTE(review): pandas reductions skip NaN by default — confirm that is the intent here.
z.sum(), z.mean(), z.std()
Out[16]:
In [17]:
# Select only the missing entries of z.
z[z.isna()]
Out[17]:
In [18]:
# Select only the non-missing entries of z.
z[z.notna()]
Out[18]:
In [19]:
# Show z with every missing value replaced by 0 (z itself is not reassigned).
z.fillna(0)
Out[19]:
In [20]:
# Forward fill: each missing value takes the last valid value before it.
# Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
z.ffill()
Out[20]:
In [21]:
# Backward fill: each missing value takes the next valid value after it.
# Series.bfill() replaces the deprecated fillna(method='bfill') spelling.
z.bfill()
Out[21]:
In [22]:
# Show z with every missing value replaced by the mean of z.
z.fillna(z.mean())
Out[22]:
In [23]:
# Replace the index of z with a date range starting 1 Jan 2016, one entry per element.
z.index = pd.date_range('01-Jan-2016', periods=len(z))
In [24]:
# Display z with its new date index.
z
Out[24]:
In [25]:
# Group the date-indexed values into weekly ('W') bins and sum each bin.
z.resample('W').sum()
Out[25]:
In [26]:
# Format every index timestamp as text in the form 'Jan 01, 2016'.
z.index.strftime('%b %d, %Y')
Out[26]:
In [27]:
# Load the Titanic CSV from the seaborn-data GitHub repository (needs network access).
url = 'https://raw.githubusercontent.com/mwaskom/seaborn-data/master/titanic.csv'
titanic = pd.read_csv(url)
In [28]:
# (rows, columns) of the loaded frame.
titanic.shape
Out[28]:
In [29]:
# Column labels of the full dataset.
titanic.columns
Out[29]:
In [30]:
# Keep only a subset of the columns so the displayed tables stay narrow.
titanic = titanic.loc[:, [
    'survived', 'sex', 'age', 'fare', 'embarked',
    'class', 'who', 'deck', 'embark_town',
]]
In [31]:
# Data type of each remaining column.
titanic.dtypes
Out[31]:
In [32]:
# Row with index label 0. `.ix` was removed from pandas; `.loc` is the
# label-based replacement.
titanic.loc[0]
Out[32]:
In [33]:
# Summary statistics of the frame.
titanic.describe()
Out[33]:
In [34]:
# First few rows.
titanic.head()
Out[34]:
In [35]:
# Last few rows.
titanic.tail()
Out[35]:
In [36]:
# Column labels after the column selection above.
titanic.columns
Out[36]:
In [37]:
# Row index of the frame.
titanic.index
Out[37]:
In [38]:
# Select three columns by name and show the first few rows.
titanic[['sex', 'age', 'class']].head()
Out[38]:
In [39]:
# Positional row slice: rows at positions 10 through 14.
titanic.iloc[10:15]
Out[39]:
In [40]:
# Label-based selection: row labels 10 through 15 and the columns from
# 'age' through 'fare'. `.ix` was removed from pandas; `.loc` replaces it.
titanic.loc[10:15, 'age':'fare']
Out[40]:
In [41]:
# Row labels 10 through 15 with the columns at positions 1, 3 and 5.
# `.ix` was removed from pandas; the positional column numbers are turned
# into labels so that a single `.loc` call can do the mixed selection.
titanic.loc[10:15, titanic.columns[[1, 3, 5]]]
Out[41]:
In [42]:
# Rows for passengers whose age is below 2.
titanic.loc[titanic['age'] < 2]
Out[42]:
In [43]:
# Sort by the row index and show the first few rows.
titanic.sort_index().head()
Out[43]:
In [44]:
# Sort by age, youngest first (ascending is the default), and show the top rows.
titanic.sort_values(by='age').head()
Out[44]:
In [45]:
# Sort by survived, then by age within each survived value; both keys
# ascending (the default). Show the top rows.
titanic.sort_values(by=['survived', 'age']).head()
Out[45]:
In [46]:
# Group the passengers by every (sex, class) combination; reused by the cells below.
sex_class = titanic.groupby(['sex', 'class'])
In [47]:
# Per-group count for each column.
sex_class.count()
Out[47]:
In [48]:
# Per-group means. numeric_only=True restricts the mean to the numeric
# columns; without it pandas >= 2.0 raises on the string columns.
df = sex_class.mean(numeric_only=True)
# Mean of the 'survived' column for each (sex, class) group.
df['survived']
Out[48]:
In [49]:
# Take the ('female', 'First') group and count its rows per embarkation town.
sex_class.get_group(('female', 'First')).groupby('embark_town').count()
Out[49]:
In [50]:
# Cross-tabulation of survived against class.
pd.crosstab(titanic.survived, titanic['class'])
Out[50]:
In [51]:
def my_func(x):
    """Return the largest value in ``x`` by delegating to ``np.max``.

    Serves as an example of a user-defined aggregation function.
    """
    return np.max(x)
In [52]:
# Per-column aggregations: a single function name, a list of several
# functions (built-in names and a user-defined callable), and the name 'sum'.
# The string 'sum' is used instead of the builtin `sum` because recent pandas
# warns when the builtin is passed; the functions for 'age' are given as a
# list, the documented form for several aggregations on one column.
mapped_funcs = {'embarked': 'count', 'age': ['mean', 'median', my_func], 'survived': 'sum'}
sex_class.get_group(('female', 'First')).groupby('embark_town').agg(mapped_funcs)
Out[52]:
In [53]:
# Column labels again, for reference before plotting.
titanic.columns
Out[53]:
In [54]:
import seaborn as sns
sns.set_context(font_scale=4)
# Box plots of age by sex, coloured by survival, one panel per class.
# `catplot` is the current name of the function formerly called `factorplot`,
# which has been removed from seaborn.
sns.catplot(x='sex', y='age', hue='survived', col='class', kind='box', data=titanic)
# Trailing statement so the cell does not echo the plot object's repr.
pass
In [55]:
from pandas_datareader import data as web
import datetime
In [56]:
# Daily AAPL prices for calendar year 2015 (needs network access).
# The 'google' source was removed from pandas-datareader, so Stooq is used
# instead. Stooq returns rows newest-first; sort_index() restores
# chronological order for the date slicing and plots below.
# NOTE(review): Stooq prices are split-adjusted, so absolute values may differ
# from the original Google Finance figures — confirm this is acceptable.
apple = web.DataReader('AAPL', 'stooq',
                       start=datetime.datetime(2015, 1, 1),
                       end=datetime.datetime(2015, 12, 31)).sort_index()
In [57]:
# First few rows of the downloaded price data.
apple.head()
Out[57]:
In [58]:
# Line plot of the 'Close' column with small circular markers.
apple.plot.line(y='Close', marker='o', markersize=3, linewidth=0.5)
# Trailing statement so the cell does not echo the Axes repr.
pass
In [59]:
# Zoom in on large drop in August
# Partial-string indexing selects the whole month; the previous end bound of
# '2015-08-30' left out 31 August.
aug = apple.loc['2015-08']
aug.plot.line(y=['High', 'Low', 'Open', 'Close'], marker='o', markersize=10, linewidth=1)
# Trailing statement so the cell does not echo the Axes repr.
pass
In [60]:
# First two rows of the frame that is about to be exported.
titanic.head(2)
Out[60]:
In [61]:
# Write the frame to titanic.csv in the working directory, without the row index.
titanic.to_csv('titanic.csv', index=False)
In [62]:
# Read the CSV back and show the first two rows.
t1 = pd.read_csv('titanic.csv')
t1.head(2)
Out[62]:
In [63]:
!pip install openpyxl
t1.to_excel('titanic.xlsx')
In [64]:
# Read the Excel workbook back and show the first two rows.
t2 = pd.read_excel('titanic.xlsx')
t2.head(2)
Out[64]:
In [65]:
import sqlite3
# Open (or create) the SQLite database file titanic.db.
# NOTE(review): the connection is not closed anywhere in the visible cells.
con = sqlite3.connect('titanic.db')
# Write the frame to a table named 'titanic', replacing any existing table
# of that name and leaving out the row index.
t2.to_sql('titanic', con, index=False, if_exists='replace')
In [66]:
# Read the whole 'titanic' table back through the open connection and show two rows.
t3 = pd.read_sql('select * from titanic', con)
t3.head(2)
Out[66]:
In [67]:
# Write the frame to titanic.json in the working directory.
t3.to_json('titanic.json')
In [68]:
# Read the JSON file back and show the first two rows.
t4 = pd.read_json('titanic.json')
t4.head(2)
Out[68]:
In [69]:
# Put the columns of t4 into the same order as those of t3, then show two rows.
t4 = t4[t3.columns]
t4.head(2)
Out[69]: