In [ ]:
This notebook is based on "Intro to pandas data structures" by Greg Reda (http://www.gregreda.com/2013/10/26/intro-to-pandas-data-structures/).

In [6]:
# import and configure the required modules.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Show up to 50 columns when displaying wide frames. The fully qualified key is
# required: the short form 'max_columns' matches more than one option on
# current pandas and raises OptionError.
pd.set_option('display.max_columns', 50)
%matplotlib inline

In [9]:
# Series: a one-dimensional labelled array. Mixing ints, floats and strings
# makes pandas store the values with the generic `object` dtype.
s = pd.Series(
    data=[
        8,
        'This is a string',
        3.14,
        -1.423423423423,
        "Another string!",
    ]
)
s


Out[9]:
0                   8
1    This is a string
2                3.14
3            -1.42342
4     Another string!
dtype: object

In [10]:
# Build a Series from a dict: the keys become the index labels.
d = {'New Delhi': 11, 'Bombay': 22, 'Kolkata': 33, 'Chennai': 44, 'Bangalore': 80}
# Label order for a dict-built Series depends on the Python/pandas version
# (older releases sorted the keys, newer ones keep insertion order), so sort
# explicitly to get the same alphabetical display everywhere.
cities = pd.Series(d).sort_index()
cities


Out[10]:
Bangalore    80
Bombay       22
Chennai      44
Kolkata      33
New Delhi    11
dtype: int64

In [11]:
# Look up a single value by its index label.
cities['Bangalore']


Out[11]:
80

In [13]:
# Passing a list of labels returns a Series with the entries in the requested order.
cities[['Bangalore', 'Kolkata', 'Bombay']]


Out[13]:
Bangalore    80
Kolkata      33
Bombay       22
dtype: int64

In [14]:
# Comparisons are applied element-wise and produce a boolean Series.
cities < 20


Out[14]:
Bangalore    False
Bombay       False
Chennai      False
Kolkata      False
New Delhi     True
dtype: bool

In [15]:
# The opposite comparison: True for every entry greater than 20.
cities > 20


Out[15]:
Bangalore     True
Bombay        True
Chennai       True
Kolkata       True
New Delhi    False
dtype: bool

In [16]:
# Boolean indexing: keep only the entries where the mask is True.
cities[cities > 20]


Out[16]:
Bangalore    80
Bombay       22
Chennai      44
Kolkata      33
dtype: int64

In [18]:
# Assign to a single label to change its value in place, printing the value
# before and after. print() with a pre-formatted string produces the same
# output on Python 2 and Python 3 (the bare `print x, y` statement is a
# SyntaxError on Python 3).
print('old value: {}'.format(cities['Bangalore']))
cities['Bangalore'] = 90
print('New value: {}'.format(cities['Bangalore']))


old value: 80
New value: 90

In [19]:
# Display the whole Series to confirm that only 'Bangalore' changed.
cities


Out[19]:
Bangalore    90
Bombay       22
Chennai      44
Kolkata      33
New Delhi    11
dtype: int64

In [20]:
# Show every entry below 90, then overwrite all of them through a boolean mask.
print(cities[cities < 90])
print('\n')
cities[cities < 90] = 750

# Nothing is below 90 any more, so this prints an empty Series.
# (print() call form, consistent with the lines above and valid on Python 3.)
print(cities[cities < 90])


Bombay       22
Chennai      44
Kolkata      33
New Delhi    11
dtype: int64


Series([], dtype: int64)

In [21]:
# Display the Series after the masked assignment.
cities


Out[21]:
Bangalore     90
Bombay       750
Chennai      750
Kolkata      750
New Delhi    750
dtype: int64

In [22]:
# `in` tests membership against the index labels, not the values.
# print() call form works on both Python 2 and Python 3.
print('Bangalore' in cities)
print('Mysore' in cities)


True
False

In [23]:
# Arithmetic with a scalar is applied to every element; the result here is float64.
cities / 3


Out[23]:
Bangalore     30.0
Bombay       250.0
Chennai      250.0
Kolkata      250.0
New Delhi    250.0
dtype: float64

In [24]:
# NumPy functions can be applied to a Series directly; the index labels are kept.
np.square(cities)


Out[24]:
Bangalore      8100
Bombay       562500
Chennai      562500
Kolkata      562500
New Delhi    562500
dtype: int64

In [26]:
# Ask for labels that may not exist. Plain list indexing raises KeyError for a
# missing label on current pandas, whereas reindex() returns NaN for it (the
# result becomes float64 so that it can hold the NaN).
cities.reindex(['New Delhi', 'Bombay', 'Mangalore'])


Out[26]:
New Delhi    750.0
Bombay       750.0
Mangalore      NaN
dtype: float64

In [28]:
# Arithmetic aligns the two operands on their index labels; a label missing
# from either side yields NaN. reindex() is used because list indexing with
# absent labels ('Mangalore', 'Mysore') raises KeyError on current pandas.
cities.reindex(['New Delhi', 'Bombay', 'Mangalore']) + cities.reindex(['Bangalore', 'Mysore'])


Out[28]:
Bangalore   NaN
Bombay      NaN
Mangalore   NaN
Mysore      NaN
New Delhi   NaN
dtype: float64

In [29]:
# Boolean Series: True wherever a value is present (not NaN).
cities.notnull()


Out[29]:
Bangalore    True
Bombay       True
Chennai      True
Kolkata      True
New Delhi    True
dtype: bool

In [30]:
# Boolean Series: True wherever a value is missing (NaN).
cities.isnull()


Out[30]:
Bangalore    False
Bombay       False
Chennai      False
Kolkata      False
New Delhi    False
dtype: bool

In [32]:
# Column-oriented records: each key names a column, each list holds that
# column's values (one entry per team-season).
data = dict(
    year=[2010, 2011, 2012, 2011, 2012, 2010, 2011, 2012],
    team=['Bears', 'Bears', 'Bears', 'Packers', 'Packers', 'Lions', 'Lions', 'Lions'],
    wins=[11, 8, 10, 15, 11, 6, 10, 4],
    losses=[5, 8, 6, 1, 5, 10, 6, 12],
)

In [33]:
# Build a DataFrame from the `data` dict; `columns` fixes the left-to-right
# column order.
football = pd.DataFrame(
    data=data,
    columns=['year', 'team', 'wins', 'losses'],
)
football


Out[33]:
year team wins losses
0 2010 Bears 11 5
1 2011 Bears 8 8
2 2012 Bears 10 6
3 2011 Packers 15 1
4 2012 Packers 11 5
5 2010 Lions 6 10
6 2011 Lions 10 6
7 2012 Lions 4 12

In [38]:
# Peek at the first five lines of the raw CSV using the shell's `head` command.
!head -n 5 data.csv


Year,Age,Tm,Lg,W,L,W-L%,ERA,G,GS,GF,CG,SHO,SV,IP,H,R,ER,HR,BB,IBB,SO,HBP,BK,WP,BF,ERA+,WHIP,H/9,HR/9,BB/9,SO/9,SO/BB,Awards
1995,25,NYY,AL,5,3,.625,5.51,19,10,2,0,0,0,67.0,71,43,41,11,30,0,51,2,1,0,301,84,1.507,9.5,1.5,4.0,6.9,1.70,
1996,26,NYY,AL,8,3,.727,2.09,61,0,14,0,0,5,107.2,73,25,25,1,34,3,130,2,0,1,425,240,0.994,6.1,0.1,2.8,10.9,3.82,CYA-3MVP-12
1997,27,NYY,AL,6,4,.600,1.88,66,0,56,0,0,43,71.2,65,17,15,5,20,6,68,0,0,2,301,239,1.186,8.2,0.6,2.5,8.5,3.40,ASMVP-25
1998,28,NYY,AL,3,0,1.000,1.91,54,0,49,0,0,36,61.1,48,13,13,3,17,1,36,1,0,0,246,233,1.060,7.0,0.4,2.5,5.3,2.12,

In [41]:
# Load the CSV into a DataFrame; the file's first line supplies the column names.
from_csv = pd.read_csv("data.csv")
# Preview the first five rows.
from_csv.head()


Out[41]:
Year Age Tm Lg W L W-L% ERA G GS GF CG SHO SV IP H R ER HR BB IBB SO HBP BK WP BF ERA+ WHIP H/9 HR/9 BB/9 SO/9 SO/BB Awards
0 1995 25 NYY AL 5 3 0.625 5.51 19 10 2 0 0 0 67.0 71 43 41 11 30 0 51 2 1 0 301 84 1.507 9.5 1.5 4.0 6.9 1.70 NaN
1 1996 26 NYY AL 8 3 0.727 2.09 61 0 14 0 0 5 107.2 73 25 25 1 34 3 130 2 0 1 425 240 0.994 6.1 0.1 2.8 10.9 3.82 CYA-3MVP-12
2 1997 27 NYY AL 6 4 0.600 1.88 66 0 56 0 0 43 71.2 65 17 15 5 20 6 68 0 0 2 301 239 1.186 8.2 0.6 2.5 8.5 3.40 ASMVP-25
3 1998 28 NYY AL 3 0 1.000 1.91 54 0 49 0 0 36 61.1 48 13 13 3 17 1 36 1 0 0 246 233 1.060 7.0 0.4 2.5 5.3 2.12 NaN
4 1999 29 NYY AL 4 3 0.571 1.83 66 0 63 0 0 45 69.0 43 15 14 2 18 3 52 3 1 2 268 257 0.884 5.6 0.3 2.3 6.8 2.89 ASCYA-3MVP-14

In [42]:
# Column names are supplied by hand: header=None tells read_csv that the file
# has no header row, and `names` provides the labels to use instead.
cols = [
    'num', 'game', 'date', 'team', 'home_away', 'opponent', 'result',
    'quarter', 'distance', 'receiver', 'score_before', 'score_after',
]
no_headers = pd.read_csv(
    'peyton-passing-TDs-2012.csv',
    sep=',',
    header=None,
    names=cols,
)
no_headers.head()


Out[42]:
num game date team home_away opponent result quarter distance receiver score_before score_after
0 1 1 2012-09-09 DEN NaN PIT W 31-19 3 71 Demaryius Thomas Trail 7-13 Lead 14-13*
1 2 1 2012-09-09 DEN NaN PIT W 31-19 4 1 Jacob Tamme Trail 14-19 Lead 22-19*
2 3 2 2012-09-17 DEN @ ATL L 21-27 2 17 Demaryius Thomas Trail 0-20 Trail 7-20
3 4 3 2012-09-23 DEN NaN HOU L 25-31 4 38 Brandon Stokley Trail 11-31 Trail 18-31
4 5 3 2012-09-23 DEN NaN HOU L 25-31 4 6 Joel Dreessen Trail 18-31 Trail 25-31

In [44]:
# Write the frame to out.csv. With the default arguments the row index is
# written as the first column; pass index=False to leave it out.
no_headers.to_csv('out.csv')

In [ ]: