Jupyter Notebook Basics


In [1]:
names = ['alice', 'jonathan', 'bobby']
ages = [24, 32, 45]
ranks = ['kinda cool', 'really cool', 'insanely cool']

In [3]:
# Walk the three parallel lists in lockstep; zip() pairs up the i-th entries.
for name, age, rank in zip(names, ages, ranks):
    # One pre-formatted string prints the same text whether `print` is the
    # Python 2 statement or the Python 3 function.
    print("{} {} {}".format(name, age, rank))


alice 24 kinda cool
jonathan 32 really cool
bobby 45 insanely cool

In [4]:
# enumerate() puts a running index in front of each zipped (name, age, rank) tuple.
for index, (name, age, rank) in enumerate(zip(names, ages, ranks)):
    # One pre-formatted string prints the same text whether `print` is the
    # Python 2 statement or the Python 3 function.
    print("{} {} {} {}".format(index, name, age, rank))


0 alice 24 kinda cool
1 jonathan 32 really cool
2 bobby 45 insanely cool

In [5]:
# return, esc, shift+enter, ctrl+enter
# text keyboard shortcuts -- cmd > (right), < left,
# option delete (deletes words)
# type "h" for help
# tab
# shift-tab
# keyboard shortcuts
#  - a, b, y, m, dd, h, ctrl+shift+-

In [14]:
%matplotlib inline
%config InlineBackend.figure_format='retina'

import matplotlib.pyplot as plt
# no pylab
import seaborn as sns
sns.set_context('talk')
sns.set_style('darkgrid') 
plt.rcParams['figure.figsize'] = 12, 8  # plotsize 

import numpy as np
# don't do `from numpy import *`
import pandas as pd

In [9]:
# If you have a specific function that you'd like to import
from numpy.random import randn

In [10]:
x = np.arange(100)
y = np.sin(x)
plt.plot(x, y)#;


Out[10]:
[<matplotlib.lines.Line2D at 0x10c020290>]

In [12]:
%matplotlib notebook


/Users/jonathan/anaconda/lib/python2.7/site-packages/IPython/kernel/__init__.py:13: ShimWarning: The `IPython.kernel` package has been deprecated. You should import from ipykernel or jupyter_client instead.
  "You should import from ipykernel or jupyter_client instead.", ShimWarning)

In [13]:
x = np.arange(10)
y = np.sin(x)
plt.plot(x, y)#;


Out[13]:
[<matplotlib.lines.Line2D at 0x10ced5d50>]

Magics!

  • % and %% magics
  • interact
  • embed image
  • embed links, youtube
  • link notebooks

Check out http://matplotlib.org/gallery.html and select your favorite.


In [15]:
%%bash
# List the current directory five times, tagging each entry with the pass number.
for num in {1..5}
do
    for infile in *
    do
        # Quote the expansions so a name containing spaces or glob characters
        # is passed through as one literal argument.
        echo "$num" "$infile"
    done
    # Runs after the inner loop, so $infile still holds the last name the glob
    # produced; only that one file is counted on each pass.
    wc "$infile"
done


1 00-Overview.html
1 00-Overview.ipynb
1 00-Overview.py
1 01-Initial_Demo.html
1 01-Initial_Demo.ipynb
1 01-Initial_Demo.py
1 02-Exploratory_(Interactive)_Data_Analysis.html
1 02-Exploratory_(Interactive)_Data_Analysis.ipynb
1 02-Exploratory_(Interactive)_Data_Analysis.py
1 03-Some_basics.html
1 03-Some_basics.ipynb
1 03-Some_basics.py
1 04-More_basics.html
1 04-More_basics.ipynb
1 04-More_basics.py
1 05-Pandas_Plotting_and_SQL.html
1 05-Pandas_Plotting_and_SQL.ipynb
1 05-Pandas_Plotting_and_SQL.py
1 06-Extras.html
1 06-Extras.ipynb
1 06-Extras.py
1 07-github-integration.html
1 07-github-integration.ipynb
1 07-github-integration.py
1 Data_Cleaning.ipynb
1 ex1.png
1 example.txt
1 github-integration.ipynb
     347     918    8952 github-integration.ipynb
2 00-Overview.html
2 00-Overview.ipynb
2 00-Overview.py
2 01-Initial_Demo.html
2 01-Initial_Demo.ipynb
2 01-Initial_Demo.py
2 02-Exploratory_(Interactive)_Data_Analysis.html
2 02-Exploratory_(Interactive)_Data_Analysis.ipynb
2 02-Exploratory_(Interactive)_Data_Analysis.py
2 03-Some_basics.html
2 03-Some_basics.ipynb
2 03-Some_basics.py
2 04-More_basics.html
2 04-More_basics.ipynb
2 04-More_basics.py
2 05-Pandas_Plotting_and_SQL.html
2 05-Pandas_Plotting_and_SQL.ipynb
2 05-Pandas_Plotting_and_SQL.py
2 06-Extras.html
2 06-Extras.ipynb
2 06-Extras.py
2 07-github-integration.html
2 07-github-integration.ipynb
2 07-github-integration.py
2 Data_Cleaning.ipynb
2 ex1.png
2 example.txt
2 github-integration.ipynb
     347     918    8952 github-integration.ipynb
3 00-Overview.html
3 00-Overview.ipynb
3 00-Overview.py
3 01-Initial_Demo.html
3 01-Initial_Demo.ipynb
3 01-Initial_Demo.py
3 02-Exploratory_(Interactive)_Data_Analysis.html
3 02-Exploratory_(Interactive)_Data_Analysis.ipynb
3 02-Exploratory_(Interactive)_Data_Analysis.py
3 03-Some_basics.html
3 03-Some_basics.ipynb
3 03-Some_basics.py
3 04-More_basics.html
3 04-More_basics.ipynb
3 04-More_basics.py
3 05-Pandas_Plotting_and_SQL.html
3 05-Pandas_Plotting_and_SQL.ipynb
3 05-Pandas_Plotting_and_SQL.py
3 06-Extras.html
3 06-Extras.ipynb
3 06-Extras.py
3 07-github-integration.html
3 07-github-integration.ipynb
3 07-github-integration.py
3 Data_Cleaning.ipynb
3 ex1.png
3 example.txt
3 github-integration.ipynb
     347     918    8952 github-integration.ipynb
4 00-Overview.html
4 00-Overview.ipynb
4 00-Overview.py
4 01-Initial_Demo.html
4 01-Initial_Demo.ipynb
4 01-Initial_Demo.py
4 02-Exploratory_(Interactive)_Data_Analysis.html
4 02-Exploratory_(Interactive)_Data_Analysis.ipynb
4 02-Exploratory_(Interactive)_Data_Analysis.py
4 03-Some_basics.html
4 03-Some_basics.ipynb
4 03-Some_basics.py
4 04-More_basics.html
4 04-More_basics.ipynb
4 04-More_basics.py
4 05-Pandas_Plotting_and_SQL.html
4 05-Pandas_Plotting_and_SQL.ipynb
4 05-Pandas_Plotting_and_SQL.py
4 06-Extras.html
4 06-Extras.ipynb
4 06-Extras.py
4 07-github-integration.html
4 07-github-integration.ipynb
4 07-github-integration.py
4 Data_Cleaning.ipynb
4 ex1.png
4 example.txt
4 github-integration.ipynb
     347     918    8952 github-integration.ipynb
5 00-Overview.html
5 00-Overview.ipynb
5 00-Overview.py
5 01-Initial_Demo.html
5 01-Initial_Demo.ipynb
5 01-Initial_Demo.py
5 02-Exploratory_(Interactive)_Data_Analysis.html
5 02-Exploratory_(Interactive)_Data_Analysis.ipynb
5 02-Exploratory_(Interactive)_Data_Analysis.py
5 03-Some_basics.html
5 03-Some_basics.ipynb
5 03-Some_basics.py
5 04-More_basics.html
5 04-More_basics.ipynb
5 04-More_basics.py
5 05-Pandas_Plotting_and_SQL.html
5 05-Pandas_Plotting_and_SQL.ipynb
5 05-Pandas_Plotting_and_SQL.py
5 06-Extras.html
5 06-Extras.ipynb
5 06-Extras.py
5 07-github-integration.html
5 07-github-integration.ipynb
5 07-github-integration.py
5 Data_Cleaning.ipynb
5 ex1.png
5 example.txt
5 github-integration.ipynb
     347     918    8952 github-integration.ipynb

In [20]:
# Parenthesised single argument: same output under the Python 2 print statement
# and the Python 3 print function.
print("hi")
!pwd


hi
/Users/jonathan/github/jupyter-best-practices/notebooks

In [17]:
!ping google.com


PING google.com (216.58.192.46): 56 data bytes
64 bytes from 216.58.192.46: icmp_seq=0 ttl=57 time=14.025 ms
64 bytes from 216.58.192.46: icmp_seq=1 ttl=57 time=14.696 ms
64 bytes from 216.58.192.46: icmp_seq=2 ttl=57 time=16.200 ms
64 bytes from 216.58.192.46: icmp_seq=3 ttl=57 time=14.233 ms
^C
--- google.com ping statistics ---
4 packets transmitted, 4 packets received, 0.0% packet loss
round-trip min/avg/max/stddev = 14.025/14.788/16.200/0.850 ms


In [18]:
this_is_magic = "Can you believe you can pass variables and strings like this?"

In [22]:
hey = !echo $this_is_magic

In [23]:
hey


Out[23]:
['Can you believe you can pass variables and strings like this?']

Numpy

If you have arrays of numbers, use numpy or pandas (built on numpy) to represent the data. Both are backed by very fast underlying code.


In [24]:
x = np.arange(10000)

# NumPy abbreviates long arrays with "..." instead of dumping every element.
# Parenthesised single argument: works as a Python 2 statement and a Python 3 call.
print(x)  # smart printing


[   0    1    2 ..., 9997 9998 9999]

In [25]:
# Each print takes one parenthesised argument, so the output is the same under
# the Python 2 print statement and the Python 3 print function.
print(x[0])    # first element
print(x[-1])   # last element
print(x[0:5])  # first 5 elements (also x[:5])
print(x[:])    # "Everything"


0
9999
[0 1 2 3 4]
[   0    1    2 ..., 9997 9998 9999]

In [26]:
print(x[-5:])  # last five elements


[9995 9996 9997 9998 9999]

In [27]:
print(x[-5:-2])  # fifth-from-last up to, but not including, second-from-last


[9995 9996 9997]

In [28]:
print(x[-5:-1])  # not final value -- not inclusive on right


[9995 9996 9997 9998]

In [ ]:


In [29]:
x = np.random.randint(5, 5000, (3, 5))

In [30]:
x


Out[30]:
array([[1674, 2256, 3525, 3811, 2539],
       [2051, 2165, 3020,  859, 4961],
       [3581, 3649, 3196, 2494, 1693]])

In [31]:
np.sum(x)


Out[31]:
41474

In [32]:
x.sum()


Out[32]:
41474

In [42]:
np.sum(x)


Out[42]:
41474

In [41]:
np.sum(x, axis=0)


Out[41]:
array([7306, 8070, 9741, 7164, 9193])

In [43]:
np.sum(x, axis=1)


Out[43]:
array([13805, 13056, 14613])

In [44]:
x.sum(axis=1)


Out[44]:
array([13805, 13056, 14613])

In [45]:
# Multi dimension array slice with a comma
x[:, 2]


Out[45]:
array([3525, 3020, 3196])

In [ ]:


In [46]:
y = np.linspace(10, 20, 11)
y


Out[46]:
array([ 10.,  11.,  12.,  13.,  14.,  15.,  16.,  17.,  18.,  19.,  20.])

In [47]:
np.linspace?

In [ ]:
# Tooltip demo: put the cursor inside the parentheses and press shift-tab
# (shift-tab-tab for the expanded docstring).
np.linspace(10, 20, 11)
# Completion demo: type `np.` and press tab to list the module's attributes.
# A bare `np.` is not left in the cell because it is a SyntaxError when run.

In [48]:
def does_it(first=x, second=y):
    """Do nothing; a stub whose signature and docstring can be inspected.

    ``first`` and ``second`` default to whatever the notebook-level ``x`` and
    ``y`` were bound to when this ``def`` was executed. Neither argument is
    used, and the function returns ``None``.
    """

In [49]:
y[[3, 5, 7]]


Out[49]:
array([ 13.,  15.,  17.])

In [ ]:
does_it()

In [51]:
# Write a three-column text table: a linear ramp, uniform noise, and sin(ramp).
num = 3000
x = np.linspace(1.0, 300.0, num)
y = np.random.rand(num)
z = np.sin(x)
# column_stack lays the 1-D arrays side by side, one row per sample.
np.savetxt("example.txt", np.column_stack((x, y, z)))

In [52]:
%less example.txt

In [53]:
!wc example.txt


    3000    9000  226497 example.txt

In [54]:
!head example.txt


1.000000000000000000e+00 6.755524804082274626e-01 8.414709848078965049e-01
1.099699899966655625e+00 2.573566361522948709e-01 8.910711957212333889e-01
1.199399799933311028e+00 1.668969491698735208e-02 9.318214309529510020e-01
1.299099699899966653e+00 2.776170095289725026e-01 9.633169657262274921e-01
1.398799599866622279e+00 2.211580773928156773e-01 9.852449914594796354e-01
1.498499499833277682e+00 2.385799302492821461e-01 9.973877225308434014e-01
1.598199399799933307e+00 3.891476454005263763e-01 9.996245592899137833e-01
1.697899299766588932e+00 6.537744236852294222e-02 9.919332858340597081e-01
1.797599199733244557e+00 9.270871277442276348e-01 9.743902906531928254e-01
1.897299099699899960e+00 9.065700223682380265e-01 9.471698079515821211e-01

In [55]:
# Not a good idea: indexing the raw line picks out single characters,
# not whitespace-separated columns.
a = []
b = []
# The context manager closes the file as soon as the loop is done.
with open("example.txt", 'r') as handle:
    for line in handle:
        a.append(line[0])
        b.append(line[2])

a[:10] # Whoops!


Out[55]:
['1', '1', '1', '1', '1', '1', '1', '1', '1', '1']

In [56]:
a = []
b = []
# The context manager closes the file as soon as the loop is done.
with open("example.txt", 'r') as handle:
    for line in handle:
        # split() with no argument breaks the line on runs of whitespace.
        line = line.split()
        a.append(line[0])
        b.append(line[2])

a[:10] # Strings!


Out[56]:
['1.000000000000000000e+00',
 '1.099699899966655625e+00',
 '1.199399799933311028e+00',
 '1.299099699899966653e+00',
 '1.398799599866622279e+00',
 '1.498499499833277682e+00',
 '1.598199399799933307e+00',
 '1.697899299766588932e+00',
 '1.797599199733244557e+00',
 '1.897299099699899960e+00']

In [57]:
a = []
b = []
# The context manager closes the file as soon as the loop is done.
with open("example.txt", 'r') as handle:
    for line in handle:
        line = line.split()
        # Convert the first and third columns from text to float.
        a.append(float(line[0]))
        b.append(float(line[2]))

a[:10] # Floats now, but still plain Python lists!


Out[57]:
[1.0,
 1.0996998999666556,
 1.199399799933311,
 1.2990996998999667,
 1.3987995998666223,
 1.4984994998332777,
 1.5981993997999333,
 1.697899299766589,
 1.7975991997332446,
 1.8972990996999]

In [58]:
# Do this!
a, b = np.loadtxt("example.txt", unpack=True, usecols=(0,2))

In [59]:
a


Out[59]:
array([   1.       ,    1.0996999,    1.1993998, ...,  299.8006002,
        299.9003001,  300.       ])

Matplotlib and Numpy


In [60]:
from numpy.random import randn

In [61]:
num = 50
x = np.linspace(2.5, 300, num)
y = randn(num)
plt.scatter(x, y)


Out[61]:
<matplotlib.collections.PathCollection at 0x10da07850>

In [64]:
y > 1


Out[64]:
array([False, False, False, False, False, False, False,  True,  True,
       False, False, False, False, False, False,  True, False, False,
       False, False, False, False,  True, False, False, False, False,
        True, False,  True, False, False, False, False, False, False,
       False, False, False, False,  True, False, False,  True, False,
       False, False,  True, False, False], dtype=bool)

In [65]:
y[y > 1]


Out[65]:
array([ 1.83893164,  1.74924294,  1.63759596,  1.61812317,  1.0952195 ,
        1.65513944,  1.55212009,  1.34197357,  1.16727989])

In [66]:
y[(y < 1) & (y > -1)]


Out[66]:
array([ 0.43218211,  0.53655612,  0.19196851,  0.18948493,  0.82077996,
       -0.22307154,  0.96933913,  0.6985783 ,  0.38431027,  0.1857278 ,
       -0.31456629, -0.44862121, -0.36135728, -0.00888057, -0.39866509,
       -0.91453153, -0.02413912, -0.65570203, -0.490552  , -0.19144114,
        0.19980678, -0.44656607, -0.03864452,  0.09292263, -0.52100933,
        0.64195428, -0.29391021, -0.7878031 , -0.03254576, -0.03610955,
        0.53898577,  0.09182785])

In [67]:
# Boolean mask for the points strictly between -1 and 1; built once, used twice.
within_band = (y < 1) & (y > -1)
# All points in blue, then the masked subset drawn on top in red.
plt.scatter(x, y, c='b', s=50)
plt.scatter(x[within_band], y[within_band], c='r', s=50)


Out[67]:
<matplotlib.collections.PathCollection at 0x10da8ee50>

In [68]:
# Overwrite every sample that is NOT strictly between -1 and 1 with 1.0.
# This assigns into `y` in place.
y[~((y < 1) & (y > -1))] = 1.0
plt.scatter(x, y, c='b')
# np.clip returns a new array limited to [-0.5, 0.5]; `y` itself is not clipped.
plt.scatter(x, np.clip(y, -0.5, 0.5), color='red')


Out[68]:
<matplotlib.collections.PathCollection at 0x10db42150>

In [71]:
num = 350
slope = 0.3
x = randn(num) * 50. + 150.0 
y = randn(num) * 5 + x * slope
plt.scatter(x, y, c='b')


Out[71]:
<matplotlib.collections.PathCollection at 0x1112edb10>

In [72]:
# plt.scatter(x[(y < 1) & (y > -1)], y[(y < 1) & (y > -1)], c='r')
# np.argsort, np.sort, complicated index slicing
dframe = pd.DataFrame({'x': x, 'y': y})
# Name the columns by keyword: newer seaborn releases no longer accept x and y
# as positional arguments, while the keyword form works on older ones too.
g = sns.jointplot(x='x', y='y', data=dframe, kind="reg")


Grab the Python version of ggplot: http://ggplot.yhathq.com/


In [73]:
from ggplot import ggplot, aes, geom_line, stat_smooth, geom_dotplot, geom_point

In [74]:
ggplot(aes(x='x', y='y'), data=dframe) + geom_point() + stat_smooth(colour='blue', span=0.2)


Out[74]:
<ggplot: (289766805)>

In [ ]: