In [1]:
3 * 4
Out[1]:
In [2]:
x = [1, 2, 3]
print(x)
In [3]:
x.append(4)
print(x)
In [4]:
measurements = {'height': [1.70, 1.80, 1.50], 'weight': [60, 120, 50]}
measurements
Out[4]:
In [5]:
measurements['height']
Out[5]:
In [6]:
x = [1, 2, 3, 4]
[i**2 for i in x]
Out[6]:
In [7]:
def calc_bmi(weight, height):
    """Return the body-mass index: ``weight`` divided by ``height`` squared.

    Parameters
    ----------
    weight : number
        Body weight (the sample data in this notebook looks like kilograms;
        units are not stated -- confirm before reuse).
    height : number
        Body height (presumably metres, see note above). Must be non-zero.

    Returns
    -------
    number
        ``weight / height**2``.
    """
    return weight / height**2
[calc_bmi(w, h) for w, h in zip(measurements['weight'], measurements['height'])]
Out[7]:
In [8]:
import pandas as pd
import numpy as np
In [9]:
s = pd.Series([1,3,5,np.nan,6,8])
s
Out[9]:
In [10]:
dates = pd.date_range('20130101', periods=6)
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD'))
df
Out[10]:
In [11]:
df[df.A > 0]
Out[11]:
In [12]:
df.mean()
Out[12]:
In [13]:
df.mean(axis='columns')
Out[13]:
In [14]:
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1,index=list(range(4)),dtype='float32'),
'D' : np.array([3] * 4,dtype='int32'),
'E' : pd.Categorical(["test","train","test","train"]),
'F' : 'foo' })
df2
Out[14]:
In [15]:
df2.dtypes
Out[15]:
In [16]:
df = pd.DataFrame({'A' : ['foo', 'bar', 'foo', 'bar',
'foo', 'bar', 'foo', 'foo'],
'B' : ['one', 'one', 'two', 'three',
'two', 'two', 'one', 'three'],
'C' : np.random.randn(8),
'D' : np.random.randn(8)})
In [17]:
df
Out[17]:
In [18]:
df.groupby('A').sum()
Out[18]:
In [19]:
df.groupby(['A', 'B']).sum()
Out[19]:
In [20]:
%matplotlib inline
import seaborn as sns
In [21]:
x = np.random.normal(size=100)
# sns.distplot is deprecated (and removed in recent seaborn releases).
# histplot with a KDE overlay on a density scale is its documented replacement;
# cut=3 keeps the KDE tails extending past the data as distplot did.
sns.histplot(x, kde=True, stat="density", kde_kws=dict(cut=3));
In [22]:
mean, cov = [0, 1], [(1, .5), (.5, 1)]
data = np.random.multivariate_normal(mean, cov, 200)
df = pd.DataFrame(data, columns=["x", "y"])
df
Out[22]:
In [23]:
sns.jointplot(x="x", y="y", data=df, kind="kde");
In [24]:
iris = sns.load_dataset("iris")
sns.pairplot(iris);
In [25]:
tips = sns.load_dataset("tips")
In [26]:
tips.head()
Out[26]:
In [27]:
sns.lmplot(x="total_bill", y="tip", hue="smoker", data=tips);
In [28]:
# One regression panel per day, wrapped onto two columns.
# `height` is the current name of the facet-size argument (formerly `size`).
sns.lmplot(x="total_bill", y="tip", col="day", data=tips,
           col_wrap=2, height=3);
In [29]:
# factorplot was renamed to catplot, and its `size` argument to `height`;
# the arguments otherwise carry over unchanged.
sns.catplot(x="time", y="total_bill", hue="smoker",
            col="day", data=tips, kind="box", height=4, aspect=.5);
Taken from http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html
In [30]:
from sklearn import svm
X = [[0, 0], [1, 1]]
y = [0, 1]
clf = svm.SVC()
clf.fit(X, y)
Out[30]:
In [31]:
clf.predict([[0, .5]])
Out[31]:
Taken from http://scikit-learn.org/stable/auto_examples/grid_search_digits.html and http://scikit-learn.org/stable/auto_examples/datasets/plot_digits_last_image.html
In [32]:
from sklearn import datasets
from sklearn.metrics import confusion_matrix
# train_test_split and GridSearchCV live in sklearn.model_selection; the old
# sklearn.cross_validation and sklearn.grid_search modules have been removed.
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.svm import SVC
digits = datasets.load_digits()
In [33]:
import matplotlib.pyplot as plt
# Display the last digit in the dataset (index -1)
plt.figure(1, figsize=(3, 3))
plt.imshow(digits.images[-1], cmap=plt.cm.gray_r, interpolation='nearest')
# Pass a real boolean: the string 'off' is truthy and is no longer
# interpreted as "disable the grid" by matplotlib.
plt.grid(False)
In [34]:
n_samples = len(digits.images)
X = digits.images.reshape((n_samples, -1))
y = digits.target
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.5, random_state=0)
In [35]:
# Set the parameters by cross-validation
tuned_parameters = [{'kernel': ['rbf'], 'gamma': [1e-3, 1e-4],
'C': [1, 10, 100, 1000]},
{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
clf = GridSearchCV(SVC(C=1), tuned_parameters, cv=5)
clf.fit(X_train, y_train)
Out[35]:
In [36]:
print(clf.best_params_)
In [37]:
y_true, y_pred = y_test, clf.predict(X_test)
# confusion_matrix(y_true, y_pred) puts the true classes on rows and the
# predicted classes on columns; heatmap draws rows along the y-axis and
# columns along the x-axis, so the labels must follow that orientation.
ax = sns.heatmap(confusion_matrix(y_true, y_pred))
ax.set(xlabel='predicted label', ylabel='true label');
Taken from https://jakevdp.github.io/blog/2013/06/15/numba-vs-cython-take-2/
In [38]:
import numpy as np
X = np.random.random((1000, 3))
In [39]:
def pairwise_python(X):
    """Return the matrix of pairwise Euclidean distances between the rows of X.

    Parameters
    ----------
    X : 2-D array
        Indexed as ``X[i, k]``; ``X.shape[0]`` rows are compared over
        ``X.shape[1]`` coordinates.

    Returns
    -------
    numpy.ndarray
        ``(M, M)`` float64 array where ``D[i, j]`` is the square root of the
        summed squared coordinate differences between row ``i`` and row ``j``.

    The explicit triple loop is intentional: this function is the plain-Python
    baseline that the Cython and Numba variants in this notebook are timed
    against.
    """
    M = X.shape[0]
    N = X.shape[1]
    # np.float was only an alias for the builtin float and has been removed
    # from NumPy; np.float64 is the same dtype and matches the Cython version.
    D = np.empty((M, M), dtype=np.float64)
    for i in range(M):
        for j in range(M):
            d = 0.0
            for k in range(N):
                tmp = X[i, k] - X[j, k]
                d += tmp * tmp
            D[i, j] = np.sqrt(d)
    return D
%timeit pairwise_python(X)
In [40]:
%load_ext cython
In [41]:
%%cython
import numpy as np
cimport cython
from libc.math cimport sqrt
# Bounds checking and negative-index wraparound are switched off for this
# function, so every index used below must already be within range.
@cython.boundscheck(False)
@cython.wraparound(False)
def pairwise_cython(double[:, ::1] X):
    """Return the pairwise Euclidean distances between the rows of X.

    X is taken as a 2-D memoryview of doubles (declared ``double[:, ::1]``).
    The result is an (M, M) array, M = X.shape[0], in which entry [i, j] is
    the square root of the summed squared differences between rows i and j.
    """
    cdef int M = X.shape[0]
    cdef int N = X.shape[1]
    cdef double tmp, d
    # Output buffer, exposed as a typed memoryview over a fresh NumPy array.
    cdef double[:, ::1] D = np.empty((M, M), dtype=np.float64)
    for i in range(M):
        for j in range(M):
            d = 0.0
            for k in range(N):
                tmp = X[i, k] - X[j, k]
                d += tmp * tmp
            D[i, j] = sqrt(d)
    # Convert the memoryview back to a NumPy array for the caller.
    return np.asarray(D)
In [42]:
%timeit pairwise_cython(X)
In [43]:
# jit is exported from the top-level numba package; the numba.decorators
# module used by older releases no longer exists.
from numba import jit
pairwise_numba = jit(pairwise_python)
# Run once to compile before timing
pairwise_numba(X)
%timeit pairwise_numba(X)
Taken from https://jakevdp.github.io/blog/2015/08/14/out-of-core-dataframes-in-python/
In [44]:
!ls -lahL POIWorld.csv
In [45]:
from dask import dataframe as dd
columns = ["name", "amenity", "Longitude", "Latitude"]
data = dd.read_csv('POIWorld.csv', usecols=columns)
data
Out[45]:
In [46]:
with_name = data[data.name.notnull()]
In [47]:
is_starbucks = with_name.name.str.contains('[Ss]tarbucks')
is_dunkin = with_name.name.str.contains('[Dd]unkin')
starbucks = with_name[is_starbucks]
dunkin = with_name[is_dunkin]
In [48]:
from dask.diagnostics import ProgressBar
In [49]:
# Evaluate both name counts in a single dd.compute call, inside the
# ProgressBar context so progress is reported while it runs.
with ProgressBar():
    starbucks_count, dunkin_count = dd.compute(starbucks.name.count(), dunkin.name.count())
In [50]:
starbucks_count, dunkin_count
Out[50]:
In [51]:
locs = dd.compute(starbucks.Longitude,
starbucks.Latitude,
dunkin.Longitude,
dunkin.Latitude)
# extract arrays of values from the series:
lon_s, lat_s, lon_d, lat_d = [loc.values for loc in locs]
In [52]:
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
def draw_USA():
    """Open a 14x10 figure and return a Basemap covering the continental USA."""
    plt.figure(figsize=(14, 10))
    # All Basemap settings gathered in one place: projection and resolution,
    # the lon/lat corners of the map window, and the projection parameters.
    map_options = dict(
        projection='lcc', resolution='l', area_thresh=10000,
        llcrnrlon=-119, llcrnrlat=22,
        urcrnrlon=-64, urcrnrlat=49,
        lat_1=33, lat_2=45, lon_0=-95,
    )
    return Basemap(**map_options)
m = draw_USA()
# Draw map background
m.fillcontinents(color='white', lake_color='#eeeeee')
m.drawstates(color='lightgray')
m.drawcoastlines(color='lightgray')
m.drawcountries(color='lightgray')
m.drawmapboundary(fill_color='#eeeeee')
# Plot the values in Starbucks Green and Dunkin Donuts Orange
style = dict(s=5, marker='o', alpha=0.5, zorder=2)
m.scatter(lon_s, lat_s, latlon=True,
label="Starbucks", color='#00592D', **style)
m.scatter(lon_d, lat_d, latlon=True,
label="Dunkin' Donuts", color='#FC772A', **style)
plt.legend(loc='lower left', frameon=False);
In [55]:
from bokeh.io import output_notebook
from bokeh.resources import CDN
from bokeh.plotting import figure, show
output_notebook(resources=CDN)
from __future__ import print_function
from math import pi
from bokeh.browserlib import view
from bokeh.document import Document
from bokeh.embed import file_html
from bokeh.models.glyphs import Circle, Text
from bokeh.models import (
BasicTicker, ColumnDataSource, Grid, GridPlot, LinearAxis,
DataRange1d, PanTool, Plot, WheelZoomTool
)
from bokeh.resources import INLINE
from bokeh.sampledata.iris import flowers
from bokeh.plotting import show
colormap = {'setosa': 'red', 'versicolor': 'green', 'virginica': 'blue'}
flowers['color'] = flowers['species'].map(lambda x: colormap[x])
source = ColumnDataSource(
data=dict(
petal_length=flowers['petal_length'],
petal_width=flowers['petal_width'],
sepal_length=flowers['sepal_length'],
sepal_width=flowers['sepal_width'],
color=flowers['color']
)
)
text_source = ColumnDataSource(
data=dict(xcenter=[125], ycenter=[135])
)
xdr = DataRange1d()
ydr = DataRange1d()
def make_plot(xname, yname, xax=False, yax=False, text=None):
    """Build one scatter panel for the grid of iris plots.

    Parameters
    ----------
    xname, yname : str
        Names of the columns in the module-level ``source`` plotted on the
        x and y axes.
    xax, yax : bool
        When true, a ``LinearAxis`` is added below (``xax``) or to the left
        (``yax``) of the panel and the grid lines reuse that axis' ticker;
        otherwise the grid gets its own ``BasicTicker``.
    text : str or None
        Optional label drawn over the panel; underscores are replaced by
        spaces before it is rendered.

    Returns
    -------
    Plot
        The configured panel.

    Every panel shares the module-level ``xdr`` / ``ydr`` ranges, and the
    panel's circle renderer is appended to both of them.
    """
    plot = Plot(
        x_range=xdr, y_range=ydr, background_fill="#efe8e2",
        border_fill='white', title="", min_border=2, h_symmetry=False, v_symmetry=False,
        plot_width=150, plot_height=150)
    # Fill and outline colours are read from the "color" column of the data source.
    circle = Circle(x=xname, y=yname, fill_color="color", fill_alpha=0.2, size=4, line_color="color")
    r = plot.add_glyph(source, circle)
    # Register this panel's renderer on the shared data ranges.
    xdr.renderers.append(r)
    ydr.renderers.append(r)
    # Vertical grid lines (dimension 0); share the axis ticker when an axis is shown.
    xticker = BasicTicker()
    if xax:
        xaxis = LinearAxis()
        plot.add_layout(xaxis, 'below')
        xticker = xaxis.ticker
    plot.add_layout(Grid(dimension=0, ticker=xticker))
    # Horizontal grid lines (dimension 1), same scheme as above.
    yticker = BasicTicker()
    if yax:
        yaxis = LinearAxis()
        plot.add_layout(yaxis, 'left')
        yticker = yaxis.ticker
    plot.add_layout(Grid(dimension=1, ticker=yticker))
    plot.add_tools(PanTool(), WheelZoomTool())
    if text:
        text = " ".join(text.split('_'))
        # The label is positioned in screen units at the coordinates held in
        # text_source and rotated by pi/4.
        text = Text(
            x={'field':'xcenter', 'units':'screen'},
            y={'field':'ycenter', 'units':'screen'},
            text=[text], angle=pi/4, text_font_style="bold", text_baseline="top",
            text_color="#ffaaaa", text_alpha=0.7, text_align="center", text_font_size="28pt"
        )
        plot.add_glyph(text_source, text)
    return plot
# Columns of the grid follow xattrs; rows follow the same list reversed.
xattrs = ["petal_length", "petal_width", "sepal_width", "sepal_length"]
yattrs = list(reversed(xattrs))
plots = []
for y in yattrs:
    row = []
    for x in xattrs:
        # Show the x axis only on the last row and the y axis only on the
        # first column.
        xax = (y == yattrs[-1])
        yax = (x == xattrs[0])
        # Panels where both attributes coincide carry the attribute name as a label.
        text = x if (x==y) else None
        plot = make_plot(x, y, xax, yax, text)
        row.append(plot)
    plots.append(row)
grid = GridPlot(children=plots, title="iris_splom")
In [56]:
show(grid)