In [1]:
def closest(position, hotspots):
    """Return the index of the hotspot nearest to ``position``.

    ``position`` is an ``(x, y)`` pair and ``hotspots`` an iterable of
    ``(x, y)`` pairs. Distances are compared as squared Euclidean distances
    (no square root is needed to find the minimum). On ties the earliest
    hotspot wins; ``None`` is returned when ``hotspots`` is empty.
    """
    ref_x, ref_y = position
    nearest_index = None
    smallest = None
    for index, (hx, hy) in enumerate(hotspots):
        dx = hx - ref_x
        dy = hy - ref_y
        squared = dx**2 + dy**2
        # Strict "<" keeps the first of several equally close hotspots.
        if smallest is None or squared < smallest:
            smallest = squared
            nearest_index = index
    return nearest_index
In [2]:
import random
positions = [(random.random(), random.random()) for k in xrange(10000000)]
In [3]:
timeit closest((.5, .5), positions)
In [4]:
%pylab
In [10]:
positions = rand(10000000, 2)
In [11]:
positions.itemsize
Out[11]:
In [12]:
x, y = positions[:,0], positions[:,1]
In [23]:
timeit distances = (x-.5)**2 + (y-.5)**2
In [25]:
timeit ibest = distances.argmin()
In [26]:
x = array([1, 2 ,3])
In [27]:
x.shape
Out[27]:
In [28]:
x.dtype
Out[28]:
In [29]:
def mul1(n):
    """Build the n-by-n multiplication table with nested Python loops.

    Entry ``[j][i]`` equals ``(i + 1) * (j + 1)``. The nested list is
    converted to an array with ``array`` from the ``%pylab`` namespace.
    ``range`` is used instead of ``xrange`` so the function runs on both
    Python 2 and Python 3 kernels.
    """
    return array([[(i + 1) * (j + 1) for i in range(n)] for j in range(n)])
In [31]:
mul1(4)
Out[31]:
In [32]:
timeit mul1(100)
In [33]:
rand(2, 5)
Out[33]:
In [34]:
fromstring('1 2 5 10', dtype=int, sep=' ')
Out[34]:
In [37]:
cd fbdata
In [38]:
loadtxt('0.edges')
Out[38]:
Pandas: downloading the data
In [70]:
import urllib2, zipfile
In [66]:
url = 'http://ipython.rossant.net/'
filename = 'cities.zip'
address = 'http://ipython.rossant.net/cities.zip'
In [78]:
downloaded = urllib2.urlopen('http://ipython.rossant.net/cities.zip')
In [56]:
folder = 'citydata'
In [68]:
mkdir $folder
In [79]:
with open(filename, 'wb') as f:
f.write(downloaded.read())
In [80]:
# Use a name other than `zip`: the target of `with ... as` stays bound in the
# notebook namespace after the block and would otherwise hide the builtin zip().
with zipfile.ZipFile(filename) as archive:
    archive.extractall(folder)
In [82]:
bookmark citiesdata citydata/
In [83]:
import pandas as pd
In [248]:
filename = 'citydata/worldcitiespop.txt'
In [249]:
data = pd.read_csv(filename)
In [86]:
type(data)
Out[86]:
In [88]:
data.shape, data.keys()
Out[88]:
In [95]:
data.head()
Out[95]:
In [96]:
data.AccentCity
Out[96]:
In [97]:
data.AccentCity[30000]
Out[97]:
In [99]:
data[data.AccentCity=='New York']
Out[99]:
In [100]:
# Row label to inspect (presumably the New York row found by the filter above -- confirm).
ny = 2990572
# .loc is the explicit label-based indexer; .ix is deprecated and removed from pandas.
data.loc[ny]
Out[100]:
In [126]:
population = array(data.Population)
In [102]:
population.shape
Out[102]:
In [103]:
population[ny]
Out[103]:
In [125]:
sum(1 - isnan(population))
Out[125]:
In [127]:
# Keep only the known populations. Build the NaN mask explicitly instead of
# relying on `_` (the previous cell's output), which depends on run order.
x = population[~isnan(population)]
In [130]:
len(x), len(x)/float(len(population))
Views
In [131]:
x = rand(5); x
Out[131]:
In [132]:
y = x[::2]; y
Out[132]:
In [133]:
y[0] = 1; x
Out[133]:
In [135]:
y = x.copy()
z = array(x)
print y, z
In [136]:
ind = [0, 1, 0, 2]
In [137]:
x[ind]
Out[137]:
In [138]:
x = rand(6); x
Out[138]:
In [140]:
y = x.reshape((2,3)); y
Out[140]:
In [141]:
z = y.ravel(); z
Out[141]:
In [146]:
x = arange(3); x
Out[146]:
In [147]:
tile(x, (2, 1))
Out[147]:
In [149]:
repeat(x, 4)
Out[149]:
In [162]:
def mul2(n):
    """Return the n-by-n multiplication table, computed with broadcasting.

    A length-n vector 1..n is multiplied by the same values reshaped to an
    (n, 1) column; broadcasting expands the product to shape (n, n).
    """
    factors = arange(1, n + 1)
    return factors * factors.reshape(n, 1)
In [163]:
mul2(4)
Out[163]:
In [164]:
timeit mul2(100)
In [165]:
def mul3(n):
    """Return the n-by-n multiplication table using explicitly tiled operands.

    The values 1..n are laid out once as a column and once as a row; each is
    repeated with ``tile`` to a full (n, n) matrix, and the two matrices are
    multiplied element-wise.
    """
    values = arange(1, n + 1)
    by_row = tile(values.reshape((-1, 1)), (1, n))   # row r holds r+1 everywhere
    by_col = tile(values.reshape((1, -1)), (n, 1))   # column c holds c+1 everywhere
    return by_row * by_col
In [166]:
timeit mul3(100)
In [169]:
87.5/25.7 # 3.4 times faster than the book if we use broadcasting
Out[169]:
Split
In [172]:
x = arange(6)
In [173]:
split(x,2)
Out[173]:
In [174]:
split(x, [2, 5]) # starting indices of the subarrays except the 1st
Out[174]:
In [234]:
def locate(x, y, cities=None):
    """Return the city closest to the given location.

    Parameters
    ----------
    x, y : float
        Coordinates of the query point, compared against the 'Latitude' and
        'Longitude' columns respectively.
    cities : DataFrame, optional
        Table with 'Latitude', 'Longitude' and 'AccentCity' columns.
        Defaults to the notebook-level ``data`` frame.

    Returns
    -------
    The 'AccentCity' value of the row whose (Latitude, Longitude) pair has
    the smallest squared Euclidean distance to ``(x, y)``.
    """
    if cities is None:
        cities = data
    # .values is the long-standing accessor; .as_matrix() was removed from pandas.
    locations = cities[['Latitude', 'Longitude']].values
    d = locations - array([x, y])
    distances = d[:, 0]**2 + d[:, 1]**2
    # argmin() yields a row position, not an index label, so use .iloc.
    nearest = distances.argmin()
    return cities.AccentCity.iloc[nearest]
In [235]:
print locate(48.861, 2.3358)
In [182]:
a = [2, 4, 64, 93449, 5, 94249, 394, 221]; a
Out[182]:
In [183]:
array(a)[[0, 3]]
Out[183]:
In [221]:
b = rand(5,5); b
Out[221]:
In [233]:
b[[0, 2]]
Out[233]:
In [228]:
b.T[[0, 2]].T # column selection: transpose, take rows 0 and 2, transpose back
Out[228]:
In [237]:
data.Population.describe()
Out[237]:
In [1]:
import pandas as pd
In [2]:
adfile = 'bidtest-e.csv'
In [3]:
addata = pd.read_csv(adfile)
In [4]:
addata.describe()
Out[4]:
In [5]:
# Call head() -- without the parentheses the cell shows the bound method, not rows.
addata[addata.Campaign == 'Cevap Tv / Bid Test - 20'].head()
Out[5]:
In [6]:
addata[addata.Campaign == 'Cevap Tv / Bid Test - 20']
Out[6]:
In [36]:
addata.columns
Out[36]:
In [37]:
data20 = addata[addata.Campaign == 'Cevap Tv / Bid Test - 20']; data20
Out[37]:
In [38]:
perfdata20 = data20[['Campaign', 'Clicks', 'CPC', 'Spent', 'Page Likes']]; perfdata20
Out[38]:
10 Minutes to pandas
In [7]:
import pandas as pd
In [8]:
import numpy as np
In [10]:
s = pd.Series([1, 3, 5, np.nan, 6, 8]); s
Out[10]:
In [11]:
dates = pd.date_range('20130101', periods=6); dates
Out[11]:
In [12]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')); df
Out[12]:
In [13]:
df2 = pd.DataFrame({ 'A' : 1.,
'B' : pd.Timestamp('20130102'),
'C' : pd.Series(1, index=range(4), dtype='float32'),
'D' : np.array([3] * 4, dtype='int32'),
'E' : 'foo' }); df2
Out[13]:
In [14]:
df2.dtypes
Out[14]:
In [15]:
df.head()
Out[15]:
In [16]:
df.tail(3)
Out[16]:
In [17]:
df.index
Out[17]:
In [18]:
df.columns
Out[18]:
In [19]:
df.values
Out[19]:
In [20]:
df.describe()
Out[20]:
In [22]:
df.T
Out[22]:
In [23]:
df.sort_index(axis=1, ascending=False)
Out[23]:
In [26]:
# DataFrame.sort(columns=...) was removed from pandas; sort_values(by=...) replaces it.
df.sort_values(by='B')
Out[26]:
In [28]:
df['A']
Out[28]:
In [29]:
df[0:3]
Out[29]:
In [30]:
df['20130102':'20130104']
Out[30]:
In [31]:
df.loc[dates[0]]
Out[31]:
In [33]:
df[['A', 'B']]
Out[33]:
In [35]:
df.loc[:,['A','B']] == df[['A', 'B']]
Out[35]:
In [ ]: