In [1]:
def closest(position, hotspots):
    x0, y0 = position
    dbest, ibest = None, None
    for i, (x, y) in enumerate(hotspots):
        # Squared Euclidean
        d = (x - x0)**2 + (y - y0)**2
        if dbest is None or d < dbest:
            dbest, ibest = d, i
    return ibest

In [2]:
import random
positions = [(random.random(), random.random()) for k in xrange(10000000)]

In [3]:
timeit closest((.5, .5), positions)


1 loops, best of 3: 3.56 s per loop


In [4]:
%pylab


Welcome to pylab, a matplotlib-based Python environment [backend: MacOSX].
For more information, type 'help(pylab)'.
/Users/sukruhasdemir/anaconda/lib/python2.7/site-packages/matplotlib/__init__.py:1033: UserWarning:  This call to matplotlib.use() has no effect
because the the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)

In [10]:
positions = rand(10000000, 2)

In [11]:
positions.itemsize


Out[11]:
8

In [12]:
x, y = positions[:,0], positions[:,1]

In [23]:
timeit distances = (x-.5)**2 + (y-.5)**2


10 loops, best of 3: 145 ms per loop

In [25]:
timeit ibest = distances.argmin()


100 loops, best of 3: 7.1 ms per loop


In [26]:
x = array([1, 2 ,3])

In [27]:
x.shape


Out[27]:
(3,)

In [28]:
x.dtype


Out[28]:
dtype('int64')


In [29]:
def mul1(n):
    return array([[(i + 1) * (j + 1) for i in xrange(n)] for j in xrange(n)])

In [31]:
mul1(4)


Out[31]:
array([[ 1,  2,  3,  4],
       [ 2,  4,  6,  8],
       [ 3,  6,  9, 12],
       [ 4,  8, 12, 16]])

In [32]:
timeit mul1(100)


100 loops, best of 3: 2.26 ms per loop


In [33]:
rand(2, 5)


Out[33]:
array([[ 0.72738347,  0.96407565,  0.89656497,  0.18950147,  0.44205401],
       [ 0.73213653,  0.00154226,  0.67972036,  0.28839524,  0.59011744]])


In [34]:
fromstring('1 2 5 10', dtype=int, sep=' ')


Out[34]:
array([ 1,  2,  5, 10])

In [37]:
cd fbdata


(bookmark:fbdata) -> /Users/sukruhasdemir/Dropbox/python-study/ipython-book/data/facebook
/Users/sukruhasdemir/Dropbox/python-study/ipython-book/data/facebook

In [38]:
loadtxt('0.edges')


Out[38]:
array([[ 236.,  186.],
       [ 122.,  285.],
       [  24.,  346.],
       ..., 
       [  85.,   75.],
       [  98.,  332.],
       [ 291.,  339.]])

Pandas, dl data:


In [70]:
import urllib2, zipfile

In [66]:
url = 'http://ipython.rossant.net/'
filename = 'cities.zip'
address = 'http://ipython.rossant.net/cities.zip'

In [78]:
downloaded = urllib2.urlopen('http://ipython.rossant.net/cities.zip')

In [56]:
folder = 'citydata'

In [68]:
mkdir $folder


mkdir: citydata: File exists

In [79]:
with open(filename, 'wb') as f:
    f.write(downloaded.read())

In [80]:
with zipfile.ZipFile(filename) as zip:
    zip.extractall(folder)

In [82]:
bookmark citiesdata citydata/


In [83]:
import pandas as pd

In [248]:
filename = 'citydata/worldcitiespop.txt'

In [249]:
data = pd.read_csv(filename)

In [86]:
type(data)


Out[86]:
pandas.core.frame.DataFrame

In [88]:
data.shape, data.keys()


Out[88]:
((3173958, 7),
 Index([Country, City, AccentCity, Region, Population, Latitude, Longitude], dtype=object))

In [95]:
data.head()


Out[95]:
Country City AccentCity Region Population Latitude Longitude
0 ad aixas Aix�s 6 NaN 42.483333 1.466667
1 ad aixirivali Aixirivali 6 NaN 42.466667 1.500000
2 ad aixirivall Aixirivall 6 NaN 42.466667 1.500000
3 ad aixirvall Aixirvall 6 NaN 42.466667 1.500000
4 ad aixovall Aixovall 6 NaN 42.466667 1.483333

In [96]:
data.AccentCity


Out[96]:
0                  Aix�s
1             Aixirivali
2             Aixirivall
3              Aixirvall
4               Aixovall
5                Andorra
6       Andorra la Vella
7        Andorra-Vieille
8                Andorre
9     Andorre-la-Vieille
10       Andorre-Vieille
11             Ansalonga
12                 Any�s
13                 Arans
14               Arinsal
...
3173943                Zandi
3173944              Zanyika
3173945           Zemalapala
3173946            Zemandana
3173947              Zemanda
3173948           Zibalonkwe
3173949          Zibunkululu
3173950                 Ziga
3173951    Zikamanas Village
3173952             Zimbabwe
3173953           Zimre Park
3173954          Ziyakamanas
3173955           Zizalisari
3173956              Zuzumba
3173957           Zvishavane
Name: AccentCity, Length: 3173958, dtype: object

In [97]:
data.AccentCity[30000]


Out[97]:
'Howasiyan'

In [99]:
data[data.AccentCity=='New York']


Out[99]:
Country City AccentCity Region Population Latitude Longitude
998166 gb new york New York H7 NaN 53.083333 -0.150000
1087431 hn new york New York 16 NaN 14.800000 -88.366667
1525856 jm new york New York 9 NaN 18.250000 -77.183333
1525857 jm new york New York 10 NaN 18.116667 -77.133333
1893972 mx new york New York 5 NaN 16.266667 -93.233333
2929399 us new york New York FL NaN 30.838333 -87.200833
2946036 us new york New York IA NaN 40.851667 -93.259722
2951120 us new york New York KY NaN 36.988889 -88.952500
2977571 us new york New York MO NaN 39.685278 -93.926667
2986561 us new york New York NM NaN 35.058611 -107.526667
2990572 us new york New York NY 8107916 40.714167 -74.006389
3029084 us new york New York TX NaN 32.167778 -95.668889

In [100]:
ny = 2990572
data.ix[ny]


Out[100]:
Country             us
City          new york
AccentCity    New York
Region              NY
Population     8107916
Latitude      40.71417
Longitude    -74.00639
Name: 2990572, dtype: object


In [126]:
population = array(data.Population)

In [102]:
population.shape


Out[102]:
(3173958,)

In [103]:
population[ny]


Out[103]:
8107916.0

In [125]:
sum(1 - isnan(population))


Out[125]:
47980

In [127]:
x = population[~_]

In [130]:
len(x), len(x)/float(len(population))


---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-130-491a9bd444aa> in <module>()
----> 1 len(x), len(x)/float(len(population))

TypeError: object of type 'numpy.float64' has no len()

Views


In [131]:
x = rand(5); x


Out[131]:
array([ 0.34893065,  0.23750285,  0.04166815,  0.31848057,  0.77538395])

In [132]:
y = x[::2]; y


Out[132]:
array([ 0.34893065,  0.04166815,  0.77538395])

In [133]:
y[0] = 1; x


Out[133]:
array([ 1.        ,  0.23750285,  0.04166815,  0.31848057,  0.77538395])

In [135]:
y = x.copy()
z = array(x)
print y, z


[ 1.          0.23750285  0.04166815  0.31848057  0.77538395] [ 1.          0.23750285  0.04166815  0.31848057  0.77538395]

In [136]:
ind = [0, 1, 0, 2]

In [137]:
x[ind]


Out[137]:
array([ 1.        ,  0.23750285,  1.        ,  0.04166815])


In [138]:
x = rand(6); x


Out[138]:
array([ 0.48047438,  0.33705905,  0.35312031,  0.70541943,  0.47897635,
        0.51386445])

In [140]:
y = x.reshape((2,3)); y


Out[140]:
array([[ 0.48047438,  0.33705905,  0.35312031],
       [ 0.70541943,  0.47897635,  0.51386445]])

In [141]:
z = y.ravel(); z


Out[141]:
array([ 0.48047438,  0.33705905,  0.35312031,  0.70541943,  0.47897635,
        0.51386445])


In [146]:
x = arange(3); x


Out[146]:
array([0, 1, 2])

In [147]:
tile(x, (2, 1))


Out[147]:
array([[0, 1, 2],
       [0, 1, 2]])

In [149]:
repeat(x, 4)


Out[149]:
array([0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2])

In [162]:
def mul2(n): # using broadcasting
    row = arange(1,n+1)
    col = arange(1,n+1).reshape(n,1)
    return row * col

In [163]:
mul2(4)


Out[163]:
array([[ 1,  2,  3,  4],
       [ 2,  4,  6,  8],
       [ 3,  6,  9, 12],
       [ 4,  8, 12, 16]])

In [164]:
timeit mul2(100)


10000 loops, best of 3: 25.7 us per loop

In [165]:
def mul3(n):
    M = arange(1, n+1).reshape((-1, 1))
    M = tile(M, (1, n))
    N = arange(1, n+1).reshape((1, -1))
    N = tile(N, (n, 1))
    return M * N

In [166]:
timeit mul3(100)


10000 loops, best of 3: 87.5 us per loop

In [169]:
87.5/25.7 # 3.4 times faster than the book if we use broadcasting


Out[169]:
3.404669260700389

Split


In [172]:
x = arange(6)

In [173]:
split(x,2)


Out[173]:
[array([0, 1, 2]), array([3, 4, 5])]

In [174]:
split(x, [2, 5]) # starting indices of the subarrays except the 1st


Out[174]:
[array([0, 1]), array([2, 3, 4]), array([5])]


In [234]:
def locate(x, y):
    """ Return the city closest to the given location. """
    locations = data[['Latitude', 'Longitude']].as_matrix() # !!!
    d = locations - array([x, y])
    distances = d[:,0]**2 + d[:,1]**2
    closest = distances.argmin()
    return data.AccentCity[closest]

In [235]:
print locate(48.861, 2.3358)


Paris


In [182]:
a = [2, 4, 64, 93449, 5, 94249, 394, 221]; a


Out[182]:
[2, 4, 64, 93449, 5, 94249, 394, 221]

In [183]:
array(a)[[0, 3]]


Out[183]:
array([    2, 93449])

In [221]:
b = rand(5,5); b


Out[221]:
array([[ 0.27576495,  0.41783964,  0.37600412,  0.07658866,  0.8082994 ],
       [ 0.53723549,  0.94009937,  0.38552889,  0.95726354,  0.27087212],
       [ 0.50706751,  0.37160832,  0.16437208,  0.08668012,  0.79990792],
       [ 0.60621513,  0.54430786,  0.0280515 ,  0.33886399,  0.97196962],
       [ 0.32431163,  0.24808972,  0.98717261,  0.6117578 ,  0.15646652]])

In [233]:
b[[0, 2]]


Out[233]:
array([[ 0.27576495,  0.41783964,  0.37600412,  0.07658866,  0.8082994 ],
       [ 0.50706751,  0.37160832,  0.16437208,  0.08668012,  0.79990792]])

In [228]:
b.T[[0, 2]].T # !!!


Out[228]:
array([[ 0.27576495,  0.37600412],
       [ 0.53723549,  0.38552889],
       [ 0.50706751,  0.16437208],
       [ 0.60621513,  0.0280515 ],
       [ 0.32431163,  0.98717261]])


In [237]:
data.Population.describe()


Out[237]:
count       47980.000000
mean        47719.570634
std        302888.715626
min             7.000000
25%          3732.000000
50%         10779.000000
75%         27990.500000
max      31480498.000000
dtype: float64



In [1]:
import pandas as pd

In [2]:
adfile = 'bidtest-e.csv'

In [3]:
addata = pd.read_csv(adfile)

In [4]:
addata.describe()


Out[4]:
&ltclass 'pandas.core.frame.DataFrame'>
Index: 8 entries, count to max
Data columns (total 16 columns):
Campaign ID           8  non-null values
Impressions           8  non-null values
Social Impressions    8  non-null values
Clicks                8  non-null values
Social Clicks         8  non-null values
CPC                   8  non-null values
CPM                   8  non-null values
Spent                 8  non-null values
Reach                 8  non-null values
Frequency             8  non-null values
Social Reach          8  non-null values
Actions               8  non-null values
Page Likes            8  non-null values
App Installs          1  non-null values
Event Responses       1  non-null values
Unique Clicks         8  non-null values
dtypes: float64(16)

In [5]:
addata[addata.Campaign == 'Cevap Tv / Bid Test - 20'].head


Out[5]:
<bound method DataFrame.head of          Date                  Campaign    Campaign ID  Impressions  \
0   08/19/2013  Cevap Tv / Bid Test - 20  6012463270255        26671   
11  08/20/2013  Cevap Tv / Bid Test - 20  6012463270255        12357   
22  08/21/2013  Cevap Tv / Bid Test - 20  6012463270255         6471   
33  08/22/2013  Cevap Tv / Bid Test - 20  6012463270255        39990   
44  08/23/2013  Cevap Tv / Bid Test - 20  6012463270255         4096   

    Social Impressions Social %  Clicks  Social Clicks Click-Through Rate  \
0                12150   45.56%     194             63             0.727%   
11                5736   46.42%     104             29             0.842%   
22                3713   57.38%      71             32             1.097%   
33               21430   53.59%     255            110             0.638%   
44                2882   70.36%      41             23             1.001%   

   Social CTR   CPC   CPM  Spent  Reach  Frequency  Social Reach  Actions  \
0      0.519%  0.10  0.75  20.00  25034        1.1         10992       33   
11     0.506%  0.11  0.95  11.80  11576        1.1          5116       13   
22     0.862%  0.10  1.08   6.96   6138        1.1          3345       15   
33     0.513%  0.12  0.75  30.00  33829        1.2         15983       59   
44     0.798%  0.08  0.84   3.43   3785        1.1          2519        7   

    Page Likes  App Installs  Event Responses  Unique Clicks Unique CTR  
0           16           NaN              NaN            194     0.775%  
11           9           NaN              NaN            103     0.890%  
22          10           NaN              NaN             71     1.157%  
33          42           NaN              NaN            252     0.745%  
44           6           NaN              NaN             41     1.083%  >

In [6]:
addata[addata.Campaign == 'Cevap Tv / Bid Test - 20']


Out[6]:
&ltclass 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 44
Data columns (total 22 columns):
Date                 5  non-null values
Campaign              5  non-null values
Campaign ID           5  non-null values
Impressions           5  non-null values
Social Impressions    5  non-null values
Social %              5  non-null values
Clicks                5  non-null values
Social Clicks         5  non-null values
Click-Through Rate    5  non-null values
Social CTR            5  non-null values
CPC                   5  non-null values
CPM                   5  non-null values
Spent                 5  non-null values
Reach                 5  non-null values
Frequency             5  non-null values
Social Reach          5  non-null values
Actions               5  non-null values
Page Likes            5  non-null values
App Installs          0  non-null values
Event Responses       0  non-null values
Unique Clicks         5  non-null values
Unique CTR            5  non-null values
dtypes: float64(6), int64(10), object(6)

In [36]:
addata.columns


Out[36]:
Index([Date, Campaign, Campaign ID, Impressions, Social Impressions, Social %, Clicks, Social Clicks, Click-Through Rate, Social CTR, CPC, CPM, Spent, Reach, Frequency, Social Reach, Actions, Page Likes, App Installs, Event Responses, Unique Clicks, Unique CTR], dtype=object)

In [37]:
data20 = addata[addata.Campaign == 'Cevap Tv / Bid Test - 20']; data20


Out[37]:
&ltclass 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 0 to 44
Data columns (total 22 columns):
Date                 5  non-null values
Campaign              5  non-null values
Campaign ID           5  non-null values
Impressions           5  non-null values
Social Impressions    5  non-null values
Social %              5  non-null values
Clicks                5  non-null values
Social Clicks         5  non-null values
Click-Through Rate    5  non-null values
Social CTR            5  non-null values
CPC                   5  non-null values
CPM                   5  non-null values
Spent                 5  non-null values
Reach                 5  non-null values
Frequency             5  non-null values
Social Reach          5  non-null values
Actions               5  non-null values
Page Likes            5  non-null values
App Installs          0  non-null values
Event Responses       0  non-null values
Unique Clicks         5  non-null values
Unique CTR            5  non-null values
dtypes: float64(6), int64(10), object(6)

In [38]:
perfdata20 = data20[['Campaign', 'Clicks', 'CPC', 'Spent', 'Page Likes']]; perfdata20


Out[38]:
Campaign Clicks CPC Spent Page Likes
0 Cevap Tv / Bid Test - 20 194 0.10 20.00 16
11 Cevap Tv / Bid Test - 20 104 0.11 11.80 9
22 Cevap Tv / Bid Test - 20 71 0.10 6.96 10
33 Cevap Tv / Bid Test - 20 255 0.12 30.00 42
44 Cevap Tv / Bid Test - 20 41 0.08 3.43 6

10m P


In [7]:
import pandas as pd

In [8]:
import numpy as np

In [10]:
s = pd.Series([1, 3, 5, np.nan, 6, 8]); s


Out[10]:
0     1
1     3
2     5
3   NaN
4     6
5     8
dtype: float64


In [11]:
dates = pd.date_range('20130101', periods=6); dates


Out[11]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2013-01-01 00:00:00, ..., 2013-01-06 00:00:00]
Length: 6, Freq: D, Timezone: None

In [12]:
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')); df


Out[12]:
A B C D
2013-01-01 0.261933 1.333238 0.261987 -0.767643
2013-01-02 -1.298499 1.237161 0.288860 -0.491318
2013-01-03 -0.245996 -0.089789 -0.458865 0.704302
2013-01-04 1.424923 -1.434915 2.187872 1.131349
2013-01-05 -0.928424 -0.365961 -1.067766 -1.275187
2013-01-06 -0.696820 0.538560 -1.043366 -0.047781


In [13]:
df2 = pd.DataFrame({ 'A' : 1.,
                     'B' : pd.Timestamp('20130102'),
                     'C' : pd.Series(1, index=range(4), dtype='float32'),
                     'D' : np.array([3] * 4, dtype='int32'),
                     'E' : 'foo' }); df2


Out[13]:
A B C D E
0 1 2013-01-02 00:00:00 1 3 foo
1 1 2013-01-02 00:00:00 1 3 foo
2 1 2013-01-02 00:00:00 1 3 foo
3 1 2013-01-02 00:00:00 1 3 foo

In [14]:
df2.dtypes


Out[14]:
A           float64
B    datetime64[ns]
C           float32
D             int32
E            object
dtype: object


In [15]:
df.head()


Out[15]:
A B C D
2013-01-01 0.261933 1.333238 0.261987 -0.767643
2013-01-02 -1.298499 1.237161 0.288860 -0.491318
2013-01-03 -0.245996 -0.089789 -0.458865 0.704302
2013-01-04 1.424923 -1.434915 2.187872 1.131349
2013-01-05 -0.928424 -0.365961 -1.067766 -1.275187

In [16]:
df.tail(3)


Out[16]:
A B C D
2013-01-04 1.424923 -1.434915 2.187872 1.131349
2013-01-05 -0.928424 -0.365961 -1.067766 -1.275187
2013-01-06 -0.696820 0.538560 -1.043366 -0.047781

In [17]:
df.index


Out[17]:
<class 'pandas.tseries.index.DatetimeIndex'>
[2013-01-01 00:00:00, ..., 2013-01-06 00:00:00]
Length: 6, Freq: D, Timezone: None

In [18]:
df.columns


Out[18]:
Index([A, B, C, D], dtype=object)

In [19]:
df.values


Out[19]:
array([[ 0.26193297,  1.3332379 ,  0.26198715, -0.76764333],
       [-1.29849863,  1.23716062,  0.28885952, -0.49131795],
       [-0.24599623, -0.08978884, -0.45886478,  0.70430215],
       [ 1.42492316, -1.43491546,  2.18787224,  1.13134878],
       [-0.92842425, -0.36596123, -1.06776624, -1.27518688],
       [-0.69681987,  0.5385603 , -1.04336575, -0.04778135]])

In [20]:
df.describe()


Out[20]:
A B C D
count 6.000000 6.000000 6.000000 6.000000
mean -0.247147 0.203049 0.028120 -0.124380
std 0.982514 1.053730 1.214365 0.910189
min -1.298499 -1.434915 -1.067766 -1.275187
25% -0.870523 -0.296918 -0.897241 -0.698562
50% -0.471408 0.224386 -0.098439 -0.269550
75% 0.134951 1.062511 0.282141 0.516281
max 1.424923 1.333238 2.187872 1.131349

In [22]:
df.T


Out[22]:
2013-01-01 00:00:00 2013-01-02 00:00:00 2013-01-03 00:00:00 2013-01-04 00:00:00 2013-01-05 00:00:00 2013-01-06 00:00:00
A 0.261933 -1.298499 -0.245996 1.424923 -0.928424 -0.696820
B 1.333238 1.237161 -0.089789 -1.434915 -0.365961 0.538560
C 0.261987 0.288860 -0.458865 2.187872 -1.067766 -1.043366
D -0.767643 -0.491318 0.704302 1.131349 -1.275187 -0.047781

In [23]:
df.sort_index(axis=1, ascending=False)


Out[23]:
D C B A
2013-01-01 -0.767643 0.261987 1.333238 0.261933
2013-01-02 -0.491318 0.288860 1.237161 -1.298499
2013-01-03 0.704302 -0.458865 -0.089789 -0.245996
2013-01-04 1.131349 2.187872 -1.434915 1.424923
2013-01-05 -1.275187 -1.067766 -0.365961 -0.928424
2013-01-06 -0.047781 -1.043366 0.538560 -0.696820

In [26]:
df.sort(columns='B')


Out[26]:
A B C D
2013-01-04 1.424923 -1.434915 2.187872 1.131349
2013-01-05 -0.928424 -0.365961 -1.067766 -1.275187
2013-01-03 -0.245996 -0.089789 -0.458865 0.704302
2013-01-06 -0.696820 0.538560 -1.043366 -0.047781
2013-01-02 -1.298499 1.237161 0.288860 -0.491318
2013-01-01 0.261933 1.333238 0.261987 -0.767643


In [28]:
df['A']


Out[28]:
2013-01-01    0.261933
2013-01-02   -1.298499
2013-01-03   -0.245996
2013-01-04    1.424923
2013-01-05   -0.928424
2013-01-06   -0.696820
Freq: D, Name: A, dtype: float64

In [29]:
df[0:3]


Out[29]:
A B C D
2013-01-01 0.261933 1.333238 0.261987 -0.767643
2013-01-02 -1.298499 1.237161 0.288860 -0.491318
2013-01-03 -0.245996 -0.089789 -0.458865 0.704302

In [30]:
df['20130102':'20130104']


Out[30]:
A B C D
2013-01-02 -1.298499 1.237161 0.288860 -0.491318
2013-01-03 -0.245996 -0.089789 -0.458865 0.704302
2013-01-04 1.424923 -1.434915 2.187872 1.131349


In [31]:
df.loc[dates[0]]


Out[31]:
A    0.261933
B    1.333238
C    0.261987
D   -0.767643
Name: 2013-01-01 00:00:00, dtype: float64

In [33]:
df[['A', 'B']]


Out[33]:
A B
2013-01-01 0.261933 1.333238
2013-01-02 -1.298499 1.237161
2013-01-03 -0.245996 -0.089789
2013-01-04 1.424923 -1.434915
2013-01-05 -0.928424 -0.365961
2013-01-06 -0.696820 0.538560

In [35]:
df.loc[:,['A','B']] == df[['A', 'B']]


Out[35]:
A B
2013-01-01 True True
2013-01-02 True True
2013-01-03 True True
2013-01-04 True True
2013-01-05 True True
2013-01-06 True True

In [ ]: