In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [2]:
df = pd.read_csv('~/Downloads/search-terms.csv')

In [3]:
df.head(1)


Out[3]:
Query Impressions Clicks Average Position CTR
0 youtube 5,094,496 1,602 2.2 0.03%

In [ ]:


In [4]:
df.head(1)


Out[4]:
Query Impressions Clicks Average Position CTR
0 youtube 5,094,496 1,602 2.2 0.03%

In [5]:
df.Impressions = df.Impressions.apply(lambda x: x.replace(',',''))

In [6]:
df.Clicks = df.Clicks.apply(lambda x: x.replace(',',''))

In [7]:
df.CTR = df.CTR.apply(lambda x: x.rstrip('%'))

In [8]:
df.Impressions = pd.to_numeric(df['Impressions'])

In [9]:
df.Clicks = pd.to_numeric(df.Clicks)

In [10]:
df = df[df['CTR'].str.contains('<')==False]

In [11]:
df.CTR = pd.to_numeric(df.CTR)

In [12]:
plt.figure()


Out[12]:
<matplotlib.figure.Figure at 0x119322b50>
<matplotlib.figure.Figure at 0x119322b50>

In [13]:
df.plot(x='Average Position',y='CTR',kind='scatter')


Out[13]:
<matplotlib.axes._subplots.AxesSubplot at 0x1193026d0>

In [14]:
df.describe()


Out[14]:
Impressions Clicks Average Position CTR
count 9.940000e+02 994.000000 994.000000 994.000000
mean 2.323076e+04 1212.240443 6.291247 3.340322
std 2.342735e+05 26566.821580 3.330072 6.128082
min 2.005000e+03 0.000000 1.000000 0.000000
25% 2.796250e+03 9.000000 4.200000 0.170000
50% 4.287000e+03 55.000000 6.100000 0.995000
75% 8.928000e+03 217.750000 7.800000 3.647500
max 5.094496e+06 831764.000000 40.000000 59.150000

In [15]:
df.CTR.quantile(q=.75)


Out[15]:
3.6475

In [16]:
sns.regplot(x='Average Position',
           y='CTR',
           data=df[df.Clicks >0],
           logx=True,
           x_bins=50)


Out[16]:
<matplotlib.axes._subplots.AxesSubplot at 0x11bcacb50>

In [17]:
df.sort_values('Clicks',ascending=False)


Out[17]:
Query Impressions Clicks Average Position CTR
1 (not set) 4798264 831764 7.5 17.33
3 atlas obscura 547883 97432 1.3 17.78
4 robert the doll 369796 18235 6.0 4.93
8 aokigahara 194569 13908 2.7 7.15
16 aokigahara forest 105206 9834 2.5 9.35
104 atlasobscura 18465 6495 1.3 35.17
19 snake island 87763 6088 3.0 6.94
10 are mermaids real 176362 5400 7.8 3.06
79 riot act 22749 4207 3.3 18.49
112 obscura day 2016 17551 4128 3.4 23.52
21 hollow earth 80330 4077 4.9 5.08
5 denver airport 354856 3990 6.1 1.12
36 gabagool 47516 3413 3.0 7.18
81 thor's well 22151 3083 2.4 13.92
55 abandoned amusement parks 29989 3021 2.4 10.07
164 black mailbox 12733 2673 1.9 20.99
299 obscura day 7362 2580 2.7 35.04
226 sourtoe cocktail 10028 2566 2.4 25.59
2 vernal equinox 2225404 2419 5.3 0.11
28 leap day william 64459 2362 3.6 3.66
189 swing at the end of the world 11363 2244 2.0 19.75
333 los feliz murder mansion 6516 2172 1.7 33.33
48 goats in trees 32383 2113 2.7 6.53
30 boaty mcboatface 57056 1921 6.9 3.37
54 jawn 30592 1857 3.3 6.07
289 the lost city of heracleion 7658 1847 1.4 24.12
355 michigan triangle 6170 1836 1.0 29.76
25 slab city 68872 1717 5.4 2.49
6 suicide forest 319256 1703 6.2 0.53
38 sentinelese 45590 1655 4.0 3.63
... ... ... ... ... ...
431 bird 4995 0 5.5 0.00
397 purpose 5529 0 7.7 0.00
389 want swede 5634 0 6.2 0.00
353 bread 6209 0 6.7 0.00
330 squid 6594 0 5.9 0.00
335 faze 6484 0 8.1 0.00
635 typhoid 3314 0 7.1 0.00
655 jcpennys 3215 0 4.1 0.00
931 roosevelt 2171 0 5.4 0.00
656 mummy 3209 0 7.2 0.00
922 apple store 2189 0 29.0 0.00
908 map of world 2228 0 6.0 0.00
886 william 2298 0 6.5 0.00
850 gesture 2416 0 6.9 0.00
838 poison 2442 0 6.8 0.00
834 lenin 2452 0 3.8 0.00
829 cd 2471 0 6.9 0.00
100 gas station 18601 0 28.0 0.00
798 olives 2596 0 5.7 0.00
794 blackwell's island 2609 0 5.7 0.00
791 apple atlas 2615 0 6.1 0.00
144 misguided 14105 0 6.0 0.00
149 map of the world 13949 0 6.3 0.00
724 crayon 2904 0 6.8 0.00
706 egg 2994 0 4.7 0.00
702 prohibition 3021 0 7.2 0.00
677 salute 3137 0 6.5 0.00
337 atari 6471 0 6.3 0.00
663 modernism 3188 0 7.7 0.00
669 toilet paper 3163 0 17.0 0.00

994 rows × 5 columns


In [18]:
df[df['Average Position']<2].drop([3,104],axis=0).sort_values('Clicks',
                                                             ascending=False)


Out[18]:
Query Impressions Clicks Average Position CTR
164 black mailbox 12733 2673 1.9 20.99
333 los feliz murder mansion 6516 2172 1.7 33.33
289 the lost city of heracleion 7658 1847 1.4 24.12
355 michigan triangle 6170 1836 1.0 29.76
380 the black mailbox 5798 1653 1.7 28.51
198 what is located on the fifth floor of the supr... 11069 1559 1.0 14.08
615 read the riot act 3414 1529 1.5 44.79
978 unusual things to do in new york 2054 1215 1.2 59.15
894 unique things to do in florida 2286 1160 1.0 50.74
651 lake michigan triangle 3220 1134 1.0 35.22
168 what is located on the fifth floor of the supr... 12596 1085 1.0 8.61
634 sword in the stone real 3314 953 1.6 28.76
688 marble caves chile 3100 804 1.9 25.94
934 diaphonized 2158 733 1.1 33.97
609 deepest hole on earth 3426 676 1.1 19.73
693 castle mont rouge 3085 623 1.8 20.19
743 minister's treehouse 2821 593 1.9 21.02
753 atlanta prison farm 2791 553 1.7 19.81
632 blood lake texas 3315 548 1.9 16.53
175 david bowie 12077 510 1.5 4.22
959 atlasobscura.com 2112 440 1.6 20.83
774 gates of hell nj 2691 355 1.4 13.19
906 valley of square trees 2244 340 1.8 15.15
151 star wars 13846 298 1.0 2.15
441 spring equinox 2016 4861 133 1.8 2.74
727 ufo 2892 103 1.0 3.56
867 three sisters springs 2363 95 1.1 4.02
786 supreme court justices 2639 66 1.5 2.50
695 gold 3053 60 1.3 1.97
755 5th floor supreme court 2785 59 1.0 2.12
585 5th floor of supreme court 3605 59 1.2 1.64
445 mothers day ideas 4805 49 1.0 1.02
657 harry potter 3208 21 1.5 0.65
971 budweiser 2079 7 1.2 0.34

In [19]:
df[(df['Average Position'] >=10) & (df['Average Position'] <12)].sort_values('Clicks',
                                                                          ascending=False)


Out[19]:
Query Impressions Clicks Average Position CTR
58 real mermaid 28436 585 10.0 2.06
26 turkish delight 68866 227 10.0 0.33
157 marie laveau 13056 100 11.0 0.77
785 mayan ball game 2640 91 11.0 3.45
371 devil's kettle 5972 90 10.0 1.51
421 kings park psychiatric center 5115 86 10.0 1.68
734 tsavo lions 2868 68 11.0 2.37
176 lord of the rings online 11907 66 11.0 0.55
998 belle starr 2011 58 10.0 2.88
828 moonbow 2474 53 11.0 2.14
484 andaman people 4403 44 10.0 1.00
966 chill pill 2104 39 10.0 1.85
557 starbucks logo history 3866 34 11.0 0.88
172 blood types 12463 30 10.0 0.24
319 richard feynman 6856 30 10.0 0.44
311 starbucks symbol 7077 27 10.0 0.38
465 starbucks logo meaning 4581 24 11.0 0.52
767 louis wain 2736 24 11.0 0.88
924 old spanish trail 2183 23 10.0 1.05
868 denver colorado airport 2360 19 11.0 0.81
277 coconut crab 8102 18 10.0 0.22
361 pirates of the caribbean ride 6084 16 10.0 0.26
604 hamilton pool 3452 15 10.0 0.43
831 the texas chainsaw massacre 2467 9 10.0 0.36
685 what does the starbucks logo mean 3116 7 11.0 0.22
914 moscow metro-2 2217 6 10.0 0.27
925 eggs benedict 2179 4 11.0 0.18
962 map of america 2111 3 11.0 0.14
607 den airport 3442 3 11.0 0.09
946 wonderland amusement park 2135 2 10.0 0.09

In [24]:
df['Trees'] = [1 if x.find('tree')==True else 0 for x in df.Query]

In [26]:
df.Trees.value_counts()


Out[26]:
0    994
Name: Trees, dtype: int64

In [27]:
df.describe()


Out[27]:
Impressions Clicks Average Position CTR Trees
count 9.940000e+02 994.000000 994.000000 994.000000 994.0
mean 2.323076e+04 1212.240443 6.291247 3.340322 0.0
std 2.342735e+05 26566.821580 3.330072 6.128082 0.0
min 2.005000e+03 0.000000 1.000000 0.000000 0.0
25% 2.796250e+03 9.000000 4.200000 0.170000 0.0
50% 4.287000e+03 55.000000 6.100000 0.995000 0.0
75% 8.928000e+03 217.750000 7.800000 3.647500 0.0
max 5.094496e+06 831764.000000 40.000000 59.150000 0.0

In [31]:
df[5:100].plot(y='CTR',x='Impressions',kind='scatter')


Out[31]:
<matplotlib.axes._subplots.AxesSubplot at 0x11d3a5f10>

In [42]:
df[(df.Impressions > 25000) & (df.CTR < 5.0)].sort_values('Average Position').to_csv('High Impressions, Low CTR',index=False)

In [ ]: