In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

%matplotlib inline

In [2]:
df = pd.read_csv('~/Downloads/all-content.csv')

In [3]:



Out[3]:
Publisher Title Url Published Page Views Uniques Total Engaged Time Avg Engaged Time Social Actions Social Referrals ... Sharethrough Paid Desktop Referrals Amplify Paid Referrals Amplify Paid Mobile Referrals Amplify Paid Tablet Referrals Amplify Paid Desktop Referrals Gravity Paid Referrals Gravity Paid Mobile Referrals Gravity Paid Tablet Referrals Gravity Paid Desktop Referrals Nativo Paid Referrals
0 Atlas Obscura The Famous Photo of Chernobyl's Most Dange... http://www.atlasobscura.com/articles/the-famou... 2016-01-25T00:30:00 678116 593158.0 155172300000 228.828549 26930 476363 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 115 columns


In [3]:
df.Title = df.Title.apply(lambda x: x.replace(''',"'"))

In [4]:
df.head(1)


Out[4]:
Publisher Title Url Published Page Views Uniques Total Engaged Time Avg Engaged Time Social Actions Social Referrals ... Sharethrough Paid Desktop Referrals Amplify Paid Referrals Amplify Paid Mobile Referrals Amplify Paid Tablet Referrals Amplify Paid Desktop Referrals Gravity Paid Referrals Gravity Paid Mobile Referrals Gravity Paid Tablet Referrals Gravity Paid Desktop Referrals Nativo Paid Referrals
0 Atlas Obscura The Famous Photo of Chernobyl's Most Dangerous... http://www.atlasobscura.com/articles/the-famou... 2016-01-25T00:30:00 678116 593158.0 155172300000 228.828549 26930 476363 ... 0 0 0 0 0 0 0 0 0 0

1 rows × 115 columns


In [5]:
df.Title = df.Title.apply(lambda x: x.replace('&',"&"))

In [6]:
df.Title = df.Title.apply(lambda x: x.replace('"',"1"))

In [7]:
df['len_hl'] = df.Title.apply(lambda x: len(x))

In [8]:
articles = df[df.Url.str.contains('/articles/')]

In [9]:
places = df[df.Url.str.contains('/places/')]

In [10]:
articles.len_hl.describe()


Out[10]:
count    2084.000000
mean       57.626679
std        14.553881
min        15.000000
25%        47.000000
50%        58.000000
75%        68.000000
max       116.000000
Name: len_hl, dtype: float64

In [11]:
places.len_hl.describe()


Out[11]:
count    1540.000000
mean       21.424675
std         9.157430
min         4.000000
25%        15.000000
50%        20.000000
75%        26.000000
max        63.000000
Name: len_hl, dtype: float64

In [13]:
articles.hist(column='len_hl',bins = 5)


Out[13]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x118fe62d0>]], dtype=object)

In [14]:
places.hist(column='len_hl',bins = 5)


Out[14]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11bacd310>]], dtype=object)

In [26]:
len('Pneumatic Tubes at Stanford Hospital ')


Out[26]:
37

In [27]:
37*2


Out[27]:
74

In [16]:
place_deks = pd.read_csv('~/Downloads/place_title_subtitles.csv')

In [17]:
place_deks.head(2)


Out[17]:
slug title subtitle
0 seven-noses-of-soho Seven Noses of Soho Hidden in plain sight, these nose sculptures h...
1 the-carousel-in-prospect-park The Carousel in Prospect Park You can still ride this 100-year-old work of a...

In [20]:
place_deks['length'] = place_deks.subtitle.apply(lambda x: len(str(x)))

In [21]:
place_deks.head(2)


Out[21]:
slug title subtitle length
0 seven-noses-of-soho Seven Noses of Soho Hidden in plain sight, these nose sculptures h... 104
1 the-carousel-in-prospect-park The Carousel in Prospect Park You can still ride this 100-year-old work of a... 83

In [22]:
place_deks.length.describe()


Out[22]:
count    9346.000000
mean       81.061952
std        26.809901
min         3.000000
25%        61.000000
50%        81.000000
75%       100.000000
max       187.000000
Name: length, dtype: float64

In [23]:
place_deks.hist(column='length',bins=100)


Out[23]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11c8bdb90>]], dtype=object)

In [24]:
article_deks = pd.read_csv('~/Downloads/article_title_subtitles.csv')

In [25]:
article_deks['length'] = article_deks.subtitle.apply(lambda x: len(str(x)))

In [26]:
article_deks.length.describe()


Out[26]:
count    3966.00000
mean       16.20121
std        27.52576
min         3.00000
25%         3.00000
50%         3.00000
75%         3.00000
max       161.00000
Name: length, dtype: float64

In [27]:
article_deks.hist(column='length',bins=100)


Out[27]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11c8bdc10>]], dtype=object)

In [30]:
article_deks[article_deks.length != 3].hist(column='length',bins=100)


Out[30]:
array([[<matplotlib.axes._subplots.AxesSubplot object at 0x11bd75fd0>]], dtype=object)

In [31]:
article_deks[article_deks.length != 3].describe()


Out[31]:
length
count 896.000000
mean 61.433036
std 26.657228
min 5.000000
25% 42.000000
50% 61.000000
75% 80.000000
max 161.000000

In [33]:
article_headline_list = articles[(articles.len_hl >38) & (articles.len_hl < 80)]

In [34]:
place_headline_list = places[(places.len_hl >10) & (places.len_hl < 40)]

In [35]:
article_dek_list = article_deks[(article_deks.length >38) & (article_deks.length < 90)]
place_dek_list = place_deks[(place_deks.length > 45) & (place_deks.length < 125)]

In [37]:
article_headline_list = article_headline_list.Title

In [39]:
import random

In [40]:
article_headline_list = random.sample(article_headline_list,20)

In [41]:
article_headline_list


Out[41]:
['How Power Naps Are Related to Near-Death Experiences',
 'Fleeting Wonders: The Reliably Miraculous Blood of St. Januarius',
 'Atlas Obscura Gift Guide: 8 Presents for the Exterior Decorator',
 'Before WWII, Americans in China Had Their Own Special Expat Courts',
 "Mapping Hollywood's Most Infamous Horror Houses",
 "A Round of Appaws for America's Best Pet Care Puns",
 'How the Birth Certificate Became a Ticket to Society',
 'The Peculiar History of Celebrity Dolls',
 "Found: The Universal Expression for 'Yeeeeah\x1aNo'",
 'Why There\x1as No Such Thing As A Brooklyn Accent',
 'The Self-Sacrificing Japanese Pilgrims Who Chose to be Swallowed by the Sea',
 "Fleeting Wonders: The Destruction of Palmyra's Arch of Triumph",
 'Extreme Bagpiping Situations, From Antarctica to the Beaches of D-Day',
 'Your New Favorite Honey Is Made Out Of Bug Poop and Bee Vomit',
 " The World's Longest-Running Experiment is Buried in a Secret Spot in Michigan",
 'The Moment Scotland Shut Down Its Last Coal-Fired Energy Plant',
 "Pizza Doesn't Really Stretch That Way, And Other Devious Food Stylist Tricks",
 "Shocking Scenes From Benjamin Franklin's Experimental Electricity Parties",
 'Places You Can No Longer Go: UpStairs Lounge',
 'Where Should Robots Go When They Retire?']

In [43]:
place_headline_list = place_headline_list.Title
place_headline_list = random.sample(place_headline_list,20)

In [44]:
place_headline_list


Out[44]:
['Buzz-A-Rama',
 'The Stromatolites of Hamelin Pool',
 'The Churchill War Rooms',
 'Llandwyn Island',
 'Post Office Bay',
 'The Daddy Long Legs Seashore Railway',
 'Bay Islands Underwater Museum',
 'Shwe Indein Pagoda',
 'Salse di Nirano',
 'Charyn Canyon',
 'Folly Ruins',
 'Lake Labynkyr',
 'Museums at Old City Cemetery',
 'The Heart of Voltaire',
 'Kekova Island Sunken Ruins',
 'Tourlitis Lighthouse',
 'Mills of Folon and Picon',
 'Haapsalu Airfield',
 'The Enchanted Forest of Orrius',
 'Ch\x1ateau de Cheverny']

In [45]:
place_headline_list = pd.Series(place_headline_list)

In [46]:
place_headline_list


Out[46]:
0                              Buzz-A-Rama
1        The Stromatolites of Hamelin Pool
2                  The Churchill War Rooms
3                          Llandwyn Island
4                          Post Office Bay
5     The Daddy Long Legs Seashore Railway
6            Bay Islands Underwater Museum
7                       Shwe Indein Pagoda
8                          Salse di Nirano
9                            Charyn Canyon
10                             Folly Ruins
11                           Lake Labynkyr
12            Museums at Old City Cemetery
13                   The Heart of Voltaire
14              Kekova Island Sunken Ruins
15                    Tourlitis Lighthouse
16                Mills of Folon and Picon
17                       Haapsalu Airfield
18          The Enchanted Forest of Orrius
19                     Chteau de Cheverny
dtype: object

In [53]:
place_dek_list = place_dek_list.subtitle

In [54]:
article_dek_list = article_dek_list.subtitle

In [55]:
place_dek_list = random.sample(place_dek_list,20)
article_dek_list = random.sample(article_dek_list,20)

In [56]:
place_dek_list


Out[56]:
['1500-year-old ghost settlement slowly faded away',
 'Riding on a pair of architectural dragons, this Thailand temple holds murals full of lurid imagery ',
 'The largest collection of death masks in Ukraine',
 'Golden cobblestones remember a time when this small alley was used to avoid having to give a Nazi salute',
 'Three Columns of Volcanic Rock Shoot Out of the Ocean',
 'This little replica of Lady Liberty provides a bit of NYC in a Japanese city ',
 'A visionary precursor to the Internet made of index cards',
 'An ominous concrete cube cuts a dystopian silhouette atop this California mountain.  ',
 "One of the world's oldest trees comes with a Welsh legend intertwining it with a spirit prophesying death",
 'A former insane asylum despised by Nelly Bly and Charles Dickens is now fashionable condos',
 'Statue of Edgar Hernandez, the first victim of swine flu',
 "This otherwise innocuous bodega was once the headquarters of the most feared assassin's guild in American history",
 'This small Irish village suffered the greatest loss of life when the Titanic sank',
 "California's most comprehensive horticultural library",
 'Hidden behind these mundane residential facades are surreal artistic landscapes. ',
 'An overlooked and unassuming engineering marvel ',
 'Curios, scientific antiques, oddities for sale.',
 'Formerly abandoned mental institute now back in business',
 "In this case, 'zero' marks the spot where the iconic Old Spanish Trail ends",
 'Dedicated to improving the plight of the rare scaly anteater']

In [57]:
article_dek_list


Out[57]:
['The Jller uses a method that is just as much art as it is science.',
 'Scientists solved a case of missing identity: who were these 430,000-year-old hominins?',
 'The International Criminal Court hears its first property case.',
 '"He was a little cutie," observers report.',
 'For the next two months the nation will be conserving energy with three-day weekends.',
 'Itinerant artists made almost 5,000 views of 2,400 different places. ',
 'World War II, in particular, allowed for large-scale, centralized blood collection. ',
 'The country has steadily been eliminating remnants of its Cold War past. ',
 'From Pete Townshend to Caleb Followill.',
 'The plants are speaking. Time to read what they have to say.',
 'The mysterious Cold War base is now an eerie, crumbling ruin. ',
 'Watch 4,000 pounds of rainbow trout careen down a slide into a lake.',
 'There is a scientific explanation for this.',
 'Back in the day, women with bare heads may as well have been naked.',
 'The scrolls were recovered from the only library to survive ancient times. ',
 'Feast your eyes on a stunning, hand-illustrated star chart from 1670.',
 'These eight-legged grumps may be trying to clock each other with shells.',
 'The video is a glimpse of the ethereal world inhabited by these ocean behemoths.',
 'A lot of damage has already been done. ',
 "Help us on our quest to locate America's most ancient computer."]

In [ ]: