In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
In [2]:
# Read the full content export into `df`; later cells rely on its Title and Url columns.
df = pd.read_csv(filepath_or_buffer='~/Downloads/all-content.csv')
In [3]:
Out[3]:
In [3]:
# Decode HTML-escaped apostrophes in the titles. The search text has to be the
# entity itself: a bare ''' would open a triple-quoted string and the cell would
# not even parse. The pattern covers the decimal, hexadecimal and named forms
# (&#39; / &#039; / &#x27; / &apos;). `.str.replace` also leaves missing titles
# untouched instead of raising on them.
df.Title = df.Title.str.replace(r"&(?:#0*39|#[xX]0*27|apos);", "'", regex=True)
In [4]:
# Peek at the first row of the frame.
df.head(n=1)
Out[4]:
In [5]:
# Decode HTML-escaped ampersands in the titles. Replacing '&' with '&' does
# nothing, so the search text must be the entity '&amp;'. regex=False keeps the
# match literal.
df.Title = df.Title.str.replace('&amp;', '&', regex=False)
In [6]:
# Decode HTML-escaped double quotes in the titles. The entity '&quot;' becomes a
# real '"' character; substituting the digit "1" would corrupt the headline
# text that is sampled and displayed further down. The decoded quote is still a
# single character, so headline lengths computed afterwards are unaffected.
df.Title = df.Title.str.replace('&quot;', '"', regex=False)
In [7]:
# Headline length in characters, stored as a new `len_hl` column.
df['len_hl'] = df['Title'].map(len)
In [8]:
# Rows whose Url contains the '/articles/' path segment.
articles = df.loc[df['Url'].str.contains('/articles/')]
In [9]:
# Rows whose Url contains the '/places/' path segment.
places = df.loc[df['Url'].str.contains('/places/')]
In [10]:
# Summary statistics of article headline lengths.
articles['len_hl'].describe()
Out[10]:
In [11]:
# Summary statistics of place headline lengths.
places['len_hl'].describe()
Out[11]:
In [13]:
# Coarse 5-bin histogram of article headline lengths.
articles.hist(column=['len_hl'], bins=5)
Out[13]:
In [14]:
# Coarse 5-bin histogram of place headline lengths.
places.hist(column=['len_hl'], bins=5)
Out[14]:
In [26]:
# Character count of one example headline; the literal keeps its trailing space.
sample_headline = 'Pneumatic Tubes at Stanford Hospital '
len(sample_headline)
Out[26]:
In [27]:
# Double 37 (presumably the headline length measured in the previous cell).
2 * 37
Out[27]:
In [16]:
# Load the place title/subtitle export; later cells read its `subtitle` column.
place_deks = pd.read_csv(filepath_or_buffer='~/Downloads/place_title_subtitles.csv')
In [17]:
# Peek at the first two rows.
place_deks.head(n=2)
Out[17]:
In [20]:
# Subtitle length in characters. Each cell goes through str() first, so
# non-string cells still get a length (a missing value read as NaN becomes
# 'nan' and counts as 3).
place_deks['length'] = place_deks['subtitle'].map(lambda text: len(str(text)))
In [21]:
# Peek at the first two rows again, now including the `length` column.
place_deks.head(n=2)
Out[21]:
In [22]:
# Summary statistics of place subtitle lengths.
place_deks['length'].describe()
Out[22]:
In [23]:
# Fine-grained (100-bin) histogram of place subtitle lengths.
place_deks.hist(column=['length'], bins=100)
Out[23]:
In [24]:
# Load the article title/subtitle export; later cells read its `subtitle` column.
article_deks = pd.read_csv(filepath_or_buffer='~/Downloads/article_title_subtitles.csv')
In [25]:
# Subtitle length in characters. Each cell goes through str() first, so
# non-string cells still get a length (a missing value read as NaN becomes
# 'nan' and counts as 3).
article_deks['length'] = article_deks['subtitle'].map(lambda text: len(str(text)))
In [26]:
# Summary statistics of article subtitle lengths.
article_deks['length'].describe()
Out[26]:
In [27]:
# Fine-grained (100-bin) histogram of article subtitle lengths.
article_deks.hist(column=['length'], bins=100)
Out[27]:
In [30]:
# Histogram of subtitle lengths for rows that actually have a subtitle.
# `length` was computed as len(str(subtitle)), so a missing subtitle shows up
# as 3 (the length of 'nan'). Test for missing values directly rather than
# comparing against that sentinel, which would also discard genuine
# three-character subtitles.
article_deks[article_deks.subtitle.notna()].hist(column='length', bins=100)
Out[30]:
In [31]:
# Summary statistics restricted to rows that actually have a subtitle.
# `length` was computed as len(str(subtitle)), so a missing subtitle shows up
# as 3 (the length of 'nan'). Test for missing values directly rather than
# comparing against that sentinel, which would also discard genuine
# three-character subtitles.
article_deks[article_deks.subtitle.notna()].describe()
Out[31]:
In [33]:
# Articles whose headline is strictly longer than 38 and shorter than 80 characters.
article_headline_list = articles.loc[
    articles['len_hl'].gt(38) & articles['len_hl'].lt(80)
]
In [34]:
# Places whose headline is strictly longer than 10 and shorter than 40 characters.
place_headline_list = places.loc[
    places['len_hl'].gt(10) & places['len_hl'].lt(40)
]
In [35]:
# Article subtitles strictly between 38 and 90 characters long.
article_dek_list = article_deks.loc[
    article_deks['length'].gt(38) & article_deks['length'].lt(90)
]
# Place subtitles strictly between 45 and 125 characters long.
place_dek_list = place_deks.loc[
    place_deks['length'].gt(45) & place_deks['length'].lt(125)
]
In [37]:
# Narrow the filtered frame down to its Title column (the name is rebound).
article_headline_list = article_headline_list['Title']
In [39]:
import random
In [40]:
# Draw 20 distinct article headlines at random. random.sample needs a real
# sequence that it can index by position; a filtered pandas Series is neither
# (it is label-indexed and is rejected as a population on Python 3), so the
# titles are copied into a plain list first. Raises ValueError if fewer than 20
# headlines are available.
article_headline_list = random.sample(list(article_headline_list), 20)
In [41]:
# Display the sampled article headlines.
article_headline_list
Out[41]:
In [43]:
# Reduce the filtered frame to its Title column, then draw 20 distinct place
# headlines at random. random.sample needs a real sequence that it can index by
# position; a filtered pandas Series is neither (it is label-indexed and is
# rejected as a population on Python 3), so the titles are copied into a plain
# list first. Raises ValueError if fewer than 20 headlines are available.
place_headline_list = place_headline_list.Title
place_headline_list = random.sample(list(place_headline_list), 20)
In [44]:
# Display the sampled place headlines.
place_headline_list
Out[44]:
In [45]:
# Wrap the sampled headlines in a Series (fresh default integer index).
place_headline_list = pd.Series(data=place_headline_list)
In [46]:
# Display the place headlines again, now as a pandas Series.
place_headline_list
Out[46]:
In [53]:
# Narrow the filtered place frame down to its subtitle column.
place_dek_list = place_dek_list['subtitle']
In [54]:
# Narrow the filtered article frame down to its subtitle column.
article_dek_list = article_dek_list['subtitle']
In [55]:
# Draw 20 distinct subtitles at random from each filtered set. random.sample
# needs a real sequence that it can index by position; a filtered pandas Series
# is neither (it is label-indexed and is rejected as a population on Python 3),
# so each column is copied into a plain list first. Raises ValueError if either
# set holds fewer than 20 subtitles.
place_dek_list = random.sample(list(place_dek_list), 20)
article_dek_list = random.sample(list(article_dek_list), 20)
In [56]:
# Display the sampled place subtitles.
place_dek_list
Out[56]:
In [57]:
# Display the sampled article subtitles.
article_dek_list
Out[57]:
In [ ]: