In [2020]:
import psycopg2 as pg
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
%matplotlib inline
In [2021]:
# Provenance: local "lcp" Postgres DB pulled from the Heroku app "lux".
# 30 May 2015 -> $ heroku pg:pull HEROKU_POSTGRESQL_COBALT_URL lcp --app lux
# Sanity-check that the psql client is available and sees the DB.
!psql lcp --help | head
In [2022]:
# List the tables in the lcp database.
!psql -c "\dt" lcp
In [2023]:
# Show the full schema (columns, types, storage) of the wares table.
!psql -c "\d+ wares" lcp
In [2024]:
# Open a connection to the local DB. No password is supplied, so this
# relies on local trust/peer auth for user "excalibur".
psql_lcp_connection = pg.connect("dbname=lcp user=excalibur")
In [2025]:
# Cursor for running raw SQL by hand before switching to pandas.
psql_cursor = psql_lcp_connection.cursor()
In [2026]:
psql_cursor.execute("select * from wares")
In [2027]:
# Peek at a single raw row tuple to see the column layout.
psql_cursor.fetchone()
Out[2027]:
In [2028]:
# Done with the raw cursor; pandas reads via the connection below.
psql_cursor.close()
In [2029]:
# Load the full wares table into a DataFrame over the open connection.
# pd.read_sql is the public API; pd.io.sql.read_sql is an internal path
# that was later removed.
wares_df = pd.read_sql("select * from wares", psql_lcp_connection)
wares_df.head(1)
Out[2029]:
In [2030]:
# Summary statistics for the numeric columns.
wares_df.describe()
Out[2030]:
In [2031]:
# Column dtypes, non-null counts, and memory footprint.
wares_df.info()
In [2032]:
name_start_year_df = wares_df.loc[wares_df['start_year'].notnull(),['name','start_year']]
print name_start_year_df.count()
name_start_year_df.head(1)
Out[2032]:
In [2033]:
# DataFrame.sort() was deprecated and removed (pandas 0.20+);
# sort_values is the explicit, supported replacement and returns a new
# frame sorted ascending by start_year.
name_start_year_df = name_start_year_df.sort_values('start_year')
In [2034]:
# Distinct start years (ascending, since the frame was just sorted by
# start_year) for the distribution plots below.
unique_start_years = name_start_year_df['start_year'].unique()
In [2035]:
# NOTE(review): positional data plus `names=` is the pre-0.6 seaborn
# boxplot API; modern seaborn uses x=/y= keywords. Kept as-is for this
# environment's seaborn version -- confirm before upgrading.
sns.boxplot(unique_start_years, names=['start_years'], vert=False)
plt.show()
In [2036]:
# Same plot with the first (smallest) year dropped as an outlier.
sns.boxplot(unique_start_years[1:], names=['start_years'], vert=False)
plt.show()
In [2037]:
# How many wares share each start_year.
start_year_counts = wares_df['start_year'].value_counts()
start_year_counts.head()
Out[2037]:
In [2038]:
# start_year plotted by row position (index order, not time order).
plt.plot(wares_df['start_year'])
plt.show()
In [2039]:
# Same plot, excluding rows at the minimum (outlier) start_year.
plt.plot(wares_df[wares_df['start_year'] != wares_df['start_year'].min()]['start_year'])
plt.show()
In [2040]:
# Most common values of the `period` column.
wares_df['period'].value_counts().head(10)
Out[2040]:
In [2041]:
desc_wares_df = wares_df[(wares_df['desc'] != "") & (wares_df['desc'].notnull())]
print desc_wares_df.shape
desc_wares_df.head(1)
Out[2041]:
In [2042]:
desc_words = nltk.wordpunct_tokenize(str(desc_wares_df['desc'].values))
print "num of words: ", len(desc_words)
print desc_words[:10]
In [2043]:
# Vocabulary = distinct tokens in the raw stream.
desc_vocab = set(desc_words)
print "num of vocab: ", len(desc_vocab)
In [2044]:
# Token frequency distribution over the raw (stopword-laden) tokens.
freq_dist = nltk.FreqDist(desc_words)
freq_dist
Out[2044]:
In [2045]:
# Top 15 tokens with their counts.
freq_dist.tabulate(15)
In [2046]:
# Count of one specific (stop)word, for scale.
freq_dist['the']
Out[2046]:
In [2047]:
from nltk.corpus import stopwords
In [2048]:
# WARNING: this rebinding shadows the imported `stopwords` corpus module
# with the English stopword *list*. Later cells depend on the list under
# this name, so it cannot be renamed without touching them.
stopwords = stopwords.words('english')
stopwords[:10]
Out[2048]:
In [2049]:
# Top-20 token frequencies (dominated by stopwords/punctuation).
freq_dist.plot(20)
In [2050]:
# Cumulative counts for the same top 20.
freq_dist.plot(20, cumulative=True)
In [2051]:
desc_no_stopwords = [x.lower() for x in desc_words if x.lower() not in stopwords]
print "num of words: ", len(desc_no_stopwords)
print desc_no_stopwords[:10]
In [2052]:
import string
desc_nopunc_words = [x for x in desc_no_stopwords if x not in list(string.punctuation)]
print "num of words: ", len(desc_nopunc_words)
print desc_nopunc_words[:10]
In [2053]:
# Vocabulary after stopword + punctuation removal.
desc_vocab = set(desc_nopunc_words)
print "num of vocab: ", len(desc_vocab)
In [2054]:
# Frequency distribution over the cleaned token stream.
freq_dist = nltk.FreqDist(desc_nopunc_words)
freq_dist
Out[2054]:
In [2055]:
# Top-20 cleaned tokens.
freq_dist.plot(20)
In [2056]:
# Cumulative counts for the same top 20.
freq_dist.plot(20, cumulative=True)
In [2057]:
# Adjacent-token pairs from the cleaned stream.
# NOTE(review): in newer NLTK versions nltk.bigrams returns a generator;
# safe here because FreqDist consumes it immediately in the next cell.
desc_bigrams = nltk.bigrams(desc_nopunc_words)
In [2058]:
# Bigram frequency distribution.
freq_dist = nltk.FreqDist(desc_bigrams)
freq_dist
Out[2058]:
In [2059]:
freq_dist.plot(20, cumulative=True)
In [2060]:
# Drop pure-digit tokens. The .lower() here is redundant (tokens were
# already lowercased upstream) but harmless.
desc_words_no_nums = [x.lower() for x in desc_nopunc_words if not x.isdigit()]
print "num of words: ", len(desc_words_no_nums)
print desc_words_no_nums[:10]
In [2061]:
# Vocabulary after number removal.
desc_vocab = set(desc_words_no_nums)
print "num of vocab: ", len(desc_vocab)
In [2062]:
# Bigrams over the number-free token stream.
desc_bigrams = nltk.bigrams(desc_words_no_nums)
In [2063]:
freq_dist = nltk.FreqDist(desc_bigrams)
freq_dist
Out[2063]:
In [2064]:
# NOTE(review): FreqDist.plot drives pylab itself; whether this figsize
# is honored depends on the NLTK version -- confirm.
plt.figure(figsize=(10,5))
freq_dist.plot(40, cumulative=True)
In [2065]:
# nltk.Text wraps the *raw* token stream (stopwords intact) for
# concordance/collocation exploration.
descriptions = nltk.Text(desc_words)
In [2066]:
# Occurrences of 'ware' in the corpus.
descriptions.count('ware')
Out[2066]:
In [2067]:
# Keyword-in-context view of 'ware'.
descriptions.concordance('ware')
In [2068]:
# Statistically significant adjacent word pairs.
descriptions.collocations()
In [2069]:
# Positional spread of these color/type words across the corpus.
descriptions.dispersion_plot(['ware', 'red', 'brown', 'white'])
In [2070]:
# character counts
# Length in characters of the first ten descriptions.
desc_wares_df['desc'].map(len).head(10).tolist()
Out[2070]:
In [2071]:
# word counts
# Whitespace-split word count of the first ten descriptions.
desc_wares_df['desc'].map(lambda d: len(d.split(' '))).head(10).tolist()
Out[2071]:
In [2072]:
# Histogram of description lengths (in words) across all rows.
word_counts = [len(desc.split(' ')) for desc in desc_wares_df['desc']]
plt.figure(figsize=(10,5))
plt.hist(word_counts, bins=30)
plt.xlabel('description lengths')
plt.ylabel('frequencies')
plt.show()
In [2073]:
# Lowercase every description in place.
# NOTE(review): desc_wares_df is a slice of wares_df, so this .loc
# assignment can raise SettingWithCopyWarning; a .copy() when the slice
# was created would make the write unambiguous.
desc_wares_df.loc[:,'desc'] = desc_wares_df.loc[:,'desc'].str.lower()
In [2083]:
desc_wares_df['desc'].head()
Out[2083]:
In [2138]:
# Replace every non-word character in desc with a space.
# BUG FIX: the original pattern was "\W".format(stop_words) -- the
# .format() call is a no-op ("\W" contains no placeholders) and, worse,
# it referenced `stop_words` before that variable is defined, so a
# fresh Restart-&-Run-All raised NameError here.
desc_nopunc_df = desc_wares_df.replace(to_replace={"desc": {r"\W": " "}}, regex=True)
desc_nopunc_df['desc'].head()
Out[2138]:
In [2139]:
import re
# Build a regex alternation of the English stopwords for the pandas
# replace() calls below. Escape each word so any regex metacharacter
# (newer NLTK lists include contractions with apostrophes, which are
# safe, but escaping guards against future list changes) cannot corrupt
# the pattern.
stop_words = "|".join(re.escape(sw.encode('ascii')) for sw in stopwords)
stop_words
Out[2139]:
In [2142]:
# Strip stopwords from the descriptions in ONE pass.
# BUG FIX: the original pattern "(^|\s+)({0})\s+" *consumed* the
# trailing whitespace, so in a run of consecutive stopwords the regex
# engine skipped every second one -- which is why the cell had to be
# re-run "twice/thrice". A lookahead leaves the trailing separator
# unconsumed (and also catches a stopword at end-of-string), removing
# all stopwords in a single pass at the cost of some doubled spaces.
desc_less_df = desc_nopunc_df.replace(to_replace={"desc": {"(^|\s+)({0})(?=\s|$)".format(stop_words): " "}}, regex=True)
desc_less_df['desc'].head()
Out[2142]:
In [2143]:
# run twice cause it's late
# (Second pass needed because the (^|\s+)...\s+ pattern consumes the
# trailing space, so consecutive stopwords survive a single pass.)
desc_less_df = desc_less_df.replace(to_replace={"desc":{"(^|\s+)({0})\s+".format(stop_words):" "}}, regex=True)
desc_less_df['desc'].head()
Out[2143]:
In [2150]:
# run thrice cause it's late
# (Third pass catches runs of three or more consecutive stopwords.)
desc_less_df = desc_less_df.replace(to_replace={"desc":{"(^|\s+)({0})\s+".format(stop_words):" "}}, regex=True)
desc_less_df['desc'].head()
Out[2150]:
In [2151]:
# Spot-check: any descriptions where ' the ' survived the removal?
desc_less_df[desc_less_df['desc'].str.contains(' the ')]['desc']
Out[2151]:
In [2157]:
# NOTE(review): this dumps the entire column; prefer .head() to keep
# the saved notebook output manageable.
desc_less_df['desc']
Out[2157]:
TODO: compare each description to every other description, which means keeping them separate rather than pooling all tokens as above. The cleaning could likely be done entirely in pandas: lowercase, replace punctuation, and replace stopwords with nothing. Also, as seen above, use a regex to remove the multi-character punctuation tokens (e.g., ".'") that slip through the single-character punctuation filter.
In [2001]:
# Show only a sample -- dumping the full token list bloats the saved
# notebook and hides the narrative.
desc_words_no_nums[:20]
Out[2001]: