In [1]:
import psycopg2
from sqlalchemy import create_engine
import ast
import pandas as pd
import glob
import os
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
In [2]:
%matplotlib inline
In [3]:
dbname="kick"
tblname="info"
In [4]:
engine = create_engine(
'postgresql://localhost:5432/{dbname}'.format(dbname=dbname))
# Connect to database
conn = psycopg2.connect(dbname=dbname)
cur = conn.cursor()
Remind myself of the columns in the table:
In [5]:
cur.execute("SELECT column_name,data_type FROM information_schema.columns WHERE table_name = '{table}';".format(table=tblname))
rows = cur.fetchall()
pd.DataFrame(rows, columns=["column_name", "data_type"])
Out[5]:
Number of records in table:
In [6]:
cur.execute("SELECT COUNT(*) from {table}".format(table=tblname))
cur.fetchone()
Out[6]:
In [7]:
cur.execute("SELECT topic, COUNT(*) from {table} GROUP BY topic ORDER BY count DESC;".format(table=tblname))
rows = cur.fetchall()
df = pd.DataFrame(rows, columns=["topic", "count"])
# Plot findings
plt.rcParams["figure.figsize"] = [17,5]
df.plot(kind="bar", x="topic", y="count", legend=False)
plt.ylabel("Kickstarter projects")
plt.xlabel("Topic")
plt.title("Kickstarter projects by topic")
plt.tick_params(axis='x', labelsize=7)
In [8]:
"There are {num_topics} different types of Kickstarter projects".format(num_topics=df.shape[0])
Out[8]:
In [9]:
# Most popular project topic is
df[df["count"] == df["count"].max()]
Out[9]:
In [10]:
# Most rare project topic is
df[df["count"] == df["count"].min()]
Out[10]:
What are the rare projects?
In [11]:
cur.execute("SELECT id, blurb, goal*static_usd_rate as goal_usd FROM {table} WHERE topic = '{topic}'".format(table=tblname, topic="Taxidermy"))
rows = cur.fetchall()
for row in rows:
row_id, blurb, goal = row
print(">>> $%d | id: %s" % (goal, row_id),
blurb, sep="\n")
In [12]:
sql = "SELECT id, topic, goal*static_usd_rate as goal_usd FROM {table}".format(table=tblname)
cur.execute(sql)
rows = cur.fetchall()
df = pd.DataFrame(rows, columns=["id", "topic", "goal_usd"])
In [13]:
# Asking average
np.log10(df.goal_usd).plot.kde()
plt.xlabel("log(funding goal in USD)")
Out[13]:
In [14]:
"Most projects are asking for: $%d - $%d" % (10**2.5, 10**5)
Out[14]:
In [15]:
sns.barplot(x="topic", y="goal_usd",
data=df.groupby("topic").mean().reset_index().sort_values(by="goal_usd", ascending=False))
_ = plt.xticks(rotation='vertical')
plt.ylabel("Average goal (USD)")
plt.xlabel("Kickstarter project topic")
plt.title("Funding goals on Kickstarter by topic")
plt.tick_params(axis='x', labelsize=7)
"Movie Theaters" and "Space exploration" have the average higest funding goals
In [16]:
sql = "SELECT id, topic, goal, pledged, pledged/goal as progress FROM info ORDER BY progress DESC;"
cur.execute(sql)
rows = cur.fetchall()
df = pd.DataFrame(rows, columns=["id", "topic", "goal", "pledged", "progress"])
In [17]:
df["well_funded"] = df.progress >= 1
In [18]:
plt.rcParams["figure.figsize"] = [17,5]
sns.boxplot(x="topic", y="progress", data=df[df.well_funded].sort_values(by="topic"))
_ = plt.xticks(rotation='vertical')
plt.yscale('log')
plt.ylabel("Percent of funding goal")
plt.xlabel("Topic")
plt.title("Projects that were successfully funded by Topic")
plt.tick_params(axis='x', labelsize=7)
In [19]:
sns.barplot(x="topic", y="progress",
data=df[df.well_funded].groupby("topic").count().reset_index().sort_values(by="progress", ascending=False))
_ = plt.xticks(rotation='vertical')
plt.ylabel("Project that were successfully funded")
plt.xlabel("Topic")
plt.title("Projects that were successfully funded by Topic")
plt.tick_params(axis='x', labelsize=7)
In [20]:
plt.rcParams["figure.figsize"] = [17,5]
sns.boxplot(x="topic", y="progress",
data=df[np.invert(df.well_funded)].sort_values(by="topic"))
_ = plt.xticks(rotation='vertical')
plt.ylabel("Percent of funding goal met")
plt.xlabel("Topic")
plt.title("Pojects that have yet to meet their funding goals")
plt.tick_params(axis='x', labelsize=7)
In [21]:
sns.barplot(x="topic", y="progress",
data=df[np.invert(df.well_funded)].groupby("topic").count().reset_index().sort_values(by="progress", ascending=False))
_ = plt.xticks(rotation='vertical')
plt.ylabel("Project that were not yet successfully funded")
plt.xlabel("Topic")
plt.title("Pojects that have yet to meet their funding goals")
plt.tick_params(axis='x', labelsize=7)
In [22]:
# close communication with the PostgreSQL database server
cur.close()
# commit the changes
conn.commit()
# close connection
conn.close()