In [1]:
import pandas as pd
import psycopg2
import os
import seaborn as sns; sns.set(color_codes=True)
import matplotlib.pyplot as plt
from sqlalchemy import text
from sqlalchemy import create_engine

plt.rcParams['figure.figsize'] = [10, 5]

%matplotlib inline  

# Get environment variables
host = os.environ['IMS_HOSTNAME']
user = os.environ['IMS_USERNAME']
password = os.environ['IMS_PASSWORD']
dbname = os.environ['IMS_DB_NAME']
port = os.environ['IMS_PORT']

# Create connection to the database
engine_string = 'postgresql://%s:%s@%s:%s/%s' % (user, password, host, port, dbname)
engine = create_engine(engine_string)

# Data extraction
sql = """
--Flats per district
select
  B.district
  ,CAST(AVG(total_amount) as numeric(16,2)) as average_price
  ,CAST(MAX(total_amount) as numeric(16,2)) as max_price
  ,CAST(MIN(total_amount) as numeric(16,2)) as min_price
  ,CAST(AVG(room) as INT) as average_rooms
  ,CAST(MAX(room) as INT) as max_rooms
  ,CAST(MIN(room) as INT) as min_rooms
  ,COUNT(*) qty_flats
from 
  dw.fact_flat A 
  join dw.dim_district B on B.title_key = A.title_key
group by 
  B.district  
order by 
  8 desc, 2 desc;
"""

df = pd.read_sql(sql, engine)


/Library/Frameworks/Python.framework/Versions/2.7/lib/python2.7/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use "pip install psycopg2-binary" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.
  """)

Relationship between Avg # Rooms x Avg Price


In [2]:
ax = sns.regplot(x="average_price", y="average_rooms", data=df, label=True)
ax.set_title('Relationship between Avg # Rooms x Avg Price')
fig = ax.get_figure()
fig.savefig("visual_insights/01_relation_avg_rooms_avg_price.png", dpi=400)


/Users/flavioclesio/Library/Python/2.7/lib/python/site-packages/scipy/stats/stats.py:1713: FutureWarning: Using a non-tuple sequence for multidimensional indexing is deprecated; use `arr[tuple(seq)]` instead of `arr[seq]`. In the future this will be interpreted as an array index, `arr[np.array(seq)]`, which will result either in an error or a different result.
  return np.add.reduce(sorted[indexer] * weights, axis=axis) / sumval

Distribution of Avg Price


In [3]:
ax = sns.distplot(df['average_price'])
ax.set_title('Distribution of Avg Price')
fig = ax.get_figure()
fig.savefig("visual_insights/02_distribution_avg_price.png", dpi=400)


Distribution of the Flats


In [4]:
ax = sns.distplot(df['qty_flats'])
ax.set_title('Distribution of the Flats')
fig = ax.get_figure()
fig.savefig("visual_insights/03_distribution_qty_flats.png", dpi=400)


Relationship between Avg Price x Max Price


In [5]:
ax = sns.jointplot(x="average_price", y="max_price", data=df, label='Relationship between Avg Price x Max Price', kind="hex", color="k")
ax.savefig("visual_insights/04_relation_avg_price_max_price.png", dpi=400)


Relationship between Avg Price x Avg Rooms


In [6]:
ax = sns.jointplot(x="average_price", y="average_rooms", data=df, label=True, kind="kde")
ax.savefig("visual_insights/05_relation_avg_price_avg_rooms.png")


Correlogram using main metrics


In [7]:
ax = sns.pairplot(df[['average_price', 'average_rooms','min_price', 'min_rooms']], kind="reg")
plt.legend()
plt.show()
ax.savefig("visual_insights/06_correlogram_main_metrics.png", dpi=400)