In [ ]:
import pandas as pd
import re

In [ ]:
data = pd.read_csv('dwsmall.csv')

In [ ]:
import psycopg2
conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-35-163-99-253.us-west-2.compute.amazonaws.com' port=9000 password='secret'")
cursor = conn.cursor()
cursor.execute("SELECT * FROM articles limit 1000")
data = cursor.fetchall()
data = pd.DataFrame(data)
data.columns = ['site', 'title','author','secondary_authors','published_on','accessed_on','url','body' ,'html','newspaper_keywords','newspaper_summary','id']                 
#print(records)
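
In [ ]:
# Optional one-step load: pandas can pull the rows and column names straight from the
# query via SQLAlchemy. A minimal sketch, assuming the same connection details as above
# and that the articles table's own column names match the list assigned manually above;
# articles_df is just an illustrative variable name.
from sqlalchemy import create_engine
engine = create_engine('postgresql://postgres:secret@ec2-35-163-99-253.us-west-2.compute.amazonaws.com:9000/cap')
articles_df = pd.read_sql_query('SELECT * FROM articles LIMIT 1000', engine)
articles_df.head()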

In [ ]:
conn = psycopg2.connect("dbname='cap' user='postgres' host='ec2-35-163-99-253.us-west-2.compute.amazonaws.com' port=9000 password='secret'")
cursor = conn.cursor()
cursor.execute('drop table if exists author_dim cascade')
cursor.execute('create table author_dim(author_id SERIAL,name TEXT,PRIMARY KEY(author_id))')

cursor.execute('drop table if exists site_dim cascade')
cursor.execute('create table site_dim(site_id SERIAL,domain text,url text,supersite_id int,PRIMARY KEY(site_id))')

cursor.execute('drop table if exists article_dim cascade')
cursor.execute('create table article_dim(article_id SERIAL, title text, published_date timestamp,accessed_date timestamp, body text,html text, keywords text, summary text,PRIMARY KEY(article_id))')

cursor.execute('drop table if exists statistic_dim cascade')
cursor.execute('create table statistic_dim(statistic_id SERIAL,NLP_results JSON,PRIMARY KEY(statistic_id))')

cursor.execute('drop table if exists fact cascade')
cursor.execute('create table fact(fact_id SERIAL,article_id int,author_id int,site_id int,statistic_id int,FOREIGN KEY (article_id) REFERENCES article_dim(article_id),FOREIGN KEY (author_id) REFERENCES author_dim(author_id),FOREIGN KEY (site_id) REFERENCES site_dim(site_id),FOREIGN KEY (statistic_id) REFERENCES statistic_dim(statistic_id))')

# commit the DDL so other connections (e.g. the SQLAlchemy engine used below) can see the tables
conn.commit()
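
In [ ]:
# Sanity check that the DDL above actually took effect: list the freshly created tables.
# A minimal sketch against the same connection; information_schema is standard PostgreSQL.
cursor.execute("""
    select table_name
    from information_schema.tables
    where table_schema = 'public'
    order by table_name
""")
cursor.fetchall()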

In [ ]:
from sqlalchemy import create_engine
article_table = data[['id','title', 'published_on','accessed_on', 'body', 'html','newspaper_keywords', 'newspaper_summary']]
#article_table.accessed_on = article_table.accessed_on[0].split(' ')[0]
article_table.columns = ['article_id','title', 'published_date','accessed_date', 'body', 'html','keywords', 'summary']
#article_table
engine = create_engine('postgresql://postgres:secret@ec2-35-163-99-253.us-west-2.compute.amazonaws.com:9000/cap')
article_table.to_sql('article_dim', engine, if_exists='append', index=False)
cursor.execute('select * from article_dim limit 1')
cursor.fetchall()

In [ ]:
author_table = pd.DataFrame({'author': data['author'].unique()})
author_table['author_id'] = author_table.index
author_table = author_table[['author_id', 'author']]
author_table
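
In [ ]:
# Sketch of loading the de-duplicated authors into author_dim. The DDL above names the
# column 'name', so rename 'author' first; reusing the engine and if_exists='append' is
# an assumption about the intended load, mirroring the article_dim step.
author_dim = author_table.rename(columns={'author': 'name'})
author_dim.to_sql('author_dim', engine, if_exists='append', index=False)
cursor.execute('select * from author_dim limit 5')
cursor.fetchall()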

In [ ]:
author_tableMM = data[['author', 'id']]
author_tableMM.columns = ['author', 'article_id']
author_tableMM

In [ ]:
result = pd.merge(author_table, author_tableMM, on='author')
result[result.article_id==2910]

In [ ]:
author_table[author_table.author_id==507]

In [ ]:
s= data['url'][0]
#print(s)
s = s.replace("http://www.","")
#print(s)
domain = s.rsplit('/')[0]
#print(domain)
supersite = domain.rsplit('.')[0]
print(s, domain, supersite)

In [ ]:
site_table = pd.DataFrame(data=data[['id', 'url']], columns=['id', 'site_id', 'url', 'domain', 'supersite'])
site_table.url = data['url']
site_table.url = site_table.url.str.replace("http://www.", "", regex=False)
# the domain is everything before the first '/', and the supersite is its first label
site_table.domain = site_table.url.apply(lambda x: str(x).split('/', 1)[0])
site_table.supersite = site_table.domain.apply(lambda x: str(x).split('.', 1)[0])
site_table.site_id = site_table.index
site_table.columns = ['article_id', 'site_id', 'URL', 'domain', 'supersite']
site_table
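
In [ ]:
# Sketch of loading site_dim, mirroring the per-article site_id used above (one row per
# article keeps it consistent with the fact keys built next). supersite_id is left NULL
# since no supersite table exists yet; this is an assumption about the intended load.
site_dim_rows = site_table[['site_id', 'domain', 'URL']].rename(columns={'URL': 'url'})
site_dim_rows.to_sql('site_dim', engine, if_exists='append', index=False)
cursor.execute('select * from site_dim limit 5')
cursor.fetchall()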

In [ ]:
result1 = pd.merge(author_tableMM, author_table, on='author')[['article_id','author_id']]
result2 = pd.merge(site_table, result1, on='article_id')[['article_id','author_id','site_id']]
result2
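
In [ ]:
# Sketch of the final fact load: result2 already holds the (article_id, author_id, site_id)
# keys, and statistic_id is left NULL because no NLP statistics are computed in this
# notebook. Assumes the author_dim and site_dim loads sketched above have run, otherwise
# the foreign key checks will fail.
result2.to_sql('fact', engine, if_exists='append', index=False)
cursor.execute('select * from fact limit 5')
cursor.fetchall()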

In [ ]:
from urllib.parse import urlparse
import re
pattern = re.compile('(^[^.]*[.])([^.]*)([.].*$)')
site_table = pd.DataFrame(data=data[['id','url']], columns=['id','site_id', 'url', 'domain', 'supersite'], index = None)
site_table.url = data['url']
# urlparse works on single strings, not a whole Series, so apply it element-wise;
# the original loop only rebound the loop variable and never changed the column.
site_table.url = site_table.url.apply(lambda u: urlparse(str(u)).netloc)


'''site_table.domain = site_table.url.apply(lambda x: pd.Series(str(x).split('/',1)))
site_table.supersite = site_table.domain.apply(lambda x: pd.Series(str(x).split('.',1)))
site_table.site_id = site_table.index
site_table.columns = ['article_id','site_id', 'URL', 'domain', 'supersite']
site_table'''

In [ ]:
site_table

In [ ]:
from urllib.parse import urlparse
import re
pattern = re.compile('(^[^.]*[.])([^.]*)([.].*$)')
a = urlparse(data.url[0]).netloc
print(a)
b = pattern.sub(r'\2', a)  # keep only the middle label, e.g. 'example' from 'www.example.com'
b
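
In [ ]:
# Sketch of applying the same urlparse + regex idea to the whole url column: domain is the
# netloc, supersite is its middle label. The regex assumes netlocs with at least two dots
# (e.g. www.example.com); anything shorter falls back to the raw netloc.
def supersite_of(netloc):
    m = pattern.match(netloc)
    return m.group(2) if m else netloc

site_table['domain'] = data['url'].apply(lambda u: urlparse(str(u)).netloc)
site_table['supersite'] = site_table['domain'].apply(supersite_of)
site_table.head()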

In [ ]:
a = urlparse(data.url[0])
a
