In [1]:
import pandas as pd
import re

In [2]:
data = pd.read_csv('dwsmall.csv')

In [97]:
article_table = data[['id','title', 'published_on','accessed_on', 'body', 'html','newspaper_keywords', 'newspaper_summary']][0:10]
article_table.accessed_on = a.accessed_on[0].split(' ')[0]
article_table.columns = ['article_id','title', 'published_on','accessed_on', 'body', 'html','newspaper_keywords', 'newspaper_summary']
article_table


Out[97]:
article_id title published_on accessed_on body html newspaper_keywords newspaper_summary
0 1946 Dan de baja a dos 'Centinelas' en universidad 2017-03-31 2017-04-01 Autoridades de la Procuraduría capitalina exho... <div><p>Autoridades de la Procuradur&#237;a ca... {universidad,la,se,dan,en,centinelas,del,que,l... Dos jóvenes vinculados con el grupo agresor de...
1 1947 2017년 최고의 만우절 장난을 모아봤다 2017-04-01 2017-04-01 게시됨: 2017년 04월 01일 15시 36분 KST 게시됨: 2017년 04월... <div><p>&#44172;&#49884;&#46120;: 2017&#45380;... {모아봤다,만우절,31,2017년,수,한,최고의,국제옐로우피스,장난을,만우절을,속에... 게시됨: 2017년 04월 01일 15시 36분 KST게시됨: 2017년 04월 0...
2 1948 이재명이 첫 2위를 달성했다 2017-03-31 2017-04-01 게시됨: 2017년 03월 31일 19시 28분 KST 게시됨: 2017년 03월... <div><p>&#44172;&#49884;&#46120;: 2017&#45380;... {달성했다,저지할,31일,것이라고,이재명이,최선을,후보의,2위를,후보는,첫,할,있다... 이 후보는 이날 부산 동래구 부산실내체육관에서 열린 민주당 영남권역 순회투표 뒤 기...
3 1949 이부진과 이혼소송 중인 임우재 자택 거금에 경매 나왔다 2017-03-31 2017-04-01 게시됨: 2017년 03월 31일 20시 07분 KST 게시됨: 2017년 03월... <div><p>&#44172;&#49884;&#46120;: 2017&#45380;... {중인,이혼소송,고문의,하나은행에서,31일,지지옥션은,이부진과,임,임우재,것으로,경... 게시됨: 2017년 03월 31일 20시 07분 KST게시됨: 2017년 03월 3...
4 1950 Bali Kaur Bassi Is The First Punjabi Model On ... 2017-03-31 2017-04-01 ADVERTISEMENT The 2017 fall runways during fa... <div><p class="sharebox-label">ADVERTISEMENT</... {fashion,bassi,bali,post,punjabi,shared,media,... So finally, one Canadian fashion magazine deci...
5 1951 [장국영 14주기②] 볼수록 보고싶다...장국영의 인생영화5 2017-04-01 2017-04-01 ADVERTISEMENT 장국영이 세상을 떠난지도 어느덧 14년. 강산이 바뀌어도... <div><p class="sharebox-label">ADVERTISEMENT</... {장국영,인생영화5,보고싶다장국영의,장국영의,장국영은,수,한,볼수록,14주기②,늘,... #천녀유혼(1987)장국영의 인생작에서 절대로 빼놓을 수 없는 작품이다.\n순박한 ...
6 1952 コメント 2017-04-01 2017-04-01 "The bride and groom both lost siblings within... <ul class="slides"> \t\t\t\t\t\t\t\t\t\t<li cl... {bride,ceremony,vows,day,way,コメント,groom,emotio... - Lauren Saldutti"All the bridesmaids and the ...
7 1955 Kasabian’s ‘You’re In Love With A Psycho’ Musi... 2017-03-31 2017-04-01 Kasabian’s ‘You’re In Love With A Psycho’ musi... <div><p><a href="http://www.huffingtonpost.co.... {psycho,outdated,slammed,music,damaging,kasabi... Kasabian’s ‘You’re In Love With A Psycho’ musi...
8 1956 Rabaisser Macron ou la nouvelle stratégie du m... 2017-03-31 2017-04-01 PRÉSIDENTIELLE 2017 - Sur les réseaux sociaux,... <div><p id="sharebox_260x60" class="advertisem... {et,les,la,macron,à,rabaisser,stratégie,du,ou,... Pour saper les fondations du candidat d'En Mar...
9 1957 Diese atemberaubenden Fotos zeigen, wie wichti... 2017-03-31 2017-04-01 Großeltern stellen in vielen Familien gleich n... <div><p class="ad_share_box"> ... {wichtig,zeigen,kinder,sind,großeltern,und,ist... Gerade bei regelmäßigem und intensiven Kontakt...

In [31]:
author_tableMM = data[['author', 'id']]
author_tableMM.columns = ['author', 'article_id']
author_tableMM


Out[31]:
author article_id
0 NaN 1946
1 NaN 1947
2 NaN 1948
3 NaN 1949
4 Monika Markovinovic 1950
5 NaN 1951
6 Kelsey Borresen 1952
7 Rachel Mcgrath 1955
8 Geoffroy Clavel 1956
9 Maximilian Marquardt 1957
10 Michelle Butterfield 1958
11 NaN 1959
12 Carla Herreria 1960
13 Kelsey Borresen 1961
14 Ryan Grenoble 1962
15 Maham Abedi 1963
16 Sebastian Christ 1964
17 Rachel Mcgrath 1965
18 Huffpost Staff 1966
19 NaN 1967
20 Natasha Hinde 1968
21 NaN 1969
22 NaN 1970
23 Daniel Tencer 1971
24 NaN 1972
25 NaN 1973
26 Ambre Lefèvre 1974
27 NaN 1975
28 NaN 1976
29 Anxo Pérez 1977
... ... ...
970 Benjamin Lee 2910
971 Emily Steel 2911
972 Andrew Anthony 2912
973 Ben Jacobs 2913
974 Kerry Ingram 2997
975 Dan Collyns 2915
976 Associated Press In Los Angeles 2916
977 Juliet Eilperin Is The Washington Post'S Senio... 2917
978 Associated Press 2918
979 Australian Associated Press 2919
980 Peter Bradshaw 2920
981 Emma Graham-Harrison 2921
982 David Smith 2922
983 Vanessa Thorpe 2923
984 Rachel Cooke 2924
985 Lawrence Donegan 2925
986 Adam Gabbatt 2926
987 John Abraham 2927
988 NaN 2928
989 Angelique Chrisafis 2936
990 Ed Helmore 2929
991 Press Association In Miami 2930
992 Associated Press 2931
993 NaN 2932
994 Sune Engel Rasmussen 2933
995 Andrew Pulver 2934
996 Paul Wilson 2935
997 Jamie Jackson 2937
998 NaN 2938
999 Shaun Walker 2939

1000 rows × 2 columns


In [104]:
result = pd.merge(author_table, author_tableMM, on='author')
result[result.article_id==2910]


Out[104]:
author_id author article_id
982 507 Benjamin Lee 2910

In [106]:
author_table[author_table.author_id==507]


Out[106]:
author_id author
507 507 Benjamin Lee

In [39]:
author_table = pd.DataFrame(data=None, columns=['author_id', 'author'], index = None)
author_table.author = data['author'].unique()
author_table.author_id = author_table.index 
author_table


Out[39]:
author_id author
0 0 NaN
1 1 Monika Markovinovic
2 2 Kelsey Borresen
3 3 Rachel Mcgrath
4 4 Geoffroy Clavel
5 5 Maximilian Marquardt
6 6 Michelle Butterfield
7 7 Carla Herreria
8 8 Ryan Grenoble
9 9 Maham Abedi
10 10 Sebastian Christ
11 11 Huffpost Staff
12 12 Natasha Hinde
13 13 Daniel Tencer
14 14 Ambre Lefèvre
15 15 Anxo Pérez
16 16 Jean-Baptiste Duval
17 17 Boris Reitschuster
18 18 Todd Van Luling
19 19 Caroline Frost
20 20 Staff Reporter
21 21 Verashni Pillay
22 22 Alexandre Boudet
23 23 Oscar García
24 24 Joy D'Souza
25 25 Patrick White
26 26 Kayla Greaves
27 27 Dr. Michael Laitman Auf Twitter Folgen
28 28 Babette Habenstein
29 29 Ferial Haffajee
... ... ...
495 495 Amber Jamieson
496 496 Lise Funderburg
497 497 Associated Press In Tulsa
498 498 Guardian Sport
499 499 Laurence Blair
500 500 Us Census Bureau
501 501 Jon Swaine
502 502 Lucy Mangan
503 503 Amanda Holpuch
504 504 Jamie Jackson
505 505 Associated Press In Louisville
506 506 Associated Press In Glendale
507 507 Benjamin Lee
508 508 Ben Jacobs
509 509 Kerry Ingram
510 510 Dan Collyns
511 511 Associated Press In Los Angeles
512 512 Australian Associated Press
513 513 Emma Graham-Harrison
514 514 David Smith
515 515 Vanessa Thorpe
516 516 Rachel Cooke
517 517 Lawrence Donegan
518 518 John Abraham
519 519 Ed Helmore
520 520 Press Association In Miami
521 521 Sune Engel Rasmussen
522 522 Andrew Pulver
523 523 Paul Wilson
524 524 Shaun Walker

525 rows × 2 columns


In [42]:
s= data['url'][0]
#print(s)
s = s.replace("http://www.","")
#print(s)
domain = s.rsplit('/')[0]
#print(domain)
supersite = domain.rsplit('.')[0]
print(s, domain, supersite)


huffingtonpost.com.mx/2017/03/31/echan-de-la-escuela-a-presuntos-miembros-de-los-centinelas_a_22020472/ huffingtonpost.com.mx huffingtonpost

In [95]:
site_table = pd.DataFrame(data=data[['id','url']], columns=['id','site_id', 'url', 'domain', 'supersite'], index = None)
site_table.url = data['url']
site_table.url = site_table.url.str.replace("http://www.","")
site_table.domain = site_table.url.apply(lambda x: pd.Series(str(x).split('/',1)))
site_table.supersite = site_table.domain.apply(lambda x: pd.Series(str(x).split('.',1)))
site_table.site_id = site_table.index
site_table.columns = ['article_id','site_id', 'URL', 'domain', 'supersite']
site_table


Out[95]:
article_id site_id URL domain supersite
0 1946 0 huffingtonpost.com.mx/2017/03/31/echan-de-la-e... huffingtonpost.com.mx huffingtonpost
1 1947 1 huffingtonpost.kr/2017/04/01/story_n_15737584.... huffingtonpost.kr huffingtonpost
2 1948 2 huffingtonpost.kr/2017/03/31/story_n_15722326.... huffingtonpost.kr huffingtonpost
3 1949 3 huffingtonpost.kr/2017/03/31/story_n_15722434.... huffingtonpost.kr huffingtonpost
4 1950 4 huffingtonpost.ca/2017/03/31/bali-kaur-bassi-p... huffingtonpost.ca huffingtonpost
5 1951 5 huffingtonpost.kr/2017/04/01/story_n_15735400.... huffingtonpost.kr huffingtonpost
6 1952 6 huffingtonpost.jp/2017/04/01/bride_n_15735980.... huffingtonpost.jp huffingtonpost
7 1955 7 huffingtonpost.co.uk/2017/03/31/kasabian-music... huffingtonpost.co.uk huffingtonpost
8 1956 8 huffingtonpost.fr/2017/03/31/rabaisser-macron-... huffingtonpost.fr huffingtonpost
9 1957 9 huffingtonpost.de/2017/03/31/fotos-grosseltern... huffingtonpost.de huffingtonpost
10 1958 10 huffingtonpost.ca/2017/03/31/westjet-april-foo... huffingtonpost.ca huffingtonpost
11 1959 11 huffingtonpost.kr/2017/03/31/story_n_15720684.... huffingtonpost.kr huffingtonpost
12 1960 12 huffingtonpost.kr/2017/03/31/story_n_15720674.... huffingtonpost.kr huffingtonpost
13 1961 13 huffingtonpost.kr/2017/04/01/story_n_15737746.... huffingtonpost.kr huffingtonpost
14 1962 14 huffingtonpost.jp/2017/03/31/911_n_15735046.html huffingtonpost.jp huffingtonpost
15 1963 15 huffingtonpost.ca/2017/03/31/reward-peel-musli... huffingtonpost.ca huffingtonpost
16 1964 16 huffingtonpost.de/2017/03/31/ende-der-populist... huffingtonpost.de huffingtonpost
17 1965 17 huffingtonpost.co.uk/2017/04/01/emmerdale-spoi... huffingtonpost.co.uk huffingtonpost
18 1966 18 huffingtonpost.in/2017/03/31/indian-student-wh... huffingtonpost.in huffingtonpost
19 1967 19 huffingtonpost.kr/2017/04/01/story_n_15732534.... huffingtonpost.kr huffingtonpost
20 1968 20 huffingtonpost.co.uk/2017/03/31/foods-you-dont... huffingtonpost.co.uk huffingtonpost
21 1969 21 huffingtonpost.ca/2017/03/31/veracruz-rape-cha... huffingtonpost.ca huffingtonpost
22 1970 22 huffingtonpost.ca/2017/03/31/measles-westjet-f... huffingtonpost.ca huffingtonpost
23 1971 23 huffingtonpost.ca/2017/03/31/money-laundering-... huffingtonpost.ca huffingtonpost
24 1972 24 huffingtonpost.kr/2017/04/01/story_n_15737576.... huffingtonpost.kr huffingtonpost
25 1973 25 http://quebec.huffingtonpost.ca/2017/03/31/pro... http: http:
26 1974 26 huffingtonpost.fr/2017/04/01/un-specialiste-no... huffingtonpost.fr huffingtonpost
27 1975 27 huffingtonpost.kr/2017/03/31/story_n_15722326.... huffingtonpost.kr huffingtonpost
28 1976 28 http://quebec.huffingtonpost.ca/gabriel-cholet... http: http:
29 1977 29 huffingtonpost.es/anxo-perez/el-dilema-centro-... huffingtonpost.es huffingtonpost
... ... ... ... ... ...
970 2910 970 https://www.theguardian.com/film/2017/mar/16/p... https: https:
971 2911 971 https://www.nytimes.com/2017/04/01/business/me... https: https:
972 2912 972 https://www.theguardian.com/books/2017/apr/02/... https: https:
973 2913 973 https://www.theguardian.com/us-news/2017/apr/0... https: https:
974 2997 974 http://college.usatoday.com/2017/03/30/paying-... http: http:
975 2915 975 https://www.theguardian.com/world/2017/apr/03/... https: https:
976 2916 976 https://www.theguardian.com/film/2017/apr/03/d... https: https:
977 2917 977 https://www.washingtonpost.com/national/health... https: https:
978 2918 978 https://www.theguardian.com/film/2017/apr/02/b... https: https:
979 2919 979 https://www.theguardian.com/australia-news/201... https: https:
980 2920 980 https://www.theguardian.com/film/2009/dec/04/t... https: https:
981 2921 981 https://www.theguardian.com/world/2017/apr/03/... https: https:
982 2922 982 https://www.theguardian.com/us-news/2017/jan/2... https: https:
983 2923 983 https://www.theguardian.com/music/2016/dec/11/... https: https:
984 2924 984 https://www.theguardian.com/world/2017/apr/02/... https: https:
985 2925 985 https://www.theguardian.com/sport/blog/2010/au... https: https:
986 2926 986 https://www.theguardian.com/us-news/2017/mar/3... https: https:
987 2927 987 https://www.theguardian.com/environment/climat... https: https:
988 2928 988 theregister.co.uk/2017/03/23/clearance_sale_sh... theregister.co.uk theregister
989 2936 989 https://www.theguardian.com/world/2017/apr/03/... https: https:
990 2929 990 https://www.theguardian.com/us-news/2017/apr/0... https: https:
991 2930 991 https://www.theguardian.com/sport/2017/apr/02/... https: https:
992 2931 992 https://www.theguardian.com/us-news/2017/apr/0... https: https:
993 2932 993 https://www.theguardian.com/world/2017/mar/31/... https: https:
994 2933 994 https://www.theguardian.com/world/2017/apr/02/... https: https:
995 2934 995 https://www.theguardian.com/artanddesign/2009/... https: https:
996 2935 996 https://www.theguardian.com/football/2017/apr/... https: https:
997 2937 997 https://www.theguardian.com/football/2017/apr/... https: https:
998 2938 998 http://news.sky.com/story/uber-halts-driverles... http: http:
999 2939 999 https://www.theguardian.com/world/2017/apr/02/... https: https:

1000 rows × 5 columns


In [ ]:


In [ ]:


In [ ]:


In [ ]:


In [47]:
create table fact(
	fact_id SERIAL,
	article_id	int references article_dim(article_id),
	author_id	int references author_dim(author_id),
	site_id	int references site_dim(site_id),
);


insert into author_dim
select distinct author 
from scraped;

insert into site_dim
select distinct site as name, url as domain, 
parse(url) as supersite id
from scraped;

insert into article_dim
select title, url, published_date, accessed_date, body, html, keywords, summary
from scraped;

insert into fact 
select article_id, author_id, site_id
from various tables, 
then commit;


  File "<ipython-input-47-d7b125f013da>", line 1
    create table article_dim(
               ^
SyntaxError: invalid syntax

In [ ]: