In [23]:
import urllib2
from collections import namedtuple
import datetime
from bs4 import BeautifulSoup
import re
import pandas as pd
import numpy as np
import networkx as nx
import itertools
import pickle
from nltk import pos_tag  # needed for the verb-detection step below
import time
from collections import Counter
import operator
New York Social Diary provides a fascinating lens onto New York's socially well-to-do. The data forms a natural social graph for New York's social elite. As shown in this report of a recent holiday party, almost all the photos have annotated captions labeling their subjects. We can think of these captions as implicitly defining a social graph: there is a connection between two individuals if they appear in a picture together.
In this project, I investigate these connections between the NYC elite.
There are two steps -- gathering the data and analyzing it.
(1) To gather the data, I grab all the relevant photo captions, save them, and then parse them to retrieve the relevant information.
(2) To analyze the data, I consider the problem in terms of a network or a graph. Any time a pair of people appears in a photo together, that counts as a link. This gives an (undirected) multigraph with no self-loops, which has an obvious analog as an undirected weighted graph: the weight of an edge is the number of photos in which the pair appears together. A minimal sketch of this construction follows.
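To make the modeling concrete, here is a minimal sketch (using a made-up caption, not the scraped data) of how one caption's list of names turns into weighted edges; the full construction over all captions appears later in the notebook.
import itertools
from collections import Counter
# A single hypothetical caption, already parsed into a list of names
caption = ['Alice Smith', 'Bob Jones', 'Carol Lee']
# Every unordered pair of people in the caption counts as one link
pairs = list(itertools.combinations(sorted(caption), 2))
print(pairs)  # [('Alice Smith', 'Bob Jones'), ('Alice Smith', 'Carol Lee'), ('Bob Jones', 'Carol Lee')]
# Tallying the pairs over many captions gives the edge weights of the undirected weighted graph
weights = Counter(pairs)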
The first step is to gather the data. I want photos from parties before December 1st, 2014. This link contains a list of (party) pages. For each party, I find the url, and grab all the photocaptions.
(1) As you can see, the URL structure is consistent for each party: the base URL, followed by the year, followed by the party name, with dashes in place of spaces.
(2) I use Python's datetime.strptime function to parse the dates; a small example of the two date formats involved follows.
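As a quick illustration (the party date string here is made up, but it follows the format used on the listing pages), the cutoff date and the listing dates are parsed like this:
import datetime
cutoff = datetime.datetime.strptime("14/12/01", '%y/%m/%d')  # December 1st, 2014
party_date = datetime.datetime.strptime("Friday, November 28, 2014", '%A, %B %d, %Y')  # hypothetical listing date
print(party_date < cutoff)  # True -- this party would be kept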
In [1]:
max_date = "14/12/01"
max_pages = 25 #actually 24 but just in case
url_base = "http://www.newyorksocialdiary.com/"
url_page_call = "party-pictures?page="
cutofftime=datetime.datetime.strptime(max_date, '%y/%m/%d')
PicBasic = namedtuple('PicBasic', 'url, dateinfo')
def span_info(span):
    # Extract the party-page link and its posting date from one listing entry
    urldata = span.select('span.field-content > a')
    datedata = span.select('span.views-field-created > span.field-content')
    if len(urldata) != 1 or len(datedata) != 1:
        print("Uh oh! We did something wrong")
        return None
    return PicBasic(
        url=urldata[0]['href'],
        dateinfo=datetime.datetime.strptime(datedata[0].text, '%A, %B %d, %Y')
    )
urladdons = []
for i in range(max_pages):
    pageno = i + 1
    url = url_base + url_page_call + str(pageno)
    raw_page = urllib2.urlopen(url).read()
    soup = BeautifulSoup(raw_page)
    t2spans = soup.select('div.views-row')
    span_links = [span_info(span) for span in t2spans]
    url_links = [datapt.url for datapt in span_links
                 if datapt is not None and datapt.dateinfo < cutofftime]
    urladdons.extend(url_links)  # collect the add-on links for qualifying parties
print(len(urladdons))  # number of party pages
print(urladdons[0])  # add-on url for the first (last chronologically) party we identified
all_pic_captions = []
max_parties = len(urladdons)
def has_class_and_face(tag):
return not tag.has_attr('color') and tag.has_attr('face')
for j in range(max_parties):
    soup = None
    for attempt in range(3):  # sometimes the webpage is not responsive, so retry a few times
        try:
            soup = BeautifulSoup(urllib2.urlopen(url_base + urladdons[j]))
            break
        except Exception:
            pass
    if soup is None:  # all attempts failed; skip this party
        continue
    for a in soup.find_all(class_="photocaption"):
        try:
            names_with_white = str(a.get_text())
            names = names_with_white.lstrip()
            all_pic_captions.append(names)
        except Exception:  # non-ASCII captions can make str() fail; skip them
            pass
In [14]:
#TAKE OUT THE PHOTOGRAPHER
print(len(all_pic_captions))
all_pic_captions = [caption for caption in all_pic_captions if not re.search(r'^Photographs by ',caption)]
print(len(all_pic_captions))
In [17]:
###### SAVE AS PICKLE DATAFRAME FILE ###############
####################################################
print(all_pic_captions[0])
df=pd.DataFrame(all_pic_captions, columns=['all_pic_captions'])
df.to_pickle('captions2.pickle')
print(len(all_pic_captions))
Now comes the parsing part.
Some captions are not useful: they contain long narrative text that explains the event. We have to find heuristic rules to separate captions that are a list of names from those that are not. A few heuristics include:
(1) I keep only captions under a subjective character-length cutoff, and I drop captions that contain verbs, since those tend to be narrative sentences rather than lists of names.
(2) I separate the captions based on various forms of punctuation.
(3) This site is pretty formal and likes to say things like "Mayor Michael Bloomberg" after his election but "Michael Bloomberg" before his election. There are many titles, such as Mayor, CEO, etc., that need to be filtered out; a small illustration of stripping them follows.
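As a minimal sketch of the title-stripping step (using a made-up caption and only a small subset of the full title list defined below), the titles are joined into a regex alternation and substituted away:
import re
titles = ['Mayor ', 'Dr. ', 'CEO ']  # a small subset of the full hwords list used below
pattern = '|'.join(re.escape(t) for t in titles)  # escape so the '.' in 'Dr. ' is treated literally
caption = "Mayor Michael Bloomberg and Dr. Jane Doe"  # hypothetical caption
print(re.sub(pattern, '', caption))  # Michael Bloomberg and Jane Doe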
In [4]:
############### OPEN SAVED PICKLE FILE #################
###########RUN FROM HERE IF DOING PREVIOUS ANALYSIS#####
df=pd.io.pickle.read_pickle('captions2.pickle')
allcaptions=df['all_pic_captions']
###USE ONLY CAPTIONS UNDER SOME SUBJECTIVE CHARACTER LENGTH
subjective_cutoff = 250
smallcaps=[caption for caption in allcaptions if len(caption)<subjective_cutoff]
len(smallcaps)
Out[4]:
In [4]:
####IDENTIFY VERBS ##########
dfiltered=pd.DataFrame(smallcaps, columns=['smallcaps'])
capwords = [[re.sub(r'[^\w\-\s]','',word) for word in document.split()]
for document in smallcaps]
def extractverbcaps(words):
    #flags captions containing a verb outside the stop-list -- a sign of narrative text rather than a list of names
    twords=pos_tag(words)
    vtags = ['VB','VBD','VBG','VBN','VBP','VBZ']
    stopvwords=['van','left','right','honoree','de','host','dressed']
    verbpresent=0
    for word in twords:
        if word[1] in vtags:
            if not word[0].istitle(): #ignore title-cased words, which are likely names
                if word[0] not in stopvwords:
                    verbpresent=1
    return verbpresent
verbpresent=[extractverbcaps(caption) for caption in capwords]
dfiltered['verbpresent']=verbpresent
dfiltered['tokenized']=capwords
dfiltered.to_pickle('filteredcaptions.pickle') #saving to pickle file
filteredcaps=dfiltered[dfiltered['verbpresent'] == 0]['smallcaps']
#####GETTING RID OF HONORIFICS, ETC###
filteredcaps2= [re.sub(r'[(][a-zA-Z]+[)]','', caps) for caps in filteredcaps] #getting rid of everything inside brackets
hwords1=['Mr. ','Guest',' M.D.','PhD','Ph.D.',' Jr.',' Sr.','Mrs. ','Miss ','Doctor ','Dr. ','Dr ','Chair ','CEO ','the Honorable ','Mayor ','Prince ','Baroness ', 'Princess ', 'Honorees ', 'Honoree',' MD']
hwordsp=['Museum President ','Chief Curator ','Frick Director ','Police Commissioner ','Music Director ','Frick Trustee ','Historic Hudson Valley Trustee ', 'Museum President ','Public Theater Artistic Director ','Public Theater Executive Director ','Executive Director ','Cooper Union President ','The Hon. ','Dancing Chair ','Director Emerita ']
hwords2=['Hon. ','Lord ','Senator ','Deputy ','Director ','Dean ','Actor ','Actress ',' Esq.', 'Gov ','Governor ','Father ','Congresswoman ','Congressman ', 'Countess ','Awardee ','Chairman ','Commissioner ','Lady ','Ambassador ','President ','CEO ']
hwords=hwordsp+hwords1+hwords2
hwords = '|'.join(re.escape(word) for word in set(hwords)) #escape so the '.' in titles like 'Dr. ' is matched literally
filteredcaps2= [re.sub(r'^\s+|\s+$','', caps) for caps in filteredcaps2]
filteredcaps2= [re.sub(hwords,'', caps) for caps in filteredcaps2]
In [15]:
##########REPLACING COUPLES###########
#On investigation, we find that there are a lot of couple names -- e.g., "Mary and John Drew".
#To parse these, we need them in a "Mary Drew and John Drew" format.
newnames=[]
countno=0
capstring="([A-Z][a-z]+)\s+and\s+([A-Z][a-z]+)\s+([A-Z][a-z]+)" #matches couple names such as "Kelly and Tom Monro"
begstring="^%s" % capstring #string if it appears in the beginning
andstring="\\s+and\\s+%s" % capstring
withstring="\\s+with\\s+%s" % capstring
otherstring="\\s+[a-z]+\\s+%s" % capstring
def findingpairs(xlistno):
    #converts regex match tuples like ('Kelly', 'Tom', 'Monro') into "Kelly Monro and Tom Monro"
    namestr=[]
for names in xlistno:
nstr = names[0] + " " + names[2] + " and " + names[1] + " " + names[2]
namestr.append(nstr)
return(', '.join(namestr))
for xnames in filteredcaps2:
xlistno2=re.search(otherstring,xnames)
xlistno=re.search(begstring,xnames)
if xlistno2:
xno= re.findall(capstring,xnames)
if len(xno)>1:
xn=findingpairs(xno)
newnames.append(xn)
else:
newstring=xlistno2.group(1)+ " " + xlistno2.group(3) + " and " + xlistno2.group(2) + " " + xlistno2.group(3)
newnames.append(re.sub(capstring, newstring, xnames))
elif xlistno:
xno= re.findall(capstring,xnames)
if len(xno)>1:
xn=findingpairs(xno)
newnames.append(xn)
else:
newstring=xlistno.group(1)+ " " + xlistno.group(3) + " and " + xlistno.group(2) + " " + xlistno.group(3)
newnames.append(re.sub(capstring, newstring, xnames))
else:
newnames.append(xnames)
print(len(newnames))
print("\n WITHOUT REPLACING COUPLES \n")
print(filteredcaps2[30:50])
print("\n REPLACING COUPLES \n")
print(newnames[30:50])
In [16]:
## FURTHER PARSING TO GET IN LIST OF NAMES FORMAT ##
newnames2 = [re.split(r',\s+and\s+|,\s+with\s+|;\s|\s+and\s+|\s+amd\s+|,\s|\s+with\s+',mylistentries) for mylistentries in newnames]
nameslist = [[word for word in caps if word !='']
for caps in newnames2]
nameslist=[[re.sub(r'\s+$|^\s+|\s+\n|\n\s+|\n','', caps) for caps in names]
for names in nameslist]
nameslist=[names for names in nameslist if len(names)>1]
nameslist=[[caps for caps in names if names[0].istitle()]
for names in nameslist]
stopwords=['friend','her daughter','President','CEO','Hospital for Special Surgery', 'a friend','NYU','son','sons','wife','dean','daughters','friends','guest','Guest','children','daughter','his wife','squires','guests','family','left','right','presents','welcomes','honoree','host']
nameslist = [[names for names in nameinds if names not in stopwords]
for nameinds in nameslist]
print(nameslist[30:50])
(1) A simple question we can ask is 'Who is the most popular?' The easiest way to answer this is to look at how many connections everyone has -- returning the top 100 people and their weighted degree.
(2) A similar way to determine popularity is to look at PageRank. PageRank is essentially the stationary distribution of the Markov chain implied by the social graph; a toy example follows this list.
(3) Another interesting question is which pairs of people tend to co-occur. We might even be able to use this analysis to detect instances of affairs and infidelities!
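As a toy illustration of the PageRank idea (a minimal sketch on a made-up three-person graph, not the project data), networkx computes the stationary distribution of a random walk over the weighted graph:
import networkx as nx
toy = nx.Graph()
toy.add_weighted_edges_from([('Alice', 'Bob', 3),  # appeared together in 3 photos
                             ('Alice', 'Carol', 1),
                             ('Bob', 'Carol', 1)])
print(nx.pagerank(toy, alpha=0.85))
# Alice and Bob get the highest (and equal) scores because their shared edge carries the most weight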
In [24]:
## FIRST WE ENTER THE DATA INTO GRAPH FORMAT (i.e., containing edges and nodes)
def joinlists(listname):
#function for joining list of lists (i.e. from x=([['a','b'],['c']]) to x=['a','b','c'])
#want only unique values for each document, so:
uniquelitems = [set(listitems) for listitems in listname]
newlist=list(itertools.chain.from_iterable(uniquelitems))
return newlist
tot_allwords=joinlists(nameslist)
uniquenames=list(set(tot_allwords))
edgelists = [sorted(captions) for captions in nameslist]
edgelists = [itertools.combinations(captions,2) for captions in edgelists]
edgelists = [list(captions) for captions in edgelists]
edgelists = sum(edgelists, [])
uniquedges=list(set(edgelists))
xedges=Counter(edgelists)
edgecounts=[xedges[namestr] for namestr in uniquedges]
wedges=[0]*len(edgecounts)
for i in range(len(edgecounts)):
wedges[i]=(uniquedges[i][0],uniquedges[i][1],edgecounts[i])
G=nx.Graph()
G.add_nodes_from(uniquenames)
G.add_weighted_edges_from(wedges)
In [30]:
## 1: DETERMINING MOST POPULAR NAMES THAT APPEAR IN THE PHOTO CAPTIONS
deg_names = G.degree(weight='weight')
deg_sort = sorted(deg_names.items(), key=operator.itemgetter(1), reverse=True)
deg_half = [(d[0], d[1]/2) for d in deg_sort]  # halved weighted degrees (not used below)
print("MOST POPULAR PEOPLE IN THE NYC SOCIAL SCENE \n")
for x in range(100):
    print(deg_sort[x])
In [21]:
## 2: DETERMINING TOP 100 MOST INFLUENTIAL PEOPLE IN THE NYC SOCIAL SCENE
pagerankout=[]
pgpop=nx.pagerank(G, alpha=0.85, personalization=None, max_iter=100)
highest = sorted(pgpop, key=pgpop.get, reverse=True)
for eachname in highest[0:100]:
x=(eachname,pgpop[eachname])
pagerankout.append(x)
#saving to pickle file
output = open('../../miniprojects/questions/pagerank2.pickle','wb')  # binary mode for pickle
pickle.dump(pagerankout,output)
output.close()
#sorted_by_pagerank=pickle.load(open('../../miniprojects/questions/pagerank2.pickle'))
print("MOST INFLUENTIAL PEOPLE IN THE NYC SOCIAL SCENE: \n")
#for sorted_people in sorted_by_pagerank:
for sorted_people in pagerankout:
print(sorted_people)
In [29]:
## 3: DETERMINING TOP CONNECTIONS
edge_list = G.edges(data=True)
edge_sort = sorted(edge_list, key=lambda e: e[2]['weight'], reverse=True)  # sort by edge weight
edge_mod = [((a[0], a[1]), a[2]['weight']) for a in edge_sort]
print("People that appear together most frequently in pictures: \n")
for x in range(100):
print(edge_mod[x])