In [ ]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
df_urls = pd.read_csv('urls_1.csv')
#df_test = df_urls.iloc[0:10, :]
df_test = df_urls.loc[[536,115], :]
#df_test = df_urls.head(n=2)
df_test
def getEmail(x):
    # fetch the page, parse it, and pull out anything that looks like an email address
    page = requests.get(str(x))
    soup = BeautifulSoup(page.content, 'html.parser')
    emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))
    #mailtos = soup.select('a[href^=mailto]')
    # return both the addresses and the parsed soup so the caller can follow links
    return [emails, soup]
for index, row in df_test.iterrows():
    x = row['URLS']
    emails, soup = getEmail(x)
    print(emails)
    #if emails == set():
        #print("EMAIL EQUALS SET")
        # if the page itself has no addresses, follow its "Contact" link and scrape that page instead
        #for elem in soup(text=re.compile(r'Contact')):
            #parent = elem.parent
            #y = parent.get('href')
            #print(y)
            #emails = getEmail(y)[0]
    #df_test.at[index, 'EMAIL'] = emails
    #df_test.at[index, 'MAILTOS'] = mailtos
df_test
#for index, row in df_test.iterrows():
    #x = row['URLS']
    #getEmail(x)
    #if emails == "" and mailtos == "":
    #this seems to crash when letting more than a few rows into the df
    #print(emails)
    #print(index)
    #print(row)
    #df_test.at[index, 'EMAIL'] = emails
    #df_test.at[index, 'MAILTOS'] = mailtos
#How to write the email list values into multiple new columns next to the URL? (a sketch of one approach follows the last cell)
#if df_test['EMAIL'] == " ":
    #df_test.at[index, 'EMAIL'] = 'NO EMAIL'
#for elem in soup(text=re.compile(r'Contact')):
    #parent = elem.parent
    #x = parent.get('href')
#for link in soup.find_all('a'):
#for link in item.find_all('a'):
#FIND A WAY TO ONLY DO 'a' where the link text matches "contact", "about us", etc. (a sketch of one approach follows this cell)
    #print(link.get('href'))
#pass that href back into the email scraper
#repeat the analysis
#http://stackoverflow.com/a/14470573
#page = requests.get('https://northstarfund.org/')
#soup = BeautifulSoup(page.content, 'html.parser')
#print (soup)
#page = requests.get(str(row['URLS']))
#df_test
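A minimal sketch of the idea in the notes above: when the landing page yields no addresses, follow anchors whose visible text looks like "contact" or "about" and scrape those pages too. It assumes the getEmail helper defined in the cell above; the keyword list and the urljoin-based handling of relative hrefs are added here, not taken from this notebook.
In [ ]:
import re
from urllib.parse import urljoin
# hypothetical keyword list; adjust to taste
CONTACT_WORDS = re.compile(r'contact|about', re.I)
def getEmailWithContactFallback(url):
    # scrape the landing page first, using getEmail() from the cell above
    emails, soup = getEmail(url)
    if not emails:
        # nothing found: try pages linked as "Contact", "About us", etc.
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if href and CONTACT_WORDS.search(anchor.get_text()):
                emails |= getEmail(urljoin(url, href))[0]
    return emails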
In [ ]:
'''
# WORKING
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os, json
from lxml import html
df_urls = pd.read_csv('urls_1.csv')
#for row in df_urls:
#make this iterate through the rows in df_urls and append email addresses in an adjacent column. If multiple email addresses on page, add multiple columns?
page = requests.get('http://dusp.mit.edu/department/contact-directions')
tree = html.fromstring(page.text)
soup = BeautifulSoup(page.content, 'html.parser')
#mailtos = soup.select('a[href^=mailto]')
# a set of crawled emails
emails = set()
new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))
#how to include mailto links (with non-email text) as another thing to capture?
#print (mailtos)
#print (other)
print (new_emails)
#soup.body.findAll(text=re.compile('^@$'))
#for elem in soup(text=re.compile(r'@')):
#print (elem.parent)
#http://stackoverflow.com/a/866050
#soup = BeautifulSoup(page.content, 'html.parser')
#para = soup.find_all('p')
#print (para)
#print(soup.prettify())
#append the results from emails to new columns
#print (tree)
#print (buyers)
#for i in tree:
#emails = re.search("a", i)
#print (emails)
#http://stackoverflow.com/a/3655588
#http://scraping.pro/simple-email-crawler-python/
'''
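The commented-out soup.select('a[href^=mailto]') line gestures at the open question above of capturing mailto links whose visible text is not an address. A minimal sketch, assuming a soup parsed with html.parser as in the cells above; stripping the mailto: prefix and any ?subject=... query string is an added detail.
In [ ]:
def mailto_addresses(soup):
    # collect addresses hidden behind mailto: links, even when the anchor text is not an email
    found = set()
    for a in soup.select('a[href^="mailto:"]'):
        addr = a['href'][len('mailto:'):].split('?')[0]
        if addr:
            found.add(addr)
    return found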
In [ ]:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
import urllib.parse
from collections import deque
import re
import pandas as pd
df_urls = pd.read_csv('urls_7.csv')
df_test = df_urls.head(n=2)
for index, row in df_test.iterrows():
    inputurl = row['Links']
    new_urls = deque([inputurl])
    # a set of urls that we have already crawled
    processed_urls = set()
    # a set of crawled emails
    emails = set()
    # process urls one by one until we exhaust the queue
    while len(new_urls):
        # move next url from the queue to the set of processed urls
        url = new_urls.popleft()
        processed_urls.add(url)
        # extract base url to resolve relative links
        parts = urllib.parse.urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        # path is the full path up to the last '/'
        path = url[:url.rfind('/')+1] if '/' in parts.path else url
        # get url's content
        #print("Processing %s" % url)
        try:
            response = requests.get(url)
        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
            # ignore pages with errors
            continue
        # extract all email addresses and add them into the resulting set
        new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
        emails.update(new_emails)
        # create a beautiful soup for the html document
        soup = BeautifulSoup(response.text, 'html.parser')
        # find and process all the anchors in the document
        for anchor in soup.find_all("a"):
            # extract link url from the anchor
            link = anchor.attrs["href"] if "href" in anchor.attrs else ''
            # capture mailto: addresses; resolve relative links; skip links that leave the site
            if link.startswith('mailto'):
                emails.add(link[len('mailto:'):])
                continue
            elif link.startswith('/'):
                link = base_url + link
            elif not link.startswith(inputurl):
                continue
            # add the new url to the queue if it was not enqueued nor processed yet
            if link not in new_urls and link not in processed_urls:
                new_urls.append(link)
    # write every address found for this site into a column next to its URL
    df_test.at[index, 'Email'] = str(emails)
    #df_test.at[index, 'MAILTOS'] = mailtos
    #print(new_urls)
df_test
#print(new_emails)
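The crawl above follows links until the queue empties, with no cap on pages per site and no request timeout, which is one plausible reason for the earlier note that things crash once more than a few rows are let in. A sketch of a bounded version follows; MAX_PAGES and the 10-second timeout are arbitrary choices, not values from this notebook.
In [ ]:
import re
import urllib.parse
from collections import deque
import requests
from bs4 import BeautifulSoup
MAX_PAGES = 25  # arbitrary per-site cap
def crawl_emails(inputurl):
    new_urls = deque([inputurl])
    processed_urls = set()
    emails = set()
    # stop after MAX_PAGES even if the queue still has links
    while new_urls and len(processed_urls) < MAX_PAGES:
        url = new_urls.popleft()
        processed_urls.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            continue  # skip pages that error out or time out
        emails.update(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
        soup = BeautifulSoup(response.text, 'html.parser')
        parts = urllib.parse.urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        for anchor in soup.find_all("a"):
            link = anchor.attrs.get("href", '')
            if link.startswith('/'):
                link = base_url + link
            # stay on the same site and avoid revisiting or re-queueing pages
            if link.startswith(inputurl) and link not in processed_urls and link not in new_urls:
                new_urls.append(link)
    return emails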
In [2]:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
import urllib.parse
import re
import pandas as pd
df_urls = pd.read_csv('urls_7.csv')
df_test = df_urls.head(n=10)
# single-page variant: scrape only each row's landing page, without a crawl queue
for index, row in df_test.iterrows():
    url = row['Links']
    #new_urls = deque([url])
    # a set of crawled emails for this site
    emails = set()
    # extract base url (kept for resolving relative links if a crawl queue is added back)
    parts = urllib.parse.urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind('/')+1] if '/' in parts.path else url
    # get url's content
    #print("Processing %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        # ignore pages with errors
        continue
    # extract all email addresses and add them into the resulting set
    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
    emails.update(new_emails)
    # create a beautiful soup for the html document
    soup = BeautifulSoup(response.text, 'html.parser')
    # find all the anchors and also capture mailto: addresses
    for anchor in soup.find_all("a"):
        # extract link url from the anchor
        link = anchor.attrs["href"] if "href" in anchor.attrs else ''
        if link.startswith('mailto'):
            emails.add(link[len('mailto:'):])
    # write everything found on this page next to its URL
    df_test.at[index, 'Email'] = str(emails)
    #df_test.at[index, 'MAILTOS'] = mailtos
#df_test.to_csv(EXPORT, sep=',', encoding='utf-16')
df_test
Out[2]:
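One way to answer the earlier note about writing the email values into multiple new columns next to the URL: keep the raw set in the dataframe (e.g. df_test.at[index, 'Emails'] = emails instead of str(emails)) and spread it into numbered columns before exporting. The 'Emails' and 'Email_n' column names and the output filename are hypothetical.
In [ ]:
# spread each row's set of addresses into Email_1, Email_2, ... next to its URL, then export
emails_wide = pd.DataFrame(df_test['Emails'].map(sorted).tolist(), index=df_test.index)
emails_wide.columns = ['Email_{}'.format(i + 1) for i in emails_wide.columns]
df_out = pd.concat([df_test[['Links']], emails_wide], axis=1)
df_out.to_csv('emails_out.csv', index=False)  # hypothetical output filename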
In [ ]: