In [ ]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup
df_urls = pd.read_csv('urls_1.csv')
#df_test = df_urls.iloc[0:10, :]
df_test = df_urls.loc[[536,115], :]
#df_test = df_urls.head(n=2)
df_test
def getEmail(x):
    # fetch the page, parse it, and pull out anything that looks like an email address
    page = requests.get(str(x))
    soup = BeautifulSoup(page.content, 'html.parser')
    emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))
    #mailtos = soup.select('a[href^=mailto]')
    # return both the addresses and the parsed soup so the caller can follow links
    return [emails, soup]
for index, row in df_test.iterrows():
    x = row['URLS']
    emails, soup = getEmail(x)
    print(emails)
    #if emails == set():
        #print("EMAIL EQUALS SET")
        # if the page itself has no addresses, follow its "Contact" link and scrape that page instead
        #for elem in soup(text=re.compile(r'Contact')):
            #parent = elem.parent
            #y = parent.get('href')
            #print(y)
            #emails = getEmail(y)[0]
    #df_test.at[index, 'EMAIL'] = emails
    #df_test.at[index, 'MAILTOS'] = mailtos
df_test
#for index, row in df_test.iterrows():
    #x = row['URLS']
    #getEmail(x)
    #if emails == "" and mailtos == "":
    #this seems to crash when letting more than a few rows into the df
    #print(emails)
    #print(index)
    #print(row)
    #df_test.at[index, 'EMAIL'] = emails
    #df_test.at[index, 'MAILTOS'] = mailtos
#How to write the email list values into multiple new columns next to the URL? (a sketch of one approach follows the last cell)
#if df_test['EMAIL'] == " ":
    #df_test.at[index, 'EMAIL'] = 'NO EMAIL'
#for elem in soup(text=re.compile(r'Contact')):
    #parent = elem.parent
    #x = parent.get('href')
#for link in soup.find_all('a'):
#for link in item.find_all('a'):
#FIND A WAY TO ONLY DO 'a' where the link text matches "contact", "about us", etc. (a sketch of one approach follows this cell)
    #print(link.get('href'))
#pass that href back into the email scraper
#repeat the analysis
#http://stackoverflow.com/a/14470573
#page = requests.get('https://northstarfund.org/')
#soup = BeautifulSoup(page.content, 'html.parser')
#print (soup)
#page = requests.get(str(row['URLS']))
#df_test
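A minimal sketch of the idea in the notes above: when the landing page yields no addresses, follow anchors whose visible text looks like "contact" or "about" and scrape those pages too. It assumes the getEmail helper defined in the cell above; the keyword list and the urljoin-based handling of relative hrefs are added here, not taken from this notebook.
In [ ]:
import re
from urllib.parse import urljoin
# hypothetical keyword list; adjust to taste
CONTACT_WORDS = re.compile(r'contact|about', re.I)
def getEmailWithContactFallback(url):
    # scrape the landing page first, using getEmail() from the cell above
    emails, soup = getEmail(url)
    if not emails:
        # nothing found: try pages linked as "Contact", "About us", etc.
        for anchor in soup.find_all('a'):
            href = anchor.get('href')
            if href and CONTACT_WORDS.search(anchor.get_text()):
                emails |= getEmail(urljoin(url, href))[0]
    return emails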
In [ ]:
'''
# WORKING
import re
import requests
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import os, json
from lxml import html
df_urls = pd.read_csv('urls_1.csv')
#for row in df_urls:
#make this iterate through the rows in df_urls and append email addresses in an adjacent column. If multiple email addresses on page, add multiple columns?
page = requests.get('http://dusp.mit.edu/department/contact-directions')
tree = html.fromstring(page.text)
soup = BeautifulSoup(page.content, 'html.parser')
#mailtos = soup.select('a[href^=mailto]')
# a set of crawled emails
emails = set()
new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", soup.text, re.I))
#how to include mailto links (with non-email text) as another thing to capture?
#print (mailtos)
#print (other)
print (new_emails)
#soup.body.findAll(text=re.compile('^@$'))
#for elem in soup(text=re.compile(r'@')):
#print (elem.parent)
#http://stackoverflow.com/a/866050
#soup = BeautifulSoup(page.content, 'html.parser')
#para = soup.find_all('p')
#print (para)
#print(soup.prettify())
#append the results from emails to new columns
#print (tree)
#print (buyers)
#for i in tree:
#emails = re.search("a", i)
#print (emails)
#http://stackoverflow.com/a/3655588
#http://scraping.pro/simple-email-crawler-python/
'''
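The commented-out soup.select('a[href^=mailto]') line gestures at the open question above of capturing mailto links whose visible text is not an address. A minimal sketch, assuming a soup parsed with html.parser as in the cells above; stripping the mailto: prefix and any ?subject=... query string is an added detail.
In [ ]:
def mailto_addresses(soup):
    # collect addresses hidden behind mailto: links, even when the anchor text is not an email
    found = set()
    for a in soup.select('a[href^="mailto:"]'):
        addr = a['href'][len('mailto:'):].split('?')[0]
        if addr:
            found.add(addr)
    return found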
In [ ]:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
import urllib.parse
from collections import deque
import re
import pandas as pd
df_urls = pd.read_csv('urls_7.csv')
df_test = df_urls.head(n=2)
for index, row in df_test.iterrows():
    inputurl = row['Links']
    new_urls = deque([inputurl])
    # a set of urls that we have already crawled
    processed_urls = set()
    # a set of crawled emails
    emails = set()
    # process urls one by one until we exhaust the queue
    while len(new_urls):
        # move next url from the queue to the set of processed urls
        url = new_urls.popleft()
        processed_urls.add(url)
        # extract base url to resolve relative links
        parts = urllib.parse.urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        # path is the full path up to the last '/'
        path = url[:url.rfind('/')+1] if '/' in parts.path else url
        # get url's content
        #print("Processing %s" % url)
        try:
            response = requests.get(url)
        except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
            # ignore pages with errors
            continue
        # extract all email addresses and add them into the resulting set
        new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
        emails.update(new_emails)
        # create a beautiful soup for the html document
        soup = BeautifulSoup(response.text, 'html.parser')
        # find and process all the anchors in the document
        for anchor in soup.find_all("a"):
            # extract link url from the anchor
            link = anchor.attrs["href"] if "href" in anchor.attrs else ''
            # capture mailto: addresses; resolve relative links; skip links that leave the site
            if link.startswith('mailto'):
                emails.add(link[len('mailto:'):])
                continue
            elif link.startswith('/'):
                link = base_url + link
            elif not link.startswith(inputurl):
                continue
            # add the new url to the queue if it was not enqueued nor processed yet
            if link not in new_urls and link not in processed_urls:
                new_urls.append(link)
    # write every address found for this site into a column next to its URL
    df_test.at[index, 'Email'] = str(emails)
    #df_test.at[index, 'MAILTOS'] = mailtos
    #print(new_urls)
df_test
#print(new_emails)
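The crawl above follows links until the queue empties, with no cap on pages per site and no request timeout, which is one plausible reason for the earlier note that things crash once more than a few rows are let in. A sketch of a bounded version follows; MAX_PAGES and the 10-second timeout are arbitrary choices, not values from this notebook.
In [ ]:
import re
import urllib.parse
from collections import deque
import requests
from bs4 import BeautifulSoup
MAX_PAGES = 25  # arbitrary per-site cap
def crawl_emails(inputurl):
    new_urls = deque([inputurl])
    processed_urls = set()
    emails = set()
    # stop after MAX_PAGES even if the queue still has links
    while new_urls and len(processed_urls) < MAX_PAGES:
        url = new_urls.popleft()
        processed_urls.add(url)
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException:
            continue  # skip pages that error out or time out
        emails.update(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
        soup = BeautifulSoup(response.text, 'html.parser')
        parts = urllib.parse.urlsplit(url)
        base_url = "{0.scheme}://{0.netloc}".format(parts)
        for anchor in soup.find_all("a"):
            link = anchor.attrs.get("href", '')
            if link.startswith('/'):
                link = base_url + link
            # stay on the same site and avoid revisiting or re-queueing pages
            if link.startswith(inputurl) and link not in processed_urls and link not in new_urls:
                new_urls.append(link)
    return emails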
In [2]:
from bs4 import BeautifulSoup
import requests
import requests.exceptions
import urllib.parse
import re
import pandas as pd
df_urls = pd.read_csv('urls_7.csv')
df_test = df_urls.head(n=10)
# single-page variant: scrape only each row's landing page, without a crawl queue
for index, row in df_test.iterrows():
    url = row['Links']
    #new_urls = deque([url])
    # a set of crawled emails for this site
    emails = set()
    # extract base url (kept for resolving relative links if a crawl queue is added back)
    parts = urllib.parse.urlsplit(url)
    base_url = "{0.scheme}://{0.netloc}".format(parts)
    path = url[:url.rfind('/')+1] if '/' in parts.path else url
    # get url's content
    #print("Processing %s" % url)
    try:
        response = requests.get(url)
    except (requests.exceptions.MissingSchema, requests.exceptions.ConnectionError):
        # ignore pages with errors
        continue
    # extract all email addresses and add them into the resulting set
    new_emails = set(re.findall(r"[a-z0-9\.\-+_]+@[a-z0-9\.\-+_]+\.[a-z]+", response.text, re.I))
    emails.update(new_emails)
    # create a beautiful soup for the html document
    soup = BeautifulSoup(response.text, 'html.parser')
    # find all the anchors and also capture mailto: addresses
    for anchor in soup.find_all("a"):
        # extract link url from the anchor
        link = anchor.attrs["href"] if "href" in anchor.attrs else ''
        if link.startswith('mailto'):
            emails.add(link[len('mailto:'):])
    # write everything found on this page next to its URL
    df_test.at[index, 'Email'] = str(emails)
    #df_test.at[index, 'MAILTOS'] = mailtos
#df_test.to_csv(EXPORT, sep=',', encoding='utf-16')
df_test
Out[2]:
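One way to answer the earlier note about writing the email values into multiple new columns next to the URL: keep the raw set in the dataframe (e.g. df_test.at[index, 'Emails'] = emails instead of str(emails)) and spread it into numbered columns before exporting. The 'Emails' and 'Email_n' column names and the output filename are hypothetical.
In [ ]:
# spread each row's set of addresses into Email_1, Email_2, ... next to its URL, then export
emails_wide = pd.DataFrame(df_test['Emails'].map(sorted).tolist(), index=df_test.index)
emails_wide.columns = ['Email_{}'.format(i + 1) for i in emails_wide.columns]
df_out = pd.concat([df_test[['Links']], emails_wide], axis=1)
df_out.to_csv('emails_out.csv', index=False)  # hypothetical output filename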
In [ ]: