In [4]:
import feedparser
import smtplib
from bs4 import BeautifulSoup
import re
import glob
import os
import pandas as pd
from urllib2 import urlopen,Request  # for Python 3: from urllib.request import urlopen
import difflib
import time
import datetime
from apscheduler.schedulers.blocking import BlockingScheduler
import yaml
from sqlalchemy import create_engine, select
from sqlalchemy import MetaData, Column, Table, ForeignKey
from sqlalchemy import Integer, String, DateTime, Boolean

In [38]:
with open('../config.yaml') as f:
    cf = yaml.safe_load(f)

In [26]:
def load_craigs_page(entry):
    URL = entry['link']
    link_name = entry['id']
    exist_files = os.listdir(directory)

    duplicate = False
    for exist_file in exist_files:
        if exist_file in link_name:
            duplicate = True     
    if not duplicate:
        headers = {'User-Agent' : 'Mozilla/4.0 (compatible; MSIE 5.5; WindowsNT)',
        'Accept' :'text/xml,application/xml,application/xhtml+xml,text/html;q=0.9,text/plain;q=0.8,image/png,*/*;q=0.5',
        'Accept-Language' : 'fr-fr,en-us;q=0.7,en;q=0.3',
        'Accept-Charset' : 'ISO-8859-1,utf-8;q=0.7,*;q=0.7'
        }
        print 'Added post ' + entry['id']
        req = Request(URL,None,headers)
        response = urlopen(req).read()
        post = BeautifulSoup(response)
        post_body_str = str(post.find(id='postingbody'))

        post_id_str = post.find('p','postinginfo',text=re.compile('post id'))
        post_id = str([int(s) for s in post_id_str.text.split() if s.isdigit()][0])
        f = open(directory+post_id+'.html','w')
        f.write(response)
        f.close()
        time.sleep(15)
    
def parse_hourly_wage(post):
    comp_div = post['soup'].find('section','userbody').find('div','bigattr')
    post_string = str(post['soup'].find(id='postingbody'))
    
    if comp_div:
       hr_pay = comp_div.text[len('compensation:')+1:]
    elif '/hr' in post_string:
        pay_index = post_string.index('/hr')
        hr_pay = post_string[pay_index-5:pay_index]
        pay_regex = re.findall(r'[-+]?[0-9]*\.?[0-9]+.',hr_pay)
        if len(pay_regex) > 0:
            hr_pay = pay_regex[0].strip()
    elif 'per hour' in post_string:
        pay_index = post_string.index('per hour')
        hr_pay = post_string[pay_index-5:pay_index]
    elif '$' in post_string:
        pay_index = post_string.index('$')
        hr_pay = post_string[pay_index:pay_index+8]
        hr_pay=''
    else:
        hr_pay = 'n/a'
    
    if hr_pay is not 'n/a':
        hr_pay = hr_pay.replace(',','')
        pay_regex = re.findall(r'[-+]?[0-9]*\.?[0-9]+.',hr_pay)
        if len(pay_regex) > 0:
            hr_pay = pay_regex[0].strip()
            hr_pay = hr_pay.replace('/','')
            hr_pay = hr_pay.replace('-','')
            hr_pay = hr_pay.replace('+','')
            if hr_pay.isdigit() and float(hr_pay)>25:
                hr_pay = 'n/a'
    return hr_pay

def parse_reply_options(post,post_id):
    reply_options_str = []
    if post.find('span','replybelow'):
        reply_options_str.append('reply below')
    elif post.find(id='replylink'):
        URL = 'http://philadelphia.craigslist.org/reply/phi/lab/'+post_id
        html = urlopen(URL).read()
        post_reply = BeautifulSoup(html)
        reply_options = post_reply.find('div','reply_options').find('ul').find_all('li')
        for option in reply_options:
            reply_options_str.append(option.text)
    return reply_options_str

def parse_phone_number(post):
    phone_numbers = re.findall(r'[0-9]{3}[.,-][0-9]{3}[.,-][0-9]{4}',post['body'])
    if len(phone_numbers)>0:
        return phone_numbers
    return 'n/a'

def parse_address(post):
    if post['soup'].find('div','mapaddress'):
        return post['soup'].find('div','mapaddress').text.rstrip()
    return 'n/a'

def form_text(post):
    pay = ''
    address = ''
    phone_number = ''
    if post['pay'] != 'n/a':
        pay_regex = re.findall(r'[-+]?[0-9]*\.?[0-9]+.',post['pay'])
        if len(pay_regex) > 0:
            pay = '$'+post['pay']+'/hr;'
    if post['address'] != 'n/a':
        address = post['address']+';'
    if post['phone_number'] != 'n/a':
        phone_number = post['phone_number'][0]
    text = phone_number+';'+pay+address+post['title']
    return text[:160]

def create_df(directory):
    posts={}
    for filename in glob.glob(directory+'*'):
        f = open(filename,'r')
        soup = BeautifulSoup(f.read())
        post_id = filename[len(directory):filename.index('.html')]
        
        post ={}
        post['post_id']=post_id
        post['title']=soup.find('title').text
        post['body']=str(soup.find(id='postingbody'))
        post['date_posted']=parse_date_posted(soup)
        post['soup']=soup
        post['pay']=parse_hourly_wage(post)
        post['address']=parse_address(post)
        post['phone_number']=parse_phone_number(post)
        post['text'] = form_text(post)
        post['sent']=False
        posts[post_id]=post
    df = pd.DataFrame(posts).T
    return df

def parse_date_posted(soup):
    return str(soup.find(id='display-date').text).replace('Posted: ','')
   

def get_valid_texts(df):
    df_new = df[df['phone_number'] != 'n/a']
    for row in df_new.iterrows():
        body_check = row[1]['body']
        if len([True for body in df_new['body'] if difflib.SequenceMatcher(None, body_check, body).ratio() > .25])>1:
            df_new=df_new[df_new.index != row[0]]
    return df_new

def read_rss_and_load(search_type,search_term,directory):
    clist_rss = 'http://philadelphia.craigslist.org/search/'+search_type+'?query='+search_term+'&s=0&format=rss'
    feed = feedparser.parse(clist_rss)
    link_name=feed['entries'][0].id
    file_name=link_name[link_name.rfind('/')+1:]

    if not os.path.exists(directory):
        os.makedirs(directory)
    for entry in feed['entries']:
       load_craigs_page(entry)

In [4]:
#search_term='cook&20philadelphia'
#search_type='fbh'
search_term='warehouse&20philadelphia'
search_type='lab'
directory = '../data/'+search_term+'/'

In [32]:
search={}
search['lab']=[]
search['lab'].append('warehouse&20philadelphia')
search['fbh']=[]
search['fbh'].append('dishwasher&20philadelphia')
search['fbh'].append('cook&20philadelphia')

In [42]:
print val


dishwasher&20philadelphia
cook&20philadelphia
warehouse&20philadelphia

In [11]:
#read_rss_and_load(search_type,search_term,directory)

In [27]:
df = create_df(directory)
df_valid = get_valid_texts(df)

In [28]:
df_valid


Out[28]:
address body date_posted pay phone_number post_id sent soup text title
4665470743 n/a <section id="postingbody">\n HIRING CUL... 2014-09-12 8:21pm n/a [215-952-5277] 4665470743 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-952-5277;HIRING CULINARY PROFESSIONALS at ... HIRING CULINARY PROFESSIONALS at the WELLS FAR...
4671228013 1223 w lincoln hwy <section id="postingbody">\n Part Time ... 2014-09-16 4:04pm n/a [215-757-1834] 4671228013 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-757-1834;1223 w lincoln hwy;part time ware... part time warehouse help
4672396387 n/a <section id="postingbody">\n LOOKING TO... 2014-09-17 11:58am STARTING AT MIN. WAGE [215-203-8706] 4672396387 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-203-8706;3 DIFFERENT JOB POSITIONS AVAILAB... 3 DIFFERENT JOB POSITIONS AVAILABLE AT MATTRES...
4679769756 n/a <section id="postingbody">\n Our food d... 2014-09-22 9:18am 15 [856-546-8787] 4679769756 False <!DOCTYPE html> <html class="no-js"> <head> <t... 856-546-8787;$15/hr;Warehouse Order Selector Warehouse Order Selector
4680170391 n/a <section id="postingbody">\n Keystone W... 2014-09-22 12:59pm Varies with position and skill level [610.277.8367, 610.277.8367, 215.875.2870, 610... 4680170391 False <!DOCTYPE html> <html class="no-js"> <head> <t... 610.277.8367;General Warehouse General Warehouse
4684929796 n/a <section id="postingbody">\n Forklift ,... 2014-09-25 11:26am n/a [609-712-8326, 267-467-0279] 4684929796 False <!DOCTYPE html> <html class="no-js"> <head> <t... 609-712-8326;Warehouse on New Jersey Warehouse on New Jersey
4694412061 n/a <section id="postingbody">\n Our client... 2014-10-01 12:53pm 9.50 [610-277-8367, 610.277.8367, 215.875.2870] 4694412061 False <!DOCTYPE html> <html class="no-js"> <head> <t... 610-277-8367;$9.50/hr;MATERIAL SCREEN PRINTER ... MATERIAL SCREEN PRINTER - 2ND SHIFT
4695611160 n/a <section id="postingbody">\n Best Perso... 2014-10-02 8:35am 7.50 [215-732-3100] 4695611160 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-732-3100;$7.50/hr;Warehouse Workers Warehouse Workers
4695618342 n/a <section id="postingbody">\n Seeking lo... 2014-10-02 8:43am 7.25 [215-218-9300] 4695618342 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-218-9300;$7.25/hr;Production/Packaging Production/Packaging
4696743011 n/a <section id="postingbody">\n Title: Fie... 2014-10-02 8:20pm 14.50 [877-858-2090] 4696743011 False <!DOCTYPE html> <html class="no-js"> <head> <t... 877-858-2090;$14.50/hr;Field Service Technicia... Field Service Technician (Durable Medical Equi...
4701790638 6240 Bristol Pike <section id="postingbody">\n<div>\n<p><a href=... 2014-10-06 12:10pm Competitive w/ Benefits [206.442.9000] 4701790638 False <!DOCTYPE html> <html class="no-js"> <head> <t... 206.442.9000;6240 Bristol Pike;Yard / Mechanic... Yard / Mechanic Assistant - Construction and H...
4704937147 n/a <section id="postingbody">\n Warehouse/... 2014-10-08 11:13am *** [215-688-5500] 4704937147 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-688-5500;WAREHOUSE/DRIVER WAREHOUSE/DRIVER
4705054470 3003 Mount Carmel Ave <section id="postingbody">\n Local heat... 2014-10-08 12:14pm Pay based on experience [215-663-1060] 4705054470 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-663-1060;3003 Mount Carmel Ave;Delivery Tr... Delivery Truck Driver & Warehouse Worker
4705149155 n/a <section id="postingbody">\n We are loo... 2014-10-08 1:03pm 12.00 [610.430.8490, 610.430.7909] 4705149155 False <!DOCTYPE html> <html class="no-js"> <head> <t... 610.430.8490;$12.00/hr;Warehouse - Shipping/Re... Warehouse - Shipping/Receiving/Inventory
4712930589 n/a <section id="postingbody">\n J &amp; J ... 2014-10-13 3:55pm 9.00 [215-773-9773] 4712930589 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-773-9773;$9.00/hr;Warehouse Jobs / Open House Warehouse Jobs / Open House
4716432334 n/a <section id="postingbody">\n Are you ti... 2014-10-15 6:21pm Competitive price list available [216-313-6298] 4716432334 False <!DOCTYPE html> <html class="no-js"> <head> <t... 216-313-6298;Window Installers Window Installers
4742511345 n/a <section id="postingbody">\n Opening fo... 2014-11-02 9:06am Salary depends on experience [610-275-7280] 4742511345 False <!DOCTYPE html> <html class="no-js"> <head> <t... 610-275-7280;Carpet Installer Carpet Installer
4744111419 n/a <section id="postingbody">\n Delivery p... 2014-11-03 11:31am n/a [877-581-0555] 4744111419 False <!DOCTYPE html> <html class="no-js"> <head> <t... 877-581-0555;Delivery Personnel Needed Earn $ ... Delivery Personnel Needed Earn $ Today!
4747756678 n/a <section id="postingbody">\n Pep Boys i... 2014-11-05 3:23pm 10.50 [845-469-6124] 4747756678 False <!DOCTYPE html> <html class="no-js"> <head> <t... 845-469-6124;$10.50/hr;Pep Boys is seeking a P... Pep Boys is seeking a PT inventory auditor for...
4749388886 303 Airport Rd <section id="postingbody">\n ... 2014-11-06 4:01pm 401k [215-345-4040] 4749388886 False <!DOCTYPE html> <html class="no-js"> <head> <t... 215-345-4040;$401k/hr;303 Airport Rd;Stock Person Stock Person
4749513267 n/a <section id="postingbody">\n Warehouse... 2014-11-06 5:16pm 8.25 [609-385-1432] 4749513267 False <!DOCTYPE html> <html class="no-js"> <head> <t... 609-385-1432;$8.25/hr;Warehouse Workers ""All ... Warehouse Workers ""All Shifts""

In [10]:
latest_id = df_valid['post_id'][len(df_valid)-1]

In [11]:
df.ix[latest_id]['body']


Out[11]:
'<section id="postingbody">\n        Are you tired of chasing your windows, your allied products and then your money?<br/>\nCome work with a company that cares about you and yours, We stock everything you need to install our jobs in one location, Simply come to our warehouse each morning, pick up your job and all related materials and go to the job site. No more running around! Our average job size is 10 to 20 windows. No more small jobs! We pay very competitive rates.We are hiring right now, we have work right now, we pay every week, we have work all year.  You need to have your own truck, tools, equipment, 5 years experience, liability insurance, workers comp and crew.  What could possibly stop you from calling me?216-313-6298, if I don\'t answer, leave me a message.<br/>\n</section>'

In [31]:
def send_text(from_addrs,username,password,to_addrs,msg):

    # The actual mail send
    server = smtplib.SMTP('smtp.gmail.com:587')
    server.starttls()
    server.login(username,password)
    server.sendmail(from_addrs, to_addrs, msg)
    server.quit()
    
toaddrs  = cf['toaddrs_demo']
msg = 'Hey Steve!'
send_mail(toaddrs,msg)