In [1]:
    
from bs4 import BeautifulSoup as bs
with open('gov/51787884.txt','r') as f:
    soup = bs(f.read())
    for tr in soup.select('tr'):
        th = tr.select('th')
        if len(th) > 0 :
            if th[0].text.encode('utf-8').strip() == '機關代碼':
                td = tr.select('td')[0].text.strip()
                print td
            if th[0].text.encode('utf-8').strip() == '機關名稱':
                td = tr.select('td')[0].text.strip()
                print td
    
    
    
In [2]:
    
from bs4 import BeautifulSoup as bs
# Labels to look for; each value is filled with the first <td> text of the matching row.
dic = {'機關代碼': '', '機關名稱': '', '單位名稱': '' , '機關地址': '', '聯絡電話': ''}
with open('gov/51787884.txt','r') as f:
    soup = bs(f.read())
# The parsed tree no longer needs the file, so the scan runs after it is closed.
for tr in soup.select('tr'):
    th = tr.select('th')
    if not th:
        continue
    th_value = th[0].text.encode('utf-8').strip()
    if th_value not in dic:
        continue
    td = tr.select('td')[0].text.strip()
    dic[th_value] = td
    
In [4]:
    
for key in dic:
    print key, dic[key]
    
    
In [5]:
    
# Same row scan as before, this time for a single label.
dic = {'標的分類': ''}
with open('gov/51787884.txt','r') as f:
    html = f.read()
soup = bs(html)
for tr in soup.select('tr'):
    th = tr.select('th')
    if th:
        th_value = th[0].text.encode('utf-8').strip()
        if th_value in dic:
            td = tr.select('td')[0].text.strip()
            dic[th_value] = td
    
In [14]:
    
for ele in dic:
    print ele, repr(dic[ele])
    
    
In [16]:
    
a = '\t\t \n\n123\n324\t'
print a
print repr(a)
    
    
In [12]:
    
for ele in dic:
    print ele, ' '.join(dic[ele].split())
    
    
In [10]:
    
a = '         dddddd                  aaaaaaaaa                   bbbbbb   '
print a
print a.split()
print '|'.join(a.split())
print ''.join(a.split())
    
    
In [19]:
    
from datetime import date,datetime 
currenttime = datetime.now() 
print currenttime.strftime("%Y-%m-%d") 
print currenttime.strftime("%Y-%m-%d %H:%M") 
print type(currenttime.strftime("%Y-%m-%d %H:%M") )
    
    
In [21]:
    
a = '2014-05-03 14:00' 
print datetime.strptime(a, "%Y-%m-%d %H:%M")
print currenttime - datetime.strptime(a, "%Y-%m-%d %H:%M")
    
    
In [23]:
    
from datetime import timedelta
print currenttime 
print currenttime - timedelta(days = 1)
for i in xrange(1,11):
    print (currenttime - timedelta(days = i)).strftime('%Y/%m/%d')
    
    
In [30]:
    
response_date = '102/12/10 10:30' 
print response_date.split('/') 
print response_date.split('/', 1) 
print response_date.split('/', 1)[0] 
print int(response_date.split('/', 1)[0] ) + 1911
getyear = response_date.split('/', 1)
print str(int(getyear[0] ) + 1911 ) + '/' + getyear[1]
    
    
In [35]:
    
import re 
m = re.match( r"\$?(-?[0-9,]+)元", '352,111元') 
print m.group(1)
print m.group(1).split(',')
print ''.join(m.group(1).split(','))
print int(''.join(m.group(1).split(',')))
#print ''.join(m.group(1).split(','))
    
    
In [47]:
    
a = 'apple'
print a == 'apple'
import re
#re.match()
#re.search()
a = 'apple'
#  match word
print re.search('c', 'apple')
print re.search('a', 'apple')
print re.search('[abcdef]', 'apple')
print re.search('[abcdefghijklmnopqrstuvwxyz]', 'apple')
print re.search('[a-z]', 'apple')
# match numeric
print re.search('[0123456789]', 'apple1')
print re.search('[0-9]', 'apple1')
print re.search('\d', 'apple1') # \d == [0-9]
print re.search('[0-9a-zA-Z]', 'apple1')
print re.search('\w', 'apple1') # \w == [0-9a-zA-Z]
    
    
In [54]:
    
#email
emails = ['qoo@gmail.com', 'qpp@gmx.com', 'oop@gmail.com', '123@qq.com']
print [email for email in emails if re.search('gmail.com' , email)] # match gmail.com only
print [email for email in emails if re.search('g\w{2,3}.com' , email)] # match gxx.com and gxxx.com
print [email for email in emails if re.search('g\w{0,}.com' , email)] # match gxxxxxxxxxx.com or g.com
print [email for email in emails if re.search('g\w*.com' , email)] # {0,} == *
print [email for email in emails if re.search('g\w+.com' , email)] # {1,} == *
print [email for email in emails if re.search('g.+\.com' , email)] # . == \w\d!@#@%#$$%&%*&^*
    
    
In [58]:
    
phones = ['0912345678', '0912-345678', '0912-345-678', '09123456781823791738927398173812783213']
print [phone for phone in phones if re.search('\d{10}' , phone)]  
print [phone for phone in phones if re.search('\d{4}-?\d{6}' , phone)]  # {0,1} == ?
print [phone for phone in phones if re.search('\d{4}-?\d{3}-?\d{3}' , phone)]  
print [phone for phone in phones if re.search('^\d{4}-?\d{3}-?\d{3}$' , phone)]  # ^ initial, $ end
    
    
In [65]:
    
import requests
res = requests.get('http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DGBJAG-A9006UP86-000&store=DGBJAG&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge,isEnergySubsidy&_callback=jsonp_prod?_callback=jsonp_prod')
#print res.text
import re
m = re.search('jsonp_prod\((.+)\);}catch\(e\)', res.text)
import json
#print m.group(1) 
jd = json.loads(m.group(1))
print jd['DGBJAG-A9006UP86-000']['Price']['P']
    
    
In [68]:
    
import json
ACCESSTOKEN  = '<ACCESSTOKEN>'
fburl ='https://graph.facebook.com/v2.5/me/likes?access_token={}'
res = requests.get(fburl.format(ACCESSTOKEN))
jd = json.loads(res.text)
for likes in jd['data']:
    print likes['name']
    
    
In [70]:
    
import json
ACCESSTOKEN ='<ACCESSTOKEN>'
fburl ='https://graph.facebook.com/v2.5/DreamGirlsPuff/posts?access_token={}'
res = requests.get(fburl.format(ACCESSTOKEN))
jd = json.loads(res.text)
for message in jd['data']:
    if 'message' in message:
        print message['message']
    
    
In [78]:
    
import json
import os
import jieba
from collections import Counter
# Read the token from the FB_ACCESS_TOKEN environment variable so it is never saved
# with the notebook; the original placeholder stays as the fallback value.
ACCESSTOKEN = os.environ.get('FB_ACCESS_TOKEN', '<ACCESSTOKEN>')
fburl ='https://graph.facebook.com/v2.5/tsaiingwen/posts?access_token={}'
res = requests.get(fburl.format(ACCESSTOKEN))
# Surface HTTP errors here rather than as a KeyError on 'data' below.
res.raise_for_status()
dic = Counter()
jd = json.loads(res.text)
for message in jd['data']:
    if 'message' in message:
        # Counter.update() counts every token yielded by jieba.cut() in one call.
        dic.update(jieba.cut(message['message']))
    
In [81]:
    
for ele in dic.most_common(100):
    if len(ele[0]) >= 2:
        print ele[0], ele[1]
    
    
In [87]:
    
# -*- coding: utf-8 -*- 
from bs4 import BeautifulSoup 
from datetime import datetime 
import os, sys, re
    
In [100]:
    
def get_response_element(file_name):
    """Return the <tr> rows of the first ``.tender_table`` element in a saved page.

    Args:
        file_name: path of the saved HTML file to parse.

    Returns:
        The result of ``select('tr')`` on the first element matching the
        ``.tender_table`` CSS selector.

    Raises:
        IOError: if the file cannot be opened or read.
        IndexError: if the document contains no ``.tender_table`` element.
    """
    # The with-block closes the file even when read() raises.
    with open(file_name, 'r') as f:
        response_text = f.read()
    soup = BeautifulSoup(response_text)
    tender_table = soup.select(".tender_table")[0]
    return tender_table.select('tr')
    
In [101]:
    
def date_conversion(element):
    """Turn a leading 'year/month/day' string into a datetime, adding 1911 to the year.

    Only the start of ``element`` is examined, so trailing text such as a time
    of day is ignored. Example: '102/03/09' -> datetime(2013, 3, 9).

    Returns:
        A ``datetime``, or ``None`` when ``element`` does not start with
        digits/digits/digits.

    Raises:
        ValueError: from ``strptime`` when the digits do not form a valid date.
    """
    matched = re.match(r"(\d+/\d+/\d+)", element)
    if matched is None:
        return None
    year_text, month_day = matched.group(1).split('/', 1)
    if year_text == '':
        return None
    shifted = str(int(year_text) + 1911) + "/" + month_day.split()[0]
    return datetime.strptime(shifted, "%Y/%m/%d")
# Quick check of date_conversion on a sample 'year/month/day' string.
print date_conversion('102/03/09')
    
    
In [102]:
    
def money_conversion(element):
    """Convert an amount string such as '111,222元' or '$1,234' to an int.

    All whitespace is removed first. An optional leading '$' is skipped, an
    optional '-' sign is kept, and ',' separators are dropped. Anything after
    the numeric prefix (for example a currency suffix) is ignored.

    Args:
        element: text holding the amount.

    Returns:
        The amount as an ``int``, or ``None`` when ``element`` does not start
        with an amount (the same "no value" convention date_conversion uses).
    """
    compact = "".join(element.split())
    # The sign is inside the capture group and '$' outside it, so group(1) is
    # always something int() can parse once the commas are removed.
    m = re.match(r"\$?(-?[0-9,]+)", compact)
    if m is None:
        return None
    digits = m.group(1).replace(',', '')
    if digits in ('', '-'):
        # The match consisted only of separators, e.g. ',' or '-,'.
        return None
    return int(digits)
# Quick check of money_conversion on a comma-separated amount with a suffix.
print money_conversion('111,222元')
    
    
In [103]:
    
def remove_space(element):
    """Return ``element`` with every whitespace character removed."""
    # split() with no argument discards all whitespace runs; gluing the pieces
    # back with an empty separator leaves only the non-whitespace characters.
    pieces = element.split()
    return "".join(pieces)
# Quick check of remove_space on a string padded with runs of spaces.
print remove_space('      d                a              d       ')
    
    
In [104]:
    
dic = {'a':remove_space, 'b':money_conversion}
print dic['a']('adf          d                s')
    
    
In [105]:
    
# Maps each field label found in the page to the database column name it is stored under.
name_map = {
    "機關代碼": "entity_code",
    "機關名稱": "procuring_entity",
    "標案案號": "job_number",
    "招標方式": "procurement_type",
    "決標方式": "tender_awarding_type",
    "標案名稱": "subject_of_procurement",
    "決標資料類別": "attr_of_tender_awarding",
    "標的分類": "attr_of_procurement",
    "預算金額": "budget_value",
    "開標時間": "opening_date",
    "決標公告日期": "tender_awarding_announce_date",
    "歸屬計畫類別": "project_type",
    "總決標金額": "total_tender_awarding_value",
    "底價金額": "floor_price_value",
    "決標日期": "tender_awarding_date",
    "pkAtmMain": "pkAtmMain",
}
# Maps each field label to the converter applied to its raw cell text:
# plain text -> remove_space, amounts -> money_conversion, dates -> date_conversion.
tender_awards_map = {
    "機關代碼": remove_space,
    "機關名稱": remove_space,
    "標案案號": remove_space,
    "招標方式": remove_space,
    "決標方式": remove_space,
    "標案名稱": remove_space,
    "決標資料類別": remove_space,
    "標的分類": remove_space,
    "預算金額": money_conversion,
    "開標時間": date_conversion,
    "歸屬計畫類別": remove_space,
    "總決標金額": money_conversion,
    "底價金額": money_conversion,
    "決標日期": date_conversion,
    "決標公告日期": date_conversion,
}
    
In [110]:
    
def get_award_info_dic(element):
    """Build a {column_name: converted_value} dict from table rows.

    A row contributes an entry when all of the following hold:
      * it has at least one <th>;
      * its class attribute contains 'award_table_tr_1', 'award_table_tr_2'
        or 'award_table_tr_6';
      * the UTF-8 encoded, stripped text of its first <th> is a key of
        ``tender_awards_map``.
    The value is the stripped text of the row's first <td>, passed through the
    converter registered in ``tender_awards_map``; the key is the column name
    from ``name_map``.

    Args:
        element: iterable of row elements, e.g. the result of
            get_response_element().

    Returns:
        dict mapping column names to converted values. If several rows share a
        label, the last one wins.
    """
    wanted_classes = ('award_table_tr_1', 'award_table_tr_2', 'award_table_tr_6')
    returned_dic = {}
    for row in element:
        th = row.select('th')
        if not th:
            continue
        th_name = th[0].text.encode('utf-8').strip()
        if th_name not in tender_awards_map:
            continue
        # The previous test, ('a' or 'b' or 'c' in classes), always evaluated to
        # the truthy string 'a', so the class filter never ran. Each class is
        # now checked; a row with no class attribute counts as no match.
        row_classes = row.get('class') or []
        if not any(cls in row_classes for cls in wanted_classes):
            continue
        raw_value = row.select('td')[0].text.strip()
        returned_dic[name_map[th_name]] = tender_awards_map[th_name](raw_value)
    return returned_dic
    
In [111]:
    
# Parse one saved page: fetch its table rows, then convert them into a column -> value dict.
element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)
    
In [113]:
    
#print dic
for ele in dic:
    print ele, dic[ele]
    
    
In [114]:
    
# -*- coding: utf-8 -*- 
import sqlite3
# Create the Tender_awards table in tender.sqlite if it does not exist yet.
db = sqlite3.connect('tender.sqlite')
try:
    cur = db.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS Tender_awards(
                id INTEGER PRIMARY KEY,
                pkAtmMain TEXT,
                procuring_entity TEXT,
                entity_code TEXT,
                attr_of_procurement TEXT,
                opening_date DATETIME,
                procurement_type TEXT,
                tender_awarding_type TEXT,
                project_type TEXT,
                subject_of_procurement TEXT,
                job_number TEXT,
                budget_value BIGINTEGER,
                attr_of_tender_awarding TEXT,
                floor_price_value BIGINTEGER,
                tender_awarding_announce_date DATETIME,
                tender_awarding_date DATETIME,
                total_tender_awarding_value BIGINTEGER
                )''')
    db.commit()
finally:
    # Close the connection even when execute() or commit() raises.
    db.close()
    
In [ ]:
    
# insert into Tender_awards(opening_date, total_tender_awarding_value) values('2016-01-28 00:00:00', '59400000')
# insert into Tender_awards(opening_date, total_tender_awarding_value) values(?, ?)
    
In [116]:
    
a = {'a':1, 'b':2, 'c':3}
print a.keys()
    
    
In [123]:
    
element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)
statement = ' insert into Tender_awards({}) vakues({})'
print dic.keys()
print ', '.join(dic.keys())
print ','.join('?' * len(dic.keys()))
columns = ', '.join(dic.keys())
placeholders = ','.join('?' * len(dic.keys()))
print statement.format(columns, placeholders)
    
    
In [125]:
    
import sqlite3
# Parse one saved page and insert it as a single row of Tender_awards.
element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)
statement = ' insert into Tender_awards({}) values({})'
# Column names come from the dict keys; the values are bound through '?'
# placeholders, never formatted into the SQL text.
columns = ', '.join(dic.keys())
placeholders = ','.join('?' * len(dic))
db = sqlite3.connect('tender.sqlite')
try:
    cur = db.cursor()
    # keys() and values() of an unmodified dict iterate in matching order;
    # list() hands sqlite3 a real sequence.
    cur.execute(statement.format(columns, placeholders), list(dic.values()))
    db.commit()
finally:
    # Close the connection even when the insert raises.
    db.close()
    
In [127]:
    
import os
# Parse every file under gov/ and insert one Tender_awards row per file.
# NOTE: re-running this cell inserts the same rows again; the table has no
# uniqueness constraint on anything but id.
statement = ' insert into Tender_awards({}) values({})'
db = sqlite3.connect('tender.sqlite')
try:
    cur = db.cursor()
    # Sorted so the insertion order is the same on every run.
    for f in sorted(os.listdir('gov')):
        path = os.path.join('gov', f)
        if not os.path.isfile(path):
            # Skip sub-directories that may sit next to the saved pages.
            continue
        element = get_response_element(path)
        dic = get_award_info_dic(element)
        if not dic:
            # An INSERT with an empty column list is a syntax error.
            continue
        columns = ', '.join(dic.keys())
        placeholders = ','.join('?' * len(dic))
        cur.execute(statement.format(columns, placeholders), list(dic.values()))
        # Commit per file so rows already inserted survive a later failure.
        db.commit()
finally:
    # Close the connection even when parsing or inserting raises.
    db.close()
    
In [129]:
    
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
# Drive Firefox: run the search, follow the timetable link, choose the start
# and end stations, then submit the query form.
driver = webdriver.Firefox()
# Wait up to 30 seconds for elements to appear before a lookup fails.
driver.implicitly_wait(30)
base_url = "https://www.google.com.tw/"

# base_url already ends with "/", so strip it before appending the path;
# otherwise the request goes to ".../​/search" with a doubled slash.
driver.get(base_url.rstrip("/") + "/search?q=%E9%AB%98%E9%90%B5&ie=utf-8&oe=utf-8&gws_rd=cr&ei=rSfgVs64HsbNmwXB_oPQDw")
driver.find_element_by_link_text(u"時刻表與票價查詢").click()
Select(driver.find_element_by_id("StartStation")).select_by_visible_text(u"台北站")
Select(driver.find_element_by_id("EndStation")).select_by_visible_text(u"桃園站")
driver.find_element_by_css_selector("input.time_search_btn.time_search_btn_tw").click()
    
In [ ]: