In [1]:
from bs4 import BeautifulSoup as bs
with open('gov/51787884.txt','r') as f:
    soup = bs(f.read())
    for tr in soup.select('tr'):
        th = tr.select('th')
        if len(th) > 0 :
            if th[0].text.encode('utf-8').strip() == '機關代碼':
                td = tr.select('td')[0].text.strip()
                print td
            if th[0].text.encode('utf-8').strip() == '機關名稱':
                td = tr.select('td')[0].text.strip()
                print td


3.55
行政院環境保護署
C:\Anaconda2\lib\site-packages\bs4\__init__.py:166: UserWarning: No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.

To get rid of this warning, change this:

 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))

In [2]:
# Generalised version of cell 1: collect several header/value pairs at once.
from bs4 import BeautifulSoup as bs
# Fields to extract: agency code / name, unit name, address, phone.
dic = {'機關代碼': '', '機關名稱': '', '單位名稱': '' , '機關地址': '', '聯絡電話': ''}
with open('gov/51787884.txt','r') as f:
    # Explicit parser avoids bs4's "no parser specified" warning.
    soup = bs(f.read(), 'lxml')
    for tr in soup.select('tr'):
        th = tr.select('th')
        if len(th) > 0:
            th_value = th[0].text.encode('utf-8').strip()
            if th_value in dic:
                # The first <td> in the row holds this header's value.
                td = tr.select('td')[0].text.strip()
                dic[th_value] = td

In [4]:
for key in dic:
    print key, dic[key]


單位名稱 廢棄物管理處
機關名稱 行政院環境保護署
機關代碼 3.55
聯絡電話 (02)23117722 分機 2624  2481
機關地址 100臺北市中正區中華路一段83號

In [5]:
# Re-run the scrape for a single field: the procurement classification.
dic = {'標的分類': ''}
with open('gov/51787884.txt','r') as f:
    # Explicit parser: silences the bs4 warning and pins behaviour.
    soup = bs(f.read(), 'lxml')
    for tr in soup.select('tr'):
        th = tr.select('th')
        if len(th) > 0:
            th_value = th[0].text.encode('utf-8').strip()
            if th_value in dic:
                td = tr.select('td')[0].text.strip()
                dic[th_value] = td

In [14]:
for ele in dic:
    print ele, repr(dic[ele])


標的分類 u'<\u52de\u52d9\u985e>    \r\n                            94\r\n                            \u6c61\u6c34\u53ca\u5783\u573e\u8655\u7406\u3001\u516c\u5171\u885b\u751f\u53ca\u5176\u4ed6\u74b0\u4fdd\u670d\u52d9'

In [16]:
a = '\t\t \n\n123\n324\t'
print a
print repr(a)


		 

123
324	
'\t\t \n\n123\n324\t'

In [12]:
for ele in dic:
    print ele, ' '.join(dic[ele].split())


標的分類 <勞務類> 94 污水及垃圾處理、公共衛生及其他環保服務

In [10]:
a = '         dddddd                  aaaaaaaaa                   bbbbbb   '
print a
print a.split()
print '|'.join(a.split())
print ''.join(a.split())


         dddddd                  aaaaaaaaa                   bbbbbb   
['dddddd', 'aaaaaaaaa', 'bbbbbb']
dddddd|aaaaaaaaa|bbbbbb
ddddddaaaaaaaaabbbbbb

In [19]:
from datetime import date,datetime 
currenttime = datetime.now() 
print currenttime.strftime("%Y-%m-%d") 
print currenttime.strftime("%Y-%m-%d %H:%M") 
print type(currenttime.strftime("%Y-%m-%d %H:%M") )


2016-03-09
2016-03-09 19:08
<type 'str'>

In [21]:
a = '2014-05-03 14:00' 
print datetime.strptime(a, "%Y-%m-%d %H:%M")

print currenttime - datetime.strptime(a, "%Y-%m-%d %H:%M")


2014-05-03 14:00:00
676 days, 5:08:38.958000

In [23]:
from datetime import timedelta
print currenttime 
print currenttime - timedelta(days = 1)

for i in xrange(1,11):
    print (currenttime - timedelta(days = i)).strftime('%Y/%m/%d')


2016-03-09 19:08:38.958000
2016-03-08 19:08:38.958000
2016/03/08
2016/03/07
2016/03/06
2016/03/05
2016/03/04
2016/03/03
2016/03/02
2016/03/01
2016/02/29
2016/02/28

In [30]:
response_date = '102/12/10 10:30' 
print response_date.split('/') 
print response_date.split('/', 1) 
print response_date.split('/', 1)[0] 
print int(response_date.split('/', 1)[0] ) + 1911

getyear = response_date.split('/', 1)
print str(int(getyear[0] ) + 1911 ) + '/' + getyear[1]


['102', '12', '10 10:30']
['102', '12/10 10:30']
102
2013
2013/12/10 10:30

In [35]:
import re 
m = re.match( r"\$?(-?[0-9,]+)元", '352,111元') 
print m.group(1)
print m.group(1).split(',')
print ''.join(m.group(1).split(','))
print int(''.join(m.group(1).split(',')))
#print ''.join(m.group(1).split(','))


352,111
['352', '111']
352111
352111

In [47]:
a = 'apple'
print a == 'apple'
import re
#re.match()
#re.search()
a = 'apple'
#  match word
print re.search('c', 'apple')
print re.search('a', 'apple')
print re.search('[abcdef]', 'apple')
print re.search('[abcdefghijklmnopqrstuvwxyz]', 'apple')
print re.search('[a-z]', 'apple')
# match numeric
print re.search('[0123456789]', 'apple1')
print re.search('[0-9]', 'apple1')
print re.search('\d', 'apple1') # \d == [0-9]
print re.search('[0-9a-zA-Z]', 'apple1')
print re.search('\w', 'apple1') # \w == [0-9a-zA-Z]


True
None
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>
<_sre.SRE_Match object at 0x0000000003F301D0>

In [54]:
#email
emails = ['qoo@gmail.com', 'qpp@gmx.com', 'oop@gmail.com', '123@qq.com']
print [email for email in emails if re.search('gmail.com' , email)] # match gmail.com only
print [email for email in emails if re.search('g\w{2,3}.com' , email)] # match gxx.com and gxxx.com
print [email for email in emails if re.search('g\w{0,}.com' , email)] # match gxxxxxxxxxx.com or g.com
print [email for email in emails if re.search('g\w*.com' , email)] # {0,} == *
print [email for email in emails if re.search('g\w+.com' , email)] # {1,} == *
print [email for email in emails if re.search('g.+\.com' , email)] # . == \w\d!@#@%#$$%&%*&^*


['qoo@gmail.com', 'oop@gmail.com']
['qpp@gmx.com']
['qoo@gmail.com', 'qpp@gmx.com', 'oop@gmail.com']
['qoo@gmail.com', 'qpp@gmx.com', 'oop@gmail.com']
['qoo@gmail.com', 'qpp@gmx.com', 'oop@gmail.com']
['qoo@gmail.com', 'qpp@gmx.com', 'oop@gmail.com']

In [58]:
phones = ['0912345678', '0912-345678', '0912-345-678', '09123456781823791738927398173812783213']
print [phone for phone in phones if re.search('\d{10}' , phone)]  
print [phone for phone in phones if re.search('\d{4}-?\d{6}' , phone)]  # {0,1} == ?
print [phone for phone in phones if re.search('\d{4}-?\d{3}-?\d{3}' , phone)]  
print [phone for phone in phones if re.search('^\d{4}-?\d{3}-?\d{3}$' , phone)]  # ^ initial, $ end


['0912345678', '09123456781823791738927398173812783213']
['0912345678', '0912-345678', '09123456781823791738927398173812783213']
['0912345678', '0912-345678', '0912-345-678', '09123456781823791738927398173812783213']
['0912345678', '0912-345678', '0912-345-678']

In [65]:
import requests
res = requests.get('http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DGBJAG-A9006UP86-000&store=DGBJAG&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge,isEnergySubsidy&_callback=jsonp_prod?_callback=jsonp_prod')
#print res.text
import re
m = re.search('jsonp_prod\((.+)\);}catch\(e\)', res.text)
import json
#print m.group(1) 
jd = json.loads(m.group(1))
print jd['DGBJAG-A9006UP86-000']['Price']['P']


12980

In [68]:
import json
ACCESSTOKEN  = '<ACCESSTOKEN>'
fburl ='https://graph.facebook.com/v2.5/me/likes?access_token={}'
res = requests.get(fburl.format(ACCESSTOKEN))

jd = json.loads(res.text)
for likes in jd['data']:
    print likes['name']


優仕達資訊股份有限公司
空姐報報Emily Post
LaLa徐佳瑩
數位行銷學院 neo Marketing School
KumaWash
非常木蘭
財務數據創新趨勢平台FinDit
廖若妤(廖小若)
產品開發週刊
爆料公社
一起看正妹
福邸 Foodie
GraphicStock
白馬八方尾根スキー場
Taiwan Startup Stadium 台灣新創競技場
Flowing Ocean 孚海國際
vonvon.me
BBC 中文網(繁體)
聞氫哥
東吳巨資學院
Wikimedia Taiwan
Periscope Data
BigData Taiwan
昕創千萬APP大賽
昕電視

In [70]:
import json
ACCESSTOKEN ='<ACCESSTOKEN>'
fburl ='https://graph.facebook.com/v2.5/DreamGirlsPuff/posts?access_token={}'
res = requests.get(fburl.format(ACCESSTOKEN))

jd = json.loads(res.text)
for message in jd['data']:
    if 'message' in message:
        print message['message']


結果是影片😂😂😂
一直以為是拍照😂
☀️☀️☀️
Choc girl
3月恰女生coming soon
今天的動力來源是~希望晚上吃牛排!!!☺️☺️☺️
Sprinkles 好吃!!!
Newport Beach 很舒服!!
Puff Kuo 冬夏混搭 很時尚😂
早安啊!
...Seriously???
看到報導我真的不知道該說什麼,我不是一個人住 車子也不是我一個人的 家裡還有哥哥妹妹們⋯他們也都有自己的生活 如果要因為我 連生活都不能自由了,我想應該沒人想跟我住了⋯
對我來說他們都是很乖的弟弟,我也很高興他們跟哥哥很好 
至於我平常買的東西都滿中性的
可能帶了幾次後會給我哥 那我哥想送給誰 那就是他的自由了 
總而言之 我不希望家人和朋友因為我不能有自己的生活或是因為我被誤會被罵 
我感到很抱歉  連我哥都能被寫成我緋聞的對象了 現在的報導只要是在我生活圈的異性都會被寫成緋聞吧???아  짜증나 好吧!我只好公布我的情人了⋯
hi ...nice to meet you ✋🏻
Happy Valentine's Day 💗
Hi~ it's me~(莎莉:這是圖奇的台詞…
看到暖暖的太陽
大家新年快樂,新的一年祝福大家都健康平安,也希望大家都能持續關心和幫助台南,也辛苦所有搜救人員和給予協助的大家~希望還能有更多的生還者🙏🏻
請大家有錢出錢有力出力,現在台南需要大家的幫助渡過難關~🙏🏻🙏🏻🙏🏻
以下是捐款資訊
大家可於今日下午4點直接至統一超商全台7-ELEVEN ibon便利生活站點選「台南震災(台南市政府)」捐款按鈕,即可現場繳費捐款,捐款帳號:社會救助金專戶009045065055臺灣銀行台南分行,指定0206震災,民眾若有捐款或物資捐贈詢事宜,可撥打(06)2995805或(06)2991111轉8061,也可傳真(06)2995759。
http://www.ettoday.net/news/20160206/645163.htm

捐血中心也嚴重血荒
請大家多多幫忙~
雲嘉南地區10個固定捐血地點,請參考新聞公佈的捐血地點
http://m.life.tw/?app=view&no=389239
這幾天還是好冷 總是離不開被窩⋯整個人懶懶的 希望過年期間出大太陽 想出去曬一曬 把寒氣和懶散都曬掉!!!
#Bella儂儂 2月號 
非常喜歡這次的造型風格
謝謝Bella 每次的拍攝都很愉快
也開始期待下一次的相遇了☺️
不知道大家喜歡嗎?😄
突然想起在國外出太陽時坐草地看書的畫面 覺得好舒服~😁晚安!
編:二月Bella儂儂 封面,敬請期待~

http://youtu.be/TgfAr28H9DM
路跑初體驗~
LOUIS VUITTON FOR unicef

#silverlockit 手鍊 
LV每售出一件 將有200美金捐至
UNICEF聯合國兒童基金會 協助全球
需要幫助的孩童  也可以至下方網站直接進行捐款 希望大家可以一起支持
給予這些孩子一個美好未來
用愛與關懷改變世界
@tiffanywenlo 
#makeapromise    #lvforunicef
 MAKEAPROMISE
了解詳情可以到以下網路
www.louisvuitton.com/lvforunicef
一度咕 。
編:2016年的第一本雜誌,大家都擁有了嗎~1/1已經發行了喔
祝大家 新年快樂

In [78]:
import json
import jieba
from collections import Counter

# Tokenise every post message with jieba and tally term frequencies.
ACCESSTOKEN = '<ACCESSTOKEN>'
fburl = 'https://graph.facebook.com/v2.5/tsaiingwen/posts?access_token={}'
res = requests.get(fburl.format(ACCESSTOKEN))

dic = Counter()
jd = json.loads(res.text)
for post in jd['data']:
    if 'message' in post:
        # Counter.update counts each token from the iterable, same as
        # incrementing dic[token] one at a time.
        dic.update(jieba.cut(post['message']))

In [81]:
for ele in dic.most_common(100):
    if len(ele[0]) >= 2:
        print ele[0], ele[1]


我們 68
台灣 43
產業 25
一個 22
轉型 21
發展 18
大家 17
一起 16
可以 16
政府 15
社會 15
國家 14
推動 13
未來 13
就是 12
環境 12
農業 11
成為 10
新政府 10
二二八 10
透過 10
重要 10
正義 9
努力 9
建立 9
能源 8
和解 8
這個 8
以及 8
團結 8
國際 8
因為 8
goo 7
真相 7
企業 7
土地 7
歷史 7
政治 7
能夠 7
合作 7
新聞稿 7
全文 7
gl 7
永續 7
真正 7
http 7
面對 6
創新 6
氫能 6
人才 6
綠能 6

步驟 0 引用套件


In [87]:
# -*- coding: utf-8 -*- 
from bs4 import BeautifulSoup 
from datetime import datetime 
import os, sys, re

步驟1 建立資料抽出函式


In [100]:
def get_response_element(file_name):
    """Read a cached tender page and return the <tr> rows of its
    .tender_table block.

    file_name -- path to the saved HTML file.
    Returns a list of bs4 <tr> Tag objects.
    Raises IndexError when the page contains no .tender_table element.
    """
    # 'with' guarantees the handle is closed even if parsing raises
    # (the original closed it manually, leaking on error).
    with open(file_name, 'r') as f:
        response_text = f.read()
    # Explicit parser ("lxml") silences bs4's no-parser warning and keeps
    # parsing identical across machines.
    soup = BeautifulSoup(response_text, 'lxml')
    tender_table = soup.select(".tender_table")[0]
    return tender_table.select('tr')

步驟2 建立時間轉換函式


In [101]:
def date_conversion(element):
    """Convert a Minguo (ROC) date string such as '102/03/09' into a datetime.

    element -- text starting with 'yy/mm/dd' (ROC year); any trailing text
               (e.g. a time of day) is ignored.
    Returns a datetime at midnight, or None when the text does not begin
    with a date.
    """
    # Capture the ROC year and the month/day part separately; the original
    # captured one blob and re-split it, which obscured the intent.
    m = re.match(r"(\d+)/(\d+/\d+)", element)
    if m is None:
        return None
    # ROC calendar -> Gregorian: add 1911 to the year.
    year = int(m.group(1)) + 1911
    return datetime.strptime("%d/%s" % (year, m.group(2)), "%Y/%m/%d")
print date_conversion('102/03/09')


2013-03-09 00:00:00

步驟3 建立金額轉換函式


In [102]:
def money_conversion(element):
    """Parse a money string such as '111,222元' or '$3,000' into an int.

    element -- text whose leading token is an optional '$', an optional
               '-', then comma-grouped digits; all whitespace is ignored
               and trailing text (e.g. '元') is dropped.
    Returns the integer amount (negative when a '-' sign is present).
    Raises AttributeError when no leading number is found.
    """
    compact = "".join(element.split())
    # Keep the sign inside the capture group but leave '$' outside:
    # the original used group(0), so a leading '$' reached int() and
    # raised ValueError.
    m = re.match(r"\$?(-?[0-9,]+)", compact)
    # Commas are only thousands separators — strip them before converting.
    return int(m.group(1).replace(',', ''))
print money_conversion('111,222元')


111222

步驟4 建立移除空白函式


In [103]:
def remove_space(element):
    """Return element with every whitespace character removed."""
    # split() breaks on any whitespace run; joining on '' discards them all.
    parts = element.split()
    return ''.join(parts)
print remove_space('      d                a              d       ')


dad

In [104]:
dic = {'a':remove_space, 'b':money_conversion}
print dic['a']('adf          d                s')


adfds

步驟5 建立轉換字典


In [105]:
# Chinese table header -> English database column name.
name_map = {
    "機關代碼": "entity_code",
    "機關名稱": "procuring_entity",
    "標案案號": "job_number",
    "招標方式": "procurement_type",
    "決標方式": "tender_awarding_type",
    "標案名稱": "subject_of_procurement",
    "決標資料類別": "attr_of_tender_awarding",
    "標的分類": "attr_of_procurement",
    "預算金額": "budget_value",
    "開標時間": "opening_date",
    "決標公告日期": "tender_awarding_announce_date",
    "歸屬計畫類別": "project_type",
    "總決標金額": "total_tender_awarding_value",
    "底價金額": "floor_price_value",
    "決標日期": "tender_awarding_date",
    "pkAtmMain": "pkAtmMain",
}

# Chinese header -> converter applied to its <td> text. Built by grouping
# the fields per converter instead of repeating the function name 15 times.
tender_awards_map = {}
for field in ("機關代碼", "機關名稱", "標案案號", "招標方式", "決標方式",
              "標案名稱", "決標資料類別", "標的分類", "歸屬計畫類別"):
    tender_awards_map[field] = remove_space
for field in ("預算金額", "總決標金額", "底價金額"):
    tender_awards_map[field] = money_conversion
for field in ("開標時間", "決標日期", "決標公告日期"):
    tender_awards_map[field] = date_conversion

步驟6 整理機關、已公告、決標資料


In [110]:
def get_award_info_dic(element):
    """Extract award fields from the tender-table rows.

    element -- iterable of bs4 <tr> Tags (see get_response_element).
    Returns {english_column_name: converted_value} for every header found
    in tender_awards_map on a row carrying one of the award-table classes.
    """
    wanted_classes = ('award_table_tr_1', 'award_table_tr_2', 'award_table_tr_6')
    returned_dic = {}
    for row in element:
        th = row.select('th')
        if len(th) > 0:
            th_name = th[0].text.encode('utf-8').strip()
            # BUG FIX: the original test was
            #   ('award_table_tr_1' or 'award_table_tr_2' or 'award_table_tr_6' in row.get('class'))
            # which short-circuits to the truthy string 'award_table_tr_1'
            # and was therefore ALWAYS true — the class filter never ran.
            # Check real membership instead; 'or []' guards rows that carry
            # no class attribute (row.get('class') is None then).
            # NOTE(review): if a previously-captured field disappears after
            # this fix, its row lives in a class not listed above — extend
            # wanted_classes accordingly.
            row_classes = row.get('class') or []
            if th_name in tender_awards_map and \
                    any(cls in row_classes for cls in wanted_classes):
                returned_dic[name_map[th_name]] = \
                    tender_awards_map[th_name](row.select('td')[0].text.strip())
    return returned_dic

In [111]:
# Parse the sample page end-to-end: table rows -> award-info dict.
element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)

In [113]:
#print dic
for ele in dic:
    print ele, dic[ele]


opening_date 2016-01-28 00:00:00
total_tender_awarding_value 5940000
tender_awarding_date 2016-02-26 00:00:00
entity_code 3.55
procurement_type 限制性招標(經公開評選或公開徵求)
procuring_entity 行政院環境保護署
attr_of_procurement <勞務類>94污水及垃圾處理、公共衛生及其他環保服務
tender_awarding_announce_date 2016-03-01 00:00:00
subject_of_procurement ​推動低碳垃圾清運及協助汰換清運車輛專案計畫
project_type 非屬愛台十二項計畫
attr_of_tender_awarding 決標公告
budget_value 6000000
floor_price_value 5940000
job_number 105A076
tender_awarding_type 準用最有利標

In [114]:
# -*- coding: utf-8 -*- 
import sqlite3

db = sqlite3.connect('tender.sqlite') 
cur = db.cursor()

cur.execute('''CREATE TABLE IF NOT EXISTS Tender_awards(
                id INTEGER PRIMARY KEY,
                pkAtmMain TEXT,
                procuring_entity TEXT,
                entity_code TEXT,
                attr_of_procurement TEXT,
                opening_date DATETIME,
                procurement_type TEXT,
                tender_awarding_type TEXT,
                project_type TEXT,
                subject_of_procurement TEXT,
                job_number TEXT,
                budget_value BIGINTEGER,
                attr_of_tender_awarding TEXT,
                floor_price_value BIGINTEGER,
                tender_awarding_announce_date DATETIME,
                tender_awarding_date DATETIME,
                total_tender_awarding_value BIGINTEGER
                )''')
db.commit()
db.close()

In [ ]:
# insert into Tender_awards(opening_date, total_tender_awarding_value) values('2016-01-28 00:00:00', '59400000')
# insert into Tender_awards(opening_date, total_tender_awarding_value) values(?, ?)

In [116]:
a = {'a':1, 'b':2, 'c':3}
print a.keys()


['a', 'c', 'b']

In [123]:
element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)
statement = ' insert into Tender_awards({}) vakues({})'
print dic.keys()
print ', '.join(dic.keys())
print ','.join('?' * len(dic.keys()))
columns = ', '.join(dic.keys())
placeholders = ','.join('?' * len(dic.keys()))
print statement.format(columns, placeholders)


['opening_date', 'total_tender_awarding_value', 'tender_awarding_date', 'entity_code', 'procurement_type', 'procuring_entity', 'attr_of_procurement', 'tender_awarding_announce_date', 'subject_of_procurement', 'project_type', 'attr_of_tender_awarding', 'budget_value', 'floor_price_value', 'job_number', 'tender_awarding_type']
opening_date, total_tender_awarding_value, tender_awarding_date, entity_code, procurement_type, procuring_entity, attr_of_procurement, tender_awarding_announce_date, subject_of_procurement, project_type, attr_of_tender_awarding, budget_value, floor_price_value, job_number, tender_awarding_type
?,?,?,?,?,?,?,?,?,?,?,?,?,?,?
 insert into Tender_awards(opening_date, total_tender_awarding_value, tender_awarding_date, entity_code, procurement_type, procuring_entity, attr_of_procurement, tender_awarding_announce_date, subject_of_procurement, project_type, attr_of_tender_awarding, budget_value, floor_price_value, job_number, tender_awarding_type) vakues(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)

In [125]:
# Insert one page's record using '?' placeholders (safe parameter binding —
# never interpolate the values into the SQL string itself).
element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)
statement = ' insert into Tender_awards({}) values({})'
columns = ', '.join(dic.keys())
placeholders = ','.join('?' * len(dic.keys()))

import sqlite3

conn = sqlite3.connect('tender.sqlite')
cursor = conn.cursor()
# keys()/values() iterate in the same order for an unmodified dict,
# so the columns line up with the bound values.
cursor.execute(statement.format(columns, placeholders), dic.values())
conn.commit()
conn.close()

In [127]:
import os

# Load every cached page under gov/ and insert its award record.
statement = ' insert into Tender_awards({}) values({})'
db = sqlite3.connect('tender.sqlite')
cur = db.cursor()
try:
    for f in os.listdir('gov'):
        element = get_response_element(os.path.join('gov', f))
        dic = get_award_info_dic(element)
        columns = ', '.join(dic.keys())
        placeholders = ','.join('?' * len(dic.keys()))
        cur.execute(statement.format(columns, placeholders), dic.values())
        # Commit per file so a crash part-way through keeps earlier rows.
        db.commit()
finally:
    # Always release the connection, even when one page fails to parse
    # (the original leaked it on any exception in the loop).
    db.close()

In [129]:
# -*- coding: utf-8 -*-
# Selenium demo: drive Firefox from a Google search result to the THSR
# timetable page and submit a Taipei -> Taoyuan timetable/fare query.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re


# Launch a real Firefox window; implicitly_wait makes every element lookup
# poll for up to 30 s before failing (pages load asynchronously).
driver = webdriver.Firefox()
driver.implicitly_wait(30)
base_url = "https://www.google.com.tw/"
        
# Open a pre-built Google search for 高鐵 (the q= parameter is URL-encoded).
driver.get(base_url + "/search?q=%E9%AB%98%E9%90%B5&ie=utf-8&oe=utf-8&gws_rd=cr&ei=rSfgVs64HsbNmwXB_oPQDw")
# Follow the "timetable and fares" result link.
driver.find_element_by_link_text(u"時刻表與票價查詢").click()
# Pick departure and arrival stations from the <select> dropdowns.
Select(driver.find_element_by_id("StartStation")).select_by_visible_text(u"台北站")
Select(driver.find_element_by_id("EndStation")).select_by_visible_text(u"桃園站")
# Submit the search form.
driver.find_element_by_css_selector("input.time_search_btn.time_search_btn_tw").click()

In [ ]: