In [1]:
from bs4 import BeautifulSoup as bs

# Header labels whose neighbouring <td> text should be printed.
wanted_headers = ('機關代碼', '機關名稱')

with open('gov/51787884.txt', 'r') as f:
    soup = bs(f.read())

for tr in soup.select('tr'):
    th = tr.select('th')
    if not th:
        # Rows without a header cell carry nothing to look up.
        continue
    if th[0].text.encode('utf-8').strip() in wanted_headers:
        td = tr.select('td')[0].text.strip()
        print(td)
In [2]:
from bs4 import BeautifulSoup as bs

# Header labels to collect; each value starts empty and is filled from the page.
dic = {'機關代碼': '', '機關名稱': '', '單位名稱': '' , '機關地址': '', '聯絡電話': ''}

with open('gov/51787884.txt', 'r') as f:
    soup = bs(f.read())

for tr in soup.select('tr'):
    th = tr.select('th')
    if not th:
        continue
    # Keys in dic are UTF-8 byte strings, so encode the header before lookup.
    th_value = th[0].text.encode('utf-8').strip()
    if th_value not in dic:
        continue
    td = tr.select('td')[0].text.strip()
    dic[th_value] = td
In [4]:
for key in dic:
print key, dic[key]
In [5]:
# Only one header is collected here; its value starts empty.
dic = {'標的分類': ''}

with open('gov/51787884.txt', 'r') as f:
    soup = bs(f.read())

for tr in soup.select('tr'):
    th = tr.select('th')
    if not th:
        continue
    # Keys in dic are UTF-8 byte strings, so encode the header before lookup.
    th_value = th[0].text.encode('utf-8').strip()
    if th_value not in dic:
        continue
    td = tr.select('td')[0].text.strip()
    dic[th_value] = td
In [14]:
for ele in dic:
print ele, repr(dic[ele])
In [16]:
# A string with mixed tabs, spaces and newlines: print() renders the
# whitespace, repr() shows the escape sequences instead.
a = '\t\t \n\n123\n324\t'
print(a)
print(repr(a))
In [12]:
for ele in dic:
print ele, ' '.join(dic[ele].split())
In [10]:
# Demonstrate split() followed by join() with different separators.
a = ' dddddd aaaaaaaaa bbbbbb '
words = a.split()
print(a)
print(words)
print('|'.join(words))
print(''.join(words))
In [19]:
from datetime import date, datetime

# Format the current time at day and at minute resolution; strftime returns
# a plain string, which the last line shows.
currenttime = datetime.now()
day_stamp = currenttime.strftime("%Y-%m-%d")
minute_stamp = currenttime.strftime("%Y-%m-%d %H:%M")
print(day_stamp)
print(minute_stamp)
print(type(minute_stamp))
In [21]:
# Parse a timestamp string back into a datetime, then show how far it lies
# from the current time captured earlier.
a = '2014-05-03 14:00'
parsed = datetime.strptime(a, "%Y-%m-%d %H:%M")
print(parsed)
print(currenttime - parsed)
In [23]:
from datetime import timedelta

# Step backwards from the current time one day at a time.
print(currenttime)
print(currenttime - timedelta(days = 1))
for i in xrange(1,11):
    earlier = currenttime - timedelta(days = i)
    print(earlier.strftime('%Y/%m/%d'))
In [30]:
# The leading year field is offset by 1911 from the Gregorian year; split it
# off with maxsplit=1 so the rest of the timestamp stays in one piece.
response_date = '102/12/10 10:30'
all_parts = response_date.split('/')
print(all_parts)
getyear = response_date.split('/', 1)
print(getyear)
print(getyear[0])
print(int(getyear[0]) + 1911)
print(str(int(getyear[0]) + 1911) + '/' + getyear[1])
In [35]:
import re

# Capture the (optionally signed) digits-and-commas part of an amount, then
# drop the thousands separators before converting to int.
m = re.match( r"\$?(-?[0-9,]+)元", '352,111元')
amount_text = m.group(1)
amount_parts = amount_text.split(',')
print(amount_text)
print(amount_parts)
print(''.join(amount_parts))
print(int(''.join(amount_parts)))
In [47]:
import re

a = 'apple'
print(a == 'apple')

# re.match() only matches at the beginning of the string;
# re.search() scans the string for the first position that matches.

# match letters
print(re.search(r'c', 'apple'))
print(re.search(r'a', 'apple'))
print(re.search(r'[abcdef]', 'apple'))
print(re.search(r'[abcdefghijklmnopqrstuvwxyz]', 'apple'))
print(re.search(r'[a-z]', 'apple'))

# match digits (raw strings keep the backslash escapes intact for the regex engine)
print(re.search(r'[0123456789]', 'apple1'))
print(re.search(r'[0-9]', 'apple1'))
print(re.search(r'\d', 'apple1'))  # \d == [0-9]
print(re.search(r'[0-9a-zA-Z]', 'apple1'))
print(re.search(r'\w', 'apple1'))  # \w == [0-9a-zA-Z_] (note: the underscore is included)
In [54]:
# email
# Each line filters the sample addresses with a progressively more general
# pattern. The dot before "com" is escaped so it matches a literal '.'
# rather than any character.
emails = ['qoo@gmail.com', 'qpp@gmx.com', 'oop@gmail.com', '123@qq.com']
print([email for email in emails if re.search(r'gmail\.com', email)])  # match gmail.com only
print([email for email in emails if re.search(r'g\w{2,3}\.com', email)])  # match gxx.com and gxxx.com
print([email for email in emails if re.search(r'g\w{0,}\.com', email)])  # match gxxxxxxxxxx.com or g.com
print([email for email in emails if re.search(r'g\w*\.com', email)])  # {0,} == *
print([email for email in emails if re.search(r'g\w+\.com', email)])  # {1,} == +
print([email for email in emails if re.search(r'g.+\.com', email)])  # . matches any character except a newline
In [58]:
phones = ['0912345678', '0912-345678', '0912-345-678', '09123456781823791738927398173812783213']

# Patterns are tried in the same order as the original demonstration.
phone_patterns = [
    r'\d{10}',
    r'\d{4}-?\d{6}',            # {0,1} == ?
    r'\d{4}-?\d{3}-?\d{3}',
    r'^\d{4}-?\d{3}-?\d{3}$',   # ^ initial, $ end
]
for pattern in phone_patterns:
    print([phone for phone in phones if re.search(pattern, phone)])
In [65]:
import json
import re

import requests

res = requests.get('http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DGBJAG-A9006UP86-000&store=DGBJAG&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge,isEnergySubsidy&_callback=jsonp_prod?_callback=jsonp_prod')

# The pattern expects the JSON payload wrapped as jsonp_prod(...);}catch(e);
# the capture group is the payload between the parentheses.
m = re.search(r'jsonp_prod\((.+)\);}catch\(e\)', res.text)
if m is None:
    # Fail with a readable message instead of "'NoneType' has no attribute 'group'".
    raise ValueError('jsonp_prod(...) wrapper not found in the response body')

jd = json.loads(m.group(1))
print(jd['DGBJAG-A9006UP86-000']['Price']['P'])
In [68]:
import json

ACCESSTOKEN = '<ACCESSTOKEN>'
fburl ='https://graph.facebook.com/v2.5/me/likes?access_token={}'

# Fetch the endpoint and print the name of every entry under "data".
likes_url = fburl.format(ACCESSTOKEN)
res = requests.get(likes_url)
jd = json.loads(res.text)
for likes in jd['data']:
    print(likes['name'])
In [70]:
import json

ACCESSTOKEN ='<ACCESSTOKEN>'
fburl ='https://graph.facebook.com/v2.5/DreamGirlsPuff/posts?access_token={}'

posts_url = fburl.format(ACCESSTOKEN)
res = requests.get(posts_url)
jd = json.loads(res.text)
for message in jd['data']:
    # Entries without a "message" key are skipped.
    if 'message' not in message:
        continue
    print(message['message'])
In [78]:
import json
import jieba
from collections import Counter

ACCESSTOKEN ='<ACCESSTOKEN>'
fburl ='https://graph.facebook.com/v2.5/tsaiingwen/posts?access_token={}'

posts_url = fburl.format(ACCESSTOKEN)
res = requests.get(posts_url)
jd = json.loads(res.text)

# Tally every token that jieba.cut yields across all post messages.
dic = Counter()
for message in jd['data']:
    if 'message' not in message:
        continue
    dic.update(jieba.cut(message['message']))
In [81]:
for ele in dic.most_common(100):
if len(ele[0]) >= 2:
print ele[0], ele[1]
In [87]:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from datetime import datetime
import os, sys, re
In [100]:
def get_response_element(file_name):
    """Return the <tr> rows of the first ``.tender_table`` element in a saved page.

    Args:
        file_name: Path of the saved HTML file to parse.

    Returns:
        The list of ``tr`` elements selected inside the first element that
        matches the ``.tender_table`` CSS selector.

    Raises:
        IOError/OSError: If the file cannot be opened or read.
        IndexError: If the page has no element matching ``.tender_table``.
    """
    # The context manager closes the file even if read() raises.
    with open(file_name, 'r') as f:
        response_text = f.read()
    soup = BeautifulSoup(response_text)
    tender_table = soup.select(".tender_table")[0]
    return tender_table.select('tr')
In [101]:
def date_conversion(element):
    """Convert a leading ``year/month/day`` string into a ``datetime``.

    The year field is offset by 1911 before conversion (e.g. ``102`` becomes
    2013). Anything after the date, such as a time of day, is ignored.

    Args:
        element: Text that starts with ``<year>/<month>/<day>``.

    Returns:
        A ``datetime`` at midnight of that day, or ``None`` when the text
        does not start with such a date or the date does not exist
        (for example day 30 of month 2).
    """
    m = re.match(r"(\d+)/(\d+)/(\d+)", element)
    if m is None:
        return None
    year, month, day = (int(part) for part in m.groups())
    try:
        return datetime(year + 1911, month, day)
    except ValueError:
        # Out-of-range month/day/year: treat like any other unparseable input.
        return None


print(date_conversion('102/03/09'))
In [102]:
def money_conversion(element):
    """Convert an amount such as ``'$1,234,567元'`` into an ``int``.

    All whitespace is removed first. An optional leading ``$`` is dropped,
    an optional ``-`` sign is kept, and thousands separators are removed.
    Text following the digits (e.g. a currency suffix) is ignored.

    Args:
        element: Text that starts with the amount.

    Returns:
        The amount as an ``int``, or ``None`` when the text does not start
        with a number.
    """
    compact = "".join(element.split())
    m = re.match(r"\$?(-?)([0-9,]+)", compact)
    if m is None:
        return None
    # Use the captured groups rather than group(0): the whole match would
    # still contain the '$' prefix, which int() cannot parse.
    digits = m.group(2).replace(',', '')
    if not digits:
        # The text consisted of separators only, e.g. ','.
        return None
    return int(m.group(1) + digits)


print(money_conversion('111,222元'))
In [103]:
def remove_space(element):
    """Return ``element`` with every whitespace character removed."""
    # split() without arguments discards all whitespace runs, including
    # leading and trailing ones; gluing the pieces back leaves none.
    pieces = element.split()
    return "".join(pieces)


print(remove_space(' d a d '))
In [104]:
# Functions are ordinary values: store them in a dict and call via lookup.
dic = {
    'a': remove_space,
    'b': money_conversion,
}
cleaner = dic['a']
print(cleaner('adf d s'))
In [105]:
# Page header label -> result key used for that field.
name_map = {
    "機關代碼": "entity_code",
    "機關名稱": "procuring_entity",
    "標案案號": "job_number",
    "招標方式": "procurement_type",
    "決標方式": "tender_awarding_type",
    "標案名稱": "subject_of_procurement",
    "決標資料類別": "attr_of_tender_awarding",
    "標的分類": "attr_of_procurement",
    "預算金額": "budget_value",
    "開標時間": "opening_date",
    "決標公告日期": "tender_awarding_announce_date",
    "歸屬計畫類別": "project_type",
    "總決標金額": "total_tender_awarding_value",
    "底價金額": "floor_price_value",
    "決標日期": "tender_awarding_date",
    "pkAtmMain": "pkAtmMain",
}

# Page header label -> converter applied to that field's raw cell text.
tender_awards_map = {
    "機關代碼": remove_space,
    "機關名稱": remove_space,
    "標案案號": remove_space,
    "招標方式": remove_space,
    "決標方式": remove_space,
    "標案名稱": remove_space,
    "決標資料類別": remove_space,
    "標的分類": remove_space,
    "預算金額": money_conversion,
    "開標時間": date_conversion,
    "歸屬計畫類別": remove_space,
    "總決標金額": money_conversion,
    "底價金額": money_conversion,
    "決標日期": date_conversion,
    "決標公告日期": date_conversion,
}
In [110]:
def get_award_info_dic(element):
    """Build a dictionary of converted tender fields from table rows.

    A row contributes a field when it has a header cell whose text is a key
    of ``tender_awards_map`` and the row carries one of the CSS classes
    ``award_table_tr_1``, ``award_table_tr_2`` or ``award_table_tr_6``.

    Args:
        element: Iterable of row elements, as returned by
            ``get_response_element``.

    Returns:
        A dict mapping the ``name_map`` key of each header to the value
        produced by the header's converter in ``tender_awards_map``.
    """
    wanted_classes = ('award_table_tr_1', 'award_table_tr_2', 'award_table_tr_6')
    returned_dic = {}
    for row in element:
        th = row.select('th')
        if not th:
            continue
        th_name = th[0].text.encode('utf-8').strip()
        if th_name not in tender_awards_map:
            continue
        # Each class must be tested on its own: "'a' or 'b' or 'c' in x"
        # evaluates to 'a', which is always true and filters nothing.
        row_classes = row.get('class') or []
        if not any(cls in row_classes for cls in wanted_classes):
            continue
        td = row.select('td')
        if not td:
            # A header without a value cell has nothing to convert.
            continue
        returned_dic[name_map[th_name]] = tender_awards_map[th_name](td[0].text.strip())
    return returned_dic
In [111]:
# Load the rows of one saved tender page and convert them into a field dict.
source_path = 'gov/51787884.txt'
element = get_response_element(source_path)
dic = get_award_info_dic(element)
In [113]:
#print dic
for ele in dic:
print ele, dic[ele]
In [114]:
# -*- coding: utf-8 -*-
import sqlite3

# Create the Tender_awards table in tender.sqlite if it does not exist yet.
db = sqlite3.connect('tender.sqlite')
try:
    cur = db.cursor()
    cur.execute('''CREATE TABLE IF NOT EXISTS Tender_awards(
id INTEGER PRIMARY KEY,
pkAtmMain TEXT,
procuring_entity TEXT,
entity_code TEXT,
attr_of_procurement TEXT,
opening_date DATETIME,
procurement_type TEXT,
tender_awarding_type TEXT,
project_type TEXT,
subject_of_procurement TEXT,
job_number TEXT,
budget_value BIGINTEGER,
attr_of_tender_awarding TEXT,
floor_price_value BIGINTEGER,
tender_awarding_announce_date DATETIME,
tender_awarding_date DATETIME,
total_tender_awarding_value BIGINTEGER
)''')
    db.commit()
finally:
    # Release the database file even when the statement fails.
    db.close()
In [ ]:
# insert into Tender_awards(opening_date, total_tender_awarding_value) values('2016-01-28 00:00:00', '59400000')
# insert into Tender_awards(opening_date, total_tender_awarding_value) values(?, ?)
In [116]:
# keys() returns the dictionary's keys.
a = {'a': 1, 'b': 2, 'c': 3}
key_list = a.keys()
print(key_list)
In [123]:
element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)

# Template for a parameterised INSERT: the first slot takes the column list,
# the second takes one '?' placeholder per column.
statement = ' insert into Tender_awards({}) values({})'

print(dic.keys())
columns = ', '.join(dic.keys())
placeholders = ','.join('?' * len(dic))
print(columns)
print(placeholders)
print(statement.format(columns, placeholders))
In [125]:
import sqlite3

element = get_response_element('gov/51787884.txt')
dic = get_award_info_dic(element)
statement = ' insert into Tender_awards({}) values({})'

# Take one snapshot of the items so column names and bound values are
# guaranteed to be in the same order, and so the parameters are a real list.
fields = list(dic.items())
columns = ', '.join(name for name, _ in fields)
placeholders = ','.join('?' * len(fields))

db = sqlite3.connect('tender.sqlite')
try:
    # With no fields the statement would be "insert into Tender_awards() values()",
    # which is not valid SQL, so there is nothing to insert.
    if fields:
        cur = db.cursor()
        cur.execute(statement.format(columns, placeholders), [value for _, value in fields])
        db.commit()
finally:
    # Release the database file even when the insert fails.
    db.close()
In [127]:
import os
import sqlite3

statement = ' insert into Tender_awards({}) values({})'

# Insert one row per saved page found in the gov/ directory.
db = sqlite3.connect('tender.sqlite')
try:
    cur = db.cursor()
    for f in os.listdir('gov'):
        element = get_response_element(os.path.join('gov', f))
        dic = get_award_info_dic(element)
        if not dic:
            # No recognised fields: an INSERT with an empty column list is invalid SQL.
            continue
        # Snapshot the items once so columns and values stay aligned.
        fields = list(dic.items())
        columns = ', '.join(name for name, _ in fields)
        placeholders = ','.join('?' * len(fields))
        cur.execute(statement.format(columns, placeholders), [value for _, value in fields])
    db.commit()
finally:
    # Release the database file even when a page fails to parse or insert.
    db.close()
In [129]:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import unittest, time, re
# Drive Firefox: open a Google results page, follow the timetable link,
# choose the start and end stations, then submit the search form.
driver = webdriver.Firefox()
driver.implicitly_wait(30)

base_url = "https://www.google.com.tw/"
search_path = "/search?q=%E9%AB%98%E9%90%B5&ie=utf-8&oe=utf-8&gws_rd=cr&ei=rSfgVs64HsbNmwXB_oPQDw"
driver.get(base_url + search_path)

timetable_link = driver.find_element_by_link_text(u"時刻表與票價查詢")
timetable_link.click()

start_station = Select(driver.find_element_by_id("StartStation"))
start_station.select_by_visible_text(u"台北站")

end_station = Select(driver.find_element_by_id("EndStation"))
end_station.select_by_visible_text(u"桃園站")

search_button = driver.find_element_by_css_selector("input.time_search_btn.time_search_btn_tw")
search_button.click()
In [ ]: