In [4]:
import requests
proxies = {
"http": "69.30.209.16:8000"
}
res = requests.get("http://www.python.org", proxies=proxies)
print res.text
In [1]:
# Read the saved page into memory; `with` closes the file even if read() fails.
with open('gov/51099604_1026604961.txt', 'r') as f:
    response_text = f.read()
In [2]:
from bs4 import BeautifulSoup
# Name the parser explicitly so the result does not depend on which optional
# parsers happen to be installed (and matches the cell that saved the page).
res = BeautifulSoup(response_text, "html.parser")
#print res.text
In [3]:
#for th in res.select('th'):
#print th
#print th.text.strip()
In [13]:
for tr in res.select('tr'):
th = tr.select('th')
if len(th) > 0:
#print tr.select('th')[0].text.strip()
if tr.select('th')[0].text.strip().encode('utf-8') == '機關代碼':
#print tr
print tr.select('td')[0].text.strip()
In [5]:
import requests
# Fetch the comic page with a plain HTTP GET and dump the returned HTML.
res = requests.get('http://www.comicvip.com/show/cool-103.html?ch=800')
print res.text
In [16]:
# -*- coding: utf-8 -*-
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import Select
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoAlertPresentException
import time, re
from bs4 import BeautifulSoup
import shutil
# Start a Firefox instance controlled through Selenium.
driver = webdriver.Firefox()
# Implicit wait of 30 seconds for element lookups.
driver.implicitly_wait(30)
# Page the download starts from.
base_url = "http://www.comicvip.com/show/cool-103.html?ch=800"
# save comic function
def savecomic(soup):
    """Download every image in ``soup`` whose ``src`` contains 'jpg'.

    Each image is streamed into the current working directory under the
    last path component of its URL.

    soup -- parsed page (BeautifulSoup) whose <img> tags are scanned.
    """
    # Imported here so the function does not depend on another cell having
    # imported requests first.
    import requests
    for img in soup.select('img'):
        comic_link = img.get('src')
        # An <img> without a src yields None; skip it instead of raising
        # TypeError on the substring test.
        if not comic_link or 'jpg' not in comic_link:
            continue
        res = requests.get(comic_link, stream=True)
        # `with` closes the output file even if the copy fails part-way.
        with open(comic_link.split('/')[-1], 'wb') as f:
            shutil.copyfileobj(res.raw, f)
driver.get(base_url)
soup = BeautifulSoup(driver.page_source, "html.parser")
savecomic(soup)
try:
    while True:
        driver.implicitly_wait(1)
        driver.find_element_by_css_selector("#next > b > font").click()
        soup = BeautifulSoup(driver.page_source, "html.parser")
        savecomic(soup)
except NoSuchElementException:
    # No "next" control on the page any more - presumably the last page was
    # reached (TODO confirm against the site), so stop paging.
    pass
finally:
    # Close driver, even when paging stops because of an error
    driver.close()
#driver.find_element_by_css_selector("#next > b > font").click()
In [ ]:
from seleniumrequests import Firefox
from bs4 import BeautifulSoup
webdriver = Firefox()
response = webdriver.request('GET', 'http://www.comicvip.com/show/cool-103.html?ch=800')
soup = BeautifulSoup(response.text)
for img in soup.select('img'):
print img
In [8]:
import requests
import json
res = requests.get('https://api.muzikair.com/v3/air/play/px2gg8?page=webpage_all&keywords=&count=2&lang=tw')
#print res.text
jd = json.loads(res.text)
playlink = jd['data']['url']['track']
print playlink
res2 = requests.get(playlink, stream=True)
import shutil
f = open('x.mp3', 'wb')
shutil.copyfileobj(res2.raw, f)
f.close()
In [13]:
import os

import requests
from bs4 import BeautifulSoup

# Make sure the output directory exists so the cell works on a fresh checkout.
if not os.path.isdir('gov'):
    os.makedirs('gov')
res = requests.get('http://web.pcc.gov.tw/tps/main/pms/tps/atm/atmAwardAction.do?newEdit=false&searchMode=common&method=inquiryForPublic&pkAtmMain=51099604&tenderCaseNo=1026604961')
soup = BeautifulSoup(res.text, "html.parser")
# Extract the printable area before opening the file, so a failed lookup
# does not leave a truncated file behind.
printarea = soup.select('#printArea')[0].prettify('utf-8')
with open('gov/51099604_1026604961.txt', 'w') as f:
    f.write(printarea)
# Read the saved copy back; later cells parse `response_text`.
with open('gov/51099604_1026604961.txt', 'r') as f:
    response_text = f.read()
In [22]:
soup2 = BeautifulSoup(response_text)
for tr in soup2.select('tr'):
if len(tr.select('th')) > 0:
if tr.select('th')[0].text.strip().encode('utf-8') == '機關代碼':
#print tr.select('th')[0].text.strip().encode('utf-8')
print tr.select('td')[0].text.strip().encode('utf-8')
In [ ]:
<tr class="award_table_tr_1">
<th valign="middle" bgcolor="#DAEBED" class="T11b" width="200" align="left">機關代碼</th>
<td bgcolor="#EFF1F1" class="newstop" height="25">
3.13.30.20
</td>
</tr>
In [23]:
# Two adjacent string literals are concatenated into a single string.
# NOTE(review): the second literal may be pasted output rather than intended input - confirm.
s = " \t string test \t \n\n\r" ' \tstring test \t \n\n\r'
print s
In [24]:
# strip() removes whitespace from both ends of the string.
print s.strip()
In [25]:
# rstrip() removes trailing whitespace only.
print s.rstrip()
In [26]:
# lstrip() removes leading whitespace only.
print s.lstrip()
In [28]:
soup2 = BeautifulSoup(response_text)
for tr in soup2.select('tr'):
if len(tr.select('th')) > 0:
if tr.select('th')[0].text.strip().encode('utf-8') == '機關代碼':
print tr.select('td')[0].text.strip().encode('utf-8')
if tr.select('th')[0].text.strip().encode('utf-8') == '機關名稱':
print tr.select('td')[0].text.strip().encode('utf-8')
if tr.select('th')[0].text.strip().encode('utf-8') == '單位名稱':
print tr.select('td')[0].text.strip().encode('utf-8')
In [29]:
# A dictionary literal whose values are an int, a string and a float.
dic = {'a':100, 'b':"yes", 'c':0.98}
print dic
In [30]:
# keys() returns the dictionary's keys
print dic.keys()
# values() returns the dictionary's values
print dic.values()
In [31]:
# Look a key up by indexing
print dic['a']
# Look the same key up through get()
print dic.get('a')
In [32]:
#get value of given key
print dic['d']
In [36]:
# get() with one argument: no exception is raised when the key is missing
print dic.get('d')
# get() with a second argument supplies the fallback value
print dic.get('d', 'qoo')
# same call for key 'a'
print dic.get('a', 'qoo')
In [39]:
# Assigning to a new key adds the entry.
dic['d'] = 'qoo'
print dic
# update() copies every entry of another dictionary into dic.
qoo = {'e':1., 'f':99}
dic.update(qoo)
In [40]:
# Show the current contents of dic.
print dic
In [44]:
dic = {"機關代碼":"","機關名稱":"","機關地址":"", "聯絡人":""}
soup2 = BeautifulSoup(response_text)
for tr in soup2.select('tr'):
if len(tr.select('th')) > 0:
if tr.select('th')[0].text.strip().encode('utf-8') in dic:
print tr.select('th')[0].text.strip().encode('utf-8'),
print tr.select('td')[0].text.strip().encode('utf-8')
In [45]:
# Keys are the headers to collect; values are filled in from the page.
dic = {"機關代碼":"","機關名稱":"","機關地址":"", "聯絡人":""}
# Explicit parser: consistent with the cell that saved the page.
soup2 = BeautifulSoup(response_text, "html.parser")
for tr in soup2.select('tr'):
    headers = tr.select('th')
    if len(headers) > 0:
        # Compute the header once instead of re-selecting it for every use.
        header = headers[0].text.strip().encode('utf-8')
        if header in dic:
            dic[header] = tr.select('td')[0].text.strip().encode('utf-8')
In [49]:
print dic
for key in dic:
print key, dic[key]
In [59]:
dic = {"標的分類":""}
soup2 = BeautifulSoup(response_text)
for tr in soup2.select('tr'):
if len(tr.select('th')) > 0:
if tr.select('th')[0].text.strip().encode('utf-8') in dic:
print tr.select('td')[0].text.strip().encode('utf-8')
In [60]:
# A string with tabs and a newline around the text.
a ='\t\t string \n\t'
print a
In [61]:
# repr() shows the escape sequences instead of rendering them.
print repr(a)
In [62]:
dic = {"標的分類":""}
soup2 = BeautifulSoup(response_text)
for tr in soup2.select('tr'):
if len(tr.select('th')) > 0:
if tr.select('th')[0].text.strip().encode('utf-8') in dic:
print repr(tr.select('td')[0].text.strip().encode('utf-8'))
In [67]:
a = '123,456,789,qoo'
# Without a limit, split at every comma.
print a.split(',')
# The second argument caps the number of splits performed.
print a.split(',',1)
print a.split(',',2)
In [74]:
c = ['1','2','3','4']
# join() glues the list items together with the separator string.
print '|'.join(c)
a = '123 21333333333333333333333 999'
# split() without arguments splits on whitespace.
print a.split()
# Splitting and re-joining rewrites the separators between the pieces.
print ' '.join(a.split())
print '|'.join(a.split())
In [76]:
dic = {"標的分類":""}
soup2 = BeautifulSoup(response_text)
for tr in soup2.select('tr'):
if len(tr.select('th')) > 0:
if tr.select('th')[0].text.strip().encode('utf-8') in dic:
print ' '.join(tr.select('td')[0].text.strip().encode('utf-8').split())
In [83]:
from datetime import date,datetime
# Current local time as a datetime object.
currenttime = datetime.now()
print type(currenttime)
# strftime() formats a datetime as a string.
print type(currenttime.strftime("%Y-%m-%d"))
print currenttime.strftime("%Y-%m-%d")
print currenttime.strftime("%Y-%m-%d %H:%M:%S")
a = '2014-05-03 14:00'
# strptime() parses a string into a datetime using the given format.
print datetime.strptime(a, "%Y-%m-%d %H:%M")
print type(datetime.strptime(a, "%Y-%m-%d %H:%M"))
#t = '102/11/05 10:00'
In [90]:
t = '102/11/05 10:00'
# Split off the leading year field only; the rest of the string stays intact.
getyear = t.split('/', 1)
# Adding 1911 turns the leading number into a four-digit year
# (presumably an ROC/Minguo calendar year - confirm).
print int(getyear[0]) + 1911
adtime = '/'.join([str(int(getyear[0]) + 1911), getyear[1]])
print datetime.strptime(adtime, '%Y/%m/%d %H:%M')
print type(datetime.strptime(adtime, '%Y/%m/%d %H:%M'))
In [91]:
import re
# Capture the digits-and-commas amount in front of 元, then drop the commas.
m = re.match( r"([0-9,]+)元", '6,288,452元')
print ''.join(m.group(1).split(','))
In [108]:
email1 = 'david@iii.com'
import re
m = re.match('(\w+)@([\w\.]+)', email1)
#print m.group(1)
#print m.group(2)
email_list = ['david@iii.com', 'qoo@oop.com', '12313213$999.com']
for email in email_list:
m = re.match('(\w+)@([\w\.]+)', email)
if m:
print "username:", m.group(1)
phone = '0912345678'
m = re.match('\d{10}', phone)
print m
phone_list = ['0912345678', '0912-345-678', '0912-345678', '10238018290829085024']
for phone in phone_list:
m = re.match('(\d+)', phone)
if m:
print "phone:", m.group(1)
print '============================'
for phone in phone_list:
m = re.match('(\d{4}-\d+)', phone)
if m:
print "phone:", m.group(1)
print '============================'
for phone in phone_list:
m = re.match('(\d{4}-{0,1}\d+)', phone)
if m:
print "phone:", m.group(1)
print '============================'
for phone in phone_list:
m = re.match('(\d{4}-?\d+)', phone)
if m:
print "phone:", m.group(1)
print '============================'
for phone in phone_list:
m = re.match('(\d{4}-?\d{3}-?\d{3})', phone)
if m:
print "phone:", m.group(1)
print '============================'
for phone in phone_list:
m = re.match('(^\d{4}-?\d{3}-?\d{3}$)', phone)
if m:
print "phone:", m.group(1)
In [125]:
import requests
res = requests.get('http://ecapi.pchome.com.tw/ecshop/prodapi/v2/prod/DGBHAB-19006DLL1&fields=Seq,Id,Name,Nick,Store,PreOrdDate,SpeOrdDate,Price,Discount,Pic,Weight,ISBN,Qty,Bonus,isBig,isSpec,isCombine,isDiy,isRecyclable,isCarrier,isMedical,isBigCart,isSnapUp,isDescAndIntroSync,isFoodContents,isHuge&_callback=jsonpcb_prodecshop?_callback=jsonpcb_prodecshop')
#print res.text
import re
m = re.match('.+"P":(\d+)},.*',res.text)
#print m.group(1)
import re
m = re.search('"P":(\d+)},',res.text)
#print m.group(1)
import re
m = re.search('jsonpcb_prodecshop\((.*?)\);', res.text)
#print m.group(1)
import json
jd = json.loads(m.group(1))
print jd['DGBHAB-19006DLL1-000']['Price']['P']
In [128]:
a = '我從口袋中撿到200元 超爽der 他只剩30元'
import re
m = re.search('(\d+)元', a)
print m.group(1)
import re
m = re.findall('(\d+)元', a)
print m
In [133]:
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup
from datetime import datetime
import os, sys, re
def get_response_element (file_name):
    """Return the <tr> rows of the tender table stored in ``file_name``.

    file_name -- path of a saved HTML page.

    Returns the result of selecting 'tr' inside the first element matching
    '.table_block.tender_table'. Raises IndexError when the page contains
    no such element.
    """
    # `with` closes the file even if reading fails.
    with open(file_name, 'r') as f:
        response_text = f.read()
    # Explicit parser so results do not depend on which parsers are installed.
    soup = BeautifulSoup(response_text, "html.parser")
    tender_table = soup.select('.table_block.tender_table')[0]
    return tender_table.select('tr')
#print get_response_element('gov/51099604_1026604961.txt')
In [135]:
def date_conversion(element):
    """Convert a leading 'yyy/mm/dd' in ``element`` to a datetime.

    1911 is added to the first number to obtain the four-digit year
    (presumably an ROC/Minguo year - confirm). Anything after the date,
    such as a time of day, is ignored.

    Returns None when ``element`` does not start with such a date.
    """
    matched = re.match(r"(\d+/\d+/\d+)", element)
    if matched is None:
        return None
    year_part, month_day = matched.group(1).split('/', 1)
    western_year = int(year_part) + 1911
    return datetime.strptime("%d/%s" % (western_year, month_day), "%Y/%m/%d")
# Try the converter on a bare date and on a date followed by a time.
print date_conversion('102/11/10')
print date_conversion('102/11/10 13:00')
In [148]:
def money_conversion(element):
    """Parse the money amount at the start of ``element`` into an int.

    Accepts an optional leading '$', an optional '-', then digits with
    optional thousands separators, e.g. '$1,234' or '-123,222,444元'.
    Text after the amount is ignored.

    Returns None when ``element`` does not start with an amount.
    """
    m = re.match( r"\$?(-?[0-9,]+)", element)
    if m is None:
        return None
    # group(1) excludes the optional '$'; group(0) would keep it and make
    # int() raise ValueError.
    digits = m.group(1).replace(',', '')
    if digits in ('', '-'):
        # Only separators were matched - there is no number to convert.
        return None
    return int(digits)
#print money_conversion('123,222,444')
#print money_conversion('123,222,444')
# Negative amount followed by the 元 suffix.
print money_conversion('-123,222,444元')
In [150]:
def remove_space(element):
    """Return ``element`` with every whitespace character removed."""
    kept = (ch for ch in element if not ch.isspace())
    return "".join(kept)
# Leading and inner spaces are both removed.
print remove_space(' dsf dsf dsfdsfdsf')
In [161]:
# Page header text -> database column name.
name_map = {
    "機關代碼": "entity_code",
    "機關名稱": "procuring_entity",
    "標案案號": "job_number",
    "招標方式": "procurement_type",
    "決標方式": "tender_awarding_type",
    "標案名稱": "subject_of_procurement",
    "決標資料類別": "attr_of_tender_awarding",
    "標的分類": "attr_of_procurement",
    "預算金額": "budget_value",
    "開標時間": "opening_date",
    "決標公告日期": "tender_awarding_announce_date",
    "歸屬計畫類別": "project_type",
    "總決標金額": "total_tender_awarding_value",
    "底價金額": "floor_price_value",
    "決標日期": "tender_awarding_date",
    "pkAtmMain": "pkAtmMain",
}
# Page header text -> function that converts the raw cell text.
tender_awards_map = {
    "機關代碼": remove_space,
    "機關名稱": remove_space,
    "標案案號": remove_space,
    "招標方式": remove_space,
    "決標方式": remove_space,
    "標案名稱": remove_space,
    "決標資料類別": remove_space,
    "標的分類": remove_space,
    "預算金額": money_conversion,
    "開標時間": date_conversion,
    "歸屬計畫類別": remove_space,
    "總決標金額": money_conversion,
    "底價金額": money_conversion,
    "決標日期": date_conversion,
    "決標公告日期": date_conversion,
}
In [162]:
def a(str1):
    """Return ``str1`` prefixed with 'qoo'."""
    prefix = 'qoo'
    return prefix + str1
def b(str1):
    """Return ``str1`` prefixed with 'oop'."""
    prefix = 'oop'
    return prefix + str1
# Functions are values: store them in a dict and call them through the key.
func_map = {'a': a, 'b':b}
print func_map['a']('hello')
print func_map['b']('hello')
In [163]:
def get_award_info_dic (element):
    """Collect the wanted header/value pairs from tender table rows.

    element -- iterable of <tr> elements, as returned by
               get_response_element().

    A row contributes when it has a <th>, carries one of the classes
    'award_table_tr_1', 'award_table_tr_2' or 'award_table_tr_6', and its
    header text is a key of ``tender_awards_map``. The first <td> text is
    converted by the mapped function and stored under the column name
    given by ``name_map``.

    Returns a dict of column name -> converted value.
    """
    wanted_classes = ('award_table_tr_1', 'award_table_tr_2', 'award_table_tr_6')
    returned_dic = {}
    for row in element:
        th = row.select('th')
        if len(th) == 0:
            continue
        th_name = th[0].text.encode('utf-8').strip()
        # A row without a class attribute yields None; treat it as no classes.
        row_classes = row.get('class') or []
        # Each class is tested on its own. The previous
        # `'x' or 'y' or 'z' in classes` was always truthy, so the class
        # filter never took effect.
        in_wanted_section = any(name in row_classes for name in wanted_classes)
        if in_wanted_section and th_name in tender_awards_map:
            convert = tender_awards_map[th_name]
            returned_dic[name_map[th_name]] = convert(row.select('td')[0].text.strip())
    return returned_dic
In [166]:
# Parse the saved sample page into rows, then into a column -> value dict.
tr_ele = get_response_element('gov/51099604_1026604961.txt')
return_dic = get_award_info_dic(tr_ele)
In [167]:
for key in return_dic:
print key, return_dic[key]
In [169]:
# Ordinary literal: \t is the escape sequence for a tab character.
# NOTE(review): rebinding `a` and `b` replaces the functions of the same
# names defined in an earlier cell.
a= '\t'
print a
# Raw literal: the backslash is kept, giving the two characters \ and t.
b = r'\t'
print b
In [170]:
# -*- coding: utf-8 -*-
import sqlite3
# Open a connection to the SQLite database file tender.sqlite.
db = sqlite3.connect('tender.sqlite')
#cur = db.cursor()
In [171]:
cur = db.cursor()
# Create the Tender_awards table on first run; IF NOT EXISTS makes the
# statement safe to repeat.
cur.execute('''CREATE TABLE IF NOT EXISTS Tender_awards(
id INTEGER PRIMARY KEY,
pkAtmMain TEXT,
procuring_entity TEXT,
entity_code TEXT,
attr_of_procurement TEXT,
opening_date DATETIME,
procurement_type TEXT,
tender_awarding_type TEXT,
project_type TEXT,
subject_of_procurement TEXT,
job_number TEXT,
budget_value BIGINTEGER,
attr_of_tender_awarding TEXT,
floor_price_value BIGINTEGER,
tender_awarding_announce_date DATETIME,
tender_awarding_date DATETIME,
total_tender_awarding_value BIGINTEGER
)''')
# The connection is closed here; later cells open their own.
db.close()
In [172]:
tr_ele = get_response_element('gov/51099604_1026604961.txt')
return_dic = get_award_info_dic(tr_ele)
for key in return_dic:
print key, return_dic[key]
In [183]:
#'insert into table(c1, c2) values("val1", "val2")'
#'insert into table(c1, c2) values(?, ?)'
#print return_dic
placeholders = ', '.join(return_dic.keys())
values = ', '.join('?' * len(return_dic))
skeleton = 'insert into Tender_awards({}) values({});'
print skeleton.format(placeholders, values)
#return_dic.values()
Out[183]:
In [186]:
# -*- coding: utf-8 -*-
import sqlite3
db = sqlite3.connect('tender.sqlite')
try:
    cur = db.cursor()
    tr_ele = get_response_element('gov/51099604_1026604961.txt')
    return_dic = get_award_info_dic(tr_ele)
    # Fix the column order once so names and bound values always line up.
    column_names = list(return_dic.keys())
    columns = ', '.join(column_names)
    placeholders = ', '.join('?' * len(column_names))
    skeleton = 'insert into Tender_awards({}) values({});'
    cur.execute(skeleton.format(columns, placeholders),
                [return_dic[name] for name in column_names])
    db.commit()
finally:
    # Close the connection even when parsing or the insert fails.
    db.close()
In [187]:
import sqlite3 as lite
# Sample rows for the Salary table: (ID, NAME, AGE, ADDRESS, SALARY).
salary = (
(1 , 'Paul' , 32 , 'California', 20000),
(2 , 'Allen' , 25 , 'Texas' , 15000),
(3 , 'Teddy' , 23 , 'Norway' , 20000),
(4 , 'Mark' , 25 , 'Rich-Mond' , 65000),
(5 , 'David' , 27 , 'Texas' , 85000),
(6 , 'Kim' , 22 , 'South-Hall', 45000),
(7 , 'James' , 24 , 'Houston' , 10000)
)
# Sample rows for the Department table: (ID, DEPT, EMP_ID).
department = (
(1, 'IT Billing' ,1),
(2, 'Engineerin' ,2),
(3, 'Finance' ,7),
(4, 'Finance' ,5),
(5, 'Finance' ,6)
)
In [191]:
con = lite.connect('test.sqlite')
with con:
    cur = con.cursor()
    # Start from a clean slate.
    cur.execute("DROP TABLE IF EXISTS Salary")
    cur.execute("DROP TABLE IF EXISTS Department")
    # Create each table, then bulk-load its sample rows.
    for create_sql, insert_sql, rows in (
        ("CREATE TABLE Salary(ID INT, NAME TEXT, AGE INT, ADDRESS TEXT, SALARY INT)",
         "INSERT INTO Salary VALUES(?, ?, ?, ?, ?)",
         salary),
        ("CREATE TABLE Department(ID INT PRIMARY KEY, DEPT CHAR(50), EMP_ID INT)",
         "INSERT INTO Department VALUES(?, ?, ?)",
         department),
    ):
        cur.execute(create_sql)
        cur.executemany(insert_sql, rows)
con.close()
In [194]:
dbname = 'test.sqlite'
con = lite.connect(dbname)
items = tuple(range(1,1000000))
import time
with con:
cur = con.cursor()
cur.execute("DROP TABLE IF EXISTS ptest")
cur.execute("CREATE TABLE ptest(ID INT)")
start = time.time()
for i in range(0,1000000):
cur.execute("INSERT INTO ptest VALUES(%s)"%(i))
end = time.time()
print end -start
cur.execute("select count(1) from ptest")
rows = cur.fetchone()
print rows
con.close()
In [196]:
dbname = 'test.sqlite'
con = lite.connect(dbname)
with con:
cur = con.cursor()
cur.execute("DROP TABLE IF EXISTS ptest")
cur.execute("CREATE TABLE ptest(ID INT)")
start = time.time()
cur.executemany("INSERT INTO ptest VALUES(?)",((id_, ) for id_ in xrange(1000000)))
end = time.time()
print end -start
cur.execute("select count(1) from ptest")
rows = cur.fetchone()
print rows
con.close()
In [197]:
dbname = 'test.sqlite'
con = lite.connect(dbname)
with con:
cur = con.cursor()
cur.execute('select * from employee')
data = cur.fetchall()
for rec in data:
print rec
cur.close()
con.close()
In [199]:
dbname = 'test.sqlite'
con = lite.connect(dbname)
with con:
cur = con.cursor()
cur.execute('select * from employee')
data = cur.fetchone()
for rec in data:
print rec
data = cur.fetchone()
for rec in data:
print rec
cur.close()
con.close()
In [201]:
dbname = 'test.sqlite'
con = lite.connect(dbname)
with con:
cur = con.cursor()
cur.execute("select * from employee")
data = cur.fetchall()
for rec in data:
print rec
cur.execute("update employee set salary = 1000000 where name = 'qoo'")
con.commit()
# rollback
con.rollback()
cur.execute("select * from employee")
data = cur.fetchall()
for rec in data:
print rec
con.close()
In [202]:
def insert_award_info (cur, data_dic, filename):
    """Insert one row into Tender_awards.

    cur      -- database cursor used to run the INSERT.
    data_dic -- column name -> value mapping; it is not modified.
    filename -- '<pkAtmMain>_<anything>.txt'; the part before the first
                underscore is stored in the pkAtmMain column.

    Column names are placed into the SQL text as-is, so the keys of
    ``data_dic`` must come from a trusted source. The caller commits.
    """
    pkAtmMain = filename.split(".txt")[0].split("_")[0]
    # Work on a copy so the caller's dictionary is left untouched.
    row = dict(data_dic)
    row['pkAtmMain'] = pkAtmMain
    # Fix the column order once so names and bound values always line up.
    column_names = list(row.keys())
    columns = ', '.join(column_names)
    placeholders = ', '.join('?' * len(column_names))
    sql = 'INSERT INTO Tender_awards ({}) VALUES ({})'.format(columns, placeholders)
    # A real list is passed: a dict-values view is not accepted as a
    # parameter sequence on Python 3.
    cur.execute(sql, [row[name] for name in column_names])
In [205]:
# NOTE(review): the connection and cursor opened here are neither used nor
# closed in this cell.
db = sqlite3.connect('tender.sqlite')
cur = db.cursor()
path = "gov/"
# Show the entries of the directory.
dirs = os.listdir(path)
print dirs
In [206]:
db = sqlite3.connect('tender.sqlite')
try:
    cur = db.cursor()
    path = "gov/"
    dirs = os.listdir(path)
    for filename in dirs:
        # Skip directory entries that are not saved pages (hidden files,
        # checkpoint folders); they would fail to open or parse.
        if not filename.endswith('.txt'):
            continue
        ele = get_response_element(os.path.join(path, filename))
        award_info_dic = get_award_info_dic(ele)
        insert_award_info(cur, award_info_dic, filename)
    # One commit after all rows are inserted.
    db.commit()
finally:
    # Close the connection even when a file fails to parse or insert.
    db.close()
In [ ]: