In [5]:
x= ["duck","aardvark","crocodile", "emu", "bee"]
In [6]:
x.sort()
In [7]:
x
Out[7]:
In [8]:
### sorted by descending order
sorted(x,reverse=True)
Out[8]:
In [9]:
### sorted by second letter:
#sorted(x, key=??)
In [10]:
def get_second_letter(s):
return s[1]
In [11]:
get_second_letter("cheese")
Out[11]:
In [12]:
sorted(x,key=get_second_letter) #key is a parameter, value is a function:get_second_letter
Out[12]:
In [13]:
type(get_second_letter)
Out[13]:
In [14]:
# normal function: def nameofthefunction parameter: return expression
def get_second_letters(s):
return s[1]
In [15]:
# the previous is the same as:
get_second_letter = lambda s:s[1]
In [16]:
type(lambda s:s[1])
Out[16]:
In [17]:
print("hello")
In [18]:
sorted(x,key=lambda s:s[1])
Out[18]:
In [19]:
t= (5,10,15)
In [20]:
type(t)
Out[20]:
In [21]:
t[0]
Out[21]:
In [22]:
for item in t:
print(item*item)
In [23]:
#tuple is like a list but it can't be changed, it's called "immutable" data type
#one benefit is exactly that: it can't be changed.
#other benefit is that tuples are memory-efficient.
In [24]:
hello=[1,2,3]
In [25]:
foo=(1,2,3)
In [26]:
import sys
sys.getsizeof(hello)
Out[26]:
In [27]:
sys.getsizeof(foo)
Out[27]:
In [28]:
import re
test="one 1 two 2 three 3 four 4 five 5"
re.findall(r"\w+ \d",test)
Out[28]:
In [29]:
for item in re.findall(r"\w+ \d",test):
x=item.split("")
print(x[0])
print(x[1])
In [ ]:
test="one 1 two 2 three 3 four 4 five 5"
re.findall(r"(\w+) (\d)",test)
In [ ]:
all_subjects = open("enronsubjects.txt").read()
In [ ]:
for item in re.findall(r"(\d{3})-(\d{3})-(\d{4})", all_subjects):
print item[0]
In [ ]:
[item[0] for item in re.findall(r"(\d{3})-(\d{3})-(\d{4})", all_subjects)]
In [ ]:
re.findall(r"\$(\d+) ?(\w+)", all_subjects)
In [ ]:
vals=[]
for item in re.findall(r"\$(\d+) ?([mMbBkK])", all_subjects):
multiplier=item[1].lower()
number_val=int(item[0])
if multiplier=='k':
number_val *= 1000
elif multiplier=='m':
number_val *= 1000000
elif multiplier=='b':
number_val *= 1000000000
vals.append(number_val)
sum(vals)
In [ ]:
message = "this is a test, this is only a test"
In [ ]:
message.replace("this","that").replace("test","walrus") #.replace() replace part of a string
In [ ]:
re.findall(r"\d{3}-\d{3}-\d{4}", all_subjects)
In [ ]:
message="This is a test; this is only a test."
re.sub(r"[Tt]his", "that", message) #.sub():substitute a pattern
In [ ]:
re.sub(r"\b\w+\b", "WALRUS", message)
In [ ]:
anon = re.sub(r"\d{3}-\d{3}-\d{4}", "555-555-5555", all_subjects)
In [ ]:
re.findall(r".{,20}\d{3}-\d{3}-\d{4}.{,20}",anon)
In [ ]:
anon = re.sub(r"(\d{3}-\d{3}-\d{4}", r"\1-\2-XXXX", all_subjects)
In [32]:
from urllib.request import urlretrieve
urlretrieve("https://raw.githubusercontent.com/ledeprogram/data-and-databases/master/menupages-morningside-heights.html")
Out[32]:
In [ ]:
#urlretrieve(url, filename)
store:
research phase:
targets:
{'name':"Brad's, 'price': 1,'cuisine':['coffee']},
{'name':"Cafe Nana", 'price':0, 'cuisines':['Middle Eastern','Kosher']},
...
]
In [33]:
from bs4 import BeautifulSoup
In [34]:
raw_html = open("menupages-morningside-heights.html").read()
soup = BeautifulSoup(raw_html, "html.parser")
In [35]:
#just the name
search_table=soup.find("table", {'class':'search-results'})
table_body=search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
name_address_tag = tr_tag.find('td', {'class':'name-address'})
a_tag = name_address_tag.find('a')
print(a_tag.string)
In [36]:
search_table=soup.find("table", {'class':'search-results'})
table_body=search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
# get the restaurant name from the a inside a td
name_address_tag = tr_tag.find('td', {'class':'name-address'})
a_tag = name_address_tag.find('a')
restaurant_name=a_tag.string
#get the price from the span if present
price_tag = tr_tag.find('td', {'class':'price'})
price_span_tag = price_tag.find('span')
if price_span_tag:
price=int(price_span_tag.string)
else:
price=0
print(restaurant_name, price)
In [37]:
def get_name(tag):
return"TEST RESTAURANT"
def get_price(tag):
return"999999"
In [38]:
search_table=soup.find("table", {'class':'search-results'})
table_body=search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
print(restaurant_name, price)
In [39]:
def get_name(tr_tag):
name_address_tag = tr_tag.find('td', {'class':'name-address'})
a_tag = name_address_tag.find('a')
restaurant_name=a_tag.string
return restaurant_name
def get_price(tr_tag):
price_tag = tr_tag.find('td', {'class':'price'})
price_span_tag = price_tag.find('span')
if price_span_tag:
price=int(price_span_tag.string)
else:
price=0
return price
def get_cuisines(tr_tag):
all_td_tags=tr_tag.find_all('td')
cuisine_tag = all_td_tags[4]
print(cuisine_tag)
cuisines = cuisine_tag.string
if cuisines:
cuisines_list= cuisines.split(", ")
else:
cuisine_list=[]
return cuisines_list
In [40]:
restaurants=[]
search_table=soup.find("table", {'class':'search-results'})
table_body=search_table.find('tbody')
for tr_tag in table_body.find_all('tr'):
restaurant_name = get_name(tr_tag)
price = get_price(tr_tag)
cuisines=get_cuisines(tr_tag)
rest_dict = {'name':restaurant_name, 'price':price, 'cuisines':cuisines}
restaurants.append(rest_dict)
restaurants
In [41]:
import pandas as pd
df = pd.DataFrame(restaurants)
In [42]:
df[df['price'] >2]
Out[42]:
what should those tables have in them? (columns and data types)
data normalization normal form
entities
restaurant name, price, list of cuisines
restaurant table: 'id' (unique integer identifying the restaurant)'name'(string with the restaurant's name) *'price'(integer that corresponds to the number of dollar signs)
cuisine table:
sample entry from restaurant table restaurant_id: 4 name: Brad's price:1 ... ... sameple entry from cuisine table restaurant_id: 4 kind: coffee-tea
restaurant_id: 4 kind: seafood ... select restaurant.restaurant_id from restaurant join cuisine on restaurant.restaurant_id=cuisine.restaurant_id
"set up ohase"-> creating database and creating tables "one time"->psql "working with data phase"-> inserting records, selecting stuff->python
sql_data_types like
In [43]:
import pg8000
conn= pg8000.connect(database="menupages")
In [44]:
type(conn)
Out[44]:
In [63]:
conn.rollback() #execyte this whever you make a SQL problem
In [46]:
cursor = conn.cursor()
cursor object:
.execute()<- execute a SQL statement.fetchone()<- fetches the first record of the results of a statement(as a list) *.fetchall()<- returns ALL the rows of the results of a statement(as a list)
In [47]:
cursor.execute("INSERT INTO restaurant(name,price) VALUES('Good Food Place',3)")
conn.commit()
In [48]:
cursor.execute("SELECT * FROM restaurant")
for item in cursor.fetchall():
print(item)
In [52]:
cursor.execute(
"INSERT INTO restaurant(name,price) VALUES('Palace of Vegan Nosh',3) RETURNING id")
results= cursor.fetchone() #return a list
conn.commit()
In [53]:
rowid=results[0]
In [54]:
rowid
Out[54]:
In [55]:
#will not work:
cursor.execute(
"INSERT INTO restaurant(name,price) VALUES ('Brad's', 1) RETURNING id")
rowid=cursor.fetchone()[0]
conn.commit()
In [56]:
#SQL Injection attack:
restaurant = "'Restaurant'); DELETE FROM restaurant;"
string in python -> "quote" "escape"-> valid sql statement very weird and difficult and arcane
In [64]:
rest_insert = "INSERT INTO restaurant(name, price) VALUES (%s, %s)" #%S placeholder
cursor.execute(rest_insert, ["Brad's", 1])
# pg8000 does the work: "INSERT INTO restaurant (name,price) VALUES ('Brad\'s',1)"
conn.commit()
In [65]:
cursor.execute("INSERT INTO restaurant(name, price) VALUES(%s, %s) RETURNING id",["Test Restaurant",2])
rowid=cursor.fetchone()[0]
conn.commit()
In [66]:
rowid
Out[66]:
In [67]:
# let's say Test Restaurant serves fondue and casseroles
cuisine_insert = "INSERT INTO cuisine(restaurant_id, kind) VALUES (%s, %s)"
cursor.execute(cuisine_insert, [rowid, "fondue"])
cursor.execute(cuisine_insert, [rowid,"casseroles"])
conn.commit()
In [ ]:
### Insert Many restaurants
In [68]:
restaurants
Out[68]:
In [73]:
#error
rest_insert = "INSERT INTO restaurant(name,price)VALUES(%s,%s)"
for item in restaurants:
cursor.execute(rest_insert, [item['name'], item['price']])
conn.commit()
In [70]:
first=restaurants[0]
first
Out[70]:
In [71]:
type(first['name'])
Out[71]:
so what happened? Why isn't this just a string?
whenver you use the .string attribute of a Beatufil Soup tag object, the type of that value is bs4.element.NavigableString
fortunately, there's an esay fix:'str(val)' to convert that value into a string use str(value)
In [72]:
rest_insert = "INSERT INTO restaurant(name,price)VALUES(%s,%s)"
for item in restaurants:
cursor.execute(rest_insert, [str(item['name']), item['price']])
conn.commit()
In [ ]:
# restaurants
for item in restaurants
sql to insert a restaurant
commit()
step2: for rest in rest:
for cuisine in cuisine
sql to insert
In [76]:
rest_insert="INSERT INTO restaurant(name,price)VALUES(%s,%s) RETURNING id"
cuisine_insert="INSERT INTO CUISINE(restaurant_id, kind)VALUES(%s,%s)"
for item in restaurants:
#insert restaurant, RETURNING id
print("inserting restaurant", item['name'])
cursor.execute(rest_insert, [str(item['name']), item['price']])
rowid=cursor.fetchone()[0]
for cuisine in item['cuisines']:
print(" -inserting cuisine",cuisine)
cursor.execute(cuisine_insert, [rowid, str(cuisine)])
#insert restaurant
conn.commit()
In [ ]: