In [55]:
import pandas as pd # Beautiful tool for data wrangling! e.g. '!pip install pandas' from a Notebook
# See https://mariadb.com/blog/how-connect-python-programs-mariadb e.g. '!pip install mysql' from Notebook
import mysql.connector as mariadb
import re
pd.set_option("display.max_rows",35) # Useful when having large Pandas DataFrames like we do here
In [2]:
conn = mariadb.connect(user='mos', password='', database='monuments_db',buffered=True)
cursor = conn.cursor()
In [112]:
cursor.execute("SELECT * FROM monuments_all WHERE country='se-bbr'")
all_bbr = pd.io.sql.read_sql('select * from monuments_all WHERE country="se-bbr"', conn)
all_bbr.shape
Out[112]:
In [3]:
table_name = "se_bbr" # I've renamed monuments_se-bbr_(se) to 'se_bbr' in local database, change to correct name
se_bbr = pd.io.sql.read_sql('select * from se_bbr', conn)
se_bbr.shape
Out[3]:
In [115]:
se_bbr.keys()
Out[115]:
In [81]:
se_bbr.ix[10:20, ["namn","funktion"]]
Out[81]:
In [107]:
first = se_bbr.groupby("kommun")[["funktion","namn"]].first()
first.head()
Out[107]:
In [110]:
first.loc[["Ale","Täby","Åre"],["funktion","namn"]]
Out[110]:
In [23]:
se_bbr.namn.str.extract('(?P<name>\[\[[\w\.\|\- ]+\]\])\,? ?(?P<name2>[ \w]+)? ?(?P<name3>\[\[[\w\.\|\- ]+\]\])? ?(?P<buildId>\([\w\.\d \|\:\-;,]+\))',expand=True)
Out[23]:
In [24]:
se_bbr.ix[5672]
Out[24]:
Let's check If we have extracted the two always occuring fields 'name' and 'buildId' for all objects
In [48]:
se_bbr_namn = se_bbr.namn.str.extract('(?P<name>\[\[[\w`\.,\|\- ]+\]\])\,? ?(?P<name2>[ \w]+)? ?(?P<name3>\[\[[\w\.\|\- ]+\]\])? ?(?P<buildId>\([\w\.\d \|\:\-;,]+\))',expand=True)
In [64]:
len(se_bbr_namn[se_bbr_namn["name"] == pd.np.nan])
Out[64]:
In [50]:
len(se_bbr_namn[se_bbr_namn["buildId"] != pd.np.nan])
Out[50]:
In [69]:
se_bbr[pd.isnull(se_bbr["namn"])]
Out[69]:
In [60]:
len(se_bbr_namn.name.value_counts())
Out[60]:
In [62]:
len(se_bbr_namn.name2.value_counts())
Out[62]:
In [63]:
len(se_bbr_namn.name3.value_counts())
Out[63]:
How many objects did we get the other fields 'name2' and 'name3' for?
In [51]:
se_bbr_namn.name3.value_counts()
Out[51]:
In [58]:
se_bbr[se_bbr.namn.str.contains("sproge", flags=re.IGNORECASE) == True]
Out[58]:
In [ ]: