In [18]:
import MySQLdb
import pickle
from collections import Counter
import pandas as pd

In [12]:
conn = MySQLdb.connect(user='mos', passwd='', db='monuments_db', charset='utf8', use_unicode=True)
cursor = conn.cursor()
cursor.execute("SET NAMES utf8")


Out[12]:
0

In [13]:
def create_list_of_source_tables_pickle():
    cursor.execute("SHOW TABLES")
    non_country_tables = set(["prox_search","admin_tree","commonscat","image","monuments_all"])
    results = cursor.fetchall()
    all_tables = []
    for pair in results:
        for item in pair:
            all_tables.append(item)
    country_tables =  list(set(all_tables) - non_country_tables) # set difference, then convert back to list
    print("len(country_tables). {}".format(len(country_tables)))
    
    cnt = Counter()
    for table in country_tables:
        sql = 'SELECT count(*) FROM `{}`'.format(table)
        cursor.execute(conn.escape_string(sql))
        res = cursor.fetchone()
        cnt.update({table: res[0]})
    empty_tables = []
    for table,count in cnt.items():
        if count == 0:
            empty_tables.append(table)
    print("len(empty_tables): {}".format(len(empty_tables)))
    
    final_tables = set(country_tables) - set(empty_tables)
    print("len(final_tables): {}".format(len(final_tables)))
    pickle.dump(final_tables, open("./final_tables.pickle","wb"))

In [14]:
def create_country_codes_pickle():
    print(list(dict(df.country.value_counts()).keys()))
    country_codes = list(dict(df.country.value_counts()).keys())
    pickle.dump(country_codes, open("./country_codes.pickle","wb"))

In [15]:
create_list_of_source_tables_pickle()


len(country_tables). 103
len(empty_tables): 6
len(final_tables): 97

In [17]:
# Load full table into memory
sql = "SELECT * FROM monuments_all"
df = pd.io.sql.read_sql(conn.escape_string(sql), conn)

create_country_codes_pickle()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-17-f356d49781c4> in <module>()
      1 # Load full table into memory
      2 sql = "SELECT * FROM monuments_all"
----> 3 df = pd.io.sql.read_sql(conn.escape_string(sql), conn)
      4 
      5 create_country_codes_pickle()

NameError: name 'pd' is not defined

In [ ]: