In [0]:

    
from google.colab import auth
auth.authenticate_user()



In [0]:

    
from google.cloud import bigquery
client = bigquery.Client(project='YOUR-PROJECT-NAME')

Table Schema



In [3]:

    
dataset_id = 'YOUR-DATASET-NAME'
table_id = 'YOUR-TABLE-NAME'

dataset_ref = client.dataset(dataset_id)
table_ref = dataset_ref.table(table_id)
table = client.get_table(table_ref)

# View table properties
for e in table.schema:
  print(e)









    



SchemaField('topics', 'STRING', 'NULLABLE', None, ())
SchemaField('title', 'STRING', 'NULLABLE', None, ())
SchemaField('content', 'STRING', 'NULLABLE', None, ())
SchemaField('title_embed', 'FLOAT', 'REPEATED', None, ())
SchemaField('content_embed', 'FLOAT', 'REPEATED', None, ())

Create Features



In [0]:

    
def get_flattened_embeddings(input_column='content_embed', n=512):
  L = []
  for i in range(n):
    L.append("{0}[OFFSET({1})] AS {0}_{1}".format(input_column, i))
  return (',').join(L)

def get_one_hot_vectors(input_column='topics', categories=['acl','gas']):
  L = []
  for cat in categories:
    L.append(
      "CASE WHEN '{1}' in UNNEST(SPLIT({0})) THEN 1 ELSE 0 END as {1}".format(
        input_column, cat))
  return (',').join(L)

def get_number(input_column='topics', categories=['acl','gas']):
  L = []
  for cat in categories:
    L.append(
      "(CASE WHEN '{1}' in UNNEST(SPLIT({0})) THEN 1 ELSE 0 END)".format(
        input_column, cat))
  return ('+').join(L)



In [0]:

    
def get_topics_and_count(dataset_id, table_id):
  sql = """
  WITH topics_flatten AS
  (
  SELECT topic FROM
    (
    SELECT 
      SPLIT(topics, ',') AS topic
    FROM
      `{0}.{1}`
    WHERE
      topics != ''
    ),
    unnest(topic) topic
  )

  SELECT 
    REPLACE(topic, '-', '_') AS topic,
    count(*) AS count
  FROM
    topics_flatten
  GROUP BY
    topic
  """.format(dataset_id, table_id)
  return sql

df = client.query(get_topics_and_count(dataset_id, table_id)).to_dataframe()



In [6]:

    
df.hist(bins=100)









    Out[6]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7f0f4f893668>]],
      dtype=object)



In [7]:

    
target_categories = list(df[df['count'] >= 100].topic)
target_categories[:2]









    Out[7]:





['acq', 'bop']



In [8]:

    
def create_dataset(
    dataset_id, table_id, categories=['acq'], embed='content_embed', n=512):
  sql = """
  CREATE OR REPLACE TABLE `{0}.input_data`
    AS
    (
      SELECT
        STRUCT( {2} ) AS label,
        STRUCT( {3} ) AS feature
      FROM
        `{0}.{1}`
      WHERE
        {4} > 0
    )
  """.format(
      dataset_id, table_id,
      get_one_hot_vectors(input_column='topics', categories=categories),
      get_flattened_embeddings(input_column=embed, n=n),
      get_number(input_column='topics', categories=categories))
  return sql

sql = create_dataset(
  dataset_id, table_id,
  categories=target_categories, embed='content_embed', n = 512)
client.query(sql)









    Out[8]:





<google.cloud.bigquery.job.QueryJob at 0x7f0f4f730a20>

Create Models



In [9]:

    
def create_model(dataset_id, target='acq'):
  sql = """
  CREATE OR REPLACE MODEL `{0}.model_{1}`
  OPTIONS (
      model_type='logistic_reg',
      input_label_cols=['label']) AS
  SELECT
      label.{1} as label,
      feature.*
  FROM
      `{0}.input_data`
  """.format(dataset_id, target)

  return sql

sql = create_model(dataset_id, target='acq')
client.query(sql)









    Out[9]:





<google.cloud.bigquery.job.QueryJob at 0x7f0f4cef7358>

Evaluate Models



In [10]:

    
def evaluate_model(dataset_id, target='acq', threshold=0.55):
  sql = """
  SELECT
    *
  FROM
    ML.EVALUATE(MODEL `{0}.model_{1}`,
      (
      SELECT
        label.{1} as label,
        feature.*
      FROM
        `{0}.input_data`),
      STRUCT({2} AS threshold))
  """.format(dataset_id, target, threshold)
  return sql
    
sql = evaluate_model(dataset_id, target='acq', threshold=0.55)
client.query(sql).to_dataframe()









    Out[10]:







  
    
      
      precision
      recall
      accuracy
      f1_score
      log_loss
      roc_auc
    
  
  
    
      0
      0.664107
      0.9
      0.864719
      0.764265
      0.395869
      0.948135



In [0]:

    
def evaluate_model_roc(dataset_id, target='acq'):
  sql = """
  SELECT
    *
  FROM
    ML.ROC_CURVE
    (
      MODEL `{0}.model_{1}`,
      (
        SELECT
          label.{1} as label,
          feature.*
        FROM
          `{0}.input_data`
      )
    )
  """.format(dataset_id, target)
  return sql

sql = evaluate_model_roc(dataset_id, target='acq')
df = client.query(sql).to_dataframe()



In [12]:

    
df.plot(x='false_positive_rate', y='recall', grid=True, title='roc curve')









    Out[12]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f0f4f889ef0>



In [0]:

    
def evaluate_model_pr(target='acq'):
  sql = """
  SELECT
    recall,
    true_positives / (true_positives + false_positives) AS precision
  FROM
    ML.ROC_CURVE(
      MODEL `{0}.model_{1}`,
      (SELECT label.{1} as label, feature.* FROM `{0}.input_data`))
  """.format(dataset_id, target)
  return sql

sql = evaluate_model_pr(target='acq')
df = client.query(sql).to_dataframe()



In [14]:

    
df.plot(x='precision', y='recall', grid=True, title='precision recall curve')









    Out[14]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f0f4ce262b0>



In [15]:

    
def evaluate_model_confusion_matrix(target='acq'):
    sql = """
    SELECT
      *
    FROM
      ML.CONFUSION_MATRIX(MODEL `{0}.model_{1}`,
      (
        SELECT
          label.{1} as label,
          feature.*
        FROM
          `{0}.input_data`))
    """.format(dataset_id, target)
    return sql

sql = evaluate_model_confusion_matrix(target='acq')
client.query(sql).to_dataframe()









    Out[15]:







  
    
      
      expected_label
      _0
      _1
    
  
  
    
      0
      0
      5610
      1250
    
    
      1
      1
      181
      2029

Multi-Class Logistic Regression



In [18]:

    
for target_category in target_categories[:3]:
  print("Creating a model for {0}...".format(target_category))
  client.query(create_model(dataset_id, target=target_category))









    



Creating a model for acq...
Creating a model for bop...
Creating a model for cpi...



In [25]:

    
d_cat_to_eval = {}

for target_category in target_categories[:3]:
  print("Evaluating a model for {0}...".format(target_category))
  sql = evaluate_model_roc(dataset_id, target=target_category)
  d_cat_to_eval[target_category] = client.query(sql).to_dataframe()









    



Evaluating a model for acq...
Evaluating a model for bop...
Evaluating a model for cpi...



In [26]:

    
d_cat_to_eval['acq'].plot(x='false_positive_rate', y='recall', grid=True, title='roc curve')









    Out[26]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f0f4f726fd0>



In [27]:

    
d_cat_to_eval['bop'].plot(x='false_positive_rate', y='recall', grid=True, title='roc curve')









    Out[27]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f0f4cd967b8>



In [28]:

    
d_cat_to_eval['cpi'].plot(x='false_positive_rate', y='recall', grid=True, title='roc curve')









    Out[28]:





<matplotlib.axes._subplots.AxesSubplot at 0x7f0f4c9ea630>