In [1]:
import datetime
import itertools
import random

import names
import numpy as np
import pandas as pd
import sqlalchemy as sa
from sqlalchemy import create_engine


# NOTE(review): the 'postgres://' dialect alias was removed in SQLAlchemy 1.4;
# 'postgresql://' is accepted by every SQLAlchemy version.
connection_string = 'postgresql://localhost:5432/VincentLa'
engine = create_engine(connection_string)

SCHEMA_NAME = 'tutorial_data_ingest'
# Engine.execute() was removed in SQLAlchemy 2.0; run the DDL through an
# explicit transactional connection instead (works on 1.x and 2.x).
# SCHEMA_NAME is a trusted constant, so concatenation is not an injection risk.
with engine.begin() as connection:
    connection.execute(sa.text('CREATE SCHEMA IF NOT EXISTS ' + SCHEMA_NAME))

Generating Data for Examples of Unit Testing SQL Queries

At Clover, our analytics team will often write "unit" tests for SQL queries that contribute directly to our data pipeline. As an example, there are a couple of queries in /queries/schema/tutorial_analytics.

  1. current_scores.sql
  2. doctor_dashboard.sql

We use pytest to test these SQL files. See /test_suite/test_current_scores.py and /test_suite/test_doctor_dashboard.py as examples of unit tests we write to test SQL files.


In [2]:
def create_patients():
    """Create the tutorial `patients` table (patient_id, doctor_id, name).

    Each of the 10 patients is assigned to one of two doctors
    ('dr1'/'dr2') by alternating on patient_id parity. Any existing
    table is replaced.
    """
    ids = list(range(1, 11))
    doctor_ids = ['dr' + str((i % 2) + 1) for i in ids]
    # Renamed from `names` to avoid shadowing the imported `names` package,
    # which is used later in this notebook to generate provider names.
    patient_names = ['john', 'jeremy', 'mark', 'leslie', 'sam', 'matt', 'judy', 'parth', 'kevin', 'joshua']

    patients = {
        'patient_id': ids,
        'doctor_id': doctor_ids,
        'name': patient_names
    }

    pd.DataFrame(patients).to_sql('patients', engine, schema=SCHEMA_NAME, index=False, if_exists='replace')

In [3]:
def create_risk_assessment_scores():
    """Write the `risk_assessments` fixture table to the tutorial schema.

    Columns: patient_id, assessment_type, risk_score, date_modified.
    Patients accumulate multiple assessments over time, which the
    downstream SQL under test must reduce to current scores.
    Replaces any existing table.
    """
    rows = [
        (1, 'psychological', 100, datetime.date(2017, 1, 5)),
        (2, 'psychological', 96, datetime.date(2017, 1, 5)),
        (3, 'psychological', 89, datetime.date(2017, 1, 5)),
        (4, 'psychological', 75, datetime.date(2017, 1, 5)),
        (5, 'psychological', 81, datetime.date(2017, 1, 5)),
        (1, 'psychological', 90, datetime.date(2017, 1, 12)),
        (1, 'psychological', 92, datetime.date(2017, 1, 12)),
        (3, 'psychological', 94, datetime.date(2017, 1, 13)),
        (4, 'psychological', 85, datetime.date(2017, 1, 15)),
        (6, 'physical', 92, datetime.date(2017, 1, 7)),
        (7, 'physical', 85, datetime.date(2017, 1, 7)),
        (8, 'physical', 72, datetime.date(2017, 1, 7)),
        (9, 'physical', 73, datetime.date(2017, 1, 7)),
        (6, 'physical', 93, datetime.date(2017, 1, 8)),
        (6, 'physical', 94, datetime.date(2017, 1, 9)),
        (7, 'physical', 82, datetime.date(2017, 1, 15)),
        (8, 'physical', 75, datetime.date(2017, 1, 16)),
    ]
    columns = ['patient_id', 'assessment_type', 'risk_score', 'date_modified']
    assessments = pd.DataFrame(rows, columns=columns)
    assessments.to_sql('risk_assessments', engine, schema=SCHEMA_NAME, index=False, if_exists='replace')

In [4]:
def create_doctors():
    """Write the two-row `doctors` table (doctor_id, names) to the schema.

    Both doctors are deliberately named 'Dr. Smith'. Replaces any
    existing table.
    """
    # Keys mirror the column names expected by the tutorial queries;
    # note the column is 'names' (plural), not 'name'.
    doctor_records = {
        'doctor_id': ['dr' + str(n) for n in range(1, 3)],
        'names': ['Dr. Smith', 'Dr. Smith'],
    }

    pd.DataFrame(doctor_records).to_sql('doctors', engine, schema=SCHEMA_NAME, index=False, if_exists='replace')

In [5]:
# Populate the tutorial schema: doctors, patients, then their risk scores.
create_doctors()
create_patients()
create_risk_assessment_scores()

Generate Data for Anomaly Detection Example

At Clover, one of the things we're really interested in is finding anomalies among provider behaviors. This section generates the data for a case study that we will present. In particular we will generate some claims data to look at.

Below, we skew the random procedure-code distribution so that some doctors really are more likely to upcode.


In [6]:
def draw_random_int(mean=100, sd=30):
    """Draw a positive integer from a normal distribution.

    Samples N(mean, sd), rounds to the nearest integer, and clamps the
    result to a minimum of 1 so it is usable as an encounter count.

    Args:
        mean: mean of the distribution (default 100, the original
            hard-coded value).
        sd: standard deviation (default 30, the original hard-coded value).

    Returns:
        An int >= 1.
    """
    number = np.random.normal(mean, sd)
    # round() first so values just below 1 (e.g. 0.6) still clamp to 1,
    # matching the original if/else behavior.
    return max(1, int(round(number)))

In [7]:
upcoders = [0.05, 0.05, 0.15, 0.15, 0.6]     # skewed toward high-intensity codes
typicalcoders = [0.05, 0.1, 0.4, 0.4, 0.05]  # centered on medium-intensity codes

# 1000 synthetic providers, each with a random number of encounters;
# each encounter gets a random patient id. Unseeded, so a fresh run
# produces a fresh dataset.
drs = ['Dr. ' + names.get_full_name() for i in range(0, 1000)]
num_of_encounters = [draw_random_int() for i in range(0, 1000)]
personid = [random.randint(1, 25000) for i in range(0, sum(num_of_encounters))]
procedure_map = {
    1: 'Evaluation and Management, Lowest Intensity',
    2: 'Evaluation and Management, Second Lowest Intensity',
    3: 'Evaluation and Management, Medium Intensity',
    4: 'Evaluation and Management, High Intensity',
    5: 'Evaluation and Management, Highest Intensity',
}

# Repeat each provider name once per encounter, carrying the provider's
# index alongside the name. Keeping the index avoids the original per-row
# drs.index() scan, which made code assignment O(rows * providers).
doctors = []
doctor_indices = []
for provider_idx, encounter_count in enumerate(num_of_encounters):
    doctors.extend([drs[provider_idx]] * encounter_count)
    doctor_indices.extend([provider_idx] * encounter_count)

d = {
    'servicing_provider_npi': doctors,
    'personid': personid,
}
df = pd.DataFrame(d)

# Providers whose index ends in 0 or 1 (20% of them) are the upcoders.
code_choices = np.arange(1, 6)
procedure_codes = np.asarray([
    np.random.choice(code_choices, p=upcoders if provider_idx % 10 < 2 else typicalcoders)
    for provider_idx in doctor_indices
])

df['procedure_code'] = procedure_codes
df['procedure_name'] = df['procedure_code'].map(procedure_map)

In [8]:
# Preview the first five synthetic claim lines.
df.head()


Out[8]:
personid servicing_provider_npi procedure_code procedure_name
0 11324 Dr. Darlene Allen 5 Evaluation and Management, Highest Intensity
1 1633 Dr. Darlene Allen 5 Evaluation and Management, Highest Intensity
2 18100 Dr. Darlene Allen 4 Evaluation and Management, High Intensity
3 19251 Dr. Darlene Allen 5 Evaluation and Management, Highest Intensity
4 16174 Dr. Darlene Allen 5 Evaluation and Management, Highest Intensity

In [9]:
# Persist the synthetic claims for the anomaly-detection case study.
df.to_sql('claim_lines', engine, schema=SCHEMA_NAME, index=False, if_exists='replace')