Relabelling USA spending field to integers in order to run Spark


In [1]:
import google.datalab.bigquery as bq

In [3]:
%%bq query
select
mod_agency,
vendorname,
count(*) as count
from `fiery-set-171213.vrec.usa_spending_all`
where vendorcountrycode in ('UNITED STATES', 'USA: UNITED STATES OF AMERICA')
and contractingofficerbusinesssizedetermination in ('O: OTHER THAN SMALL BUSINESS', 'S: SMALL BUSINESS')
and mod_agency not in ("")
group by 1,2,3
order by count DESC
limit 100


Out[3]:
contractingofficerbusinesssizedeterminationmod_agencyvendornamecount
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYAMERISOURCEBERGEN DRUG CORPORA676166
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYCARDINAL HEALTH 200, LLC419654
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYCARDINAL HEALTH, INC318861
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICENATIONAL INDUSTRIES FOR THE BLIND308991
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYOWENS & MINOR DISTRIBUTION, INC.262205
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYAMERISOURCEBERGEN DRUG CORPORATION243671
O: OTHER THAN SMALL BUSINESS1900: STATE, DEPARTMENT OFMISCELLANEOUS FOREIGN CONTRACTORS237245
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYSCIENCE APPLICATIONS INTERNATIONAL CORPORATION218120
O: OTHER THAN SMALL BUSINESS1900: STATE, DEPARTMENT OFMISCELLANEOUS FOREIGN AWARDEES217609
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEJ & L AMERICA INC210475
O: OTHER THAN SMALL BUSINESS2100: DEPT OF THE ARMYMISCELLANEOUS FOREIGN CONTRACTORS200091
S: SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEKAUFMAN COMPANY INC190028
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEOFFICEMAX INCORPORATED177307
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEIDSC HOLDING LIMITED LIABILITY COMPANY168934
S: SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEKAUFMAN COMPANY INCORPORATED168796
S: SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICENATIONAL INDUSTRIES FOR THE BLIND160980
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEBOISE CASCADE OFFICE PRODUCTS155104
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYMCKESSON CORPORATION142845
O: OTHER THAN SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYLOCKHEED MARTIN CORPORATION123176
S: SMALL BUSINESS97AS: DEFENSE LOGISTICS AGENCYSUPPLYCORE INC.108350
O: OTHER THAN SMALL BUSINESS3600: VETERANS AFFAIRS, DEPARTMENT OFCARDINAL HEALTH 200 INCORPORATED THE MEDICAL PRODUCTS AND SERVICES GROUP OF CAR108246
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEW W GRAINGER, INC106331
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEGENERAL MOTORS CORPORATION103171
O: OTHER THAN SMALL BUSINESS4732: FEDERAL ACQUISITION SERVICENATIONAL INDUSTRIES FOR THE BLIND94107
O: OTHER THAN SMALL BUSINESS4730: FEDERAL ACQUISITION SERVICEFORD MOTOR COMPANY82619

(rows: 100, time: 2.9s, 5GB processed, job: job_hD7RmHPOc8l8UkWW1Z7SF55xZWA)

usa_mod_agency_relabel


In [11]:
%%bq query
select
mod_agency,
ROW_NUMBER() OVER (ORDER BY user DESC) row_num
from
  (select
  mod_agency
  #REGEXP_REPLACE(mod_agency, r',', '') as user
  from `fiery-set-171213.vrec.usa_spending_all`
  group by 1)


Out[11]:
useruser_row_num
9568: BROADCASTING BOARD OF GOVERNORS34
2024: FEDERAL LAW ENFORCEMENT TRAINING CENTER172
1438: OFFICE OF SURFACE MINING, RECLAMATION AND ENFORCEMENT220
9514: OCCUPATIONAL SAFETY AND HEALTH REVIEW COMMISSION49
6930: FEDERAL RAILROAD ADMINISTRATION115
1650: OCCUPATIONAL SAFETY AND HEALTH ADMINISTRATION184
1645: MINE SAFETY AND HEALTH ADMINISTRATION185
1630: EMPLOYMENT AND TRAINING ADMINISTRATION188
1425: BUREAU OF RECLAMATION226
1153: UNITED STATES TRADE AND DEVELOPMENT AGENCY278
9776: USTRANSCOM21
8656: OFFICE OF FEDERAL HOUSING ENTERPRISE OVERSIGHT64
7501: IMMED OFFICE OF THE SECRETARY OF HEALTH AND HUMAN SERVICES89
6957: PIPELINE AND HAZARDOUS MATERIALS SAFETY ADMINISTRATON107
2041: BUREAU OF ENGRAVING AND PRINTING167
12K3: ANIMAL AND PLANT HEALTH INSPECTION SERVICE248
97F5: WASHINGTON HEADQUARTERS SERVICES (WHS)6
97AV: DEFENSE SECURITY SERVICE14
955F: CONSUMER FINANCIAL PROTECTION BUREAU37
6959: FEDERAL MOTOR CARRIER SAFETY ADMINISTRATION106
6943: RESEARCH AND INNOVATIVE TECHNOLOGY ADMINISTRATION112
1352: MINORITY BUSINESS DEVELOPMENT AGENCY232
1304: OFFICE OF THE INSPECTOR GENERAL245
97BZ: DEFENSE FINANCE AND ACCOUNTING SERVICE (DFAS)12
96CE: U.S. ARMY CORPS OF ENGINEERS - CIVIL PROGRAM FINANCING ONLY30

(rows: 285, time: 1.9s, 1GB processed, job: job_8Cbb_dnPK8HlraYLbtxg2eASFcg)

usa_vendorname_relabel


In [13]:
%%bq query
select
vendorname,
ROW_NUMBER() OVER (ORDER BY vendorname DESC) row_num
from
  (select
  vendorname
  from `fiery-set-171213.vrec.usa_spending_all`
  group by 1)


Out[13]:
itemitem_row_num
orthopedic institute34
ZYKRONIX INCORPORATED172
ZWICK WINDOW SHADE COMPANY220
ZURIAN, STEVEN297
ZUHEIR INCORPORATED403
ZREYATECH, LLC491
ZORZA, INC.527
ZORN COMPRESSOR AND EQUIPMENT INCORPORATED535
ZOMEGA TECHNOLOGY CORPORATION660
ZOLLO CFP, KIMBERLEY FEENEY678
ZOLL SERVICES LLC691
ZOELIMALALA RAMANASE750
ZODIAC PROPERTIES II LIMITED LIABILITY COMPANY771
ZODIAC PROPERTIES772
ZKD INC.857
ZIVOTOFSKY, JACOB874
ZIP ENTERPRISES INC988
ZIMMER MACHINERY CORPORATION1157
ZIMMER GUNSUL FRASCA PARTNERSH1162
ZIENTEK ANTHONY H1258
ZIEMEK DENTAL LABORATORY, INC.1269
ZIA IMPERIAL LOGISTIC SERVICES1384
ZIA CONSULTANTS LIMITED LIABILITY COMPANY1398
ZIA BUS SALES INCORPORATED1400
ZHAO HONGSHEN1472

(rows: 952422, time: 4.7s, 1GB processed, job: job_fxH4-4V6yvl71wXrttJRtwR-y3w)

Get relabeled table


In [23]:
%%bq query
select
b.row_num as user_number,
c.row_num as item_number,
#vendorname as item,
#a.mod_agency as user,
count(*) as count
from `fiery-set-171213.vrec.usa_spending_all` as a
join `fiery-set-171213.vrec.usa_mod_agency_relabel` as b
on a.mod_agency = b.mod_agency
join `fiery-set-171213.vrec.usa_vendorname_relabel` as c
on a.vendorname = c.vendorname
where vendorcountrycode in ('UNITED STATES', 'USA: UNITED STATES OF AMERICA')
and contractingofficerbusinesssizedetermination in ('O: OTHER THAN SMALL BUSINESS', 'S: SMALL BUSINESS')
and a.mod_agency not in ("")
group by 1,2
order by count DESC
limit 10000


Out[23]:
user_numberitem_numbercount
16895988676166
143348288469971
16795270419655
16795221318862
16306804262224
16895987243671
178370992239677
16190617218145
178370994217610
143506505210493
161370992207995
143475028190245
143319130177307
143530873168934
143475027168907
143828160155104
142348288154628
16395476142845
16430916123212
16125565111677
150795276108281
14341327106597
143602245103172
14362324782619
1679527781609

(rows: 100, time: 5.7s, 5GB processed, job: job_-WNvK1vVuBDsgQhDRjQDTR7u-nw)