In [1]:
from io import StringIO
import requests
import json
import pandas as pd
import re
%matplotlib inline
In [2]:
# The code was removed by DSX for sharing.
In [3]:
df_data_2.head(10)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-3-39c82ccbdf92> in <module>()
----> 1 df_data_2.head(10)
NameError: name 'df_data_2' is not defined
In [4]:
df_data_2[['CASE_STATUS', 'SOC_NAME']]
#use this to make a chart with two indexes
Out[4]:
CASE_STATUS
SOC_NAME
0
CERTIFIED-WITHDRAWN
BIOCHEMISTS AND BIOPHYSICISTS
1
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
2
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
3
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
4
WITHDRAWN
CHIEF EXECUTIVES
5
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
6
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
7
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
8
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
9
WITHDRAWN
CHIEF EXECUTIVES
10
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
11
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
12
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
13
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
14
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
15
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
16
CERTIFIED-WITHDRAWN
FINANCIAL MANAGERS
17
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
18
CERTIFIED
CHIEF EXECUTIVES
19
CERTIFIED
CHIEF EXECUTIVES
20
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
21
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
22
CERTIFIED
CHIEF EXECUTIVES
23
CERTIFIED
CHIEF EXECUTIVES
24
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
25
CERTIFIED
CHIEF EXECUTIVES
26
CERTIFIED-WITHDRAWN
CHIEF EXECUTIVES
27
CERTIFIED
CHIEF EXECUTIVES
28
CERTIFIED
CHIEF EXECUTIVES
29
CERTIFIED
CHIEF EXECUTIVES
...
...
...
3002428
WITHDRAWN
Physical Therapists
3002429
WITHDRAWN
Physical Therapists
3002430
WITHDRAWN
Computer Systems Analysts
3002431
WITHDRAWN
Computer Support Specialists
3002432
WITHDRAWN
Computer Systems Analysts
3002433
WITHDRAWN
Biochemists and Biophysicists
3002434
WITHDRAWN
Computer Systems Analysts
3002435
WITHDRAWN
Computer Programmers
3002436
WITHDRAWN
Computer Programmers
3002437
WITHDRAWN
Health Specialties Teachers, Postsecondary
3002438
WITHDRAWN
Database Administrators
3002439
WITHDRAWN
Biological Scientists, All Other
3002440
WITHDRAWN
Software Developers, Applications
3002441
WITHDRAWN
Materials Scientists
3002442
WITHDRAWN
Market Research Analysts and Marketing Special...
3002443
WITHDRAWN
Commercial and Industrial Designers
3002444
WITHDRAWN
Commercial and Industrial Designers
3002445
NaN
NaN
3002446
NaN
NaN
3002447
NaN
NaN
3002448
NaN
NaN
3002449
NaN
NaN
3002450
NaN
NaN
3002451
NaN
NaN
3002452
NaN
NaN
3002453
NaN
NaN
3002454
NaN
NaN
3002455
NaN
NaN
3002456
NaN
NaN
3002457
NaN
NaN
3002458 rows × 2 columns
In [5]:
df_data_2['SOC_NAME'].value_counts()
Out[5]:
Computer Systems Analysts 291170
Computer Programmers 226574
SOFTWARE DEVELOPERS, APPLICATIONS 221783
COMPUTER SYSTEMS ANALYSTS 215353
Software Developers, Applications 192933
COMPUTER PROGRAMMERS 171972
COMPUTER OCCUPATIONS, ALL OTHER 108555
Computer Occupations, All Other 69315
Software Developers, Systems Software 44500
SOFTWARE DEVELOPERS, SYSTEMS SOFTWARE 39356
Management Analysts 38756
Financial Analysts 34141
Computer Software Engineers, Applications 33325
Accountants and Auditors 30708
MANAGEMENT ANALYSTS 28117
Mechanical Engineers 26219
Computer Occupations, All Other* 25628
Electronics Engineers, Except Computer 23323
Electrical Engineers 22162
Market Research Analysts and Marketing Specialists 22158
ACCOUNTANTS AND AUDITORS 22114
Physicians and Surgeons, All Other 22010
Database Administrators 21937
NETWORK AND COMPUTER SYSTEMS ADMINISTRATORS 21642
FINANCIAL ANALYSTS 19399
Operations Research Analysts 19171
Network and Computer Systems Administrators 18445
MECHANICAL ENGINEERS 18017
COMPUTER SYSTEMS ANALYST 17426
DATABASE ADMINISTRATORS 17045
...
ATMOSPHERIC, EARTH, MARINE, AND SPACE SCIENCES TEACHERS, POSTSECONDARY 1
FIRST-LINE SUPERVISORS OF FARMING, FISHING, AND FO 1
CHMEICAL ENGINEERS 1
INFORMATION SECURITY ANALYST 1
COMPUTER OCCUPATIONS/ALL OTHER 1
NANOTECHNOLOGY ENGINEERING TECHNOLOGISTS 1
COMPUTER OCCUPATION, ALL OTHERS 1
HARDWARD ENGINEERS 1
FINANCE MANAGERS 1
HEALTH DIAGNOSING AND TREATING PRACTITIONERS, ALL OTHER 1
ELECTRICAL ENGINEERING TECHNICIANS 1
MARKET RESEARCH ANALYTICS AND MARKETING SPECIALIST 1
ELECTRICAL ENGINEERINGS 1
BIOCHEMISTS OR BIOPHYSICISTS 1
CREDIT AUTHORIZERS, CHECKERS, AND CLERKS 1
COUNTER ATTENDANTS, CAFETERIA, FOOD CONCESSION, 1
SOFTWARE DEVELOPERS, APPLCIATIONS 1
COMPUTER SYSTEMS ANALYSTSS 1
SALESREPRESENTATIVES,WHOLESALEMANUFACTURING,TECHNI 1
PURCHASING AGENTS, EXCEPT WHOLESALE 1
SECURITIES, COMMODITIES AND SALES AGENTS 1
UROLOGISTS 1
DESIGNER, ALL OTHER 1
LIFE SCIENTISTS 1
SOFTWARE DEVELOPERS, APPLICATIONS, NONR&D 1
TITLE EXAMINERS, ABSTRACTORS, AND SEARCHERS 1
PROPERTY,REAL ESTATE,COMMUNITY ASSOCIATION MANAGER 1
Entertainment Attendants and Related Workers, All 1
BUS AND TRUCK MECHANICS AND DIESEL ENGINE 1
Hazardous Materials Removal Workers 1
Name: SOC_NAME, dtype: int64
In [6]:
df_data_2['SOC_NAME'].value_counts().count()
Out[6]:
2132
In [66]:
cleandata1=df_data_2.copy()
In [67]:
cleandata1['SOC_NAME']=df_data_2['SOC_NAME'].str.lower()
cleandata1['EMPLOYER_NAME']=df_data_2['EMPLOYER_NAME'].str.lower()
cleandata1['JOB_TITLE']=df_data_2['JOB_TITLE'].str.lower()
Here, we transform some strings to lowercase. This is because there are duplicate entries in the dataset which in both upper and lower. This increases redundancy
In [68]:
cleandata1['SOC_NAME'].value_counts()
Out[68]:
computer systems analysts 506523
software developers, applications 414716
computer programmers 398546
computer occupations, all other 177870
software developers, systems software 83856
management analysts 66873
financial analysts 53540
accountants and auditors 52822
mechanical engineers 44236
network and computer systems administrators 40087
database administrators 38982
market research analysts and marketing specialists 37737
electronics engineers, except computer 36574
operations research analysts 34260
electrical engineers 34108
physicians and surgeons, all other 33526
computer software engineers, applications 33387
computer and information systems managers 27536
computer occupations, all other* 26254
medical scientists, except epidemiologists 26159
physical therapists 21994
biochemists and biophysicists 21245
industrial engineers 19370
computer systems analyst 17426
statisticians 17101
biological scientists, all other 16367
marketing managers 16310
civil engineers 15970
web developers 15000
internists, general 13367
...
helpers--electricians 1
industrial designers 1
logistics manager 1
electronics engineers, expert computer 1
hardward engineers 1
soil and plant scientist 1
mechanical engineers, r&d (acwia only) 1
timing device assemblers and adjusters 1
telecommunications line installers and repairers 1
biological science teachers 1
dentist 1
computer systems engineers/arquitects 1
secondary school teachers, 1
computer systems anaysts 1
counseling psychologist 1
software engineer, senior 1
health diagnosing & treating practitioners, all ot 1
set and exhibit engineers 1
helpers, construction trades, all other 1
datebase administrators 1
commercial and insdistrial designers 1
software developers, appllications 1
computer occupations,all other 1
foreign language and literature teachers, postsecondary 1
dentists 1
railroad conductors and yardmasters 1
computer occupations, all other: information techn 1
computer network architect 1
criminal justice and law enforcement teachers, postsecondary 1
medical scientists, except epidemiologist 1
Name: SOC_NAME, dtype: int64
There is still alot of redundancy we can exploit. We can generalize these strings remove specalized strings into more general form. For example 'software engineer, senior' needs to be reduced to 'software engineer'. This also applies to the other columns with string attributes.
In [70]:
cleandata1['SOC_NAME'].value_counts().count()
Out[70]:
1585
We have reduced the number of names down to 1585
In [71]:
reducedf = pd.DataFrame({'SOC_NAME': cleandata1['SOC_NAME'].value_counts().index, 'Count':cleandata1['SOC_NAME'].value_counts().values})
#df['Counts'] = df.groupby(['SOC_NAME'])['Count'].transform('count') #I don't remember what I was trying to do here.
#df = df.set_index(['SOC_NAME'])
reducedf
Out[71]:
Count
SOC_NAME
0
506523
computer systems analysts
1
414716
software developers, applications
2
398546
computer programmers
3
177870
computer occupations, all other
4
83856
software developers, systems software
5
66873
management analysts
6
53540
financial analysts
7
52822
accountants and auditors
8
44236
mechanical engineers
9
40087
network and computer systems administrators
10
38982
database administrators
11
37737
market research analysts and marketing special...
12
36574
electronics engineers, except computer
13
34260
operations research analysts
14
34108
electrical engineers
15
33526
physicians and surgeons, all other
16
33387
computer software engineers, applications
17
27536
computer and information systems managers
18
26254
computer occupations, all other*
19
26159
medical scientists, except epidemiologists
20
21994
physical therapists
21
21245
biochemists and biophysicists
22
19370
industrial engineers
23
17426
computer systems analyst
24
17101
statisticians
25
16367
biological scientists, all other
26
16310
marketing managers
27
15970
civil engineers
28
15000
web developers
29
13367
internists, general
...
...
...
1555
1
helpers--electricians
1556
1
industrial designers
1557
1
logistics manager
1558
1
electronics engineers, expert computer
1559
1
hardward engineers
1560
1
soil and plant scientist
1561
1
mechanical engineers, r&d (acwia only)
1562
1
timing device assemblers and adjusters
1563
1
telecommunications line installers and repairers
1564
1
biological science teachers
1565
1
dentist
1566
1
computer systems engineers/arquitects
1567
1
secondary school teachers,
1568
1
computer systems anaysts
1569
1
counseling psychologist
1570
1
software engineer, senior
1571
1
health diagnosing & treating practitioners, al...
1572
1
set and exhibit engineers
1573
1
helpers, construction trades, all other
1574
1
datebase administrators
1575
1
commercial and insdistrial designers
1576
1
software developers, appllications
1577
1
computer occupations,all other
1578
1
foreign language and literature teachers, post...
1579
1
dentists
1580
1
railroad conductors and yardmasters
1581
1
computer occupations, all other: information t...
1582
1
computer network architect
1583
1
criminal justice and law enforcement teachers,...
1584
1
medical scientists, except epidemiologist
1585 rows × 2 columns
In [72]:
reducedf['Name1'] = ''
In [73]:
reducedf
Out[73]:
Count
SOC_NAME
Name1
0
506523
computer systems analysts
1
414716
software developers, applications
2
398546
computer programmers
3
177870
computer occupations, all other
4
83856
software developers, systems software
5
66873
management analysts
6
53540
financial analysts
7
52822
accountants and auditors
8
44236
mechanical engineers
9
40087
network and computer systems administrators
10
38982
database administrators
11
37737
market research analysts and marketing special...
12
36574
electronics engineers, except computer
13
34260
operations research analysts
14
34108
electrical engineers
15
33526
physicians and surgeons, all other
16
33387
computer software engineers, applications
17
27536
computer and information systems managers
18
26254
computer occupations, all other*
19
26159
medical scientists, except epidemiologists
20
21994
physical therapists
21
21245
biochemists and biophysicists
22
19370
industrial engineers
23
17426
computer systems analyst
24
17101
statisticians
25
16367
biological scientists, all other
26
16310
marketing managers
27
15970
civil engineers
28
15000
web developers
29
13367
internists, general
...
...
...
...
1555
1
helpers--electricians
1556
1
industrial designers
1557
1
logistics manager
1558
1
electronics engineers, expert computer
1559
1
hardward engineers
1560
1
soil and plant scientist
1561
1
mechanical engineers, r&d (acwia only)
1562
1
timing device assemblers and adjusters
1563
1
telecommunications line installers and repairers
1564
1
biological science teachers
1565
1
dentist
1566
1
computer systems engineers/arquitects
1567
1
secondary school teachers,
1568
1
computer systems anaysts
1569
1
counseling psychologist
1570
1
software engineer, senior
1571
1
health diagnosing & treating practitioners, al...
1572
1
set and exhibit engineers
1573
1
helpers, construction trades, all other
1574
1
datebase administrators
1575
1
commercial and insdistrial designers
1576
1
software developers, appllications
1577
1
computer occupations,all other
1578
1
foreign language and literature teachers, post...
1579
1
dentists
1580
1
railroad conductors and yardmasters
1581
1
computer occupations, all other: information t...
1582
1
computer network architect
1583
1
criminal justice and law enforcement teachers,...
1584
1
medical scientists, except epidemiologist
1585 rows × 3 columns
In [74]:
reducedf.iloc[3]['Count'] #example of accessing a location
Out[74]:
177870
In [75]:
%%timeit
for index, row in reducedf.iterrows():
names = row['SOC_NAME'].split(",")
if(names[0].endswith('*')):
reducedf.set_value([index],['Name1'],(names[0][:-1]))
if not (names[0].endswith('s')):
reducedf.set_value([index],['Name1'],(names[0]+'s'))
else:
reducedf.set_value([index],['Name1'],names[0])
1 loop, best of 3: 1.52 s per loop
In [76]:
reducedf
Out[76]:
Count
SOC_NAME
Name1
0
506523
computer systems analysts
computer systems analysts
1
414716
software developers, applications
software developers
2
398546
computer programmers
computer programmers
3
177870
computer occupations, all other
computer occupations
4
83856
software developers, systems software
software developers
5
66873
management analysts
management analysts
6
53540
financial analysts
financial analysts
7
52822
accountants and auditors
accountants and auditors
8
44236
mechanical engineers
mechanical engineers
9
40087
network and computer systems administrators
network and computer systems administrators
10
38982
database administrators
database administrators
11
37737
market research analysts and marketing special...
market research analysts and marketing special...
12
36574
electronics engineers, except computer
electronics engineers
13
34260
operations research analysts
operations research analysts
14
34108
electrical engineers
electrical engineers
15
33526
physicians and surgeons, all other
physicians and surgeons
16
33387
computer software engineers, applications
computer software engineers
17
27536
computer and information systems managers
computer and information systems managers
18
26254
computer occupations, all other*
computer occupations
19
26159
medical scientists, except epidemiologists
medical scientists
20
21994
physical therapists
physical therapists
21
21245
biochemists and biophysicists
biochemists and biophysicists
22
19370
industrial engineers
industrial engineers
23
17426
computer systems analyst
computer systems analysts
24
17101
statisticians
statisticians
25
16367
biological scientists, all other
biological scientists
26
16310
marketing managers
marketing managers
27
15970
civil engineers
civil engineers
28
15000
web developers
web developers
29
13367
internists, general
internists
...
...
...
...
1555
1
helpers--electricians
helpers--electricians
1556
1
industrial designers
industrial designers
1557
1
logistics manager
logistics managers
1558
1
electronics engineers, expert computer
electronics engineers
1559
1
hardward engineers
hardward engineers
1560
1
soil and plant scientist
soil and plant scientists
1561
1
mechanical engineers, r&d (acwia only)
mechanical engineers
1562
1
timing device assemblers and adjusters
timing device assemblers and adjusters
1563
1
telecommunications line installers and repairers
telecommunications line installers and repairers
1564
1
biological science teachers
biological science teachers
1565
1
dentist
dentists
1566
1
computer systems engineers/arquitects
computer systems engineers/arquitects
1567
1
secondary school teachers,
secondary school teachers
1568
1
computer systems anaysts
computer systems anaysts
1569
1
counseling psychologist
counseling psychologists
1570
1
software engineer, senior
software engineers
1571
1
health diagnosing & treating practitioners, al...
health diagnosing & treating practitioners
1572
1
set and exhibit engineers
set and exhibit engineers
1573
1
helpers, construction trades, all other
helpers
1574
1
datebase administrators
datebase administrators
1575
1
commercial and insdistrial designers
commercial and insdistrial designers
1576
1
software developers, appllications
software developers
1577
1
computer occupations,all other
computer occupations
1578
1
foreign language and literature teachers, post...
foreign language and literature teachers
1579
1
dentists
dentists
1580
1
railroad conductors and yardmasters
railroad conductors and yardmasters
1581
1
computer occupations, all other: information t...
computer occupations
1582
1
computer network architect
computer network architects
1583
1
criminal justice and law enforcement teachers,...
criminal justice and law enforcement teachers
1584
1
medical scientists, except epidemiologist
medical scientists
1585 rows × 3 columns
In [79]:
cleandata1['SOC_NAME'].value_counts().count()
Out[79]:
1585
In [98]:
(cleandata1.loc[(cleandata1['SOC_NAME']=='software developers, appllications')]) #an example of a query
Out[98]:
Unnamed: 0
CASE_STATUS
EMPLOYER_NAME
SOC_NAME
JOB_TITLE
FULL_TIME_POSITION
PREVAILING_WAGE
YEAR
WORKSITE
lon
lat
894328
894329
CERTIFIED
meridiansoft, inc.
software developers, appllications
software developer applications
Y
62421.0
2015.0
LEWIS CENTER, OHIO
-83.010099
40.198388
This person messed up the SOC_NAME
In [80]:
reducedf['Name1'].value_counts()
Out[80]:
software developers 37
secondary school teachers 14
securities 14
sales representatives 13
electronics engineers 13
computer occupations 11
computer systems analysts 10
computer software engineers 10
elementary school teachers 10
mechanical engineers 9
special education teachers 8
electrical engineers 7
education administrators 7
computer programmers 7
medical scientists 7
middle school teachers 7
cooks 6
foreign language and literature teachers 6
civil engineers 6
computer hardware engineers 5
atmospherics 5
dentists 5
computer network architects 5
propertys 4
computer and information systems managers 4
internists 4
criminal justice and law enforcement teachers 4
healthcare practitioners and technical workers 4
electrical and electronics repairers 4
business operations specialists 4
..
robotics engineers 1
detectives and criminal investigators 1
environmental science teachers 1
psychiatric aides 1
compuer systems analysts 1
audio-visual collections specialists 1
first-line supervisors and manager/supervisors - ls 1
jewelers 1
marketing specialists 1
medical seicentists 1
first-line supervisors/managers of production ands 1
landscape architects 1
heavy and tractor-trailer truck drivers 1
singers 1
software developers applications 1
first-line supervisors of construction trades and s 1
musical instrument repairers and tuners 1
english language & literature teachers 1
operation research analysts 1
respiratory therapy technicians 1
skincare specialists 1
cardiovascular technologists and technicians 1
nuclear engineers 1
first-line supervisors/managers of personal servics 1
commodities 1
project managers 1
healthcare social workers 1
choreographers 1
geneticists 1
comuter occupations 1
Name: Name1, dtype: int64
In [81]:
reducedf['Name1'].value_counts().count()
Out[81]:
1134
We have now cut the number of names in half from the original number.
In [82]:
reducedf.sort_values(['Name1'])
Out[82]:
Count
SOC_NAME
Name1
1286
1
13-2011.01
13-2011.01s
1096
2
15-1121
15-1121s
1243
1
15-1132
15-1132s
1272
1
15-1199.01 sw quality assurance engnrs & testers
15-1199.01 sw quality assurance engnrs & testers
1354
1
15-1199.01
15-1199.01s
1393
1
15-1199.08, business intelligence analysts
15-1199.08s
1314
1
17-2051
17-2051s
1552
1
17-2072
17-2072s
1461
1
27-3031
27-3031s
1426
1
29-1064.00-obstetricians and gynecologists
29-1064.00-obstetricians and gynecologists
1319
1
<font><font>carpinteros</font></font>
<font><font>carpinteros</font></font>s
1071
2
able seamen
able seamens
1026
2
accountant
accountants
74
3958
accountants
accountants
7
52822
accountants and auditors
accountants and auditors
564
26
actors
actors
80
3486
actuaries
actuaries
1539
1
acupuncturists
acupuncturists
882
4
adhesive bonding machine operators and tenders
adhesive bonding machine operators and tenders
821
6
administrative law judges, adjudicators, and h...
administrative law judges
152
1361
administrative services managers
administrative services managers
323
230
adult basic and secondary education and litera...
adult basic and secondary education and litera...
395
116
adult basic and secondary education and literacy
adult basic and secondary education and literacys
1113
2
adult basic second educ and lit teach and instruc
adult basic second educ and lit teach and inst...
485
50
adult literacy, remedial education, and ged te...
adult literacys
1322
1
advertising and promortions managers
advertising and promortions managers
94
2925
advertising and promotions managers
advertising and promotions managers
1126
2
advertising and promotions manager
advertising and promotions managers
342
195
advertising sales agents
advertising sales agents
351
179
aerospace engineering and operations technicians
aerospace engineering and operations technicians
...
...
...
...
1388
1
urologists
urologists
1107
2
ushers, lobby attendants, and ticket takers
ushers
901
4
validation engineer
validation engineers
422
90
validation engineers
validation engineers
113
2398
veterinarians
veterinarians
599
19
veterinary assistants and laboratory animal ca...
veterinary assistants and laboratory animal ca...
596
20
veterinary assistants and laboratory animal
veterinary assistants and laboratory animals
426
87
veterinary technologists and technicians
veterinary technologists and technicians
1502
1
vice president of investor relations
vice president of investor relations
958
3
video game designers
video game designers
1016
3
vocational education teachers, middle school
vocational education teachers
626
16
vocational education teachers, secondary school
vocational education teachers
331
215
vocational education teachers, postsecondary
vocational education teachers
492
47
vocational education teachers postsecondary
vocational education teachers postsecondarys
639
16
watch repairers
watch repairers
878
4
water and wastewater treatment plant and syste...
water and wastewater treatment plant and syste...
1008
3
water/wastewater engineers
water/wastewater engineers
437
81
web administrators
web administrators
1052
2
web developer
web developers
28
15000
web developers
web developers
1043
2
weighers, measurers, checkers, and samplers,
weighers
638
16
welders, cutters, solderers, and brazers
welders
1204
1
welding, soldering, and brazing machine setters,
weldings
1237
1
welding, soldering, and brazing machine setter...
weldings
205
746
wholesale and retail buyers, except farm products
wholesale and retail buyers
1259
1
wind energy engineers
wind energy engineers
1296
1
wind turbine service technicians
wind turbine service technicians
708
10
woodworkers, all other
woodworkers
181
981
writers and authors
writers and authors
237
557
zoologists and wildlife biologists
zoologists and wildlife biologists
1585 rows × 3 columns
In [83]:
reducedf['Name2'] = ""
In [84]:
%%timeit
regex = re.compile('[^a-z\s]')
for index, row in reducedf.iterrows():
reducedf.set_value([index],['Name2'],(regex.sub('', row['Name1'])))
1 loop, best of 3: 1.52 s per loop
In [85]:
reducedf.sort_values(['Name1'])
Out[85]:
Count
SOC_NAME
Name1
Name2
1286
1
13-2011.01
13-2011.01s
s
1096
2
15-1121
15-1121s
s
1243
1
15-1132
15-1132s
s
1272
1
15-1199.01 sw quality assurance engnrs & testers
15-1199.01 sw quality assurance engnrs & testers
sw quality assurance engnrs testers
1354
1
15-1199.01
15-1199.01s
s
1393
1
15-1199.08, business intelligence analysts
15-1199.08s
s
1314
1
17-2051
17-2051s
s
1552
1
17-2072
17-2072s
s
1461
1
27-3031
27-3031s
s
1426
1
29-1064.00-obstetricians and gynecologists
29-1064.00-obstetricians and gynecologists
obstetricians and gynecologists
1319
1
<font><font>carpinteros</font></font>
<font><font>carpinteros</font></font>s
fontfontcarpinterosfontfonts
1071
2
able seamen
able seamens
able seamens
1026
2
accountant
accountants
accountants
74
3958
accountants
accountants
accountants
7
52822
accountants and auditors
accountants and auditors
accountants and auditors
564
26
actors
actors
actors
80
3486
actuaries
actuaries
actuaries
1539
1
acupuncturists
acupuncturists
acupuncturists
882
4
adhesive bonding machine operators and tenders
adhesive bonding machine operators and tenders
adhesive bonding machine operators and tenders
821
6
administrative law judges, adjudicators, and h...
administrative law judges
administrative law judges
152
1361
administrative services managers
administrative services managers
administrative services managers
323
230
adult basic and secondary education and litera...
adult basic and secondary education and litera...
adult basic and secondary education and litera...
395
116
adult basic and secondary education and literacy
adult basic and secondary education and literacys
adult basic and secondary education and literacys
1113
2
adult basic second educ and lit teach and instruc
adult basic second educ and lit teach and inst...
adult basic second educ and lit teach and inst...
485
50
adult literacy, remedial education, and ged te...
adult literacys
adult literacys
1322
1
advertising and promortions managers
advertising and promortions managers
advertising and promortions managers
94
2925
advertising and promotions managers
advertising and promotions managers
advertising and promotions managers
1126
2
advertising and promotions manager
advertising and promotions managers
advertising and promotions managers
342
195
advertising sales agents
advertising sales agents
advertising sales agents
351
179
aerospace engineering and operations technicians
aerospace engineering and operations technicians
aerospace engineering and operations technicians
...
...
...
...
...
1388
1
urologists
urologists
urologists
1107
2
ushers, lobby attendants, and ticket takers
ushers
ushers
901
4
validation engineer
validation engineers
validation engineers
422
90
validation engineers
validation engineers
validation engineers
113
2398
veterinarians
veterinarians
veterinarians
599
19
veterinary assistants and laboratory animal ca...
veterinary assistants and laboratory animal ca...
veterinary assistants and laboratory animal ca...
596
20
veterinary assistants and laboratory animal
veterinary assistants and laboratory animals
veterinary assistants and laboratory animals
426
87
veterinary technologists and technicians
veterinary technologists and technicians
veterinary technologists and technicians
1502
1
vice president of investor relations
vice president of investor relations
vice president of investor relations
958
3
video game designers
video game designers
video game designers
1016
3
vocational education teachers, middle school
vocational education teachers
vocational education teachers
626
16
vocational education teachers, secondary school
vocational education teachers
vocational education teachers
331
215
vocational education teachers, postsecondary
vocational education teachers
vocational education teachers
492
47
vocational education teachers postsecondary
vocational education teachers postsecondarys
vocational education teachers postsecondarys
639
16
watch repairers
watch repairers
watch repairers
878
4
water and wastewater treatment plant and syste...
water and wastewater treatment plant and syste...
water and wastewater treatment plant and syste...
1008
3
water/wastewater engineers
water/wastewater engineers
waterwastewater engineers
437
81
web administrators
web administrators
web administrators
1052
2
web developer
web developers
web developers
28
15000
web developers
web developers
web developers
1043
2
weighers, measurers, checkers, and samplers,
weighers
weighers
638
16
welders, cutters, solderers, and brazers
welders
welders
1204
1
welding, soldering, and brazing machine setters,
weldings
weldings
1237
1
welding, soldering, and brazing machine setter...
weldings
weldings
205
746
wholesale and retail buyers, except farm products
wholesale and retail buyers
wholesale and retail buyers
1259
1
wind energy engineers
wind energy engineers
wind energy engineers
1296
1
wind turbine service technicians
wind turbine service technicians
wind turbine service technicians
708
10
woodworkers, all other
woodworkers
woodworkers
181
981
writers and authors
writers and authors
writers and authors
237
557
zoologists and wildlife biologists
zoologists and wildlife biologists
zoologists and wildlife biologists
1585 rows × 4 columns
In [86]:
reducedf['Name2'].value_counts().count()
Out[86]:
1118
Striping charactors did not help much.
At this point investigated a spellchecker in python. Was not able to get something to work within enviroment.
I am now going to consider removing entries that are unique, with a count of one.
In [87]:
dfName2Check = pd.DataFrame({'Name2': reducedf['Name2'].value_counts().index, 'Count':reducedf['Name2'].value_counts().values})
In [88]:
dfName2Check
Out[88]:
Count
Name2
0
37
software developers
1
14
securities
2
14
secondary school teachers
3
13
sales representatives
4
13
electronics engineers
5
11
computer occupations
6
10
computer software engineers
7
10
computer systems analysts
8
10
elementary school teachers
9
9
mechanical engineers
10
8
s
11
8
special education teachers
12
7
electrical engineers
13
7
middle school teachers
14
7
computer programmers
15
7
medical scientists
16
7
education administrators
17
6
cooks
18
6
foreign language and literature teachers
19
6
civil engineers
20
5
atmospherics
21
5
dentists
22
5
computer network architects
23
5
computer hardware engineers
24
4
business operations specialists
25
4
healthcare practitioners and technical workers
26
4
physicians and surgeons
27
4
computer and information systems managers
28
4
compensations
29
4
biological scientists
...
...
...
1088
1
morticians
1089
1
physical therapist aides
1090
1
electrical enginers
1091
1
detectives and criminal investigators
1092
1
environmental science teachers
1093
1
psychiatric aides
1094
1
marketing specialists
1095
1
medical seicentists
1096
1
landscape architects
1097
1
singers
1098
1
biochemist biophysicists
1099
1
robotics engineers
1100
1
musical instrument repairers and tuners
1101
1
firstline supervisorsmanagers of helpers
1102
1
operation research analysts
1103
1
respiratory therapy technicians
1104
1
skincare specialists
1105
1
cardiovascular technologists and technicians
1106
1
copmuter programmers
1107
1
firstline supervisors of personal service workers
1108
1
nuclear engineers
1109
1
clinicals
1110
1
commodities
1111
1
project managers
1112
1
healthcare social workers
1113
1
urologists
1114
1
orthodontists
1115
1
stonemasons
1116
1
tile and marble setters
1117
1
comuter occupations
1118 rows × 2 columns
Theres a problem, I am not getting a real representation of the occurence of names in the data. I now need to do something where I can get the actual number of occurences.
In [89]:
cleandata1
Out[89]:
Unnamed: 0
CASE_STATUS
EMPLOYER_NAME
SOC_NAME
JOB_TITLE
FULL_TIME_POSITION
PREVAILING_WAGE
YEAR
WORKSITE
lon
lat
0
1
CERTIFIED-WITHDRAWN
university of michigan
biochemists and biophysicists
postdoctoral research fellow
N
36067.00
2016.0
ANN ARBOR, MICHIGAN
-83.743038
42.280826
1
2
CERTIFIED-WITHDRAWN
goodman networks, inc.
chief executives
chief operating officer
Y
242674.00
2016.0
PLANO, TEXAS
-96.698886
33.019843
2
3
CERTIFIED-WITHDRAWN
ports america group, inc.
chief executives
chief process officer
Y
193066.00
2016.0
JERSEY CITY, NEW JERSEY
-74.077642
40.728158
3
4
CERTIFIED-WITHDRAWN
gates corporation, a wholly-owned subsidiary o...
chief executives
regional presiden, americas
Y
220314.00
2016.0
DENVER, COLORADO
-104.990251
39.739236
4
5
WITHDRAWN
peabody investments corp.
chief executives
president mongolia and india
Y
157518.40
2016.0
ST. LOUIS, MISSOURI
-90.199404
38.627003
5
6
CERTIFIED-WITHDRAWN
burger king corporation
chief executives
executive v p, global development and presiden...
Y
225000.00
2016.0
MIAMI, FLORIDA
-80.191790
25.761680
6
7
CERTIFIED-WITHDRAWN
bt and mk energy and commodities
chief executives
chief operating officer
Y
91021.00
2016.0
HOUSTON, TEXAS
-95.369803
29.760427
7
8
CERTIFIED-WITHDRAWN
globo mobile technologies, inc.
chief executives
chief operations officer
Y
150000.00
2016.0
SAN JOSE, CALIFORNIA
-121.886329
37.338208
8
9
CERTIFIED-WITHDRAWN
esi companies inc.
chief executives
president
Y
127546.00
2016.0
MEMPHIS, TEXAS
NaN
NaN
9
10
WITHDRAWN
lessard international llc
chief executives
president
Y
154648.00
2016.0
VIENNA, VIRGINIA
-77.265260
38.901222
10
11
CERTIFIED-WITHDRAWN
h.j. heinz company
chief executives
chief information officer, heinz north america
Y
182978.00
2016.0
PITTSBURGH, PENNSYLVANIA
-79.995886
40.440625
11
12
CERTIFIED-WITHDRAWN
dow corning corporation
chief executives
vice president and chief human resources officer
Y
163717.00
2016.0
MIDLAND, MICHIGAN
-84.247212
43.615583
12
13
CERTIFIED-WITHDRAWN
acushnet company
chief executives
treasurer and coo
Y
203860.80
2016.0
FAIRHAVEN, MASSACHUSETTS
NaN
NaN
13
14
CERTIFIED-WITHDRAWN
biocair, inc.
chief executives
chief commercial officer
Y
252637.00
2016.0
MIAMI, FLORIDA
-80.191790
25.761680
14
15
CERTIFIED-WITHDRAWN
newmont mining corporation
chief executives
board member
Y
105914.00
2016.0
GREENWOOD VILLAGE, COLORADO
-104.950814
39.617210
15
16
CERTIFIED-WITHDRAWN
vricon, inc.
chief executives
chief financial officer
Y
153046.00
2016.0
STERLING, VIRGINIA
-77.429130
39.006699
16
17
CERTIFIED-WITHDRAWN
cardiac science corporation
financial managers
vice president of finance
Y
90834.00
2016.0
WAUKESHA, WISCONSIN
-88.231481
43.011678
17
18
CERTIFIED-WITHDRAWN
westfield corporation
chief executives
general manager, operations
Y
164050.00
2016.0
LOS ANGELES, CALIFORNIA
-118.243685
34.052234
18
19
CERTIFIED
quicklogix llc
chief executives
ceo
Y
187200.00
2016.0
SANTA CLARA, CALIFORNIA
-121.955236
37.354108
19
20
CERTIFIED
mcchrystal group, llc
chief executives
president, northeast region
Y
241842.00
2016.0
ALEXANDRIA, VIRGINIA
-77.046921
38.804835
20
21
CERTIFIED-WITHDRAWN
cuddle barn, inc.
chief executives
chief operating officer (coo)
Y
117998.00
2016.0
COMMERCE, CALIFORNIA
-118.159793
34.000569
21
22
CERTIFIED-WITHDRAWN
westfield corporation
chief executives
general manager, operations
Y
164050.00
2016.0
LOS ANGELES, CALIFORNIA
-118.243685
34.052234
22
23
CERTIFIED
lomics, llc
chief executives
ceo
Y
99986.00
2016.0
SAN DIEGO, CALIFORNIA
-117.161084
32.715738
23
24
CERTIFIED
uc university high school education inc.
chief executives
chief financial officer
Y
99986.00
2016.0
CHULA VISTA, CALIFORNIA
-117.084196
32.640054
24
25
CERTIFIED-WITHDRAWN
vms communications llc
chief executives
chief operating officer
Y
159370.00
2016.0
MIAMI, FLORIDA
-80.191790
25.761680
25
26
CERTIFIED
quicklogix, inc.
chief executives
ceo
Y
187200.00
2016.0
SANTA CLARA, CALIFORNIA
-121.955236
37.354108
26
27
CERTIFIED-WITHDRAWN
foodessentials corporation
chief executives
chief executive officer
Y
130853.00
2016.0
CHICAGO, ILLINOIS
-87.629798
41.878114
27
28
CERTIFIED
hello inc.
chief executives
chief business officer
Y
215862.00
2016.0
SAN FRANCISCO, CALIFORNIA
-122.419415
37.774929
28
29
CERTIFIED
umbel corp
chief executives
vice president of engineering
Y
192088.00
2016.0
AUSTIN, TEXAS
-97.743061
30.267153
29
30
CERTIFIED
perspectives of freedom foundation, inc
chief executives
executive director
Y
95295.98
2016.0
WESTON, FLORIDA
-80.399775
26.100365
...
...
...
...
...
...
...
...
...
...
...
...
3002428
3002429
WITHDRAWN
avant healthcare professionals
physical therapists
physical therapist
Y
53601.60
2011.0
LEBANON, PENNSYLVANIA
NaN
NaN
3002429
3002430
WITHDRAWN
avant healthcare professionals
physical therapists
physical therapist
Y
53601.60
2011.0
LEBANON, PENNSYLVANIA
NaN
NaN
3002430
3002431
WITHDRAWN
trisync technologies, inc.
computer systems analysts
computer system analyst
Y
55245.00
2011.0
EDISON, NEW JERSEY
-74.412095
40.518715
3002431
3002432
WITHDRAWN
at last sportswear inc.
computer support specialists
computer support specialist
Y
36837.00
2011.0
SECAUCUS, NEW JERSEY
-74.056530
40.789545
3002432
3002433
WITHDRAWN
trisync technologies, inc.
computer systems analysts
computer system analyst
Y
55245.00
2011.0
EDISON, NEW JERSEY
-74.412095
40.518715
3002433
3002434
WITHDRAWN
the university of texas southwestern medical c...
biochemists and biophysicists
instructor
Y
36795.00
2011.0
DALLAS, TEXAS
-96.796988
32.776664
3002434
3002435
WITHDRAWN
trisync technologies, inc.
computer systems analysts
computer system analyst
Y
55245.00
2011.0
EDISON, NEW JERSEY
-74.412095
40.518715
3002435
3002436
WITHDRAWN
marlabs, inc
computer programmers
programmer/analyst
Y
77730.00
2011.0
DURHAM, NORTH CAROLINA
-78.898619
35.994033
3002436
3002437
WITHDRAWN
xtron software services, inc.
computer programmers
computer programmers
Y
89232.00
2011.0
SANTA CLARA, CALIFORNIA
-121.955236
37.354108
3002437
3002438
WITHDRAWN
university of mississippi medical center
health specialties teachers, postsecondary
assistant professor of anesthesiology
Y
34510.00
2011.0
JACKSON, MISSISSIPPI
-90.184810
32.298757
3002438
3002439
WITHDRAWN
canvas infotech, inc.
database administrators
data analyst
Y
53082.00
2011.0
PLEASANTON, CALIFORNIA
-121.874679
37.662431
3002439
3002440
WITHDRAWN
new york university
biological scientists, all other
adjunct associate professor
Y
37336.00
2011.0
NEW YORK, NEW YORK
-74.005941
40.712784
3002440
3002441
WITHDRAWN
oracle america, inc.
software developers, applications
software engineer (software developer 2)
Y
64800.00
2011.0
REDWOOD SHORES, CALIFORNIA
-122.245536
37.536413
3002441
3002442
WITHDRAWN
b & d dental corp.
materials scientists
materials scientist
N
70553.60
2011.0
WEST VALLEY, UTAH
-112.001050
40.691613
3002442
3002443
WITHDRAWN
medtech staffing & solutions, inc
market research analysts and marketing special...
market research analyst
Y
33800.00
2011.0
AKRON, OHIO
-81.519005
41.081445
3002443
3002444
WITHDRAWN
larsen & toubro limited
commercial and industrial designers
design engineer
Y
59800.00
2011.0
CHELMSFORD, MASSACHUSETTS
-71.367284
42.599814
3002444
3002445
WITHDRAWN
larsen & toubro limited
commercial and industrial designers
design engineer
Y
59800.00
2011.0
CHELMSFORD, MASSACHUSETTS
-71.367284
42.599814
3002445
3002446
NaN
NaN
NaN
NaN
NaN
NaN
NaN
BERKLEY HEIGHTS, NEW JERSEY
-74.431052
40.680873
3002446
3002447
NaN
NaN
NaN
NaN
NaN
NaN
NaN
SCHENECTADY , NEW YORK
-73.939569
42.814243
3002447
3002448
NaN
NaN
NaN
NaN
NaN
NaN
NaN
MOUTAIN VIEW, CALIFORNIA
-122.083851
37.386052
3002448
3002449
NaN
NaN
NaN
NaN
NaN
NaN
NaN
ST.PAUL, MINNESOTA
-93.089958
44.953703
3002449
3002450
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NEW TOWN, PENNSYLVANIA
-74.932260
40.228337
3002450
3002451
NaN
NaN
NaN
NaN
NaN
NaN
NaN
WESTMINISTER, COLORADO
-105.037205
39.836653
3002451
3002452
NaN
NaN
NaN
NaN
NaN
NaN
NaN
FREEMONT, CALIFORNIA
-121.988572
37.548270
3002452
3002453
NaN
NaN
NaN
NaN
NaN
NaN
NaN
LAVERGNE, TENNESSEE
-86.581939
36.015618
3002453
3002454
NaN
NaN
NaN
NaN
NaN
NaN
NaN
NYC, NEW YORK
-74.005941
40.712784
3002454
3002455
NaN
NaN
NaN
NaN
NaN
NaN
NaN
SOUTH LAKE, TEXAS
-97.134178
32.941236
3002455
3002456
NaN
NaN
NaN
NaN
NaN
NaN
NaN
CLINTON, NEW JERSEY
-74.909890
40.636768
3002456
3002457
NaN
NaN
NaN
NaN
NaN
NaN
NaN
OWINGS MILL, MARYLAND
-76.780253
39.419550
3002457
3002458
NaN
NaN
NaN
NaN
NaN
NaN
NaN
ALTANTA, GEORGIA
-84.387982
33.748995
3002458 rows × 11 columns
Going to go back to this df and create a copy. Will then over write the Soc_Name with the reduced name I have.
In [111]:
cleandata2 = cleandata1.copy()
In [130]:
name = cleandata2.iloc[3002440]['SOC_NAME'] #example of accessing a location
print(name)
newname = reducedf.loc[(reducedf['SOC_NAME']==name)]
newname1 = newname.iloc[0]['Name2']
print(newname1)
software developers, applications
Out[130]:
'software developers'
Test code to test algorithm
In [ ]:
for index, row in cleandata2.iterrows():
name = row['SOC_NAME']
newname = reducedf.loc[(reducedf['SOC_NAME']==name)]
newname1 = newname.iloc[0]['Name2']
cleandata2.set_value([index],['SOC_NAME'],newname1)
Here, we try to apply what we have reduced to the dataframe. We will see if we have made any progress, and what we can do next.
Content source: JonathonBeauregardII/Advanced-Data-Mining-Project-IBM
Similar notebooks: