In [1]:
from io import StringIO
import requests
import json
import pandas as pd
import re
%matplotlib inline

In [2]:
# The code was removed by DSX for sharing.

In [3]:
df_data_2.head(10)


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-3-39c82ccbdf92> in <module>()
----> 1 df_data_2.head(10)

NameError: name 'df_data_2' is not defined

In [4]:
df_data_2[['CASE_STATUS', 'SOC_NAME']]
#use this to make a chart with two indexes


Out[4]:
CASE_STATUS SOC_NAME
0 CERTIFIED-WITHDRAWN BIOCHEMISTS AND BIOPHYSICISTS
1 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
2 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
3 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
4 WITHDRAWN CHIEF EXECUTIVES
5 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
6 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
7 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
8 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
9 WITHDRAWN CHIEF EXECUTIVES
10 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
11 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
12 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
13 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
14 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
15 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
16 CERTIFIED-WITHDRAWN FINANCIAL MANAGERS
17 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
18 CERTIFIED CHIEF EXECUTIVES
19 CERTIFIED CHIEF EXECUTIVES
20 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
21 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
22 CERTIFIED CHIEF EXECUTIVES
23 CERTIFIED CHIEF EXECUTIVES
24 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
25 CERTIFIED CHIEF EXECUTIVES
26 CERTIFIED-WITHDRAWN CHIEF EXECUTIVES
27 CERTIFIED CHIEF EXECUTIVES
28 CERTIFIED CHIEF EXECUTIVES
29 CERTIFIED CHIEF EXECUTIVES
... ... ...
3002428 WITHDRAWN Physical Therapists
3002429 WITHDRAWN Physical Therapists
3002430 WITHDRAWN Computer Systems Analysts
3002431 WITHDRAWN Computer Support Specialists
3002432 WITHDRAWN Computer Systems Analysts
3002433 WITHDRAWN Biochemists and Biophysicists
3002434 WITHDRAWN Computer Systems Analysts
3002435 WITHDRAWN Computer Programmers
3002436 WITHDRAWN Computer Programmers
3002437 WITHDRAWN Health Specialties Teachers, Postsecondary
3002438 WITHDRAWN Database Administrators
3002439 WITHDRAWN Biological Scientists, All Other
3002440 WITHDRAWN Software Developers, Applications
3002441 WITHDRAWN Materials Scientists
3002442 WITHDRAWN Market Research Analysts and Marketing Special...
3002443 WITHDRAWN Commercial and Industrial Designers
3002444 WITHDRAWN Commercial and Industrial Designers
3002445 NaN NaN
3002446 NaN NaN
3002447 NaN NaN
3002448 NaN NaN
3002449 NaN NaN
3002450 NaN NaN
3002451 NaN NaN
3002452 NaN NaN
3002453 NaN NaN
3002454 NaN NaN
3002455 NaN NaN
3002456 NaN NaN
3002457 NaN NaN

3002458 rows × 2 columns


In [5]:
df_data_2['SOC_NAME'].value_counts()


Out[5]:
Computer Systems Analysts                                                 291170
Computer Programmers                                                      226574
SOFTWARE DEVELOPERS, APPLICATIONS                                         221783
COMPUTER SYSTEMS ANALYSTS                                                 215353
Software Developers, Applications                                         192933
COMPUTER PROGRAMMERS                                                      171972
COMPUTER OCCUPATIONS, ALL OTHER                                           108555
Computer Occupations, All Other                                            69315
Software Developers, Systems Software                                      44500
SOFTWARE DEVELOPERS, SYSTEMS SOFTWARE                                      39356
Management Analysts                                                        38756
Financial Analysts                                                         34141
Computer Software Engineers, Applications                                  33325
Accountants and Auditors                                                   30708
MANAGEMENT ANALYSTS                                                        28117
Mechanical Engineers                                                       26219
Computer Occupations, All Other*                                           25628
Electronics Engineers, Except Computer                                     23323
Electrical Engineers                                                       22162
Market Research Analysts and Marketing Specialists                         22158
ACCOUNTANTS AND AUDITORS                                                   22114
Physicians and Surgeons, All Other                                         22010
Database Administrators                                                    21937
NETWORK AND COMPUTER SYSTEMS ADMINISTRATORS                                21642
FINANCIAL ANALYSTS                                                         19399
Operations Research Analysts                                               19171
Network and Computer Systems Administrators                                18445
MECHANICAL ENGINEERS                                                       18017
COMPUTER SYSTEMS ANALYST                                                   17426
DATABASE ADMINISTRATORS                                                    17045
                                                                           ...  
ATMOSPHERIC, EARTH, MARINE, AND SPACE SCIENCES TEACHERS, POSTSECONDARY         1
FIRST-LINE SUPERVISORS OF FARMING, FISHING, AND FO                             1
CHMEICAL ENGINEERS                                                             1
INFORMATION SECURITY ANALYST                                                   1
COMPUTER OCCUPATIONS/ALL OTHER                                                 1
NANOTECHNOLOGY ENGINEERING TECHNOLOGISTS                                       1
COMPUTER OCCUPATION, ALL OTHERS                                                1
HARDWARD ENGINEERS                                                             1
FINANCE MANAGERS                                                               1
HEALTH DIAGNOSING AND TREATING PRACTITIONERS, ALL OTHER                        1
ELECTRICAL ENGINEERING TECHNICIANS                                             1
MARKET RESEARCH ANALYTICS AND MARKETING SPECIALIST                             1
ELECTRICAL ENGINEERINGS                                                        1
BIOCHEMISTS OR BIOPHYSICISTS                                                   1
CREDIT AUTHORIZERS, CHECKERS, AND CLERKS                                       1
COUNTER ATTENDANTS, CAFETERIA, FOOD CONCESSION,                                1
SOFTWARE DEVELOPERS, APPLCIATIONS                                              1
COMPUTER SYSTEMS ANALYSTSS                                                     1
SALESREPRESENTATIVES,WHOLESALEMANUFACTURING,TECHNI                             1
PURCHASING AGENTS, EXCEPT WHOLESALE                                            1
SECURITIES, COMMODITIES AND SALES AGENTS                                       1
UROLOGISTS                                                                     1
DESIGNER, ALL OTHER                                                            1
LIFE SCIENTISTS                                                                1
SOFTWARE DEVELOPERS, APPLICATIONS, NONR&AMP;D                                  1
TITLE EXAMINERS, ABSTRACTORS, AND SEARCHERS                                    1
PROPERTY,REAL ESTATE,COMMUNITY ASSOCIATION MANAGER                             1
Entertainment Attendants and Related Workers, All                              1
BUS AND TRUCK MECHANICS AND DIESEL ENGINE                                      1
Hazardous Materials Removal Workers                                            1
Name: SOC_NAME, dtype: int64

In [6]:
df_data_2['SOC_NAME'].value_counts().count()


Out[6]:
2132

In [66]:
cleandata1=df_data_2.copy()

In [67]:
cleandata1['SOC_NAME']=df_data_2['SOC_NAME'].str.lower()
cleandata1['EMPLOYER_NAME']=df_data_2['EMPLOYER_NAME'].str.lower()
cleandata1['JOB_TITLE']=df_data_2['JOB_TITLE'].str.lower()

Here, we transform some strings to lowercase. This is because there are duplicate entries in the dataset which in both upper and lower. This increases redundancy


In [68]:
cleandata1['SOC_NAME'].value_counts()


Out[68]:
computer systems analysts                                       506523
software developers, applications                               414716
computer programmers                                            398546
computer occupations, all other                                 177870
software developers, systems software                            83856
management analysts                                              66873
financial analysts                                               53540
accountants and auditors                                         52822
mechanical engineers                                             44236
network and computer systems administrators                      40087
database administrators                                          38982
market research analysts and marketing specialists               37737
electronics engineers, except computer                           36574
operations research analysts                                     34260
electrical engineers                                             34108
physicians and surgeons, all other                               33526
computer software engineers, applications                        33387
computer and information systems managers                        27536
computer occupations, all other*                                 26254
medical scientists, except epidemiologists                       26159
physical therapists                                              21994
biochemists and biophysicists                                    21245
industrial engineers                                             19370
computer systems analyst                                         17426
statisticians                                                    17101
biological scientists, all other                                 16367
marketing managers                                               16310
civil engineers                                                  15970
web developers                                                   15000
internists, general                                              13367
                                                                 ...  
helpers--electricians                                                1
industrial designers                                                 1
logistics manager                                                    1
electronics engineers, expert computer                               1
hardward engineers                                                   1
soil and plant scientist                                             1
mechanical engineers, r&d (acwia only)                               1
timing device assemblers and adjusters                               1
telecommunications line installers and repairers                     1
biological science teachers                                          1
dentist                                                              1
computer systems engineers/arquitects                                1
secondary school teachers,                                           1
computer systems anaysts                                             1
counseling psychologist                                              1
software engineer, senior                                            1
health diagnosing & treating practitioners, all ot                   1
set and exhibit engineers                                            1
helpers, construction trades, all other                              1
datebase administrators                                              1
commercial and insdistrial designers                                 1
software developers, appllications                                   1
computer occupations,all other                                       1
foreign language and literature teachers, postsecondary              1
dentists                                                             1
railroad conductors and yardmasters                                  1
computer occupations, all other: information techn                   1
computer network architect                                           1
criminal justice and law enforcement teachers, postsecondary         1
medical scientists, except epidemiologist                            1
Name: SOC_NAME, dtype: int64

There is still alot of redundancy we can exploit. We can generalize these strings remove specalized strings into more general form. For example 'software engineer, senior' needs to be reduced to 'software engineer'. This also applies to the other columns with string attributes.


In [70]:
cleandata1['SOC_NAME'].value_counts().count()


Out[70]:
1585

We have reduced the number of names down to 1585


In [71]:
reducedf = pd.DataFrame({'SOC_NAME': cleandata1['SOC_NAME'].value_counts().index, 'Count':cleandata1['SOC_NAME'].value_counts().values})
#df['Counts'] = df.groupby(['SOC_NAME'])['Count'].transform('count') #I don't remember what I was trying to do here.
#df = df.set_index(['SOC_NAME'])
reducedf


Out[71]:
Count SOC_NAME
0 506523 computer systems analysts
1 414716 software developers, applications
2 398546 computer programmers
3 177870 computer occupations, all other
4 83856 software developers, systems software
5 66873 management analysts
6 53540 financial analysts
7 52822 accountants and auditors
8 44236 mechanical engineers
9 40087 network and computer systems administrators
10 38982 database administrators
11 37737 market research analysts and marketing special...
12 36574 electronics engineers, except computer
13 34260 operations research analysts
14 34108 electrical engineers
15 33526 physicians and surgeons, all other
16 33387 computer software engineers, applications
17 27536 computer and information systems managers
18 26254 computer occupations, all other*
19 26159 medical scientists, except epidemiologists
20 21994 physical therapists
21 21245 biochemists and biophysicists
22 19370 industrial engineers
23 17426 computer systems analyst
24 17101 statisticians
25 16367 biological scientists, all other
26 16310 marketing managers
27 15970 civil engineers
28 15000 web developers
29 13367 internists, general
... ... ...
1555 1 helpers--electricians
1556 1 industrial designers
1557 1 logistics manager
1558 1 electronics engineers, expert computer
1559 1 hardward engineers
1560 1 soil and plant scientist
1561 1 mechanical engineers, r&d (acwia only)
1562 1 timing device assemblers and adjusters
1563 1 telecommunications line installers and repairers
1564 1 biological science teachers
1565 1 dentist
1566 1 computer systems engineers/arquitects
1567 1 secondary school teachers,
1568 1 computer systems anaysts
1569 1 counseling psychologist
1570 1 software engineer, senior
1571 1 health diagnosing & treating practitioners, al...
1572 1 set and exhibit engineers
1573 1 helpers, construction trades, all other
1574 1 datebase administrators
1575 1 commercial and insdistrial designers
1576 1 software developers, appllications
1577 1 computer occupations,all other
1578 1 foreign language and literature teachers, post...
1579 1 dentists
1580 1 railroad conductors and yardmasters
1581 1 computer occupations, all other: information t...
1582 1 computer network architect
1583 1 criminal justice and law enforcement teachers,...
1584 1 medical scientists, except epidemiologist

1585 rows × 2 columns


In [72]:
reducedf['Name1'] = ''

In [73]:
reducedf


Out[73]:
Count SOC_NAME Name1
0 506523 computer systems analysts
1 414716 software developers, applications
2 398546 computer programmers
3 177870 computer occupations, all other
4 83856 software developers, systems software
5 66873 management analysts
6 53540 financial analysts
7 52822 accountants and auditors
8 44236 mechanical engineers
9 40087 network and computer systems administrators
10 38982 database administrators
11 37737 market research analysts and marketing special...
12 36574 electronics engineers, except computer
13 34260 operations research analysts
14 34108 electrical engineers
15 33526 physicians and surgeons, all other
16 33387 computer software engineers, applications
17 27536 computer and information systems managers
18 26254 computer occupations, all other*
19 26159 medical scientists, except epidemiologists
20 21994 physical therapists
21 21245 biochemists and biophysicists
22 19370 industrial engineers
23 17426 computer systems analyst
24 17101 statisticians
25 16367 biological scientists, all other
26 16310 marketing managers
27 15970 civil engineers
28 15000 web developers
29 13367 internists, general
... ... ... ...
1555 1 helpers--electricians
1556 1 industrial designers
1557 1 logistics manager
1558 1 electronics engineers, expert computer
1559 1 hardward engineers
1560 1 soil and plant scientist
1561 1 mechanical engineers, r&d (acwia only)
1562 1 timing device assemblers and adjusters
1563 1 telecommunications line installers and repairers
1564 1 biological science teachers
1565 1 dentist
1566 1 computer systems engineers/arquitects
1567 1 secondary school teachers,
1568 1 computer systems anaysts
1569 1 counseling psychologist
1570 1 software engineer, senior
1571 1 health diagnosing & treating practitioners, al...
1572 1 set and exhibit engineers
1573 1 helpers, construction trades, all other
1574 1 datebase administrators
1575 1 commercial and insdistrial designers
1576 1 software developers, appllications
1577 1 computer occupations,all other
1578 1 foreign language and literature teachers, post...
1579 1 dentists
1580 1 railroad conductors and yardmasters
1581 1 computer occupations, all other: information t...
1582 1 computer network architect
1583 1 criminal justice and law enforcement teachers,...
1584 1 medical scientists, except epidemiologist

1585 rows × 3 columns


In [74]:
reducedf.iloc[3]['Count'] #example of accessing a location


Out[74]:
177870

In [75]:
%%timeit
for index, row in reducedf.iterrows():
    names = row['SOC_NAME'].split(",")
    if(names[0].endswith('*')):
        reducedf.set_value([index],['Name1'],(names[0][:-1]))
    if not (names[0].endswith('s')):
        reducedf.set_value([index],['Name1'],(names[0]+'s'))
    else:
        reducedf.set_value([index],['Name1'],names[0])


1 loop, best of 3: 1.52 s per loop

In [76]:
reducedf


Out[76]:
Count SOC_NAME Name1
0 506523 computer systems analysts computer systems analysts
1 414716 software developers, applications software developers
2 398546 computer programmers computer programmers
3 177870 computer occupations, all other computer occupations
4 83856 software developers, systems software software developers
5 66873 management analysts management analysts
6 53540 financial analysts financial analysts
7 52822 accountants and auditors accountants and auditors
8 44236 mechanical engineers mechanical engineers
9 40087 network and computer systems administrators network and computer systems administrators
10 38982 database administrators database administrators
11 37737 market research analysts and marketing special... market research analysts and marketing special...
12 36574 electronics engineers, except computer electronics engineers
13 34260 operations research analysts operations research analysts
14 34108 electrical engineers electrical engineers
15 33526 physicians and surgeons, all other physicians and surgeons
16 33387 computer software engineers, applications computer software engineers
17 27536 computer and information systems managers computer and information systems managers
18 26254 computer occupations, all other* computer occupations
19 26159 medical scientists, except epidemiologists medical scientists
20 21994 physical therapists physical therapists
21 21245 biochemists and biophysicists biochemists and biophysicists
22 19370 industrial engineers industrial engineers
23 17426 computer systems analyst computer systems analysts
24 17101 statisticians statisticians
25 16367 biological scientists, all other biological scientists
26 16310 marketing managers marketing managers
27 15970 civil engineers civil engineers
28 15000 web developers web developers
29 13367 internists, general internists
... ... ... ...
1555 1 helpers--electricians helpers--electricians
1556 1 industrial designers industrial designers
1557 1 logistics manager logistics managers
1558 1 electronics engineers, expert computer electronics engineers
1559 1 hardward engineers hardward engineers
1560 1 soil and plant scientist soil and plant scientists
1561 1 mechanical engineers, r&d (acwia only) mechanical engineers
1562 1 timing device assemblers and adjusters timing device assemblers and adjusters
1563 1 telecommunications line installers and repairers telecommunications line installers and repairers
1564 1 biological science teachers biological science teachers
1565 1 dentist dentists
1566 1 computer systems engineers/arquitects computer systems engineers/arquitects
1567 1 secondary school teachers, secondary school teachers
1568 1 computer systems anaysts computer systems anaysts
1569 1 counseling psychologist counseling psychologists
1570 1 software engineer, senior software engineers
1571 1 health diagnosing & treating practitioners, al... health diagnosing & treating practitioners
1572 1 set and exhibit engineers set and exhibit engineers
1573 1 helpers, construction trades, all other helpers
1574 1 datebase administrators datebase administrators
1575 1 commercial and insdistrial designers commercial and insdistrial designers
1576 1 software developers, appllications software developers
1577 1 computer occupations,all other computer occupations
1578 1 foreign language and literature teachers, post... foreign language and literature teachers
1579 1 dentists dentists
1580 1 railroad conductors and yardmasters railroad conductors and yardmasters
1581 1 computer occupations, all other: information t... computer occupations
1582 1 computer network architect computer network architects
1583 1 criminal justice and law enforcement teachers,... criminal justice and law enforcement teachers
1584 1 medical scientists, except epidemiologist medical scientists

1585 rows × 3 columns


In [79]:
cleandata1['SOC_NAME'].value_counts().count()


Out[79]:
1585

In [98]:
(cleandata1.loc[(cleandata1['SOC_NAME']=='software developers, appllications')]) #an example of a query


Out[98]:
Unnamed: 0 CASE_STATUS EMPLOYER_NAME SOC_NAME JOB_TITLE FULL_TIME_POSITION PREVAILING_WAGE YEAR WORKSITE lon lat
894328 894329 CERTIFIED meridiansoft, inc. software developers, appllications software developer applications Y 62421.0 2015.0 LEWIS CENTER, OHIO -83.010099 40.198388

This person messed up the SOC_NAME


In [80]:
reducedf['Name1'].value_counts()


Out[80]:
software developers                                    37
secondary school teachers                              14
securities                                             14
sales representatives                                  13
electronics engineers                                  13
computer occupations                                   11
computer systems analysts                              10
computer software engineers                            10
elementary school teachers                             10
mechanical engineers                                    9
special education teachers                              8
electrical engineers                                    7
education administrators                                7
computer programmers                                    7
medical scientists                                      7
middle school teachers                                  7
cooks                                                   6
foreign language and literature teachers                6
civil engineers                                         6
computer hardware engineers                             5
atmospherics                                            5
dentists                                                5
computer network architects                             5
propertys                                               4
computer and information systems managers               4
internists                                              4
criminal justice and law enforcement teachers           4
healthcare practitioners and technical workers          4
electrical and electronics repairers                    4
business operations specialists                         4
                                                       ..
robotics engineers                                      1
detectives and criminal investigators                   1
environmental science teachers                          1
psychiatric aides                                       1
compuer systems analysts                                1
audio-visual collections specialists                    1
first-line supervisors and manager/supervisors - ls     1
jewelers                                                1
marketing specialists                                   1
medical seicentists                                     1
first-line supervisors/managers of production ands      1
landscape architects                                    1
heavy and tractor-trailer truck drivers                 1
singers                                                 1
software developers applications                        1
first-line supervisors of construction trades and s     1
musical instrument repairers and tuners                 1
english language & literature teachers                  1
operation research analysts                             1
respiratory therapy technicians                         1
skincare specialists                                    1
cardiovascular technologists and technicians            1
nuclear engineers                                       1
first-line supervisors/managers of personal servics     1
commodities                                             1
project managers                                        1
healthcare social workers                               1
choreographers                                          1
geneticists                                             1
comuter occupations                                     1
Name: Name1, dtype: int64

In [81]:
reducedf['Name1'].value_counts().count()


Out[81]:
1134

We have now cut the number of names in half from the original number.


In [82]:
reducedf.sort_values(['Name1'])


Out[82]:
Count SOC_NAME Name1
1286 1 13-2011.01 13-2011.01s
1096 2 15-1121 15-1121s
1243 1 15-1132 15-1132s
1272 1 15-1199.01 sw quality assurance engnrs & testers 15-1199.01 sw quality assurance engnrs & testers
1354 1 15-1199.01 15-1199.01s
1393 1 15-1199.08, business intelligence analysts 15-1199.08s
1314 1 17-2051 17-2051s
1552 1 17-2072 17-2072s
1461 1 27-3031 27-3031s
1426 1 29-1064.00-obstetricians and gynecologists 29-1064.00-obstetricians and gynecologists
1319 1 <font><font>carpinteros</font></font> <font><font>carpinteros</font></font>s
1071 2 able seamen able seamens
1026 2 accountant accountants
74 3958 accountants accountants
7 52822 accountants and auditors accountants and auditors
564 26 actors actors
80 3486 actuaries actuaries
1539 1 acupuncturists acupuncturists
882 4 adhesive bonding machine operators and tenders adhesive bonding machine operators and tenders
821 6 administrative law judges, adjudicators, and h... administrative law judges
152 1361 administrative services managers administrative services managers
323 230 adult basic and secondary education and litera... adult basic and secondary education and litera...
395 116 adult basic and secondary education and literacy adult basic and secondary education and literacys
1113 2 adult basic second educ and lit teach and instruc adult basic second educ and lit teach and inst...
485 50 adult literacy, remedial education, and ged te... adult literacys
1322 1 advertising and promortions managers advertising and promortions managers
94 2925 advertising and promotions managers advertising and promotions managers
1126 2 advertising and promotions manager advertising and promotions managers
342 195 advertising sales agents advertising sales agents
351 179 aerospace engineering and operations technicians aerospace engineering and operations technicians
... ... ... ...
1388 1 urologists urologists
1107 2 ushers, lobby attendants, and ticket takers ushers
901 4 validation engineer validation engineers
422 90 validation engineers validation engineers
113 2398 veterinarians veterinarians
599 19 veterinary assistants and laboratory animal ca... veterinary assistants and laboratory animal ca...
596 20 veterinary assistants and laboratory animal veterinary assistants and laboratory animals
426 87 veterinary technologists and technicians veterinary technologists and technicians
1502 1 vice president of investor relations vice president of investor relations
958 3 video game designers video game designers
1016 3 vocational education teachers, middle school vocational education teachers
626 16 vocational education teachers, secondary school vocational education teachers
331 215 vocational education teachers, postsecondary vocational education teachers
492 47 vocational education teachers postsecondary vocational education teachers postsecondarys
639 16 watch repairers watch repairers
878 4 water and wastewater treatment plant and syste... water and wastewater treatment plant and syste...
1008 3 water/wastewater engineers water/wastewater engineers
437 81 web administrators web administrators
1052 2 web developer web developers
28 15000 web developers web developers
1043 2 weighers, measurers, checkers, and samplers, weighers
638 16 welders, cutters, solderers, and brazers welders
1204 1 welding, soldering, and brazing machine setters, weldings
1237 1 welding, soldering, and brazing machine setter... weldings
205 746 wholesale and retail buyers, except farm products wholesale and retail buyers
1259 1 wind energy engineers wind energy engineers
1296 1 wind turbine service technicians wind turbine service technicians
708 10 woodworkers, all other woodworkers
181 981 writers and authors writers and authors
237 557 zoologists and wildlife biologists zoologists and wildlife biologists

1585 rows × 3 columns


In [83]:
reducedf['Name2'] = ""

In [84]:
%%timeit
regex = re.compile('[^a-z\s]')

for index, row in reducedf.iterrows():
    reducedf.set_value([index],['Name2'],(regex.sub('', row['Name1'])))


1 loop, best of 3: 1.52 s per loop

In [85]:
reducedf.sort_values(['Name1'])


Out[85]:
Count SOC_NAME Name1 Name2
1286 1 13-2011.01 13-2011.01s s
1096 2 15-1121 15-1121s s
1243 1 15-1132 15-1132s s
1272 1 15-1199.01 sw quality assurance engnrs & testers 15-1199.01 sw quality assurance engnrs & testers sw quality assurance engnrs testers
1354 1 15-1199.01 15-1199.01s s
1393 1 15-1199.08, business intelligence analysts 15-1199.08s s
1314 1 17-2051 17-2051s s
1552 1 17-2072 17-2072s s
1461 1 27-3031 27-3031s s
1426 1 29-1064.00-obstetricians and gynecologists 29-1064.00-obstetricians and gynecologists obstetricians and gynecologists
1319 1 <font><font>carpinteros</font></font> <font><font>carpinteros</font></font>s fontfontcarpinterosfontfonts
1071 2 able seamen able seamens able seamens
1026 2 accountant accountants accountants
74 3958 accountants accountants accountants
7 52822 accountants and auditors accountants and auditors accountants and auditors
564 26 actors actors actors
80 3486 actuaries actuaries actuaries
1539 1 acupuncturists acupuncturists acupuncturists
882 4 adhesive bonding machine operators and tenders adhesive bonding machine operators and tenders adhesive bonding machine operators and tenders
821 6 administrative law judges, adjudicators, and h... administrative law judges administrative law judges
152 1361 administrative services managers administrative services managers administrative services managers
323 230 adult basic and secondary education and litera... adult basic and secondary education and litera... adult basic and secondary education and litera...
395 116 adult basic and secondary education and literacy adult basic and secondary education and literacys adult basic and secondary education and literacys
1113 2 adult basic second educ and lit teach and instruc adult basic second educ and lit teach and inst... adult basic second educ and lit teach and inst...
485 50 adult literacy, remedial education, and ged te... adult literacys adult literacys
1322 1 advertising and promortions managers advertising and promortions managers advertising and promortions managers
94 2925 advertising and promotions managers advertising and promotions managers advertising and promotions managers
1126 2 advertising and promotions manager advertising and promotions managers advertising and promotions managers
342 195 advertising sales agents advertising sales agents advertising sales agents
351 179 aerospace engineering and operations technicians aerospace engineering and operations technicians aerospace engineering and operations technicians
... ... ... ... ...
1388 1 urologists urologists urologists
1107 2 ushers, lobby attendants, and ticket takers ushers ushers
901 4 validation engineer validation engineers validation engineers
422 90 validation engineers validation engineers validation engineers
113 2398 veterinarians veterinarians veterinarians
599 19 veterinary assistants and laboratory animal ca... veterinary assistants and laboratory animal ca... veterinary assistants and laboratory animal ca...
596 20 veterinary assistants and laboratory animal veterinary assistants and laboratory animals veterinary assistants and laboratory animals
426 87 veterinary technologists and technicians veterinary technologists and technicians veterinary technologists and technicians
1502 1 vice president of investor relations vice president of investor relations vice president of investor relations
958 3 video game designers video game designers video game designers
1016 3 vocational education teachers, middle school vocational education teachers vocational education teachers
626 16 vocational education teachers, secondary school vocational education teachers vocational education teachers
331 215 vocational education teachers, postsecondary vocational education teachers vocational education teachers
492 47 vocational education teachers postsecondary vocational education teachers postsecondarys vocational education teachers postsecondarys
639 16 watch repairers watch repairers watch repairers
878 4 water and wastewater treatment plant and syste... water and wastewater treatment plant and syste... water and wastewater treatment plant and syste...
1008 3 water/wastewater engineers water/wastewater engineers waterwastewater engineers
437 81 web administrators web administrators web administrators
1052 2 web developer web developers web developers
28 15000 web developers web developers web developers
1043 2 weighers, measurers, checkers, and samplers, weighers weighers
638 16 welders, cutters, solderers, and brazers welders welders
1204 1 welding, soldering, and brazing machine setters, weldings weldings
1237 1 welding, soldering, and brazing machine setter... weldings weldings
205 746 wholesale and retail buyers, except farm products wholesale and retail buyers wholesale and retail buyers
1259 1 wind energy engineers wind energy engineers wind energy engineers
1296 1 wind turbine service technicians wind turbine service technicians wind turbine service technicians
708 10 woodworkers, all other woodworkers woodworkers
181 981 writers and authors writers and authors writers and authors
237 557 zoologists and wildlife biologists zoologists and wildlife biologists zoologists and wildlife biologists

1585 rows × 4 columns


In [86]:
reducedf['Name2'].value_counts().count()


Out[86]:
1118

Striping charactors did not help much.

At this point investigated a spellchecker in python. Was not able to get something to work within enviroment.

I am now going to consider removing entries that are unique, with a count of one.


In [87]:
dfName2Check = pd.DataFrame({'Name2': reducedf['Name2'].value_counts().index, 'Count':reducedf['Name2'].value_counts().values})

In [88]:
dfName2Check


Out[88]:
Count Name2
0 37 software developers
1 14 securities
2 14 secondary school teachers
3 13 sales representatives
4 13 electronics engineers
5 11 computer occupations
6 10 computer software engineers
7 10 computer systems analysts
8 10 elementary school teachers
9 9 mechanical engineers
10 8 s
11 8 special education teachers
12 7 electrical engineers
13 7 middle school teachers
14 7 computer programmers
15 7 medical scientists
16 7 education administrators
17 6 cooks
18 6 foreign language and literature teachers
19 6 civil engineers
20 5 atmospherics
21 5 dentists
22 5 computer network architects
23 5 computer hardware engineers
24 4 business operations specialists
25 4 healthcare practitioners and technical workers
26 4 physicians and surgeons
27 4 computer and information systems managers
28 4 compensations
29 4 biological scientists
... ... ...
1088 1 morticians
1089 1 physical therapist aides
1090 1 electrical enginers
1091 1 detectives and criminal investigators
1092 1 environmental science teachers
1093 1 psychiatric aides
1094 1 marketing specialists
1095 1 medical seicentists
1096 1 landscape architects
1097 1 singers
1098 1 biochemist biophysicists
1099 1 robotics engineers
1100 1 musical instrument repairers and tuners
1101 1 firstline supervisorsmanagers of helpers
1102 1 operation research analysts
1103 1 respiratory therapy technicians
1104 1 skincare specialists
1105 1 cardiovascular technologists and technicians
1106 1 copmuter programmers
1107 1 firstline supervisors of personal service workers
1108 1 nuclear engineers
1109 1 clinicals
1110 1 commodities
1111 1 project managers
1112 1 healthcare social workers
1113 1 urologists
1114 1 orthodontists
1115 1 stonemasons
1116 1 tile and marble setters
1117 1 comuter occupations

1118 rows × 2 columns

Theres a problem, I am not getting a real representation of the occurence of names in the data. I now need to do something where I can get the actual number of occurences.


In [89]:
cleandata1


Out[89]:
Unnamed: 0 CASE_STATUS EMPLOYER_NAME SOC_NAME JOB_TITLE FULL_TIME_POSITION PREVAILING_WAGE YEAR WORKSITE lon lat
0 1 CERTIFIED-WITHDRAWN university of michigan biochemists and biophysicists postdoctoral research fellow N 36067.00 2016.0 ANN ARBOR, MICHIGAN -83.743038 42.280826
1 2 CERTIFIED-WITHDRAWN goodman networks, inc. chief executives chief operating officer Y 242674.00 2016.0 PLANO, TEXAS -96.698886 33.019843
2 3 CERTIFIED-WITHDRAWN ports america group, inc. chief executives chief process officer Y 193066.00 2016.0 JERSEY CITY, NEW JERSEY -74.077642 40.728158
3 4 CERTIFIED-WITHDRAWN gates corporation, a wholly-owned subsidiary o... chief executives regional presiden, americas Y 220314.00 2016.0 DENVER, COLORADO -104.990251 39.739236
4 5 WITHDRAWN peabody investments corp. chief executives president mongolia and india Y 157518.40 2016.0 ST. LOUIS, MISSOURI -90.199404 38.627003
5 6 CERTIFIED-WITHDRAWN burger king corporation chief executives executive v p, global development and presiden... Y 225000.00 2016.0 MIAMI, FLORIDA -80.191790 25.761680
6 7 CERTIFIED-WITHDRAWN bt and mk energy and commodities chief executives chief operating officer Y 91021.00 2016.0 HOUSTON, TEXAS -95.369803 29.760427
7 8 CERTIFIED-WITHDRAWN globo mobile technologies, inc. chief executives chief operations officer Y 150000.00 2016.0 SAN JOSE, CALIFORNIA -121.886329 37.338208
8 9 CERTIFIED-WITHDRAWN esi companies inc. chief executives president Y 127546.00 2016.0 MEMPHIS, TEXAS NaN NaN
9 10 WITHDRAWN lessard international llc chief executives president Y 154648.00 2016.0 VIENNA, VIRGINIA -77.265260 38.901222
10 11 CERTIFIED-WITHDRAWN h.j. heinz company chief executives chief information officer, heinz north america Y 182978.00 2016.0 PITTSBURGH, PENNSYLVANIA -79.995886 40.440625
11 12 CERTIFIED-WITHDRAWN dow corning corporation chief executives vice president and chief human resources officer Y 163717.00 2016.0 MIDLAND, MICHIGAN -84.247212 43.615583
12 13 CERTIFIED-WITHDRAWN acushnet company chief executives treasurer and coo Y 203860.80 2016.0 FAIRHAVEN, MASSACHUSETTS NaN NaN
13 14 CERTIFIED-WITHDRAWN biocair, inc. chief executives chief commercial officer Y 252637.00 2016.0 MIAMI, FLORIDA -80.191790 25.761680
14 15 CERTIFIED-WITHDRAWN newmont mining corporation chief executives board member Y 105914.00 2016.0 GREENWOOD VILLAGE, COLORADO -104.950814 39.617210
15 16 CERTIFIED-WITHDRAWN vricon, inc. chief executives chief financial officer Y 153046.00 2016.0 STERLING, VIRGINIA -77.429130 39.006699
16 17 CERTIFIED-WITHDRAWN cardiac science corporation financial managers vice president of finance Y 90834.00 2016.0 WAUKESHA, WISCONSIN -88.231481 43.011678
17 18 CERTIFIED-WITHDRAWN westfield corporation chief executives general manager, operations Y 164050.00 2016.0 LOS ANGELES, CALIFORNIA -118.243685 34.052234
18 19 CERTIFIED quicklogix llc chief executives ceo Y 187200.00 2016.0 SANTA CLARA, CALIFORNIA -121.955236 37.354108
19 20 CERTIFIED mcchrystal group, llc chief executives president, northeast region Y 241842.00 2016.0 ALEXANDRIA, VIRGINIA -77.046921 38.804835
20 21 CERTIFIED-WITHDRAWN cuddle barn, inc. chief executives chief operating officer (coo) Y 117998.00 2016.0 COMMERCE, CALIFORNIA -118.159793 34.000569
21 22 CERTIFIED-WITHDRAWN westfield corporation chief executives general manager, operations Y 164050.00 2016.0 LOS ANGELES, CALIFORNIA -118.243685 34.052234
22 23 CERTIFIED lomics, llc chief executives ceo Y 99986.00 2016.0 SAN DIEGO, CALIFORNIA -117.161084 32.715738
23 24 CERTIFIED uc university high school education inc. chief executives chief financial officer Y 99986.00 2016.0 CHULA VISTA, CALIFORNIA -117.084196 32.640054
24 25 CERTIFIED-WITHDRAWN vms communications llc chief executives chief operating officer Y 159370.00 2016.0 MIAMI, FLORIDA -80.191790 25.761680
25 26 CERTIFIED quicklogix, inc. chief executives ceo Y 187200.00 2016.0 SANTA CLARA, CALIFORNIA -121.955236 37.354108
26 27 CERTIFIED-WITHDRAWN foodessentials corporation chief executives chief executive officer Y 130853.00 2016.0 CHICAGO, ILLINOIS -87.629798 41.878114
27 28 CERTIFIED hello inc. chief executives chief business officer Y 215862.00 2016.0 SAN FRANCISCO, CALIFORNIA -122.419415 37.774929
28 29 CERTIFIED umbel corp chief executives vice president of engineering Y 192088.00 2016.0 AUSTIN, TEXAS -97.743061 30.267153
29 30 CERTIFIED perspectives of freedom foundation, inc chief executives executive director Y 95295.98 2016.0 WESTON, FLORIDA -80.399775 26.100365
... ... ... ... ... ... ... ... ... ... ... ...
3002428 3002429 WITHDRAWN avant healthcare professionals physical therapists physical therapist Y 53601.60 2011.0 LEBANON, PENNSYLVANIA NaN NaN
3002429 3002430 WITHDRAWN avant healthcare professionals physical therapists physical therapist Y 53601.60 2011.0 LEBANON, PENNSYLVANIA NaN NaN
3002430 3002431 WITHDRAWN trisync technologies, inc. computer systems analysts computer system analyst Y 55245.00 2011.0 EDISON, NEW JERSEY -74.412095 40.518715
3002431 3002432 WITHDRAWN at last sportswear inc. computer support specialists computer support specialist Y 36837.00 2011.0 SECAUCUS, NEW JERSEY -74.056530 40.789545
3002432 3002433 WITHDRAWN trisync technologies, inc. computer systems analysts computer system analyst Y 55245.00 2011.0 EDISON, NEW JERSEY -74.412095 40.518715
3002433 3002434 WITHDRAWN the university of texas southwestern medical c... biochemists and biophysicists instructor Y 36795.00 2011.0 DALLAS, TEXAS -96.796988 32.776664
3002434 3002435 WITHDRAWN trisync technologies, inc. computer systems analysts computer system analyst Y 55245.00 2011.0 EDISON, NEW JERSEY -74.412095 40.518715
3002435 3002436 WITHDRAWN marlabs, inc computer programmers programmer/analyst Y 77730.00 2011.0 DURHAM, NORTH CAROLINA -78.898619 35.994033
3002436 3002437 WITHDRAWN xtron software services, inc. computer programmers computer programmers Y 89232.00 2011.0 SANTA CLARA, CALIFORNIA -121.955236 37.354108
3002437 3002438 WITHDRAWN university of mississippi medical center health specialties teachers, postsecondary assistant professor of anesthesiology Y 34510.00 2011.0 JACKSON, MISSISSIPPI -90.184810 32.298757
3002438 3002439 WITHDRAWN canvas infotech, inc. database administrators data analyst Y 53082.00 2011.0 PLEASANTON, CALIFORNIA -121.874679 37.662431
3002439 3002440 WITHDRAWN new york university biological scientists, all other adjunct associate professor Y 37336.00 2011.0 NEW YORK, NEW YORK -74.005941 40.712784
3002440 3002441 WITHDRAWN oracle america, inc. software developers, applications software engineer (software developer 2) Y 64800.00 2011.0 REDWOOD SHORES, CALIFORNIA -122.245536 37.536413
3002441 3002442 WITHDRAWN b & d dental corp. materials scientists materials scientist N 70553.60 2011.0 WEST VALLEY, UTAH -112.001050 40.691613
3002442 3002443 WITHDRAWN medtech staffing & solutions, inc market research analysts and marketing special... market research analyst Y 33800.00 2011.0 AKRON, OHIO -81.519005 41.081445
3002443 3002444 WITHDRAWN larsen & toubro limited commercial and industrial designers design engineer Y 59800.00 2011.0 CHELMSFORD, MASSACHUSETTS -71.367284 42.599814
3002444 3002445 WITHDRAWN larsen & toubro limited commercial and industrial designers design engineer Y 59800.00 2011.0 CHELMSFORD, MASSACHUSETTS -71.367284 42.599814
3002445 3002446 NaN NaN NaN NaN NaN NaN NaN BERKLEY HEIGHTS, NEW JERSEY -74.431052 40.680873
3002446 3002447 NaN NaN NaN NaN NaN NaN NaN SCHENECTADY , NEW YORK -73.939569 42.814243
3002447 3002448 NaN NaN NaN NaN NaN NaN NaN MOUTAIN VIEW, CALIFORNIA -122.083851 37.386052
3002448 3002449 NaN NaN NaN NaN NaN NaN NaN ST.PAUL, MINNESOTA -93.089958 44.953703
3002449 3002450 NaN NaN NaN NaN NaN NaN NaN NEW TOWN, PENNSYLVANIA -74.932260 40.228337
3002450 3002451 NaN NaN NaN NaN NaN NaN NaN WESTMINISTER, COLORADO -105.037205 39.836653
3002451 3002452 NaN NaN NaN NaN NaN NaN NaN FREEMONT, CALIFORNIA -121.988572 37.548270
3002452 3002453 NaN NaN NaN NaN NaN NaN NaN LAVERGNE, TENNESSEE -86.581939 36.015618
3002453 3002454 NaN NaN NaN NaN NaN NaN NaN NYC, NEW YORK -74.005941 40.712784
3002454 3002455 NaN NaN NaN NaN NaN NaN NaN SOUTH LAKE, TEXAS -97.134178 32.941236
3002455 3002456 NaN NaN NaN NaN NaN NaN NaN CLINTON, NEW JERSEY -74.909890 40.636768
3002456 3002457 NaN NaN NaN NaN NaN NaN NaN OWINGS MILL, MARYLAND -76.780253 39.419550
3002457 3002458 NaN NaN NaN NaN NaN NaN NaN ALTANTA, GEORGIA -84.387982 33.748995

3002458 rows × 11 columns

Going to go back to this df and create a copy. Will then over write the Soc_Name with the reduced name I have.


In [111]:
cleandata2 = cleandata1.copy()

In [130]:
name = cleandata2.iloc[3002440]['SOC_NAME'] #example of accessing a location
print(name)
newname = reducedf.loc[(reducedf['SOC_NAME']==name)]
newname1 = newname.iloc[0]['Name2']
print(newname1)


software developers, applications
Out[130]:
'software developers'

Test code to test algorithm


In [ ]:
for index, row in cleandata2.iterrows():
    name = row['SOC_NAME']
    newname = reducedf.loc[(reducedf['SOC_NAME']==name)]
    newname1 = newname.iloc[0]['Name2']
    cleandata2.set_value([index],['SOC_NAME'],newname1)

Here, we try to apply what we have reduced to the dataframe. We will see if we have made any progress, and what we can do next.