Ch14 Figure4


In [1]:
# The data science team used this data to create a word cloud of all the organization’s job injuries, and then the team presented a simple visualization of the cloud at their storytelling session
#
# This cell builds the synthetic claims table used for the word cloud: 500 rows,
# each holding a random handful of words drawn from a vocabulary of industry
# names and injury terms. The table is cached as a CSV so later runs (and the
# plotting cell) always see the same data.
import os

CSV_PATH = 'csv_output/ch14_fig6.csv'
SEED = 14  # arbitrary; fixed so that a regenerated table is reproducible

string = 'Rope, cordage, twine, tire cord, and tire fabric mills Nursing and residential care facilities Fire protection Rendering and meat byproduct processing Skiing facilities Police protection Interurban and rural bus transportation Veterinary services Travel trailer and camper manufacturing Manufactured home (mobile home) manufacturing Truss manufacturing Hog and pig farming Steel foundries (except investment) Hospitals Heavy and civil engineering construction Prefabricated wood building manufacturing Truck trailer manufacturing Iron foundries Materials recovery facilities Other nonferrous metal foundries (except die-casting) Aluminum foundries (except die-casting) Luggage and leather goods stores Scheduled passenger air transportation Correctional institutions Ambulance services Abrasion Brain Injuries Bruising Burns Cluster Headaches Concussions Congestive Heart Failure Construction Injuries Coronary Artery Disease Defective Products Dislocation Flail Chest Fracture Hemothorax Herniated Disc Hip Pointer Hypothermia Lacerations Pinched Nerve Pneumothorax Prescription Medications Quadriplegia Definition Rib Fracture Sciatica Spinal Cord Injury Temporalmandibular Joint Tendons Ligaments Fascia Injury Traumatic Brain Injury Whiplash'

# Strip punctuation, split on any whitespace, and drop the stand-alone word
# "and". Filtering whole words (instead of replacing the substring 'and')
# keeps words such as "Temporalmandibular" intact and avoids the empty-string
# tokens that the leftover double spaces used to produce.
tokens = [
    word
    for word in string.replace(',', '').replace('-', '').replace('(', '').replace(')', '').split()
    if word != 'and'
]

if os.path.exists(CSV_PATH):
    # Reuse the cached table so the figures stay stable between runs.
    df = pd.read_csv(CSV_PATH)
else:
    # NOTE(review): `rd` is defined outside this cell; it is assumed to be a
    # `random`-style module exposing seed() and randint() -- confirm.
    np.random.seed(SEED)
    rd.seed(SEED)

    data = []
    for i in range(500):
        # Shuffle indices over the WHOLE vocabulary (the previous
        # `len(tokens)-1` bound silently excluded the last token).
        claim_idx = np.arange(len(tokens))
        np.random.shuffle(claim_idx)
        # A claim is the first k shuffled words, with k drawn by rd.randint(1, 30).
        claim = ' '.join(tokens[x] for x in claim_idx[:rd.randint(1, 30)])
        data.append([i, claim])

    df = pd.DataFrame(data, columns=['id', 'claim'])

    # Persist the freshly generated table so subsequent cells can reload it.
    out_dir = os.path.dirname(CSV_PATH)
    if out_dir and not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    df.to_csv(CSV_PATH, index=False)

df.head()


Out[1]:
id claim
0 0 civil building
1 1 Artery recovery Rendering Fracture Hospitals H...
2 2 Brain except Scheduled services Fracture Steel...
3 3 Materials Injury Quadriplegia manufacturing Hi...
4 4 foundries Correctional farming except Joint A...

In [2]:
df = pd.read_csv('csv_output/ch14_fig6.csv')

%matplotlib inline
sns.set_style("white")

all_claims = ''
for x in df.iterrows():
    all_claims += x[1][1] + ''

wgb = pd.DataFrame(all_claims.split(' ')).groupby(0)[0].count()
rank = wgb.sort_values(ascending=False)[1:11]

f, ax = plt.subplots(1, figsize=(8,6))

ax.barh(bottom = np.arange(10)[::-1], width=rank);
ax.set_yticks(np.arange(0,10)+.5);
ax.set_yticklabels(rank.index[::-1]);
ax.set_title('word count')


f.savefig('svg_output/ch14_fig6.svg', format='svg')



In [90]:
# Paste the combined claims text (`all_claims`) into https://www.jasondavies.com/wordcloud/ to generate the word cloud