In [94]:
import pandas as pd
import numpy as np
import itertools

In [95]:
# Load the tab-separated records file into a DataFrame.
# index_col=False stops pandas from promoting the first column to the row
# index, so rows keep the default integer index.
data = pd.read_csv(
    'savedrecs.txt',
    sep='\t',
    engine='python',
    index_col=False,
)

In [96]:
def getUniqueWords(allWords):
    """Return the items of ``allWords`` without duplicates, in first-seen order.

    Parameters
    ----------
    allWords : iterable
        Items to de-duplicate. Items are compared with ``==`` (list
        membership), so they do not need to be hashable.

    Returns
    -------
    list
        A new list holding each distinct item once, ordered by first occurrence.
    """
    distinct = []
    for word in allWords:
        if word not in distinct:
            distinct.append(word)
    return distinct

In [102]:
# Read the reference list of country names, one name per line, with
# surrounding whitespace (including the trailing newline) stripped.
#
# The encoding is given explicitly: without it open() falls back to the
# platform's locale encoding, which mis-decodes the accented names in this
# UTF-8 file (e.g. "Côte d’Ivoire" comes out as "CÃ´te dâ€™Ivoire").
with open("country_list.txt", encoding="utf-8") as country_file:
    lines = [raw_line.strip() for raw_line in country_file]
lines


Out[102]:
['Afghanistan',
 'Albania',
 'Algeria',
 'American Samoa',
 'Andorra',
 'Angola',
 'Anguilla',
 'Antarctica',
 'Antigua and Barbuda',
 'Argentina',
 'Armenia',
 'Aruba',
 'Australia',
 'Austria',
 'Azerbaijan',
 'Bahamas',
 'Bahrain',
 'Bangladesh',
 'Barbados',
 'Belarus',
 'Belgium',
 'Belize',
 'Benin',
 'Bermuda',
 'Bhutan',
 'Bolivia',
 'Bosnia and Herzegovina',
 'Botswana',
 'Bouvet Island',
 'Brazil',
 'British Antarctic Territory',
 'British Indian Ocean Territory',
 'British Virgin Islands',
 'Brunei',
 'Bulgaria',
 'Burkina Faso',
 'Burundi',
 'Cambodia',
 'Cameroon',
 'Canada',
 'Canton and Enderbury Islands',
 'Cape Verde',
 'Cayman Islands',
 'Central African Republic',
 'Chad',
 'Chile',
 'China',
 'Christmas Island',
 'Cocos [Keeling] Islands',
 'Colombia',
 'Comoros',
 'Congo - Brazzaville',
 'Congo - Kinshasa',
 'Cook Islands',
 'Costa Rica',
 'Croatia',
 'Cuba',
 'Cyprus',
 'Czech Republic',
 'Côte d’Ivoire',
 'Denmark',
 'Djibouti',
 'Dominica',
 'Dominican Republic',
 'Dronning Maud Land',
 'East Germany',
 'Ecuador',
 'Egypt',
 'El Salvador',
 'Equatorial Guinea',
 'Eritrea',
 'Estonia',
 'Ethiopia',
 'Falkland Islands',
 'Faroe Islands',
 'Fiji',
 'Finland',
 'France',
 'French Guiana',
 'French Polynesia',
 'French Southern Territories',
 'French Southern and Antarctic Territories',
 'Gabon',
 'Gambia',
 'Georgia',
 'Germany',
 'Ghana',
 'Gibraltar',
 'Greece',
 'Greenland',
 'Grenada',
 'Guadeloupe',
 'Guam',
 'Guatemala',
 'Guernsey',
 'Guinea',
 'Guinea-Bissau',
 'Guyana',
 'Haiti',
 'Heard Island and McDonald Islands',
 'Honduras',
 'Hong Kong SAR China',
 'Hungary',
 'Iceland',
 'India',
 'Indonesia',
 'Iran',
 'Iraq',
 'Ireland',
 'Isle of Man',
 'Israel',
 'Italy',
 'Jamaica',
 'Japan',
 'Jersey',
 'Johnston Island',
 'Jordan',
 'Kazakhstan',
 'Kenya',
 'Kiribati',
 'Kuwait',
 'Kyrgyzstan',
 'Laos',
 'Latvia',
 'Lebanon',
 'Lesotho',
 'Liberia',
 'Libya',
 'Liechtenstein',
 'Lithuania',
 'Luxembourg',
 'Macau SAR China',
 'Macedonia',
 'Madagascar',
 'Malawi',
 'Malaysia',
 'Maldives',
 'Mali',
 'Malta',
 'Marshall Islands',
 'Martinique',
 'Mauritania',
 'Mauritius',
 'Mayotte',
 'Metropolitan France',
 'Mexico',
 'Micronesia',
 'Midway Islands',
 'Moldova',
 'Monaco',
 'Mongolia',
 'Montenegro',
 'Montserrat',
 'Morocco',
 'Mozambique',
 'Myanmar [Burma]',
 'Namibia',
 'Nauru',
 'Nepal',
 'Netherlands',
 'Netherlands Antilles',
 'Neutral Zone',
 'New Caledonia',
 'New Zealand',
 'Nicaragua',
 'Niger',
 'Nigeria',
 'Niue',
 'Norfolk Island',
 'North Korea',
 'North Vietnam',
 'Northern Mariana Islands',
 'Norway',
 'Oman',
 'Pacific Islands Trust Territory',
 'Pakistan',
 'Palau',
 'Palestinian Territories',
 'Panama',
 'Panama Canal Zone',
 'Papua New Guinea',
 'Paraguay',
 "People's Democratic Republic of Yemen",
 'Peru',
 'Philippines',
 'Pitcairn Islands',
 'Poland',
 'Portugal',
 'Puerto Rico',
 'Qatar',
 'Romania',
 'Russia',
 'Rwanda',
 'Réunion',
 'Saint Barthélemy',
 'Saint Helena',
 'Saint Kitts and Nevis',
 'Saint Lucia',
 'Saint Martin',
 'Saint Pierre and Miquelon',
 'Saint Vincent and the Grenadines',
 'Samoa',
 'San Marino',
 'Saudi Arabia',
 'Senegal',
 'Serbia',
 'Serbia and Montenegro',
 'Seychelles',
 'Sierra Leone',
 'Singapore',
 'Slovakia',
 'Slovenia',
 'Solomon Islands',
 'Somalia',
 'South Africa',
 'South Georgia and the South Sandwich Islands',
 'South Korea',
 'Spain',
 'Sri Lanka',
 'Sudan',
 'Suriname',
 'Svalbard and Jan Mayen',
 'Swaziland',
 'Sweden',
 'Switzerland',
 'Syria',
 'São Tomé and PrÃ\xadncipe',
 'Taiwan',
 'Tajikistan',
 'Tanzania',
 'Thailand',
 'Timor-Leste',
 'Togo',
 'Tokelau',
 'Tonga',
 'Trinidad and Tobago',
 'Tunisia',
 'Turkey',
 'Turkmenistan',
 'Turks and Caicos Islands',
 'Tuvalu',
 'U.S. Minor Outlying Islands',
 'U.S. Miscellaneous Pacific Islands',
 'U.S. Virgin Islands',
 'Uganda',
 'Ukraine',
 'Union of Soviet Socialist Republics',
 'United Arab Emirates',
 'United Kingdom',
 'United States',
 'Unknown or Invalid Region',
 'Uruguay',
 'Uzbekistan',
 'Vanuatu',
 'Vatican City',
 'Venezuela',
 'Vietnam',
 'Wake Island',
 'Wallis and Futuna',
 'Western Sahara',
 'Yemen',
 'Zambia',
 'Zimbabwe',
 'Ã…land Islands']

In [108]:
# Keep only the records that have an address field (column 'C1').
df = data[data['C1'].notnull()]
df_new = df['C1']
print(df_new[2])

# Country names to look for in each address string.
# Defined once, outside the loop, because it never changes between records;
# the duplicated 'Brazil' entry has been removed.
countries = ['USA', 'Germany', 'France', 'China', 'Japan', 'Australia', 'Canada', 'Brazil', 'Mexico', 'South Africa',
             'India', 'Korea', 'Israel', 'Turkey', 'Saudi Arabia', 'Iran', 'Spain', 'Netherlands', 'Sweden', 'Norway',
             'Poland', 'Indonesia', 'Switzerland', 'Denmark', 'Singapore', 'Iceland', 'Hong Kong', 'New Zealand', 'Belgium',
             'Austria', 'Italy', 'Czech', 'Greece', 'Qatar', 'Portugal', 'Hungary', 'Argentina', 'Romania', 'England',
             'Taiwan', 'Lithuania', 'Finland', 'Russia', 'Kazakhstan', 'Afghanistan', 'Albania']


def match_countries(tokens, names):
    """Return the names from ``names`` that occur in ``tokens``.

    A name matches when its words equal a run of consecutive tokens, where
    the last token of the run may carry one trailing ';'. For one-word names
    this is the same rule as comparing a token against ``name`` or
    ``name + ';'``; multi-word names such as 'South Africa' are matched as
    well, which a token-by-token comparison can never do.

    Parameters
    ----------
    tokens : sequence of str
        Whitespace-separated pieces of one address string.
    names : iterable of str
        Country names to look for.

    Returns
    -------
    list of str
        Each matching name once, in the order it appears in ``names``
        (deterministic, unlike the order of a ``set``).
    """
    found = []
    for name in names:
        words = name.split()
        if not words or name in found:
            continue
        width = len(words)
        for start in range(len(tokens) - width + 1):
            window = list(tokens[start:start + width])
            # Allow exactly one trailing ';' on the last token of the run.
            if window[-1].endswith(';'):
                window[-1] = window[-1][:-1]
            if window == words:
                found.append(name)
                break
    return found


for address in df_new:
    # `p` is kept as a module-level name on purpose: a later cell reads the
    # tokens of the last processed record from it.
    p = address.split()
    print(match_countries(p, countries))


ETH Zurich, Inst Teilchenphys, CH-8093 Zurich, Switzerland; Free Univ Amsterdam, Fac Sci, Div Phys & Astron, NL-1081 HV Amsterdam, Netherlands; Natl Inst Nucl Phys & High Energy Phys, NL-1009 DB Amsterdam, Netherlands
['USA', 'France']
['Russia', 'Poland', 'USA']
['Switzerland', 'Netherlands']
['India']
['Russia']
['China']
['India']
['Russia']
['Russia']
['Germany']
['Lithuania', 'USA']
['Norway']
['Finland']
['USA']
['Italy']
['France']
['USA']
['Germany', 'Japan', 'Kazakhstan', 'USA']
['Japan', 'Netherlands', 'France']
['Japan', 'USA', 'Italy']
['Germany', 'USA']
['Kazakhstan']
['Taiwan']
['Germany']
['Switzerland', 'Russia']
['Japan', 'USA']
['Spain']
['Russia']
['Russia']
['Japan']
['Germany']
['Japan']
['Germany']
['Japan']
['Japan']
['Germany', 'Italy']
['Japan']
['Japan', 'USA']
['Japan']
['Japan']
['Germany']
['Canada']
['Italy']
['USA']
['Germany', 'Russia']
['Japan']
['Japan']
['Japan']
['Germany', 'Japan', 'Sweden']

In [99]:
# NOTE: `p` is not defined in this cell; it is whatever token list the
# per-record loop in another cell left behind.
countries = ['France', 'USA', 'Japan', 'Sweden', 'Germany']
#print(countries)

# First pass: collect each distinct token that equals a country name,
# optionally followed by a single ';'.
count = []
for token in p:
    if token in count:
        continue
    for name in countries:
        if token in (name, name + ';'):
            count.append(token)

# Second pass: map the collected tokens back to the bare country names.
country_list = [
    name
    for token in count
    for name in countries
    if token in (name, name + ';')
]
print(list(set(country_list)))


['Germany', 'Japan', 'Sweden']

In [100]:
# Map every token in `count` back to its bare country name; a token matches a
# name when it equals the name or the name followed by ';'.
# Both `count` and `countries` come from a previously executed cell.
country_list = [
    name
    for token in count
    for name in countries
    if token in (name, name + ';')
]
print(list(set(country_list)))


['Germany', 'Japan', 'Sweden']

In [ ]:


In [ ]: