In [94]:
import pandas as pd
import numpy as np
import itertools

In [95]:
data=pd.read_csv('savedrecs.txt',sep='\t',engine='python',index_col=False)

In [96]:
def getUniqueWords(allWords) :
    uniqueWords = [] 
    for j in allWords:
        if j in uniqueWords:
            pass
        else:
            uniqueWords.append(j)
    return uniqueWords

In [97]:
text_file = open("country_list.txt")
lines = text_file.read().split(',')
lines


Out[97]:
["Afghanistan\nAlbania\nAlgeria\nAmerican Samoa\nAndorra\nAngola\nAnguilla\nAntarctica\nAntigua and Barbuda\nArgentina\nArmenia\nAruba\nAustralia\nAustria\nAzerbaijan\nBahamas\nBahrain\nBangladesh\nBarbados\nBelarus\nBelgium\nBelize\nBenin\nBermuda\nBhutan\nBolivia\nBosnia and Herzegovina\nBotswana\nBouvet Island\nBrazil\nBritish Antarctic Territory\nBritish Indian Ocean Territory\nBritish Virgin Islands\nBrunei\nBulgaria\nBurkina Faso\nBurundi\nCambodia\nCameroon\nCanada\nCanton and Enderbury Islands\nCape Verde\nCayman Islands\nCentral African Republic\nChad\nChile\nChina\nChristmas Island\nCocos [Keeling] Islands\nColombia\nComoros\nCongo - Brazzaville\nCongo - Kinshasa\nCook Islands\nCosta Rica\nCroatia\nCuba\nCyprus\nCzech Republic\nCôte d’Ivoire\nDenmark\nDjibouti\nDominica\nDominican Republic\nDronning Maud Land\nEast Germany\nEcuador\nEgypt\nEl Salvador\nEquatorial Guinea\nEritrea\nEstonia\nEthiopia\nFalkland Islands\nFaroe Islands\nFiji\nFinland\nFrance\nFrench Guiana\nFrench Polynesia\nFrench Southern Territories\nFrench Southern and Antarctic Territories\nGabon\nGambia\nGeorgia\nGermany\nGhana\nGibraltar\nGreece\nGreenland\nGrenada\nGuadeloupe\nGuam\nGuatemala\nGuernsey\nGuinea\nGuinea-Bissau\nGuyana\nHaiti\nHeard Island and McDonald Islands\nHonduras\nHong Kong SAR China\nHungary\nIceland\nIndia\nIndonesia\nIran\nIraq\nIreland\nIsle of Man\nIsrael\nItaly\nJamaica\nJapan\nJersey\nJohnston Island\nJordan\nKazakhstan\nKenya\nKiribati\nKuwait\nKyrgyzstan\nLaos\nLatvia\nLebanon\nLesotho\nLiberia\nLibya\nLiechtenstein\nLithuania\nLuxembourg\nMacau SAR China\nMacedonia\nMadagascar\nMalawi\nMalaysia\nMaldives\nMali\nMalta\nMarshall Islands\nMartinique\nMauritania\nMauritius\nMayotte\nMetropolitan France\nMexico\nMicronesia\nMidway Islands\nMoldova\nMonaco\nMongolia\nMontenegro\nMontserrat\nMorocco\nMozambique\nMyanmar [Burma]\nNamibia\nNauru\nNepal\nNetherlands\nNetherlands Antilles\nNeutral Zone\nNew Caledonia\nNew Zealand\nNicaragua\nNiger\nNigeria\nNiue\nNorfolk Island\nNorth Korea\nNorth Vietnam\nNorthern Mariana Islands\nNorway\nOman\nPacific Islands Trust Territory\nPakistan\nPalau\nPalestinian Territories\nPanama\nPanama Canal Zone\nPapua New Guinea\nParaguay\nPeople's Democratic Republic of Yemen\nPeru\nPhilippines\nPitcairn Islands\nPoland\nPortugal\nPuerto Rico\nQatar\nRomania\nRussia\nRwanda\nRéunion\nSaint Barthélemy\nSaint Helena\nSaint Kitts and Nevis\nSaint Lucia\nSaint Martin\nSaint Pierre and Miquelon\nSaint Vincent and the Grenadines\nSamoa\nSan Marino\nSaudi Arabia\nSenegal\nSerbia\nSerbia and Montenegro\nSeychelles\nSierra Leone\nSingapore\nSlovakia\nSlovenia\nSolomon Islands\nSomalia\nSouth Africa\nSouth Georgia and the South Sandwich Islands\nSouth Korea\nSpain\nSri Lanka\nSudan\nSuriname\nSvalbard and Jan Mayen\nSwaziland\nSweden\nSwitzerland\nSyria\nSão Tomé and PrÃ\xadncipe\nTaiwan\nTajikistan\nTanzania\nThailand\nTimor-Leste\nTogo\nTokelau\nTonga\nTrinidad and Tobago\nTunisia\nTurkey\nTurkmenistan\nTurks and Caicos Islands\nTuvalu\nU.S. Minor Outlying Islands\nU.S. Miscellaneous Pacific Islands\nU.S. Virgin Islands\nUganda\nUkraine\nUnion of Soviet Socialist Republics\nUnited Arab Emirates\nUnited Kingdom\nUnited States\nUnknown or Invalid Region\nUruguay\nUzbekistan\nVanuatu\nVatican City\nVenezuela\nVietnam\nWake Island\nWallis and Futuna\nWestern Sahara\nYemen\nZambia\nZimbabwe\nÃ…land Islands"]

In [98]:
df= data[data['C1'].notnull()]
df_new=df['C1']
print(df_new[22])

for i in df_new.index:
    p=df_new[i].split()
    #print(p)
    #print(getUniqueWords(p))
    countries=['USA','Germany','France','China','Japan','Australia','Canada','Brazil','Mexico','South Africa',
           'India','Korea','Israel','Turkey','Saudi Arabia','Iran','Spain','Netherlands','Sweden','Norway',
           'Poland','Indonesia','Brazil','Switzerland','Denmark','Singapore','Iceland','Hong Kong','New Zealand','Belgium',
           'Austria','Italy','Czech','Greece','Qatar','Portugal','Hungary','Argentina','Romania','England',
           'Taiwan','Lithuania','Finland','Russia','Kazakhstan']
    #print(countries)
    count=[]
#print(countries)
    for i in p:
        if i not in count:
            for j in countries:
                if (i==j or i==j+';'):
                    count.append(i)
    #print(count)
    country_list=[]
    for i in count:
        for j in countries:
            if (i==j or i==j+';'):
                country_list.append(j)
    print(list(set(country_list)))


Kazakh State Univ, Res Inst Expt & Theoret Phys, Almaty 480012, Kazakhstan
['USA', 'France']
['Russia', 'Poland', 'USA']
['Switzerland', 'Netherlands']
['India']
['Russia']
['China']
['India']
['Russia']
['Russia']
['Germany']
['Lithuania', 'USA']
['Norway']
['Finland']
['USA']
['Italy']
['France']
['USA']
['Germany', 'Japan', 'Kazakhstan', 'USA']
['Japan', 'Netherlands', 'France']
['Japan', 'USA', 'Italy']
['Germany', 'USA']
['Kazakhstan']
['Taiwan']
['Germany']
['Switzerland', 'Russia']
['Japan', 'USA']
['Spain']
['Russia']
['Russia']
['Japan']
['Germany']
['Japan']
['Germany']
['Japan']
['Japan']
['Germany', 'Italy']
['Japan']
['Japan', 'USA']
['Japan']
['Japan']
['Germany']
['Canada']
['Italy']
['USA']
['Germany', 'Russia']
['Japan']
['Japan']
['Japan']
['Germany', 'Japan', 'Sweden']

In [99]:
count=[]
countries=['France','USA','Japan','Sweden','Germany']
#print(countries)
for i in p:
    if i not in count:
        for j in countries:
            if (i==j or i==j+';'):
                count.append(i)
country_list=[]
for i in count:
    for j in countries:
        if (i==j or i==j+';'):
            country_list.append(j)
print(list(set(country_list)))


['Germany', 'Japan', 'Sweden']

In [100]:
country_list=[]
for i in count:
    for j in countries:
        if (i==j or i==j+';'):
            country_list.append(j)
print(list(set(country_list)))


['Germany', 'Japan', 'Sweden']

In [ ]:


In [ ]: