Problem Statement

Given a snippet of text in English, French, German, or Spanish, detect the snippet's language and print the language name. You may build an offline model for this. The snippet may contain one or more lines.

Constraints

The snippet will not exceed 3 kilobytes in size. The snippet will be in one of the following languages: English, French, German, or Spanish.



In [ ]:

    
# #Python Library Imports
import nltk
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords



In [16]:

    
lang_input = "The story of Rip Van Winkle is set in the years before and after the American Revolutionary War. In a pleasant village, at the foot of New York's Catskill Mountains, lives kindly Rip Van Winkle, a Dutch villager. Van Winkle enjoys solitary activities in the wilderness, but he is also loved by all in town—especially the children to whom he tells stories and gives toys. However, he tends to shirk hard work, to his nagging wife's dismay, which has caused his home and farm to fall into disarray. One autumn day, to escape his wife's nagging, Van Winkle wanders up the mountains with his dog, Wolf. Hearing his name called out, Rip sees a man wearing antiquated Dutch clothing; he is carrying a keg up the mountain and requires help."



In [17]:

    
# #Echo the input snippet to inspect its stored representation
lang_input









    Out[17]:





"The story of Rip Van Winkle is set in the years before and after the American Revolutionary War. In a pleasant village, at the foot of New York's Catskill Mountains, lives kindly Rip Van Winkle, a Dutch villager. Van Winkle enjoys solitary activities in the wilderness, but he is also loved by all in town\xe2\x80\x94especially the children to whom he tells stories and gives toys. However, he tends to shirk hard work, to his nagging wife's dismay, which has caused his home and farm to fall into disarray. One autumn day, to escape his wife's nagging, Van Winkle wanders up the mountains with his dog, Wolf. Hearing his name called out, Rip sees a man wearing antiquated Dutch clothing; he is carrying a keg up the mountain and requires help."



In [19]:

    
# #Tokenize the input text; words and punctuation come out as separate tokens
wordpunct_tokenize(lang_input)









    Out[19]:





['The',
 'story',
 'of',
 'Rip',
 'Van',
 'Winkle',
 'is',
 'set',
 'in',
 'the',
 'years',
 'before',
 'and',
 'after',
 'the',
 'American',
 'Revolutionary',
 'War',
 '.',
 'In',
 'a',
 'pleasant',
 'village',
 ',',
 'at',
 'the',
 'foot',
 'of',
 'New',
 'York',
 "'",
 's',
 'Catskill',
 'Mountains',
 ',',
 'lives',
 'kindly',
 'Rip',
 'Van',
 'Winkle',
 ',',
 'a',
 'Dutch',
 'villager',
 '.',
 'Van',
 'Winkle',
 'enjoys',
 'solitary',
 'activities',
 'in',
 'the',
 'wilderness',
 ',',
 'but',
 'he',
 'is',
 'also',
 'loved',
 'by',
 'all',
 'in',
 'town\xe2',
 '\x80\x94',
 'especially',
 'the',
 'children',
 'to',
 'whom',
 'he',
 'tells',
 'stories',
 'and',
 'gives',
 'toys',
 '.',
 'However',
 ',',
 'he',
 'tends',
 'to',
 'shirk',
 'hard',
 'work',
 ',',
 'to',
 'his',
 'nagging',
 'wife',
 "'",
 's',
 'dismay',
 ',',
 'which',
 'has',
 'caused',
 'his',
 'home',
 'and',
 'farm',
 'to',
 'fall',
 'into',
 'disarray',
 '.',
 'One',
 'autumn',
 'day',
 ',',
 'to',
 'escape',
 'his',
 'wife',
 "'",
 's',
 'nagging',
 ',',
 'Van',
 'Winkle',
 'wanders',
 'up',
 'the',
 'mountains',
 'with',
 'his',
 'dog',
 ',',
 'Wolf',
 '.',
 'Hearing',
 'his',
 'name',
 'called',
 'out',
 ',',
 'Rip',
 'sees',
 'a',
 'man',
 'wearing',
 'antiquated',
 'Dutch',
 'clothing',
 ';',
 'he',
 'is',
 'carrying',
 'a',
 'keg',
 'up',
 'the',
 'mountain',
 'and',
 'requires',
 'help',
 '.']



In [29]:

    
# #Languages for which the NLTK stopwords corpus provides a word list
stopwords.fileids()









    Out[29]:





[u'danish',
 u'dutch',
 u'english',
 u'finnish',
 u'french',
 u'german',
 u'hungarian',
 u'italian',
 u'norwegian',
 u'portuguese',
 u'russian',
 u'spanish',
 u'swedish',
 u'turkish']



In [32]:

    
# #Peek at a few English stopwords (list positions 1 to 4)
stopwords.words('english')[1:5]









    Out[32]:





[u'me', u'my', u'myself', u'we']



In [33]:

    
# #Peek at a few French stopwords (list positions 1 to 4)
stopwords.words('french')[1:5]









    Out[33]:





[u'aux', u'avec', u'ce', u'ces']



In [37]:

    
# #Decode raw UTF-8 bytes to text before tokenizing. Tokenizing the byte
# #string splits multi-byte characters (an em dash comes out as the two
# #tokens 'town\xe2' and '\x80\x94') and yields byte tokens, whereas the
# #NLTK stopwords are unicode. Input that is already text is used as is;
# #undecodable bytes are replaced instead of raising.
if isinstance(lang_input, bytes):
    lang_input_text = lang_input.decode('utf-8', 'replace')
else:
    lang_input_text = lang_input

lang_input_tokenize = wordpunct_tokenize(lang_input_text)
lang_input_tokenize_lower = [word.lower() for word in lang_input_tokenize]

# #SET of distinct lower-cased tokens of the input. It does not depend on
# #the language being scored, so it is built once, outside the loop.
lang_input_tokenize_lower_set = set(lang_input_tokenize_lower)

languages_ratios = {}

for language in stopwords.fileids():
    # #SET of stopwords for a particular language
    stopwords_set = set(stopwords.words(language))

    # #INTERSECTION between the two SETs
    common_elements = lang_input_tokenize_lower_set.intersection(stopwords_set)

    # #Score of the language = number of distinct stopwords seen in the input
    languages_ratios[language] = len(common_elements)



In [38]:

    
# #Show the per-language count of distinct stopwords found in the input
languages_ratios









    Out[38]:





{u'danish': 3,
 u'dutch': 4,
 u'english': 23,
 u'finnish': 1,
 u'french': 1,
 u'german': 4,
 u'hungarian': 3,
 u'italian': 3,
 u'norwegian': 2,
 u'portuguese': 1,
 u'russian': 0,
 u'spanish': 3,
 u'swedish': 1,
 u'turkish': 0}



In [40]:

    
# #Language with the highest stopword count. Iterating the dict directly
# #(instead of the Python-2-only iterkeys) works on Python 2 and 3 alike.
max(languages_ratios, key=languages_ratios.get)









    Out[40]:





u'english'



In [ ]:



In [ ]:



In [41]:

    
# #Full English stopword list, for reference
stopwords.words("english")









    Out[41]:





[u'i',
 u'me',
 u'my',
 u'myself',
 u'we',
 u'our',
 u'ours',
 u'ourselves',
 u'you',
 u'your',
 u'yours',
 u'yourself',
 u'yourselves',
 u'he',
 u'him',
 u'his',
 u'himself',
 u'she',
 u'her',
 u'hers',
 u'herself',
 u'it',
 u'its',
 u'itself',
 u'they',
 u'them',
 u'their',
 u'theirs',
 u'themselves',
 u'what',
 u'which',
 u'who',
 u'whom',
 u'this',
 u'that',
 u'these',
 u'those',
 u'am',
 u'is',
 u'are',
 u'was',
 u'were',
 u'be',
 u'been',
 u'being',
 u'have',
 u'has',
 u'had',
 u'having',
 u'do',
 u'does',
 u'did',
 u'doing',
 u'a',
 u'an',
 u'the',
 u'and',
 u'but',
 u'if',
 u'or',
 u'because',
 u'as',
 u'until',
 u'while',
 u'of',
 u'at',
 u'by',
 u'for',
 u'with',
 u'about',
 u'against',
 u'between',
 u'into',
 u'through',
 u'during',
 u'before',
 u'after',
 u'above',
 u'below',
 u'to',
 u'from',
 u'up',
 u'down',
 u'in',
 u'out',
 u'on',
 u'off',
 u'over',
 u'under',
 u'again',
 u'further',
 u'then',
 u'once',
 u'here',
 u'there',
 u'when',
 u'where',
 u'why',
 u'how',
 u'all',
 u'any',
 u'both',
 u'each',
 u'few',
 u'more',
 u'most',
 u'other',
 u'some',
 u'such',
 u'no',
 u'nor',
 u'not',
 u'only',
 u'own',
 u'same',
 u'so',
 u'than',
 u'too',
 u'very',
 u's',
 u't',
 u'can',
 u'will',
 u'just',
 u'don',
 u'should',
 u'now']



In [43]:

    
# #The four languages the problem statement restricts the input to
arr_languages = ["english", "french", "german", "spanish"]

# #Map each of those languages to its NLTK stopword list
dict_stopwords = {
    lang_name: stopwords.words(lang_name) for lang_name in arr_languages
}



In [44]:

    
# #Dump the table so it can be pasted into an offline, NLTK-free solution.
# #The call form of print produces the same output on Python 2 and also
# #parses on Python 3.
print(dict_stopwords)









    



{'german': [u'aber', u'alle', u'allem', u'allen', u'aller', u'alles', u'als', u'also', u'am', u'an', u'ander', u'andere', u'anderem', u'anderen', u'anderer', u'anderes', u'anderm', u'andern', u'anderr', u'anders', u'auch', u'auf', u'aus', u'bei', u'bin', u'bis', u'bist', u'da', u'damit', u'dann', u'der', u'den', u'des', u'dem', u'die', u'das', u'da\xdf', u'derselbe', u'derselben', u'denselben', u'desselben', u'demselben', u'dieselbe', u'dieselben', u'dasselbe', u'dazu', u'dein', u'deine', u'deinem', u'deinen', u'deiner', u'deines', u'denn', u'derer', u'dessen', u'dich', u'dir', u'du', u'dies', u'diese', u'diesem', u'diesen', u'dieser', u'dieses', u'doch', u'dort', u'durch', u'ein', u'eine', u'einem', u'einen', u'einer', u'eines', u'einig', u'einige', u'einigem', u'einigen', u'einiger', u'einiges', u'einmal', u'er', u'ihn', u'ihm', u'es', u'etwas', u'euer', u'eure', u'eurem', u'euren', u'eurer', u'eures', u'f\xfcr', u'gegen', u'gewesen', u'hab', u'habe', u'haben', u'hat', u'hatte', u'hatten', u'hier', u'hin', u'hinter', u'ich', u'mich', u'mir', u'ihr', u'ihre', u'ihrem', u'ihren', u'ihrer', u'ihres', u'euch', u'im', u'in', u'indem', u'ins', u'ist', u'jede', u'jedem', u'jeden', u'jeder', u'jedes', u'jene', u'jenem', u'jenen', u'jener', u'jenes', u'jetzt', u'kann', u'kein', u'keine', u'keinem', u'keinen', u'keiner', u'keines', u'k\xf6nnen', u'k\xf6nnte', u'machen', u'man', u'manche', u'manchem', u'manchen', u'mancher', u'manches', u'mein', u'meine', u'meinem', u'meinen', u'meiner', u'meines', u'mit', u'muss', u'musste', u'nach', u'nicht', u'nichts', u'noch', u'nun', u'nur', u'ob', u'oder', u'ohne', u'sehr', u'sein', u'seine', u'seinem', u'seinen', u'seiner', u'seines', u'selbst', u'sich', u'sie', u'ihnen', u'sind', u'so', u'solche', u'solchem', u'solchen', u'solcher', u'solches', u'soll', u'sollte', u'sondern', u'sonst', u'\xfcber', u'um', u'und', u'uns', u'unse', u'unsem', u'unsen', u'unser', u'unses', u'unter', u'viel', u'vom', u'von', u'vor', u'w\xe4hrend', u'war', 
u'waren', u'warst', u'was', u'weg', u'weil', u'weiter', u'welche', u'welchem', u'welchen', u'welcher', u'welches', u'wenn', u'werde', u'werden', u'wie', u'wieder', u'will', u'wir', u'wird', u'wirst', u'wo', u'wollen', u'wollte', u'w\xfcrde', u'w\xfcrden', u'zu', u'zum', u'zur', u'zwar', u'zwischen'], 'spanish': [u'de', u'la', u'que', u'el', u'en', u'y', u'a', u'los', u'del', u'se', u'las', u'por', u'un', u'para', u'con', u'no', u'una', u'su', u'al', u'lo', u'como', u'm\xe1s', u'pero', u'sus', u'le', u'ya', u'o', u'este', u's\xed', u'porque', u'esta', u'entre', u'cuando', u'muy', u'sin', u'sobre', u'tambi\xe9n', u'me', u'hasta', u'hay', u'donde', u'quien', u'desde', u'todo', u'nos', u'durante', u'todos', u'uno', u'les', u'ni', u'contra', u'otros', u'ese', u'eso', u'ante', u'ellos', u'e', u'esto', u'm\xed', u'antes', u'algunos', u'qu\xe9', u'unos', u'yo', u'otro', u'otras', u'otra', u'\xe9l', u'tanto', u'esa', u'estos', u'mucho', u'quienes', u'nada', u'muchos', u'cual', u'poco', u'ella', u'estar', u'estas', u'algunas', u'algo', u'nosotros', u'mi', u'mis', u't\xfa', u'te', u'ti', u'tu', u'tus', u'ellas', u'nosotras', u'vosostros', u'vosostras', u'os', u'm\xedo', u'm\xeda', u'm\xedos', u'm\xedas', u'tuyo', u'tuya', u'tuyos', u'tuyas', u'suyo', u'suya', u'suyos', u'suyas', u'nuestro', u'nuestra', u'nuestros', u'nuestras', u'vuestro', u'vuestra', u'vuestros', u'vuestras', u'esos', u'esas', u'estoy', u'est\xe1s', u'est\xe1', u'estamos', u'est\xe1is', u'est\xe1n', u'est\xe9', u'est\xe9s', u'estemos', u'est\xe9is', u'est\xe9n', u'estar\xe9', u'estar\xe1s', u'estar\xe1', u'estaremos', u'estar\xe9is', u'estar\xe1n', u'estar\xeda', u'estar\xedas', u'estar\xedamos', u'estar\xedais', u'estar\xedan', u'estaba', u'estabas', u'est\xe1bamos', u'estabais', u'estaban', u'estuve', u'estuviste', u'estuvo', u'estuvimos', u'estuvisteis', u'estuvieron', u'estuviera', u'estuvieras', u'estuvi\xe9ramos', u'estuvierais', u'estuvieran', u'estuviese', u'estuvieses', u'estuvi\xe9semos', 
u'estuvieseis', u'estuviesen', u'estando', u'estado', u'estada', u'estados', u'estadas', u'estad', u'he', u'has', u'ha', u'hemos', u'hab\xe9is', u'han', u'haya', u'hayas', u'hayamos', u'hay\xe1is', u'hayan', u'habr\xe9', u'habr\xe1s', u'habr\xe1', u'habremos', u'habr\xe9is', u'habr\xe1n', u'habr\xeda', u'habr\xedas', u'habr\xedamos', u'habr\xedais', u'habr\xedan', u'hab\xeda', u'hab\xedas', u'hab\xedamos', u'hab\xedais', u'hab\xedan', u'hube', u'hubiste', u'hubo', u'hubimos', u'hubisteis', u'hubieron', u'hubiera', u'hubieras', u'hubi\xe9ramos', u'hubierais', u'hubieran', u'hubiese', u'hubieses', u'hubi\xe9semos', u'hubieseis', u'hubiesen', u'habiendo', u'habido', u'habida', u'habidos', u'habidas', u'soy', u'eres', u'es', u'somos', u'sois', u'son', u'sea', u'seas', u'seamos', u'se\xe1is', u'sean', u'ser\xe9', u'ser\xe1s', u'ser\xe1', u'seremos', u'ser\xe9is', u'ser\xe1n', u'ser\xeda', u'ser\xedas', u'ser\xedamos', u'ser\xedais', u'ser\xedan', u'era', u'eras', u'\xe9ramos', u'erais', u'eran', u'fui', u'fuiste', u'fue', u'fuimos', u'fuisteis', u'fueron', u'fuera', u'fueras', u'fu\xe9ramos', u'fuerais', u'fueran', u'fuese', u'fueses', u'fu\xe9semos', u'fueseis', u'fuesen', u'sintiendo', u'sentido', u'sentida', u'sentidos', u'sentidas', u'siente', u'sentid', u'tengo', u'tienes', u'tiene', u'tenemos', u'ten\xe9is', u'tienen', u'tenga', u'tengas', u'tengamos', u'teng\xe1is', u'tengan', u'tendr\xe9', u'tendr\xe1s', u'tendr\xe1', u'tendremos', u'tendr\xe9is', u'tendr\xe1n', u'tendr\xeda', u'tendr\xedas', u'tendr\xedamos', u'tendr\xedais', u'tendr\xedan', u'ten\xeda', u'ten\xedas', u'ten\xedamos', u'ten\xedais', u'ten\xedan', u'tuve', u'tuviste', u'tuvo', u'tuvimos', u'tuvisteis', u'tuvieron', u'tuviera', u'tuvieras', u'tuvi\xe9ramos', u'tuvierais', u'tuvieran', u'tuviese', u'tuvieses', u'tuvi\xe9semos', u'tuvieseis', u'tuviesen', u'teniendo', u'tenido', u'tenida', u'tenidos', u'tenidas', u'tened'], 'french': [u'au', u'aux', u'avec', u'ce', u'ces', u'dans', u'de', u'des', 
u'du', u'elle', u'en', u'et', u'eux', u'il', u'je', u'la', u'le', u'leur', u'lui', u'ma', u'mais', u'me', u'm\xeame', u'mes', u'moi', u'mon', u'ne', u'nos', u'notre', u'nous', u'on', u'ou', u'par', u'pas', u'pour', u'qu', u'que', u'qui', u'sa', u'se', u'ses', u'son', u'sur', u'ta', u'te', u'tes', u'toi', u'ton', u'tu', u'un', u'une', u'vos', u'votre', u'vous', u'c', u'd', u'j', u'l', u'\xe0', u'm', u'n', u's', u't', u'y', u'\xe9t\xe9', u'\xe9t\xe9e', u'\xe9t\xe9es', u'\xe9t\xe9s', u'\xe9tant', u'\xe9tante', u'\xe9tants', u'\xe9tantes', u'suis', u'es', u'est', u'sommes', u'\xeates', u'sont', u'serai', u'seras', u'sera', u'serons', u'serez', u'seront', u'serais', u'serait', u'serions', u'seriez', u'seraient', u'\xe9tais', u'\xe9tait', u'\xe9tions', u'\xe9tiez', u'\xe9taient', u'fus', u'fut', u'f\xfbmes', u'f\xfbtes', u'furent', u'sois', u'soit', u'soyons', u'soyez', u'soient', u'fusse', u'fusses', u'f\xfbt', u'fussions', u'fussiez', u'fussent', u'ayant', u'ayante', u'ayantes', u'ayants', u'eu', u'eue', u'eues', u'eus', u'ai', u'as', u'avons', u'avez', u'ont', u'aurai', u'auras', u'aura', u'aurons', u'aurez', u'auront', u'aurais', u'aurait', u'aurions', u'auriez', u'auraient', u'avais', u'avait', u'avions', u'aviez', u'avaient', u'eut', u'e\xfbmes', u'e\xfbtes', u'eurent', u'aie', u'aies', u'ait', u'ayons', u'ayez', u'aient', u'eusse', u'eusses', u'e\xfbt', u'eussions', u'eussiez', u'eussent'], 'english': [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', u'as', u'until', 
u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']}



In [54]:

    
# #Decode raw UTF-8 bytes to text before tokenizing. Tokenizing the byte
# #string splits multi-byte characters (an em dash comes out as the two
# #tokens 'town\xe2' and '\x80\x94') and yields byte tokens, whereas the
# #stopword table below holds unicode strings. Input that is already text
# #is used as is; undecodable bytes are replaced instead of raising.
if isinstance(lang_input, bytes):
    lang_input_text = lang_input.decode('utf-8', 'replace')
else:
    lang_input_text = lang_input

lang_input_tokenize = wordpunct_tokenize(lang_input_text)
lang_input_tokenize_lower = [word.lower() for word in lang_input_tokenize]

# #Per-language score, filled in by the loop further down
languages_ratios = {}

dict_stopwords_1 = {'german': [u'aber', u'alle', u'allem', u'allen', u'aller', u'alles', u'als', u'also', u'am', u'an', u'ander', u'andere', u'anderem', u'anderen', u'anderer', u'anderes', u'anderm', u'andern', u'anderr', u'anders', u'auch', u'auf', u'aus', u'bei', u'bin', u'bis', u'bist', u'da', u'damit', u'dann', u'der', u'den', u'des', u'dem', u'die', u'das', u'da\xdf', u'derselbe', u'derselben', u'denselben', u'desselben', u'demselben', u'dieselbe', u'dieselben', u'dasselbe', u'dazu', u'dein', u'deine', u'deinem', u'deinen', u'deiner', u'deines', u'denn', u'derer', u'dessen', u'dich', u'dir', u'du', u'dies', u'diese', u'diesem', u'diesen', u'dieser', u'dieses', u'doch', u'dort', u'durch', u'ein', u'eine', u'einem', u'einen', u'einer', u'eines', u'einig', u'einige', u'einigem', u'einigen', u'einiger', u'einiges', u'einmal', u'er', u'ihn', u'ihm', u'es', u'etwas', u'euer', u'eure', u'eurem', u'euren', u'eurer', u'eures', u'f\xfcr', u'gegen', u'gewesen', u'hab', u'habe', u'haben', u'hat', u'hatte', u'hatten', u'hier', u'hin', u'hinter', u'ich', u'mich', u'mir', u'ihr', u'ihre', u'ihrem', u'ihren', u'ihrer', u'ihres', u'euch', u'im', u'in', u'indem', u'ins', u'ist', u'jede', u'jedem', u'jeden', u'jeder', u'jedes', u'jene', u'jenem', u'jenen', u'jener', u'jenes', u'jetzt', u'kann', u'kein', u'keine', u'keinem', u'keinen', u'keiner', u'keines', u'k\xf6nnen', u'k\xf6nnte', u'machen', u'man', u'manche', u'manchem', u'manchen', u'mancher', u'manches', u'mein', u'meine', u'meinem', u'meinen', u'meiner', u'meines', u'mit', u'muss', u'musste', u'nach', u'nicht', u'nichts', u'noch', u'nun', u'nur', u'ob', u'oder', u'ohne', u'sehr', u'sein', u'seine', u'seinem', u'seinen', u'seiner', u'seines', u'selbst', u'sich', u'sie', u'ihnen', u'sind', u'so', u'solche', u'solchem', u'solchen', u'solcher', u'solches', u'soll', u'sollte', u'sondern', u'sonst', u'\xfcber', u'um', u'und', u'uns', u'unse', u'unsem', u'unsen', u'unser', u'unses', u'unter', u'viel', u'vom', u'von', u'vor', 
u'w\xe4hrend', u'war', u'waren', u'warst', u'was', u'weg', u'weil', u'weiter', u'welche', u'welchem', u'welchen', u'welcher', u'welches', u'wenn', u'werde', u'werden', u'wie', u'wieder', u'will', u'wir', u'wird', u'wirst', u'wo', u'wollen', u'wollte', u'w\xfcrde', u'w\xfcrden', u'zu', u'zum', u'zur', u'zwar', u'zwischen'], 'spanish': [u'de', u'la', u'que', u'el', u'en', u'y', u'a', u'los', u'del', u'se', u'las', u'por', u'un', u'para', u'con', u'no', u'una', u'su', u'al', u'lo', u'como', u'm\xe1s', u'pero', u'sus', u'le', u'ya', u'o', u'este', u's\xed', u'porque', u'esta', u'entre', u'cuando', u'muy', u'sin', u'sobre', u'tambi\xe9n', u'me', u'hasta', u'hay', u'donde', u'quien', u'desde', u'todo', u'nos', u'durante', u'todos', u'uno', u'les', u'ni', u'contra', u'otros', u'ese', u'eso', u'ante', u'ellos', u'e', u'esto', u'm\xed', u'antes', u'algunos', u'qu\xe9', u'unos', u'yo', u'otro', u'otras', u'otra', u'\xe9l', u'tanto', u'esa', u'estos', u'mucho', u'quienes', u'nada', u'muchos', u'cual', u'poco', u'ella', u'estar', u'estas', u'algunas', u'algo', u'nosotros', u'mi', u'mis', u't\xfa', u'te', u'ti', u'tu', u'tus', u'ellas', u'nosotras', u'vosostros', u'vosostras', u'os', u'm\xedo', u'm\xeda', u'm\xedos', u'm\xedas', u'tuyo', u'tuya', u'tuyos', u'tuyas', u'suyo', u'suya', u'suyos', u'suyas', u'nuestro', u'nuestra', u'nuestros', u'nuestras', u'vuestro', u'vuestra', u'vuestros', u'vuestras', u'esos', u'esas', u'estoy', u'est\xe1s', u'est\xe1', u'estamos', u'est\xe1is', u'est\xe1n', u'est\xe9', u'est\xe9s', u'estemos', u'est\xe9is', u'est\xe9n', u'estar\xe9', u'estar\xe1s', u'estar\xe1', u'estaremos', u'estar\xe9is', u'estar\xe1n', u'estar\xeda', u'estar\xedas', u'estar\xedamos', u'estar\xedais', u'estar\xedan', u'estaba', u'estabas', u'est\xe1bamos', u'estabais', u'estaban', u'estuve', u'estuviste', u'estuvo', u'estuvimos', u'estuvisteis', u'estuvieron', u'estuviera', u'estuvieras', u'estuvi\xe9ramos', u'estuvierais', u'estuvieran', u'estuviese', u'estuvieses', 
u'estuvi\xe9semos', u'estuvieseis', u'estuviesen', u'estando', u'estado', u'estada', u'estados', u'estadas', u'estad', u'he', u'has', u'ha', u'hemos', u'hab\xe9is', u'han', u'haya', u'hayas', u'hayamos', u'hay\xe1is', u'hayan', u'habr\xe9', u'habr\xe1s', u'habr\xe1', u'habremos', u'habr\xe9is', u'habr\xe1n', u'habr\xeda', u'habr\xedas', u'habr\xedamos', u'habr\xedais', u'habr\xedan', u'hab\xeda', u'hab\xedas', u'hab\xedamos', u'hab\xedais', u'hab\xedan', u'hube', u'hubiste', u'hubo', u'hubimos', u'hubisteis', u'hubieron', u'hubiera', u'hubieras', u'hubi\xe9ramos', u'hubierais', u'hubieran', u'hubiese', u'hubieses', u'hubi\xe9semos', u'hubieseis', u'hubiesen', u'habiendo', u'habido', u'habida', u'habidos', u'habidas', u'soy', u'eres', u'es', u'somos', u'sois', u'son', u'sea', u'seas', u'seamos', u'se\xe1is', u'sean', u'ser\xe9', u'ser\xe1s', u'ser\xe1', u'seremos', u'ser\xe9is', u'ser\xe1n', u'ser\xeda', u'ser\xedas', u'ser\xedamos', u'ser\xedais', u'ser\xedan', u'era', u'eras', u'\xe9ramos', u'erais', u'eran', u'fui', u'fuiste', u'fue', u'fuimos', u'fuisteis', u'fueron', u'fuera', u'fueras', u'fu\xe9ramos', u'fuerais', u'fueran', u'fuese', u'fueses', u'fu\xe9semos', u'fueseis', u'fuesen', u'sintiendo', u'sentido', u'sentida', u'sentidos', u'sentidas', u'siente', u'sentid', u'tengo', u'tienes', u'tiene', u'tenemos', u'ten\xe9is', u'tienen', u'tenga', u'tengas', u'tengamos', u'teng\xe1is', u'tengan', u'tendr\xe9', u'tendr\xe1s', u'tendr\xe1', u'tendremos', u'tendr\xe9is', u'tendr\xe1n', u'tendr\xeda', u'tendr\xedas', u'tendr\xedamos', u'tendr\xedais', u'tendr\xedan', u'ten\xeda', u'ten\xedas', u'ten\xedamos', u'ten\xedais', u'ten\xedan', u'tuve', u'tuviste', u'tuvo', u'tuvimos', u'tuvisteis', u'tuvieron', u'tuviera', u'tuvieras', u'tuvi\xe9ramos', u'tuvierais', u'tuvieran', u'tuviese', u'tuvieses', u'tuvi\xe9semos', u'tuvieseis', u'tuviesen', u'teniendo', u'tenido', u'tenida', u'tenidos', u'tenidas', u'tened'], 'french': [u'au', u'aux', u'avec', u'ce', u'ces', 
u'dans', u'de', u'des', u'du', u'elle', u'en', u'et', u'eux', u'il', u'je', u'la', u'le', u'leur', u'lui', u'ma', u'mais', u'me', u'm\xeame', u'mes', u'moi', u'mon', u'ne', u'nos', u'notre', u'nous', u'on', u'ou', u'par', u'pas', u'pour', u'qu', u'que', u'qui', u'sa', u'se', u'ses', u'son', u'sur', u'ta', u'te', u'tes', u'toi', u'ton', u'tu', u'un', u'une', u'vos', u'votre', u'vous', u'c', u'd', u'j', u'l', u'\xe0', u'm', u'n', u's', u't', u'y', u'\xe9t\xe9', u'\xe9t\xe9e', u'\xe9t\xe9es', u'\xe9t\xe9s', u'\xe9tant', u'\xe9tante', u'\xe9tants', u'\xe9tantes', u'suis', u'es', u'est', u'sommes', u'\xeates', u'sont', u'serai', u'seras', u'sera', u'serons', u'serez', u'seront', u'serais', u'serait', u'serions', u'seriez', u'seraient', u'\xe9tais', u'\xe9tait', u'\xe9tions', u'\xe9tiez', u'\xe9taient', u'fus', u'fut', u'f\xfbmes', u'f\xfbtes', u'furent', u'sois', u'soit', u'soyons', u'soyez', u'soient', u'fusse', u'fusses', u'f\xfbt', u'fussions', u'fussiez', u'fussent', u'ayant', u'ayante', u'ayantes', u'ayants', u'eu', u'eue', u'eues', u'eus', u'ai', u'as', u'avons', u'avez', u'ont', u'aurai', u'auras', u'aura', u'aurons', u'aurez', u'auront', u'aurais', u'aurait', u'aurions', u'auriez', u'auraient', u'avais', u'avait', u'avions', u'aviez', u'avaient', u'eut', u'e\xfbmes', u'e\xfbtes', u'eurent', u'aie', u'aies', u'ait', u'ayons', u'ayez', u'aient', u'eusse', u'eusses', u'e\xfbt', u'eussions', u'eussiez', u'eussent'], 'english': [u'i', u'me', u'my', u'myself', u'we', u'our', u'ours', u'ourselves', u'you', u'your', u'yours', u'yourself', u'yourselves', u'he', u'him', u'his', u'himself', u'she', u'her', u'hers', u'herself', u'it', u'its', u'itself', u'they', u'them', u'their', u'theirs', u'themselves', u'what', u'which', u'who', u'whom', u'this', u'that', u'these', u'those', u'am', u'is', u'are', u'was', u'were', u'be', u'been', u'being', u'have', u'has', u'had', u'having', u'do', u'does', u'did', u'doing', u'a', u'an', u'the', u'and', u'but', u'if', u'or', u'because', 
u'as', u'until', u'while', u'of', u'at', u'by', u'for', u'with', u'about', u'against', u'between', u'into', u'through', u'during', u'before', u'after', u'above', u'below', u'to', u'from', u'up', u'down', u'in', u'out', u'on', u'off', u'over', u'under', u'again', u'further', u'then', u'once', u'here', u'there', u'when', u'where', u'why', u'how', u'all', u'any', u'both', u'each', u'few', u'more', u'most', u'other', u'some', u'such', u'no', u'nor', u'not', u'only', u'own', u'same', u'so', u'than', u'too', u'very', u's', u't', u'can', u'will', u'just', u'don', u'should', u'now']}

# #SET of distinct lower-cased tokens of the input. It does not depend on
# #the language being scored, so it is built once, outside the loop.
lang_input_tokenize_lower_set = set(lang_input_tokenize_lower)

for var_lang in dict_stopwords_1:
    # #SET of stopwords for a particular language, read from the inlined
    # #table dict_stopwords_1 (not from dict_stopwords, which is built from
    # #NLTK in an earlier cell) so the offline table is what gets used
    stopwords_set = set(dict_stopwords_1[var_lang])

    # #INTERSECTION between the two SETs
    common_elements = lang_input_tokenize_lower_set.intersection(stopwords_set)

    # #Score of the language = number of distinct stopwords seen in the input
    languages_ratios[var_lang] = len(common_elements)

# #Print the best-scoring language, capitalized as the problem expects.
# #Written so that it runs unchanged on Python 2 and Python 3.
print(max(languages_ratios, key=languages_ratios.get).capitalize())









    



English



In [ ]: