Lang Detect



In [ ]:

    
from langdetect import DetectorFactory, detect

# langdetect's algorithm is randomised, so short inputs can be labelled
# differently from one run to the next; fixing the seed keeps the output of
# this notebook reproducible under Restart & Run All.
DetectorFactory.seed = 0

# Two quick sanity checks: an English sentence and a short German phrase.
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(detect("War doesn't show who's right, just who's left."))
print(detect("Ein, zwei, drei, vier"))

More Languages



In [ ]:

    
import unicodecsv 
# Detect the language of the first column of every row in 7languages.txt,
# printing "text | code" per row and collecting the codes in `result`.
with open('7languages.txt', 'rb') as input_file:
    row_reader = unicodecsv.reader(input_file)
    result = []
    for row in row_reader:
        # Skip rows without text: row[0] raises IndexError on an empty row,
        # and blank text gives the detector nothing to work with.
        if not row or not row[0].strip():
            continue
        lang = detect(row[0])
        label = lang.encode('ascii', 'ignore')
        if not isinstance(label, str):
            # On Python 3 encode() returns bytes, which never compare equal
            # to plain string labels; convert back to str.
            label = label.decode('ascii')
        # append() grows the list in place instead of rebuilding it each time.
        result.append(label)
        # One formatted string prints the same "text | code" line on
        # Python 2 and Python 3.
        print(u"%s | %s" % (row[0], lang))

Confusion Matrix



In [ ]:

    
# Create the truth set: the expected language code for each sample. These
# labels are compared position-by-position with `result`, so they are
# presumably listed in the same order as the rows of 7languages.txt
# (TODO confirm).
# NOTE(review): langdetect reports Chinese as 'zh-cn' or 'zh-tw', so the
# label 'zh' may never match a prediction -- confirm the intended code.
# NOTE(review): there are six labels here but the input file is named
# "7languages" -- confirm both lists have the same length.
truth = ['ko', 'de', 'zh', 'ar', 'es', 'ja']
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(truth)
print(result)



In [ ]:

    
# Display the confusion matrix of expected labels versus detected labels.
import nltk

# The first argument is the reference (truth) list and the second is the
# list of detected labels; the two lists must be the same length.
# print(...) with one argument works on both Python 2 and Python 3.
print(nltk.ConfusionMatrix(truth, result))

You Try It

Put Lang Detect to the test on the 8 Genesis texts
Output the detected language for each
Did it get them all right?



In [ ]: