Lang Detect


In [ ]:
from langdetect import DetectorFactory, detect

# langdetect's detection is randomised: on short or ambiguous text it can
# return a different language code from one run to the next. Fixing the seed
# before the first call to detect() makes the printed results reproducible.
DetectorFactory.seed = 0

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(detect("War doesn't show who's right, just who's left."))
print(detect("Ein, zwei, drei, vier"))

More Languages


In [ ]:
import unicodecsv

# Detect the language of the first column of every row in the sample file,
# printing "<text> | <code>" per row and collecting the codes in `result`
# (read by the confusion-matrix cells further down, so the name is kept).
result = []

# The file is opened in binary mode because unicodecsv does the decoding itself.
with open('7languages.txt', 'rb') as input_file:
    for row in unicodecsv.reader(input_file):
        if not row:
            # A blank line is returned as an empty row; indexing it would
            # raise IndexError, so skip it.
            continue
        lang = detect(row[0])
        # Keep a native str so each label compares equal to the plain-string
        # truth labels on both Python 2 and Python 3. append() avoids
        # rebuilding the whole list for every row.
        result.append(str(lang))
        print(u"%s | %s" % (row[0], lang))

Confusion Matrix


In [ ]:
# Create the truth set: the expected language code for each row of the input
# file, in file order, to be compared position-by-position with `result`.
# NOTE(review): there are six labels here but the data file is named
#   '7languages.txt' -- confirm the row count; nltk.ConfusionMatrix raises
#   ValueError when the two lists differ in length.
# NOTE(review): langdetect reports Chinese as 'zh-cn' or 'zh-tw', so 'zh'
#   will presumably never match a prediction -- confirm this is intended.
truth = ['ko', 'de', 'zh', 'ar', 'es', 'ja']

# Show expected and detected labels for a quick visual comparison.
print(truth)
print(result)

In [ ]:
# Display the confusion matrix of expected vs. detected language codes.
import nltk

# First argument is the reference (truth) list, second is the test
# (predicted) list; in the printed table rows are reference, columns are test.
print(nltk.ConfusionMatrix(truth, result))

You Try It

  1. Put Lang Detect to the test on the 8 Genesis texts
  2. Output the detected language for each
  3. Did it get them all right?

In [ ]: