20. 자연어처리

1) 워드 클라우드

  • 단어의 크기를 단어의 빈도 수에 비례하도록 하여 단어를 아름답게 배치

In [1]:
import math, random, re
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt


---------------------------------------------------------------------------
ImportError                               Traceback (most recent call last)
<ipython-input-1-fbf6abeb8355> in <module>()
      3 from bs4 import BeautifulSoup
      4 import requests
----> 5 import matplotlib.pyplot as plt

ImportError: No module named 'matplotlib'

In [28]:
# Data-science-related keywords. Each entry is
# (word, popularity on job postings, popularity on resumes),
# with both popularity scores on a 0-100 scale.
data = [
    ("big data", 100, 15),
    ("Hadoop", 95, 25),
    ("Python", 75, 50),
    ("R", 50, 40),
    ("machine learning", 80, 20),
    ("statistics", 20, 60),
    ("data science", 60, 70),
    ("analytics", 90, 3),
    ("team player", 85, 85),
    ("dynamic", 2, 90),
    ("synergies", 70, 0),
    ("actionable insights", 40, 30),
    ("think out of the box", 45, 10),
    ("self-starter", 30, 50),
    ("customer focus", 65, 15),
    ("thought leadership", 35, 35),
]

워드 클라우드는 아주 멋있어 보이기는 하지만, 딱히 어떤 정보를 제공하지는 않는다.

단어가 구인 광고에 등장하는 빈도를 가로축으로, 이력서에 등장하는 빈도를 세로축으로 두면 더 많은 정보를 전달할 수 있다.


In [29]:
def text_size(total):
    """Map a combined popularity score to a font size.

    The mapping is linear: a total of 0 gives 8 and a total of 200 gives 28.
    """
    fraction_of_max = total / 200
    return 8 + fraction_of_max * 20

In [30]:
# Draw each word centered at (job-posting popularity, resume popularity),
# with a font size derived from the sum of the two scores.
for word, job_popularity, resume_popularity in data:
    combined_popularity = job_popularity + resume_popularity
    plt.text(
        job_popularity,
        resume_popularity,
        word,
        ha='center',
        va='center',
        size=text_size(combined_popularity),
    )

plt.xlabel("Popularity on Job Postings")
plt.ylabel("Popularity on Resumes")
plt.axis([0, 100, 0, 100])  # x from 0 to 100, y from 0 to 100
plt.show()


2) n-gram 모델


In [31]:
# Convert Unicode quotation marks to plain ASCII quotes
def fix_unicode(text):
    """Return ``text`` with each U+2019 character replaced by an ASCII apostrophe."""
    curly_apostrophe = u"\u2019"
    ascii_apostrophe = "'"
    return text.replace(curly_apostrophe, ascii_apostrophe)

In [59]:
def get_document():
    """Download the "What is data science?" article and split it into tokens.

    The text of every <p> element inside the article container is passed
    through fix_unicode and tokenized; all tokens are collected in page order.

    Returns:
        list of str: each token is either a run of word characters and
        apostrophes, or a single period.

    Raises:
        requests.exceptions.RequestException: if the request fails, times
            out, or the server answers with an HTTP error status.
        ValueError: if none of the known article container divs is present.
    """
    url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"

    # Without a timeout a stalled server would block this call indefinitely;
    # raise_for_status() stops us from parsing an error page as the article.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html5lib')

    # "article-body" is the class that matched when these notes were written;
    # "entry-content" is the earlier lookup that returned None. Try both so a
    # change of page template does not end in an opaque NoneType error.
    content = None
    for css_class in ("article-body", "entry-content"):
        content = soup.find("div", css_class)
        if content is not None:
            break
    if content is None:
        raise ValueError(
            "could not find the article container "
            "(div.article-body or div.entry-content) in " + url
        )

    # Matches a word (letters, digits, underscore, apostrophe) or a period.
    regex = re.compile(r"[\w']+|[\.]")

    document = []
    for paragraph in content.find_all("p"):
        document.extend(regex.findall(fix_unicode(paragraph.text)))

    return document

In [72]:
# Fetch and tokenize the article; get_document() performs an HTTP request.
document = get_document()
# document  # uncomment to display the token list


Out[72]:
["We've",
 'all',
 'heard',
 'it',
 'according',
 'to',
 'Hal',
 'Varian',
 'statistics',
 'is',
 'the',
 'next',
 'sexy',
 'job',
 '.',
 'Five',
 'years',
 'ago',
 'in',
 'What',
 'is',
 'Web',
 '2',
 '.',
 '0',
 'Tim',
 "O'Reilly",
 'said',
 'that',
 'data',
 'is',
 'the',
 'next',
 'Intel',
 'Inside',
 '.',
 'But',
 'what',
 'does',
 'that',
 'statement',
 'mean',
 'Why',
 'do',
 'we',
 'suddenly',
 'care',
 'about',
 'statistics',
 'and',
 'about',
 'data',
 'In',
 'this',
 'post',
 'I',
 'examine',
 'the',
 'many',
 'sides',
 'of',
 'data',
 'science',
 'the',
 'technologies',
 'the',
 'companies',
 'and',
 'the',
 'unique',
 'skill',
 'sets',
 '.',
 'The',
 'web',
 'is',
 'full',
 'of',
 'data',
 'driven',
 'apps',
 '.',
 'Almost',
 'any',
 'e',
 'commerce',
 'application',
 'is',
 'a',
 'data',
 'driven',
 'application',
 '.',
 "There's",
 'a',
 'database',
 'behind',
 'a',
 'web',
 'front',
 'end',
 'and',
 'middleware',
 'that',
 'talks',
 'to',
 'a',
 'number',
 'of',
 'other',
 'databases',
 'and',
 'data',
 'services',
 'credit',
 'card',
 'processing',
 'companies',
 'banks',
 'and',
 'so',
 'on',
 '.',
 'But',
 'merely',
 'using',
 'data',
 "isn't",
 'really',
 'what',
 'we',
 'mean',
 'by',
 'data',
 'science',
 '.',
 'A',
 'data',
 'application',
 'acquires',
 'its',
 'value',
 'from',
 'the',
 'data',
 'itself',
 'and',
 'creates',
 'more',
 'data',
 'as',
 'a',
 'result',
 '.',
 "It's",
 'not',
 'just',
 'an',
 'application',
 'with',
 'data',
 "it's",
 'a',
 'data',
 'product',
 '.',
 'Data',
 'science',
 'enables',
 'the',
 'creation',
 'of',
 'data',
 'products',
 '.',
 'One',
 'of',
 'the',
 'earlier',
 'data',
 'products',
 'on',
 'the',
 'Web',
 'was',
 'the',
 'CDDB',
 'database',
 '.',
 'The',
 'developers',
 'of',
 'CDDB',
 'realized',
 'that',
 'any',
 'CD',
 'had',
 'a',
 'unique',
 'signature',
 'based',
 'on',
 'the',
 'exact',
 'length',
 'in',
 'samples',
 'of',
 'each',
 'track',
 'on',
 'the',
 'CD',
 '.',
 'Gracenote',
 'built',
 'a',
 'database',
 'of',
 'track',
 'lengths',
 'and',
 'coupled',
 'it',
 'to',
 'a',
 'database',
 'of',
 'album',
 'metadata',
 'track',
 'titles',
 'artists',
 'album',
 'titles',
 '.',
 'If',
 "you've",
 'ever',
 'used',
 'iTunes',
 'to',
 'rip',
 'a',
 'CD',
 "you've",
 'taken',
 'advantage',
 'of',
 'this',
 'database',
 '.',
 'Before',
 'it',
 'does',
 'anything',
 'else',
 'iTunes',
 'reads',
 'the',
 'length',
 'of',
 'every',
 'track',
 'sends',
 'it',
 'to',
 'CDDB',
 'and',
 'gets',
 'back',
 'the',
 'track',
 'titles',
 '.',
 'If',
 'you',
 'have',
 'a',
 'CD',
 "that's",
 'not',
 'in',
 'the',
 'database',
 'including',
 'a',
 'CD',
 "you've",
 'made',
 'yourself',
 'you',
 'can',
 'create',
 'an',
 'entry',
 'for',
 'an',
 'unknown',
 'album',
 '.',
 'While',
 'this',
 'sounds',
 'simple',
 'enough',
 "it's",
 'revolutionary',
 'CDDB',
 'views',
 'music',
 'as',
 'data',
 'not',
 'as',
 'audio',
 'and',
 'creates',
 'new',
 'value',
 'in',
 'doing',
 'so',
 '.',
 'Their',
 'business',
 'is',
 'fundamentally',
 'different',
 'from',
 'selling',
 'music',
 'sharing',
 'music',
 'or',
 'analyzing',
 'musical',
 'tastes',
 'though',
 'these',
 'can',
 'also',
 'be',
 'data',
 'products',
 '.',
 'CDDB',
 'arises',
 'entirely',
 'from',
 'viewing',
 'a',
 'musical',
 'problem',
 'as',
 'a',
 'data',
 'problem',
 '.',
 'Google',
 'is',
 'a',
 'master',
 'at',
 'creating',
 'data',
 'products',
 '.',
 "Here's",
 'a',
 'few',
 'examples',
 'Google',
 "isn't",
 'the',
 'only',
 'company',
 'that',
 'knows',
 'how',
 'to',
 'use',
 'data',
 '.',
 'Facebook',
 'and',
 'LinkedIn',
 'use',
 'patterns',
 'of',
 'friendship',
 'relationships',
 'to',
 'suggest',
 'other',
 'people',
 'you',
 'may',
 'know',
 'or',
 'should',
 'know',
 'with',
 'sometimes',
 'frightening',
 'accuracy',
 '.',
 'Amazon',
 'saves',
 'your',
 'searches',
 'correlates',
 'what',
 'you',
 'search',
 'for',
 'with',
 'what',
 'other',
 'users',
 'search',
 'for',
 'and',
 'uses',
 'it',
 'to',
 'create',
 'surprisingly',
 'appropriate',
 'recommendations',
 '.',
 'These',
 'recommendations',
 'are',
 'data',
 'products',
 'that',
 'help',
 'to',
 'drive',
 "Amazon's",
 'more',
 'traditional',
 'retail',
 'business',
 '.',
 'They',
 'come',
 'about',
 'because',
 'Amazon',
 'understands',
 'that',
 'a',
 'book',
 "isn't",
 'just',
 'a',
 'book',
 'a',
 'camera',
 "isn't",
 'just',
 'a',
 'camera',
 'and',
 'a',
 'customer',
 "isn't",
 'just',
 'a',
 'customer',
 'customers',
 'generate',
 'a',
 'trail',
 'of',
 'data',
 'exhaust',
 'that',
 'can',
 'be',
 'mined',
 'and',
 'put',
 'to',
 'use',
 'and',
 'a',
 'camera',
 'is',
 'a',
 'cloud',
 'of',
 'data',
 'that',
 'can',
 'be',
 'correlated',
 'with',
 'the',
 "customers'",
 'behavior',
 'the',
 'data',
 'they',
 'leave',
 'every',
 'time',
 'they',
 'visit',
 'the',
 'site',
 '.',
 'The',
 'thread',
 'that',
 'ties',
 'most',
 'of',
 'these',
 'applications',
 'together',
 'is',
 'that',
 'data',
 'collected',
 'from',
 'users',
 'provides',
 'added',
 'value',
 '.',
 'Whether',
 'that',
 'data',
 'is',
 'search',
 'terms',
 'voice',
 'samples',
 'or',
 'product',
 'reviews',
 'the',
 'users',
 'are',
 'in',
 'a',
 'feedback',
 'loop',
 'in',
 'which',
 'they',
 'contribute',
 'to',
 'the',
 'products',
 'they',
 'use',
 '.',
 "That's",
 'the',
 'beginning',
 'of',
 'data',
 'science',
 '.',
 'In',
 'the',
 'last',
 'few',
 'years',
 'there',
 'has',
 'been',
 'an',
 'explosion',
 'in',
 'the',
 'amount',
 'of',
 'data',
 "that's",
 'available',
 '.',
 'Whether',
 "we're",
 'talking',
 'about',
 'web',
 'server',
 'logs',
 'tweet',
 'streams',
 'online',
 'transaction',
 'records',
 'citizen',
 'science',
 'data',
 'from',
 'sensors',
 'government',
 'data',
 'or',
 'some',
 'other',
 'source',
 'the',
 'problem',
 "isn't",
 'finding',
 'data',
 "it's",
 'figuring',
 'out',
 'what',
 'to',
 'do',
 'with',
 'it',
 '.',
 'And',
 "it's",
 'not',
 'just',
 'companies',
 'using',
 'their',
 'own',
 'data',
 'or',
 'the',
 'data',
 'contributed',
 'by',
 'their',
 'users',
 '.',
 "It's",
 'increasingly',
 'common',
 'to',
 'mashup',
 'data',
 'from',
 'a',
 'number',
 'of',
 'sources',
 '.',
 'Data',
 'Mashups',
 'in',
 'R',
 'analyzes',
 'mortgage',
 'foreclosures',
 'in',
 'Philadelphia',
 'County',
 'by',
 'taking',
 'a',
 'public',
 'report',
 'from',
 'the',
 'county',
 "sheriff's",
 'office',
 'extracting',
 'addresses',
 'and',
 'using',
 'Yahoo',
 'to',
 'convert',
 'the',
 'addresses',
 'to',
 'latitude',
 'and',
 'longitude',
 'then',
 'using',
 'the',
 'geographical',
 'data',
 'to',
 'place',
 'the',
 'foreclosures',
 'on',
 'a',
 'map',
 'another',
 'data',
 'source',
 'and',
 'group',
 'them',
 'by',
 'neighborhood',
 'valuation',
 'neighborhood',
 'per',
 'capita',
 'income',
 'and',
 'other',
 'socio',
 'economic',
 'factors',
 '.',
 'The',
 'question',
 'facing',
 'every',
 'company',
 'today',
 'every',
 'startup',
 'every',
 'non',
 'profit',
 'every',
 'project',
 'site',
 'that',
 'wants',
 'to',
 'attract',
 'a',
 'community',
 'is',
 'how',
 'to',
 'use',
 'data',
 'effectively',
 'not',
 'just',
 'their',
 'own',
 'data',
 'but',
 'all',
 'the',
 'data',
 "that's",
 'available',
 'and',
 'relevant',
 '.',
 'Using',
 'data',
 'effectively',
 'requires',
 'something',
 'different',
 'from',
 'traditional',
 'statistics',
 'where',
 'actuaries',
 'in',
 'business',
 'suits',
 'perform',
 'arcane',
 'but',
 'fairly',
 'well',
 'defined',
 'kinds',
 'of',
 'analysis',
 '.',
 'What',
 'differentiates',
 'data',
 'science',
 'from',
 'statistics',
 'is',
 'that',
 'data',
 'science',
 'is',
 'a',
 'holistic',
 'approach',
 '.',
 "We're",
 'increasingly',
 'finding',
 'data',
 'in',
 'the',
 'wild',
 'and',
 'data',
 'scientists',
 'are',
 'involved',
 'with',
 'gathering',
 'data',
 'massaging',
 'it',
 'into',
 'a',
 'tractable',
 'form',
 'making',
 'it',
 'tell',
 'its',
 'story',
 'and',
 'presenting',
 'that',
 'story',
 'to',
 'others',
 '.',
 'To',
 'get',
 'a',
 'sense',
 'for',
 'what',
 'skills',
 'are',
 'required',
 "let's",
 'look',
 'at',
 'the',
 'data',
 'lifecycle',
 'where',
 'it',
 'comes',
 'from',
 'how',
 'you',
 'use',
 'it',
 'and',
 'where',
 'it',
 'goes',
 '.',
 'Data',
 'is',
 'everywhere',
 'your',
 'government',
 'your',
 'web',
 'server',
 'your',
 'business',
 'partners',
 'even',
 'your',
 'body',
 '.',
 'While',
 'we',
 "aren't",
 'drowning',
 'in',
 'a',
 'sea',
 'of',
 'data',
 "we're",
 'finding',
 'that',
 'almost',
 'everything',
 'can',
 'or',
 'has',
 'been',
 'instrumented',
 '.',
 'At',
 "O'Reilly",
 'we',
 'frequently',
 'combine',
 'publishing',
 'industry',
 'data',
 'from',
 'Nielsen',
 'BookScan',
 'with',
 'our',
 'own',
 'sales',
 'data',
 'publicly',
 'available',
 'Amazon',
 'data',
 'and',
 'even',
 'job',
 'data',
 'to',
 'see',
 "what's",
 'happening',
 'in',
 'the',
 'publishing',
 'industry',
 '.',
 'Sites',
 'like',
 'Infochimps',
 'and',
 'Factual',
 'provide',
 'access',
 'to',
 'many',
 'large',
 'datasets',
 'including',
 'climate',
 'data',
 'MySpace',
 'activity',
 'streams',
 'and',
 'game',
 'logs',
 'from',
 'sporting',
 'events',
 '.',
 'Factual',
 'enlists',
 'users',
 'to',
 'update',
 'and',
 'improve',
 'its',
 'datasets',
 'which',
 'cover',
 'topics',
 'as',
 'diverse',
 'as',
 'endocrinologists',
 'to',
 'hiking',
 'trails',
 '.',
 'Much',
 'of',
 'the',
 'data',
 'we',
 'currently',
 'work',
 'with',
 'is',
 'the',
 'direct',
 'consequence',
 'of',
 'Web',
 '2',
 '.',
 '0',
 'and',
 'of',
 "Moore's",
 'Law',
 'applied',
 'to',
 'data',
 '.',
 'The',
 'web',
 'has',
 'people',
 'spending',
 'more',
 ...]

In [71]:
# Is this for getting information about words that appear one after another?
# Small zip() experiment: b holds the same four words as a, so every
# resulting tuple pairs a word with itself.
a = ["We've", 'all', 'heard', 'it']
b = list(a)
list(zip(a, b))


Out[71]:
[("We've", "We've"), ('all', 'all'), ('heard', 'heard'), ('it', 'it')]

In [76]:
# Pair every token with the token that follows it, then record, for each
# word, the list of words seen immediately after it (duplicates are kept).
successors = document[1:]
bigrams = list(zip(document, successors))

transitions = defaultdict(list)
for prev, current in bigrams:
    transitions[prev] += [current]

In [77]:
# Echo the transition table: each key maps to the list of words that
# followed it in the document.
transitions


Out[77]:
defaultdict(list,
            {'generation': ['of'],
             'initial': ['data'],
             'applied': ['to', 'to'],
             'already': ['reduced'],
             'verify': ['them'],
             'five': ['minutes'],
             'processing': ['companies', 'to', 'fails', '.', 'pipeline'],
             'throw': ['the'],
             'feedback': ['loop'],
             'they': ['leave',
              'visit',
              'contribute',
              'use',
              'go',
              "aren't",
              'had',
              'generate',
              'are',
              'decided'],
             'speak': ['coherently'],
             'thread': ['that'],
             'modern': ['web'],
             'made': ['yourself', 'the', 'recommendations', 'that'],
             'Factual': ['provide', 'enlists'],
             'alternative': ['but'],
             'creativity': ['for'],
             'presenting': ['that', 'data', 'results'],
             'versatile': ['According'],
             'community': ['is'],
             'investigate': ['the'],
             'this': ['post',
              'database',
              'sounds',
              'data',
              'scale',
              'video',
              'difference',
              'animation',
              'is',
              'versatile',
              'into',
              'but'],
             'BI': ['to'],
             'parsers': ['and', 'for'],
             'interaction': ['between'],
             'online': ['transaction', 'and', '.', '.'],
             'dozen': ['or', 'or'],
             '000': ['MB', 'postings', 'cores'],
             'eventual': ['consistency'],
             'increased': ['from', 'sophistication'],
             'capacity': ['on', 'storage', 'demands', 'continues'],
             'up': ['messy',
              'more',
              'into',
              'that',
              'with',
              'in',
              'into',
              'that',
              'with'],
             'increases': ['in', 'in'],
             'lets': ['developers', 'you'],
             'music': ['as', 'sharing', 'or', 'by'],
             'advance': ['conflicts'],
             'jiujitsu': ['using', 'identifying'],
             'costs': ['100', 'a'],
             'usually': ['impossible'],
             'dataspaces': ['.'],
             'CPU': ['speed'],
             'providing': ['preconfigured', 'one'],
             'consumer': ['equipment', 'oriented'],
             'Martin': ['Wattenberg'],
             'sporting': ['events'],
             'All': ['of'],
             'server': ['logs', 'your'],
             'identifying': ['music'],
             'give': ['us'],
             'such': ['as', 'as', '.'],
             'titles': ['artists', '.', '.'],
             'looked': ['at'],
             'terms': ['voice'],
             'dpatil': ['the'],
             'gotten': ['some'],
             'about': ['statistics',
              'data',
              'because',
              'web',
              '40000',
              'half',
              'the',
              'large',
              'big',
              'the',
              'the',
              'what',
              'testing',
              'the'],
             'designed': ['for', 'for', 'to', 'for'],
             "LinkedIn's": ['membership', 'data'],
             "couldn't": ['store'],
             'red': ['herring'],
             'natural': ['language', 'language'],
             'explosion': ['in'],
             'intermediate': ['results', 'results'],
             "week's": ['small'],
             'does': ['that', 'anything', 'that'],
             'SQL': ['like'],
             "It's": ['not',
              'increasingly',
              'reported',
              'usually',
              'easy',
              'easer',
              'an',
              'the'],
             'die': ['.'],
             'scatter': ['plots'],
             'comprehensive': ['package', 'graphics'],
             'SnapTell': ['and'],
             'appear': ['.'],
             'consumers': ['and'],
             'examine': ['the'],
             'devices': ['and'],
             'amount': ['of'],
             'willingness': ['to'],
             'has': ['been',
              'been',
              'people',
              'increased',
              'moved',
              'more',
              'indexed',
              'an',
              'proven',
              'been',
              'been',
              'just',
              'eaten',
              'become',
              'excellent'],
             'camera': ["isn't", 'and', 'is'],
             'agile': ['data', 'practices', 'flexible'],
             'law': ['as'],
             'rolled': ['back'],
             'Goggles': ['and'],
             'economic': ['factors'],
             'That': ["isn't", 'joke'],
             'neither': ['term'],
             'Disambiguation': ['is'],
             'two': ['dozen'],
             'Scripting': ['languages'],
             'visualization': ['and', 'is', 'itself'],
             'phone': ['and'],
             'complements': ['them'],
             'think': ['about', "it's", 'outside'],
             'Facebook': ['and', 'or', 'possibly', 'and'],
             'open': ['source', 'source', 'source'],
             'power': ['the'],
             'really': ['what',
              'a',
              'necessary',
              'care',
              'telling',
              'what',
              'to'],
             'programming': ['task'],
             'skills': ['are', '.', 'ranging', 'and'],
             'out': ['what',
              "what's",
              'of',
              'whether',
              'just',
              'if',
              'with',
              'incrementally',
              'how',
              'how'],
             "hasn't": ['died'],
             'language': ['processing',
              'understanding',
              'and',
              'processing',
              'called',
              'and',
              'particularly'],
             'grammatical': ['structure'],
             'queries': ['a'],
             'book': ["isn't", 'a'],
             'Display': ['of'],
             'how': ['to',
              'to',
              'you',
              'do',
              "Google's",
              'bad',
              'things',
              'you',
              'economies',
              'to',
              'to',
              'to'],
             'audio': ['and', 'all', 'stream'],
             '92': ['percent'],
             'patience': ['the'],
             'Hammerbacher': ['2', 'said', 'in', 'in'],
             'Storing': ['data'],
             'incorporate': ['recommendation'],
             'feeds': ['web'],
             'tools': ['like', 'to', 'discarded', 'like', 'for'],
             "aren't": ['drowning', 'well', 'concerned', '.'],
             'company': ['that', 'today', 'with'],
             "We've": ['all', 'all', 'all'],
             'BigTable': ['and'],
             'computing': ['cluster', '.', 'power', 'skills', 'time'],
             'able': ['to', 'to'],
             'Or': ['the'],
             'ancient': ['Unix'],
             "Yau's": ['FlowingData'],
             'Even': ['a'],
             'picture': ['with', 'may', 'is', 'the'],
             'artists': ['album', "they're"],
             'nothing': ['of'],
             'easily': ['described', 'be'],
             'connections': ['then'],
             'Stanford': ['with'],
             'early': ["'80s"],
             'addresses': ['and', 'to'],
             'Precision': ['has'],
             'faces': ['cars'],
             'then': ['using',
              'distributed',
              'combined',
              'combine',
              'use',
              'going',
              'branched',
              'looking'],
             'training': ['set', 'sets', 'data'],
             'trivially': ['simple'],
             'others': ['.', 'the'],
             '2012': ['Nissan'],
             'languages': ['or', 'such'],
             'warehouses': ['but'],
             'increasingly': ['common', 'finding'],
             'understanding': ['the', 'the', 'of', 'how'],
             'entirely': ['from'],
             'hour': ['.'],
             'It': ['would',
              'incorporates',
              'is',
              "isn't",
              'has',
              'would',
              'then',
              'started',
              'was'],
             'climate': ['data'],
             'widely': ['applicable'],
             'Since': ['the'],
             'locked': ['up'],
             'A': ['data', 'picture'],
             'from': ['the',
              'selling',
              'viewing',
              'users',
              'sensors',
              'a',
              'the',
              'traditional',
              'statistics',
              'how',
              'Nielsen',
              'sporting',
              '10',
              '1',
              'a',
              'ancient',
              'many',
              'gigabytes',
              'search',
              'Twitter',
              'the',
              'traditional',
              'machine',
              'traditional',
              'a',
              'the',
              'it',
              'initial',
              'it',
              'the',
              'it'],
             'with': ['data',
              'sometimes',
              'what',
              'the',
              'it',
              'gathering',
              'our',
              'is',
              'geolocation',
              'the',
              'all',
              'tools',
              'an',
              'badly',
              'the',
              'the',
              'human',
              'Apple',
              'the',
              'data',
              'data',
              'reality',
              'it',
              '10',
              'faster',
              'clients',
              'the',
              'a',
              'hundreds',
              'which',
              'a',
              'most',
              'more',
              'the',
              'R',
              'a',
              '.',
              'patience',
              'new',
              'very'],
             'e': ['commerce'],
             'capture': ['all'],
             'classification': ['for', 'error'],
             'an': ['application',
              'entry',
              'unknown',
              'explosion',
              'even',
              'increase',
              'HTML',
              'array',
              'easy',
              'allure',
              'extremely',
              'extremely',
              'obvious',
              'experimental',
              'hour',
              'ill',
              'important',
              'excellent',
              'excellent',
              'important',
              'essential',
              'odd',
              'end',
              'epidemic',
              'algorithm',
              'agile',
              'audio',
              'important'],
             'hard': ['scientists'],
             'contribute': ['to'],
             'of': ['data',
              'data',
              'other',
              'data',
              'the',
              'CDDB',
              'each',
              'track',
              'album',
              'this',
              'every',
              'friendship',
              'data',
              'data',
              'these',
              'data',
              'data',
              'sources',
              'analysis',
              'data',
              'the',
              'Web',
              "Moore's",
              'data',
              'them',
              'which',
              'sale',
              'your',
              'this',
              '360',
              'cores',
              'about',
              'the',
              'CPU',
              "Moore's",
              'that',
              'data',
              'any',
              'wild',
              'data',
              'tools',
              'your',
              'ozone',
              'a',
              'the',
              'subtasks',
              'the',
              'the',
              'data',
              'steam',
              'the',
              'the',
              'database',
              'multiple',
              'a',
              'operations',
              'analysis',
              'finance',
              'databases',
              'these',
              "Google's",
              'them',
              'building',
              'identical',
              'processors',
              'answers',
              'MapReduce',
              'the',
              'Linux',
              'which',
              'a',
              'huge',
              'time',
              'followers',
              'the',
              'the',
              'the',
              'students',
              'the',
              'known',
              'public',
              'a',
              'data',
              'data',
              'data',
              'numbers',
              'Quantitative',
              'the',
              'Flowing',
              'what',
              'the',
              'the',
              'my',
              'the',
              'Walmart',
              'the',
              'cancer',
              'a',
              'presenting',
              'question',
              'our',
              'the',
              'grant',
              'creating',
              'hours',
              'developer',
              'hours',
              'computing',
              'data',
              'what',
              'data',
              'the',
              'person',
              'products',
              'a',
              'data',
              'their',
              'millions',
              'travellers',
              'successful',
              'Hal'],
             'companies': ['and',
              'banks',
              'using',
              'telecommunications',
              'and',
              'that',
              'who',
              'like'],
             'have': ['a',
              'to',
              'to',
              'is',
              'no',
              'real',
              'already',
              'had',
              'built',
              'found',
              '1',
              'very',
              'established',
              'found',
              'to',
              'to',
              'humans',
              'asked',
              'a',
              'to',
              'to',
              'been',
              'all'],
             'as': ['a',
              'data',
              'audio',
              'a',
              'diverse',
              'endocrinologists',
              'applied',
              'an',
              'awk',
              'Perl',
              'storage',
              'the',
              'needed',
              'it',
              'clean',
              "you'd",
              'such',
              'scientist',
              'Dataspaces',
              'Dataspaces'],
             'easy': ['task', 'to', 'to', 'to', 'to'],
             'single': ['reduce', 'set', 'tool'],
             'statistical': ['models', 'packages', 'work'],
             'many': ['sides',
              'large',
              'of',
              'job',
              'many',
              'websites',
              'applications',
              'nodes',
              'processors',
              'large',
              'modern',
              'libraries',
              'commercial',
              'kinds',
              'packages',
              'of'],
             'start': ['thinking', 'a', 'a'],
             'incrementally': ['.', 'rather', 'the'],
             'consequence': ['of'],
             'processes': ['data'],
             'mined': ['and', '.'],
             'scientists': ['are',
              'tend',
              'particularly',
              'started',
              'combine'],
             "We're": ['increasingly', 'discussing'],
             'no': ['alternative'],
             'easier': ['to', 'to', 'to'],
             'form': ['making'],
             'looking': ['at', 'at', 'at', 'up', 'for'],
             'job': ['.',
              'data',
              'but',
              'listings',
              'postings',
              'posting',
              'listings',
              'only',
              'as'],
             'system': ['but'],
             'last': ['few'],
             'clusters': ['.', 'that'],
             'around': ['times', 'faces', 'data'],
             'scraping': ["hasn't"],
             'reads': ['the'],
             'piece': ['of'],
             'In': ['this',
              'the',
              'data',
              'the',
              'hindsight',
              'software',
              'addition',
              'addition'],
             'large': ['datasets',
              'snakes',
              'number',
              'problem',
              'computing',
              'searches',
              'data',
              'collection',
              'job',
              'problems',
              'difficult'],
             'starts': ['by'],
             'Unix': ['utilities'],
             'events': ['.', 'that'],
             'key': ['Hadoop', 'component', 'to'],
             'wild': ['and', 'data'],
             'Computing': ['a'],
             'stressed': ['traditional'],
             'essentially': ['a'],
             'testing': ['.', 'hypotheses'],
             'supermarket': ['is'],
             'recommendations': ['.', 'are', 'accordingly'],
             'industry': ['data', '.', '.', 'is'],
             'dissimilar': ['products'],
             'everyone': ['who'],
             'including': ['a', 'climate', 'the'],
             'combine': ['publishing', 'the', 'entrepreneurship'],
             'scale': ['.'],
             'batch': ['system'],
             'survival': ['depends'],
             'definition': ["I've"],
             'reporting': ['.'],
             'identity': ['using'],
             'Although': ['R'],
             'exposes': ['their'],
             'interesting': ["It's", '.', 'products'],
             'Quantitative': ['Information'],
             'talks': ['to'],
             'processors': ['the', 'and', 'as'],
             'voice': ['samples'],
             'hand': ['.'],
             'found': ['it', 'a'],
             'animations': ['that'],
             'cluster': ['.'],
             'Mason': ['hmason', 'says', 'came'],
             'stop': ['being', 'information', 'shopping'],
             'fill': ['the'],
             'sensors': ['government'],
             'beyond': ['the', 'a'],
             'Intel': ['Inside', 'Inside'],
             'common': ['to'],
             'decades': ['.'],
             'extend': ['R'],
             'provides': ['added', 'commercial', 'an'],
             'schemas': ['evolve'],
             'back': ['the', 'if', 'to', 'to'],
             'saying': ['you'],
             'statement': ['mean'],
             'that': ['data',
              'statement',
              'talks',
              'any',
              'knows',
              'help',
              'a',
              'can',
              'can',
              'ties',
              'data',
              'data',
              'wants',
              'data',
              'story',
              'almost',
              'data',
              'data',
              'are',
              'was',
              'something',
              'the',
              'the',
              'were',
              'sounds',
              'problem',
              'are',
              'have',
              'can',
              'allure',
              'MapReduce',
              'they',
              'can',
              "you'd",
              'enables',
              "person's",
              'eating',
              'one',
              'data',
              'the',
              'provides',
              'they',
              'when',
              'show',
              'give',
              'would',
              'looked',
              'members',
              'analyzed',
              'built',
              'appears',
              'gave',
              'signature',
              'are',
              'the',
              'find',
              'bit',
              'the',
              'people',
              'nobody',
              'in',
              'the',
              'data'],
             'hackingdata': ["we're"],
             'tackle': ['all'],
             'worth': ['a', 'a'],
             'contributed': ['by'],
             'obvious': ['solution', 'is'],
             'automated': ['data'],
             'conversions': ['than'],
             'Hilary': ['Mason', 'Mason', 'Mason'],
             'making': ['it', 'data', 'guesses', 'sure', 'a', 'connections'],
             'There': ['are', 'are', 'was'],
             'importance': ['of'],
             'components': ['.'],
             'socio': ['economic'],
             "Google's": ['BigTable', 'biggest', 'ad'],
             'my': ['favorites'],
             'structure': ['of'],
             'expand': ["today's"],
             'visit': ['the'],
             'impossible': ['to'],
             'box': ['to'],
             'come': ['about', 'in', 'from', 'up'],
             'HOP': ['is'],
             'transactions': ['not', 'that'],
             'neighborhood': ['valuation', 'per'],
             'direct': ['consequence'],
             'something': ['different', 'is', 'with'],
             'geolocation': ['or', 'skills'],
             'Mike': ['Driscoll'],
             'reviews': ['the'],
             'runs': ['the'],
             'Online': ['Prototype'],
             'because': ['Amazon', 'automated', 'everyone'],
             'unique': ['skill', 'signature'],
             'Most': ['of', 'data'],
             'now': ['terabyte', 'expect', 'ask'],
             'develop': ['and', 'training'],
             'several': ["it's"],
             'one': ['of',
              'stop',
              'of',
              'of',
              'advertisement',
              'stop',
              'place',
              'in'],
             'suits': ['perform'],
             'surf': ['the'],
             '1982': ['weighing'],
             'look': ['at', 'at', 'at', 'up', 'for', 'like'],
             'Nathan': ["Yau's"],
             'entering': ['the'],
             'precompute': ['much'],
             'quirky': ['language'],
             'nobody': ['remembers'],
             'reliability': ['requirements'],
             'thinking': ['about'],
             'differently': ['it'],
             'strong': ['mathematical'],
             'uses': ['it'],
             'heart': ['of'],
             'hampered': ['by'],
             'pounds': ['now'],
             'NoSQL': ['databases'],
             "you'd": ['otherwise', 'like'],
             'non': ['profit'],
             'shared': ['experience'],
             'suggest': ['other'],
             'Ben': ["Fry's"],
             "Turk's": ['marketplace'],
             'role': ['in', 'in'],
             'diverse': ['as'],
             'frequently': ['combine', 'missing', 'all', '.', 'called', 'the'],
             "Moore's": ['Law', 'Law', 'law'],
             'existence': ['of'],
             'sure': ['that'],
             'at': ['creating',
              'the',
              'roughly',
              'bits',
              'hand',
              "O'Reilly",
              'job',
              'this',
              'Cloudera',
              'bit',
              'many',
              'Stanford',
              'a',
              'what',
              'Facebook',
              'a',
              'LinkedIn',
              'LinkedIn',
              "members'",
              'profiles',
              'events',
              'books',
              'once',
              'bit'],
             "That's": ['the', 'the', 'where', 'not', 'an'],
             'Visual': ['Display'],
             'badly': ['behaved'],
             'entry': ['for'],
             'more': ['data',
              'traditional',
              'time',
              'than',
              'storage',
              'data',
              'data',
              'interesting',
              'and',
              'frequently',
              'conversions',
              'scatter',
              'detailed',
              'tractable'],
             'since': ['many'],
             'significant': ['body', 'or'],
             'running': ['Linux'],
             'At': ["O'Reilly", 'some', "IBM's"],
             'earlier': ['data'],
             'perform': ['arcane', 'computations', 'a'],
             'hmason': ['data'],
             'album': ['metadata', 'titles', '.'],
             'associated': ['with'],
             'layer': ['depletion'],
             'computations': ['on', 'make'],
             'Increased': ['storage'],
             'mountain': ['of'],
             'holistic': ['approach'],
             'price': ['reduction'],
             'any': ['e', 'CD', 'data', 'one', 'data', 'given'],
             'Elastic': ['MapReduce', 'MapReduce'],
             'sounds': ['simple', 'like'],
             'reports': ['on'],
             'expands': ['to'],
             'conquer': ['strategy'],
             'find': ['to', 'out', 'the', 'those', 'out', 'new'],
             'across': ['a', 'many', 'an', 'many', 'thousands', "LinkedIn's"],
             'exact': ['length'],
             'public': ['report', 'website', 'use', 'photos'],
             'database': ['behind',
              '.',
              'of',
              'of',
              '.',
              'including',
              'for',
              'or',
              'model',
              'systems',
              'servers',
              'Hive',
              'but',
              'is'],
             'majors': ['.'],
             'possible': ['to', '.', 'to', 'to'],
             'outside': ['of', 'the'],
             '70s': ['were'],
             'great': ['place', 'example'],
             'point': ['traditional', "it's"],
             'next': ['sexy', 'Intel', "week's", 'generation', 'decades'],
             'extract': ['value'],
             'join': ['the'],
             'but': ['all',
              'fairly',
              'there',
              'to',
              'tools',
              'big',
              'different',
              'in',
              'not',
              'Hadoop',
              'a',
              'how',
              'it',
              'we',
              'newer',
              'also'],
             'County': ['by'],
             'update': ['and'],
             'originated': ['with'],
             'sharing': ['music'],
             'developers': ['of', 'have', 'explore', 'and'],
             'added': ['value', 'value'],
             'could': ['even', 'author'],
             'storage': ['capacity', 'has', 'is', 'capacity', 'capacity'],
             'development': ['agile', 'project'],
             "today's": ['big'],
             'story': ['and', 'to', 'which', "isn't", '.', 'the'],
             'turn': ['around', 'this'],
             'graphics': ['facilities', 'package'],
             'platforms': ['or', 'are', 'have'],
             'inexpensively': ['possibly'],
             'maps': ['from'],
             "you'll": ['get'],
             'scientist': ['at', '.', '.', 'at', 'was', 'at'],
             'might': ['not', 'mean', 'be', 'be', 'like'],
             'new': ['value',
              'breed',
              'data',
              'insights',
              'products',
              'ways',
              'Intel'],
             'marketplace': ['for'],
             'loop': ['in'],
             'extensions': ['extend'],
             'there': ['has', 'was', "isn't", 'are', 'are', 'are'],
             'biggest': ['problem'],
             'correlated': ['with'],
             'distributing': ['an'],
             'multistage': ['processing'],
             '25': ['GB'],
             'messy': ['.', 'HTML', 'and'],
             'iterate': ['over'],
             'retail': ['business', 'transactions', 'chain'],
             'finding': ['data', 'data', 'that'],
             'Tim': ["O'Reilly"],
             "customers'": ['behavior'],
             'fairly': ['well', 'comprehensive'],
             'medium': ['and'],
             'which': ['they',
              'cover',
              'can',
              'may',
              'originated',
              'you',
              'is',
              'are',
              'provides',
              'there',
              'lets',
              'exposes',
              'to',
              'survival'],
             'Data': ['science',
              'Mashups',
              'is',
              'expands',
              'Mashups',
              'conditioning',
              'is',
              'is',
              'science',
              'science',
              'scientists',
              'is',
              '3'],
             'disciplines': ['it'],
             'centric': ['industries'],
             'length': ['in', 'of', 'and'],
             'called': ['NoSQL', 'Pig', 'a'],
             'entrepreneurs': ['.'],
             'pyrotechnics': ['.'],
             'Walmart': ['over'],
             'difficult': ['and', 'problem', 'problem'],
             'toward': ['its'],
             'asking': ['whether', 'the'],
             'asked': ['a'],
             'revolutionary': ['CDDB'],
             'working': ['with', 'with'],
             "shopper's": ['cards'],
             'per': ['capita', 'gram', 'dollar'],
             'when': ['the', 'she', 'the', 'you'],
             'pickles': ['causes', '.'],
             'post': ['I'],
             'Nutshell': ['generated'],
             'analyzing': ['musical', 'an'],
             'de': ['allocate', 'facto'],
             'correlations': ['across'],
             'analyzed': ['.', 'the', 'a'],
             "haven't": ['stressed'],
             "isn't": ['really',
              'the',
              'just',
              'just',
              'just',
              'finding',
              'just',
              'going',
              '.',
              'always',
              'just',
              'superseded',
              'just',
              'as',
              'what'],
             'doing': ['so'],
             'particularly': ['Elastic', 'to', 'if', 'physicists'],
             'days': ['.'],
             'old': ['style'],
             'going': ['to', 'to', 'to', 'back', 'to'],
             'mining': ['your'],
             "What's": ['less'],
             'English': ['and', '.'],
             'Patil': ['chief', 'described', 'calls'],
             'right': ['questions'],
             'smaller': ['problems', 'auxiliary'],
             'Java': ['and'],
             'wrong': ['with'],
             'biology': ['building'],
             'So': ['how'],
             'game': ['logs'],
             'source': ['the', 'and', 'a', 'implementation', 'R'],
             'died': ['and'],
             'delivers': ['intermediate'],
             'needs': ['to'],
             'hiking': ['trails'],
             'required': ["let's"],
             'finish': ['for'],
             'needed': ['paying'],
             'dollars': ['.'],
             'do': ['we',
              'with',
              'we',
              'the',
              'the',
              'data',
              'you',
              'you',
              'it',
              'the',
              'with',
              'something',
              'you',
              'massive',
              'know'],
             "There's": ['a'],
             'Roger': ['Magoulas'],
             'step': ['of', 'in'],
             'the': ['next',
              'next',
              'many',
              'technologies',
              'companies',
              'unique',
              'data',
              'creation',
              'earlier',
              'Web',
              'CDDB',
              'exact',
              'CD',
              'length',
              'track',
              'database',
              'only',
              "customers'",
              'data',
              'site',
              'users',
              'products',
              'beginning',
              'last',
              'amount',
              'problem',
              'data',
              'county',
              'addresses',
              'geographical',
              'foreclosures',
              'data',
              'wild',
              'data',
              'publishing',
              'data',
              'direct',
              'ones',
              'early',
              'reduction',
              'first',
              'increase',
              'space',
              'more',
              'web',
              'analysis',
              'foundation',
              'metadata',
              'Philadelphia',
              'HTML',
              'dirty',
              'job',
              'data',
              'quality',
              'missing',
              'incongruous',
              'discovery',
              'data',
              'problem',
              'data',
              'problem',
              'data',
              'trick',
              'growing',
              'grammatical',
              'English',
              'Cassandra',
              'Python',
              'problem',
              'Natural',
              'classification',
              'set',
              'word',
              'size',
              'data',
              'problem',
              'data',
              'most',
              'understanding',
              'data',
              'organizations',
              'relational',
              'data',
              'kind',
              'difference',
              'logical',
              'MapReduce',
              'map',
              'intermediate',
              'results',
              'Hadoop',
              "world's",
              'key',
              'time',
              'key',
              'performance',
              'HBase',
              'right',
              'number',
              'calculation',
              'experiments',
              'data',
              'most',
              'OpenCV',
              'toolbox',
              'application',
              'grammar',
              'joke',
              'point',
              'existence',
              'conclusions',
              'data',
              'open',
              'numbers',
              'stories',
              'classic',
              'data',
              'first',
              'data',
              'state',
              'art',
              'visualizations',
              'growth',
              'aesthetics',
              'visualization',
              'spread',
              'spread',
              'data',
              'tools',
              'data',
              'kind',
              'data',
              'first',
              'results',
              'organization',
              'people',
              'best',
              'most',
              'data',
              'big',
              'big',
              'data',
              'story',
              'data',
              'process',
              'group',
              'process',
              'Cornell',
              'heart',
              'CDDB',
              'same',
              'puzzle',
              'era',
              'winners',
              'people',
              'companies',
              'same',
              'data',
              'nascent',
              '2012',
              'willingness',
              'ability',
              'ability',
              'box',
              'problem',
              'companies',
              'core',
              'vanguard',
              'shared',
              'URLs',
              'next',
              'next',
              'new',
              'low',
              '70s'],
             'some': ['other', 'point', 'hints', 'data', 'creativity'],
             'level': ['.', 'dataflow'],
             'sea': ['of'],
             'reality': ['of'],
             'define': ['a'],
             'work': ['with',
              '.',
              'with',
              'without',
              'if',
              '.',
              '.',
              'R',
              "That's",
              'with'],
             'whether': ['sales', "you're", 'this'],
             'directly': ['machine', 'is'],
             'cloud': ['of'],
             'and': ['about',
              'the',
              'middleware',
              'data',
              'so',
              'creates',
              'coupled',
              'gets',
              'creates',
              'LinkedIn',
              'uses',
              'a',
              'put',
              'a',
              'using',
              'longitude',
              'group',
              'other',
              'relevant',
              'data',
              'presenting',
              'where',
              'even',
              'Factual',
              'game',
              'improve',
              'of',
              'leaving',
              'frequent',
              "that's",
              'number',
              'increase',
              'a',
              'analyzed',
              'use',
              'other',
              "isn't",
              'other',
              'be',
              'machine',
              'Python',
              'you',
              'more',
              "you'll",
              'want',
              'other',
              'next',
              'are',
              'understanding',
              'reporting',
              'their',
              'replication',
              'slow',
              '5',
              "Amazon's",
              'are',
              'to',
              'enormous',
              'conquer',
              'then',
              'de',
              'reliability',
              'other',
              'consumers',
              'testing',
              'particularly',
              'different',
              "it's",
              'delivers',
              'mobile',
              'building',
              'SnapTell',
              'even',
              'look',
              'Mahout',
              'tune',
              'making',
              'other',
              'its',
              'quirky',
              'newer',
              'a',
              'presenting',
              'Ben',
              'if',
              'the',
              'implement',
              'come',
              'made',
              'added',
              'then',
              'the',
              'find',
              'artists',
              'the',
              'data',
              'use',
              'LinkedIn',
              'made'],
             'product': ['.', 'reviews', 'cycles', 'or', 'that'],
             'sites': ['like'],
             'else': ['iTunes'],
             'until': ['after'],
             'tune': ['the'],
             'relationships': ['to'],
             'Oil': ['companies'],
             'knows': ['how'],
             'conclusions': ["you're", '.'],
             'Language': ['Toolkit'],
             'author': ['a'],
             'similar': ['to'],
             'its': ['value',
              'story',
              'datasets',
              'own',
              'EC2',
              'comprehensive',
              'story',
              'story',
              'goal'],
             'own': ['data', 'data', 'sales', 'story'],
             'human': ['language', 'intelligence'],
             'Dynamo': ['and'],
             'perhaps': ['a'],
             'Casey': ["Reas'"],
             'never': ['an', 'conceived'],
             'sexy': ['job'],
             'winners': ['will'],
             'practicing': ['data'],
             'array': ['of'],
             'partners': ['even'],
             'tapped': ['into'],
             'OpenCV': ['library'],
             'trails': ['.'],
             'iteratively': ['.'],
             'concerned': ['about'],
             "that's": ['not',
              'available',
              'available',
              'where',
              'directly',
              'generated',
              'going',
              'different',
              'not',
              'going'],
             'services': ['credit', 'microformats', 'like'],
             'performance': ['and'],
             'accordingly': ['.'],
             'yet': ['know'],
             'CD': ['had', '.', "you've", "that's", "you've"],
             'support': ['complex', '.'],
             'almost': ['everything', 'all', 'always'],
             'regression': ['analysis'],
             'Prediction': ['API'],
             'studying': ['the'],
             'were': ['too', 'insufficient', 'the', 'real'],
             'generating': ['data', 'and'],
             'figure': ['out', 'out', 'out'],
             'problem': ['as',
              '.',
              "isn't",
              'involves',
              '.',
              'is',
              '.',
              '.',
              'across',
              'creating',
              '.',
              'with',
              '.',
              'that',
              'though',
              'that',
              'from',
              'or'],
             "Tufte's": ['Visual'],
             'where': ['actuaries',
              'it',
              'it',
              "Moore's",
              "it's",
              'services',
              'art'],
             'Web': ['2', 'was', '2'],
             'state': ['where', 'of'],
             'For': ['example', 'computer'],
             'Wattenberg': ['wattenberg'],
             'accept': ['all'],
             'And': ["it's", 'that', 'as', 'this'],
             'cores': ['.', 'running'],
             'plus': ['thousands'],
             'gigabyte': ['disk'],
             'We': ['are', 'now', "don't"],
             'office': ['extracting', '.'],
             'are': ['data',
              'in',
              'involved',
              'required',
              'annotated',
              'consumer',
              'seeing',
              'easier',
              'extremely',
              'essential',
              'easily',
              'we',
              'similar',
              'designed',
              'designed',
              'increasing',
              'frequently',
              'the',
              'designed',
              'two',
              'then',
              'then',
              'several',
              'associated',
              'many',
              'valid',
              'many',
              'really',
              'many',
              'full',
              'you',
              'built',
              'but',
              'inherently',
              'following'],
             'spreadsheet': ['.'],
             'entrepreneurship': ['with'],
             'missing': ['or', 'do', 'points'],
             'easer': ['to'],
             'thousands': ['of', 'of', 'of'],
             'increasing': ['faster'],
             'latitude': ['and'],
             'themselves': ['Storing'],
             'style': ['screen'],
             'get': ['a', '.', 'better', 'a', 'presentable', 'a'],
             'figuring': ['out'],
             'taken': ['advantage'],
             'superseded': ['by'],
             'Learning': ['course'],
             'carefully': ['collected'],
             'kept': ['pace'],
             'team': ['member'],
             'money': ['generating'],
             'understands': ['that'],
             'growing': ['Apple'],
             'But': ['what',
              'merely',
              "we've",
              'old',
              'Hadoop',
              'it',
              "that's",
              'the',
              'the'],
             "You're": ['likely'],
             'wants': ['to'],
             'organizations': ['that'],
             'approach': ['.', 'which'],
             'coupled': ['it', 'to'],
             'hugely': ['important'],
             'brought': ['it'],
             'a': ['data',
              'database',
              'web',
              'number',
              'result',
              'data',
              'unique',
              'database',
              'database',
              'CD',
              'CD',
              'CD',
              'musical',
              'data',
              'master',
              'few',
              'book',
              'book',
              'camera',
              'camera',
              'customer',
              'customer',
              'trail',
              'camera',
              'cloud',
              'feedback',
              'number',
              'public',
              'map',
              'community',
              'holistic',
              'tractable',
              'sense',
              'sea',
              'trail',
              'price',
              '32',
              'gram',
              'purchase',
              'state',
              'public',
              'spreadsheet',
              'standard',
              'database',
              'simple',
              'job',
              'sense',
              'large',
              'lot',
              'red',
              'long',
              'horde',
              'schema',
              'complex',
              'new',
              'few',
              'data',
              'divide',
              'programming',
              'number',
              'single',
              'search',
              'single',
              'home',
              'simple',
              'data',
              'distributed',
              'high',
              'one',
              'calculation',
              'batch',
              'trending',
              'recommendation',
              'quintessential',
              'cell',
              'RESTful',
              'de',
              'training',
              'significant',
              'large',
              'few',
              'cost',
              'few',
              'relatively',
              'few',
              'Nutshell',
              'random',
              'role',
              'basic',
              'background',
              'single',
              'thousand',
              'picture',
              'thousand',
              'set',
              'graph',
              'foundational',
              'new',
              'dozen',
              'sense',
              'fairly',
              'great',
              'body',
              'flu',
              'population',
              'matter',
              'successful',
              'question',
              'few',
              'consumer',
              'team',
              'multistage',
              'hypothesis',
              'regression',
              'strong',
              'discipline',
              'lot',
              'high',
              'relatively',
              'valuable',
              'huge',
              'huge',
              'large',
              'great',
              'very',
              'much',
              'signature',
              'database',
              'data',
              'company',
              'solution',
              'problem',
              'lot',
              'hugely'],
             'Non': ['Relational'],
             'service': ['in'],
             'mathematicians': ['programmers'],
             "you're": ['going', 'looking', 'asking', 'asking', 'drawing'],
             'drives': ['in', 'are'],
             'mathematical': ['background'],
             'face': ['detection'],
             'every': ['track',
              'time',
              'company',
              'startup',
              'non',
              'project',
              'level'],
             'publicly': ['available'],
             'art': ['particularly', 'comes', '.'],
             'said': ['that', 'on'],
             '250': ['pounds'],
             'big': ['data', 'is', 'is', 'data', 'picture', 'problem'],
             'apps': ['.'],
             'founder': ['of'],
             'basic': ['skill'],
             '32': ['GB'],
             'report': ['from', 'only'],
             'nodes': ['to'],
             'differentiates': ['data'],
             'arcane': ['but'],
             'Apple': ['job', 'from', 'industry', 'you', 'paying'],
             'microSD': ['card'],
             'machines': ['by'],
             'Infochimps': ['and'],
             'unknown': ['album'],
             'useless': ['if'],
             'broadly': ['defined'],
             'millisecond': ['accuracy'],
             'Alumni': ['group'],
             'file': ['that'],
             'who': ['runs', 'dies', 'figure'],
             'points': ['That', 'at'],
             "it's": ['a',
              'revolutionary',
              'figuring',
              'not',
              'usable',
              'not',
              'the',
              'possible',
              'possible',
              'easy',
              'about',
              'telling',
              'mining'],
             'ties': ['most'],
             'on': ['.',
              'the',
              'the',
              'the',
              'a',
              'every',
              'Facebook',
              'a',
              'long',
              'sites',
              'trending',
              'Twitter',
              'any',
              'getting',
              'track',
              'data'],
             'divide': ['and'],
             'identical': ['subtasks'],
             'foundation': ['of'],
             'enables': ['the', 'stream', 'features'],
             'systems': ['stop'],
             'crucial': ['to', 'to', 'to'],
             'streams': ['online', 'and'],
             'Linux': ['brought', 'machines'],
             'over': ['time', 'time', 'data', 'a'],
             'leave': ['every', 'an', 'behind'],
             'heard': ['it', 'a', 'big', 'the'],
             'numbers': ['.', '.', 'mean'],
             'same': ['result', 'conclusion'],
             'schema': ['in', '.'],
             'mashup': ['data'],
             'You': ['can', "don't", 'have', 'need'],
             'facto': ['standard'],
             'actuaries': ['in'],
             'allocate': ['and', 'processors'],
             "doesn't": ['work'],
             'plays': ['an', 'a'],
             'oriented': ['web'],
             'Describing': ['the'],
             'vanguard': ['but'],
             'solution': ['to', 'for', '.'],
             'ways': ['to'],
             'mortgage': ['foreclosures'],
             'counting': ['increases'],
             'When': ['natural', "you've"],
             'answers': ['.'],
             'Visualization': ['is', 'is'],
             "we're": ['talking',
              'finding',
              'trying',
              'discussing',
              'entering'],
             'Andrew': ["Ng's"],
             'MySpace': ['activity'],
             'MB': ['to'],
             'would': ['be', 'be', 'have', 'take', 'start'],
             'selling': ['music'],
             '.': ['Five',
              '0',
              'But',
              'The',
              'Almost',
              "There's",
              'But',
              'A',
              "It's",
              'Data',
              'One',
              'The',
              'Gracenote',
              'If',
              'Before',
              'If',
              'While',
              'Their',
              'CDDB',
              'Google',
              "Here's",
              'Facebook',
              'Amazon',
              'These',
              'They',
              'The',
              'Whether',
              "That's",
              'In',
              'Whether',
              'And',
              "It's",
              'Data',
              'The',
              'Using',
              'What',
              "We're",
              'To',
              'Data',
              'While',
              'At',
              'Sites',
              'Factual',
              'Much',
              '0',
              'The',
              'Mobile',
              'Point',
              'All',
              'Since',
              '6',
              'But',
              'RAM',
              'Hitachi',
              'Whether',
              'The',
              'Data',
              'The',
              'The',
              'Increased',
              "That's",
              'So',
              'We',
              'But',
              'Many',
              'They',
              'The',
              'This',
              'If',
              'Data',
              "You're",
              'It',
              'To',
              'Scripting',
              'Once',
              'Data',
              'If',
              'If',
              'In',
              "It's",
              'If',
              'Roger',
              'While',
              'To',
              'And',
              'Try',
              'Google',
              'Disambiguation',
              'When',
              "That's",
              'If',
              'For',
              '01',
              'If',
              '01',
              "We've",
              'Oil',
              'And',
              'The',
              "We're",
              'At',
              'What',
              'Information',
              'They',
              'They',
              'Most',
              'Traditional',
              'Managing',
              'The',
              'Relational',
              'While',
              'Do',
              'Most',
              '92',
              '93',
              'To',
              'These',
              'They',
              'Many',
              'While',
              'Data',
              'Google',
              'In',
              'In',
              "It's",
              "What's",
              'The',
              "Yahoo's",
              'Many',
              "Amazon's",
              'You',
              'Hadoop',
              'It',
              'If',
              'Hadoop',
              'In',
              'Traditional',
              'If',
              'But',
              'Faster',
              "It's",
              'Hadoop',
              'Hadoop',
              'Near',
              'These',
              'As',
              'According',
              'ly',
              'Machine',
              'We',
              'You',
              'Andrew',
              'There',
              'Google',
              'For',
              'Mechanical',
              'Machine',
              'The',
              'Once',
              "It's",
              'Even',
              'While',
              'According',
              'It',
              "We've",
              'That',
              'More',
              'But',
              'Data',
              'Statistics',
              'Statistics',
              'It',
              'While',
              'Although',
              'It',
              'If',
              'A',
              'The',
              'To',
              'Edward',
              'But',
              'Visualization',
              'According',
              'Visualization',
              'Hilary',
              'Once',
              'There',
              'GnuPlot',
              'At',
              'Nathan',
              'One',
              'And',
              'Does',
              'Does',
              'There',
              "It's",
              'Data',
              'Describing',
              'Physicists',
              'They',
              'When',
              'You',
              'You',
              'Scientists',
              'Patil',
              'It',
              'But',
              'Asking',
              'It',
              'In',
              'Then',
              'The',
              'It',
              'It',
              'This',
              'CDDB',
              'But',
              'Computing',
              'Entrepreneurship',
              "Patil's",
              "That's",
              'We',
              'Hilary',
              'Her',
              'ly',
              'ly',
              'No',
              'In',
              'Data',
              'They',
              'They',
              'They',
              'Google',
              'They',
              'ly',
              'Whether',
              'The',
              'Data',
              '1',
              'Whether',
              '2'],
             'Relational': ['databases', 'databases'],
             'value': ['from', 'in', '.', 'iteratively', 'from'],
             'local': ['supermarket'],
             'readings': ['that'],
             'cover': ['topics'],
             'itself': ['and', 'becomes', 'but'],
             'otherwise': ['have'],
             'vision': ['the'],
             'between': ['5', 'developers'],
             'well': ['defined', 'behaved', 'you'],
             'model': ['.'],
             'cancer': ['throughout'],
             'everything': ['can', 'from'],
             '1': ['000', '.', '010', '012', 'The'],
             'applicable': ['to'],
             'lifecycle': ['where'],
             'exploring': ['and'],
             'attended': ['.'],
             'creation': ['of'],
             'organization': ['3'],
             'incorporates': ['HDFS', 'a'],
             'ly': ["it's", 'is', 'is', 'are'],
             'behind': ['a', 'whenever', 'Google'],
             'according': ['to'],
             'throughout': ['a'],
             'defined': ['kinds', 'problems'],
             'citizen': ['science'],
             'calls': ['data'],
             'joke': ['that', "doesn't"],
             'creates': ['more', 'new'],
             'build': ['information',
              'clusters',
              'interesting',
              'the',
              'data'],
             'dataspora': ['statistics'],
             'minutes': ['or'],
             'kind': ['of', 'of', 'of'],
             'insights': ['into'],
             'unsolvable': ['see'],
             'always': ['possible', 'requires'],
             'telecommunications': ['companies'],
             'frequent': ["shopper's"],
             'Elefant': ['Weka'],
             'branched': ['out'],
             'No': ['one'],
             'newer': ['technologies',
              'techniques',
              'extensions',
              'companies'],
             '0': ['Tim', 'and', '.', '.'],
             'aesthetics': ['of'],
             'he': ['put'],
             'massaging': ['it'],
             'nicely': ['in'],
             'she': ['gets', 'starts'],
             'distributed': ['across', 'across', 'filesystem', 'computing'],
             'Processing': ['is'],
             'What': ['is', 'differentiates', 'are'],
             'different': ['from',
              'from',
              'forms',
              'According',
              '.',
              'assumptions',
              'datasets',
              'algorithms'],
             'part': ['of', 'of', 'of', 'of'],
             'assumptions': ['different'],
             'become': ['a'],
             'processor': ['speed'],
             'analyses': ['to'],
             'too': ['low'],
             'Northern': ['Europe'],
             '2015': ["they're"],
             'databases': ['and', 'are', 'appear', 'or', 'though', 'are'],
             'guesses': ['about'],
             'consume': ['Atom'],
             'pursue': ['intriguing'],
             'low': ['1', 'values'],
             'science': ['the',
              '.',
              'enables',
              '.',
              'data',
              'from',
              'is',
              '.',
              'what',
              'at',
              '.',
              "isn't",
              'it',
              '.',
              'requires',
              'to',
              'group',
              'group',
              'majors'],
             'involves': ['human', 'making'],
             'economies': ['work'],
             'property': ['Jeff'],
             'subtasks': ['that', 'which'],
             'increase': ['of', 'in', 'of'],
             'example': ['if', 'of', '.'],
             'Soup': ['natural'],
             'APIs': ['and'],
             'sends': ['it'],
             'creatively': ['to'],
             'features': ['like', 'only'],
             'routinely': ['.'],
             'conflicts': ['with'],
             'close': ['to'],
             'logs': ['tweet', 'from'],
             'though': ['these', 'neither', '.', 'not'],
             'use': ['data',
              'patterns',
              'and',
              '.',
              'data',
              'it',
              'of',
              'anything',
              'Mechanical',
              'them',
              'one',
              'via',
              'data'],
             'creating': ['data', 'large', 'the'],
             'computer': ['science',
              'vision',
              'science',
              'science',
              'science'],
             'willing': ['to'],
             'clean': ['as'],
             'advertisement': ['for'],
             'just': ['an',
              'a',
              'a',
              'a',
              'companies',
              'their',
              'the',
              'geek',
              'announced',
              'a',
              'about',
              'how',
              'the',
              'a',
              'spent',
              'throw'],
             'tastes': ['though'],
             '3': ['.', 'Where', 'Information'],
             'terabyte': ['drives'],
             'problems': ['ranging',
              '.',
              'ranging',
              'up',
              '.',
              'to',
              "here's"],
             'search': ['for', 'for', 'terms', 'across', 'to'],
             'fluctuation': ['.'],
             'simpler': ['.'],
             'improve': ['its'],
             'merely': ['using'],
             'cents': ['each'],
             'allure': ['but', 'is'],
             'requires': ['something', 'a', 'skills'],
             'near': ['real'],
             'signature': ['based', 'based', 'in'],
             'anomalous': ['data'],
             'random': ['fluctuation'],
             'servers': ['is'],
             'generated': ['automatically', 'by', '2'],
             'Near': ['real'],
             'exhaust': ['that', 'you'],
             '2': ['.', '.', 'hackingdata', 'percent', 'Information'],
             'whenever': ['you'],
             'lot': ['about', 'of', 'of'],
             'today': ['every'],
             'anyone': ['practicing'],
             'richer': ['data'],
             'multiple': ['unstructured'],
             'hints': ['at'],
             'dirty': ['work'],
             'photos': ['available', 'from'],
             'goes': ['.', 'far'],
             'hire': ['a'],
             'without': ['investing'],
             'coherently': ['.'],
             'enough': ["it's"],
             'Python': ['are', 'language', 'Elefant', 'design'],
             'tractable': ['form', 'problem'],
             'enabling': ['agile'],
             'The': ['web',
              'developers',
              'thread',
              'question',
              'web',
              'importance',
              'more',
              'data',
              'first',
              'foreclosure',
              'most',
              'need',
              'most',
              'Turk',
              'problem',
              'result',
              'future',
              'part',
              'ability',
              'NASA'],
             'involved': ['with'],
             'means': ['.'],
             'sources': ['.', 'of', 'all', 'in', 'to', 'and'],
             'flu': ['virus'],
             'us': ['here', 'new'],
             'forms': ['.'],
             'Cloudera': ['which'],
             'ignore': ['the', 'anomalous'],
             'Whether': ['that', "we're", 'you', "it's", 'humans'],
             'advised': ['mobile'],
             'quite': ['differently'],
             'learning': ['libraries',
              '.',
              'is',
              'PyBrain',
              'algorithms',
              'almost',
              'and'],
             'Office': ['2015'],
             'formats': ['that', "that's", 'including'],
             'may': ['know', 'be', 'not', 'or', 'not'],
             'comparative': ['if'],
             'Traditional': ['relational', 'data'],
             'dies': ['has'],
             'This': ['data', 'is'],
             'libraries': ['.', 'available', '.'],
             'view': ['the'],
             'epidemic': ['and'],
             'bad': ['your'],
             'time': ['they',
              'online',
              '.',
              'you',
              '.',
              '.',
              'data',
              'reports',
              'MapReduce',
              '.',
              '.',
              'plus',
              'to'],
             'even': ['your',
              'job',
              'richer',
              'getting',
              'days',
              'an',
              'face',
              'have'],
             'relatively': ['large', 'small'],
             'death': ['because'],
             'full': ['of', 'fledged'],
             'recommended': ['.'],
             'logical': ['descendants'],
             'sets': ['.', '.'],
             'ability': ['to', 'to', 'to'],
             'Toolkit': ['library'],
             'very': ['useful',
              'flexible',
              'effective',
              'difficult',
              'broadly'],
             'thousand': ['data', 'words', 'numbers'],
             'auctions': ['work'],
             'rather': ['than', 'than', 'than'],
             'RAM': ['has'],
             'task': ['the', 'but', 'up', 'is', '.'],
             'Five': ['years'],
             'much': ['bigger', 'easier', 'of', 'more'],
             'rich': ['APIs'],
             'delayed': ['because'],
             'reduce': ['task'],
             'becomes': ['part'],
             'simple': ['enough', 'task', 'MapReduce', 'program', '.'],
             'datasets': ['including',
              'which',
              'for',
              'effectively',
              'present',
              'the',
              'using',
              'quickly',
              'and'],
             'telling': ['its', 'you', 'you', "isn't", '.'],
             'home': ['at'],
             'require': ['soft', 'millisecond'],
             'than': ['kept',
              'for',
              'sales',
              'another',
              'computer',
              'tackling'],
             'capita': ['income'],
             'hypotheses': ['and'],
             'complex': ['transactions', 'set'],
             'ever': ['used', 'seen'],
             'Nielsen': ['BookScan'],
             'snakes': ['.'],
             'instrumented': ['.'],
             'by': ['data',
              'their',
              'taking',
              'neighborhood',
              'the',
              'Excel',
              'telling',
              'a',
              'providing',
              'extremely',
              'newer',
              'making',
              'analyzing',
              'Jeff',
              'Jeff'],
             'solve': ['a', 'a'],
             'awk': ['to'],
             'popularized': ['the'],
             "I've": ['heard'],
             'viewing': ['a'],
             'did': ['you'],
             "Here's": ['a'],
             'Information': ['platforms', 'is', 'Platforms', 'Platforms'],
             'together': ['is', 'fundamentally', 'at'],
             'gets': ['back', 'a'],
             'see': ["what's", 'classification', 'midomi'],
             '6': ['GHz'],
             'comes': ['from', 'in', 'and', 'close', 'in'],
             'data': ['is',
              'In',
              'science',
              'driven',
              'driven',
              'services',
              "isn't",
              'science',
              'application',
              'itself',
              'as',
              "it's",
              'product',
              'products',
              'products',
              'not',
              'products',
              'problem',
              'products',
              '.',
              'products',
              'exhaust',
              'that',
              'they',
              'collected',
              'is',
              'science',
              "that's",
              'from',
              'or',
              "it's",
              'or',
              'contributed',
              'from',
              'to',
              'source',
              'effectively',
              'but',
              "that's",
              'effectively',
              'science',
              'science',
              'in',
              'scientists',
              'massaging',
              'lifecycle',
              "we're",
              'from',
              'publicly',
              'and',
              'to',
              'MySpace',
              'we',
              '.',
              'wherever',
              'trail',
              'would',
              "isn't",
              'you',
              'exhaust',
              '.',
              'science',
              'useful',
              'analysis',
              'conditioning',
              'into',
              'in',
              'feeds',
              'in',
              'are',
              'used',
              'was',
              'sources',
              'conditioning',
              'you',
              '.',
              'is',
              'is',
              'after',
              'is',
              'collection',
              'science',
              'and',
              'at',
              'adds',
              'analysis',
              'but',
              'centric',
              'is',
              'itself',
              'problems',
              '.',
              'run',
              "that's",
              'warehouses',
              'rather',
              'formats',
              'changes',
              'platforms',
              'sources',
              '.',
              'driven',
              'analysis',
              'is',
              'platform',
              'problems',
              'platform',
              'analysis',
              'analysis',
              'as',
              'analysis',
              'scientist',
              'scientist',
              'with',
              'perhaps',
              'points',
              'analysis',
              'science',
              'speak',
              'or',
              'might',
              'are',
              'and',
              'analysis',
              'visualization',
              'science',
              'scientist',
              'conditioning',
              'is',
              'set',
              'might',
              '.',
              'tell',
              'sources',
              'was',
              'were',
              'science',
              'science',
              'samples',
              'intensive',
              'scientists',
              '.',
              'you',
              'out',
              'is',
              'scientists',
              'product',
              'all',
              'jiujitsu',
              'jiujitsu',
              'creatively',
              'scientist',
              '.',
              'that',
              'industry',
              'products',
              'collection',
              'conditioning',
              'what',
              'successfully',
              '.',
              'to',
              'it',
              'was'],
             'way': ['to', 'to'],
             'reduction': ['of', 'in'],
             'size': ['and', 'of'],
             'track': ['on',
              'lengths',
              'titles',
              'sends',
              'titles',
              'lengths'],
             'dollar': ['or'],
             'ill': ['advised'],
             'their': ['own',
              'users',
              'own',
              'schemas',
              'Prediction',
              'machine',
              'libraries',
              'datastreams',
              'success',
              'path'],
             'PyBrain': ['in'],
             'program': ['that'],
             '5': ['.', '.'],
             'circles': ['around'],
             'component': ['of'],
             'engine': ['is'],
             'platform': ['though', '.', 'Hadoop'],
             'descendants': ['of'],
             'sale': ['devices'],
             'Entrepreneurship': ['is'],
             'software': ['development', 'decided'],
             'causes': ['death'],
             'MHz': ['to'],
             'bits': ['per', 'per'],
             'sales': ['data', 'to', 'to'],
             "world's": ['largest'],
             'parse': ['plain', 'the'],
             'bigger': ['increases'],
             "IBM's": ['Many'],
             'behavior': ['the'],
             'commercial': ['support', 'statistical'],
             'Magoulas': ['who'],
             'started': ['out', 'looking', 'small'],
             'finance': ['that'],
             'surprisingly': ['appropriate'],
             'make': ['it',
              'online',
              'a',
              'that',
              'it',
              'it',
              'it',
              'it',
              'from'],
             'real': ['people', 'time', 'time', 'time', 'time', '.'],
             'consumable': ['.'],
             'kinds': ['of', 'of'],
             'insufficient': ['computing', '.'],
             'Physicists': ['have'],
             'Southern': ['Europe'],
             "O'Reilly": ['said', 'we', 'was'],
             '10': ['MHz', '000', '000'],
             'income': ['and'],
             'body': ['.', 'of', 'Or'],
             'automatically': ['from'],
             'Much': ['of'],
             "don't": ['require', 'have', 'yet'],
             'FlowingData': ['blog'],
             'files': ['with'],
             'foreclosure': ['data'],
             'foundational': ['text'],
             'Does': ['it', 'a'],
             'techniques': ['for', 'from'],
             'millions': ['of'],
             'posting': ['you'],
             'Mobile': ['applications'],
             'classify': ['them', 'them', 'a'],
             'bit': ['.', '.', '.', '.'],
             'here': ['.', '.'],
             'announced': ['their'],
             'trending': ['topics', 'topics', 'topics'],
             'group': ['them',
              'at',
              'together',
              'he',
              'at',
              'recommendation',
              '.'],
             'basically': ['a'],
             'traditional': ['retail',
              'statistics',
              'techniques',
              'data',
              'analysis',
              'statistics',
              'business',
              'computer'],
             'core': ['of'],
             'want': ['to', 'to'],
             'records': ['citizen'],
             'closer': ['interaction'],
             'should': ['know'],
             'Perl': ['and'],
             'far': ['beyond'],
             'using': ['data',
              'their',
              'Yahoo',
              'the',
              'Google',
              'SQL',
              'photos',
              'smaller'],
             'through': ['a'],
             'artificial': ['intelligence', 'intelligence'],
             'necessary': ['to', 'for'],
             'successful': ['retail', 'businesses'],
             'labor': ['.'],
             'speed': ['has', '.', '.'],
             'after': ['all', "you've"],
             'Where': ['do'],
             'will': ['find', 'be', 'be'],
             'largest': ['production'],
             'results': ['are', 'into', 'in', '.', 'it', 'of'],
             'process': ['.', 'of', 'worked', 'that', 'it'],
             'Sites': ['like'],
             'stream': ['processing', 'directly'],
             "sheriff's": ['office', 'office'],
             'end': ['and', 'to', 'solution'],
             '01': ['each', 'to'],
             'One': ['of', 'of'],
             'creative': ['visualizations'],
             "let's": ['look'],
             'gigabytes': ['to'],
             'Weka': ['in'],
             'If': ["you've",
              'you',
              "you've",
              'data',
              'data',
              'the',
              'you',
              'you',
              'anything',
              'you',
              "there's"],
             'students': ['this'],
             'show': ['how'],
             'acquires': ['its'],
             'products': ['.',
              'on',
              '.',
              '.',
              'that',
              'they',
              'by',
              'available',
              'that',
              'are',
              '.',
              'from',
              '.',
              'incrementally'],
             'present': ['computational'],
             'divided': ['into'],
             'error': ['detection'],
             'I': ['examine', "haven't"],
             'extracting': ['addresses'],
             'businesses': ['will'],
             'statistics': ['is',
              'and',
              'where',
              'is',
              'building',
              'is',
              'to',
              'work'],
             'instrumental': ['in'],
             'reported': ['that'],
             'sometimes': ['frightening'],
             'accuracy': ['.', '.'],
             'result': ['.', 'was', '.'],
             'API': ['which'],
             'Hadoop': ['project',
              'application',
              'developers',
              'to',
              'images',
              'goes',
              'datasets',
              'is',
              'has',
              'and',
              'is',
              'Online',
              'processes',
              '.',
              'or'],
             'other': ['databases',
              'people',
              'users',
              'source',
              'socio',
              'newer',
              'languages',
              'data',
              'components',
              'disciplines',
              'data',
              'members'],
             'calculation': ['it', 'then'],
             'it': ['according',
              'to',
              'does',
              'to',
              'to',
              '.',
              'into',
              'tell',
              'comes',
              'and',
              'goes',
              'possible',
              'and',
              '.',
              '.',
              'well',
              'simpler',
              'necessary',
              'and',
              'onto',
              'much',
              '.',
              'might',
              'easy',
              'easier',
              'arrives',
              'takes',
              'complements',
              'comes',
              '.',
              '.',
              'up',
              '.',
              'look',
              'involves',
              "isn't",
              'tell',
              'started',
              'was',
              '.',
              'The',
              'all',
              'to',
              'to',
              'to',
              'to',
              "that's",
              'appears'],
             'first': ['gigabyte', 'step', 'step', 'data', 'flippant'],
             'moved': ['from'],
             'searching': ['a'],
             'Faster': ['computations'],
             'package': ['library', 'Casey'],
             'space': ['you'],
             'tend': ['to'],
             'Try': ['using'],
             "Ng's": ['Machine'],
             'Pig': ['and'],
             'so': ['on', '.', 'products', 'does'],
             'be': ['data',
              'mined',
              'correlated',
              'mined',
              'useless',
              'fun',
              'dealing',
              'nice',
              'ready',
              'willing',
              'more',
              'able',
              'rolled',
              'distributed',
              'widely',
              'called',
              'current',
              'worth',
              'interesting',
              'saying',
              'hard',
              'the',
              'built',
              'able',
              'a'],
             'tweet': ['streams'],
             'years': ['ago', 'there', 'ago'],
             'meaningful': ['definition'],
             'publishing': ['industry', 'industry'],
             'we': ['suddenly',
              'mean',
              "aren't",
              'frequently',
              'currently',
              "couldn't",
              'make',
              'trying',
              'could',
              'now',
              'do'],
             'relational': ['database', 'database'],
             'them': ['by',
              'are',
              'only',
              'open',
              '.',
              'inexpensively',
              'into',
              '.',
              '.',
              'the'],
             'interests': ['you'],
             'words': ['but'],
             'microformats': ['and'],
             'depletion': ['was'],
             'questions': ['and'],
             'parsed': ['the'],
             'annual': ['growth'],
             'blog': ['is'],
             'path': ['.'],
             'day': ['a'],
             'Scientists': ['also'],
             'not': ['just',
              'in',
              'as',
              'just',
              'just',
              'just',
              'counting',
              'know',
              'really',
              'absolute',
              'finish',
              'be',
              'really',
              'just',
              'a',
              'unsolvable'],
             'text': ['in', 'for'],
             'disambiguating': ['Apple'],
             'usable': ['.'],
             'images': ['for'],
             'Why': ['do'],
             'pace': ['with'],
             'fun': ['to'],
             'center': ['stage'],
             'DJ': ['Patil'],
             'decided': ['that', 'to'],
             'Her': ['job'],
             'possibilities': ['that'],
             'requiring': ['geolocation'],
             "what's": ['happening', 'happening', 'important'],
             'Edward': ["Tufte's"],
             'denies': ['this'],
             'Mashups': ['in', 'in'],
             'notice': ['that'],
             'consistency': ['to', 'is', 'but', 'and'],
             'small': ['.', 'simple', 'and'],
             'answer': ['to'],
             'practices': ['are'],
             'seen': ['much', 'the', 'a'],
             'trying': ['to', 'to', 'to', 'to', 'to'],
             'image': ['matching'],
             'HDFS': ['a'],
             'difference': ['between', 'is'],
             'values': ['whch'],
             'Media': ['visualization'],
             'virus': ['through'],
             "Yahoo's": ['claim'],
             '010': ['or'],
             'mathematics': ['to'],
             'following': ['their'],
             'site': ['.', 'that'],
             'incongruous': ['.', 'do', 'data'],
             'factors': ['.'],
             'analyzes': ['mortgage'],
             'percent': ['annual', '.', 'more'],
             'cycles': ['closer'],
             ...})

In [78]:
# Inspect the words recorded as following "." -- these are the candidate
# sentence-starting words used when generating text from the bigram table.
transitions['.']


Out[78]:
['Five',
 '0',
 'But',
 'The',
 'Almost',
 "There's",
 'But',
 'A',
 "It's",
 'Data',
 'One',
 'The',
 'Gracenote',
 'If',
 'Before',
 'If',
 'While',
 'Their',
 'CDDB',
 'Google',
 "Here's",
 'Facebook',
 'Amazon',
 'These',
 'They',
 'The',
 'Whether',
 "That's",
 'In',
 'Whether',
 'And',
 "It's",
 'Data',
 'The',
 'Using',
 'What',
 "We're",
 'To',
 'Data',
 'While',
 'At',
 'Sites',
 'Factual',
 'Much',
 '0',
 'The',
 'Mobile',
 'Point',
 'All',
 'Since',
 '6',
 'But',
 'RAM',
 'Hitachi',
 'Whether',
 'The',
 'Data',
 'The',
 'The',
 'Increased',
 "That's",
 'So',
 'We',
 'But',
 'Many',
 'They',
 'The',
 'This',
 'If',
 'Data',
 "You're",
 'It',
 'To',
 'Scripting',
 'Once',
 'Data',
 'If',
 'If',
 'In',
 "It's",
 'If',
 'Roger',
 'While',
 'To',
 'And',
 'Try',
 'Google',
 'Disambiguation',
 'When',
 "That's",
 'If',
 'For',
 '01',
 'If',
 '01',
 "We've",
 'Oil',
 'And',
 'The',
 "We're",
 'At',
 'What',
 'Information',
 'They',
 'They',
 'Most',
 'Traditional',
 'Managing',
 'The',
 'Relational',
 'While',
 'Do',
 'Most',
 '92',
 '93',
 'To',
 'These',
 'They',
 'Many',
 'While',
 'Data',
 'Google',
 'In',
 'In',
 "It's",
 "What's",
 'The',
 "Yahoo's",
 'Many',
 "Amazon's",
 'You',
 'Hadoop',
 'It',
 'If',
 'Hadoop',
 'In',
 'Traditional',
 'If',
 'But',
 'Faster',
 "It's",
 'Hadoop',
 'Hadoop',
 'Near',
 'These',
 'As',
 'According',
 'ly',
 'Machine',
 'We',
 'You',
 'Andrew',
 'There',
 'Google',
 'For',
 'Mechanical',
 'Machine',
 'The',
 'Once',
 "It's",
 'Even',
 'While',
 'According',
 'It',
 "We've",
 'That',
 'More',
 'But',
 'Data',
 'Statistics',
 'Statistics',
 'It',
 'While',
 'Although',
 'It',
 'If',
 'A',
 'The',
 'To',
 'Edward',
 'But',
 'Visualization',
 'According',
 'Visualization',
 'Hilary',
 'Once',
 'There',
 'GnuPlot',
 'At',
 'Nathan',
 'One',
 'And',
 'Does',
 'Does',
 'There',
 "It's",
 'Data',
 'Describing',
 'Physicists',
 'They',
 'When',
 'You',
 'You',
 'Scientists',
 'Patil',
 'It',
 'But',
 'Asking',
 'It',
 'In',
 'Then',
 'The',
 'It',
 'It',
 'This',
 'CDDB',
 'But',
 'Computing',
 'Entrepreneurship',
 "Patil's",
 "That's",
 'We',
 'Hilary',
 'Her',
 'ly',
 'ly',
 'No',
 'In',
 'Data',
 'They',
 'They',
 'They',
 'Google',
 'They',
 'ly',
 'Whether',
 'The',
 'Data',
 '1',
 'Whether',
 '2']

In [39]:
# A sentence needs a starting word; one option is to pick, at random, one of
# the words that follow a period.
def generate_using_bigrams(transitions):
    """Build one sentence by randomly walking the bigram transitions.

    transitions maps a word to the list of words observed right after it.
    The walk starts from "." (so the first word is one that followed a
    period) and stops as soon as a "." is produced.  Returns the generated
    words, including the final ".", joined by single spaces.
    """
    words = []
    word = "."  # a period means the next word starts a sentence
    # Always draw at least one word, then keep going until a period is drawn.
    while word != "." or not words:
        word = random.choice(transitions[word])  # follow bigram (word, _)
        words.append(word)
    return " ".join(words)

In [40]:
# Seed the RNG, then print ten sentences generated from the bigram model.
random.seed(0)
print("bigram sentences")
for i in range(10):
    print(i, generate_using_bigrams(transitions))
print()
# The sentences are gibberish, but it is the kind of gibberish you might use when building a website meant to look data-science related...?


bigram sentences
0 But that's going to be current to generate a large searches correlates what that can be saying you can you want to figure out what's important role in enabling agile practices are then combine entrepreneurship with hundreds of track titles artists album .
1 CDDB views music by Jeff Hammerbacher said on the Philadelphia County by analyzing musical problem isn't the relational database .
2 If anything from machine consumable .
3 It was probably generated by their path .
4 Roger Magoulas who runs the metadata track titles .
5 The result .
6 Amazon understands that nobody remembers says that nobody remembers says that gave them open source R is really necessary for working with gathering data collection tools like the ability to be nice if it necessary to do data scientists particularly physicists rather than for distributing an audio stream processing companies banks and other disciplines it arrives and are easier to use .
7 Point of them open source the low 1 The Turk is a distributed across many of data is a multistage processing companies like Infochimps and added value in size of data might like a schema in which lets developers have 1 The first gigabyte disk drives in .
8 It then use one stop information platform though these applications to many modern web applications it's figuring out incrementally rather than sales to develop and the data sources and 5 .
9 Five years ago .

  • bigram : 두개의 연속적인 단어
  • trigram : 3개의 연속적인 단어를 보는 방식 (일반적인 n-gram도 있지만 3개 정도만 봐도 충분)

In [ ]:
# Presumably meant to show how zip pairs up words that appear in sequence?
# NOTE(review): `b` is assigned twice with the same value and both lists are
# identical, so zip(a, b) pairs each word with itself -- possibly shifted
# copies or a third list were intended; confirm before relying on this cell.
a = ["We've",'all','heard', 'it']
b = ["We've",'all','heard', 'it']
b = ["We've",'all','heard', 'it']
list(zip(a,b))

In [42]:
# trigrams: the next word is determined by the two words right before it
trigrams = list(zip(document, document[1:], document[2:]))  # every run of three consecutive words
trigram_transitions = defaultdict(list)
starts = []

In [46]:
# Fill the trigram tables: collect the words that follow a period (sentence
# starts) and, for every (prev, current) pair, the words seen right after it.
# The third loop variable is named `next_word` rather than `next` so that this
# top-level loop does not rebind the builtin `next` for the rest of the session.
for prev, current, next_word in trigrams:
    if prev == ".":              # if the previous word was a period,
        starts.append(current)   # then `current` starts a new sentence
    trigram_transitions[(prev, current)].append(next_word)

In [47]:
# Sentences can be generated in much the same way as with bigrams.
def generate_using_trigrams(starts, trigram_transitions):
    """Build one sentence by randomly walking the trigram transitions.

    starts: sequence of candidate first words.
    trigram_transitions: maps a (previous word, current word) pair to the
        list of words observed right after that pair.

    The first word is drawn from `starts` and treated as if preceded by ".".
    Words are then drawn until a "." is produced.  Returns the generated
    words, including the final ".", joined by single spaces.
    """
    current = random.choice(starts)   # choose a random starting word
    prev = "."                        # and precede it with a '.'
    result = [current]
    while True:
        next_word_candidates = trigram_transitions[(prev, current)]
        # Named `next_word` so the builtin `next` is not shadowed.
        next_word = random.choice(next_word_candidates)

        prev, current = current, next_word
        result.append(current)

        if current == ".":
            return " ".join(result)

In [48]:
# Print ten sentences generated from the trigram model.
print("trigram sentences")
for i in range(10):
    print(i, generate_using_trigrams(starts, trigram_transitions))
print()
# Somewhat better sentences..


trigram sentences
0 In data science what you search for and uses it to a database of album metadata track titles .
1 More to the products they use .
2 GnuPlot is very effective R incorporates a fairly comprehensive graphics package Casey Reas' and Ben Fry's Processing is the state of the key component of a complex set of operations fails .
3 Facebook and LinkedIn have all tapped into their datastreams and made recommendations accordingly .
4 Increased storage capacity on every level .
5 While there are many libraries available for machine learning .
6 If you have to look at bits per dollar or raw capacity storage has more than kept pace with the customers' behavior the data itself and creates more data you will find to put into it .
7 The thread that ties most of these applications together is that they had built the world's largest production Hadoop application with 10 000 postings with the data .
8 They aren't well behaved XML files with all the data you can do something with it and where it goes .
9 Traditional data analysis algorithms is that data useful The first step of any data analysis has been an explosion in the publishing industry data from sensors government data or some other source the problem .

  • trigram을 사용하면 다음 단어를 생성하는 각 단계에서 선택할 수 있는 단어의 수가 bigram을 사용할 때보다 훨씬 적어지고, 선택할 수 있는 단어가 딱 하나만 존재하는 경우도 많았을 것이다.
  • 즉, 이미 어떤 문서상에 존재했던 문장(또는 긴문구)하나를 그대로 생성했을 가능성도 있다.
  • 이는 데이터가 더 많이 필요하다는 것을 의미한다. 데이터 과학에 대한 에세이들을 더 많이 모으고, 이를 토대로 n-gram 모델을 구축하면 더 나은 결과를 얻을 수 있다.

**3) 문법**

  • 문법에 기반하여 말이 되는 문장을 생성하는 것
  • 품사란 무엇이며, 그것들을 어떻게 조합하면 문장이 되는지..
  • 명사 다음에는 항상 동사가 따른다...는 방식

In [79]:
# Items that start with an underscore are rules that can be expanded further; everything else is a terminal.
# e.g. '_S' is the sentence rule, '_NP' a noun phrase, '_VP' a verb phrase
# Each rule maps to a list of alternative productions; a production is a
# space-separated string of items.
grammar = {
    "_S"  : ["_NP _VP"],
    "_NP" : ["_N",
             "_A _NP _P _A _N"],
    "_VP" : ["_V",
             "_V _NP"],
    "_N"  : ["data science", "Python", "regression"],
    "_A"  : ["big", "linear", "logistic"],
    "_P"  : ["about", "near"],
    "_V"  : ["learns", "trains", "tests", "is"]
}
['_S']
['_NP','_VP'] 
['_N','_VP'] 
['Python','_VP'] 
['Python','_V','_NP'] 
['Python','trains','_NP'] 
['Python','trains','_A','_NP','_P','_A','_N'] 
['Python','trains','logistic','_NP','_P','_A','_N']
['Python','trains','logistic','_N','_P','_A','_N'] 
['Python','trains','logistic','data science','_P','_A','_N'] 
['Python','trains','logistic','data science','about','_A', '_N'] 
['Python','trains','logistic','data science','about','logistic','_N'] 
['Python','trains','logistic','data science','about','logistic','Python']

In [80]:
# Is a given item a terminal or not?
def is_terminal(token):
    """Return True if token is a terminal, i.e. does not start with "_".

    An empty token has no leading underscore and is therefore reported as a
    terminal instead of raising IndexError.
    """
    return not token.startswith("_")

# Turns each item into one of the items (or item sequences) it can be replaced with.
def expand(grammar, tokens):
    """Expand non-terminals in tokens until only terminals remain.

    grammar: maps each non-terminal to a list of candidate productions.
    tokens: list of items to expand; it is copied, never modified.

    The leftmost non-terminal is always expanded first, using a production
    chosen with random.choice.  A production that is itself a terminal
    replaces the item as a single token (so "data science" stays one token);
    any other production is split on whitespace and spliced in.  Returns a
    new list containing only terminals.
    """
    tokens = list(tokens)  # work on a copy so the caller's list is untouched
    i = 0
    # Iterative left-to-right scan: everything before index i is already a
    # terminal, so there is no need to recurse or rescan from the start.
    while i < len(tokens):
        token = tokens[i]

        # skip over terminals
        if is_terminal(token):
            i += 1
            continue

        # for a non-terminal, pick one of its productions at random
        replacement = random.choice(grammar[token])

        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            # splice the production in place; index i is re-examined next
            tokens[i:i + 1] = replacement.split()

    # every item is a terminal now, so we are done
    return tokens

def generate_sentence(grammar):
    """Expand the start symbol "_S" with grammar and return the result of expand."""
    start_tokens = ["_S"]
    return expand(grammar, start_tokens)

# Print ten sentences generated from the grammar, each prefixed with its index.
print("grammar sentences")
for i in range(10):
    print(i, " ".join(generate_sentence(grammar)))
print()


grammar sentences
0 Python trains
1 logistic data science about linear Python learns regression
2 big data science near linear regression trains linear big Python near logistic regression about linear Python
3 logistic linear Python near linear data science about big Python trains
4 big linear data science near linear regression about linear Python is
5 big logistic big Python about logistic Python about linear regression near big regression trains linear data science near logistic data science
6 linear linear regression near linear Python about logistic data science learns
7 logistic big data science about linear Python near logistic data science learns
8 logistic linear linear data science about logistic data science near linear regression near big regression tests logistic big linear linear Python near big regression near big regression about big Python near linear data science
9 regression learns big regression about linear regression

**5) 토픽 모델링**


In [94]:
# Draws a random index according to a collection of weights.
def sample_from(weights):
    """Return index i with probability weights[i] / sum(weights).

    weights: iterable of non-negative numbers; it is materialised into a
        list first, so one-shot iterables work too.

    Raises ValueError if weights is empty.
    """
    weights = list(weights)             # two passes are needed (sum + scan)
    if not weights:
        raise ValueError("sample_from() requires at least one weight")

    total = sum(weights)
    rnd = total * random.random()       # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                        # return the smallest i such that
        if rnd <= 0:                    # weights[0] + ... + weights[i] >= rnd
            return i

    # Floating-point round-off can leave a tiny positive remainder after the
    # last subtraction; fall back to the last index instead of returning None.
    return len(weights) - 1
결국, weight가 [1,1,3] 이라면 
1/5의 확률로 0, 
1/5의 확률로 1, 
3/5의 확률로 2를 반환

In [83]:
# Toy corpus for topic modeling: each inner list is one document, given as a
# list of keyword strings (presumably one user's interests -- confirm).
documents = [
    ["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
    ["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
    ["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
    ["R", "Python", "statistics", "regression", "probability"],
    ["machine learning", "regression", "decision trees", "libsvm"],
    ["Python", "R", "Java", "C++", "Haskell", "programming languages"],
    ["statistics", "probability", "mathematics", "theory"],
    ["machine learning", "scikit-learn", "Mahout", "neural networks"],
    ["neural networks", "deep learning", "Big Data", "artificial intelligence"],
    ["Hadoop", "Java", "MapReduce", "Big Data"],
    ["statistics", "R", "statsmodels"],
    ["C++", "deep learning", "artificial intelligence", "probability"],
    ["pandas", "R", "Python"],
    ["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
    ["libsvm", "regression", "support vector machines"]
]

In [87]:
# Let's look for a total of K = 4 topics.
K = 4

# How many times each topic is assigned within each document
# (a list of Counters, one per document)
document_topic_counts = [Counter()
                         for _ in documents]

# How many times each word is assigned to each topic
# (a list of Counters, one per topic)
topic_word_counts = [Counter() for _ in range(K)]

# Total number of words assigned to each topic
# (a list of numbers, one per topic)
topic_counts = [0 for _ in range(K)]

# Total number of words contained in each document
# (a list of numbers, one per document)
document_lengths = [len(d) for d in documents]

# Number of distinct words
distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)

# Total number of documents
D = len(documents)

In [88]:
# Number of words in documents[3] that are associated with topic 1.
document_topic_counts[3][1]


Out[88]:
0

In [89]:
# How many times does the word "nlp" appear in association with topic 2?
topic_word_counts[2]["nlp"]


Out[89]:
0

In [90]:
def p_topic_given_document(topic, d, alpha=0.1):
    """Fraction of the words in document d that are assigned to topic,
    with alpha added as smoothing."""
    smoothed_count = document_topic_counts[d][topic] + alpha
    smoothed_total = document_lengths[d] + K * alpha
    return smoothed_count / smoothed_total

def p_word_given_topic(word, topic, beta=0.1):
    """Fraction of the words assigned to topic that are word,
    with beta added as smoothing."""
    smoothed_count = topic_word_counts[topic][word] + beta
    smoothed_total = topic_counts[topic] + W * beta
    return smoothed_count / smoothed_total

def topic_weight(d, word, k):
    """Given a document and a word in that document, return the weight of
    the k-th topic."""
    word_factor = p_word_given_topic(word, k)
    document_factor = p_topic_given_document(k, d)
    return word_factor * document_factor

def choose_new_topic(d, word):
    """Draw a topic index for word in document d, using the K topic weights
    as sampling weights."""
    weights = [topic_weight(d, word, topic) for topic in range(K)]
    return sample_from(weights)

In [95]:
random.seed(0)
# Start by assigning every word of every document a uniformly random topic.
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]

# Tally the counts implied by that initial assignment.
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

# Repeatedly re-draw the topic of every word, 1000 passes over all documents.
# The loop variable is named `sweep` rather than `iter` so that this top-level
# loop does not rebind the builtin `iter` for the rest of the session.
for sweep in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):

            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1

            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic

            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1

In [93]:
# To get a sense of what each topic means, look at which words have the
# highest counts for each topic (words with a zero count are skipped).
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0: print(k, word, count)


0 pandas 2
0 scikit-learn 2
0 regression 1
0 statistics 1
0 artificial intelligence 1
0 Java 1
0 Big Data 1
0 Hadoop 1
0 statsmodels 1
0 HBase 1
0 libsvm 1
0 R 1
0 C++ 1
0 Haskell 1
1 neural networks 2
1 deep learning 2
1 databases 1
1 Postgres 1
1 numpy 1
1 MySQL 1
1 Cassandra 1
1 MongoDB 1
1 Mahout 1
1 Python 1
1 HBase 1
1 theory 1
1 decision trees 1
2 regression 2
2 Java 2
2 R 2
2 Python 2
2 Postgres 1
2 machine learning 1
2 statistics 1
2 artificial intelligence 1
2 MongoDB 1
2 HBase 1
2 Cassandra 1
2 mathematics 1
2 probability 1
2 statsmodels 1
2 C++ 1
2 scipy 1
3 Big Data 2
3 probability 2
3 machine learning 1
3 R 1
3 statistics 1
3 programming languages 1
3 NoSQL 1
3 libsvm 1
3 support vector machines 1
3 Spark 1
3 Python 1
3 MapReduce 1
3 Storm 1

In [96]:
# Looking at the words, give the topics names like the following.
topic_names = ["Big Data and programming languages",
               "databases",
               "machine learning",
               "statistics"]

# This lets us see what each user's interests are: for every document, print
# it followed by its topics (most frequent first) and their counts.
# The loop variables are deliberately NOT called `document` / `topic_counts`:
# at top level they would rebind the globals of the same names, and
# `topic_counts` is the per-topic totals list read by p_word_given_topic and
# updated by the sampling loop, so re-running the sampler afterwards would
# write into the last document's Counter instead.
for doc, doc_topic_counts in zip(documents, document_topic_counts):
    print(doc)
    for topic, count in doc_topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)
    print()


['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
statistics 5
Big Data and programming languages 4
machine learning 4

['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
machine learning 5
databases 3
statistics 2

['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Big Data and programming languages 6
machine learning 4
databases 2

['R', 'Python', 'statistics', 'regression', 'probability']
machine learning 6
Big Data and programming languages 3
statistics 1

['machine learning', 'regression', 'decision trees', 'libsvm']
machine learning 3
statistics 3
databases 2

['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
Big Data and programming languages 5
machine learning 4
statistics 3

['statistics', 'probability', 'mathematics', 'theory']
databases 3
Big Data and programming languages 2
statistics 2
machine learning 1

['machine learning', 'scikit-learn', 'Mahout', 'neural networks']
databases 4
Big Data and programming languages 2
machine learning 2

['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']
databases 5
Big Data and programming languages 2
statistics 1

['Hadoop', 'Java', 'MapReduce', 'Big Data']
Big Data and programming languages 5
statistics 2
machine learning 1

['statistics', 'R', 'statsmodels']
machine learning 4
Big Data and programming languages 2

['C++', 'deep learning', 'artificial intelligence', 'probability']
machine learning 5
Big Data and programming languages 1
databases 1
statistics 1

['pandas', 'R', 'Python']
machine learning 3
Big Data and programming languages 2
statistics 1

['databases', 'HBase', 'Postgres', 'MySQL', 'MongoDB']
databases 9
machine learning 1

['libsvm', 'regression', 'support vector machines']
machine learning 27
Big Data and programming languages 22
databases 14
statistics 10


In [ ]: