1) Word Clouds
In [1]:
import math, random, re
from collections import defaultdict, Counter
from bs4 import BeautifulSoup
import requests
import matplotlib.pyplot as plt
(On its first run this cell raised ImportError: No module named 'matplotlib'. Install matplotlib, e.g. with pip install matplotlib, and re-run the imports; the plotting cells below assume it is available.)
In [28]:
# data-science keywords as (word, job-posting popularity, resume popularity), each on a 0-100 scale
data = [ ("big data", 100, 15), ("Hadoop", 95, 25), ("Python", 75, 50),
("R", 50, 40), ("machine learning", 80, 20), ("statistics", 20, 60),
("data science", 60, 70), ("analytics", 90, 3),
("team player", 85, 85), ("dynamic", 2, 90), ("synergies", 70, 0),
("actionable insights", 40, 30), ("think out of the box", 45, 10),
("self-starter", 30, 50), ("customer focus", 65, 15),
("thought leadership", 35, 35)]
The horizontal axis is how often each word appears in job postings; the vertical axis is how often it appears in resumes.
In [29]:
def text_size(total):
    """equals 8 if total is 0, 28 if total is 200"""
    return 8 + total / 200 * 20
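A quick sanity check of the scaling formula (a hypothetical snippet, not a cell from the original notebook):

# total = 0   -> 8 + 0 / 200 * 20   = 8
# total = 100 -> 8 + 100 / 200 * 20 = 18
# total = 200 -> 8 + 200 / 200 * 20 = 28
assert text_size(0) == 8 and text_size(100) == 18 and text_size(200) == 28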
In [30]:
for word, job_popularity, resume_popularity in data:
    plt.text(job_popularity, resume_popularity, word,
             ha='center', va='center',
             size=text_size(job_popularity + resume_popularity))
plt.xlabel("Popularity on Job Postings")
plt.ylabel("Popularity on Resumes")
plt.axis([0, 100, 0, 100])
plt.show()
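For comparison, here is a minimal sketch of an actual word cloud built from the same data, assuming the third-party wordcloud package is installed (pip install wordcloud); it is not used anywhere in this notebook, and the labeled scatter above conveys the same information more precisely.

from wordcloud import WordCloud
import matplotlib.pyplot as plt

# size each keyword by its combined popularity, as text_size does above
frequencies = {word: job_popularity + resume_popularity
               for word, job_popularity, resume_popularity in data}

wc = WordCloud(width=800, height=400, background_color="white")
wc.generate_from_frequencies(frequencies)

plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()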
2) n-gram Models
In [31]:
# replace the Unicode right single quotation mark with a plain ASCII apostrophe
def fix_unicode(text):
    return text.replace(u"\u2019", "'")
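A quick check of the helper (hypothetical, not a cell from the original notebook):

fix_unicode(u"It\u2019s data")   # -> "It's data"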
In [59]:
def get_document():
    url = "http://radar.oreilly.com/2010/06/what-is-data-science.html"
    html = requests.get(url).text
    soup = BeautifulSoup(html, 'html5lib')

    # content = soup.find("div", "entry-content")  # this class no longer exists, so find() returned None
    content = soup.find("div", "article-body")     # the article text now lives in an "article-body" div
    regex = r"[\w']+|[\.]"                         # match words (apostrophes included) or periods

    document = []
    for paragraph in content("p"):
        words = re.findall(regex, fix_unicode(paragraph.text))
        document.extend(words)
    return document
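The regex keeps words (apostrophes included) and periods and drops everything else, which is why "Web 2.0" shows up as the separate tokens '2', '.', '0' in the output below. A quick illustration (hypothetical, not a cell from the original notebook):

re.findall(r"[\w']+|[\.]", fix_unicode(u"What is Web 2.0? It\u2019s data."))
# -> ['What', 'is', 'Web', '2', '.', '0', "It's", 'data', '.']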
In [72]:
document = get_document()
document   # inspect the tokenized document (output truncated below)
Out[72]:
["We've",
'all',
'heard',
'it',
'according',
'to',
'Hal',
'Varian',
'statistics',
'is',
'the',
'next',
'sexy',
'job',
'.',
'Five',
'years',
'ago',
'in',
'What',
'is',
'Web',
'2',
'.',
'0',
'Tim',
"O'Reilly",
'said',
'that',
'data',
'is',
'the',
'next',
'Intel',
'Inside',
'.',
'But',
'what',
'does',
'that',
'statement',
'mean',
'Why',
'do',
'we',
'suddenly',
'care',
'about',
'statistics',
'and',
'about',
'data',
'In',
'this',
'post',
'I',
'examine',
'the',
'many',
'sides',
'of',
'data',
'science',
'the',
'technologies',
'the',
'companies',
'and',
'the',
'unique',
'skill',
'sets',
'.',
'The',
'web',
'is',
'full',
'of',
'data',
'driven',
'apps',
'.',
'Almost',
'any',
'e',
'commerce',
'application',
'is',
'a',
'data',
'driven',
'application',
'.',
"There's",
'a',
'database',
'behind',
'a',
'web',
'front',
'end',
'and',
'middleware',
'that',
'talks',
'to',
'a',
'number',
'of',
'other',
'databases',
'and',
'data',
'services',
'credit',
'card',
'processing',
'companies',
'banks',
'and',
'so',
'on',
'.',
'But',
'merely',
'using',
'data',
"isn't",
'really',
'what',
'we',
'mean',
'by',
'data',
'science',
'.',
'A',
'data',
'application',
'acquires',
'its',
'value',
'from',
'the',
'data',
'itself',
'and',
'creates',
'more',
'data',
'as',
'a',
'result',
'.',
"It's",
'not',
'just',
'an',
'application',
'with',
'data',
"it's",
'a',
'data',
'product',
'.',
'Data',
'science',
'enables',
'the',
'creation',
'of',
'data',
'products',
'.',
'One',
'of',
'the',
'earlier',
'data',
'products',
'on',
'the',
'Web',
'was',
'the',
'CDDB',
'database',
'.',
'The',
'developers',
'of',
'CDDB',
'realized',
'that',
'any',
'CD',
'had',
'a',
'unique',
'signature',
'based',
'on',
'the',
'exact',
'length',
'in',
'samples',
'of',
'each',
'track',
'on',
'the',
'CD',
'.',
'Gracenote',
'built',
'a',
'database',
'of',
'track',
'lengths',
'and',
'coupled',
'it',
'to',
'a',
'database',
'of',
'album',
'metadata',
'track',
'titles',
'artists',
'album',
'titles',
'.',
'If',
"you've",
'ever',
'used',
'iTunes',
'to',
'rip',
'a',
'CD',
"you've",
'taken',
'advantage',
'of',
'this',
'database',
'.',
'Before',
'it',
'does',
'anything',
'else',
'iTunes',
'reads',
'the',
'length',
'of',
'every',
'track',
'sends',
'it',
'to',
'CDDB',
'and',
'gets',
'back',
'the',
'track',
'titles',
'.',
'If',
'you',
'have',
'a',
'CD',
"that's",
'not',
'in',
'the',
'database',
'including',
'a',
'CD',
"you've",
'made',
'yourself',
'you',
'can',
'create',
'an',
'entry',
'for',
'an',
'unknown',
'album',
'.',
'While',
'this',
'sounds',
'simple',
'enough',
"it's",
'revolutionary',
'CDDB',
'views',
'music',
'as',
'data',
'not',
'as',
'audio',
'and',
'creates',
'new',
'value',
'in',
'doing',
'so',
'.',
'Their',
'business',
'is',
'fundamentally',
'different',
'from',
'selling',
'music',
'sharing',
'music',
'or',
'analyzing',
'musical',
'tastes',
'though',
'these',
'can',
'also',
'be',
'data',
'products',
'.',
'CDDB',
'arises',
'entirely',
'from',
'viewing',
'a',
'musical',
'problem',
'as',
'a',
'data',
'problem',
'.',
'Google',
'is',
'a',
'master',
'at',
'creating',
'data',
'products',
'.',
"Here's",
'a',
'few',
'examples',
'Google',
"isn't",
'the',
'only',
'company',
'that',
'knows',
'how',
'to',
'use',
'data',
'.',
'Facebook',
'and',
'LinkedIn',
'use',
'patterns',
'of',
'friendship',
'relationships',
'to',
'suggest',
'other',
'people',
'you',
'may',
'know',
'or',
'should',
'know',
'with',
'sometimes',
'frightening',
'accuracy',
'.',
'Amazon',
'saves',
'your',
'searches',
'correlates',
'what',
'you',
'search',
'for',
'with',
'what',
'other',
'users',
'search',
'for',
'and',
'uses',
'it',
'to',
'create',
'surprisingly',
'appropriate',
'recommendations',
'.',
'These',
'recommendations',
'are',
'data',
'products',
'that',
'help',
'to',
'drive',
"Amazon's",
'more',
'traditional',
'retail',
'business',
'.',
'They',
'come',
'about',
'because',
'Amazon',
'understands',
'that',
'a',
'book',
"isn't",
'just',
'a',
'book',
'a',
'camera',
"isn't",
'just',
'a',
'camera',
'and',
'a',
'customer',
"isn't",
'just',
'a',
'customer',
'customers',
'generate',
'a',
'trail',
'of',
'data',
'exhaust',
'that',
'can',
'be',
'mined',
'and',
'put',
'to',
'use',
'and',
'a',
'camera',
'is',
'a',
'cloud',
'of',
'data',
'that',
'can',
'be',
'correlated',
'with',
'the',
"customers'",
'behavior',
'the',
'data',
'they',
'leave',
'every',
'time',
'they',
'visit',
'the',
'site',
'.',
'The',
'thread',
'that',
'ties',
'most',
'of',
'these',
'applications',
'together',
'is',
'that',
'data',
'collected',
'from',
'users',
'provides',
'added',
'value',
'.',
'Whether',
'that',
'data',
'is',
'search',
'terms',
'voice',
'samples',
'or',
'product',
'reviews',
'the',
'users',
'are',
'in',
'a',
'feedback',
'loop',
'in',
'which',
'they',
'contribute',
'to',
'the',
'products',
'they',
'use',
'.',
"That's",
'the',
'beginning',
'of',
'data',
'science',
'.',
'In',
'the',
'last',
'few',
'years',
'there',
'has',
'been',
'an',
'explosion',
'in',
'the',
'amount',
'of',
'data',
"that's",
'available',
'.',
'Whether',
"we're",
'talking',
'about',
'web',
'server',
'logs',
'tweet',
'streams',
'online',
'transaction',
'records',
'citizen',
'science',
'data',
'from',
'sensors',
'government',
'data',
'or',
'some',
'other',
'source',
'the',
'problem',
"isn't",
'finding',
'data',
"it's",
'figuring',
'out',
'what',
'to',
'do',
'with',
'it',
'.',
'And',
"it's",
'not',
'just',
'companies',
'using',
'their',
'own',
'data',
'or',
'the',
'data',
'contributed',
'by',
'their',
'users',
'.',
"It's",
'increasingly',
'common',
'to',
'mashup',
'data',
'from',
'a',
'number',
'of',
'sources',
'.',
'Data',
'Mashups',
'in',
'R',
'analyzes',
'mortgage',
'foreclosures',
'in',
'Philadelphia',
'County',
'by',
'taking',
'a',
'public',
'report',
'from',
'the',
'county',
"sheriff's",
'office',
'extracting',
'addresses',
'and',
'using',
'Yahoo',
'to',
'convert',
'the',
'addresses',
'to',
'latitude',
'and',
'longitude',
'then',
'using',
'the',
'geographical',
'data',
'to',
'place',
'the',
'foreclosures',
'on',
'a',
'map',
'another',
'data',
'source',
'and',
'group',
'them',
'by',
'neighborhood',
'valuation',
'neighborhood',
'per',
'capita',
'income',
'and',
'other',
'socio',
'economic',
'factors',
'.',
'The',
'question',
'facing',
'every',
'company',
'today',
'every',
'startup',
'every',
'non',
'profit',
'every',
'project',
'site',
'that',
'wants',
'to',
'attract',
'a',
'community',
'is',
'how',
'to',
'use',
'data',
'effectively',
'not',
'just',
'their',
'own',
'data',
'but',
'all',
'the',
'data',
"that's",
'available',
'and',
'relevant',
'.',
'Using',
'data',
'effectively',
'requires',
'something',
'different',
'from',
'traditional',
'statistics',
'where',
'actuaries',
'in',
'business',
'suits',
'perform',
'arcane',
'but',
'fairly',
'well',
'defined',
'kinds',
'of',
'analysis',
'.',
'What',
'differentiates',
'data',
'science',
'from',
'statistics',
'is',
'that',
'data',
'science',
'is',
'a',
'holistic',
'approach',
'.',
"We're",
'increasingly',
'finding',
'data',
'in',
'the',
'wild',
'and',
'data',
'scientists',
'are',
'involved',
'with',
'gathering',
'data',
'massaging',
'it',
'into',
'a',
'tractable',
'form',
'making',
'it',
'tell',
'its',
'story',
'and',
'presenting',
'that',
'story',
'to',
'others',
'.',
'To',
'get',
'a',
'sense',
'for',
'what',
'skills',
'are',
'required',
"let's",
'look',
'at',
'the',
'data',
'lifecycle',
'where',
'it',
'comes',
'from',
'how',
'you',
'use',
'it',
'and',
'where',
'it',
'goes',
'.',
'Data',
'is',
'everywhere',
'your',
'government',
'your',
'web',
'server',
'your',
'business',
'partners',
'even',
'your',
'body',
'.',
'While',
'we',
"aren't",
'drowning',
'in',
'a',
'sea',
'of',
'data',
"we're",
'finding',
'that',
'almost',
'everything',
'can',
'or',
'has',
'been',
'instrumented',
'.',
'At',
"O'Reilly",
'we',
'frequently',
'combine',
'publishing',
'industry',
'data',
'from',
'Nielsen',
'BookScan',
'with',
'our',
'own',
'sales',
'data',
'publicly',
'available',
'Amazon',
'data',
'and',
'even',
'job',
'data',
'to',
'see',
"what's",
'happening',
'in',
'the',
'publishing',
'industry',
'.',
'Sites',
'like',
'Infochimps',
'and',
'Factual',
'provide',
'access',
'to',
'many',
'large',
'datasets',
'including',
'climate',
'data',
'MySpace',
'activity',
'streams',
'and',
'game',
'logs',
'from',
'sporting',
'events',
'.',
'Factual',
'enlists',
'users',
'to',
'update',
'and',
'improve',
'its',
'datasets',
'which',
'cover',
'topics',
'as',
'diverse',
'as',
'endocrinologists',
'to',
'hiking',
'trails',
'.',
'Much',
'of',
'the',
'data',
'we',
'currently',
'work',
'with',
'is',
'the',
'direct',
'consequence',
'of',
'Web',
'2',
'.',
'0',
'and',
'of',
"Moore's",
'Law',
'applied',
'to',
'data',
'.',
'The',
'web',
'has',
'people',
'spending',
'more',
...]
In [71]:
### + to get information about words that appear consecutively?
a = ["We've", 'all', 'heard', 'it']
b = ["We've", 'all', 'heard', 'it']
list(zip(a, b))
Out[71]:
[("We've", "We've"), ('all', 'all'), ('heard', 'heard'), ('it', 'it')]
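What the bigram cell below actually relies on is zipping the list against itself shifted by one, which pairs each word with the word that follows it (a hypothetical check, not a cell from the original notebook):

list(zip(a, a[1:]))
# -> [("We've", 'all'), ('all', 'heard'), ('heard', 'it')]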
In [76]:
bigrams = list(zip(document, document[1:]))
transitions = defaultdict(list)
for prev, current in bigrams:
    transitions[prev].append(current)
In [77]:
#transitions
transitions
Out[77]:
defaultdict(list,
            {'generation': ['of'],
             'initial': ['data'],
             'applied': ['to', 'to'],
             'already': ['reduced'],
             'verify': ['them'],
             'five': ['minutes'],
             'processing': ['companies', 'to', 'fails', '.', 'pipeline'],
             'throw': ['the'],
             'feedback': ['loop'],
             'they': ['leave', 'visit', 'contribute', 'use', 'go', "aren't",
                      'had', 'generate', 'are', 'decided'],
             ...})
(output truncated: the defaultdict maps every word in the document to the list of words that follow it, one entry per occurrence)
In [78]:
transitions['.']   # every word that follows a period, i.e. candidate sentence starters
Out[78]:
['Five',
'0',
'But',
'The',
'Almost',
"There's",
'But',
'A',
"It's",
'Data',
'One',
'The',
'Gracenote',
'If',
'Before',
'If',
'While',
'Their',
'CDDB',
'Google',
"Here's",
'Facebook',
'Amazon',
'These',
'They',
'The',
'Whether',
"That's",
'In',
'Whether',
'And',
"It's",
'Data',
'The',
'Using',
'What',
"We're",
'To',
'Data',
'While',
'At',
'Sites',
'Factual',
'Much',
'0',
'The',
'Mobile',
'Point',
'All',
'Since',
'6',
'But',
'RAM',
'Hitachi',
'Whether',
'The',
'Data',
'The',
'The',
'Increased',
"That's",
'So',
'We',
'But',
'Many',
'They',
'The',
'This',
'If',
'Data',
"You're",
'It',
'To',
'Scripting',
'Once',
'Data',
'If',
'If',
'In',
"It's",
'If',
'Roger',
'While',
'To',
'And',
'Try',
'Google',
'Disambiguation',
'When',
"That's",
'If',
'For',
'01',
'If',
'01',
"We've",
'Oil',
'And',
'The',
"We're",
'At',
'What',
'Information',
'They',
'They',
'Most',
'Traditional',
'Managing',
'The',
'Relational',
'While',
'Do',
'Most',
'92',
'93',
'To',
'These',
'They',
'Many',
'While',
'Data',
'Google',
'In',
'In',
"It's",
"What's",
'The',
"Yahoo's",
'Many',
"Amazon's",
'You',
'Hadoop',
'It',
'If',
'Hadoop',
'In',
'Traditional',
'If',
'But',
'Faster',
"It's",
'Hadoop',
'Hadoop',
'Near',
'These',
'As',
'According',
'ly',
'Machine',
'We',
'You',
'Andrew',
'There',
'Google',
'For',
'Mechanical',
'Machine',
'The',
'Once',
"It's",
'Even',
'While',
'According',
'It',
"We've",
'That',
'More',
'But',
'Data',
'Statistics',
'Statistics',
'It',
'While',
'Although',
'It',
'If',
'A',
'The',
'To',
'Edward',
'But',
'Visualization',
'According',
'Visualization',
'Hilary',
'Once',
'There',
'GnuPlot',
'At',
'Nathan',
'One',
'And',
'Does',
'Does',
'There',
"It's",
'Data',
'Describing',
'Physicists',
'They',
'When',
'You',
'You',
'Scientists',
'Patil',
'It',
'But',
'Asking',
'It',
'In',
'Then',
'The',
'It',
'It',
'This',
'CDDB',
'But',
'Computing',
'Entrepreneurship',
"Patil's",
"That's",
'We',
'Hilary',
'Her',
'ly',
'ly',
'No',
'In',
'Data',
'They',
'They',
'They',
'Google',
'They',
'ly',
'Whether',
'The',
'Data',
'1',
'Whether',
'2']
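The list above is transitions['.'], i.e. every word that appears right after a period — in other words, the sentence-starting words. The cell that builds transitions sits earlier in the notebook; it presumably pairs each word with the word that follows it, roughly along these lines (a minimal sketch under that assumption, using the hypothetical name bigram_transitions so as not to clobber the real variable):
# Sketch (assumption, not a cell from the original notebook);
# defaultdict was already imported from collections in the first cell.
bigram_transitions = defaultdict(list)
for prev, current in zip(document, document[1:]):   # adjacent (current, next) word pairs
    bigram_transitions[prev].append(current)        # collect every word that follows `prev`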
In [39]:
# We need to choose a starting word; one option is to pick at random
# from the words that appear right after a period.
def generate_using_bigrams(transitions):
    current = "."                                      # the next word starts a sentence
    result = []
    while True:
        next_word_candidates = transitions[current]    # bigrams (current, _)
        current = random.choice(next_word_candidates)  # choose one at random
        result.append(current)                         # append it to the result
        if current == ".": return " ".join(result)     # stop at "."
In [40]:
random.seed(0)
print("bigram sentences")
for i in range(10):
    print(i, generate_using_bigrams(transitions))
print()
# Nonsense sentences, but also the sort of thing you might use when putting
# together a website that's meant to look data-science related.
bigram sentences
0 But that's going to be current to generate a large searches correlates what that can be saying you can you want to figure out what's important role in enabling agile practices are then combine entrepreneurship with hundreds of track titles artists album .
1 CDDB views music by Jeff Hammerbacher said on the Philadelphia County by analyzing musical problem isn't the relational database .
2 If anything from machine consumable .
3 It was probably generated by their path .
4 Roger Magoulas who runs the metadata track titles .
5 The result .
6 Amazon understands that nobody remembers says that nobody remembers says that gave them open source R is really necessary for working with gathering data collection tools like the ability to be nice if it necessary to do data scientists particularly physicists rather than for distributing an audio stream processing companies banks and other disciplines it arrives and are easier to use .
7 Point of them open source the low 1 The Turk is a distributed across many of data is a multistage processing companies like Infochimps and added value in size of data might like a schema in which lets developers have 1 The first gigabyte disk drives in .
8 It then use one stop information platform though these applications to many modern web applications it's figuring out incrementally rather than sales to develop and the data sources and 5 .
9 Five years ago .
In [ ]:
### To get (current word, next word) information for consecutively appearing words
a = ["We've", 'all', 'heard', 'it']
b = a[1:]          # the same list shifted by one position
list(zip(a, b))    # adjacent word pairs, i.e. bigrams
In [42]:
# trigrams: the next word is determined by the previous two words
trigrams = list(zip(document, document[1:], document[2:]))
trigram_transitions = defaultdict(list)
starts = []
In [46]:
for prev, current, next in trigrams:
    if prev == ".":              # if the previous word was a period,
        starts.append(current)   # this word starts a new sentence
    trigram_transitions[(prev, current)].append(next)
In [47]:
# Sentences can be generated in much the same way as with the bigrams above
def generate_using_trigrams(starts, trigram_transitions):
    current = random.choice(starts)   # choose a random starting word
    prev = "."                        # and precede it with a '.'
    result = [current]
    while True:
        next_word_candidates = trigram_transitions[(prev, current)]
        next = random.choice(next_word_candidates)
        prev, current = current, next
        result.append(current)
        if current == ".":
            return " ".join(result)
In [48]:
print("trigram sentences")
for i in range(10):
print(i, generate_using_trigrams(starts, trigram_transitions))
print()
#조금 더 괜찮은 문장..
trigram sentences
0 In data science what you search for and uses it to a database of album metadata track titles .
1 More to the products they use .
2 GnuPlot is very effective R incorporates a fairly comprehensive graphics package Casey Reas' and Ben Fry's Processing is the state of the key component of a complex set of operations fails .
3 Facebook and LinkedIn have all tapped into their datastreams and made recommendations accordingly .
4 Increased storage capacity on every level .
5 While there are many libraries available for machine learning .
6 If you have to look at bits per dollar or raw capacity storage has more than kept pace with the customers' behavior the data itself and creates more data you will find to put into it .
7 The thread that ties most of these applications together is that they had built the world's largest production Hadoop application with 10 000 postings with the data .
8 They aren't well behaved XML files with all the data you can do something with it and where it goes .
9 Traditional data analysis algorithms is that data useful The first step of any data analysis has been an explosion in the publishing industry data from sensors government data or some other source the problem .
**3) Grammars**
In [79]:
# Items starting with an underscore are rules that can be expanded further;
# call everything else a terminal.
# e.g. '_S' is the sentence rule, '_NP' a noun phrase, '_VP' a verb phrase
grammar = {
    "_S"  : ["_NP _VP"],
    "_NP" : ["_N",
             "_A _NP _P _A _N"],
    "_VP" : ["_V",
             "_V _NP"],
    "_N"  : ["data science", "Python", "regression"],
    "_A"  : ["big", "linear", "logistic"],
    "_P"  : ["about", "near"],
    "_V"  : ["learns", "trains", "tests", "is"]
}
For example, starting from '_S' and expanding one non-terminal at a time:
['_S']
['_NP','_VP']
['_N','_VP']
['Python','_VP']
['Python','_V','_NP']
['Python','trains','_NP']
['Python','trains','_A','_NP','_P','_A','_N']
['Python','trains','logistic','_NP','_P','_A','_N']
['Python','trains','logistic','_N','_P','_A','_N']
['Python','trains','logistic','data science','_P','_A','_N']
['Python','trains','logistic','data science','about','_A', '_N']
['Python','trains','logistic','data science','about','logistic','_N']
['Python','trains','logistic','data science','about','logistic','Python']
In [80]:
# Is a given token a terminal or not?
def is_terminal(token):
    return token[0] != "_"

# Replace each token with one of the items that can stand in for it
def expand(grammar, tokens):
    for i, token in enumerate(tokens):
        # skip over terminals
        if is_terminal(token): continue
        # for a non-terminal, choose one of its possible replacements at random
        replacement = random.choice(grammar[token])
        if is_terminal(replacement):
            tokens[i] = replacement
        else:
            tokens = tokens[:i] + replacement.split() + tokens[(i+1):]
        # apply expand to the new list of tokens
        return expand(grammar, tokens)
    # if we get here, every token is a terminal, so we're done
    return tokens

def generate_sentence(grammar):
    return expand(grammar, ["_S"])

print("grammar sentences")
for i in range(10):
    print(i, " ".join(generate_sentence(grammar)))
print()
grammar sentences
0 Python trains
1 logistic data science about linear Python learns regression
2 big data science near linear regression trains linear big Python near logistic regression about linear Python
3 logistic linear Python near linear data science about big Python trains
4 big linear data science near linear regression about linear Python is
5 big logistic big Python about logistic Python about linear regression near big regression trains linear data science near logistic data science
6 linear linear regression near linear Python about logistic data science learns
7 logistic big data science about linear Python near logistic data science learns
8 logistic linear linear data science about logistic data science near linear regression near big regression tests logistic big linear linear Python near big regression near big regression about big Python near linear data science
9 regression learns big regression about linear regression
**5) Topic Modeling**
In [94]:
# Assign a weight to each topic according to the word distribution
def sample_from(weights):
    """returns i with probability weights[i] / sum(weights)"""
    total = sum(weights)
    rnd = total * random.random()   # uniform between 0 and total
    for i, w in enumerate(weights):
        rnd -= w                    # return the smallest i such that
        if rnd <= 0: return i       # sum(weights[:(i+1)]) >= rnd
In short, if the weights are [1, 1, 3], this returns
0 with probability 1/5,
1 with probability 1/5,
and 2 with probability 3/5.
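We can sanity-check that claim empirically by drawing many samples and counting how often each index comes up (an illustrative check, not a cell from the original notebook):
# Illustration (assumption): draw 10,000 samples from weights [1, 1, 3]
# and inspect the empirical frequencies; Counter was imported in the first cell.
sample_counts = Counter(sample_from([1, 1, 3]) for _ in range(10000))
print(sample_counts)   # roughly 2,000 / 2,000 / 6,000 for indices 0 / 1 / 2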
In [83]:
documents = [
["Hadoop", "Big Data", "HBase", "Java", "Spark", "Storm", "Cassandra"],
["NoSQL", "MongoDB", "Cassandra", "HBase", "Postgres"],
["Python", "scikit-learn", "scipy", "numpy", "statsmodels", "pandas"],
["R", "Python", "statistics", "regression", "probability"],
["machine learning", "regression", "decision trees", "libsvm"],
["Python", "R", "Java", "C++", "Haskell", "programming languages"],
["statistics", "probability", "mathematics", "theory"],
["machine learning", "scikit-learn", "Mahout", "neural networks"],
["neural networks", "deep learning", "Big Data", "artificial intelligence"],
["Hadoop", "Java", "MapReduce", "Big Data"],
["statistics", "R", "statsmodels"],
["C++", "deep learning", "artificial intelligence", "probability"],
["pandas", "R", "Python"],
["databases", "HBase", "Postgres", "MySQL", "MongoDB"],
["libsvm", "regression", "support vector machines"]
]
In [87]:
# Let's try to recover K = 4 topics
K = 4
# How many times each topic is assigned to each document
# (each Counter corresponds to a document)
document_topic_counts = [Counter() for _ in documents]
# How many times each word is assigned to each topic
# (each Counter corresponds to a topic)
topic_word_counts = [Counter() for _ in range(K)]
# Total number of words assigned to each topic
# (each number corresponds to a topic)
topic_counts = [0 for _ in range(K)]
# Total number of words in each document
# (each number corresponds to a document)
document_lengths = [len(d) for d in documents]
# Number of distinct words
distinct_words = set(word for document in documents for word in document)
W = len(distinct_words)
# Total number of documents
D = len(documents)
In [88]:
# e.g., the number of words in documents[3] associated with topic 1:
document_topic_counts[3][1]
Out[88]:
0
In [89]:
# e.g., the number of times the word "nlp" appears associated with topic 2:
topic_word_counts[2]["nlp"]
Out[89]:
0
In [90]:
def p_topic_given_document(topic, d, alpha=0.1):
    """the fraction of words in document d that are assigned
    to topic (plus some smoothing)"""
    return ((document_topic_counts[d][topic] + alpha) /
            (document_lengths[d] + K * alpha))

def p_word_given_topic(word, topic, beta=0.1):
    """the fraction of words assigned to topic that equal word
    (plus some smoothing)"""
    return ((topic_word_counts[topic][word] + beta) /
            (topic_counts[topic] + W * beta))

def topic_weight(d, word, k):
    """given a document and a word in it, return the weight for the k-th topic"""
    return p_word_given_topic(word, k) * p_topic_given_document(k, d)

def choose_new_topic(d, word):
    return sample_from([topic_weight(d, word, k)
                        for k in range(K)])
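At this point the counts are still all zero, so only the smoothing terms alpha and beta contribute: every topic gets the same small but strictly positive weight, which is exactly why the smoothing is there — no topic can ever be ruled out completely. A quick illustration, not a cell from the original notebook:
# Illustration (assumption): with empty counts, p_word_given_topic(word, k) = 1 / W
# and p_topic_given_document(k, d) = alpha / (document_lengths[d] + K * alpha)
# for every topic k, so all four weights are identical and positive.
print([topic_weight(0, "Hadoop", k) for k in range(K)])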
In [95]:
random.seed(0)
# start by assigning every word in every document to a random topic
document_topics = [[random.randrange(K) for word in document]
                   for document in documents]
# initialize the counts from these random assignments
for d in range(D):
    for word, topic in zip(documents[d], document_topics[d]):
        document_topic_counts[d][topic] += 1
        topic_word_counts[topic][word] += 1
        topic_counts[topic] += 1

for iter in range(1000):
    for d in range(D):
        for i, (word, topic) in enumerate(zip(documents[d],
                                              document_topics[d])):
            # remove this word / topic from the counts
            # so that it doesn't influence the weights
            document_topic_counts[d][topic] -= 1
            topic_word_counts[topic][word] -= 1
            topic_counts[topic] -= 1
            document_lengths[d] -= 1
            # choose a new topic based on the weights
            new_topic = choose_new_topic(d, word)
            document_topics[d][i] = new_topic
            # and now add it back to the counts
            document_topic_counts[d][new_topic] += 1
            topic_word_counts[new_topic][word] += 1
            topic_counts[new_topic] += 1
            document_lengths[d] += 1
In [93]:
# To interpret each topic, look at its most influential (highest-weight) words
for k, word_counts in enumerate(topic_word_counts):
    for word, count in word_counts.most_common():
        if count > 0: print(k, word, count)
0 pandas 2
0 scikit-learn 2
0 regression 1
0 statistics 1
0 artificial intelligence 1
0 Java 1
0 Big Data 1
0 Hadoop 1
0 statsmodels 1
0 HBase 1
0 libsvm 1
0 R 1
0 C++ 1
0 Haskell 1
1 neural networks 2
1 deep learning 2
1 databases 1
1 Postgres 1
1 numpy 1
1 MySQL 1
1 Cassandra 1
1 MongoDB 1
1 Mahout 1
1 Python 1
1 HBase 1
1 theory 1
1 decision trees 1
2 regression 2
2 Java 2
2 R 2
2 Python 2
2 Postgres 1
2 machine learning 1
2 statistics 1
2 artificial intelligence 1
2 MongoDB 1
2 HBase 1
2 Cassandra 1
2 mathematics 1
2 probability 1
2 statsmodels 1
2 C++ 1
2 scipy 1
3 Big Data 2
3 probability 2
3 machine learning 1
3 R 1
3 statistics 1
3 programming languages 1
3 NoSQL 1
3 libsvm 1
3 support vector machines 1
3 Spark 1
3 Python 1
3 MapReduce 1
3 Storm 1
In [96]:
# Looking at these words, assign names to the topics as follows
topic_names = ["Big Data and programming languages",
               "databases",
               "machine learning",
               "statistics"]
# From this we can see what each user's interests are
for document, topic_counts in zip(documents, document_topic_counts):
    print(document)
    for topic, count in topic_counts.most_common():
        if count > 0:
            print(topic_names[topic], count)
    print()
['Hadoop', 'Big Data', 'HBase', 'Java', 'Spark', 'Storm', 'Cassandra']
statistics 5
Big Data and programming languages 4
machine learning 4
['NoSQL', 'MongoDB', 'Cassandra', 'HBase', 'Postgres']
machine learning 5
databases 3
statistics 2
['Python', 'scikit-learn', 'scipy', 'numpy', 'statsmodels', 'pandas']
Big Data and programming languages 6
machine learning 4
databases 2
['R', 'Python', 'statistics', 'regression', 'probability']
machine learning 6
Big Data and programming languages 3
statistics 1
['machine learning', 'regression', 'decision trees', 'libsvm']
machine learning 3
statistics 3
databases 2
['Python', 'R', 'Java', 'C++', 'Haskell', 'programming languages']
Big Data and programming languages 5
machine learning 4
statistics 3
['statistics', 'probability', 'mathematics', 'theory']
databases 3
Big Data and programming languages 2
statistics 2
machine learning 1
['machine learning', 'scikit-learn', 'Mahout', 'neural networks']
databases 4
Big Data and programming languages 2
machine learning 2
['neural networks', 'deep learning', 'Big Data', 'artificial intelligence']
databases 5
Big Data and programming languages 2
statistics 1
['Hadoop', 'Java', 'MapReduce', 'Big Data']
Big Data and programming languages 5
statistics 2
machine learning 1
['statistics', 'R', 'statsmodels']
machine learning 4
Big Data and programming languages 2
['C++', 'deep learning', 'artificial intelligence', 'probability']
machine learning 5
Big Data and programming languages 1
databases 1
statistics 1
['pandas', 'R', 'Python']
machine learning 3
Big Data and programming languages 2
statistics 1
['databases', 'HBase', 'Postgres', 'MySQL', 'MongoDB']
databases 9
machine learning 1
['libsvm', 'regression', 'support vector machines']
machine learning 27
Big Data and programming languages 22
databases 14
statistics 10
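As a follow-up usage example (not part of the original notebook), the fitted model can also score a brand-new list of interests against the named topics; a minimal sketch, where new_interests is a hypothetical input:
# Hypothetical example (assumption): rank the named topics for an unseen interest list
# by summing each word's smoothed per-topic probability.
new_interests = ["Python", "regression", "probability"]          # hypothetical input
scores = {topic_names[k]: sum(p_word_given_topic(w, k) for w in new_interests)
          for k in range(K)}
print(sorted(scores.items(), key=lambda kv: -kv[1]))             # most likely topic first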
Content source: rnder/data-science-from-scratch