In [1]:
# Import Dependencies
import random
import re
import numpy as np
import nltk

In [2]:
# Training corpus for the n-gram model: a short English passage on global warming.
text = """Global warming, also referred to as climate change, is the observed century-scale rise in the average temperature of the Earth's climate system and its related effects. Multiple lines of scientific evidence show that the climate system is warming. Many of the observed changes since the 1950s are unprecedented in the instrumental temperature record, which extends back to the mid-19th century, and in paleoclimate proxy records of climate change over thousands of years."""

In [3]:
# N-gram model order: the number of consecutive tokens joined to form each
# n-gram key, i.e. the size of the context window used to pick the next token.
n = 3

In [4]:
# Lookup table filled in below: n-gram string (n tokens joined by single
# spaces) -> list of the tokens observed immediately after that n-gram.
ngrams = {}

In [5]:
# Tokenize the corpus with NLTK. As the displayed output shows, punctuation
# marks and the possessive "'s" come out as separate tokens.
words = nltk.word_tokenize(text)

In [6]:
# Display the token list produced by the tokenizer.
words


Out[6]:
['Global',
 'warming',
 ',',
 'also',
 'referred',
 'to',
 'as',
 'climate',
 'change',
 ',',
 'is',
 'the',
 'observed',
 'century-scale',
 'rise',
 'in',
 'the',
 'average',
 'temperature',
 'of',
 'the',
 'Earth',
 "'s",
 'climate',
 'system',
 'and',
 'its',
 'related',
 'effects',
 '.',
 'Multiple',
 'lines',
 'of',
 'scientific',
 'evidence',
 'show',
 'that',
 'the',
 'climate',
 'system',
 'is',
 'warming',
 '.',
 'Many',
 'of',
 'the',
 'observed',
 'changes',
 'since',
 'the',
 '1950s',
 'are',
 'unprecedented',
 'in',
 'the',
 'instrumental',
 'temperature',
 'record',
 ',',
 'which',
 'extends',
 'back',
 'to',
 'the',
 'mid-19th',
 'century',
 ',',
 'and',
 'in',
 'paleoclimate',
 'proxy',
 'records',
 'of',
 'climate',
 'change',
 'over',
 'thousands',
 'of',
 'years',
 '.']

In [7]:
# Build the n-gram table from the token list.
# Each key is a window of n consecutive tokens joined by single spaces; its
# value lists, in order of occurrence, every token that directly followed that
# window in `words` (repeats are kept).
# The table is rebuilt from scratch here so that re-running this cell does not
# append every successor a second time to an already populated dict.
ngrams = {}
# Stop n tokens before the end: the last window must still have a successor.
for i in range(len(words) - n):
    gram = ' '.join(words[i:i + n])
    ngrams.setdefault(gram, []).append(words[i + n])

In [8]:
# Display the n-gram -> successor-tokens table.
ngrams


Out[8]:
{'Global warming ,': ['also'],
 'warming , also': ['referred'],
 ', also referred': ['to'],
 'also referred to': ['as'],
 'referred to as': ['climate'],
 'to as climate': ['change'],
 'as climate change': [','],
 'climate change ,': ['is'],
 'change , is': ['the'],
 ', is the': ['observed'],
 'is the observed': ['century-scale'],
 'the observed century-scale': ['rise'],
 'observed century-scale rise': ['in'],
 'century-scale rise in': ['the'],
 'rise in the': ['average'],
 'in the average': ['temperature'],
 'the average temperature': ['of'],
 'average temperature of': ['the'],
 'temperature of the': ['Earth'],
 'of the Earth': ["'s"],
 "the Earth 's": ['climate'],
 "Earth 's climate": ['system'],
 "'s climate system": ['and'],
 'climate system and': ['its'],
 'system and its': ['related'],
 'and its related': ['effects'],
 'its related effects': ['.'],
 'related effects .': ['Multiple'],
 'effects . Multiple': ['lines'],
 '. Multiple lines': ['of'],
 'Multiple lines of': ['scientific'],
 'lines of scientific': ['evidence'],
 'of scientific evidence': ['show'],
 'scientific evidence show': ['that'],
 'evidence show that': ['the'],
 'show that the': ['climate'],
 'that the climate': ['system'],
 'the climate system': ['is'],
 'climate system is': ['warming'],
 'system is warming': ['.'],
 'is warming .': ['Many'],
 'warming . Many': ['of'],
 '. Many of': ['the'],
 'Many of the': ['observed'],
 'of the observed': ['changes'],
 'the observed changes': ['since'],
 'observed changes since': ['the'],
 'changes since the': ['1950s'],
 'since the 1950s': ['are'],
 'the 1950s are': ['unprecedented'],
 '1950s are unprecedented': ['in'],
 'are unprecedented in': ['the'],
 'unprecedented in the': ['instrumental'],
 'in the instrumental': ['temperature'],
 'the instrumental temperature': ['record'],
 'instrumental temperature record': [','],
 'temperature record ,': ['which'],
 'record , which': ['extends'],
 ', which extends': ['back'],
 'which extends back': ['to'],
 'extends back to': ['the'],
 'back to the': ['mid-19th'],
 'to the mid-19th': ['century'],
 'the mid-19th century': [','],
 'mid-19th century ,': ['and'],
 'century , and': ['in'],
 ', and in': ['paleoclimate'],
 'and in paleoclimate': ['proxy'],
 'in paleoclimate proxy': ['records'],
 'paleoclimate proxy records': ['of'],
 'proxy records of': ['climate'],
 'records of climate': ['change'],
 'of climate change': ['over'],
 'climate change over': ['thousands'],
 'change over thousands': ['of'],
 'over thousands of': ['years'],
 'thousands of years': ['.']}

In [9]:
# Generate text by walking the n-gram table, seeded with the first n tokens of
# the corpus. At most 30 tokens are appended; generation stops early when the
# current window has no recorded successor.
#
# The generated tokens are tracked in a list rather than re-tokenizing the
# growing result string on every step. That keeps the sliding window tied to
# the generated sequence itself (not to the corpus length) and avoids relying
# on joined tokens tokenizing back to the same tokens.
generated_tokens = list(words[0:n])
current_gram = ' '.join(generated_tokens)

for _ in range(30):
    if current_gram not in ngrams:
        break
    # Uniform pick over the recorded successors; repeated successors in the
    # list are therefore proportionally more likely to be chosen.
    next_item = random.choice(ngrams[current_gram])
    generated_tokens.append(next_item)
    # Slide the window to the last n generated tokens.
    current_gram = ' '.join(generated_tokens[len(generated_tokens) - n:])

result = ' '.join(generated_tokens)

In [10]:
# Display the generated text (tokens joined by single spaces).
result


Out[10]:
"Global warming , also referred to as climate change , is the observed century-scale rise in the average temperature of the Earth 's climate system and its related effects . Multiple lines of"