In [1]:
import os
from pprint import PrettyPrinter

from pymongo import MongoClient

In [2]:
# Connection string for the NLP MongoDB instance.
# The username/password must NOT be embedded in the notebook (they leak through
# version control and shared copies). Export the full URI before starting the
# kernel, e.g.
#   NLP_MONGO_URI="mongodb://<user>:<password>@10.60.90.121:27017/NLP"
# If the variable is unset, fall back to the same host/database without
# credentials; queries will then fail with an authorization error on a server
# that requires authentication.
uri = os.environ.get("NLP_MONGO_URI", "mongodb://10.60.90.121:27017/NLP")

In [3]:
# Build the client from the connection string in `uri`; the bare name on the
# last line makes the notebook show the client's repr as the cell output.
client = MongoClient(host=uri)
client


Out[3]:
MongoClient('10.60.90.121', 27017)

In [4]:
# Handle to the `wiki` collection inside the `NLP` database.
nlp_db = client['NLP']
wiki = nlp_db['wiki']

In [6]:
# Number of documents in the `wiki` collection.
# NOTE(review): Collection.count() is deprecated since PyMongo 3.7 and removed
# in 4.0; on a newer driver use wiki.estimated_document_count() instead --
# confirm the installed PyMongo version before switching.
wiki.count()


Out[6]:
12969173

隨便找一筆文件(不加任何查詢條件)


In [7]:
# No filter: fetch a single document, whichever one the server returns first.
wiki.find_one()


Out[7]:
{u'_id': ObjectId('56b44d59c799d54cdeafc29d'),
 u'identical': u'Computer accessibility',
 u'title': u'AccessibleComputing'}

In [8]:
# Baseline: latency of an unfiltered single-document fetch.
%timeit wiki.find_one()


1000 loops, best of 3: 1.65 ms per loop

找出 title 完全符合的一筆文件


In [9]:
# Equality match on the `title` field; returns the first matching document.
wiki.find_one({'title': 'HTTP web server'})


Out[9]:
{u'_id': ObjectId('56b46ddec799d54cdec139e2'),
 u'identical': u'Web server',
 u'title': u'HTTP web server'}

In [10]:
# Time the exact-title lookup for comparison with the unfiltered baseline.
# NOTE(review): the recorded timing is hundreds of times slower than the
# unfiltered fetch, which suggests `title` is not indexed -- confirm with
# wiki.index_information() before relying on this query interactively.
%timeit wiki.find_one({'title': 'HTTP web server'})


1 loops, best of 3: 731 ms per loop

In [11]:
# Pretty-print the complete "Deep learning" document. The printer is bound to
# `p` because a later cell reuses it.
p = PrettyPrinter()
deep_learning_doc = wiki.find_one({'title': 'Deep learning'})
p.pprint(deep_learning_doc)


{u'_id': ObjectId('56b51d5ac799d54cde2ea331'),
 u'categories': [u'Machine learning', u'Artificial neural networks'],
 u'related': [u'Student approaches to learning',
              u'machine learning',
              u'algorithm',
              u'linear transformation',
              u'learning representation',
              u'Vector space',
              u'Scale-invariant feature transform',
              u'Imperial College London',
              u'Feature',
              u'Unsupervised learning',
              u'Semi-supervised learning',
              u'feature learning',
              u'feature extraction',
              u'Springer Berlin Heidelberg',
              u'neuroscience',
              u'nervous system',
              u'neural coding',
              u'brain',
              u'#Deep neural networks',
              u'convolutional neural network',
              u'deep belief network',
              u'recurrent neural network',
              u'computer vision',
              u'automatic speech recognition',
              u'natural language processing',
              u'bioinformatics',
              u'buzzword',
              u'neural network',
              u'IEEE Spectrum',
              u'Nonlinear filter',
              u'Supervised learning',
              u'artificial neural network',
              u'propositional formula',
              u'greedy algorithm',
              u'supervised learning',
              u'feature engineering',
              u'Principal Component Analysis',
              u'unsupervised learning',
              u'Universal approximation theorem',
              u'Mathematics of Control, Signals, and Systems',
              u'Bayesian inference',
              u'feedforward neural networks',
              u'continuous functions',
              u'George Cybenko',
              u'sigmoid function',
              u'Probabilistic',
              u'optimization',
              u'training',
              u'test',
              u'generalization',
              u'cumulative distribution function',
              u'Deep belief network',
              u'dropout',
              u'Regularization',
              u'Geoff Hinton',
              u'Yoshua Bengio',
              u'Yann LeCun',
              u'Juergen Schmidhuber',
              u'artificial neural networks',
              u'Neocognitron',
              u'Kunihiko Fukushima',
              u'backpropagation',
              u'ZIP code',
              u'Brendan Frey',
              u'wake-sleep algorithm',
              u'Peter Dayan',
              u'Geoffrey Hinton',
              u'vanishing gradient problem',
              u'Sepp Hochreiter',
              u'Gabor filter',
              u'support vector machine',
              u'mixture model',
              u'Hidden Markov model',
              u'feedforward neural network',
              u'restricted Boltzmann machine',
              u'TIMIT',
              u'MNIST database',
              u'image classification',
              u'graphics processing unit',
              u'Nobel laureate',
              u'David H. Hubel',
              u'Torsten Wiesel',
              u'primary visual cortex',
              u'simple cell',
              u'complex cell',
              u'convolution',
              u'back-propagation',
              u'J\xfcrgen Schmidhuber',
              u'long short term memory',
              u'Rprop',
              u'feature detection',
              u'latent variable',
              u'Lower bound',
              u'log likelihood',
              u'generative model',
              u'Neural Computation',
              u'Google Brain',
              u'Andrew Ng',
              u'Jeff Dean',
              u'YouTube',
              u'GPU',
              u'IDSIA',
              u'NYU',
              u'object detection',
              u'Natural language processing',
              u'language model',
              u'acoustic model',
              u'discriminative model',
              u'stochastic gradient descent',
              u'loss function',
              u'Reinforcement learning',
              u'activation function',
              u'multiclass classification',
              u'softmax activation function',
              u'cross entropy',
              u'rectified linear unit',
              u'overfitting',
              u'weight decay',
              u'sparse matrix',
              u'Dropout',
              u'gradient descent',
              u'Local optimum',
              u'Hyperparameter optimization',
              u'Extreme Learning Machines',
              u'Weightless neural networks',
              u'Function composition',
              u'undirected graph',
              u'contrastive divergence',
              u'maximum likelihood',
              u'Gibbs sampling',
              u'DeepDream',
              u'non-stationary',
              u'Hebbian',
              u'cochlea',
              u'retina',
              u'Auditory cortex',
              u'Visual cortex',
              u'ReLU',
              u'sleep apnea',
              u'Markov random field',
              u'Graph',
              u'graphical model',
              u'Latent variable',
              u'random variables',
              u'Binary variable',
              u'Restricted Boltzmann machine',
              u'Object recognition',
              u'speech recognition',
              u'inference',
              u'Markov chain Monte Carlo',
              u'auto encoder',
              u'Linear classifier',
              u'Robustness',
              u'stochastic mapping',
              u'minimization algorithm',
              u'cross-entropy',
              u'logistic regression',
              u'convex optimization problem',
              u'Closed-form expression',
              u'Multilayer perceptron',
              u'Sigmoid function',
              u'Artificial neuron',
              u'Discriminative model',
              u'covariance',
              u'Convex optimization',
              u'bilinear map',
              u'tensor',
              u'Computer cluster',
              u'CPU',
              u'Statistical classification',
              u'regression analysis',
              u'Real number',
              u'bipartite graph',
              u'Prior probability',
              u'probability mass',
              u'Probability density',
              u'energy function',
              u'Conditional probability distribution',
              u'marginalizing out',
              u'Bayesian network',
              u'Deep Boltzmann Machines',
              u'Training set',
              u'degree of freedom',
              u'Hierarchical Bayesian model',
              u'statistics',
              u'cognitive science',
              u'hierarchical Dirichlet process',
              u'Log probability',
              u'Predictive modelling',
              u'Inference',
              u'Greedy algorithm',
              u'Markov chain',
              u'Convolutional neural network',
              u'kernel principal component analysis',
              u'Principal component analysis',
              u'dimensionality reduction',
              u'mutual information',
              u'K-nearest neighbor',
              u'validation set',
              u'#Deep stacking networks',
              u'Q-learning',
              u'reinforcement learning',
              u'Google DeepMind',
              u'Atari 2600',
              u'distributed representations',
              u'self-organizing map',
              u'sparse distributed memory',
              u'hierarchical temporal memory',
              u'content-addressable memory',
              u'encoder',
              u'Binary decoder',
              u'Long short-term memory',
              u'Recurrent neural network',
              u'Instance-based learning',
              u'nearest neighbour',
              u'k-nearest neighbors algorithm',
              u'Sparse distributed memory',
              u'Turing Machine',
              u'long-term memory',
              u'Facebook',
              u'question answering',
              u'random-access memory',
              u'computer architecture',
              u'Processor register',
              u'Arithmetic logic unit',
              u'Pointer',
              u'probability distribution',
              u'LSTM',
              u'Structured prediction',
              u'machine translation',
              u'Language model',
              u'dialect',
              u'American English',
              u'MIT Technology Review',
              u'negative sampling',
              u'word embedding',
              u'vector space',
              u'probabilistic context free grammar',
              u'Statistical parsing',
              u'sentiment analysis',
              u'pharmaceutical industry',
              u'chemical compounds',
              u'Biological target',
              u'biomolecules',
              u'cytotoxicity',
              u'NIH',
              u'FDA',
              u'National Center for Advancing Translational Sciences',
              u'virtual screening',
              u'Google',
              u'Stanford University',
              u'Customer relationship management',
              u'RFM',
              u'Customer lifetime value',
              u'autoencoder',
              u'Gene Ontology',
              u'brain development',
              u'cognitive neuroscientist',
              u'nerve growth factor',
              u'self-organization',
              u'transducer',
              u'New York Times',
              u'cognition',
              u'primate',
              u'Artificial general intelligence',
              u'artificial intelligence',
              u'Automatic image annotation',
              u'Vladimir Vapnik',
              u'DeepMind Technologies',
              u'Atari',
              u'Baidu',
              u'Gary Marcus',
              u'causality',
              u'Watson',
              u'deductive reasoning',
              u'behavioral modernity',
              u'consciousness',
              u'The Guardian',
              u'OpenCog',
              u'Ben Goertzel',
              u'grammar',
              u'commonsense reasoning',
              u'Production',
              u'Grammar induction',
              u'Torch',
              u'Theano',
              u'Deeplearning4j',
              u'OpenNN',
              u'Matlab',
              u'GNU Octave',
              u'Javascript',
              u'Gensim',
              u'TensorFlow',
              u'Julia',
              u'Sparse coding',
              u'Compressed Sensing',
              u'Connectionism',
              u'Self-organizing map',
              u'Applications of artificial intelligence',
              u'List of artificial intelligence projects',
              u'Reservoir computing',
              u'Liquid state machine',
              u'Echo state network'],
 u'title': u'Deep learning'}

使用 regular expression 查詢 title


In [12]:
# Regex search on `title`: matches any title containing "deep learning" with
# either capitalisation of the "D" and the "L" (the pattern is not anchored,
# so it may match anywhere inside the title).
# print() is called as a function so the cell runs on both Python 2 and
# Python 3; with a single argument the output is identical on both.
for item in wiki.find({'title': {'$regex': '[Dd]eep [Ll]earning'}}):
    print(item['title'])


Deep learning
Deep Learning
Google deep learning project

In [13]:
# Time collecting the titles of all regex matches into a list.
# NOTE(review): the pattern has no leading ^ anchor, so the server presumably
# has to test every title -- confirm, and prefer an anchored pattern or a text
# index if this needs to be fast.
%timeit a = [item['title'] for item in wiki.find({'title': {'$regex':'[Dd]eep [Ll]earning'}})]


1 loops, best of 3: 13.8 s per loop

Categories

用 title 查詢 Machine learning 會找到兩筆文件:一筆是 article,另一筆是 category。

category 那一筆會多一個欄位 `u'isCategory': 1.0`。


In [14]:
# Pretty-print every document whose title is exactly "Machine learning",
# using the PrettyPrinter `p` created in an earlier cell.
for doc in wiki.find({'title': 'Machine learning'}):
    p.pprint(doc)


{u'_id': ObjectId('56b5858fc799d54cde6ffe7e'),
 u'categories': [u'Machine learning', u'Learning', u'Cybernetics'],
 u'related': [u'computer science',
              u'pattern recognition',
              u'computational learning theory',
              u'artificial intelligence',
              u'algorithm',
              u'learning',
              u'data',
              u'Machine Learning',
              u'Mathematical model',
              u'computational statistics',
              u'mathematical optimization',
              u'spam filter',
              u'optical character recognition',
              u'IEEE Signal Processing Society',
              u'Learning to rank',
              u'computer vision',
              u'data mining',
              u'predictive analytics',
              u'predictive modelling',
              u'Arthur Samuel',
              u'Tom M. Mitchell',
              u'Operational definition',
              u'Alan Turing',
              u'Computing Machinery and Intelligence',
              u'Supervised learning',
              u'Map',
              u'Unsupervised learning',
              u'feature learning',
              u'Reinforcement learning',
              u'Autonomous car',
              u'semi-supervised learning',
              u'Transduction',
              u'support vector machine',
              u'linear classifier',
              u'learning to learn',
              u'inductive bias',
              u'Developmental robotics',
              u'robot learning',
              u'Statistical classification',
              u'multi-label classification',
              u'regression analysis',
              u'Cluster analysis',
              u'Density estimation',
              u'Probability distribution',
              u'Dimensionality reduction',
              u'Topic modeling',
              u'natural language',
              u'neural network',
              u'perceptron',
              u'ADALINE',
              u'generalized linear model',
              u'Probability theory',
              u'GOFAI',
              u'expert system',
              u'inductive logic programming',
              u'information retrieval',
              u'connectionism',
              u'John Hopfield',
              u'David Rumelhart',
              u'Geoff Hinton',
              u'backpropagation',
              u'probability theory',
              u'internet',
              u'Data mining',
              u'discovery',
              u'Knowledge discovery',
              u'ECML PKDD',
              u'loss function',
              u'statistics',
              u'Michael I. Jordan',
              u'data science',
              u'Leo Breiman',
              u'Random forest',
              u'Mehryar Mohri',
              u'MIT Press',
              u'theoretical computer science',
              u'bias\u2013variance decomposition',
              u'Time complexity',
              u'time complexity',
              u'statistical inference',
              u'decision tree',
              u'artificial neural network',
              u'biological neural networks',
              u'artificial neuron',
              u'computation',
              u'non-linear',
              u'statistical',
              u'data modeling',
              u'joint probability distribution',
              u'logic programming',
              u'Entailment',
              u'Inductive programming',
              u'supervised learning',
              u'statistical classification',
              u'unsupervised learning',
              u'data analysis',
              u'graphical model',
              u'random variables',
              u'conditional independence',
              u'directed acyclic graph',
              u'inference',
              u'principal components analysis',
              u'cluster analysis',
              u'Manifold learning',
              u'Sparse coding',
              u'Multilinear subspace learning',
              u'tensor',
              u'Deep learning',
              u'Recommendation systems',
              u'strongly NP-hard',
              u'K-SVD',
              u'Search algorithm',
              u'Heuristic',
              u'natural selection',
              u'Mutation',
              u'Crossover',
              u'Chromosome',
              u'evolutionary algorithm',
              u'Adaptive website',
              u'Affective computing',
              u'Bioinformatics',
              u'Brain-machine interfaces',
              u'Cheminformatics',
              u'DNA sequence',
              u'Computational finance',
              u'Computer vision',
              u'object recognition',
              u'credit card fraud',
              u'Strategy game',
              u'TD-Gammon',
              u'Communications of the ACM',
              u'Information retrieval',
              u'Internet fraud',
              u'Marketing',
              u'Machine perception',
              u'Diagnosis',
              u'Natural language processing',
              u'Mathematical optimization',
              u'metaheuristic',
              u'Online advertising',
              u'Recommender system',
              u'Robot locomotion',
              u'Search engines',
              u'Sentiment analysis',
              u'Sequence mining',
              u'Software engineering',
              u'Speech recognition',
              u'handwriting recognition',
              u'Stock market',
              u'Structural health monitoring',
              u'Syntactic pattern recognition',
              u'Economics',
              u'Finance',
              u'Netflix',
              u'Netflix Prize',
              u'AT&T Labs',
              u'Ensemble Averaging',
              u'ArXiv',
              u'Software suite',
              u'dlib',
              u'ELKI',
              u'Encog',
              u'H2o',
              u'Apache Mahout',
              u'mlpy',
              u'MLPACK',
              u'MOA',
              u'ND4J',
              u'Deeplearning4j',
              u'Numenta',
              u'OpenCV',
              u'OpenNN',
              u'Orange',
              u'R',
              u'scikit-learn',
              u'scikit-image',
              u'Shogun',
              u'TensorFlow',
              u'Torch',
              u'Apache Spark',
              u'Yooreeka',
              u'Weka',
              u'Mallet',
              u'KNIME',
              u'RapidMiner',
              u'Amazon Web Services',
              u'Angoss',
              u'Databricks',
              u'Google APIs',
              u'SPSS Modeler',
              u'KXEN Inc.',
              u'LIONsolver',
              u'Mathematica',
              u'MATLAB',
              u'Azure machine learning studio',
              u'Neural Designer',
              u'NeuroSolutions',
              u'Oracle Data Mining',
              u'RCASE',
              u'SAS',
              u'STATISTICA',
              u'Journal of Machine Learning Research',
              u'Neural Computation',
              u'Conference on Neural Information Processing Systems',
              u'International Conference on Machine Learning',
              u'International Conference on Learning Representations',
              u'Adaptive control',
              u'Adversarial machine learning',
              u'Automatic reasoning',
              u'Big data',
              u'Cache language model',
              u'Cognitive model',
              u'Cognitive science',
              u'Computational intelligence',
              u'Computational neuroscience',
              u'Data science',
              u'Ethics of artificial intelligence',
              u'Existential risk from advanced artificial intelligence',
              u'Explanation-based learning',
              u'List of important publications in computer science',
              u'List of machine learning algorithms',
              u'Trevor Hastie',
              u'Robert Tibshirani',
              u'Ryszard S. Michalski',
              u'Ray Solomonoff',
              u'Dartmouth Conferences',
              u'Andrew Ng',
              u'GNU Octave',
              u'Stanford University'],
 u'title': u'Machine learning'}
{u'_id': ObjectId('56b58de9c799d54cde73bd12'),
 u'categories': [u'Artificial intelligence', u'Learning'],
 u'isCategory': 1.0,
 u'related': [u'statistics', u'computer science'],
 u'title': u'Machine learning'}

In [ ]: