notebook.community

Edit and run



In [5]:

    
# -*- coding: utf-8 -*- 
import requests
import re
import urlparse
import time
import csv
import json 
import codecs
# In this example we're trying to collect e-mail addresses from a website

# Basic e-mail regexp:
# letter/number/dot/comma @ letter/number/dot/comma . letter/number
# email_re = re.compile(r'([\w\.,]+@[\w\.,]+\.\w+)')

# HTML <a> regexp
# Matches href="" attribute
# link_re = re.compile(r'href="(.*?)"')

import csv, StringIO



In [15]:

    
import psycopg2

try:
    conn = psycopg2.connect("dbname='naver_map' user='dbuser' host='localhost' password='dbpass'")
except Exception as error:
    print error









    



FATAL:  role "dbuser" does not exist



In [6]:

    
class UnicodeWriter(object):
    
    def __init__(self, f, dialect=csv.excel_tab, encoding="utf-16", **kwds):
        # Redirect output to a queue
        self.queue = StringIO.StringIO()
        self.writer = csv.writer(self.queue, dialect=dialect, **kwds)
        self.stream = f
        self.encoding = encoding
    
    def writerow(self, row):
        # Modified from original: now using unicode(s) to deal with e.g. ints
        self.writer.writerow([unicode(s).encode("utf-8") for s in row])
        # Fetch UTF-8 output from the queue ...
        data = self.queue.getvalue()
        data = data.decode("utf-8")
        # ... and reencode it into the target encoding
        data = data.encode(self.encoding)
        # write to the target stream
        self.stream.write(data)
        # empty queue
        self.queue.truncate(0)
    
    def writerows(self, rows):
        for row in rows:
            self.writerow(row)



In [7]:

    
class UnicodeDictWriter(UnicodeWriter):
    def __init__(self, f, fields, dialect=csv.excel_tab,
            encoding="utf-16", **kwds):
        super(UnicodeDictWriter, self).__init__(f, dialect, encoding, **kwds)
        self.fields = fields
    
    def writerow(self, drow):
        row = [drow.get(field, '') for field in self.fields]
        super(UnicodeDictWriter, self).writerow(row)



In [4]:

    
def crawl():
    base_url = 'http://map.naver.com/search2/getSiteInfo.nhn?id=p'
    headers = {'Accept':'application/json, text/javascript, */*; q=0.01',
    'Cookie':'npic=IhHf0sIV2sNKzQWajD4MaBwbICouHNRqL1nNOqO2UonEKdRkaylWM4W4CeiWb7PVCA==; NNB=5FIFGLS5NDWFG; WMONID=ZzV3T54OUA2; nsr_acl=1; BMR=s=1431578263011&r=http%3A%2F%2Ftvcast.naver.com%2Fv%2F368791%2Flist%2F33215&r2=http%3A%2F%2Ftvcast.naver.com%2Fv%2F366519; nid_inf=611665233; NID_AUT=9QZuR23S9LQ7QBjvLANeSh0M1EZ3+agZ/9aMV50PEtWhNBjpjMnae3Nn+cp9dARD; NID_SES=AAABThR1q17xHlztdAbeV9JamitaH1MHMr5F8HnuchL0JKOFaYBhzkycsGNFKqKnSfG3eYZeycNv28ELaNO9ueFAJC5fGjcw0mK3uxd5UInjv+nB7/2FmJGxCBIkoZZ8RiJNxea7RL6DVV/4xWkjwX0ym8I6zDf2nt4b8JjIlvG3S4SHzTZqb7R2lXEMrdy0ixXq1+zUm3EtOp/UC/NS5wxcX513F0gl8TNtEzEbjpD4oKBw8aIYj8+QQ+//CxkK3XItDBagxXMZ4F33AEK9XrK6iNVdI4+9fWM3fDphClhgy5K0M/8bvgajppPyt39fOEFpsV3pEQuMSkatFm7ioS52hdhKhVDI3oQXDPar37Yy41svh7lGvF46XHXr6GWuwrndUahdrKtC0FJ5iWBr9HcgmImq+FZ8J9r9qCMAV8/ecUjxzP1Q6nNFfskzlwUru56+Ow==; ns_load_time=1431690925387; ns_sid=45dc8c80-faf9-11e4-823e-836f0b3835a6; JSESSIONID=918A0B13C8717968F0C70A2B5FADFD9E; page_uid=SSUPrwpyLjRssa60D5RssssssZw-166244; _naver_usersession_=sMJVm/9grSbKBn0vg2lFzg==',
    'Host':'map.naver.com',
    'Referer':'http://map.naver.com/',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.135 Safari/537.36',
    'X-Requested-With':'XMLHttpRequest'}

    outfile = open('map_data19000.json','a')
    start = time.time()

    for idx in range(10000):

        url = base_url+str(idx+29000)
        # print url
        res = requests.get(url,headers=headers)
        
        if idx%1000 == 0:
            print idx+29000
            print res.json()
        try:

            if 'result' in res.json():
                map_dict = {}
                result = res.json()['result']
                json.dump(result, outfile)
                outfile.write(',')
            elif 'error' in res.json():
                pass
            else:
                # print res.json()['error']['msg']
                pass
        except:
            print res.text
            break
    # Check if successful
    if(res.status_code != 200):
        return []

    end = time.time()
    print end - start


# crawl()
# map_json = readJson()

def readJson():
    with open('map_data.json', 'wr') as f:
        f = '['+f.read()[:-1]+']'
        data = json.loads(f)
        return data

# import ijson

# for prefix, theType, value in ijson.parse(open('map_data.json')):
#     print prefix, theType, value




map_data = readJson()
for each in map_data:
   print each['name'], each['category'],'x:',each['x'], 'y:',each['y'], each['address'],each['phone']

# map_dict['id'] = result['id']
            # map_dict['isSite'] = result['isSite']
            # map_dict['name'] = result['name']
            # map_dict['type'] = result['type']
            # map_dict['x'] = result['x']
            # map_dict['y'] = result['y']
            # map_dict['posExact']= result['posExact']
            # map_dict['address']= result['address']
            # map_dict['roadAddr']= result['roadAddr']
            # map_dict['phone']= result['phone']
            # map_dict['isCallLink']= result['isCallLink']
            # map_dict['displayHomepage']= result['displayHomepage']
            # map_dict['landingHomepage']= result['landingHomepage']
            # map_dict['ppc']= result['ppc']
            # map_dict['ktCallMd']= result['ktCallMd']
            # map_dict['couponUrl']= result['couponUrl']
            # map_dict['imageURL']= result['imageURL']
            # map_dict['imageModDate']= result['imageModDate']
            # map_dict['point']= result['point']
            # map_dict['starPoint']= result['starPoint']
            # map_dict['pointCount']= result['pointCount']
            # map_dict['reviewScoreDisplay']= result['reviewScoreDisplay']
            # map_dict['description']= result['description']
            # map_dict['reviewCount']= result['reviewCount']
            # map_dict['reviewDisplay']= result['reviewDisplay']
            # map_dict['category']= result['category']
            # map_dict['streetPanorama']= result['streetPanorama']
            # map_dict['skyPanorama']= result['skyPanorama']
            # map_dict['insidePanorama']= result['insidePanorama']
            # map_dict['interiorPanorama']= result['interiorPanorama']
            # map_dict['bizHour']= result['bizHour']
            # map_dict['options']= result['options']
            # map_dict['indoor']= result['indoor']
            # map_dict['poiInfo']= result['poiInfo']
            # map_dict['itemMetaData']= result['itemMetaData']
            # map_dict['subwayId']= result['subwayId']
            # map_dict['endPageUrl']= result['endPageUrl']
            # map_dict['entranceCoords']= result['entranceCoords']
            # map_dict['movieInfo']= result['movieInfo']
            # map_dict['petrolInfo']= result['petrolInfo']
            # map_dict['theme']= result['theme']
            # map_dict['hasCardBenefit']= result['hasCardBenefit']
            # map_dict['isPollingPlace']= result['isPollingPlace']
            # map_dict['menuExist']= result['menuExist']
            # map_dict['isDeadUrl']= result['isDeadUrl']

            # w = UnicodeWriter(outfile, map_dict.items())
            # w.writerow(map_dict)









    



---------------------------------------------------------------------------
IOError                                   Traceback (most recent call last)
<ipython-input-4-95c0c589eb45> in <module>()
    139 
    140 
--> 141 map_data = readJson()
    142 for each in map_data:
    143    print each['name'], each['category'],'x:',each['x'], 'y:',each['y'], each['address'],each['phone']

<ipython-input-4-95c0c589eb45> in readJson()
    127 def readJson():
    128     with open('map_data.json', 'wr') as f:
--> 129         f = '['+f.read()[:-1]+']'
    130         data = json.loads(f)
    131         return data

IOError: [Errno 9] Bad file descriptor



In [ ]: