本文实例讲述了scrapy自定义pipeline类实现将采集数据保存到mongodb的方法。分享给大家供大家参考。具体如下:
# Standard Python library imports
# 3rd party modules
import pymongo
from scrapy import log
from scrapy.conf import settings
from scrapy.exceptions import DropItem
class MongoDBPipeline(object):
    """Scrapy item pipeline that stores each scraped item in MongoDB.

    Connection parameters are read from the Scrapy settings object under
    the keys ``MONGODB_SERVER``, ``MONGODB_PORT``, ``MONGODB_DB`` and
    ``MONGODB_COLLECTION``.
    """

    def __init__(self):
        """Read the MongoDB settings and open the target collection."""
        self.server = settings['MONGODB_SERVER']
        self.port = settings['MONGODB_PORT']
        self.db = settings['MONGODB_DB']
        self.col = settings['MONGODB_COLLECTION']
        # ``pymongo.Connection`` was removed in PyMongo 3.0; ``MongoClient``
        # is its replacement. The client is kept on the instance so that it
        # can be closed when the spider finishes.
        self.client = pymongo.MongoClient(self.server, self.port)
        self.collection = self.client[self.db][self.col]

    def close_spider(self, spider):
        """Close the MongoDB client when the spider is closed."""
        self.client.close()

    def process_item(self, item, spider):
        """Validate ``item`` and insert it into the MongoDB collection.

        Args:
            item: The scraped item (a mapping of field name to value).
            spider: The spider that produced the item; passed to the logger.

        Returns:
            The unchanged ``item``, so that later pipelines can process it.

        Raises:
            DropItem: If any field of the item holds a falsy value. The
                message lists every such field, one per line.
        """
        missing = [field for field, data in item.items() if not data]
        if missing:
            # ``get`` instead of ``item['url']``: an unset ``url`` must not
            # turn the intended DropItem into a KeyError.
            url = item.get('url')
            raise DropItem(''.join(
                'Missing %s of poem from %s\n' % (field, url)
                for field in missing
            ))

        # ``insert`` is deprecated since PyMongo 3.0 and removed in 4.0.
        self.collection.insert_one(dict(item))
        log.msg('Item written to MongoDB database %s/%s' % (self.db, self.col),
                level=log.DEBUG, spider=spider)
        return item
希望本文所述对大家的python程序设计有所帮助。