Storing Data with a MongoDB Pipeline in Scrapy
Only the code is given here; the logic itself is quite simple.
The only extra package you need to install is pymongo.
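It can be installed with pip:

pip install pymongo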
import pymongo
from scrapy.exceptions import DropItem


class MongoDBPipeline(object):
    def __init__(self, mongo_server, mongo_port, mongo_db, mongo_collection):
        self.mongo_server = mongo_server
        self.mongo_port = mongo_port
        self.mongo_db = mongo_db
        self.mongo_collection = mongo_collection

    @classmethod
    def from_crawler(cls, crawler):
        # Pull the MongoDB connection settings from settings.py
        return cls(
            mongo_server=crawler.settings.get('MONGO_SERVER'),
            mongo_port=crawler.settings.getint('MONGO_PORT'),
            mongo_db=crawler.settings.get('MONGO_DB'),
            mongo_collection=crawler.settings.get('MONGO_COLLECTION'),
        )

    def open_spider(self, spider):
        # Connect once when the spider starts
        self.client = pymongo.MongoClient(self.mongo_server, self.mongo_port)
        self.db = self.client[self.mongo_db]
        self.collection = self.mongo_collection

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # Drop the item if any of its fields is empty
        for field in item:
            if not item[field]:
                raise DropItem("Missing {0}!".format(field))
        # url and title are parallel lists collected by the spider,
        # so each url/title pair is stored as its own document
        for n in range(len(item['title'])):
            self.db[self.collection].insert_one({
                'url': item['url'][n],
                'title': item['title'][n],
                'create_time': item['create_time'],
            })
            # Alternative: store the whole item as a single document
            # self.db[self.collection].insert_one(dict(item))
        spider.logger.debug("Question added to MongoDB database!")
        return item
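For reference, process_item above assumes the item's url and title fields hold parallel lists (one entry per scraped link) plus a single create_time value. A minimal sketch of such an item definition (the class name NewsItem is hypothetical; only the field names come from the pipeline code):

import scrapy


class NewsItem(scrapy.Item):
    url = scrapy.Field()          # list of urls, paired index-by-index with title
    title = scrapy.Field()        # list of titles
    create_time = scrapy.Field()  # single timestamp shared by the batch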
Also, don't forget to enable the pipeline: add the following to the project's settings file (settings.py):
ITEM_PIPELINES = {
    'apple.pipelines.MongoDBPipeline': 200,
}
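The from_crawler hook above also reads four MONGO_* settings, so define them in settings.py as well. The setting names come from the pipeline code; the values below (a local MongoDB instance and example database/collection names) are only placeholders:

MONGO_SERVER = 'localhost'   # example host; point this at your MongoDB server
MONGO_PORT = 27017           # MongoDB's default port
MONGO_DB = 'apple'           # example database name
MONGO_COLLECTION = 'news'    # example collection name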
Copyright Notice
The Gowhich blog, created and maintained by durban, is licensed under a Creative Commons Attribution-NonCommercial-NoDerivatives 4.0 International License.
This article was first published on the Gowhich blog (https://www.gowhich.com). All rights reserved; infringement will be pursued.
Permanent link to this article: https://www.gowhich.com/blog/624