代码很详细,就不注释啦!有问题尽情留言,有问必答……
spider爬虫模块:
# -*- coding: utf-8 -*-
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from anjuke2.items import Anjuke2Item
from scrapy import Request
import time


class Anju2Spider(CrawlSpider):
    """Crawl sh.zu.anjuke.com and yield one ``Anjuke2Item`` per listing.

    Crawling starts at ``start_urls``; every link matched by ``page_link``
    is followed and its response is handed to :meth:`parse_item`.
    """

    name = 'anju2'
    allowed_domains = ['sh.zu.anjuke.com']
    start_urls = ['http://sh.zu.anjuke.com/']

    # Only links found under div.page-content/div are followed.
    page_link = LinkExtractor(restrict_xpaths='//div[@class="page-content"]/div/a')
    rules = (
        Rule(page_link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Extract the listings contained in one result page.

        Args:
            response: the downloaded page.

        Yields:
            Anjuke2Item: one populated item per listing whose fields could
            all be located. Listings with a missing field are skipped and
            reported through the spider logger.
        """
        # The city is a property of the page, not of a single listing, so it
        # is resolved once instead of once per listing.
        city_nodes = response.xpath('//div[@class="cityselect"]/div[1]/text()')
        if not city_nodes:
            self.logger.warning('city selector not found on %s', response.url)
            return
        city = city_nodes[0].extract().split(' ')[-1]

        infolist = response.xpath('//div[@class="list-content"]/div')
        # The first two and the last child <div> are skipped; presumably they
        # are not listings -- TODO confirm against the page markup.
        for house in infolist[2:-1]:
            try:
                # One query for the three text nodes of the info paragraphs.
                info_texts = house.xpath('.//div[@class="zu-info"]/p/text()').extract()

                item = Anjuke2Item()
                item['city'] = city
                item['name'] = house.xpath(
                    './/div[@class="zu-info"]/h3/a/text()')[0].extract()
                item['huxing'] = info_texts[0].split(' ')[-1]
                item['louceng'] = info_texts[2]
                item['mianji'] = info_texts[1]
                item['addrss'] = house.xpath(
                    './/div[@class="zu-info"]/address/a/text()')[0].extract()
                item['chuzufangshi'] = house.xpath(
                    './/div[@class="zu-info"]/p[2]/span[1]/text()')[0].extract()
                item['rent'] = house.xpath(
                    './/div[@class="zu-side"]/p/strong/text()')[0].extract()
            except IndexError as e:
                # A selector matched nothing: skip this listing only.
                self.logger.warning(
                    'skipping listing with a missing field on %s: %s',
                    response.url, e)
                continue
            yield item
pipeline管道模块:
# Write items to a local file
import json


class Anjuke2Pipeline(object):
    """Item pipeline that stores each item as one JSON line in ``上海.txt``."""

    def open_spider(self, spider):
        """Open (and truncate) the output file when the spider starts."""
        self.fp = open('上海.txt', mode='w', encoding='utf8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.fp.close()

    def process_item(self, item, spider):
        """Serialise *item* to JSON, write it as one line and return the item."""
        # ensure_ascii=False keeps non-ASCII text readable in the file.
        line = json.dumps(dict(item), ensure_ascii=False)
        self.fp.write(f'{line}\n')
        return item
# Write items to MongoDB
import pymongo


class mongodbPipeline(object):
    """Item pipeline that stores each item in MongoDB (``anjuke.zufang``)."""

    def open_spider(self, spider):
        """Create the MongoDB client when the spider starts."""
        self.client = pymongo.MongoClient(host='localhost', port=27017)

    def close_spider(self, spider):
        """Close the MongoDB client when the spider finishes."""
        self.client.close()

    def process_item(self, item, spider):
        """Insert *item* as one document and return the item unchanged."""
        db = self.client.anjuke
        clo = db.zufang
        # Collection.insert() was deprecated in PyMongo 3 and removed in
        # PyMongo 4; insert_one() is the supported single-document insert.
        clo.insert_one(dict(item))
        return item
# Write items to MySQL (via pymysql)
import pymysql


class mysqlPipeline(object):
    """Item pipeline that stores each item as one row of the ``zufang`` table."""

    def open_spider(self, spider):
        """Open the MySQL connection when the spider starts."""
        # pymysql.connect() has no ``pwd`` keyword; the password is passed
        # as ``password``.
        self.connect = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            user='root',
            password='123456',
            database='anjuke',
            charset='utf8',
        )

    def close_spider(self, spider):
        """Close the MySQL connection when the spider finishes."""
        self.connect.close()

    def process_item(self, item, spider):
        """Persist *item* and return it unchanged."""
        self.save_mysql(item)
        return item

    def save_mysql(self, item):
        """Insert *item* into ``zufang``; roll back and print the error on failure."""
        # Placeholders are filled in by the driver, so quotes or SQL
        # fragments inside the scraped text cannot alter the statement.
        sql = ('insert into zufang'
               '(city,title,huxing,louceng,mianji,addrss,chuzufangshi,rent) '
               'values(%s,%s,%s,%s,%s,%s,%s,%s)')
        params = (
            item['city'], item['name'], item['huxing'], item['louceng'],
            item['mianji'], item['addrss'], item['chuzufangshi'], item['rent'],
        )
        try:
            # The context manager closes the cursor even if execute() fails.
            with self.connect.cursor() as cursor:
                cursor.execute(sql, params)
            self.connect.commit()
        except Exception as e:
            print(e)
            self.connect.rollback()
# Write items to SQLite
from scrapy.utils.project import get_project_settings
import sqlite3


class sqllitPipeline(object):
    """Item pipeline that stores each item as one row of ``zufang`` in ``sql.db``."""

    def open_spider(self, spider):
        """Open the database and make sure the ``zufang`` table exists."""
        self.db = sqlite3.connect('sql.db')
        self.cur = self.db.cursor()
        # "if not exists" lets the spider be run again against an existing
        # sql.db instead of failing because the table is already there.
        sql1 = '''create table if not exists zufang(
            city char(50) not null,
            title char(50) not null,
            huxing char(50) not null,
            louceng char(50) not null,
            mianji char(50) not null,
            addrss char(50) not null,
            chuzufangshi char(50) not null,
            rent char(50) not null)'''
        self.cur.execute(sql1)

    def close_spider(self, spider):
        """Close the database when the spider finishes."""
        self.db.close()

    def process_item(self, item, spider):
        """Persist *item* and return it unchanged."""
        self.save_to_sqlite(item)
        return item

    def save_to_sqlite(self, item):
        """Insert *item* into ``zufang``; roll back and print the error on failure."""
        # "?" placeholders are bound by sqlite3, so quotes inside the scraped
        # text cannot break or alter the statement.
        sql = ('insert into zufang'
               '(city,title,huxing,louceng,mianji,addrss,chuzufangshi,rent) '
               'values(?,?,?,?,?,?,?,?)')
        params = (
            item['city'], item['name'], item['huxing'], item['louceng'],
            item['mianji'], item['addrss'], item['chuzufangshi'], item['rent'],
        )
        try:
            self.cur.execute(sql, params)
            self.db.commit()
        except Exception as e:
            print(e)
            self.db.rollback()
items模块:
import scrapy


class Anjuke2Item(scrapy.Item):
    """Container for the fields scraped from a single listing.

    Several field names read as pinyin; the glosses below are a reviewer's
    reading of the names -- confirm against the scraped data.
    """

    city = scrapy.Field()
    name = scrapy.Field()
    huxing = scrapy.Field()        # presumably the room layout
    louceng = scrapy.Field()       # presumably the floor
    mianji = scrapy.Field()        # presumably the area
    addrss = scrapy.Field()        # presumably the address (spelling kept: it is the field key)
    chuzufangshi = scrapy.Field()  # presumably the rental mode
    rent = scrapy.Field()
settings模块:
# Scrapy settings for the anjuke2 project.
BOT_NAME = 'anjuke2'

SPIDER_MODULES = ['anjuke2.spiders']
NEWSPIDER_MODULE = 'anjuke2.spiders'

# User-Agent header value sent with requests.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/68.0.3440.75 Safari/537.36'
)

# robots.txt is not honoured by this project.
ROBOTSTXT_OBEY = False

# Delay between downloads, in seconds.
DOWNLOAD_DELAY = 3

# Only the local-file pipeline is enabled; uncomment an entry to enable
# another storage backend.
ITEM_PIPELINES = {
    'anjuke2.pipelines.Anjuke2Pipeline': 300,
    # 'anjuke2.pipelines.mongodbPipeline': 301,
    # 'anjuke2.pipelines.mysqlPipeline': 302,
    # 'anjuke2.pipelines.sqllitPipeline': 303,
}