python

超轻量级php框架startmvc

scrapy爬虫完整实例

更新时间:2020-05-19 15:18:01 作者:startmvc
本文主要通过实例介绍了scrapy框架的使用,分享了两个例子:爬豆瓣文本例程 douban 和图片例程 douban_imgs。

本文主要通过实例介绍了scrapy框架的使用,分享了两个例子,爬豆瓣文本例程 douban 和图片例程 douban_imgs ,具体如下。

例程1: douban

目录树


douban
--douban
 --spiders
 --__init__.py
 --bookspider.py
 --douban_comment_spider.py
 --doumailspider.py
 --__init__.py
 --items.py
 --pipelines.py
 --settings.py
--scrapy.cfg

spiders/__init__.py


# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

bookspider.py


# -*- coding:utf-8 -*-
'''by sudo rm -rf http://imchenkun.com'''
import scrapy
from douban.items import DoubanBookItem


class BookSpider(scrapy.Spider):
    """Crawl the Douban "Top 250" book list and yield one item per book.

    ``parse`` handles the entry page and fans out to the other result pages
    linked from the paginator; ``parse_next`` extracts the book rows of a
    single result page.
    """

    name = 'douban-book'
    allowed_domains = ['douban.com']
    start_urls = [
        'https://book.douban.com/top250'
    ]

    def parse(self, response):
        """Extract the first page in place and schedule the remaining pages.

        The entry page has already been downloaded, so its rows are parsed
        directly instead of requesting the same URL a second time.
        """
        # First page: reuse the response we already hold.
        for book in self.parse_next(response):
            yield book

        # Other pages: follow every link that is a direct child of the paginator.
        for href in response.xpath('//div[@class="paginator"]/a/@href').extract():
            # urljoin leaves absolute links untouched and resolves relative ones.
            yield scrapy.Request(response.urljoin(href), callback=self.parse_next)

    def parse_next(self, response):
        """Yield a ``DoubanBookItem`` for every complete book row on a page.

        A row that lacks its title, description line or rating is logged and
        skipped, so one malformed row no longer aborts the rest of the page
        with an ``IndexError``.
        """
        for row in response.xpath('//tr[@class="item"]'):
            name = row.xpath('td[2]/div[1]/a/@title').extract_first()
            content = row.xpath('td[2]/p/text()').extract_first()
            ratings = row.xpath('td[2]/div[2]/span[2]/text()').extract_first()
            if name is None or content is None or ratings is None:
                self.logger.warning('Skipping incomplete book row on %s', response.url)
                continue

            book = DoubanBookItem()
            book['name'] = name
            book['content'] = content
            book['ratings'] = ratings
            yield book

douban_comment_spider.py


# -*- coding:utf-8 -*-
import scrapy
from faker import Factory
from douban.items import DoubanMovieCommentItem
import urlparse
# Module-level Faker generator; the spider's headers call f.user_agent() on it.
f = Factory.create()


class MailSpider(scrapy.Spider):
    """Log in to Douban and collect the reviews of one movie.

    Flow: fetch the login form -> submit the credentials (asking the operator
    to solve a captcha when one is shown) -> walk the paginated review list of
    movie ``22266320`` -> open every review and yield a
    ``DoubanMovieCommentItem``.

    NOTE: the captcha branch relies on ``raw_input`` and the ``urlparse``
    module, i.e. this spider targets Python 2. The ``print`` calls use the
    single-argument parenthesised form, which behaves the same on Python 2
    and Python 3.
    """

    name = 'douban-comment'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    # Browser-like headers sent with every request of this spider.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    # Login form template; replace the two placeholders with real credentials.
    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        """Start the crawl by requesting the login page in cookie jar 1."""
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form, prompting for the captcha when one is shown."""
        # Work on a copy so the class-level template is never mutated.
        formdata = dict(self.formdata)

        # A captcha has to be solved by a human: print the image link and read
        # the answer from stdin. Looking for the <img> itself (instead of a
        # substring of the raw body) works for both str and bytes bodies.
        link = response.xpath('//img[@class="captcha_image"]/@src').extract_first()
        if link:
            print('Copy the link:')
            print(link)
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            formdata['captcha-solution'] = captcha_solution
            formdata['captcha-id'] = captcha_id

        return [scrapy.FormRequest.from_response(response,
                                                 formdata=formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login)]

    def after_login(self, response):
        """Request the first page of the movie's review list."""
        print(response.status)
        # Rebind on the instance instead of editing the shared class-level dict.
        self.headers = dict(self.headers, Host="www.douban.com")
        # NOTE(review): the review pages are served from movie.douban.com while
        # the Host header says www.douban.com - confirm the server accepts it.
        #
        # One request per list page is enough: parse_next_page extracts the
        # reviews of the page it receives and then follows the "next" link.
        yield scrapy.Request(url='https://movie.douban.com/subject/22266320/reviews',
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page)

    def parse_next_page(self, response):
        """Harvest the reviews of this list page, then follow the "next" link."""
        print(response.status)
        for request in self.parse_comment_url(response):
            yield request

        next_href = response.xpath('//span[@class="next"]/a/@href').extract_first()
        if next_href is None:
            # No "next" link on this page: pagination ends here.
            print("Next page Error")
            return

        next_url = response.urljoin(next_href)
        print("下一页")
        print(next_url)
        yield scrapy.Request(url=next_url,
                             meta={'cookiejar': response.meta['cookiejar']},
                             headers=self.headers,
                             callback=self.parse_next_page)

    def parse_comment_url(self, response):
        """Yield one request per review linked from a review-list page."""
        print(response.status)
        for item in response.xpath('//div[@class="main review-item"]'):
            comment_url = item.xpath('header/h3[@class="title"]/a/@href').extract_first()
            comment_title = item.xpath('header/h3[@class="title"]/a/text()').extract_first()
            if comment_url is None:
                # Entry without a link: nothing to follow.
                continue
            print(comment_title)
            print(comment_url)
            yield scrapy.Request(url=comment_url,
                                 meta={'cookiejar': response.meta['cookiejar']},
                                 headers=self.headers,
                                 callback=self.parse_comment)

    def parse_comment(self, response):
        """Build a ``DoubanMovieCommentItem`` from a single review page.

        The XPaths below start with ``//``, so they are evaluated against the
        whole document rather than relative to ``item``.
        """
        print(response.status)
        # The review body is located differently depending on data-original.
        body_xpaths = {
            '0': '//div[@id="link-report"]/div/p/text()',
            '1': '//div[@id="link-report"]/div[1]/text()',
        }
        for item in response.xpath('//div[@id="content"]'):
            comment = DoubanMovieCommentItem()
            comment['useful_num'] = item.xpath('//div[@class="main-panel-useful"]/button[1]/text()').extract()[0].strip()
            comment['no_help_num'] = item.xpath('//div[@class="main-panel-useful"]/button[2]/text()').extract()[0].strip()
            comment['people'] = item.xpath('//span[@property="v:reviewer"]/text()').extract()[0]
            comment['people_url'] = item.xpath('//header[@class="main-hd"]/a[1]/@href').extract()[0]
            comment['star'] = item.xpath('//header[@class="main-hd"]/span[1]/@title').extract()[0]

            data_type = item.xpath('//div[@id="link-report"]/div/@data-original').extract()[0]
            print("data_type: " + data_type)
            body_xpath = body_xpaths.get(data_type)
            if body_xpath is not None:
                # Text fragments are stripped and joined with a visible separator.
                comment['comment'] = "\t#####\t".join(
                    text.strip() for text in item.xpath(body_xpath).extract())
            comment['title'] = item.xpath('//span[@property="v:summary"]/text()').extract()[0]
            comment['comment_page_url'] = response.url
            yield comment

doumailspider.py


# -*- coding:utf-8 -*-
'''by sudo rm -rf http://imchenkun.com'''
import scrapy
from faker import Factory
from douban.items import DoubanMailItem
import urlparse
# Module-level Faker generator; the spider's headers call f.user_agent() on it.
f = Factory.create()


class MailSpider(scrapy.Spider):
    """Log in to Douban and list the messages of the account's doumail inbox.

    Flow: fetch the login form -> submit the credentials (asking the operator
    to solve a captcha when one is shown) -> open the doumail page and yield a
    ``DoubanMailItem`` per listed message.

    NOTE: the captcha branch relies on ``raw_input`` and the ``urlparse``
    module, i.e. this spider targets Python 2. The ``print`` calls use the
    single-argument parenthesised form, which behaves the same on Python 2
    and Python 3.
    """

    name = 'douban-mail'
    allowed_domains = ['accounts.douban.com', 'douban.com']
    start_urls = [
        'https://www.douban.com/'
    ]

    # Browser-like headers sent with every request of this spider.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        'Connection': 'keep-alive',
        'Host': 'accounts.douban.com',
        'User-Agent': f.user_agent()
    }

    # Login form template; replace the two placeholders with real credentials.
    formdata = {
        'form_email': '你的邮箱',
        'form_password': '你的密码',
        # 'captcha-solution': '',
        # 'captcha-id': '',
        'login': '登录',
        'redir': 'https://www.douban.com/',
        'source': 'None'
    }

    def start_requests(self):
        """Start the crawl by requesting the login page in cookie jar 1."""
        return [scrapy.Request(url='https://www.douban.com/accounts/login',
                               headers=self.headers,
                               meta={'cookiejar': 1},
                               callback=self.parse_login)]

    def parse_login(self, response):
        """Submit the login form, prompting for the captcha when one is shown."""
        # Work on a copy so the class-level template is never mutated.
        formdata = dict(self.formdata)

        # A captcha has to be solved by a human: print the image link and read
        # the answer from stdin. Looking for the <img> itself (instead of a
        # substring of the raw body) works for both str and bytes bodies.
        link = response.xpath('//img[@class="captcha_image"]/@src').extract_first()
        if link:
            print('Copy the link:')
            print(link)
            captcha_solution = raw_input('captcha-solution:')
            captcha_id = urlparse.parse_qs(urlparse.urlparse(link).query, True)['id']
            formdata['captcha-solution'] = captcha_solution
            formdata['captcha-id'] = captcha_id

        return [scrapy.FormRequest.from_response(response,
                                                 formdata=formdata,
                                                 headers=self.headers,
                                                 meta={'cookiejar': response.meta['cookiejar']},
                                                 callback=self.after_login)]

    def after_login(self, response):
        """Request the doumail inbox using the logged-in cookie jar."""
        print(response.status)
        # Rebind on the instance instead of editing the shared class-level dict.
        self.headers = dict(self.headers, Host="www.douban.com")
        # Returned as a list, like the other request-producing callbacks here.
        return [scrapy.Request(url='https://www.douban.com/doumail/',
                               meta={'cookiejar': response.meta['cookiejar']},
                               headers=self.headers,
                               callback=self.parse_mail)]

    def parse_mail(self, response):
        """Yield a ``DoubanMailItem`` for every entry of the doumail list."""
        print(response.status)
        for item in response.xpath('//div[@class="doumail-list"]/ul/li'):
            mail = DoubanMailItem()
            mail['sender_time'] = item.xpath('div[2]/div/span[1]/text()').extract()[0]
            mail['sender_from'] = item.xpath('div[2]/div/span[2]/text()').extract()[0]
            mail['url'] = item.xpath('div[2]/p/a/@href').extract()[0]
            mail['title'] = item.xpath('div[2]/p/a/text()').extract()[0]
            print(mail)
            yield mail

__init__.py

(此文件内无代码)

items.py


# -*- coding: utf-8 -*-
import scrapy


class DoubanBookItem(scrapy.Item):
    """One book scraped from the Douban book list."""

    # Filled by the spider.
    name = scrapy.Field()          # book title
    content = scrapy.Field()       # raw info line; DoubanBookPipeline splits it on ' / '
    ratings = scrapy.Field()       # rating

    # Derived from ``content`` (or left unset).
    author = scrapy.Field()        # author
    publisher = scrapy.Field()     # publisher
    edition_year = scrapy.Field()  # publication year
    price = scrapy.Field()         # price


class DoubanMailItem(scrapy.Item):
    """One message listed in the Douban mail (doumail) inbox."""

    title = scrapy.Field()        # message title
    url = scrapy.Field()          # URL of the message detail page
    sender_from = scrapy.Field()  # sender
    sender_time = scrapy.Field()  # time the message was sent

class DoubanMovieCommentItem(scrapy.Item):
    """One movie review scraped from a Douban review page."""

    # The review itself.
    title = scrapy.Field()             # review title
    comment = scrapy.Field()           # review text
    star = scrapy.Field()              # rating given by the reviewer
    comment_page_url = scrapy.Field()  # URL of the review page

    # The reviewer.
    people = scrapy.Field()            # reviewer name
    people_url = scrapy.Field()        # reviewer profile page

    # Reader feedback.
    useful_num = scrapy.Field()        # how many readers found the review useful
    no_help_num = scrapy.Field()       # how many readers found the review not useful

pipelines.py


# -*- coding: utf-8 -*-


class DoubanBookPipeline(object):
    """Split a book's raw ``content`` line into price, year and publisher."""

    # Target fields, in the order they appear when the line is read from the
    # right-hand end.
    _TRAILING_FIELDS = ('price', 'edition_year', 'publisher')

    def process_item(self, item, spider):
        """Fill ``price``, ``edition_year`` and ``publisher`` from ``content``.

        ``content`` is a ' / '-separated line such as
        "author / translator / publisher / 2003-8 / 22.00" whose last three
        parts are publisher, year and price. Each part is stripped of
        surrounding whitespace. When the line has fewer than three parts,
        only the fields that are actually present are set instead of raising
        ``IndexError``.

        :param item: mapping-like item providing ``content``
        :param spider: the spider that produced the item (unused)
        :returns: the same ``item`` object, updated in place
        """
        info = [part.strip() for part in item['content'].split(' / ')]
        # zip stops at the shorter sequence, so short lines fill fewer fields.
        for field, value in zip(self._TRAILING_FIELDS, reversed(info)):
            item[field] = value
        return item


class DoubanMailPipeline(object):
    """Normalise the title of a scraped mail item."""

    # Substrings removed from the title: spaces, real newlines, and the
    # literal two-character sequence backslash + "n" that the previous
    # implementation targeted.
    _JUNK = (' ', '\n', '\\n')

    def process_item(self, item, spider):
        """Remove spaces and line breaks from ``item['title']``.

        :param item: mapping-like item providing ``title``
        :param spider: the spider that produced the item (unused)
        :returns: the same ``item`` object, updated in place
        """
        title = item['title']
        for junk in self._JUNK:
            title = title.replace(junk, '')
        item['title'] = title
        return item


class DoubanMovieCommentPipeline(object):
    """Pass-through pipeline for movie comment items."""

    def process_item(self, item, spider):
        """Hand the item on unchanged."""
        return item

settings.py


# -*- coding: utf-8 -*-

# Scrapy settings for douban project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project.
BOT_NAME = 'douban'

# The same package is used both for looking up spiders and for new ones.
NEWSPIDER_MODULE = 'douban.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]


# Use a User-Agent string generated by Faker rather than one that identifies
# this crawler (the template default would be "<bot name> (+<your site>)").
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers.
# No 'Host' entry on purpose: these defaults apply to every request of every
# spider in the project, and pinning Host to book.douban.com would mislabel
# requests sent to any other host. The HTTP client derives Host from the
# request URL, so requests to book.douban.com are unaffected.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
}
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'douban.middlewares.MyCustomSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'douban.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# Only the movie-comment pipeline is enabled; the book and mail pipelines are
# left commented out.
# NOTE(review): presumably they are toggled by hand to match the spider being
# run, since DoubanBookPipeline reads item['content'], which only
# DoubanBookItem declares - confirm.
ITEM_PIPELINES = {
 #'douban.pipelines.DoubanBookPipeline': 300,
 #'douban.pipelines.DoubanMailPipeline': 600,
 'douban.pipelines.DoubanMovieCommentPipeline': 900,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

scrapy.cfg


# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban.settings

[deploy]
#url = http://localhost:6800/
project = douban

例程2: douban_imgs

目录树


douban_imgs
--douban
 --spiders
 --__init__.py
 --download_douban.py
 --__init__.py
 --items.py
 --pipelines.py
 --run_spider.py
 --settings.py
--scrapy.cfg

spiders/__init__.py


# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

download_douban.py


# coding=utf-8
from scrapy.spiders import Spider
import re
from scrapy import Request
from douban_imgs.items import DoubanImgsItem


class download_douban(Spider):
    """Collect the image URLs found on one Douban photo album page."""

    name = 'download_douban'

    # Browser-like headers sent with the album page request.
    default_headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, sdch, br',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Host': 'www.douban.com',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def __init__(self, url='1638835355', *args, **kwargs):
        """Set up the spider for one album.

        :param url: despite its name, the album id that is interpolated into
            the album page URL
        """
        # Run the base initialiser first so any remaining spider arguments
        # are handled by Scrapy; the attributes below are assigned afterwards
        # and therefore take precedence.
        super(download_douban, self).__init__(*args, **kwargs)
        self.allowed_domains = ['douban.com']
        self.start_urls = [
            'http://www.douban.com/photos/album/%s/' % (url)]
        self.url = url

    def start_requests(self):
        """Request every start URL with the browser-like headers."""
        for url in self.start_urls:
            yield Request(url=url, headers=self.default_headers, callback=self.parse)

    def parse(self, response):
        """Yield one item holding all image URLs of the album page, if any."""
        list_imgs = response.xpath('//div[@class="photolst clearfix"]//img/@src').extract()
        if list_imgs:
            item = DoubanImgsItem()
            item['image_urls'] = list_imgs
            yield item

__init__.py

(此文件内无代码)

items.py


# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy import Item, Field


class DoubanImgsItem(Item):
    """Image URLs scraped from an album page, plus their download results."""

    image_urls = scrapy.Field()   # URLs collected by the spider
    images = scrapy.Field()       # presumably filled by the images pipeline - confirm
    image_paths = scrapy.Field()  # storage paths set by DoubanImgDownloadPipeline.item_completed

pipelines.py


# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy import Request
from scrapy import log


class DoubanImgsPipeline(object):
    """Pass-through pipeline for image items."""

    def process_item(self, item, spider):
        """Hand the item on unchanged."""
        return item


class DoubanImgDownloadPipeline(ImagesPipeline):
    """Download every image of an item and record where each one was stored."""

    # Template headers for image requests; ``referer`` is replaced per request.
    default_headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'referer': 'https://www.douban.com/photos/photo/2370443040/',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``.

        Every request gets its own copy of the header template with
        ``referer`` set to the image URL, so the shared class-level dict is
        never modified.
        """
        for image_url in item['image_urls']:
            headers = dict(self.default_headers, referer=image_url)
            yield Request(image_url, headers=headers)

    def item_completed(self, results, item, info):
        """Store the paths of the successful downloads on the item.

        :param results: iterable of ``(ok, result)`` pairs; ``result['path']``
            is read for every pair whose ``ok`` is true
        :raises DropItem: when no image of the item was downloaded
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

run_spider.py


from scrapy import cmdline
# Launch the spider as if "scrapy crawl download_douban" had been typed in a shell.
cmd_str = 'scrapy crawl download_douban'
cmdline.execute(argv=cmd_str.split())

settings.py


# -*- coding: utf-8 -*-

# Scrapy settings for douban_imgs project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# http://doc.scrapy.org/en/latest/topics/settings.html
# http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

# Name of this Scrapy project.
BOT_NAME = 'douban_imgs'

# The same package is used both for looking up spiders and for new ones.
NEWSPIDER_MODULE = 'douban_imgs.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'douban_imgs (+http://www.yourdomain.com)'

# Configure maximum concurrent requests performed by Scrapy (default: 16)
# CONCURRENT_REQUESTS=32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY=3
# The download delay setting will honor only one of:
# CONCURRENT_REQUESTS_PER_DOMAIN=16
# CONCURRENT_REQUESTS_PER_IP=16

# Disable cookies (enabled by default)
# COOKIES_ENABLED=False

# Disable Telnet Console (enabled by default)
# TELNETCONSOLE_ENABLED=False

# Override the default request headers:
# DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
# }

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
# SPIDER_MIDDLEWARES = {
# 'douban_imgs.middlewares.MyCustomSpiderMiddleware': 543,
# }

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
# DOWNLOADER_MIDDLEWARES = {
# 'douban_imgs.middlewares.MyCustomDownloaderMiddleware': 543,
# }

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
# EXTENSIONS = {
# 'scrapy.telnet.TelnetConsole': None,
# }

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
# The image download pipeline is the only pipeline enabled.
ITEM_PIPELINES = {'douban_imgs.pipelines.DoubanImgDownloadPipeline': 300}

# Directory the downloaded images are written to (a Windows path here).
IMAGES_STORE = 'D:\\doubanimgs'
#IMAGES_STORE = '/tmp'

# Expiry setting for stored images.
IMAGES_EXPIRES = 90


# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
# NOTE: AutoThrottle will honour the standard settings for concurrency and delay
# AUTOTHROTTLE_ENABLED=True
# The initial download delay
# AUTOTHROTTLE_START_DELAY=5
# The maximum download delay to be set in case of high latencies
# AUTOTHROTTLE_MAX_DELAY=60
# Enable showing throttling stats for every response received:
# AUTOTHROTTLE_DEBUG=False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
# HTTPCACHE_ENABLED=True
# HTTPCACHE_EXPIRATION_SECS=0
# HTTPCACHE_DIR='httpcache'
# HTTPCACHE_IGNORE_HTTP_CODES=[]
# HTTPCACHE_STORAGE='scrapy.extensions.httpcache.FilesystemCacheStorage'

scrapy.cfg


# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = douban_imgs.settings

[deploy]
#url = http://localhost:6800/
project = douban_imgs

总结

以上就是本文关于scrapy爬虫完整实例的全部内容,希望对大家有所帮助。感兴趣的朋友可以继续参阅本站其他相关专题,如有不足之处,欢迎留言指出。感谢朋友们对本站的支持!

scrapy 爬虫实例 scrapy分布式爬虫实例 scrapy 简单爬虫实例 scrapy 完整实例 python爬虫框架scrapy python 爬虫 scrapy