Scraping Juhuasuan product pages with Python to extract product information and save it locally as XML

Updated: 2020-05-22 08:30:01  Author: startmvc

This post shares the full code of a Python spider that fetches Juhuasuan (聚划算) deal pages, extracts the product information, and saves it locally, for your reference. The details are as follows.
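Note: the listing below targets Python 2.x with the legacy BeautifulSoup 3 API (from BeautifulSoup import BeautifulSoup) and the third-party chardet package. The urllib2, httplib, and StringIO modules do not exist under Python 3, so the script will not run there unmodified; a minimal porting sketch follows the listing.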


#!/usr/bin/python
# -*- coding: utf-8 -*-
# Spider.py

import urllib2
import httplib
import StringIO
import gzip
import re
import chardet
import sys
import os
import datetime
from xml.dom.minidom import Document
from BeautifulSoup import BeautifulSoup

## Work around UnicodeEncodeError when printing non-ASCII text to the console
reload(sys)
sys.setdefaultencoding("utf8")
#####################################################

## Debug switch; when on, HTTP request header info and debug logs are printed
DEBUG = 1
NO_DEBUG = 0
httplib.HTTPConnection.debuglevel = DEBUG
## Whether to print the fetched page source
showSrcCode = False
## Compression scheme requested from the server
ZIP_TYPE = "gzip"

fileName = "auctions"
location = "d:/spiderData/"

## request headers
headerConfig = {"User-Agent": "taobao-yanyuan.qzs", "Accept-Encoding": ZIP_TYPE}
#####################################################
 
 
#############class SpiderConfig #####################
class SpiderConfig:
    """
    Configuration for one spider task: a name and a URL.
    """
    def __init__(self, name, url):
        self.name = name
        self.url = url
#####################################################

##############class SpiderAuctionDomain##############
class SpiderAuctionDomain:
    """
    Holds the information of one auction item scraped by the spider.
    """
    title = ""
    link = ""   # the original declared this field as "url", but the rest of the code reads and writes "link"
    img = ""
    price = ""

    def __init__(self):
        pass

#####################################################
 
########class SpiderDefaultErrorHandler##############
class SpiderDefaultErrorHandler(urllib2.HTTPDefaultErrorHandler):
    def http_error_default(self, req, fp, code, msg, hdrs):
        """
        Default error-processing handler for the spider.
        """
        result = urllib2.HTTPError(req.get_full_url(), code, msg, hdrs, fp)
        result.status = code
        result.url = req.get_full_url()

        print "<", result.url, "Exception code :", result.status, ">"

        return result
#####################################################
 
#############class SpiderHandler#####################
class SpiderHandler:
    """
    Spider handler: fetches a page, then parses and saves the result.
    """

    def spider(self, spiderConfig):
        try:
            request = urllib2.Request(spiderConfig.url)

            ## configure request headers
            for key, val in headerConfig.items():
                request.add_header(key, val)

            ## build opener
            opener = urllib2.build_opener(SpiderDefaultErrorHandler())

            ## open request
            openRequest = opener.open(request)

            ## read data
            spiderData = openRequest.read()

            ## close
            opener.close()

            if 0 == len(spiderData):
                return

            ## decompress only if the server answered with gzip
            if ZIP_TYPE == openRequest.headers.get("Content-Encoding"):
                spiderData = SpiderHandler.gzipData(self, spiderData)

            if httplib.HTTPConnection.debuglevel == DEBUG and showSrcCode:
                print spiderData

            # parse html
            SpiderHandler.parse(self, spiderData)

        except Exception, x:
            print "spider process Exception:", x
 
 
 
    def parse(self, spiderData):
        """
        Parse the HTML content.
        """
        if httplib.HTTPConnection.debuglevel == DEBUG:
            charsetAnalyze = chardet.detect(spiderData)
            print "analyze spider data encode :", charsetAnalyze["encoding"]

        print "parsing", fileName

        soup = BeautifulSoup(spiderData)
        encode = soup.originalEncoding

        encoding = lambda x: x.encode(encode)

        if httplib.HTTPConnection.debuglevel == DEBUG:
            print "detected encoding:", encode
            title = soup.head.title.string
            print encoding(title)

        spiderContents = soup.findAll(name="div", attrs={"class": "main-box avil"})
        auctions = ["%s" % s for s in spiderContents]

        if not auctions:  # a list comprehension is never None; test for emptiness instead
            return

        auctionList = []

        for auc in auctions:
            auctionDomain = SpiderAuctionDomain()
            # parse auction link (the original character class [\"|\'] also matched "|"; [\"\'] is what was meant)
            links = re.search(re.compile(r'<a href=[\"\']http://ju.taobao.com/tg/life_home.htm\?item_id=([^>]*)[\"\']', re.IGNORECASE), auc)
            if links is not None:
                auctionDomain.link = encoding("http://ju.taobao.com/tg/life_home.htm?item_id=" + "".join(["%s" % s for s in links.groups() if len(s) > 0]))

            # parse auction title
            titles = re.search(re.compile(r"([^>]*)</a></h2>", re.IGNORECASE), auc)
            if titles is not None:
                auctionDomain.title = encoding("".join(["%s" % t for t in titles.groups() if len(t) > 0]))

            # parse auction price
            price = re.search(re.compile(r"<strong class=\"J_juPrices\".*</b>([^<]*)</strong>", re.IGNORECASE), auc)
            if price is not None:
                auctionDomain.price = "".join(["%s" % p for p in price.groups() if len(p) > 0])

            # parse image url
            imgs = re.search(re.compile(r"<img src=[\'\"]([^>]*)[\'\"]", re.IGNORECASE), auc)
            if imgs is not None:
                auctionDomain.img = "".join(["%s" % i for i in imgs.groups() if len(i) > 0])

            auctionList.append(auctionDomain)

        print "parsed auction items:"
        for a in auctionList:
            print "--->", a.title

        # sort auction list by price
        auctionList = SpiderHandler.sortAuctionList(self, auctionList)

        # save to file
        SpiderHandler.save(self, auctionList)

        print "parsing finished"
 
    def sortAuctionList(self, auctionList):
        """
        Bubble sort the auction list by price, ascending.
        """
        length = len(auctionList)
        if length < 2:
            return auctionList
        for i in range(length - 1):
            for j in range(length - i - 1):
                if float(auctionList[j].price) > float(auctionList[j + 1].price):
                    auctionList[j], auctionList[j + 1] = auctionList[j + 1], auctionList[j]
        return auctionList
 
    def save(self, auctionList):
        if auctionList is not None:
            doc = Document()

            auctions = doc.createElement("auctions")
            doc.appendChild(auctions)

            for auc in auctionList:
                auction = doc.createElement("auction")
                auctions.appendChild(auction)

                SpiderHandler.generateXML(self, doc, auction, "title", auc.title)
                SpiderHandler.generateXML(self, doc, auction, "price", auc.price)
                SpiderHandler.generateXML(self, doc, auction, "img", auc.img)
                SpiderHandler.generateXML(self, doc, auction, "link", auc.link)

            if not os.path.exists(location):
                os.mkdir(location)

            xmlFile = open(location + fileName + ".xml", 'w')  # renamed from "file", which shadows the builtin
            xmlFile.write(doc.toprettyxml())
            xmlFile.close()

            if httplib.HTTPConnection.debuglevel == DEBUG:
                print doc.toprettyxml()

    def generateXML(self, doc, parent, name, txt):
        c = doc.createElement(name)
        parent.appendChild(c)
        c.appendChild(doc.createTextNode(txt))
 
    def gzipData(self, spiderData):
        """
        Decompress gzip-encoded response data.
        """
        if 0 == len(spiderData):
            return spiderData
        spiderDataStream = StringIO.StringIO(spiderData)
        spiderData = gzip.GzipFile(fileobj=spiderDataStream).read()
        return spiderData
#####################################################
 
if __name__ == "__main__":
    # the original format string used %m (month) where minutes (%M) were intended
    nowtime = lambda: datetime.datetime.strftime(datetime.datetime.now(), "%Y-%m-%d %H:%M:%S")

    needSpiderUrl = {"suzhou": "http://ju.taobao.com/suzhou",
                     "hangzhou": "http://ju.taobao.com/hangzhou",
                     "shanghai": "http://ju.taobao.com/shanghai",
                     "beijing": "http://ju.taobao.com/beijing",
                     "chengdu": "http://ju.taobao.com/chengdu"}

    configList = []
    for k, v in needSpiderUrl.items():
        spiderConfig = SpiderConfig(k, v)
        configList.append(spiderConfig)

    spiderHandler = SpiderHandler()

    print "spider started at:", nowtime()
    for spiderConfig in configList:
        fileName = spiderConfig.name  # each city is saved to its own XML file
        spiderHandler.spider(spiderConfig)

    print "spider finished at:", nowtime()

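If you need to run the same logic on Python 3, the fetch-and-decompress step of SpiderHandler.spider / gzipData maps onto the standard library as follows. This is only a minimal sketch under the same headers, not a full port; the module name py3_fetch is made up here, and the parsing side would move to BeautifulSoup 4 (the bs4 package), whose API differs from BeautifulSoup 3.

# py3_fetch.py -- hypothetical helper; a minimal Python 3 sketch of the
# fetch-and-decompress step from the listing above.
import gzip
import io
import urllib.request

def fetch(url):
    # same request headers as headerConfig in the Python 2 listing
    request = urllib.request.Request(url, headers={
        "User-Agent": "taobao-yanyuan.qzs",
        "Accept-Encoding": "gzip",
    })
    with urllib.request.urlopen(request) as response:
        data = response.read()
        # decompress only if the server actually answered with gzip
        if response.headers.get("Content-Encoding") == "gzip":
            data = gzip.GzipFile(fileobj=io.BytesIO(data)).read()
    return data

if __name__ == "__main__":
    print(fetch("http://ju.taobao.com/suzhou")[:200])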

That's all for this article. I hope it helps with your study, and please continue to support 脚本之家.
