python

超轻量级php框架startmvc

Python爬虫包BeautifulSoup学习实例(五)

更新时间:2020-06-06 14:54:01 作者:startmvc
本文为大家分享了Python爬虫包BeautifulSoup学习实例:使用BeautifulSoup抓取豆瓣电影的一些信息。

本文为大家分享了Python爬虫包BeautifulSoup学习实例,具体内容如下

BeautifulSoup

使用BeautifulSoup抓取豆瓣电影的一些信息。


# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date: 2016-12-24 16:18:01
# @Last Modified by: HaonanWu
# @Last Modified time: 2016-12-24 17:25:33

import urllib2
import json
from bs4 import BeautifulSoup

def nowplaying_movies(url):
    """Download a Douban listing page and return its now-playing movies.

    The page is requested with a Chrome User-Agent header and parsed with
    BeautifulSoup using the lxml parser. Every ``<li class="list-item">``
    whose ``data-category`` attribute equals ``'nowplaying'`` becomes one
    dict, which is also printed as ``title|score|director|actors``.

    Args:
        url: Address of the page to scrape.

    Returns:
        A list of dicts with the keys ``title``, ``score``, ``director``
        and ``actors``. A value is ``None`` when the tag does not carry
        the corresponding ``data-*`` attribute.
    """
    user_agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
    headers = {'User-Agent': user_agent}
    request = urllib2.Request(url=url, headers=headers)
    response = urllib2.urlopen(request)
    try:
        soup_packetpage = BeautifulSoup(response, 'lxml')
    finally:
        # Release the connection even when parsing fails.
        response.close()

    # Equivalent spelling: soup_packetpage.find_all("li", {"class": "list-item"})
    items = soup_packetpage.find_all("li", class_="list-item")

    movies = []
    for item in items:
        # .get() instead of [] so that a list item lacking a data-* attribute
        # is skipped (or yields None) rather than aborting with KeyError.
        if item.attrs.get('data-category') != 'nowplaying':
            continue
        movie = {
            'title': item.attrs.get('data-title'),
            'score': item.attrs.get('data-score'),
            'director': item.attrs.get('data-director'),
            'actors': item.attrs.get('data-actors'),
        }
        movies.append(movie)
        print('%(title)s|%(score)s|%(director)s|%(actors)s' % movie)

    return movies



if __name__ == '__main__':
    # Scrape the Beijing "now playing" listing, then dump the collected
    # movies as key-sorted, 4-space-indented JSON.
    url = 'https://movie.douban.com/nowplaying/beijing/'
    movies = nowplaying_movies(url)

    report = json.dumps(
        movies,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )
    print('%s' % report)

HTMLParser

使用HTMLParser实现上述功能

这里有一些HTMLParser的基础教程

注意:自2006年以后不再更新、常被建议用jsoup代替的是Java的HTMLParser库(jsoup同样是Java库);本文使用的是Python标准库中的HTMLParser模块(Python 3中为html.parser),它仍在随Python一起维护。


# -*- coding: utf-8 -*-
# @Author: HaonanWu
# @Date: 2016-12-24 15:57:54
# @Last Modified by: HaonanWu
# @Last Modified time: 2016-12-24 17:03:27
from HTMLParser import HTMLParser
import urllib2
import json

class MovieParser(HTMLParser):
    """HTML parser that collects now-playing movies from ``<li>`` tags.

    After feeding markup, ``self.movies`` holds one dict per matching tag
    with the keys ``title``, ``score``, ``director`` and ``actors``.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Accumulates one dict per matching <li> start tag, in document order.
        self.movies = []

    def handle_starttag(self, tag, attrs):
        """Record an ``<li>`` start tag that describes a now-playing movie.

        A tag is recorded only when it is an ``li``, has a non-empty
        ``data-title`` and its ``data-category`` equals ``'nowplaying'``.
        Each recorded movie is also printed as
        ``title|score|director|actors``.

        Args:
            tag: Name of the start tag.
            attrs: Sequence of ``(name, value)`` pairs for the tag.
        """
        def lookup(wanted):
            # First attribute with a matching name wins; None when absent.
            return next((value for name, value in attrs if name == wanted), None)

        if tag != 'li':
            return
        if not lookup('data-title'):
            return
        if lookup('data-category') != 'nowplaying':
            return

        record = {
            'title': lookup('data-title'),
            'score': lookup('data-score'),
            'director': lookup('data-director'),
            'actors': lookup('data-actors'),
        }
        self.movies.append(record)
        print('%(title)s|%(score)s|%(director)s|%(actors)s' % record)


def nowplaying_movies(url):
    """Download a page and return the movies found by ``MovieParser``.

    The page is requested with a Chrome User-Agent header, its whole body
    is fed to a fresh ``MovieParser``, and the parser's collected list is
    returned.

    Args:
        url: Address of the page to scrape.

    Returns:
        The ``movies`` list built by ``MovieParser`` while parsing the page.
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
    req = urllib2.Request(url, headers=headers)
    s = urllib2.urlopen(req)
    parser = MovieParser()
    try:
        parser.feed(s.read())
    finally:
        # Close the connection even if reading or parsing raises.
        s.close()
    return parser.movies


if __name__ == '__main__':
    # Scrape the Beijing "now playing" listing, then dump the collected
    # movies as key-sorted, 4-space-indented JSON.
    url = 'https://movie.douban.com/nowplaying/beijing/'
    movies = nowplaying_movies(url)

    report = json.dumps(
        movies,
        sort_keys=True,
        indent=4,
        separators=(',', ': '),
    )
    print('%s' % report)

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持脚本之家。

python 爬虫包 BeautifulSoup