python

超轻量级php框架startmvc

python爬取本站电子书信息并入库的实现代码

更新时间:2020-08-22 22:48:01 作者:startmvc
入门级爬虫:只抓取书籍名称、信息及下载地址并存储到数据库。

入门级爬虫:只抓取书籍名称,信息及下载地址并存储到数据库

数据库工具类:DBUtil.py


import pymysql

class DBUtils(object):
    """Thin helper around a DB-API connection for this crawler's inserts and queries."""

    def connDB(self, host='192.168.251.114', port=3306, user='root',
               passwd='b6f3g2', db='yangsj', charset='utf8'):
        """Open a database connection and return a ``(connection, cursor)`` pair.

        Generalized: the previously hard-coded connection settings are now
        keyword parameters whose defaults match the old values, so existing
        callers behave identically while other environments can override them.
        """
        conn = pymysql.connect(host=host, port=port, user=user,
                               passwd=passwd, db=db, charset=charset)
        cur = conn.cursor()
        return (conn, cur)

    def exeUpdate(self, conn, cur, sql):
        """Execute an INSERT/UPDATE statement and commit; return execute()'s result."""
        sta = cur.execute(sql)
        conn.commit()
        return sta

    def exeDelete(self, conn, cur, IDs):
        """Delete ``students`` rows by space-separated ids (demo helper, unused).

        Each id passes through int() before %d-formatting, so the
        interpolation cannot inject SQL.
        """
        sta = 0
        for eachID in IDs.split(' '):
            sta += cur.execute("delete from students where Id=%d" % (int(eachID)))
        conn.commit()
        return sta

    def exeQuery(self, cur, sql):
        """Run a SELECT; return ``(execute() result, cursor)`` — fetch rows from the cursor."""
        effect_row = cur.execute(sql)
        return (effect_row, cur)

    def connClose(self, conn, cur):
        """Close the cursor and the connection, releasing resources."""
        cur.close()
        conn.close()

if __name__ == '__main__':
    # Manual connectivity smoke test: just open one connection (never closed here).
    util = DBUtils()
    conn, cur = util.connDB()

书籍操作文件 bookOpe.py


from DBUtil import DBUtils
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
# Module-level logging setup: emit INFO and above with the default format/handler.
logging.basicConfig(level=logging.INFO)
class BookOperator(object):
    """Persists Book objects and their download links via DBUtils."""

    def __addBook(self, book):
        """Insert one row into ``book``.

        Fix: uses a parameterized query instead of %-interpolating scraped text
        into the SQL string — titles/descriptions containing quotes previously
        broke the statement and allowed SQL injection.
        """
        logging.info("add book:%s", book.bookName)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        cur.execute(
            "insert into book (bookName,bookUrl,bookInfo) values (%s,%s,%s);",
            (book.bookName, book.downLoadUrl, book.mainInfo))
        conn.commit()
        dbUtil.connClose(conn, cur)

    def __selectLastBookId(self):
        """Return the id of the most recently inserted ``book`` row.

        NOTE(review): racy if several processes insert concurrently — the
        insert cursor's lastrowid would be safer; confirm before parallel runs.
        """
        logging.info("selectLastBookId ")
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        selectLastBookSql = "select id from book order by id desc limit 1"
        effect_row, cur = dbUtil.exeQuery(cur, selectLastBookSql)
        bookId = cur.fetchone()[0]
        dbUtil.connClose(conn, cur)
        return bookId

    def __addBookDownLoadInfos(self, downLoadInfos, bookId):
        """Insert one ``book_down_url`` row per download link (parameterized).

        All rows are committed in a single transaction instead of one commit
        per link.
        """
        logging.info("add bookId:%s", bookId)
        dbUtil = DBUtils()
        conn, cur = dbUtil.connDB()
        for downLoadInfo in downLoadInfos:
            cur.execute(
                "insert into book_down_url (bookId,downName,downUrl) values (%s,%s,%s);",
                (bookId, downLoadInfo.downName, downLoadInfo.downUrl))
        conn.commit()
        dbUtil.connClose(conn, cur)

    def addBookInfo(self, book):
        """Public entry point: store the book row, then its download links."""
        logging.info("add bookInfo:%s", book.bookName)
        self.__addBook(book)
        bookId = self.__selectLastBookId()
        self.__addBookDownLoadInfos(book.downLoadInfos, bookId)
if __name__ == '__main__':
    # Manual smoke test; note Book's argument order is (mainInfo, downLoadUrl, bookName).
    operator = BookOperator()
    sample = Book("aaa", "yang", "cccc")
    sample.addDownLoadUrl(DownLoadInfo("aaa.html", "书籍"))
    operator.addBookInfo(sample)

书籍信息文件 bookInfo.py


import sys
# NOTE(review): `sys` has no documented `encoding` attribute; this assignment just
# attaches an unused attribute to the module and does not change any I/O encoding.
# Consider removing it.
sys.encoding = "utf8"
class Book(object):
    """A scraped book: description text, detail-page URL, title, and download links."""

    def __init__(self, mainInfo, downLoadUrl, bookName):
        self.mainInfo = mainInfo        # description block scraped from the page
        self.downLoadUrl = downLoadUrl  # URL of the book's detail page
        self.bookName = bookName        # book title
        self.downLoadInfos = []         # DownLoadInfo entries, filled via addDownLoadUrl

    def addDownLoadUrl(self, downloadInfo):
        """Attach one download-link entry to this book."""
        self.downLoadInfos.append(downloadInfo)

    def print_book_info(self):
        """Write the book title to stdout."""
        print("bookName :%s" % (self.bookName))
class DownLoadInfo(object):
    """One download mirror for a book: target URL plus its display label."""

    def __init__(self, downUrl, downName):
        self.downUrl = downUrl    # download link href
        self.downName = downName  # link text shown to the user

    def print_down_info(self):
        """Write this link to stdout in "url - label" form."""
        print("downLoad %s - %s" % (self.downUrl, self.downName))

51job界面解析文件 FiveOneJobFetch.py


import requests
from bs4 import BeautifulSoup
import sys
from bookInfo import Book
from bookInfo import DownLoadInfo
import logging
sys.encoding = "utf8"
class PageFetch(object):
    """Scrapes jb51.net book-list pages: pagination, download pages, book details.

    Relies on the module-level imports of the third-party ``requests`` and
    ``BeautifulSoup`` packages.
    """

    host = "//www.jb51.net/"  # site root (protocol-relative URL)
    category = "books/"       # books category path segment

    def __init__(self, pageUrl):
        # pageUrl is the list-page file name, e.g. "list152_1.html".
        self.pageUrl = pageUrl
        self.url = PageFetch.host + PageFetch.category + pageUrl  # full list-page URL

    @staticmethod
    def getPageContent(url):
        """Fetch *url* and return its HTML decoded as gb2312, or "" on a non-200 response.

        Fixes: declared @staticmethod (it had no ``self`` and was only ever
        called as ``PageFetch.getPageContent``); the unused private duplicate
        ``__getPageContent`` instance method was removed.
        """
        req = requests.get(url)
        if req.status_code == 200:
            req.encoding = "gb2312"  # the site serves gb2312-encoded pages
            return req.text
        return ""

    def __getMaxPageNumAndUrl(self):
        """Walk the pager and return ``(max page count, second-page URL pattern)``.

        Pager URLs look like ``list45_2.html`` where 2 is the page number.
        """
        fetchUrl = self.pageUrl
        maxPageNum = 0
        maxLink = ""
        while maxLink == "":
            url = PageFetch.host + PageFetch.category + fetchUrl
            reqContent = PageFetch.getPageContent(url)
            soup = BeautifulSoup(reqContent, "html.parser")
            for ul in soup.select(".plist"):
                print("数据")  # leftover debug output, kept to preserve behavior
                print(ul)
                maxPageNum = ul.select("strong")[0].text  # total page count shown in <strong>
                alink = ul.select("a")
                if alink[-1]['href'] == "#":
                    # "#" marks that we reached the last page: the second link
                    # holds a concrete page URL usable as the pattern.
                    maxLink = alink[1]['href']
                else:
                    # otherwise follow the last ("next") link and keep walking
                    fetchUrl = alink[-1]['href']
        return maxPageNum, maxLink

    def __formatPage(self, pageNum):
        """Build the file name for 1-based page ``pageNum + 1`` from the
        ``list45_2.html``-style pattern of ``self.pageUrl``."""
        lineBeginSite = self.pageUrl.index("_") + 1
        docBeginSite = self.pageUrl.index(".")
        return self.pageUrl[:lineBeginSite] + str(pageNum + 1) + self.pageUrl[docBeginSite:]

    def getBookPageList(self):
        """Return the full URL of every list page in this category."""
        shortPageList = []
        maxPageNum, urlPattern = self.__getMaxPageNumAndUrl()
        for i in range(int(maxPageNum)):
            shortPageList.append(self.host + self.category + self.__formatPage(i))
        return shortPageList

    @staticmethod
    def getDownloadPage(url):
        """Return the book detail-page URLs linked from one list page.

        Fix: declared @staticmethod — it has no ``self`` and is called on the class.
        """
        downPage = []
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        for a in soup.select(".cur-cat-list .btn-dl"):
            downPage.append(PageFetch.host + a['href'])
        return downPage

    @staticmethod
    def getBookInfo(url):
        """Parse one detail page into a Book with its download links.

        Fix: declared @staticmethod. Single quotes are stripped from the
        scraped text, presumably to keep downstream string-built SQL from
        breaking — verify once storage is fully parameterized.
        """
        logging.info("获取书籍信息url:%s" % url)
        reqContent = PageFetch.getPageContent(url)
        soup = BeautifulSoup(reqContent, "html.parser")
        mainInfo = (soup.select("#soft-intro"))[0].text.replace("截图:", "").replace("'", "")
        title = (soup.select("dl dt h1"))[0].text.replace("'", "")
        book = Book(mainInfo, url, title)
        for ul in soup.select(".ul_Address"):
            for li in ul.select("li"):
                downLoadInfo = DownLoadInfo(li.select("a")[0]['href'], li.select("a")[0].text)
                book.addDownLoadUrl(downLoadInfo)
        return book
if __name__ == '__main__':
    # Manual smoke test: crawl one category, gather every detail page, dump results.
    fetcher = PageFetch("list152_1.html")
    detail_pages = []
    for list_page in fetcher.getBookPageList():
        detail_pages += PageFetch.getDownloadPage(list_page)
    print("================汇总如下===============================")
    for detail_url in detail_pages:
        info = PageFetch.getBookInfo(detail_url)
        print(info.bookName + ":%s" % info.downLoadUrl)
        for link in info.downLoadInfos:
            print("%s - %s" % (link.downUrl, link.downName))

执行文件 51Job.py:将以上各文件复制到同一文件夹下,执行此文件即可。


from FiveOneJobFetch import PageFetch
from bookInfo import Book
from bookInfo import DownLoadInfo
from bookOpe import BookOperator

def main(url):
    """Crawl one category list page: collect every book detail page, then store each book."""
    fetcher = PageFetch(url)
    operator = BookOperator()
    detail_pages = []
    for list_page in fetcher.getBookPageList():
        detail_pages.extend(PageFetch.getDownloadPage(list_page))
    for detail_url in detail_pages:
        operator.addBookInfo(PageFetch.getBookInfo(detail_url))
    print("数据抓取成功:" + url)

if __name__ == '__main__':
    # Category list pages to crawl, one entry point each.
    start_pages = [
        "list152_35.html",
        "list300_2.html",
        "list476_6.html",
        "list977_2.html",
        "list572_5.html",
        "list509_2.html",
        "list481_1.html",
        "list576_1.html",
        "list482_1.html",
        "list483_1.html",
        "list484_1.html",
    ]
    for start_page in start_pages:
        main(start_page)

数据库表:书籍信息表和下载地址表


-- Book master table: one row per scraped book (title, detail-page URL, description).
-- NOTE(review): the table collation is utf8mb4 while the Python side connects with
-- charset='utf8' — 4-byte characters (e.g. emoji) could be mangled; confirm intended.
CREATE TABLE `book` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookName` VARCHAR(200) NULL DEFAULT NULL,
`bookUrl` VARCHAR(500) NULL DEFAULT NULL,
`bookInfo` TEXT NULL,
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=2936;

-- Download links: many rows per book, joined on bookId (no foreign-key constraint declared).
CREATE TABLE `book_down_url` (
`id` INT(11) NOT NULL AUTO_INCREMENT,
`bookId` INT(11) NOT NULL DEFAULT '0',
`downName` VARCHAR(200) NOT NULL DEFAULT '0',
`downUrl` VARCHAR(2000) NOT NULL DEFAULT '0',
PRIMARY KEY (`id`)
)
COLLATE='utf8mb4_general_ci'
ENGINE=InnoDB
AUTO_INCREMENT=44441;

git地址:https://git.oschina.net/yangsj/BookFetch/tree/master

python爬虫