Python 3 Crawler in Practice (1): Downloading a Novel
1. Preface
I read Ze Tian Ji (择天记) a while back and enjoyed it, so it is the target this time (P.S.: please support the official release; this article is purely for learning and exchange). CSDN link: python3爬虫实战之小说(一) - CSDN博客
GitHub link: https://github.com/FanShuixing/git_webspider/tree/master/novel
First, fetch the novel's index page with requests:

```
import requests

def crawle():
    url = 'https://www.qtshu.com/zetianji/'
    req = requests.get(url=url)
    html = req.text
    print(html)

crawle()
```
The output (a screenshot in the original post) is garbled: requests guessed the wrong encoding for this page, so the Chinese text comes out as mojibake. Setting the encoding explicitly before reading `req.text` fixes it:
```
import requests

def crawle():
    url = 'https://www.qtshu.com/zetianji/'
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    print(html)

crawle()
```
Now the page prints correctly.
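Hard-coding utf-8 works for this site. As a more general sketch (my addition, not from the original post), requests can also sniff the encoding from the page bytes via `apparent_encoding` when the HTTP headers are missing or wrong:

```
import requests

def fetch(url):
    req = requests.get(url=url)
    # requests normally guesses the encoding from HTTP headers;
    # apparent_encoding re-detects it from the response body instead.
    req.encoding = req.apparent_encoding
    return req.text

# html = fetch('https://www.qtshu.com/zetianji/')
```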
Downloading the page by itself isn't very useful; the main goal of a crawler is to extract data. Next we use pyquery to pull out the chapter titles and links. Right-click any title and choose Inspect Element; as the screenshot in the original post shows, the whole chapter list sits inside a tag whose class is `booklist clearfix`.

```
import requests
from pyquery import PyQuery as pq

def crawle():
    url = 'https://www.qtshu.com/zetianji/'
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    # initialize the pyquery object and select every chapter entry
    doc = pq(html)
    items = doc('.booklist ul li').items()
    for each in items:
        title = each.text()
        url = each.find('a').attr('href')
        print(title, url)

crawle()
```
Each chapter link leads to a page whose body text sits in `.contentbox p`, so a second function fetches and prints a single chapter:

```
def Text(url):
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    doc = pq(html)
    item = doc('.contentbox p').text()
    print(item)

# crawle()
url = 'https://www.qtshu.com/zetianji/2988454.html'
Text(url)
```
Putting the two pieces together, crawle() now hands every chapter URL to Text():

```
import requests
from pyquery import PyQuery as pq

def crawle():
    url = 'https://www.qtshu.com/zetianji/'
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    # print(html)
    # initialize the pyquery object
    doc = pq(html)
    # pass in a CSS selector
    items = doc('.booklist ul li').items()
    for each in items:
        title = each.text()
        url = each.find('a').attr('href')
        # some entries have no url
        if url:
            url = 'https://www.qtshu.com/zetianji/' + url
            print(title, url)
            Text(url)

def Text(url):
    print('Extracting:', url)
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    doc = pq(html)
    item = doc('.contentbox p').text()
    print(item)

crawle()
# url = 'https://www.qtshu.com/zetianji/2988454.html'
# Text(url)
```
Extracted data is meant to be used, of course, so next we store it to a file.
```
import requests
from pyquery import PyQuery as pq
import os, sys

os.chdir('E:/爬虫数据')

def crawle():
    url = 'https://www.qtshu.com/zetianji/'
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    # print(html)
    # initialize the pyquery object
    doc = pq(html)
    # pass in a CSS selector
    items = doc('.booklist ul li').items()
    # m and index are used to compute the download progress
    m = len(doc('.booklist ul li'))
    fr = open('择天记.txt', 'w')
    index = 1
    for each in items:
        title = each.text()
        url = each.find('a').attr('href')
        # some entries have no url
        if url:
            index += 1
            url = 'https://www.qtshu.com/zetianji/' + url
            # print(title, url)
            text = Text(url)
            fr.write(title)
            fr.write('\n')
            fr.write(text)
            fr.write('\n')
            sys.stdout.write('Downloaded: %.3f%%' % (index / m * 100) + '\r')
            sys.stdout.flush()
    fr.close()

def Text(url):
    print('Extracting:', url)
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    doc = pq(html)
    item = doc('.contentbox p').text()
    return item

crawle()
# url = 'https://www.qtshu.com/zetianji/2988454.html'
# Text(url)
```
Partway through the download the script raises a UnicodeEncodeError: the chapter text contains characters that the file's default encoding on Windows (gbk) cannot represent. Two characters can look identical yet have different code points, as a quick experiment with the two yen signs shows:

```
a = '¥'    # halfwidth yen sign, U+00A5
b = '￥'   # fullwidth yen sign, U+FFE5
print(a)
print(b)
print(a.encode())   # b'\xc2\xa5' in UTF-8
print(b.encode())   # b'\xef\xbf\xa5' in UTF-8

with open('a.txt', 'w') as fr:
    fr.write(a)     # on Windows (default gbk) this raises UnicodeEncodeError: '\xa5'
```

The two signs print alike but encode to different bytes, and only the fullwidth one is representable in gbk. A problem character can therefore either be swapped for an encodable lookalike or stripped:

```
if '¥' in a:
    a = a.replace('¥', '￥')   # swap in the encodable fullwidth sign
if '\xa5' in a:
    a = a.replace('\xa5', '')  # or drop the character entirely
```
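An alternative sketch (my suggestion, not from the original post): instead of replacing characters one by one, open the output file with an explicit encoding, or tell Python how to handle unencodable characters. Here `text` stands for the chapter text being written:

```
# Write UTF-8 regardless of the platform default:
with open('择天记.txt', 'w', encoding='utf-8') as fr:
    fr.write(text)

# Or keep gbk but silently drop anything it cannot represent:
with open('择天记.txt', 'w', encoding='gbk', errors='ignore') as fr:
    fr.write(text)
```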
In this site's pages the culprits turned out to be '\ufeff' (a byte-order mark) and '\xa0' (a non-breaking space), so the final version strips them inside Text() before writing:

```
import requests
from pyquery import PyQuery as pq
import os, sys

os.chdir('E:/爬虫数据')

def crawle():
    url = 'https://www.qtshu.com/zetianji/'
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    # print(html)
    # initialize the pyquery object
    doc = pq(html)
    # pass in a CSS selector
    items = doc('.booklist ul li').items()
    # m and index are used to compute progress
    m = len(doc('.booklist ul li'))
    index = 1
    fr = open('择天记.txt', 'w')
    print('*' * 100)
    print(' ' * 50, 'Welcome, for learning and exchange only')
    print('*' * 100)
    for each in items:
        title = each.text()
        url = each.find('a').attr('href')
        # some entries have no url
        if url:
            index += 1
            url = 'https://www.qtshu.com/zetianji/' + url
            # print(title, url)
            text = Text(url)
            fr.write(title)
            fr.write('\n')
            fr.write(text)
            fr.write('\n')
            # progress display
            sys.stdout.write('Downloaded: %.3f%%' % (index / m * 100) + '\r')
            sys.stdout.flush()
    fr.close()

def Text(url):
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    doc = pq(html)
    item = doc('.contentbox p').text()
    if '\ufeff' in item:
        item = item.replace('\ufeff', '')
    if '\xa0' in item:
        item = item.replace('\xa0', '')
    return item

crawle()
# url = 'https://www.qtshu.com/zetianji/2988454.html'
# Text(url)
```
Here's a bonus: back when I was working through a crawler tutorial I wrote a pair of classes that download pages and cache them locally. We can plug them straight in; the only place that needs to change is Text(), which now asks the Downloader for the page instead of calling requests itself:
```
import requests
from pyquery import PyQuery as pq
import os, sys
from Cache import Downloader
from Cache import DiskCache

os.chdir('E:/爬虫数据')

def crawle():
    url = 'https://www.qtshu.com/zetianji/'
    req = requests.get(url=url)
    req.encoding = 'utf-8'
    html = req.text
    # print(html)
    # initialize the pyquery object
    doc = pq(html)
    # pass in a CSS selector
    items = doc('.booklist ul li').items()
    # m and index are used to compute progress
    m = len(doc('.booklist ul li'))
    index = 1
    fr = open('择天记.txt', 'w')
    print('*' * 100)
    print(' ' * 50, 'Welcome, for learning and exchange only')
    print('*' * 100)
    for each in items:
        title = each.text()
        url = each.find('a').attr('href')
        # some entries have no url
        if url:
            index += 1
            url = 'https://www.qtshu.com/zetianji/' + url
            # print(title, url)
            text = Text(url)
            fr.write(title)
            fr.write('\n')
            fr.write(text)
            fr.write('\n')
            # progress display
            sys.stdout.write('Downloaded: %.3f%%' % (index / m * 100) + '\r')
            sys.stdout.flush()
    fr.close()

def Text(url):
    # the only change: fetch through the caching Downloader
    html = D(url)
    doc = pq(html)
    item = doc('.contentbox p').text()
    if '\ufeff' in item:
        item = item.replace('\ufeff', '')
    if '\xa0' in item:
        item = item.replace('\xa0', '')
    return item

cache = DiskCache()
D = Downloader(cache)
crawle()
# url = 'https://www.qtshu.com/zetianji/2988454.html'
# Text(url)
```
```
# Cache.py
import re, os
import pickle, requests
import urllib.parse

os.chdir('E:/爬虫数据')

# This module can be used as a template; it rarely needs changing.
class Downloader:
    def __init__(self, cache):
        self.cache = cache

    def __call__(self, url):
        result = None
        if self.cache:
            try:
                result = self.cache[url]
                # print('loaded from cache')
            except KeyError:
                pass
        if result is None:
            result = self.download(url)
            if self.cache:
                # print(type(url))
                self.cache[url] = result
        return result

    def download(self, url, num_retry=2):
        # print('downloading:', url)
        headers = {'User-agent': 'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)'}
        req = requests.get(url=url, headers=headers)
        req.encoding = 'utf-8'
        html = req.text
        return html

class DiskCache:
    def __init__(self, cache_dir='Cache'):
        self.cache_dir = cache_dir

    def __getitem__(self, url):
        path = self.url_to_path(url)
        if os.path.exists(path):
            with open(path, 'rb') as fp:
                return pickle.load(fp)

    def __setitem__(self, url, result):
        path = self.url_to_path(url)
        folder = os.path.dirname(path)
        if not os.path.exists(folder):
            os.makedirs(folder)
        with open(path, 'wb') as fp:
            fp.write(pickle.dumps(result))

    def url_to_path(self, url):
        # map a URL to a safe file path under the cache directory
        components = urllib.parse.urlsplit(url)
        path = components.path
        if not path:  # path is empty
            path = 'nihao'
        elif path.endswith('/'):
            path += 'index.html'
        filename = components.netloc + path + components.query
        filename = re.sub(r'[^/0-9a-zA-Z\-.,;_]', '_', filename)
        filename = '/'.join(segment[:255] for segment in filename.split('/'))
        return os.path.join(self.cache_dir, filename)

if __name__ == '__main__':
    a = Downloader(DiskCache())
    a.download('https://book.douban.com/subject/1059336/')
```
This Cache module is very convenient: it saves every downloaded page to disk, and when the same URL is requested again the page is served from the local cache instead of the network.
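A quick usage sketch (the URL choice here is mine, just for illustration): the second request for the same page never touches the network.

```
from Cache import Downloader, DiskCache

D = Downloader(DiskCache())
html1 = D('https://www.qtshu.com/zetianji/')  # downloaded, then pickled to disk
html2 = D('https://www.qtshu.com/zetianji/')  # read back from Cache/www.qtshu.com/zetianji/index.html
assert html1 == html2
```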