Crawling with a simulated login:
Packet capture analysis:
If Chrome complains that "your connection is not private" (common when a packet-capture proxy is intercepting HTTPS), right-click the Chrome shortcut icon, open Properties, add a space at the end of the Target field, and append --test-type --ignore-certificate-errors.
Now that we can see the login request in the capture, let's get going!
Crawling the books on the bookshelf:
Analysis and observation:
Result:
code:
import requests

headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}

# Placeholders -- the concrete URLs and form fields were omitted in the original;
# fill them in from your own packet capture.
login_url = "..."   # POST endpoint of the login route
shelf_url = "..."   # bookshelf API that returns JSON
login_data = {}     # login form fields

session = requests.session()
# POST to the login route; the session keeps the cookies it returns
session.post(login_url, data=login_data, headers=headers)
# then fetch the bookshelf data as a logged-in user
res = session.get(shelf_url, headers=headers)
res.encoding = "utf8"
data = res.json().get("data")
print(data)
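A quick sanity check (not in the original) before moving on: the session object stores whatever cookies the login route returned, so printing them is an easy way to confirm the login POST actually went through.

# If this prints an empty dict, the login most likely failed
print(session.cookies.get_dict())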
Parsing with etree:
import requests
from lxml import etree

headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}

# Placeholders -- the concrete URLs and form fields were omitted in the original;
# fill them in from your own packet capture.
login_url = "..."          # POST endpoint of the login route
shelf_url = "..."          # bookshelf API that returns JSON
chapter_list_url = "..."   # chapter-list page URL pattern, formatted with bookId
login_data = {}            # login form fields

session = requests.session()
# POST to the login route so the session carries the auth cookies
session.post(login_url, data=login_data, headers=headers)
res = session.get(shelf_url, headers=headers)
res.encoding = "utf8"
data = res.json().get("data")

for bookDict in data:
    bookId = bookDict.get('bookId')
    # chapter-list page of this book
    res = requests.get(chapter_list_url.format(bookId), headers=headers)
    res.encoding = 'utf-8'
    selector = etree.HTML(res.text)
    urls = selector.xpath('//dl[@class="Volume"]/dd/a')
    for url in urls:
        each_href = url.xpath("./@href")[0]
        print(each_href)
        each_title = url.xpath("./span/text()")[0].strip()
        print(each_title)
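To see what the chapter-list XPath is doing, here is a standalone sketch run against a made-up snippet that mimics the dl.Volume markup (the real page structure is only assumed from the selectors above):

from lxml import etree

# Illustrative snippet only -- not the site's real HTML
sample = '''
<dl class="Volume">
  <dd><a href="/chapter/1"><span> Chapter One </span></a></dd>
  <dd><a href="/chapter/2"><span> Chapter Two </span></a></dd>
</dl>
'''
selector = etree.HTML(sample)
for a in selector.xpath('//dl[@class="Volume"]/dd/a'):
    print(a.xpath("./@href")[0], a.xpath("./span/text()")[0].strip())
# /chapter/1 Chapter One
# /chapter/2 Chapter Two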
Final version:
import requests
from lxml import etree

headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}

# Placeholders -- the concrete URLs and form fields were omitted in the original;
# fill them in from your own packet capture.
login_url = "..."          # POST endpoint of the login route
shelf_url = "..."          # bookshelf API that returns JSON
chapter_list_url = "..."   # chapter-list page URL pattern, formatted with bookId
login_data = {}            # login form fields

session = requests.session()
# POST to the login route so the session carries the auth cookies
session.post(login_url, data=login_data, headers=headers)
res = session.get(shelf_url, headers=headers)
res.encoding = "utf8"
data = res.json().get("data")

for bookDict in data:
    bookId = bookDict.get('bookId')
    # chapter-list page of this book
    res = requests.get(chapter_list_url.format(bookId), headers=headers)
    res.encoding = 'utf-8'
    selector = etree.HTML(res.text)
    urls = selector.xpath('//dl[@class="Volume"]/dd/a')
    for url in urls:
        each_href = url.xpath("./@href")[0]
        each_title = url.xpath("./span/text()")[0].strip()
        # each_href may be site-relative; prepend the site root if needed
        res = requests.get(each_href, headers=headers)
        res.encoding = 'utf-8'
        selector = etree.HTML(res.text)
        # every paragraph of the chapter except the last one
        text = selector.xpath('//div[contains(@class,"content")]/div[@class="p"]/p[position()<last()]/text()')
        print(text)
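The p[position()<last()] predicate keeps every paragraph except the last one in each content block. A tiny demonstration on a made-up snippet (again, the real markup is only assumed):

from lxml import etree

sample = '<div class="content"><div class="p"><p>one</p><p>two</p><p>tail</p></div></div>'
selector = etree.HTML(sample)
print(selector.xpath('//div[contains(@class,"content")]/div[@class="p"]/p[position()<last()]/text()'))
# ['one', 'two']  -- the final <p> is dropped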
Writing each chapter to a local .txt file:
import requests
from lxml import etree
import os

headers = {
    'User-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Mobile Safari/537.36',
}

# Placeholders -- the concrete URLs and form fields were omitted in the original;
# fill them in from your own packet capture.
login_url = "..."          # POST endpoint of the login route
shelf_url = "..."          # bookshelf API that returns JSON
chapter_list_url = "..."   # chapter-list page URL pattern, formatted with bookId
login_data = {}            # login form fields

session = requests.session()

# POST to the login route so the session carries the auth cookies
def login():
    session.post(login_url, data=login_data, headers=headers)

# Fetch the data returned by the bookshelf API
def get_books():
    res = session.get(shelf_url, headers=headers)
    res.encoding = "utf8"
    data = res.json().get("data")
    return data

# Create a folder for each book and hand it off for downloading
def get_each_book(data):
    for bookDict in data:
        bookId = bookDict.get('bookId')
        bookName = bookDict.get('bookName')
        book_path = os.path.join("书房", bookName)
        if not os.path.exists(book_path):
            os.mkdir(book_path)
        get_chapter(bookName, bookId, book_path)

# Crawl every chapter of one book
def get_chapter(bookName, bookId, book_path):
    res = requests.get(chapter_list_url.format(bookId), headers=headers)
    res.encoding = "utf8"
    selector = etree.HTML(res.text)
    urls = selector.xpath('//dl[@class="Volume"]/dd/a')
    for url in urls:
        each_href = url.xpath("./@href")[0]
        each_title = url.xpath("./span/text()")[0].strip()
        # each_href may be site-relative; prepend the site root if needed
        res = requests.get(each_href, headers=headers)
        res.encoding = 'utf-8'
        selector = etree.HTML(res.text)
        # every paragraph of the chapter except the last one
        each_text = selector.xpath('//div[contains(@class,"content")]/div[@class="p"]/p[position()<last()]/text()')
        download(book_path, each_title, each_text)
    print("Finished downloading the chapters of book {} (bookId {})".format(bookName, bookId))

# Write one chapter to a local .txt file
def download(book_path, each_title, each_text):
    each_title = each_title + '.txt'
    each_path = os.path.join(book_path, each_title)
    with open(each_path, "w", encoding='utf-8') as fp:
        for line in each_text:
            fp.write(line + "\n")

login()
data = get_books()
folder_path = "书房"
if not os.path.exists(folder_path):
    os.mkdir(folder_path)
get_each_book(data)
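Two optional hardening ideas, sketched here and not part of the original script: chapter titles scraped from the page can contain characters that Windows does not allow in file names, and os.makedirs with exist_ok=True replaces the separate os.path.exists checks. The sample title below is made up for illustration.

import os
import re

def safe_filename(name):
    # Replace characters that are invalid in Windows file names
    return re.sub(r'[\\/:*?"<>|]', "_", name).strip()

book_path = os.path.join("书房", safe_filename("第1卷: 风起?"))
os.makedirs(book_path, exist_ok=True)   # no need to check existence first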