反爬策略会检测浏览器是否正被自动化工具操控。webdriver 默认会显示一条"正受到自动测试软件控制"的友好提示,但我们希望把这些自动化痕迹隐藏起来,一般在 middlewares.py 中添加如下中间件:
from selenium import webdriver
import time
from scrapy.http import HtmlResponse
class HandlessMiddleware(object):
    """Scrapy downloader middleware that owns a Chrome instance with its
    automation markers hidden.

    The constructor starts Chrome through Selenium, removes the
    "controlled by automated software" switch and injects a script that
    makes ``navigator.webdriver`` read as ``undefined`` on every new
    document. The browser is exposed as ``self.browser``.
    """

    def __init__(self):
        option = webdriver.ChromeOptions()
        option.add_argument("--incognito")  # run in incognito (private) mode
        # option.add_argument('--disable-gpu')  # optional: disable the GPU
        option.add_argument('lang=zh_CN.UTF-8')  # language argument passed to Chrome

        # Remove the automation-control infobar / switch.
        option.add_experimental_option("excludeSwitches", ["enable-automation"])
        option.add_experimental_option('useAutomationExtension', False)

        # Optional: random user agent (requires a ``self.ua`` provider,
        # which this class does not define).
        # option.add_argument('user-agent=' + self.ua.random)
        # Optional: run without a visible window.
        # option.add_argument('headless')

        prefs = {
            # "profile.managed_default_content_settings.images": 2,  # do not load images
            # 'permissions.default.stylesheet': 2,  # do not load css
        }
        option.add_experimental_option("prefs", prefs)

        # ``options=`` replaces the deprecated ``chrome_options=`` keyword,
        # which later Selenium 4 releases no longer accept.
        self.browser = webdriver.Chrome(options=option)

        # Run before any page script on every new document so that
        # ``navigator.webdriver`` no longer reveals the automation flag.
        self.browser.execute_cdp_cmd("Page.addScriptToEvaluateOnNewDocument", {
            "source": """
Object.defineProperty(navigator, 'webdriver', {
get: () => undefined
})
"""
        })
        self.browser.implicitly_wait(5)  # wait up to 5s when locating elements

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and register browser cleanup.

        Connects ``spider_closed`` to the crawler's signal of the same
        name so the Chrome process is shut down when the spider finishes.
        """
        from scrapy import signals

        middleware = cls()
        crawler.signals.connect(middleware.spider_closed,
                                signal=signals.spider_closed)
        return middleware

    def spider_closed(self, spider):
        """Quit the browser so no Chrome process is left behind."""
        self.browser.quit()

    def process_request(self, request, spider):
        """Leave the request untouched.

        Returns ``None``, which tells Scrapy to continue handling the
        request with the remaining middlewares and the downloader.
        """
        return None
然后在 settings.py 中添加配置:
# Downloader middleware registry: a value of None disables an entry,
# an integer sets its position in the middleware chain.
DOWNLOADER_MIDDLEWARES = {
    # Enable the Selenium-backed middleware.
    '项目名.middlewares.HandlessMiddleware': 200,
    # Switch off the project's default downloader middleware.
    '项目名.middlewares.MeituanPurchaseDownloaderMiddleware': None,
}