Python Crawler + IDM: Quickly Download a Whole Site

Published on 2021-10-26


Prerequisites

  • Python 3.8
  • requests
  • pip install beautifulsoup4
  • pip install lxml
  • re (standard library)
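
re ships with the standard library; assuming pip is available, the third-party packages can all be installed in one go:

pip install requests beautifulsoup4 lxml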

Basic expressions

Delete everything up to (and including) a marker
replacedStr = re.sub(r".*(标记)", "", context_all)    # "标记" stands for whatever marker text you cut at
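
A minimal self-contained illustration of the pattern; the marker string END-OF-HEADER is just a made-up example:

import re

context_all = "junk junk END-OF-HEADER real content"
print(re.sub(r".*(END-OF-HEADER)", "", context_all))   # -> " real content"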

Strip all symbols from a string (keeping letters, digits and Chinese characters)
strs = "how aasd[的啊外脚///...手架公司much for the maple syrup? $20.99? That's ridiculous!!!"
print (strs)
# inside a character class '|' is literal, so this removes ? | $ . and !
nstr = re.sub(r'[?|$|.|!]',r'',strs)
print (nstr)
# keep only ASCII letters, digits, spaces and CJK characters (\u4e00-\u9fa5)
nestr = re.sub(r'[^a-zA-Z0-9 \u4e00-\u9fa5]',r'',nstr)
print (nestr)


Delete the first few groups of digits
inputStr = "hello 123 world 456 nihao 789"
# the count argument (2) limits the substitution to the first two matches
replacedStr = re.sub(r"(?P<number>\d+)", '', inputStr, 2)
print (replacedStr)

Substitution with a match-count limit (count = 1: only the first match is replaced)
replacedStr = (re.sub(r"</a>,.*?top\">","\r", replacedStr,1))


Python regular expressions:
.  matches any single character (except \n)
[ ]  matches any one character listed inside the brackets
\d  matches a digit, 0-9
\D  matches anything that is not a digit
\s  matches whitespace, i.e. space and tab
\S  matches non-whitespace
\w  matches a word character: a-z, A-Z, 0-9, _ and (for Unicode strings) Chinese characters
\W  matches a non-word character: not a letter, digit, underscore or Chinese character
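
A quick check of the \w behaviour listed above; Python 3 str patterns are Unicode-aware by default, so Chinese characters count as word characters:

import re

print(re.findall(r"\w+", "abc 123 你好 _x!"))   # ['abc', '123', '你好', '_x']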

Python + IDM

The script below fetches each listing page with requests, pulls the file names and download URLs out of the HTML with BeautifulSoup and regular expressions, and hands every URL to Internet Download Manager (IDM) through its command-line interface.

# -*- coding: utf-8 -*-
import os
import re        # regular expressions, used to cut names and URLs out of the HTML
import time      # used to pause between requests
import requests  # HTTP library used to fetch the pages

from bs4 import BeautifulSoup


# Hand a single download off to IDM's command-line interface
def IDMdownload(DownUrl, DownPath, FileName):
    IDMPath = r"C:\Program Files (x86)\Internet Download Manager"
    os.chdir(IDMPath)
    IDM = "IDMan.exe"
    print (FileName)
    # /d URL, /p local path, /f local file name, /n silent mode, /a add to queue
    command = ' '.join([IDM, '/d', DownUrl, '/p', DownPath, '/f', FileName, '/n', '/a'])
    os.system(command)

# Fetch one listing page and push every file it lists to IDM
def getDownloadUrl(rulpath):
    # start/end offsets of the name markers, URL markers and end markers found below
    nameN0 = []
    nameN1 = []
    nameR0 = []
    nameR1 = []
    nameD0 = []
    nameD1 = []
    # present a real browser User-Agent so the site is less likely to block the script
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.50',
    }

    context_url = rulpath
    # fetch the page
    req = requests.get(url = context_url, headers=headers)
    # decode as UTF-8
    req.encoding = 'utf-8'
    # raw HTML
    html = req.text
    # parse with BeautifulSoup
    soup = BeautifulSoup(html, 'lxml')
    # keep only the tables that list the files
    context_all = soup.find_all(['table'])
    context_all = str(context_all)
    # strip brackets we do not want anywhere in the text
    context_all = (re.sub(r"】", '' , context_all))
    context_all = (re.sub(r"【", '' , context_all))

    # offsets of the marker that precedes each file name
    for N in re.finditer((r"<tr><td><a href="), context_all):
        nameN0.append(N.start())
        nameN1.append(N.end())
    # offsets of the marker that precedes each download URL
    for R in re.finditer((r"</a></td><td><div align=\"center\"><a href=\""), context_all):
        nameR0.append(R.start())
        nameR1.append(R.end())
    # offsets of the marker that ends each download URL
    for D in re.finditer((r"epub\"><img src="), context_all):
        nameD0.append(D.start())
        nameD1.append(D.end())

    for i in range(len(nameD0)):

        # slice the file name out of the HTML and clean it up
        filename = (context_all[(nameN1[i]+20):(nameR0[i]-1)]+'.epub')
        filename = (re.sub(r" ", '' , filename))
        filename = (re.sub(r"\[", '' , filename))
        filename = (re.sub(r"]", '' , filename))
        filename = (re.sub(r"\*", '' , filename))

        # slice out the download URL
        path = context_all[nameR1[i]:nameD0[i]]+'epub'

        # local folder the file is saved to
        DownPath = ("D:\\VYou\\re\\B\\")

        IDMdownload(path, DownPath, filename)
        #print(path, DownPath, filename)

        print("save -> %s " % filename)
        # pause to keep the load on the server low
        time.sleep(4)

if __name__ == "__main__":  # 当程序执行时
    #网页目标地址
    base_url = '*******************'

    for i in range(25,56):
        time.sleep(15)
        if i<10:
            print ('T0'+str(i))
            suffix_url = ('/T0'+str(i))
            getDownloadUrl(base_url+suffix_url)

        else:
            print ('T'+str(i))  
            suffix_url =('/T'+str(i))
            getDownloadUrl(base_url+suffix_url)
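
os.system builds the IDM command by joining strings with spaces, which breaks as soon as a path or file name still contains a space. A more robust variant (a sketch, not part of the original script) passes the arguments as a list through subprocess.run; the IDM switches are the same ones used above:

import subprocess

def IDMdownload_safe(DownUrl, DownPath, FileName):
    # full path to IDMan.exe; adjust if IDM is installed elsewhere
    idm_exe = r"C:\Program Files (x86)\Internet Download Manager\IDMan.exe"
    # /d URL, /p local path, /f file name, /n silent mode, /a add to the queue
    subprocess.run([idm_exe, '/d', DownUrl, '/p', DownPath, '/f', FileName, '/n', '/a'],
                   check=True)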

Python: save chapters to txt

The second script reads a book's chapter index, fetches each chapter page, strips the HTML and writes the text to a numbered .txt file.

# -*- coding: utf-8 -*-
import re        # regular expressions, used to clean up the chapter HTML
import requests  # HTTP library used to fetch the pages

from bs4 import BeautifulSoup


# counter used to number the output files
list_int = 0

def base_date(book_url):
    # full URL of every chapter page
    chapter_url_list = []

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36 Edg/88.0.705.50',
    }

    # response_1 is the response returned by requests.get for the book's index page;
    # response_1.text is the page's HTML as plain text
    response_1 = requests.get(book_url, headers=headers)
    response_1.encoding = 'utf-8'
    # text that contains the chapter list
    chapter = response_1.text

    # capture the relative chapter links from the index page
    regx = "<td><a href=..(.*).htm target=_blank>"
    chapter_href_list = re.findall(regx, chapter)

    # turn each relative link into a full URL (base_url is defined in __main__)
    for i in chapter_href_list:
        url = base_url + i + '.htm'
        chapter_url_list.append(url)

    return (chapter_url_list)

def get_content(target):
    # fetch one chapter page
    req = requests.get(url = target)
    req.encoding = 'utf-8'
    html = req.text

    soup = BeautifulSoup(html, 'lxml')

    # the chapter text sits in <p> and <a> tags
    context_all = soup.find_all(['p','a'])
    context_all = str(context_all)

    # drop everything before the first "#MENU-top" anchor, then strip the remaining tags
    replacedStr = (re.sub(r".*?<a href=\"#MENU-top\">","", context_all,1))
    replacedStr = (re.sub(r"</a>,.*?top\">","\r", replacedStr,1))
    replacedStr = (re.sub(r"</a>","\r", replacedStr))
    replacedStr = (re.sub(r", <p>","", replacedStr))
    replacedStr = (re.sub(r"<br/>","", replacedStr))
    replacedStr = (re.sub(r"</p> ","\r", replacedStr))
    replacedStr = (re.sub(r"</p>, <a href=.*","\r", replacedStr))

    return replacedStr

def write_date(target):
    global list_int
    list_int += 1
    context = get_content(target)
    #print (context)
    # the first line of the cleaned text is used as the chapter title
    book_name = context.split('\r', 1)[0]
    # keep only letters, digits, spaces and Chinese characters in the file name
    book_name_sort = re.sub(r'[^a-zA-Z0-9 \u4e00-\u9fa5]', r'', book_name)

    # write the chapter to a numbered .txt file
    with open(str(list_int) + book_name_sort + '.txt', mode='w', encoding='utf-8') as f:
        f.write(context)


if __name__ == "__main__":  # 当程序执行时
    # 调用函数
    book_url = '****'
    web_address = base_date(book_url)
    #print (web_address)
    for i in web_address:
        if i[-6:-5].isdigit():
            print (i)
            write_date(i)
            continue
            
    print("爬取完毕!")