
#引入模块
import requests
from lxml import etree
from urllib import request
import os
from requests.models import Response
# Scrape the free resume templates listed on sc.chinaz.com and save each
# template archive as a .rar file under SAVE_DIR.

# Page 1 of the listing has its own URL; later pages follow a numbered pattern.
FIRST_PAGE_URL = 'https://sc.chinaz.com/jianli/free.html'
PAGE_URL_TEMPLATE = 'https://sc.chinaz.com/jianli/free_%d.html'
# Last listing page to crawl (page 1 is always crawled). The original script
# crawled page 1 plus range(2, 3), i.e. pages 1 and 2.
LAST_PAGE = 2
# Folder the downloaded archives are written to.
SAVE_DIR = './jian'
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# NOTE(review): every `[@]` below is missing its attribute test (it looks like
# the id/class predicate was stripped when this snippet was copied). `[@]` is
# not a valid XPath expression, so these queries are expected to be rejected
# by lxml until the predicates are restored from the live page markup.
LISTING_LINKS_XPATH = '//div[@]//div/a/@href'
DOWNLOAD_LINKS_XPATH = '//div[@]/ul/li/a/@href'
TITLE_XPATH = '//div[@]/h1/text()'

# Characters that are not allowed in Windows file names are replaced by "_".
_FILENAME_TABLE = str.maketrans({char: '_' for char in '\\/:*?"<>|'})


def _absolute_url(base_url, href):
    """Resolve *href* (relative, protocol-relative or absolute) against *base_url*."""
    # Local import so this block does not depend on a new file-level import.
    from urllib.parse import urljoin
    return urljoin(base_url, href)


def _safe_filename(title):
    """Return *title* stripped and with path-hostile characters replaced."""
    cleaned = title.strip().translate(_FILENAME_TABLE)
    return cleaned or 'untitled'


def _fetch_tree(url):
    """Download *url* and return its HTML parsed into an lxml element tree.

    Raises:
        requests.RequestException: on network failure, timeout or HTTP error.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx instead of parsing an error page.
    response.raise_for_status()
    return etree.HTML(response.content.decode('utf-8'))


def _collect_detail_links(last_page):
    """Return absolute detail-page URLs from listing pages 1..*last_page*."""
    page_urls = [FIRST_PAGE_URL]
    page_urls.extend(PAGE_URL_TEMPLATE % number for number in range(2, last_page + 1))

    detail_links = []
    for page_url in page_urls:
        hrefs = _fetch_tree(page_url).xpath(LISTING_LINKS_XPATH)
        detail_links.extend(_absolute_url(page_url, href) for href in hrefs)
    return detail_links


def _download_resume(detail_url):
    """Download the archive linked from *detail_url* and return the saved path.

    Raises:
        LookupError: if the page has no title or fewer than two download links.
        requests.RequestException: on network failure, timeout or HTTP error.
        OSError: if the archive cannot be written to disk.
    """
    tree = _fetch_tree(detail_url)
    download_links = tree.xpath(DOWNLOAD_LINKS_XPATH)
    titles = tree.xpath(TITLE_XPATH)
    # The original script always took the second download link (index 1).
    if len(download_links) < 2 or not titles:
        raise LookupError('download link or title not found on page')

    archive_url = _absolute_url(detail_url, download_links[1])
    archive = requests.get(archive_url, timeout=REQUEST_TIMEOUT)
    archive.raise_for_status()

    target_path = os.path.join(SAVE_DIR, _safe_filename(str(titles[0])) + '.rar')
    with open(target_path, 'wb') as archive_file:
        archive_file.write(archive.content)
    return target_path


def main():
    """Crawl the listing pages and download every resume template found."""
    # Create the output folder if needed; an existing folder is reused.
    os.makedirs(SAVE_DIR, exist_ok=True)

    for detail_url in _collect_detail_links(LAST_PAGE):
        try:
            _download_resume(detail_url)
        except (requests.RequestException, UnicodeDecodeError,
                LookupError, OSError) as error:
            # Report which page failed and move on to the next one instead of
            # continuing with missing or stale data.
            print('发生错误请查看:', detail_url, error)
            continue

    print('执行完毕!!!')


if __name__ == '__main__':
    main()
# The URLs used in this article are for learning purposes only. If they
# infringe any rights, please contact the author and they will be removed
# immediately.
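# --- Web-page footer captured together with the snippet (not part of the script) ---
# Sharing is welcome; when reposting, please credit the source: 内存溢出
# Scan with WeChat
# Scan with Alipay
# Comments (0)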