
Today I'm sharing a script for scraping Sogou Wenwen (搜狗问问) answers and assembling them into articles: for each keyword it searches Sogou restricted to wenwen.sogou.com, resolves the top results to their question pages, cleans the answers, and saves them to a text file. Hope it helps.
The code:
# coding: utf-8
# Author: 小章哥儿
# Date: 2021-08-03
import re
import time

import requests
from lxml import etree

# If collection stops working, this User-Agent is the first thing to swap out.
UA = ('Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 '
      '(KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36')


class Sogou():
    def __init__(self):
        self.headers = {'User-Agent': UA}

    def get_html(self, keyword):
        # The insite parameter restricts the Sogou search to Wenwen results.
        url = f'https://www.sogou.com/sogou?query={keyword}&ie=utf8&insite=wenwen.sogou.com'
        html = requests.get(url, headers=self.headers)
        return html.text
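    # Illustrative call (the keyword here is just an example, not from the
    # original post): get_html('python') requests
    #   https://www.sogou.com/sogou?query=python&ie=utf8&insite=wenwen.sogou.com
    # and returns the raw HTML of that result page.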
    def collect_urls(self, keyword):
        """
        Collect the titles and links of the top Q&A results for a keyword,
        returned as a list of (title, url) tuples.
        """
        html = self.get_html(keyword)
        selector = etree.HTML(html)
        questions = [i.xpath('string(.)').replace('搜狗问问', '').replace('搜狗', '').replace('-', '')
                     for i in selector.xpath('//div[@class="vrwrap"]/h3[@class="vrTitle"]/a')]
        # slicing never raises, so link[:5] alone covers every list length
        links = ['https://www.sogou.com' + i
                 for i in selector.xpath('//div[@class="vrwrap"]/h3[@class="vrTitle"]/a/@href')][:5]
        uu = []
        try:
            for i in links:
                page = requests.get(i, headers=self.headers).text
                obj = re.search('https://wenwen.sogou.com/z/(.*?).htm', page)
                wenwen_url = f'https://wenwen.sogou.com/z/{obj.group(1)}.htm'
                self.parser_answer(wenwen_url, keyword)
                uu.append(wenwen_url)  # keep the resolved URL, not the page HTML
        except AttributeError:
            print('Failed to resolve a collected link')
        pairs = list(zip(questions, uu))
        return pairs[:5]
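    # The anchors collected above point at www.sogou.com redirect pages, not
    # at the question pages themselves; that is why each one is fetched and
    # the canonical https://wenwen.sogou.com/z/<id>.htm URL is recovered from
    # the page body with re.search before parser_answer() is called on it.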
    # Collect the answer content from one question page.
    def parser_answer(self, url, keyword):
        """
        Scrape the answers on a question page, clean them up, and append
        one to <keyword>.txt.
        """
        html = requests.get(url, headers=self.headers)
        selector = etree.HTML(html.text)
        answers = []
        for i in selector.xpath('//pre'):
            an = i.xpath('string(.)')
            # strip private-use glyphs and layout characters left in the page text
            for ch in ('\u2003', '\ue010', '\ufeff', '\u3000', '\u2022',
                       '\ue5e5', '\n', '\xa0', '\r', '\u339b'):
                an = an.replace(ch, '')
            an = an.replace(' ', '。')
            # str.replace() does not understand regexes, so the original call
            # was a no-op; re.sub() strips stray escape remnants as intended
            an = re.sub(r'^\\u[A-Za-z0-9]{1,5}|\\x[A-Za-z0-9]{1,4}$', '', an)
            answers.append(an)
        # drop answers that are themselves questions or are nearly empty
        answers = [i for i in answers
                   if '?' not in i and '?' not in i and len(set(i)) > 2 and '为什么' not in i]
        # map each answer to its length, then sort ascending by length
        answer_dict = {answer: len(answer) for answer in answers}
        answers = [i[0] for i in sorted(answer_dict.items(), key=lambda item: item[1])]
        # raw string: a plain 'C:\Users\...' literal is a SyntaxError in Python 3
        path = rf'C:\Users\Administrator.PC-20190922SNXD\Desktop\搜狗问问\{keyword}.txt'
        with open(path, 'a+', encoding='utf-8') as f:
            try:
                f.write('00' + str(answers[0]) + '\n\n')
            except IndexError:
                print('No usable answer collected for this page')
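# A note on the sort in parser_answer: sorted() is ascending, so the
# answers[0] written to the file is the *shortest* answer that survived
# filtering. If the longest answer is wanted instead, passing reverse=True
# to sorted() is a one-line change -- an assumption about intent, not part
# of the original post.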
def star():
    so = Sogou()
    # raw string again, for the same backslash reason as above
    txtfile = r'C:\Users\Administrator.PC-20190922SNXD\Desktop\key.txt'
    # utf-8-sig silently strips the BOM that Windows Notepad prepends, which
    # is what the original UnicodeEncodeError branch tried (and failed) to do
    with open(txtfile, 'r', encoding='utf-8-sig') as f:
        for line in f:
            keyword = line.strip()
            if not keyword:
                continue
            time.sleep(3)  # be polite between keywords
            so.collect_urls(keyword)
            print(f'{keyword} *** article assembled')
if __name__ == '__main__':
    """
    Two things must be set up by hand before running:
    1. The output directory used in parser_answer -- the desktop folder
       named "搜狗问问" -- must already exist.
    2. The keyword file used in star() (key.txt on the desktop) must
       already exist, one keyword per line.
    Note: if nothing gets collected, replace the UA string at the top of
    the script with a fresh User-Agent.
    """
    star()
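As a quick usage sketch (the keywords below are made-up examples, and both paths in the script are assumed to exist): if key.txt on the desktop contains

爬虫是什么
python基础教程

then running the script creates 爬虫是什么.txt and python基础教程.txt inside the desktop 搜狗问问 folder, each appended with one cleaned answer per question page collected (up to five per keyword).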