
最近抽空写了一段代码,用于抓取微博用户发布在奉天承芸超话里的视频数据。代码已运行通过,抓取代码如下:
等我稍闲下来,后续会再出一个避免重复抓取网页的优化版本。
def run(self):
    """Crawl the Weibo feed page by page and record matching video posts.

    Starting from ``self.basic_weibo_url``, each following page is requested
    by appending the ``since_id`` reported in the previous page's
    ``cardlistInfo``. For every card whose ``mblog.source`` equals
    ``self.weibo_str`` and whose ``page_info.type`` is ``"video"``, one row
    is appended to the module-level worksheet ``ws1``; the module-level
    workbook ``wb`` is written to ``media.xlsx`` after each page that
    produced rows. (``ws1``/``wb`` are defined outside this function --
    presumably an openpyxl worksheet/workbook; confirm against the rest of
    the file.)

    Side effects: updates ``self.url``, ``self.since_id``, ``self.item_num``
    and ``self.end_flag``; prints progress messages.

    Returns:
        ``self.end_flag`` (``True``) when crawling stops early because of a
        non-200 response, an undecodable body, a missing ``cardlistInfo`` or
        an empty ``since_id``. ``None`` when the page loop runs to completion
        or is left because ``self.end_flag`` was already set.
    """
    # The session is used as a context manager so its connections are
    # released on every exit path (early return or exception).
    with requests.Session() as session:
        # Original note: for now only crawl self.TotalPage = 200 pages.
        # NOTE(review): range(1, TotalPage) requests at most TotalPage - 1
        # pages -- confirm whether that is intended.
        for page in range(1, self.TotalPage):
            time.sleep(1)  # throttle: one request per second
            if self.end_flag:
                break

            if page == 1:
                self.url = self.basic_weibo_url
            else:
                self.url = self.basic_weibo_url + '&since_id=' + str(self.since_id)

            weibo_response = session.get(self.url, timeout=(3.15, 10))

            # Check the status before decoding: an error page is often not
            # JSON, and decoding it first would raise instead of stopping
            # cleanly.
            if weibo_response.status_code != 200:
                print("end process for reponse.status_code=", weibo_response.status_code)
                self.end_flag = True
                return self.end_flag

            try:
                weibo_data = weibo_response.json()
            except ValueError:
                # requests' JSON decode errors derive from ValueError.
                print("end process for response body is not valid JSON")
                self.end_flag = True
                return self.end_flag

            data = weibo_data.get('data') or {}

            cardlist_info = data.get('cardlistInfo')
            if not cardlist_info:
                print("end process for there is not cardlistInfo")
                self.end_flag = True
                return self.end_flag

            # A missing since_id (None) is treated like an empty one, so the
            # next request never carries "&since_id=None".
            self.since_id = cardlist_info.get('since_id')
            if not self.since_id:
                print("end process for since_id is empty")
                self.end_flag = True
                return self.end_flag

            # Default to an empty list so a page without cards neither fails
            # nor re-processes the previous page's cards.
            rows_added = False
            for card in data.get('cards') or []:
                m_blog = card.get('mblog')
                if not m_blog:
                    continue

                # Original note: keep only posts published inside the
                # 奉天承芸 super topic (matched through the post's source).
                if m_blog.get('source') != self.weibo_str:
                    continue

                page_info = m_blog.get('page_info')
                if not page_info or page_info.get('type') != "video":
                    continue

                screen_name = (m_blog.get('user') or {}).get('screen_name')
                created_at = m_blog.get('created_at')
                content2 = page_info.get('content2')
                page_url = page_info.get('page_url')
                play_count = page_info.get('play_count')
                page_title = page_info.get('title')
                # Falls back to '' so a video without a thumbnail does not
                # inherit the previous video's picture URL.
                video_pic_url = (page_info.get('page_pic') or {}).get('url', '')

                self.item_num = self.item_num + 1
                print("append")
                ws1.append(
                    [str(self.item_num), page_title, screen_name, content2, "1-视频", "3-双人合作舞台-饭拍",
                     "2-微博", page_url, video_pic_url, created_at, play_count, '', ''])
                rows_added = True

            # Saving once per page instead of once per row avoids rewriting
            # the whole workbook for every appended row.
            if rows_added:
                wb.save('media.xlsx')


# Feel free to share; when reposting, please credit the source: 内存溢出
微信扫一扫
支付宝扫一扫
评论列表(0条)