A Web Crawler Implemented in Python



This article presents a working web crawler implemented in Python, shared for your reference. The details are as follows:

It mainly uses the urllib2 and BeautifulSoup (bs4) modules, with MySQLdb for storage. Note that this is Python 2 code: urllib2, reload(sys), and the print statement do not exist in Python 3.
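To make the full script easier to follow, here is a minimal sketch of the fetch-and-parse pattern it repeats: request a page with urllib2 under a browser User-Agent, then query the parse tree with bs4. The listing URL and the promo-feed-headline class are the ones used in the script below; everything else is standard urllib2/bs4 usage (Python 2, bs4 installed):

# -*- coding: utf-8 -*-
# Minimal fetch-and-parse sketch of the pattern the full script builds on.
import urllib2
from bs4 import BeautifulSoup

url = 'http://www.whowhatwear.com/section/fashion-trends/page/1'
req = urllib2.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
html = urllib2.urlopen(req).read()

soup = BeautifulSoup(html, 'html.parser')
# Print the text of every headline block on the listing page.
for headline in soup.find_all('div', {'class': 'promo-feed-headline'}):
    h3 = headline.find('h3')
    if h3 is not None:
        print h3.get_text()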

# encoding=utf-8
import re
import urllib2
import datetime
import MySQLdb
from bs4 import BeautifulSoup
import sys

# Python 2 hack so implicit str/unicode conversions default to UTF-8.
reload(sys)
sys.setdefaultencoding("utf-8")

class Spider(object):
    def __init__(self):
        print u'Starting to crawl...'

    # Fetch the raw HTML of a page.
    def getsource(self, url):
        headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_3) '
                                 'AppleWebKit/537.36 (KHTML, like Gecko) '
                                 'Chrome/50.0.2652.0 Safari/537.36'}
        req = urllib2.Request(url=url, headers=headers)
        socket = urllib2.urlopen(req)
        content = socket.read()
        socket.close()
        return content

    # Generate the listing links from the current page up to total_page,
    # by rewriting the "page/N" segment of the URL.
    def changepage(self, url, total_page):
        now_page = int(re.search(r'page/(\d+)', url).group(1))
        page_group = []
        for i in range(now_page, total_page + 1):
            link = re.sub(r'page/(\d+)', 'page/%d' % i, url)
            page_group.append(link)
        return page_group

    # Fetch the body text and image list of a child (article) page.
    def getchildrencon(self, child_url):
        conobj = {}
        content = self.getsource(child_url)
        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
        content = soup.find('div', {'class': 'c-article_content'})
        img = re.findall('src="(.*?)"', str(content), re.S)
        conobj['con'] = content.get_text()
        conobj['img'] = ';'.join(img)
        return conobj

    # Parse a listing page and collect one record per headline.
    def getcontent(self, html_doc):
        soup = BeautifulSoup(html_doc, 'html.parser', from_encoding='utf-8')
        tags = soup.find_all('div', {'class': 'promo-feed-headline'})
        info = {}
        i = 0
        for link in tags:
            info[i] = {}
            title_desc = link.find('h3')
            info[i]['title'] = title_desc.get_text()
            post_date = link.find('div', {'class': 'post-date'})
            info[i]['content_time'] = post_date['data-date'][0:10]
            info[i]['source'] = 'whowhatwear'
            source_link = link.find('a', href=re.compile(r"section=fashion-trends"))
            source_url = 'http://www.whowhatwear.com' + source_link['href']
            info[i]['source_url'] = source_url
            # Follow the article link and pull its body, summary and images.
            in_content = self.getsource(source_url)
            in_soup = BeautifulSoup(in_content, 'html.parser', from_encoding='utf-8')
            soup_content = in_soup.find('section', {'class': 'widgets-list-content'})
            info[i]['content'] = soup_content.get_text().strip('\n')
            text_con = in_soup.find('section', {'class': 'text'})
            summary = text_con.get_text().strip('\n') if text_con is not None else ''
            info[i]['summary'] = summary[0:200] + '...'
            img_list = re.findall('src="(.*?)"', str(soup_content), re.S)
            info[i]['imgs'] = ';'.join(img_list)
            info[i]['create_time'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            i += 1
        return info

    # Write the collected records into MySQL.
    def saveinfo(self, content_info):
        conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                               port=3306, db='test', charset='utf8')
        cursor = conn.cursor()
        for each in content_info:
            for k, v in each.items():
                sql = ("insert into t_fashion_spider2"
                       "(`title`,`summary`,`content`,`content_time`,`imgs`,"
                       "`source`,`source_url`,`create_time`) "
                       "values ('%s','%s','%s','%s','%s','%s','%s','%s')" %
                       (MySQLdb.escape_string(v['title']),
                        MySQLdb.escape_string(v['summary']),
                        MySQLdb.escape_string(v['content']),
                        v['content_time'], v['imgs'], v['source'],
                        v['source_url'], v['create_time']))
                cursor.execute(sql)
        conn.commit()
        cursor.close()
        conn.close()

if __name__ == '__main__':
    classinfo = []
    p_num = 5
    url = 'http://www.whowhatwear.com/section/fashion-trends/page/1'
    jikespider = Spider()
    all_links = jikespider.changepage(url, p_num)
    for link in all_links:
        print u'Processing page: ' + link
        html = jikespider.getsource(link)
        info = jikespider.getcontent(html)
        classinfo.append(info)
    jikespider.saveinfo(classinfo)
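One caveat about saveinfo: building SQL by % string formatting plus escape_string is fragile (the original even had three placeholders for eight values). MySQLdb's parameterized execute is the safer idiom, since the driver escapes every value itself. A minimal sketch of the same insert in that style, assuming the same t_fashion_spider2 table, connection settings, and record shape as the script above:

# -*- coding: utf-8 -*-
# Parameterized variant of saveinfo (a sketch, not the original method).
# Assumes the same t_fashion_spider2 table and connection settings as above;
# content_info has the same shape as in the script: a list of {index: record} dicts.
import MySQLdb

def saveinfo_parameterized(content_info):
    conn = MySQLdb.connect(host='127.0.0.1', user='root', passwd='123456',
                           port=3306, db='test', charset='utf8')
    cursor = conn.cursor()
    sql = ("insert into t_fashion_spider2"
           "(`title`,`summary`,`content`,`content_time`,`imgs`,"
           "`source`,`source_url`,`create_time`) "
           "values (%s,%s,%s,%s,%s,%s,%s,%s)")
    for page in content_info:
        for v in page.values():
            # The driver quotes and escapes each tuple element itself.
            cursor.execute(sql, (v['title'], v['summary'], v['content'],
                                 v['content_time'], v['imgs'], v['source'],
                                 v['source_url'], v['create_time']))
    conn.commit()
    cursor.close()
    conn.close()

Because the values travel as a tuple instead of being spliced into the SQL string, quoting mistakes and injection risk disappear, and the manual escape_string calls become unnecessary.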


We hope this article is helpful to readers working on Python programming.
