Exemple de tutoriel d'implémentation d'un robot d'exploration avec requests et lxml - Tutoriel Python - php.cn

Exemple de tutoriel d'implémentation d'un robot d'exploration avec requests et lxml

PHP中文网

Publié : 2017-06-20 14:46:43

original

Consulté 2151 fois

# le module requests sert à demander les pages
# le module html de lxml construit le sélecteur (à partir de la réponse formatée)
# from lxml import html
# import requests

# response = requests.get(url).content

# selector = html.fromstring(response)

# hrefs = selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")

# Prenez l'url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html' comme exemple

# python 2.7
import requests
from lxml import html
import os

Copier après la connexion

# Collect the URLs of the child pages linked from the index page
def get_page_urls(url):
    """Return the hrefs of the feed items found on the page at *url*.

    The page is fetched with requests and parsed with lxml's html module;
    the result is a list holding every match of the feed-item XPath.
    """
    response = requests.get(url).content
    # Build the selector with lxml's html module
    selector = html.fromstring(response)
    return list(selector.xpath(
        "/html/body//div[@class='feed-item _j_feed_item']/a/@href"))

Copier après la connexion

# get title from a child's html (div[@class='title'])
def get_page_a_title(url):
    """Return the text nodes of the title div of a child page.

    url is one of the a/@href values taken from the ziyouxing index page.
    The value returned is the raw XPath result for
    /html/body//div[@class='title']/text().
    """
    response = requests.get(url).content
    selector = html.fromstring(response)
    # XPath found with Chrome's developer tools
    a_title = selector.xpath("/html/body//div[@class='title']/text()")
    return a_title

Copier après la connexion

# Build the page selector (with lxml's html module)
def get_selector(url):
    """Fetch *url* with requests and return the lxml tree parsed from it."""
    response = requests.get(url).content
    selector = html.fromstring(response)
    return selector

Copier après la connexion

# 通过chrome的开发者工具分析html页面结构后发现，我们需要获取的文本内容主要显示在div[@class='l-topic']和div[@class='p-section']中

Copier après la connexion

# Extract the text content we are interested in
def get_page_content(selector):
    """Return ``(page_title, page_content)`` for a parsed page.

    Both values are the raw results of ``selector.xpath``: the text of the
    paragraphs inside div[@class='l-topic'] and the text directly inside
    every div[@class='p-section'].
    """
    # full path: /html/body/div[2]/div[2]/div[1]/div[@class='l-topic']/p/text()
    page_title = selector.xpath("//div[@class='l-topic']/p/text()")
    # full path: /html/body/div[2]/div[2]/div[1]/div[2]/div[15]/div[@class='p-section']/text()
    page_content = selector.xpath("//div[@class='p-section']/text()")
    return page_title, page_content

Copier après la connexion

# Collect the URLs of the images on the page
def get_image_urls(selector):
    """Return the ``src`` values of every img[@class='_j_lazyload'] element.

    The value returned is the raw result of ``selector.xpath``.
    """
    imagesrcs = selector.xpath("//img[@class='_j_lazyload']/@src")
    return imagesrcs

Copier après la connexion

  # 获取图片的标题

Copier après la connexion

def get_image_title(selector, num):
    """Return the caption of the image held in the *num*-th content div.

    num is the index of the content div and starts at 2. The caption is
    the first non-blank text of span[@class='img-an'] inside that div,
    with surrounding whitespace removed. When the page has no caption for
    that div, the made-up name ``"map<num>"`` is returned instead, so the
    result is always a string that can be used to build a file name.
    """
    # str() lets callers pass the index as an int
    query = ("/html/body/div[2]/div[2]/div[1]/div[2]/div[" + str(num)
             + "]/span[@class='img-an']/text()")
    # xpath() yields an empty result rather than None when nothing matches,
    # so check the content instead of comparing with None
    for text in selector.xpath(query):
        caption = text.strip()
        if caption:
            return caption
    return "map" + str(num)  # no caption available: make one up

Copier après la connexion

  # 下载图片

Copier après la connexion

def downloadimages(selector, number):
    """Download every image of a parsed page into its result directory.

    selector is the parsed page; number is the page counter and selects
    the output directory /home/WorkSpace/tour/words/result<number>/.
    Each image is saved as <caption>.jpg, the caption coming from
    get_image_title.
    """
    urls = get_image_urls(selector)
    amount = len(urls)
    directory = os.path.join("/home/WorkSpace/tour/words",
                             "result" + str(number))
    # Create the directory that holds the files, not a directory named
    # after the image file itself
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Captions are looked up by content-div index, which starts at 2
    for num, url in enumerate(urls, 2):
        image_title = get_image_title(selector, num)
        # Tolerate a get_image_title that hands back the raw XPath result
        if isinstance(image_title, (list, tuple)):
            image_title = image_title[0] if image_title else "map" + str(num)
        filename = os.path.join(directory, image_title + ".jpg")
        print('downloading %s image %s' % (number, image_title))
        # Fetch first so a failed request does not leave an empty file
        data = requests.get(url).content
        with open(filename, 'wb') as f:
            f.write(data)
    print("已经下载了%s张图" % amount)

Copier après la connexion

# Entry point: crawl the pages and store the scraped data in files
if __name__ == '__main__':
    # Index page used as the example throughout this tutorial
    url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html'
    urls = get_page_urls(url)
    # turn to get response from html; number counts the pages from 1
    for number, i in enumerate(urls, 1):
        selector = get_selector(i)
        # download images
        downloadimages(selector, number)
        # get text and write into a file
        page_title, page_content = get_page_content(selector)
        # Both values are lists of text nodes, so join them before
        # concatenating
        result = '\n'.join(page_title) + '\n' + '\n'.join(page_content) + '\n\n'
        path = "/home/WorkSpace/tour/words/result" + str(number) + "/"
        if not os.path.exists(path):
            os.makedirs(path)
        # NOTE: the file name is the literal "num.txt", as in the original;
        # the directory already carries the page number
        filename = path + "num" + ".txt"
        with open(filename, 'wb') as f:
            # The file is opened in binary mode, so encode the text
            f.write(result.encode('utf-8'))
        print(result)

Copier après la connexion

Le robot d'exploration est maintenant terminé. Il faut analyser soigneusement la structure HTML avant d'explorer une page. Certaines pages sont générées par JavaScript ; celle-ci est relativement simple et n'implique aucun traitement JavaScript. Ce sujet sera abordé dans de prochains articles.

Ce qui précède est le contenu détaillé de « Exemple de tutoriel d'implémentation d'un robot d'exploration avec requests et lxml ». Pour plus d'informations, consultez les autres articles connexes sur le site PHP中文网 (php.cn) !