# module requests : sert à télécharger les pages
# module html de lxml : construit le sélecteur à partir de la réponse
# from lxml import html
# import requests
# response = requests.get(url).content
# selector = html.fromstring(response)
# hrefs = selector.xpath("/html/body//div[@class='feed-item _j_feed_item']/a/@href")
# Prenez l'url = 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html' comme exemple
# python 2.7
import os

import requests
from lxml import html
def get_page_urls(url):
    """Return the child-page links found on the listing page at ``url``.

    The page is downloaded with ``requests`` and parsed with ``lxml.html``.
    Every ``href`` of an ``<a>`` sitting directly under a ``div`` whose class
    attribute is exactly ``feed-item _j_feed_item`` is collected.

    Args:
        url: Address of the listing (home) page.

    Returns:
        A list with one href string per matching link; empty when nothing
        matches.
    """
    response = requests.get(url).content
    # Build the selector with lxml's html module.
    selector = html.fromstring(response)
    return list(selector.xpath(
        "/html/body//div[@class='feed-item _j_feed_item']/a/@href"))
def get_page_a_title(url):
    """Return the title text of a child page.

    Args:
        url: Address of a child page (an ``a/@href`` taken from the listing
            page).

    Returns:
        The list of text nodes found in ``div[@class='title']`` under the
        page body; empty when the page has no such div.
    """
    response = requests.get(url).content
    selector = html.fromstring(response)
    # XPath obtained with Chrome's developer tools.
    a_title = selector.xpath("/html/body//div[@class='title']/text()")
    return a_title
def get_selector(url):
    """Download ``url`` and return it parsed as an lxml HTML tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The object produced by ``lxml.html.fromstring`` for the page body;
        the other helpers query it through its ``xpath`` method.
    """
    response = requests.get(url).content
    return html.fromstring(response)
def get_page_content(selector):
    """Extract the title and body text of a page.

    Inspecting the page with Chrome's developer tools showed that the text
    of interest lives in ``div[@class='l-topic']`` (title paragraphs) and
    ``div[@class='p-section']`` (body sections).

    Args:
        selector: Parsed page exposing an ``xpath`` method.

    Returns:
        A ``(page_title, page_content)`` tuple; each element is whatever
        ``selector.xpath`` returns for the corresponding query (a list of
        text nodes for an lxml tree).
    """
    # Absolute form: /html/body/div[2]/div[2]/div[1]/div[@class='l-topic']/p/text()
    page_title = selector.xpath("//div[@class='l-topic']/p/text()")
    # Absolute form: /html/body/div[2]/div[2]/div[1]/div[2]/div[15]/div[@class='p-section']/text()
    page_content = selector.xpath("//div[@class='p-section']/text()")
    return page_title, page_content
def get_image_urls(selector):
    """Return the ``src`` of every ``<img class="_j_lazyload">`` in the page.

    Args:
        selector: Parsed page exposing an ``xpath`` method.

    Returns:
        The result of the XPath query (a list of URL strings for an lxml
        tree; empty when the page has no such image).
    """
    imagesrcs = selector.xpath("//img[@class='_j_lazyload']/@src")
    return imagesrcs
# Get the caption of an image
def get_image_title(selector, num):
    """Return the caption of the image held in the ``num``-th content div.

    Args:
        selector: Parsed page exposing an ``xpath`` method.
        num: Position of the content div (int or numeric string); the
            caller starts counting at 2.

    Returns:
        The first non-blank caption text, stripped of surrounding
        whitespace, or the made-up name ``"map<num>"`` when the div has no
        ``span[@class='img-an']`` caption.
    """
    # str() lets callers pass the counter as an int without a TypeError.
    query = ("/html/body/div[2]/div[2]/div[1]/div[2]/div[" + str(num) +
             "]/span[@class='img-an']/text()")
    # xpath() returns a (possibly empty) list, never None, so test the
    # content instead of comparing against None.
    for text in selector.xpath(query):
        text = text.strip()
        if text:
            return text
    return "map" + str(num)  # no caption: make one up
# Download the images
def downloadimages(selector, number):
    """Download every lazy-loaded image of a page into its result folder.

    Images are saved as
    ``/home/WorkSpace/tour/words/result<number>/<caption>.jpg``.

    Args:
        selector: Parsed page exposing an ``xpath`` method.
        number: Counter identifying the page (int or string); it names the
            output folder.
    """
    urls = get_image_urls(selector)
    directory = "/home/WorkSpace/tour/words/result" + str(number)
    # Create the folder, not the image path itself, otherwise open() below
    # would hit a directory instead of a file.
    if not os.path.exists(directory):
        os.makedirs(directory)
    # Captions are looked up by div position, which starts at 2.
    for num, url in enumerate(urls, 2):
        image_title = get_image_title(selector, str(num))
        if isinstance(image_title, (list, tuple)):
            # A raw xpath() result: keep its first text node, if any.
            image_title = image_title[0] if image_title else ""
        image_title = image_title.strip() or "map" + str(num)
        filename = directory + "/" + image_title + ".jpg"
        print('downloading %s image %s' % (number, image_title))
        with open(filename, 'wb') as f:
            f.write(requests.get(url).content)
    # Report the number of images, not the div counter (which starts at 2).
    print("已经下载了%s张图" % len(urls))
# Entry point: crawl every child page, download its images and store its
# text in a file.
if __name__ == '__main__':
    # Listing page to crawl, e.g.
    # 'https://www.mafengwo.cn/gonglve/ziyouxing/2033.html'
    url = ''
    urls = get_page_urls(url)
    # One result folder per child page, numbered from 1.
    number = 1
    for i in urls:
        # Parse the child page once and reuse the tree below.
        selector = get_selector(i)
        # download images
        downloadimages(selector, number)
        # get text and write into a file
        page_title, page_content = get_page_content(selector)
        # Both values are lists of text nodes; join them before concatenating.
        result = '\n'.join(page_title) + '\n' + '\n'.join(page_content) + '\n\n'
        path = "/home/WorkSpace/tour/words/result" + str(number) + "/"
        if not os.path.exists(path):
            os.makedirs(path)
        filename = path + "num" + ".txt"
        # The file is opened in binary mode, so encode the text explicitly.
        with open(filename, 'wb') as f:
            f.write(result.encode('utf-8'))
        print(result)
        number += 1
# Le robot d'exploration est maintenant terminé. Il faut analyser soigneusement la structure HTML avant d'explorer une page. Certaines pages sont générées par JavaScript ; celle-ci est relativement simple et ne nécessite aucun traitement JS. Des partages à ce sujet suivront dans de prochains articles.
# Ce qui précède est le contenu détaillé de cet article. Pour plus d'informations, consultez les autres articles connexes sur le site PHP chinois !