Home >Backend Development >Python Tutorial >Python crawler code sample sharing

Python crawler code sample sharing

angryTom
angryTom (Original)
2020-03-06 15:19:12 · 12346 views

This article shares three example Python crawler projects built on the urllib2 library (Python 2). The sample code is shown in full and should be a useful reference for study or work; readers who need it can refer to it.

Python crawler code sample sharing

Python crawler code example sharing

1. Crawl Neihanba joke posts: (Recommended learning: Python Video Tutorial)

Note: Some of this code may no longer run as-is, but it still has reference value.

#encoding=utf-8
import urllib2
 
import re
 
 
class neihanba():
  '''
  Crawler for the article list pages of www.neihanpa.com.

  List pages are downloaded one at a time, titles and body text are
  extracted with regular expressions, and every extracted string is
  appended to a text file named after the page number.
  '''
  # Patterns are compiled once here instead of on every loop iteration.
  # NOTE(review): the rel="external nofollow" attribute in the title
  # pattern may have been injected by the site hosting this article
  # rather than being part of the target markup -- confirm against the
  # live pages before relying on it.
  _ITEM_RE=re.compile(r'<li class="piclist\d+">(.*?)</li>',re.S)
  _TITLE_RE=re.compile(r'<a href="/article/\d+\.html" rel="external nofollow" >(.*)</a>')
  _BODY_RE=re.compile(r'<div class="f18 mb20">(.*?)</div>',re.S)
  # Markup removed from titles and bodies respectively.
  _TITLE_TAGS=("<b>","</b>")
  _BODY_TAGS=("<p>","</p>","<br>","<br />")
  # The ";"-terminated forms come first so that no stray ";" is left
  # behind; the bare forms are kept for input that omits the semicolon.
  _QUOTE_ENTITIES=("&ldquo;","&rdquo;","&ldquo","&rdquo")

  def spider(self):
    '''
    Main scheduler of the crawler.

    Starts at page 1 and, after each page has been processed, asks on
    the console whether to go on. Any answer other than "y" stops.
    '''
    page=1
    while True:
      url="http://www.neihanpa.com/article/list_5_"+str(page)+".html"
      html=self.load(url)
      self.deal(html,page)
      panduan=raw_input("是否继续(y/n)!")
      if panduan!="y":
        break
      page+=1

  def load(self,url):
    '''
    Download the complete content behind a URL.
    :param url: address to fetch
    :return: the response body as returned by response.read()
    '''
    header = {
      "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    return response.read()

  def _strip(self,text,tokens):
    '''
    Return text with every occurrence of each string in tokens removed.
    '''
    for token in tokens:
      text=text.replace(token,"")
    return text

  def deal(self,html,page):
    '''
    Extract titles and body text from a downloaded list page and write
    them out via writeData.
    :param html: page content obtained from load
    :param page: number of the page being processed
    '''
    for item in self._ITEM_RE.findall(html):
      titles=self._TITLE_RE.findall(item)
      bodies=self._BODY_RE.findall(item)
      # Titles of an item are written before its bodies.
      for title in titles:
        self.writeData(self._strip(title,self._TITLE_TAGS),page)
      for body in bodies:
        text=self._strip(body,self._BODY_TAGS)
        for entity in self._QUOTE_ENTITIES:
          text=text.replace(entity,"\"")
        self.writeData(text,page)

  def writeData(self,context,page):
    '''
    Append one extracted string, followed by a newline, to the output
    file of the given page ("di<page>yehtml.txt" in the working directory).
    :param context: extracted text
    :param page: number of the page being processed
    '''
    fileName = "di" + str(page) + "yehtml.txt"
    with open(fileName, "a") as f:
      f.write(context + "\n")
# Script entry point: run the interactive crawler.
if __name__ == '__main__':
  n=neihanba()
  n.spider()

2. Crawl Zhilian:

#encoding=utf-8
import urllib
import urllib2
 
import re
 
 
class zhiLian():
  '''
  Crawler for job search results on sou.zhaopin.com.

  A result page is downloaded, the job links on it are extracted, each
  linked job page is downloaded in turn, and company, salary and work
  experience are written as one line to a per-page text file.
  '''
  # Patterns are compiled once here instead of on every call.
  # NOTE(review): the rel="external nofollow" attributes in the link
  # patterns may have been injected by the site hosting this article
  # rather than being part of the target markup -- confirm against the
  # live pages before relying on them.
  _LINK_RE=re.compile(r'<a\s+style="font-weight:\s+bold"\s+par="ssidkey=y&ss=\d+&ff=\d+&sg=\w+&so=\d+"\s+href="(.*?)" rel="external nofollow" target="_blank">.*?</a>',re.S)
  _COMPANY_RE=re.compile(r'<a\s+onclick=".*?"\s+href=".*?" rel="external nofollow" \s+target="_blank">(.*?)\s+.*?<img\s+class=".*?"\s+src=".*?"\s+border="\d+"\s+vinfo=".*?"></a>',re.S)
  _SALARY_RE=re.compile(r'<li><span>职位月薪:</span><strong>(.*?) <a.*?>.*?</a></strong></li>',re.S)
  _EXPERIENCE_RE=re.compile(r'<li><span>工作经验:</span><strong>(.*?)</strong></li>',re.S)
  # Icon tag removed from a matched company name.
  _ICON_TAG='<img title="专属页面" src="//img03.zhaopin.cn/2012/img/jobs/icon.png" border="0" />'

  def spider(self,position,workPlace):
    '''
    Main scheduler of the crawler.

    Starts at page 1 and, after each page has been processed, asks on
    the console whether to go on. Any answer other than "y" stops.
    :param position: job title to search for
    :param workPlace: work location to search in
    '''
    base="http://sou.zhaopin.com/jobs/searchresult.ashx?"
    base+=urllib.urlencode({"jl":workPlace})
    base+="&"
    base+=urllib.urlencode({"kw":position})
    page=1
    while True:
      # Build each page URL from the unchanged base. Appending to the
      # same string on every iteration made page N request "...&1&2...&N".
      # NOTE(review): the page number is still appended without a
      # parameter name, as before -- confirm the query key the site expects.
      pageUrl=base+"&"+str(page)
      html=self.load(pageUrl)
      self.deal1(html,page)
      panduan = raw_input("是否继续爬虫下一页(y/n)!")
      if panduan != "y":
        break
      page += 1

  def load(self,url):
    '''
    Download the complete content behind a URL.
    :param url: address to fetch
    :return: the response body as returned by response.read()
    '''
    header = {
      "User-Agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    request = urllib2.Request(url, headers=header)
    response = urllib2.urlopen(request)
    return response.read()

  def deal1(self,html,page):
    '''
    Extract the job links from a result page and process each of them
    with deal2.
    :param html: result page content obtained from load
    :param page: number of the page being processed
    '''
    for link in self._LINK_RE.findall(html):
      self.deal2(link,page)

  def deal2(self,t,page):
    '''
    Download one job page, extract company, salary and work experience
    and write them as a single line via writeData.

    Every company and salary match is followed by a tab; experience
    matches are appended without a separator.
    :param t: URL of the job page
    :param page: number of the result page the link came from
    '''
    html=self.load(t)
    # Collected in a list and joined once, which also avoids naming a
    # local "str" and shadowing the builtin.
    fields=[name.replace(self._ICON_TAG,"")+"\t" for name in self._COMPANY_RE.findall(html)]
    fields+=[salary+"\t" for salary in self._SALARY_RE.findall(html)]
    fields+=self._EXPERIENCE_RE.findall(html)
    self.writeData("".join(fields),page)

  def writeData(self,context,page):
    '''
    Append one extracted line, followed by a newline, to the output
    file of the given page ("di<page>yehtml.txt" in the working directory).
    :param context: extracted text
    :param page: number of the page being processed
    '''
    fileName = "di" + str(page) + "yehtml.txt"
    with open(fileName, "a") as f:
      f.write(context + "\n")
# Script entry point: ask for the search terms, then run the crawler.
if __name__ == '__main__':
  position=raw_input("请输入职位:")
  workPlace=raw_input("请输入工作地点:")
  z=zhiLian()
  z.spider(position,workPlace)

3. Crawl Tieba:

#encoding=utf-8
import urllib
import urllib2
 
import re
 
 
class teiba():
  '''
  Crawler for the thread list pages of a forum on tieba.baidu.com.

  For every requested page the thread titles are extracted and appended
  to a text file named after the page number.
  '''
  # Patterns are compiled once here instead of on every call.
  # NOTE(review): the second rel attribute (rel="external nofollow") in
  # the title pattern may have been injected by the site hosting this
  # article rather than being part of the target markup -- confirm
  # against the live pages before relying on it.
  _TITLE_RE=re.compile(r'<a\s+rel="noreferrer"\s+href="/p/\d+" rel="external nofollow" \s+title=".*?"\s+target="_blank" class="j_th_tit\s+">(.*?)</a>',re.S)
  _TOPIC_TAG_RE=re.compile(r'<span\s+class="topic-tag"\s+data-name=".*?">#(.*?)#</span>')

  def spider(self,name,startPage,endPage):
    '''
    Crawl the pages startPage..endPage (both inclusive) of one forum.
    :param name: forum name, sent as the "kw" query parameter
    :param startPage: first page number to crawl
    :param endPage: last page number to crawl
    '''
    url="http://tieba.baidu.com/f?ie=utf-8&"
    url+=urllib.urlencode({"kw":name})
    for page in range(startPage,endPage+1):
      # "pn" advances in steps of 50 per page.
      pn=50*(page-1)
      urlFull=url+"&"+urllib.urlencode({"pn":pn})
      # Fetch the page-specific URL. Fetching the bare url here made
      # every iteration download the same page.
      html=self.loadPage(urlFull)
      self.dealPage(html,page)

  def loadPage(self,url):
    '''
    Download the complete content behind a URL.
    :param url: address to fetch
    :return: the response body as returned by response.read()
    '''
    header={
      "User-Agent":" Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"
    }
    request=urllib2.Request(url,headers=header)
    response=urllib2.urlopen(request)
    return response.read()

  def dealPage(self,html,page):
    '''
    Extract the thread titles from a downloaded page, remove embedded
    topic-tag spans and write each title via writePage.
    :param html: page content obtained from loadPage
    :param page: number of the page being processed
    '''
    for title in self._TITLE_RE.findall(html):
      self.writePage(self._TOPIC_TAG_RE.sub("",title),page)

  def writePage(self,context,page):
    '''
    Append one title, followed by a newline, to the output file of the
    given page ("di<page>yehtml.txt" in the working directory).
    :param context: extracted title
    :param page: number of the page being processed
    '''
    fileName="di"+str(page)+"yehtml.txt"
    with open(fileName,"a") as f:
      f.write(context+"\n")
# Script entry point: ask for the forum name and page range, then crawl.
if __name__ == '__main__':
  name=raw_input("请输入贴吧名:")
  startPage=raw_input("请输入起始页:")
  endPage=raw_input("请输入终止页:")
  t=teiba()
  # raw_input returns text, so the page bounds are converted to int here.
  t.spider(name,int(startPage),int(endPage))

For more related tutorials, please follow the Python Tutorial column.

The above is the detailed content of Python crawler code sample sharing. For more information, please follow other related articles on the PHP Chinese website!

Statement:
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn