
Example: downloading torrents from web pages with urllib2 in Python

Fetching torrent files with the urllib2 and re modules.

Approach

1. Log in to the forum programmatically (only needed if the board requires a login to view); a minimal login sketch follows this list.

2. Visit the target board.

3. Iterate over the threads: fetch a given listing page, then collect the URL of every thread on that page.

4. Visit each thread URL in turn and extract the torrent download address from the thread's page source, either with regular expressions or with a third-party page-parsing library (a sketch of the latter follows the listing).

5. Visit the torrent page and download the .torrent file.
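The listing below never actually logs in (cookielib is imported but unused), so here is a minimal sketch of what step 1 could look like with urllib2 plus cookielib. The login URL (login.php) and the form field names (username, password) are assumptions; adjust them to whatever the forum's login form actually posts.

import urllib
import urllib2
import cookielib

# Keep cookies between requests so the session survives the login.
cookies = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookies))
opener.addheaders = [('User-Agent', 'Mozilla/5.0')]

# Assumed login endpoint and field names -- check the forum's real login form.
loginData = urllib.urlencode({'username': 'me', 'password': 'secret'})
opener.open('http://xxx.yyy.zzz/login.php', loginData)  # POST the credentials

# Requests made through `opener` now carry the session cookie,
# so a members-only board can be read like this:
content = opener.open('http://xxx.yyy.zzz/thread0806.php?fid=x&search=&page=1').read()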

The code is as follows:


import urllib
import urllib2
import cookielib
import re
import sys
import os

# site is the website address | fid is the board id
site = "http://xxx.yyy.zzz/"
source = "thread0806.php?fid=x&search=&page="

# Directory where the downloaded .torrent files are saved.
btSave = "./clyzwm/"
if os.path.isdir(btSave):
    print btSave + " already exists"
else:
    os.mkdir(btSave)

logfile = "./clyzwm/down.log"
errorfile = "./clyzwm/error.log"
sucfile = "./clyzwm/sucess.log"

headers = {'User-Agent' : 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36',
           'Referer' : 'http://xxx.yyy.zzz/'}

# Download the .torrent behind a torrent link page into dirPath.
def btDown(url, dirPath):
    logger(logfile, "download file : " + url)
    try:
        #pageCode = urllib2.urlopen(url).read()
        #print pageCode
        btStep1 = re.findall('http://[\w]+\.[\w]+\.[\w]{0,4}/[\w]{2,6}\.php\?[\w]{2,6}=([\w]+)', url, re.I)
        #print btStep1
        if len(btStep1) > 0:
            ref = btStep1[0]
            downsite = ""
            downData = {}
            if len(ref) > 20:
                # Long ref: post back to the same site's download.php together
                # with the hidden "reff" field taken from the page.
                downsite = re.findall('http://www.[\w]+\.[\w]+/', url)[0]
                downsite = downsite + "download.php"
                reff = re.findall('input\stype=\"hidden\"\sname=\"reff\"\svalue=\"([\w=]+)\"', urllib2.urlopen(url).read(), re.I)[0]
                downData = {'ref': ref, 'reff': reff, 'submit': 'download'}
            else:
                downsite = "http://www.downhh.com/download.php"
                downData = {'ref': ref, 'rulesubmit': 'download'}
            #print "bt site - " + downsite + "\n downData:"
            #print downData
            downData = urllib.urlencode(downData)
            downReq = urllib2.Request(downsite, downData)
            downReq.add_header('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1700.77 Safari/537.36')
            downPost = urllib2.urlopen(downReq)
            stream = downPost.read(-1)
            if len(stream) > 1000:
                downPost.close()
                name = btStep1[0] + ".torrent"
                fw = open(dirPath + name, 'wb')  # binary mode so the torrent data is not mangled
                fw.write(stream)
                fw.close()
                logger(sucfile, url + "\n")
            else:
                # A tiny response usually means an error page rather than a torrent.
                logger(errorfile, url + "\n")
    except urllib2.URLError, e:
        print e.reason

# Print a message and append it to the given log file.
def logger(logfile, msg):
    print msg
    fw = open(logfile, 'a')
    fw.write(msg)
    fw.close()

# Walk the listing pages of the board.
for i in range(1, 1000):
    logger(logfile, "\n\n\n@ page " + str(i) + " ...")
    part = site + source + str(i)

    content = urllib2.urlopen(part).read()
    content = content.decode('gbk').encode('utf8')
    #print content

    # Collect the relative thread links on this listing page. The original
    # pattern was lost when the article was published (its HTML tags were
    # stripped out); the pattern below is a reconstruction that matches links
    # of the form htm_data/<board>/<year-month>/<thread>.html.
    pages = re.findall('htm_data/[\d]+/[\d]+/[\d]+\.html', content, re.I)
    #print pages

    for page in pages:
        page = site + page
        #logger(logfile, "\n# visiting " + page + " ...")
        pageCode = urllib2.urlopen(page).read()
        #print pageCode
        # Look for the redirect ("jump") link that points at the torrent page.
        zzJump = re.findall('http://www.viidii.info/\?http://[\w]+/[\w]+\?[\w]{2,6}=[\w]+', pageCode)
        #zzJump = re.findall('http://www.viidii.info/\?http://[\w/\?=]*', pageCode)
        if len(zzJump) > 0:
            zzJump = zzJump[0]
            #print "- jump page - " + zzJump
            pageCode = urllib2.urlopen(page).read()
            zzPage = re.findall('http://[\w]+\.[\w]+\.[\w]+/link[\w]?\.php\?[\w]{2,6}=[\w]+', pageCode)
            if len(zzPage) > 0:
                zzPage = zzPage[0]
                logger(logfile, "\n- zhongzi page -" + zzPage)
                btDown(zzPage, btSave)
            else:
                logger(logfile, "\n. NOT FOUND .")
        else:
            logger(logfile, "\n... NOT FOUND ...")
            zzPage = re.findall('http://[\w]+\.[\w]+\.[\w]+/link[\w]?\.php\?ref=[\w]+', pageCode)
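
Step 4 also allows a third-party page parser instead of regular expressions. Below is a minimal sketch using BeautifulSoup; it is not part of the original script, and the htm_data/... link shape is an assumption about how this kind of board formats its thread URLs.

import re
import urllib2
from bs4 import BeautifulSoup  # third-party: pip install beautifulsoup4

site = "http://xxx.yyy.zzz/"
source = "thread0806.php?fid=x&search=&page="

# Fetch one listing page and parse it instead of scanning it with re.findall.
listing = urllib2.urlopen(site + source + "1").read().decode('gbk')
soup = BeautifulSoup(listing, 'html.parser')

# Assumed thread-link shape: htm_data/<board>/<year-month>/<thread>.html
threadLinks = [a['href'] for a in
               soup.find_all('a', href=re.compile(r'htm_data/\d+/\d+/\d+\.html'))]
for link in threadLinks:
    print site + link  # absolute thread URLs, same as the `pages` loop above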
