Python實作影片爬取

little bottle
發布: 2019-04-10 09:56:30
轉載
3411 人瀏覽過

Python可以用來做什麼?公司裡主要是爬取數據,並把爬回來的數據進行分析和挖掘,然而我們自己可以用它來爬取一些資源去使用,比如,想看的劇。本文中,小編將分享爬取影片的程式碼,大家存起來試試看吧!

下載串流文件,requests庫中請求的stream設為True就可以啦,文件在此。

先找一個影片位址試驗一下:


# -*- coding: utf-8 -*- import requests def download_file(url, path): with requests.get(url, stream=True) as r: chunk_size = 1024 content_size = int(r.headers['content-length']) print '下载开始' with open(path, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): f.write(chunk) if __name__ == '__main__': url = '就在原帖...' path = '想存哪都行' download_file(url, path)
登入後複製

發生當頭一棒:


##

AttributeError: __exit__
登入後複製

這文檔也會騙人的麼!

看來是沒有實作上下文所需的__exit__方法。既然只是為了保證要讓r最後close以釋放連接池,那就使用contextlib的closing特性好了:


# -*- coding: utf-8 -*- import requests from contextlib import closing def download_file(url, path): with closing(requests.get(url, stream=True)) as r: chunk_size = 1024 content_size = int(r.headers['content-length']) print '下载开始' with open(path, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): f.write(chunk)
登入後複製

程式正常運行了,不過我盯著這文件,怎麼大小不見變啊,到底完成了多少了呢?還是要讓下好的內容及時存進硬碟,還能省點記憶體是不是:


# -*- coding: utf-8 -*- import requests from contextlib import closing import os def download_file(url, path): with closing(requests.get(url, stream=True)) as r: chunk_size = 1024 content_size = int(r.headers['content-length']) print '下载开始' with open(path, "wb") as f: for chunk in r.iter_content(chunk_size=chunk_size): f.write(chunk) f.flush() os.fsync(f.fileno())
登入後複製

檔以肉眼可見的速度在增大,真心疼我的硬碟,還是最後一次寫入硬碟吧,程式中記個數就好了:


def download_file(url, path): with closing(requests.get(url, stream=True)) as r: chunk_size = 1024 content_size = int(r.headers['content-length']) print '下载开始' with open(path, "wb") as f: n = 1 for chunk in r.iter_content(chunk_size=chunk_size): loaded = n*1024.0/content_size f.write(chunk) print '已下载{0:%}'.format(loaded) n += 1
登入後複製

結果就很直觀了:

##

已下载2.579129% 已下载2.581255% 已下载2.583382% 已下载2.585508%
登入後複製

心懷遠大理想的我怎麼會只滿足於這一個呢,寫個類別一起使用吧:

# -*- coding: utf-8 -*- import requests from contextlib import closing import time def download_file(url, path): with closing(requests.get(url, stream=True)) as r: chunk_size = 1024*10 content_size = int(r.headers['content-length']) print '下载开始' with open(path, "wb") as f: p = ProgressData(size = content_size, unit='Kb', block=chunk_size) for chunk in r.iter_content(chunk_size=chunk_size): f.write(chunk) p.output() class ProgressData(object): def __init__(self, block,size, unit, file_name='', ): self.file_name = file_name self.block = block/1000.0 self.size = size/1000.0 self.unit = unit self.count = 0 self.start = time.time() def output(self): self.end = time.time() self.count += 1 speed = self.block/(self.end-self.start) if (self.end-self.start)>0 else 0 self.start = time.time() loaded = self.count*self.block progress = round(loaded/self.size, 4) if loaded >= self.size: print u'%s下载完成\r\n'%self.file_name else: print u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} 下载速度{5:.2%} {6:.2f}{7}/s'.\ format(self.file_name, loaded, self.unit,\ self.size, self.unit, progress, speed, self.unit) print '%50s'%('/'*int((1-progress)*50))
登入後複製

運行:

下载开始 下载进度10.24Kb/120174.05Kb 0.01% 下载速度4.75Kb/s ///////////////////////////////////////////////// 下载进度20.48Kb/120174.05Kb 0.02% 下载速度32.93Kb/s /////////////////////////////////////////////////
登入後複製

看起來舒服多了。

下面要做的就是多執行緒同時下載了,主執行緒生產url放入佇列,下載執行緒取得url:

# -*- coding: utf-8 -*- import requests from contextlib import closing import time import Queue import hashlib import threading import os def download_file(url, path): with closing(requests.get(url, stream=True)) as r: chunk_size = 1024*10 content_size = int(r.headers['content-length']) if os.path.exists(path) and os.path.getsize(path)>=content_size: print '已下载' return print '下载开始' with open(path, "wb") as f: p = ProgressData(size = content_size, unit='Kb', block=chunk_size, file_name=path) for chunk in r.iter_content(chunk_size=chunk_size): f.write(chunk) p.output() class ProgressData(object): def __init__(self, block,size, unit, file_name='', ): self.file_name = file_name self.block = block/1000.0 self.size = size/1000.0 self.unit = unit self.count = 0 self.start = time.time() def output(self): self.end = time.time() self.count += 1 speed = self.block/(self.end-self.start) if (self.end-self.start)>0 else 0 self.start = time.time() loaded = self.count*self.block progress = round(loaded/self.size, 4) if loaded >= self.size: print u'%s下载完成\r\n'%self.file_name else: print u'{0}下载进度{1:.2f}{2}/{3:.2f}{4} {5:.2%} 下载速度{6:.2f}{7}/s'.\ format(self.file_name, loaded, self.unit,\ self.size, self.unit, progress, speed, self.unit) print '%50s'%('/'*int((1-progress)*50)) queue = Queue.Queue() def run(): while True: url = queue.get(timeout=100) if url is None: print u'全下完啦' break h = hashlib.md5() h.update(url) name = h.hexdigest() path = 'e:/download/' + name + '.mp4' download_file(url, path) def get_url(): queue.put(None) if __name__ == '__main__': get_url() for i in xrange(4): t = threading.Thread(target=run) t.daemon = True t.start()
登入後複製

加了重複下載的判斷,至於怎麼源源不絕的生產url,各位摸索吧,保重身體!

【推薦課程:

Python影片教學

以上是Python實作影片爬取的詳細內容。更多資訊請關注PHP中文網其他相關文章!

相關標籤:
來源:csdn.net
本網站聲明
本文內容由網友自願投稿,版權歸原作者所有。本站不承擔相應的法律責任。如發現涉嫌抄襲或侵權的內容,請聯絡admin@php.cn
作者最新文章
最新下載
更多>
網站特效
網站源碼
網站素材
前端模板
關於我們 免責聲明 Sitemap
PHP中文網:公益線上PHP培訓,幫助PHP學習者快速成長!