python优化代码（文本查找）效率？-PHP中文網路問答

#!/usr/bin/env python # -*- coding: utf-8 -*- infile2 = open('genemark.gff3', 'r') infile1 = set(line1.strip() for line1 in open('1.txt', 'r')) for line in infile2: line = line.strip().split() if line[2] == 'gene': chr, start, end = line[0], int(line[3]), int(line[4]) for line1 in infile1: line1 = line1.split() chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3]) if chr1 == chr: if start1 < start < end1: print line1[0], line[-1] if start1 < end < end1: print line1[0], line[-1] if start1 > start and end > end1: print line1[0], line[-1]

genemark.gff3格式类似下边:

chr1D GeneMark.hmm gene 2705930 2711118 . + . ID=1903228_g;Name=1903228_g chr1D GeneMark.hmm mRNA 2705930 2711118 . + . ID=1903228_t;Name=1903228_t;Parent=1903228_g

1.txt:

UN011157 chr1D 2705329 2706342 98.4 95.0 972 30 21 0 UN003843 chr1D 2705681 2721144 61.4 97.4 633 12 5 0

附上原始文件的百度云链接，希望感兴趣的参考
点击下载密码 enu8

综合楼下各位朋友的答案，现推荐两种
第一种根据 @ferstar @用筹兮用严的答案，即并行版

#!/usr/bin/env python # encoding: utf-8 from collections import defaultdict from multiprocessing import Pool, cpu_count from functools import partial def find_sth(f2, f1=None): start, end = int(f2[3]), int(f2[4]) for uno1, start1, end1 in f1[f2[0]]: if (start1 <= start and start <= end1) or (start1 <= end and end <= end1) or (start1 >= start and end >= end1): with open("out.txt", "a") as fh: fh.write(uno1 + "\t" + f2[-1] + "\n") #print(uno1, f2[-1]) def main(): with open('1.txt', 'r') as f1: infile1 = defaultdict(set) for uno1, chr1, start1, end1, *others in map(str.split, f1): infile1[chr1].add((uno1, int(start1), int(end1))) with open('genemark.gff3', 'r') as f2: infile2 = [x for x in map(str.split, f2) if x[2] == 'gene'] pool = Pool(cpu_count()) pool.map(partial(find_sth, f1=infile1), infile2) pool.close() pool.join() if __name__ == "__main__": main()

第二种 @citaret 他的版本（单核版），对单核来说，不逊于上述代码。但是两者结果稍有不同，并行版结果更全（这里少了73条，出在判断条件的边界问题，由于对intervaltree熟悉，怎么改还不知道），现在边界问题已修改，两种代码结果完全一样，perfect！
如下

from collections import defaultdict from intervaltree import Interval, IntervalTree with open('1.txt') as f: d1 = defaultdict(list) xs = map(lambda x: x.strip().split(), f) for x in xs: y = (x[0], int(x[2]), int(x[3])) d1[x[1]].append(y) for k, v in d1.items(): d1[k] = IntervalTree(Interval(s, e, u) for u, s, e in v) with open('genemark.gff3') as f: for line in f: line = line.strip().split() if line[2] == 'gene': chr, start, end = line[0], int(line[3]), int(line[4]) for start1, end1, un1 in d1[chr][start-1:end+1]: print(un1, line[-1])

with open('1.txt', 'r') as f1, open('2.txt', 'r') as f2: lines1 = [_.strip().split() for _ in f1] for line2 in f2: line2 = line2.strip().split() if line2[2] != 'gene': continue chr2, start2, end2 = line2[0], int(line2[3]), int(line2[4]) for line1 in lines1: chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3]) if chr1 == chr2 and (start1 < start2 < end1 or start1 < end2 < end1 or start1 > start2 and end2 > end1): print line1[0], line2[-1]

#!/usr/bin/env python # -*- coding: utf-8 -*- infile2 = open('genemark.gff3', 'r') infile1 = {} for line1 in open('1.txt', 'r'): line1 = line1.strip().split() id, chr1, start1, end1 = line1[0], line1[1], int(line1[2]), int(line1[3]) if not infile1.has_key(chr1): infile1[chr1] = [] infile1[chr1].append({"start": start1, "end": end1, "id": id}) for line in infile2: line = line.strip().split() if line[2] != 'gene': continue chr, start, end = line[0], int(line[3]), int(line[4]) if not infile1.has_key(chr): continue for i in infile1[chr]: if i['start'] < start < i['end']: print i['id'], line[-1] if i['start'] < end < i['end']: print i['id'], line[-1] if i['start'] > start and i['end'] > end: print i['id'], line[-1]

from collections import defaultdict with open('genemark.gff3') as f: ls = f.readlines() xs = map(lambda x: x.strip().split(), ls) t2 = (x for x in xs if x[2] == 'gene') with open('1.txt') as f: d1 = defaultdict(list) ls = f.readlines() xs = map(lambda x: x.strip().split(), ls) for x in xs: d1[x[1]].append(x) for line in t2: chr, start, end = line[0], int(line[3]), int(line[4]) if chr in d1: for line1 in d1[chr]: chr1, start1, end1 = line1[1], int(line1[2]), int(line1[3]) if start1 < start < end1: print line1[0], line[-1] if start1 < end < end1: print line1[0], line[-1] if start1 > start and end > end1: print line1[0], line[-1]

from collections import defaultdict with open('1.txt') as f: d1 = defaultdict(list) xs = map(lambda x: x.strip().split(), f) for x in xs: y = (x[0], int(x[2]), int(x[3])) d1[x[1]].append(y) with open('genemark.gff3') as f: for line in f: line = line.strip().split() chr, start, end = line[0], int(line[3]), int(line[4]) for un1, start1, end1 in d1[chr]: if start < end1 and end > start1: print un1, line[-1]

from collections import defaultdict from intervaltree import Interval, IntervalTree with open('1.txt') as f: d1 = defaultdict(list) xs = map(lambda x: x.strip().split(), f) for x in xs: y = (x[0], int(x[2]), int(x[3])) d1[x[1]].append(y) for k, v in d1.items(): d1[k] = IntervalTree(Interval(s, e, u) for u, s, e in v) with open('genemark.gff3') as f: for line in f: line = line.strip().split() if line[2] != 'gene': continue chr, start, end = line[0], int(line[3]), int(line[4]) for start1, end1, un1 in d1[chr][start:end]: print un1, line[-1]

from collections import defaultdict with open('1.txt', 'r') as f1, open('genemark.gff3', 'r') as f2: infile1 = defaultdict(set) for uno1, chr1, start1, end1, *others in map(str.split, f1): infile1[chr1].add((uno1, int(start1), int(end1))) infile2 = filter(lambda x: x[2] == 'gene', map(str.split, f2)) for chr, start, end, info in map(lambda x: (x[0], int(x[3]), int(x[4]), x[-1]), infile2): for uno1, start1, end1 in infile1[chr]: if start1 < start < end1 or start1 < end < end or (start1 > start and end > end1): print(uno1, info)

#!/usr/bin/env python # encoding: utf-8 from collections import defaultdict from multiprocessing import Pool, cpu_count from functools import partial def find_sth(line, f1=None): line = line.split() if line[2] != 'gene': return start, end = int(line[3]), int(line[4]) for uno1, start1, end1 in f1[line[0]]: if start1 < start < end1 or start1 < end < end or (start1 > start and end > end1): print(uno1, line[-1]) def main(): pool = Pool(cpu_count()) with open('1.txt', 'r') as f1, open('genemark.gff3', 'r') as f2: infile1 = defaultdict(set) for uno1, chr1, start1, end1, *others in map(str.split, f1): infile1[chr1].add((uno1, int(start1), int(end1))) pool.map(partial(find_sth, f1=infile1), f2) #fw.writelines(filter(lambda x: x is not None, map(lambda x: x.get(), [pool.apply_async(func, (line,)) for line in f2]))) pool.close() pool.join() if __name__ == "__main__": main()

我的方案--把四樓的程式碼小改成並行的

我寫的有問題, @用籌兮用嚴更新了正確的並行程式碼, 我的程式碼就不改了, 方便後面看到的同學參考

直接放碼(python3)

from collections import defaultdict import multiprocessing def find_sth(x): with open('1.txt', 'r') as f1: infile1 = defaultdict(set) for uno1, chr1, start1, end1, *others in map(str.split, f1): infile1[chr1].add((uno1, int(start1), int(end1))) chr, start, end, info = x[0], int(x[3]), int(x[4]), x[-1] for uno1, start1, end1 in infile1[chr]: if start1 < start < end1 or start1 < end < end or (start1 > start and end > end1): print(uno1, info) def main(): with open('genemark.gff3', 'r') as fh: lst = [x for x in map(str.split, fh) if x[2] == 'gene'] pool = multiprocessing.Pool(multiprocessing.cpu_count()) pool.map(find_sth, lst) pool.close() pool.join() if __name__ == "__main__": main()

然後看看運作效率

(py3) [ngs@cluster ~]$ time python hi_new.py > hi_new.txt real 0m3.033s user 0m31.952s sys 0m0.219s (py3) [ngs@cluster ~]$ wc -l hi_new.txt 4000 hi_new.txt

時間上貌似慢了很多(4000行數據才幾百KB), 題主可以試著用你的真實數據測試下, 處理數據越大, 並行處理的效率優勢越明顯

PS: 我估計題主實際處理的資料大小得有MB甚至是GB等級, 這種等級並行處理才是王道

來源資料及結果度娘地址連結：http://pan.baidu.com/s/1hrSZQuS 密碼：u93n

全部回覆 (6)

巴扎黑2017-04-18 09:18:12 6樓

效能應該沒有更好的最佳化餘地，不過程式碼可以稍微調整一下

點贊+0

新增回覆

阿神2017-04-18 09:18:12 5樓

這裡給兩個建議：

程式碼嵌套太深，在函數中可以透過儘早的 return 來減少嵌套層級，同樣的在循環中，可以透過使用 continue 來達到減少嵌套層級的目的。
關於性能方面

for line1 in infile1: line1 = line1.split()

每次循環都要對對 file1中的行進行 split 操作是非常不明智的

下面是我修改的程式碼

PHPzhong2017-04-18 09:18:12 4樓

用空間換時間，分別建構 genemark.gff3 的列表和 1.txt 的字典，具體實作：

修改版 v2，消除內層循環裡的 int()，簡化輸出。

v3: 仔細研究題意，發現主循環是求出集合中和一個片段相交的所有片段，我們可以先看看這個集合：

chr1D 7359 chr2D 9219 chr2B 9486 chr2A 8986 chr6B 7178 chr6A 6446 chr6D 6093 chr4A 7543 chr4B 7086 chr4D 6316 ...

每個集合的片段數量在6000-10000，遍歷的話效率低，因此考慮使用 intervaltree，可以快速得出和一個片段相交的所有片段，具體實現為：

時間測試結果為：建構 intervaltree 花時 10秒，但求交集的過程速度有100倍左右的提升。
intervaltee 參考 https://pypi.python.org/pypi/...

伊谢尔伦2017-04-18 09:18:12 3樓

文件發開後不用關閉嗎

Ty802017-04-18 09:18:12 2樓

6樓 @ferstar 提出的平行化才是正確的方向，不過程式碼有點問題…
改了下：

伊谢尔伦2017-04-18 09:18:12 1樓

發現一個很有趣的事情, 大家回答很正面, 但是實際結果呢, 我剛好無聊小測驗了一下, 過程如下:

介於題主提供的範例文字才兩行, 所以我把1.txt和genemark.gff3分别加倍到4000行

(qiime) [ngs@cluster ~]$ wc -l 1.txt 4000 1.txt (qiime) [ngs@cluster ~]$ wc -l genemark.gff3 4000 genemark.gff3

依照回覆樓層數排序, 如題主的代碼是hi.py,然后一楼答主的代码是hi1.py,依次類推

先看題主的

(qiime) [ngs@cluster ~]$ time python hi.py > hi.txt real 0m0.049s user 0m0.042s sys 0m0.007s (qiime) [ngs@cluster ~]$ wc -l hi.txt 6000 hi.txt

感覺有重複

一樓答主的

time python hi1.py > hi1.txt real 0m21.727s user 0m21.171s sys 0m0.547s (qiime) [ngs@cluster ~]$ wc -l hi1.txt 8000000 hi1.txt

重複到姥姥家了

二樓答主的

(qiime) [ngs@cluster ~]$ time python hi2.py > hi2.txt real 0m16.326s user 0m14.550s sys 0m1.044s (qiime) [ngs@cluster ~]$ wc -l hi2.txt 12000000 hi2.txt

三樓答主的

(qiime) [ngs@cluster ~]$ time python hi3.py > hi3.txt real 0m27.079s user 0m26.281s sys 0m0.786s (qiime) [ngs@cluster ~]$ wc -l hi3.txt 12000000 hi3.txt

三樓答主的結果跟二樓一樣, 但是慢了10秒多

四樓答主的(冤枉四樓同學了,這是py3代碼)

(py3) [ngs@cluster ~]$ time python hi4.py > hi4.txt real 0m0.074s user 0m0.064s sys 0m0.010s (py3) [ngs@cluster ~]$ wc -l hi4.txt 4000 hi4.txt

果然是有交流才有進步, 目前這個結果才是正確的

總結

實際好像是題主的代碼結果會有重複, 四樓答主的結果才是正確的

php8，我來也

30分鐘學會網站佈局

尚觀Oracle入門到精通視頻教程

你的第一行UNI-APP程式碼

Flutter 從頭到應用程式啟動

兄弟連新版Linux視頻教程

AXURE 9影片教學（適用於產品經理互動產品設計UI）

零基礎PS影片教學

16天帶你入門UI視頻教程

PS技巧和切片技巧影片教學

阿裡雲環境搭建以及項目上線視頻教程

電腦網路概述－程式設計師必須掌握的基礎知識

程式設計師必備教學——HTTP協定講解

Websocket影片教學

總結

我的方案--把四樓的程式碼小改成並行的