Python用多进程写文件遇到编码问题，而用多线程却不会

Question

用多进程爬取数据写入文件，运行没有报错，但是打开文件却乱码。 用多线程改写时却没有这个问题，一切正常。下面是数据写入文件的代码： {代码...} 多进程用的进程池 {代码...}

巴扎黑 · Answer

图片上已经说了，文件以错误的编码形式载入了，说明你多进程写入的时候，编码不是utf-8

世界只因有你 · Answer

文件第一行添加:

#coding: utf-8

我想大声告诉你 · Answer

打开同一个文件，相当危险，出错机率相当大，
多线程不出错，极有可能是GIL,
多进程没有锁，因此容易出错了。

url_text = codecs.open('url.txt','a','utf-8')

建议改为生产者消费都模式!

比如这样

# -*- coding: utf-8 -* -
import time
import os
import codecs
import multiprocessing
import requests
from bs4 import BeautifulSoup

baseurl = ''
baseurl1 = ''
baseurl2 = ''
pageurl = ''
searchword = ''
header = {}

def fake(url, **kwargs):
    class Response(object):
        pass
    o = Response()
    o.content = 'foo'.format(url)
    return o

requests.get = fake


def Get_urls(start_page, end_page, queue):
    print('run task {} ({})'.format(start_page, os.getpid()))
    try:
        for i in range(start_page, end_page + 1):
            pageurl = baseurl1 + str(i) + baseurl2 + searchword
            response = requests.get(pageurl, headers=header)
            soup = BeautifulSoup(response.content, 'html.parser')
            a_list = soup.find_all('a')
            for a in a_list:
                if a.text != ''and 'wssd_content.jsp?bookid'in a['href']:
                    text = a.text.strip()
                    url = baseurl + str(a['href'])
                    queue.put(text + '	' + url + '
')
    except Exception as e:
        import traceback
        traceback.print_exc()


def write_file(queue):
    print("start write file")
    url_text = codecs.open('url.txt', 'a', 'utf-8')
    while True:
        line = queue.get()
        if line is None:
            break
        print("write {}".format(line))
        url_text.write(line)
    url_text.close()


def Multiple_processes_test():
    t1 = time.time()
    manager = multiprocessing.Manager()
    queue = manager.Queue()
    print 'parent process {} '.format(os.getpid())
    page_ranges_list = [(1, 3), (4, 6), (7, 9)]
    consumer = multiprocessing.Process(target=write_file, args=(queue,))
    consumer.start()
    pool = multiprocessing.Pool(processes=3)
    results = []
    for page_range in page_ranges_list:
        result = pool.apply_async(func=Get_urls,
                         args=(page_range[0],
                               page_range[1],
                               queue
                            ))
        results.append(result)
    pool.close()
    pool.join()
    queue.put(None)
    consumer.join()
    t2 = time.time()
    print '时间：', t2 - t1


if __name__ == '__main__':
    Multiple_processes_test()

结果

foo /4/wssd_content.jsp?bookid
foo /5/wssd_content.jsp?bookid
foo /6/wssd_content.jsp?bookid
foo /1/wssd_content.jsp?bookid
foo /2/wssd_content.jsp?bookid
foo /3/wssd_content.jsp?bookid
foo /7/wssd_content.jsp?bookid
foo /8/wssd_content.jsp?bookid
foo /9/wssd_content.jsp?bookid