爬虫生成dict后,想将其写入csv文件,却出错
使用jupyter notebook,window环境。
具体代码如下
import requests
from multiprocessing.dummy import Pool as ThreadPool
from lxml import etree
import sys
import time
import random
import csv
def spider(url):
header={
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
timeout=random.choice(range(31,50))
html = requests.get(url,header,timeout=timeout)
time.sleep(random.choice(range(8,16)))
selector = etree.HTML(html.text)
content_field = selector.xpath('//*[@class="inner"]/p[3]/p[2]/ul/li')
item ={}
for each in content_field:
g = each.xpath('a/p[1]/p[1]/h3/span/text()')
go = each.xpath('a/p[1]/p[2]/p/h3/text()')
h = each.xpath('a/p[1]/p[2]/p/p/text()[1]')
j= each.xpath('a/p[1]/p[1]/p/text()[2]')
ge = each.xpath('a/p[1]/p[2]/p/p/text()[3]')
x = each.xpath('a/p[1]/p[1]/p/text()[3]')
city = each.xpath('a/p[1]/p[1]/p/text()[1]')
gg = each.xpath('a/p[2]/span/text()')
item['city']="".join(city)
item['hangye']="".join(hangye)
item['guimo']="".join(guimo)
item['gongsi']="".join(gongsi)
item['gongzi']="".join(gongzi)
item['jingyan']="".join(jingyan)
item['xueli']="".join(xueli)
item['gongzuoneirong']="".join(gongzuoneirong)
fieldnames =['city','hangye','guimo','gongsi','gongzi','jingyan','xueli','gongzuoneirong']
with open('bj.csv','a',newline='',errors='ignore')as f:
f_csv=csv.DictWriter(f,fieldnames=fieldnames)
f_csv.writeheader()
f_csv.writerow(item)
if __name__ == '__main__':
pool = ThreadPool(4)
f=open('bj.csv','w')
page = []
for i in range(1,100):
newpage = 'https://www.zhipin.com/c101010100/h_101010100/?query=%E6%95%B0%E6%8D%AE%E8%BF%90%E8%90%A5&page='+str(i) + '&ka=page-' + str(i)
page.append(newpage)
results = pool.map(spider,page)
pool.close()
pool.join()
f.close()
运行上面代码,提示错误为
ValueError: too many values to unpack (expected 2)
通过查询原因是要将dict遍历,需要dict.items()的形式。但在上述代码中如何实现,一直没有理顺,求教各位
Your Answer
3 个回答
不好意思哈,现在才有时间来回答你的问题,看到你根据我的建议把代码改过来了,下面我把改过的代码贴出来,我运行过,是没问题的
import requests
from multiprocessing.dummy import Pool
from lxml import etree
import time
import random
import csv
def spider(url):
header = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'
}
timeout = random.choice(range(31, 50))
html = requests.get(url, headers=header, timeout=timeout)
time.sleep(random.choice(range(8, 16)))
selector = etree.HTML(html.text)
content_field = selector.xpath('//*[@class="inner"]/p[3]/p[2]/ul/li')
item = {}
for each in content_field:
g = each.xpath('a/p[1]/p[1]/h3/span/text()')
go = each.xpath('a/p[1]/p[2]/p/h3/text()')
h = each.xpath('a/p[1]/p[2]/p/p/text()[1]')
j = each.xpath('a/p[1]/p[1]/p/text()[2]')
ge = each.xpath('a/p[1]/p[2]/p/p/text()[3]')
x = each.xpath('a/p[1]/p[1]/p/text()[3]')
city = each.xpath('a/p[1]/p[1]/p/text()[1]')
gg = each.xpath('a/p[2]/span/text()')
item['city'] = "".join(city)
item['hangye'] = "".join(g)
item['guimo'] = "".join(go)
item['gongsi'] = "".join(h)
item['gongzi'] = "".join(j)
item['jingyan'] = "".join(ge)
item['xueli'] = "".join(x)
item['gongzuoneirong'] = "".join(gg)
fieldnames = ['city', 'hangye', 'guimo', 'gongsi', 'gongzi', 'jingyan', 'xueli', 'gongzuoneirong']
with open('bj.csv', 'a', newline='', errors='ignore')as f:
f_csv = csv.DictWriter(f, fieldnames=fieldnames)
f_csv.writeheader()
f_csv.writerow(item)
if __name__ == '__main__':
f = open('bj.csv', 'w')
page = []
for i in range(1, 100):
newpage = 'https://www.zhipin.com/c101010100/h_101010100/?query=%E6%95%B0%E6%8D%AE%E8%BF%90%E8%90%A5&page=' + str(
i) + '&ka=page-' + str(i)
page.append(newpage)
print(page)
pool = Pool(4)
results = pool.map(spider, page)
pool.close()
pool.join()
f.close()
这里主要是header,你原来是set类型,我修改后是dict类型
这里还需要给你一些建议
你的代码是放到ide还是文本编辑器中运行的?有的东西在ide下明显会报错啊
建议新手从开始学的时候就遵守PEP8规范,别养成了坏习惯,你看看你的命名
item = {'a':1, 'b':2}
fieldnames = ['a', 'b']
with open('test.csv', 'a') as f:
f_csv = DictWriter(f, fieldnames=fieldnames)
f_csv.writeheader()
f_csv.writerow(item)
我这样写并没报错喔
writerow就是直接接收dict的吧,你这个问题,我感觉是因为item的key与你表头不对应
Hot Questions
function_exists()无法判定自定义函数
2024-04-29 11:01:01
google 浏览器 手机版显示的怎么实现
2024-04-23 00:22:19
子窗口操作父窗口,输出没反应
2024-04-19 15:37:47
父窗口没有输出
2024-04-18 23:52:34
关于CSS思维导图的课件在哪?
2024-04-16 10:10:18
Hot Tools
vc9-vc14(32+64位)运行库合集(链接在下方)
phpStudy安装所需运行库集合下载
VC9 32位
VC9 32位 phpstudy集成安装环境运行库
php程序员工具箱完整版
程序员工具箱 v1.0 php集成环境
VC11 32位
VC11 32位 phpstudy集成安装环境运行库
SublimeText3汉化版
中文版,非常好用
热门话题
抖音等级价目表1-75
20335
7
20335
7
wifi显示无ip分配
13531
4
13531
4
虚拟手机号接收验证码
11850
4
11850
4
gmail邮箱登陆入口在哪里
8835
17
8835
17
windows安全中心怎么关闭
8420
7
8420
7
热门文章
2025年加密货币市场十大趋势预测:下一个风口在哪里?
2025-11-07
By DDD
Galaxy的观点:山寨币ETF大军即将到来 哪些的前景会光明
2025-11-08
By DDD
铁路12306支付失败订单还在吗_铁路12306支付失败订单处理方法
2025-11-07
By DDD
win10字体安装后在软件里找不到怎么办_win10字体安装与识别方法
2025-11-07
By DDD
解决CSS @media 查询优先级与规则覆盖问题的教程
2025-11-07
By DDD





