python - 抓取新浪微博返回页面内容排版混乱
高洛峰
高洛峰 2017-04-17 17:20:22
0
0
331

我在抓取新浪微博一个“科技”相关的人物微博页面时,已经完成登录,但 GET 请求返回的页面数据排版比较混乱,用 BeautifulSoup 的 prettify() 处理后问题依旧。还请大神们帮忙看看是不是我的处理方式有问题。
本人菜鸟一枚。

贴上代码:

# coding=utf-8

import sys
import urllib
import urllib2
import cookielib
import base64
import re
import json
import hashlib
import os
import rsa
import binascii
import time
import requests
import bs4
import redis
import pdb
import HTMLParser

# Python 2 only: reloading `sys` makes `sys.setdefaultencoding` callable again,
# so UTF-8 can be installed as the default codec for implicit str/unicode
# conversions.
# NOTE(review): the page dump written later presumably relies on this when
# unicode text goes to a plain file object -- confirm before removing.
reload(sys)
sys.setdefaultencoding('utf-8')

# Redis connection used by the crawl below to store collected profile URLs.
r = redis.Redis(host='localhost', port=6379, db=0)

# Start every run from a known state: drop each pool, then seed it with one
# placeholder entry ('user_pool' is a list, 'fans_pool' is a set).
for _pool_name, _seed_pool in (('user_pool', r.lpush), ('fans_pool', r.sadd)):
    r.delete(_pool_name)
    _seed_pool(_pool_name, 'fuckyouasshole')

# One requests session shared by the prelogin, login and page requests below.
weiboSession=requests.Session()
# Text file that receives the prettified markup of the fetched page.
# NOTE: the name `file` shadows the Python 2 builtin of the same name.
file = open('test.txt','w')

# Query parameters of the SSO prelogin request.
# NOTE(review): nothing in the visible code reads this dict -- get_servertime()
# uses a hard-coded URL (with client version v1.4.18 rather than v1.4.5).
parameters = {
    'entry': 'weibo',
    'callback': 'sinaSSOController.preloginCallBack',
    'su': 'bGFpcmVuMjAwNg%3D%3D',
    'rsakt': 'mod',
    'checkpin': '1',
    'client': 'ssologin.js(v1.4.5)',
    '_': '1457327347813',
}
# Form fields of the SSO login POST. The empty values ('su', 'servertime',
# 'nonce', 'rsakv', 'sp') are filled in by login() before the request is sent.
postdata = {
    'entry': 'weibo',
    'gateway': '1',
    'from': '',
    'savestate': '7',
    'useticket': '1',
    'pagerefer': (
        'http%3A%2F%2Flogin.sina.com.cn%2Fsso%2Flogout.php'
        '%3Fentry%3Dminiblog%26r%3D'
        'http%253A%252F%252Fweibo.com%252Flogout.php'
        '%253Fbackurl%253D%25252F'
    ),
    'vsnf': '1',
    'su': '',
    'service': 'miniblog',
    'servertime': '',
    'nonce': '',
    'pwencode': 'rsa2',
    'rsakv': '',
    'sp': '',
    'encoding': 'UTF-8',
    'prelt': '147',
    'url': (
        'http://www.weibo.com/ajaxlogin.php?framelogin=1'
        '&callback=parent.sinaSSOController.feedBackUrlCallBack'
        '&returntype=META'
    ),
}
# HTTP headers sent with the login POST and every page request.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:8.0) Gecko/20100101 Firefox/8.0',
    'Accept-Encoding': 'deflate, sdch',
}
def get_servertime():
    """Fetch the SSO prelogin values needed to encrypt the password.

    Requests the prelogin endpoint through the shared ``weiboSession`` and
    parses the JSON object wrapped in the JSONP callback parentheses.

    Returns:
        A ``(servertime, nonce, pubkey, rsakv)`` tuple, with ``servertime``
        converted to ``str``; ``None`` when the request fails or the response
        cannot be parsed (an error line is printed in that case).
    """
    # NOTE(review): the `su` query value is hard-coded rather than derived
    # from the account being logged in -- confirm this is intended.
    url = (
        'http://login.sina.com.cn/sso/prelogin.php?entry=weibo'
        '&callback=sinaSSOController.preloginCallBack'
        '&su=bGFpcmVuMjAwNg%3D%3D&rsakt=mod&checkpin=1'
        '&client=ssologin.js(v1.4.18)&_=1457327347813'
    )
    try:
        body = weiboSession.get(url).content
        # The payload is everything between the outermost parentheses.
        json_data = re.search(r'\((.*)\)', body).group(1)
        data = json.loads(json_data)
        return (
            str(data['servertime']),
            data['nonce'],
            data['pubkey'],
            data['rsakv'],
        )
    except (requests.RequestException, AttributeError, KeyError,
            TypeError, ValueError):
        # AttributeError: no parenthesised payload; ValueError: invalid JSON;
        # KeyError/TypeError: payload lacks the expected fields.
        print('Get severtime error!')
        return None

def get_pwd(pwd, servertime, nonce, pubkey):
    """Encrypt the password with the RSA public key from the prelogin step.

    Args:
        pwd: Plain-text password.
        servertime: Server time value returned by the prelogin request.
        nonce: Nonce returned by the prelogin request.
        pubkey: RSA modulus as a hexadecimal string.

    Returns:
        The RSA ciphertext encoded as a hexadecimal string.
    """
    modulus = int(pubkey, 16)
    # Build the public key; 65537 is the public exponent.
    key = rsa.PublicKey(modulus, 65537)
    # Plaintext layout, taken from the site's JavaScript encryption file.
    message = str(servertime) + '\t' + str(nonce) + '\n' + str(pwd)
    if not isinstance(message, bytes):
        # No-op on Python 2, where str already is a byte string.
        message = message.encode('utf-8')
    encrypted = rsa.encrypt(message, key)
    # Convert the ciphertext to hexadecimal.
    hex_digest = binascii.b2a_hex(encrypted)
    if not isinstance(hex_digest, str):
        hex_digest = hex_digest.decode('ascii')
    return hex_digest

def get_user(username):
    """Return the login name in the form the SSO endpoint expects for ``su``.

    The name is URL-quoted and the quoted text is then Base64-encoded.
    ``base64.b64encode`` is used because it never inserts line breaks; the
    previous ``encodestring(...)[:-1]`` only stripped the final newline and
    left embedded ones in place for long names.

    Args:
        username: Account name, e.g. an e-mail address.

    Returns:
        The Base64 text as a native ``str``.
    """
    try:
        from urllib import quote  # Python 2
    except ImportError:
        from urllib.parse import quote  # Python 3

    quoted = quote(username)
    if not isinstance(quoted, bytes):
        # quote() output is pure ASCII, so this cannot fail.
        quoted = quoted.encode('ascii')
    encoded = base64.b64encode(quoted)
    if not isinstance(encoded, str):
        encoded = encoded.decode('ascii')
    return encoded

def login(username, pwd):
    """Log in through the Sina SSO endpoint using the shared session.

    Fills the module-level ``postdata`` form with the prelogin values and the
    encrypted credentials, posts it, then follows the ``location.replace``
    redirect found in the response. The outcome is reported with ``print``.

    Args:
        username: Account name.
        pwd: Plain-text password.

    Returns:
        None.
    """
    url = 'http://login.sina.com.cn/sso/login.php?client=ssologin.js(v1.4.5)'
    try:
        servertime, nonce, pubkey, rsakv = get_servertime()
    except TypeError:
        # get_servertime() returned None (and already printed the reason).
        return
    except requests.RequestException:
        print('Login error!')
        return

    # `postdata` is mutated in place, so no `global` declaration is needed.
    postdata['nonce'] = nonce
    postdata['rsakv'] = rsakv
    postdata['su'] = get_user(username)
    postdata['sp'] = get_pwd(pwd, servertime, nonce, pubkey)
    # NOTE(review): the encrypted password embeds the prelogin `servertime`,
    # while this field is set from the local clock -- confirm the server
    # accepts the mismatch.
    postdata['servertime'] = time.time()

    result = weiboSession.post(url, headers=headers, data=postdata)
    text = result.content
    try:
        login_url = re.search(r"location\.replace\('(.*?)'\)", text).group(1)
        weiboSession.get(login_url, headers=headers)
    except (AttributeError, requests.RequestException):
        # AttributeError: the response contained no redirect to follow.
        print('Login error!')
    else:
        # NOTE(review): success is reported as soon as a redirect is followed;
        # the redirect target is not inspected for an error code.
        print(u"登录成功!")

# Credentials are redacted in this listing.
login('@163.com', '**')

# Upper bound on fetch/parse attempts per page. A parse failure is normally
# deterministic, so retrying without a limit would never terminate.
_MAX_ATTEMPTS = 3

try:
    for i in range(0, 1):
        if i == 0:
            page_url = 'http://d.weibo.com/1087030002_2975_2009_0#'
        else:
            page_url = ('http://d.weibo.com/1087030002_2975_2009_0'
                        '?page=%d#Pl_Core_F4RightUserList__4' % i)

        for _attempt in range(_MAX_ATTEMPTS):
            try:
                response = weiboSession.get(page_url, headers=headers)
                # Python 2 codec: turns backslash escapes in the body into
                # the characters they stand for.
                s = response.content.decode("string_escape")

                # Debug aid: show the first list item found in the raw text.
                # NOTE(review): the `<li ...>` part of this pattern was lost
                # when the listing was rendered as HTML; `<li[^>]*>` is a
                # stand-in -- restore the original pattern if it is known.
                first_item = re.search(r'<li[^>]*>.*?<\\\/li>', s)
                if first_item:
                    print(first_item.group(0))

                soup_page = bs4.BeautifulSoup(s, "html.parser")
                # The dump stays open across attempts and pages; it is closed
                # once in the `finally` below, so a retry can still write.
                file.write(soup_page.prettify())

                follow_list = soup_page.find('ul', class_='follow_list')
                if follow_list is None:
                    # NOTE(review): if the server delivers the list markup
                    # inside <script> payloads, html.parser keeps it as script
                    # text and find() cannot see it -- check test.txt.
                    print("ul.follow_list not found in the parsed page")
                    continue

                # Collect first, push afterwards, so a malformed item cannot
                # leave a partially pushed page behind before a retry.
                user_urls = [item.dl.dd.p.a['href']
                             for item in follow_list.find_all('li')]
                for user_url in user_urls:
                    r.lpush('user_pool', user_url)
                break
            except (requests.RequestException, AttributeError, KeyError,
                    TypeError, ValueError) as e:
                print('%s : %s' % (type(e).__name__, e))
        else:
            print('Giving up on page %d after %d attempts' % (i, _MAX_ATTEMPTS))
finally:
    file.close()

print(r.rpop('user_pool'))

    file 记录的内容排版非常混乱,导致用
    soup_page.find('ul',class_='follow_list').find_all('li') 没法找到 follow_list。报错提示:'NoneType' object has no attribute 'find_all'

    高洛峰
    高洛峰

    拥有18年软件开发和IT教学经验。曾任多家上市公司技术总监、架构师、项目经理、高级软件工程师等职务。 网络人气名人讲师,...

    全部回覆 (0)
    最新下載
    更多>
    網站特效
    網站源碼
    網站素材
    前端模板
    關於我們 免責聲明 Sitemap
    PHP中文網:公益線上PHP培訓,幫助PHP學習者快速成長!