
Querying Baidu SEO information with Python

高洛峰
Release: 2016-10-18 10:30:46

A simple Python function to query Baidu keyword rankings. Features:

1. Random User-Agent: a UA string is picked at random for each request

2. Simple to use: just call getRank(keyword, domain) directly

3. Encoding conversion: keywords are decoded as UTF-8 or GB2312 automatically, so encoding should not be a problem

4. Rich results: returns not only the ranking but also the title, URL, and snapshot (cache) date of the matching result, which covers typical SEO needs


Disadvantages:

Single-threaded, so it is slow when checking many keywords (a multithreaded sketch follows the listing). Note that the script targets Python 2 with requests and BeautifulSoup 3.

#coding=utf-8
  
import requests
import BeautifulSoup    #BeautifulSoup 3 (Python 2)
import re
import random
  
def decodeAnyWord(w):   #decode to unicode, trying UTF-8 first and falling back to GB2312
    try:
        w.decode('utf-8')
    except:
        w = w.decode('gb2312')
    else:
        w = w.decode('utf-8')
    return w
  
def createURL(checkWord):   #create baidu URL with search words
    checkWord = checkWord.strip()
    checkWord = checkWord.replace(' ', '+').replace('\n', '')
    baiduURL = 'http://www.baidu.com/s?wd=%s&rn=100' % checkWord
    return baiduURL
  
def getContent(baiduURL):   #get the content of the serp
    uaList = ['Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+1.1.4322;+TencentTraveler)',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1;+.NET+CLR+2.0.50727;+.NET+CLR+3.0.4506.2152;+.NET+CLR+3.5.30729)',
    'Mozilla/5.0+(Windows+NT+5.1)+AppleWebKit/537.1+(KHTML,+like+Gecko)+Chrome/21.0.1180.89+Safari/537.1',
    'Mozilla/4.0+(compatible;+MSIE+6.0;+Windows+NT+5.1;+SV1)',
    'Mozilla/5.0+(Windows+NT+6.1;+rv:11.0)+Gecko/20100101+Firefox/11.0',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+SV1)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+GTB7.1;+.NET+CLR+2.0.50727)',
    'Mozilla/4.0+(compatible;+MSIE+8.0;+Windows+NT+5.1;+Trident/4.0;+KB974489)']
    headers = {'User-Agent': random.choice(uaList)}
    ipList = ['202.43.188.13:8080',   #example proxy IPs; replace with your own working proxies
    '80.243.185.168:1177',
    '218.108.85.59:81']
    proxies = {'http': 'http://%s' % random.choice(ipList)}
    r = requests.get(baiduURL, headers = headers, proxies = proxies)
    return r.content
  
def getLastURL(rawurl): #follow redirects to get the final URL
    r = requests.get(rawurl)
    return r.url
  
def getAtext(atext):    #get the text between <a> and </a>
    pat = re.compile(r'<a .*?>(.*?)</a>')
    match = pat.findall(atext)
    pureText = match[0].replace('<em>', '').replace('</em>', '')
    return pureText
  
def getCacheDate(t):    #get the date of the cache
    pat = re.compile(r'<span class="g">.*?(\d{4}-\d{1,2}-\d{1,2})  </span>')
    match = pat.findall(t)
    cacheDate = match[0]
    return cacheDate
  
def getRank(checkWord, domain): #main line
    checkWord = checkWord.replace('\n', '')
    checkWord = decodeAnyWord(checkWord)
    baiduURL = createURL(checkWord)
    cont = getContent(baiduURL)
    soup = BeautifulSoup.BeautifulSoup(cont)
    results = soup.findAll('table', {'class': 'result'})    #find all results in this page
    for result in results:
        checkData = unicode(result.find('span', {'class': 'g'}))
        if re.compile(r'^[^/]*%s.*?' % domain).match(checkData): #TODO: improve this regex
            nowRank = result['id']  #get the rank if the domain matches
  
            resLink = result.find('h3').a
            resURL = resLink['href']
            domainURL = getLastURL(resURL)  #get the target URL
            resTitle = getAtext(unicode(resLink))   #get the title of the target page
  
            rescache = result.find('span', {'class': 'g'})
            cacheDate = getCacheDate(unicode(rescache)) #get the cache date of the target page
  
            res = u'%s, 第%s名, %s, %s, %s' % (checkWord, nowRank, resTitle, cacheDate, domainURL)  #keyword, rank, title, cache date, URL
            return res.encode('gb2312')
    else:
        return '>100'   #the domain is not in the top 100 results
  
domain = 'www.douban.com' #set the domain whose rankings you want to check
  
  
f = open('r.txt')   #one keyword per line
for w in f.readlines():
    print getRank(w, domain)
  
f.close()
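
To work around the single-threaded slowness, the per-keyword lookups can run in a small thread pool. Below is a minimal sketch, not part of the original script: it assumes getRank and the domain variable from the listing above are defined in the same file, keeps the Python 2 style of the listing, and uses an arbitrary pool size of 5; checkOne is a hypothetical wrapper added only for this example.

#minimal multithreaded sketch (assumes getRank and domain from the listing above)
from multiprocessing.dummy import Pool  #thread pool with the multiprocessing.Pool API

def checkOne(w):    #hypothetical wrapper so keywords can be mapped over the pool
    return getRank(w, domain)

if __name__ == '__main__':
    f = open('r.txt')
    words = f.readlines()
    f.close()
    pool = Pool(5)  #5 worker threads; adjust as needed
    for res in pool.map(checkOne, words):   #one result line per keyword
        print res
    pool.close()
    pool.join()

Keep the pool small: too many concurrent requests will quickly get the proxies, or your own IP, blocked by Baidu.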

