java - 使用Webmagic网页无法下载
大家讲道理
大家讲道理 2017-04-18 10:53:07
0
1
905

使用 WebMagic 进行简单的网页数据爬取时,遇到了网页无法下载的问题;不过在调试的时候,偶尔又可以正常下载,挺令人抓狂。在网上多次搜索,没有找到相关的解决办法。自己代码能力有限,还看不出问题所在,还请大神出手相救。
报的错误

2017-03-31 13:55:54,610 WARN [us.codecraft.webmagic.downloader.HttpClientDownloader] - download page http://www.neofactory.co.jp/product_detail/000004/ error
java.net.SocketTimeoutException: Read timed out
    at java.net.SocketInputStream.socketRead0(Native Method)
    at java.net.SocketInputStream.socketRead(Unknown Source)
    at java.net.SocketInputStream.read(Unknown Source)
    at java.net.SocketInputStream.read(Unknown Source)
    at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:139)
    at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:155)
    at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:284)
    at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:140)
    at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:57)
    at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:261)
    at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:165)
    at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:167)
    at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:272)
    at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:124)
    at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:271)
    at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:184)
    at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:88)
    at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110)
    at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:184)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:82)
    at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:107)
    at us.codecraft.webmagic.downloader.HttpClientDownloader.download(HttpClientDownloader.java:102)
    at us.codecraft.webmagic.Spider.processRequest(Spider.java:404)
    at us.codecraft.webmagic.Spider$1.run(Spider.java:321)
    at us.codecraft.webmagic.thread.CountableThreadPool$1.run(CountableThreadPool.java:74)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
    at java.lang.Thread.run(Unknown Source)

我的代码

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.SocketTimeoutException;
import java.util.ArrayList;
import java.util.Date;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;

public class GithubRepoPageProcessor implements PageProcessor {
    jxl.Workbook readwb=null;
    String[] a=new String[]{};
    Goodsdata gd=new Goodsdata(); 
    DatabaseControl dc=new DatabaseControl();
    static ArrayList<String>list=new ArrayList<String>();
    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setCharset("Shift_JIS");
    public void process(Page page) {
        String todey_status="";
        String maker_no="";
        String oem_no="";
        String color="";
        String material="";
        String size="";
        String innerGoods="";
        String rightMor="";
        String warning="";
        String introduction="";
        String referedGoods="";
        String similiarGoods="";
        String similiarGoodscheck="";
        maker_no=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[1]/text()").get();
        oem_no=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[2]/text()").get();
        color=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[1]/text()").get();
        material=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[2]/text()").get();
        size=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[6]//td/text()").get();
        innerGoods=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[8]//td/text()").get();
        rightMor=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[10]//td/text()").get();
        warning=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[12]//td/text()").get();
        introduction=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[14]//td/text()").get();
        String todey_status_check=page.getHtml().xpath("//p[1]//p[2]//p[2]//table[4]//tbody//tr//td").get();
        if(todey_status_check.contains("売り切れ中です。")){
            todey_status="0";
        }else{
            String[] str=null;
            str=todey_status_check.split(">");
            todey_status=RegexString(str[str.length-2],"\\d{1,2}");
        }
        String html=page.getHtml().toString();
        a=html.split("\n");
        if(page.getHtml()
                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[1]//td//table//tbody//tr[1]//th")
                .match()){
            for(int i=0;i<a.length;i++){
                if(!a[i].contains("この商品の関連商品")){
                    continue;
                }else{
                    for(int j=i+1;j<a.length;j++){
                        if(a[j].contains("</table>")){
                            referedGoods=referedGoods.substring(0, referedGoods.length()-1);
                            break;
                        }else{
                            if(a[j].contains("商品番号")){
                                    String regEx="\\d{6}|\\b\\w{2,3}\\d{3,4}";
                                    referedGoods=referedGoods+"nf-"+RegexString(a[j],regEx)+":";//调用正则函数表达式函数,返回关联商品番号]
                            }
                        }

                    }
                }
            }
        }
        if (page.getHtml()
                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong")
                .match()) {
            similiarGoodscheck = page.getHtml()
                    .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong/text()")
                    .get();
            for (int i = 0; i < a.length; i++) {
                if (!a[i].contains(similiarGoodscheck)) {
                    continue;
                } else {
                    for (int j = i + 1; j < a.length; j++) {
                        if (a[j].contains("</table>")) {
                            similiarGoods = similiarGoods.substring(0, similiarGoods.length() - 1);
                            break;
                        } else {
                            if (a[j].contains("商品番号")) {
                                String regEx = "\\d{6}|\\b\\w{2,3}\\d{3,4}";
                                similiarGoods = similiarGoods + "nf-" + RegexString(a[j], regEx) + ":";// 调用正则函数表达式函数,返回关联商品番号]
                            }
                        }

                    }
                }
            }
        } 
//          System.out.println(todey_status);
//        System.out.println(maker_no+"  "+oem_no+" ");
//        System.out.println(color+" "+material+" "+size+" ");
//        System.out.println(innerGoods+" "+rightMor+" "+warning+" "+introduction);
//        System.out.println(referedGoods);
//        System.out.println(similiarGoods);
        gd.setMaker_no(maker_no);
        gd.setOem_no(oem_no);
        gd.setColor(color);
        gd.setMaterial(material);
        gd.setSize(size);
        gd.setInnerGoods(innerGoods);
        gd.setRightMor(rightMor);
        gd.setWarning(warning);
        gd.setIntroduction(introduction);
        gd.setReferedGoods(referedGoods);
        gd.setSimiliarGoods(similiarGoods);
        //dc.insert(gd);

    }
    public String RegexString(String targetStr,String patternStr){//正则表达式函数,接收目标html字符串,正则表达式
        String goodsnum=null;
        Pattern pt=Pattern.compile(patternStr);
        Matcher matcher=pt.matcher(targetStr);
        boolean rs=matcher.find();
        if(rs){
            goodsnum=matcher.group();
        }
        return goodsnum;
    }
    public Site getSite() {
        return site;
    }
    public void openXls() throws BiffException, IOException{//获得excel的内容
        try {
            int column=0;
            InputStream instream=new FileInputStream("C:\\Users\\xujio\\Desktop\\itemdatabase_neo.xls");
            readwb=Workbook.getWorkbook(instream);
            Sheet readsheet =readwb.getSheet(0);
            int rsColumn=readsheet.getColumns();
            int rsRows=readsheet.getRows();
            for(int j=0;j<rsColumn;j++){
                Cell cell=readsheet.getCell(j, 0);
                if(cell.getContents().equals("管理番号")){
                    column=j;
                    break;
                }
            }
            for(int i=1;i<rsRows;i++){
                String originNum=null;
                Cell cell=readsheet.getCell(column,i);
                originNum=cell.getContents();
                String[] numGoods=originNum.split("-");
                list.add(numGoods[1]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }finally{
            readwb.close();
        }
    }
    public static void main(String[] args) {
        int check=0;    
        String strNum=null;
        try {
            new GithubRepoPageProcessor().openXls();//读取一个.xls文件
        } catch (BiffException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        } catch (IOException e) {
            // TODO Auto-generated catch block
            e.printStackTrace();
        }
        for(int i=0;i<5;i++){
            strNum=list.get(i);//获取商品代号
            String url="http://www.neofactory.co.jp/product_detail/"+list.get(i)+"/";//获取相关商品代号下的网页的地址
            Spider.create(new GithubRepoPageProcessor()).addUrl(url).thread(5).run();        
        }
    }
}
大家讲道理
大家讲道理

光阴似箭催人老,日月如移越少年。

reply all(1)
迷茫

Judging from the exception information you posted, it seems that the URL could not be accessed in the first place (effectively a 404), so the data cannot be crawled.

Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template