When using webmagic for some simple page scraping, I ran into pages that fail to download. Confusingly, during debugging the download occasionally succeeds, which is maddening. I have searched online many times without finding a solution, and my own coding skills are too limited to see where the problem lies, so I'm hoping an expert can come to the rescue.
The error reported:
2017-03-31 13:55:54,610 WARN [us.codecraft.webmagic.downloader.HttpClientDownloader] - download page http://www.neofactory.co.jp/product_detail/000004/ error
java.net.SocketTimeoutException: Read timed out
at java.net.SocketInputStream.socketRead0(Native Method)
at java.net.SocketInputStream.socketRead(Unknown Source)
at java.net.SocketInputStream.read(Unknown Source)
at java.net.SocketInputStream.read(Unknown Source)
at org.apache.http.impl.io.SessionInputBufferImpl.streamRead(SessionInputBufferImpl.java:139)
at org.apache.http.impl.io.SessionInputBufferImpl.fillBuffer(SessionInputBufferImpl.java:155)
at org.apache.http.impl.io.SessionInputBufferImpl.readLine(SessionInputBufferImpl.java:284)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:140)
at org.apache.http.impl.conn.DefaultHttpResponseParser.parseHead(DefaultHttpResponseParser.java:57)
at org.apache.http.impl.io.AbstractMessageParser.parse(AbstractMessageParser.java:261)
at org.apache.http.impl.DefaultBHttpClientConnection.receiveResponseHeader(DefaultBHttpClientConnection.java:165)
at org.apache.http.impl.conn.CPoolProxy.receiveResponseHeader(CPoolProxy.java:167)
at org.apache.http.protocol.HttpRequestExecutor.doReceiveResponse(HttpRequestExecutor.java:272)
at org.apache.http.protocol.HttpRequestExecutor.execute(HttpRequestExecutor.java:124)
at org.apache.http.impl.execchain.MainClientExec.execute(MainClientExec.java:271)
at org.apache.http.impl.execchain.ProtocolExec.execute(ProtocolExec.java:184)
at org.apache.http.impl.execchain.RetryExec.execute(RetryExec.java:88)
at org.apache.http.impl.execchain.RedirectExec.execute(RedirectExec.java:110)
at org.apache.http.impl.client.InternalHttpClient.doExecute(InternalHttpClient.java:184)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:82)
at org.apache.http.impl.client.CloseableHttpClient.execute(CloseableHttpClient.java:107)
at us.codecraft.webmagic.downloader.HttpClientDownloader.download(HttpClientDownloader.java:102)
at us.codecraft.webmagic.Spider.processRequest(Spider.java:404)
at us.codecraft.webmagic.Spider$1.run(Spider.java:321)
at us.codecraft.webmagic.thread.CountableThreadPool$1.run(CountableThreadPool.java:74)
at java.util.concurrent.ThreadPoolExecutor.runWorker(Unknown Source)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(Unknown Source)
at java.lang.Thread.run(Unknown Source)
My code:
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import jxl.Cell;
import jxl.Sheet;
import jxl.Workbook;
import jxl.read.biff.BiffException;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.Spider;
import us.codecraft.webmagic.processor.PageProcessor;
public class GithubRepoPageProcessor implements PageProcessor {

    jxl.Workbook readwb = null;
    String[] a = new String[]{};
    Goodsdata gd = new Goodsdata();
    DatabaseControl dc = new DatabaseControl();
    static ArrayList<String> list = new ArrayList<String>();

    private Site site = Site.me().setRetryTimes(3).setSleepTime(100).setCharset("Shift_JIS");
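    // Note: no explicit timeout is configured on this Site, so webmagic falls
    // back to its default socket timeout (on the order of a few seconds). A
    // slow server response then surfaces as the "Read timed out" error above.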
    public void process(Page page) {
        String todey_status = "";
        String maker_no = "";
        String oem_no = "";
        String color = "";
        String material = "";
        String size = "";
        String innerGoods = "";
        String rightMor = "";
        String warning = "";
        String introduction = "";
        String referedGoods = "";
        String similiarGoods = "";
        String similiarGoodscheck = "";
        // Extract the product attributes from the detail-page table.
        maker_no = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[1]/text()").get();
        oem_no = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[2]//td[2]/text()").get();
        color = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[1]/text()").get();
        material = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[4]//td[2]/text()").get();
        size = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[6]//td/text()").get();
        innerGoods = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[8]//td/text()").get();
        rightMor = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[10]//td/text()").get();
        warning = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[12]//td/text()").get();
        introduction = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[5]//tbody//tr[4]//td//table//tbody//tr[14]//td/text()").get();

        // Determine stock status: "売り切れ中です。" means the item is sold out.
        String todey_status_check = page.getHtml().xpath("//p[1]//p[2]//p[2]//table[4]//tbody//tr//td").get();
        if (todey_status_check.contains("売り切れ中です。")) {
            todey_status = "0";
        } else {
            String[] str = todey_status_check.split(">");
            todey_status = RegexString(str[str.length - 2], "\\d{1,2}");
        }
        // Scan the raw HTML line by line for the related-products table
        // ("この商品の関連商品") and collect each related product number.
        String html = page.getHtml().toString();
        a = html.split("\n");
        if (page.getHtml()
                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[1]//td//table//tbody//tr[1]//th")
                .match()) {
            for (int i = 0; i < a.length; i++) {
                if (!a[i].contains("この商品の関連商品")) {
                    continue;
                }
                for (int j = i + 1; j < a.length; j++) {
                    if (a[j].contains("</table>")) {
                        // Drop the trailing ":" separator (guard against an empty result).
                        if (referedGoods.length() > 0) {
                            referedGoods = referedGoods.substring(0, referedGoods.length() - 1);
                        }
                        break;
                    } else if (a[j].contains("商品番号")) {
                        String regEx = "\\d{6}|\\b\\w{2,3}\\d{3,4}";
                        // Call the regex helper to pull out the related product number.
                        referedGoods = referedGoods + "nf-" + RegexString(a[j], regEx) + ":";
                    }
                }
            }
        }
        // Same scan for the "similar products" table, whose header text is
        // first read from the page itself.
        if (page.getHtml()
                .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong")
                .match()) {
            similiarGoodscheck = page.getHtml()
                    .xpath("//p[1]//p[2]//p[2]//table[6]//tbody//tr[2]//td//table//tbody//tr[1]//th//strong/text()")
                    .get();
            for (int i = 0; i < a.length; i++) {
                if (!a[i].contains(similiarGoodscheck)) {
                    continue;
                }
                for (int j = i + 1; j < a.length; j++) {
                    if (a[j].contains("</table>")) {
                        // Drop the trailing ":" separator (guard against an empty result).
                        if (similiarGoods.length() > 0) {
                            similiarGoods = similiarGoods.substring(0, similiarGoods.length() - 1);
                        }
                        break;
                    } else if (a[j].contains("商品番号")) {
                        String regEx = "\\d{6}|\\b\\w{2,3}\\d{3,4}";
                        // Call the regex helper to pull out the similar product number.
                        similiarGoods = similiarGoods + "nf-" + RegexString(a[j], regEx) + ":";
                    }
                }
            }
        }
        // System.out.println(todey_status);
        // System.out.println(maker_no + " " + oem_no + " ");
        // System.out.println(color + " " + material + " " + size + " ");
        // System.out.println(innerGoods + " " + rightMor + " " + warning + " " + introduction);
        // System.out.println(referedGoods);
        // System.out.println(similiarGoods);
        gd.setMaker_no(maker_no);
        gd.setOem_no(oem_no);
        gd.setColor(color);
        gd.setMaterial(material);
        gd.setSize(size);
        gd.setInnerGoods(innerGoods);
        gd.setRightMor(rightMor);
        gd.setWarning(warning);
        gd.setIntroduction(introduction);
        gd.setReferedGoods(referedGoods);
        gd.setSimiliarGoods(similiarGoods);
        // dc.insert(gd);
    }
    // Regex helper: takes a target HTML string and a pattern string, and
    // returns the first match (the product number), or null if there is none.
    public String RegexString(String targetStr, String patternStr) {
        String goodsnum = null;
        Pattern pt = Pattern.compile(patternStr);
        Matcher matcher = pt.matcher(targetStr);
        if (matcher.find()) {
            goodsnum = matcher.group();
        }
        return goodsnum;
    }
    public Site getSite() {
        return site;
    }
    // Read the product codes out of the Excel file.
    public void openXls() throws BiffException, IOException {
        try {
            int column = 0;
            InputStream instream = new FileInputStream("C:\\Users\\xujio\\Desktop\\itemdatabase_neo.xls");
            readwb = Workbook.getWorkbook(instream);
            Sheet readsheet = readwb.getSheet(0);
            int rsColumn = readsheet.getColumns();
            int rsRows = readsheet.getRows();
            // Locate the column whose header cell is "管理番号".
            for (int j = 0; j < rsColumn; j++) {
                Cell cell = readsheet.getCell(j, 0);
                if (cell.getContents().equals("管理番号")) {
                    column = j;
                    break;
                }
            }
            // Keep the part of each code after the "-"; these values feed the
            // product-detail URLs built in main.
            for (int i = 1; i < rsRows; i++) {
                Cell cell = readsheet.getCell(column, i);
                String originNum = cell.getContents();
                String[] numGoods = originNum.split("-");
                list.add(numGoods[1]);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Guard against a NullPointerException when getWorkbook failed.
            if (readwb != null) {
                readwb.close();
            }
        }
    }
    public static void main(String[] args) {
        try {
            // Read the product codes from the .xls file into the static list.
            new GithubRepoPageProcessor().openXls();
        } catch (BiffException | IOException e) {
            e.printStackTrace();
        }
        // Crawl the detail page for each product code. Note that this creates a
        // new Spider (with 5 threads) for every URL instead of one Spider for all.
        for (int i = 0; i < 5; i++) {
            String url = "http://www.neofactory.co.jp/product_detail/" + list.get(i) + "/";
            Spider.create(new GithubRepoPageProcessor()).addUrl(url).thread(5).run();
        }
    }
}
Dear, your exception shows a read timeout rather than a 404: the connection is established, but the server's response does not arrive before the socket read times out, so the page cannot be downloaded and the data cannot be crawled.
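If the server is merely slow or throttling the crawler, giving webmagic more time and more attempts often gets these intermittent pages through. Below is a minimal sketch of a more forgiving Site for the processor above; setTimeOut and setCycleRetryTimes are assumed to be available in the webmagic version in use, and the 30-second timeout is only a starting value to tune:

    private Site site = Site.me()
            .setCharset("Shift_JIS")
            .setTimeOut(30000)       // raise the socket timeout to 30s (assumed starting value; tune as needed)
            .setRetryTimes(3)        // retry a failed download up to 3 times
            .setCycleRetryTimes(3)   // re-queue the request after repeated failures (assumes a webmagic version supporting this)
            .setSleepTime(1000);     // pause 1s between requests so the server is not hammered

It may also help to feed all the URLs to one Spider instead of creating a new Spider per URL in the loop in main, for example Spider.create(new GithubRepoPageProcessor()).addUrl(url1, url2).thread(5).run(), so the worker threads and retry bookkeeping are shared across requests.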