Crawling web page source code with HttpClient

巴扎黑
Release: 2016-12-20 12:00:30
Original
1676 people have browsed it

package util;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util .Map;
import java.util.Set;
import java.util.Map.Entry;
import java.util.zip.GZIPInputStream;

import org.apache.commons.httpclient.Header;
import org.apache .commons.httpclient.HttpClient;
import org.apache.commons.httpclient.HttpException;
import org.apache.commons.httpclient.HttpMethod;
import org.apache.commons.httpclient.HttpStatus;
import org.apache.commons .httpclient.NameValuePair;
import org.apache.commons.httpclient.SimpleHttpConnectionManager;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache .commons.httpclient.params.HttpConnectionManagerParams;
import org.apache.commons.httpclient.params.HttpMethodParams;

/ **
* @author Liuwei
* Date: December 18, 2009
*
* TODO
* HttpClient’s auxiliary class
* /
public class HttpClientHelper
{

/ **
* HttpClient’s connection timeout , read data timeout setting (unit: milliseconds)
* /
Public static final int HTTPCLIENT_CONNECTION_TIMEOUT = 30000;
Public static final int HTTPCLIENT_SO_TIMEOUT = 120000;
Public static final int HTTPMETHOD_SO_TIMEOUT = 5000;

//Let the ConnectionMan ager management Whether to close the connection when httpclientconnection
private static boolean alwaysClose = FALSE;
private static string defaultEncode = "UTF-8";

private static last DateFormat DATE_FORMAT = new SimpleDateFormat("YYYY-MM-DD HH:MM:SS ");

/ **
* Get the HttpClient connection and set the relevant parameters
*
* @return
* /
public static HttpClient's getHttpClient()
{
HttpClient client = new HttpClient (new SimpleHttpConnectionManager (alwaysClose));
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams() method.
//Set the connection timeout (in milliseconds)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Set the read data timeout (in milliseconds)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
Return to the client;
}

/ * *
* Get the HttpClient connection and set the relevant parameters
*
* @parameter logonSite
* @parameter logonPort
* @parameter protocol
* @return
* /
public static HttpClient’s getHttpClient (last string logonSite, final interpretation logonPort, last string protocol)
{
HttpClient client = new HttpClient(new SimpleHttpConnectionManager(alwaysClose));
client.getHostConfiguration() setHost(logonSite, logonPort, protocol).
HttpConnectionManagerParams managerParams = client.getHttpConnectionManager() getParams() method.
//Set the connection timeout (in milliseconds)
managerParams.setConnectionTimeout (HTTPCLIENT_CONNECTION_TIMEOUT);
//Set the read data timeout (in milliseconds)
managerParams.setSoTimeout (HTTPCLIENT_SO_TIMEOUT);
Return to the client;
}

Private static List getHeaders(Map Headers)
{
List = ArrayList of Headers new ();
Boolean includeUserAgent = FALSE;
if ( empty = header&& false == header.isEmpty() ! )
{
set > = entrySet header.entrySet();
for (enter entry: entrySet)
{
if (false == includeUserAgent
&& "UserAgent".equals(entry.getKey()))
{
includeUserAgent = TRUE;
}
headers.add(new headers() entry.getKey(), entry.getValue()));
}
}

if (false == includeUserAgent)
{
headers.add(new headers(
"UserAgent",
"Mozilla/4.0( Compatible with; MSIE 7.0; Windows NT 5.1; GTB5; .NET CLR 1.1.4322; .NET CLR 2.0 0.50727; Alexa Toolbar; MAXTHON 2.0)"));
}
Return Header;
}

Private static NameValuePair [] getPairs(Map POSTDATA)
{
if (null == || POSTDATA postData.isEmpty())
{
return NULL;
}

set > = entrySet postData.entrySet();
INT DATALENGTH = entrySet.size();
NameValuePair[] = new NameValuePair[DATALENGTH]
INT I = 0;
For (entry< ;String, string>entrySet)
{
double[i++] = new NameValuePair(entry.getKey(), entry.getValue());
}
return pair;
}

/ **
*Request web content information
*
* @parameter HttpClient
* @parameter reqUrl
*parameter title
* @parameter POSTDATA
*parameter encoding
* @return
* /
public static string doRequest(HttpClient HttpClient, String reqUrl,
Map header, Map POSTDATA, String encoding)
{
String htmlContent = NULL;
if (null == HttpClient )
{
Return htmlContent;
}

//Request encoding settings
encoding = (null == encoding defaultEncode: encoding);

//Header request information
List = header getHeaders(header );

System.out.println("[" + DATE_FORMAT.format(new Date()) + "] - doRequest - " + reqUrl);

//Post method
if (null = POSTDATA!)
{
PostMethod PostMethod = new EncodePostMethod(reqUrl, encoding);
for (head tempHeader: header)
{
postMethod.setRequestHeader(tempHeader);
}

//Post parameter setting
NameValuePair[] = PARAMS getPairs(POSTDATA ;
Other
{
GetMethod getMethod = new implementation getMethod(reqUrl);
for (head tempHeader: header)
{
getMethod.setRequestHeader(tempHeader);
}

//Extract web page content
htmlContent = executeMethod(HttpClient , getMethod, encoding, NULL);
}
Return htmlContent;
}

Private static string getWebSite(String reqUrl)
{
String website = NULL;
if (null == reqUrl || reqUrl.isEmpty( ))
{
Return to website;
}

String prefix = "HTTP://";
if (reqUrl.startsWith(prefix))
{
INT index = reqUrl.substring(prefix.length()) indexOf("/") + prefix.length();
Website = reqUrl.substring(0, index);
}
Return to website;
}

/ **
* Get the web page content by enumerating HTTPMethod
*
* @parameter HttpClient
* @parameter requestMethod
* parameter encoding
* parameter website
* @return
* /
private static string executeMethod (HttpClient HttpClient, enum HTTPMethod requestMethod, encoded string, string website)
{
String responseContent = NULL;
if (null == HttpClient)
{
return responseContent;
}

//Determine whether to request encrypted data
Boolean dataEncrypt = FALSE;
Header acceptEncoding = requestMethod.getRequestHeader(" Accept encoding");
if (! empty = acceptEncoding
. && acceptEncoding.getValue() contains("gzip"))
{
dataEncrypt = TRUE;
}

InputStream responseStream = NULL;
try
{
INT status = httpClient.executeMethod(requestMethod);
if(HttpStatus .SC_OK == status)
{
responseStream = requestMethod.getResponseBodyAsStream();
responseContent = getContentByStream(dataEncrypt new GZIPInputStream(responseStream): responseStream, encoding);
responseStream.close();
}
//Return code is 30130 2303307 When, it means that the page has been redirected, then re-request the URL of the location, which is very important when some login authorizations are used to obtain cookies. Otherwise, if (HttpStatus.SC_MOVED_PERMANENTLY == status
|| HttpStatus.SC_MOVED_TEMPORARILY == status
|| HttpStatus .SC_SEE_OTHER == status
|| HttpStatus.SC_TEMPORARY_REDIRECT == status)
{
// Read the new URL address
header = requestMethod.getResponseHeader("position");
if (! header = NULL)
{
String redirectUrl = header.getValue();
if (null = redirectUrl!
&& false == redirectUrl.isEmpty())
{
responseContent = void;
if (null == redirectUrl || redirectUrl. isEmpty())
{
redirectUrl = "/";
}

if (false == redirectUrl.startsWith("http://")
! && empty = website)
{
if (website.startsWith( "/"))
{
redirectUrl = website + redirectUrl;
}
other
{
redirectUrl = website + "/" + redirectUrl;
}
}

GetMethod redirect = new implementation getMethod( redirectUrl);
Header referrer = requestMethod.getRequestHeader("referrer");
if (null = referrer! ; + }

} //Terminal

} //End status

} catch up (Exception 5)
{
e.printStackTrace();
}Finally
{
If (requestMethod! = NULL)
{
requestMethod.releaseConnection();
}
}
Return responseContent;
}

/ **
* Read information from the stream according to the specified encoding
*
* @parameter inStream
*Parameter encoding
* @return
*Throws IOException
* /
Public static string getContentByStream(InputStream inStream, String encoding) throws IOException
{
if (null == break)
{
return NULL;
}

StringBuilder content = new StringBuilder();
//Read the stream content using the specified encoding format
BufferedReader reader = new BufferedReader(new InputStreamReader(Interstitial, encoding));
String message = NULL;
while (null = (message = reader.readLine())!)
{
content.append(message);
content.append("r n");
}
//Close the reader, Release resources
reader.close();
Return (content.toString());
}

/ **
*Internal class, inherited from PostMethod, used to specify the postal request encoding format
* /
Public static class PostMethod extended by EncodePostMethod
{
private string encoding = NULL;

public EncodePostMethod(URL String, String encoding)
{
super(URL);
this.encode = encoding;
}

@override
public String getRequestCharSet()
{
// TODO automatically generate method stub
return (this.encode);
}

}

/ **
* test
*
* @parameter ARGS
* /
public static invalid main(String[] args)
{
//System.setProperty("http.proxyHost", "165.228.128.10");
//System.setProperty("http.proxyPort", "3128");
//System.setProperty("http.proxySet","true");


String reqUrl = " http://news.39.net/jbyw/index.html ";
reqUrl = " http:// news.39.net/a/2010722/1404231.html ”;
Map headers = new HashMap ();
headers.put("Accept encoding" , "gzip,deflate");

HttpClient HttpClient = getHttpClient();
String htmlContent = doRequest(HttpClient, reqUrl, headers, null, "GBK");
System.out.println( htmlContent);

}
}


Related labels:
source:php.cn
Statement of this Website
The content of this article is voluntarily contributed by netizens, and the copyright belongs to the original author. This site does not assume corresponding legal responsibility. If you find any content suspected of plagiarism or infringement, please contact admin@php.cn
Latest Downloads
More>
Web Effects
Website Source Code
Website Materials
Front End Template
About us Disclaimer Sitemap
php.cn: Free online PHP training that helps PHP learners grow quickly!