Htmlparser,jsoup和httpclient解析页面并下载
2012年12月18日
如果你想抓取某个网页(如新浪、csdn之类)获取最新资讯列表,或者到某个站点下载文件、图片之类,就需要html解析例如htmlparser,jsoup之类的jar包,下载可用httpclient。如果想更高效率可以搞线程池,抓取解析线程和下载线程,类似消费者和生产者模型(此处不涉及,仅演示使用),代码如下: wang chaoqun
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
public static Set < String > digLinks ( String address ) throws Exception {
Set < String > result = new HashSet < String >();
URL url = new URL ( address );
Parser parser = new Parser (( HttpURLConnection ) url . openConnection ());
NodeFilter filter = new HasAttributeFilter ( "id" , "someid" );
NodeList nodes = parser . extractAllNodesThatMatch ( filter );
Node root = nodes . elementAt ( 0 );
String html = root . getChildren (). toHtml ();
parser = Parser . createParser ( html , "utf-8" );
nodes = parser . extractAllNodesThatMatch ( new TagNameFilter ( "li" ));
parser = Parser . createParser ( html , "utf-8" );
nodes = parser . extractAllNodesThatMatch ( new TagNameFilter ( "a" ));
add ( result , nodes );
return result ;
}
private static void add ( Set < String > result , NodeList nodes ) {
for ( int i = 0 ; i < nodes . size (); i ++) {
Node child = nodes . elementAt ( i );
if ( child instanceof LinkTag ) {
LinkTag linknode = ( LinkTag ) child ;
String href = linknode . getLink ();
result . add ( href );
}
}
}
标签:java
httpclient 下载
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
/**
* 读出文件中的url,连接下载保存
* @param file
*/
public static void doSave ( String file ) {
List < String > files = FileUtil . readLines ( file );
for ( String url : files ) {
String fileName = StringUtils . substringAfterLast ( url , "/" );
download ( url , fileName );
}
}
private static void download ( String url , String fileName ) {
OutputStream out = null ;
InputStream in = null ;
HttpURLConnection connection = null ;
URL server = null ;
try {
server = new URL ( url );
connection = ( HttpURLConnection ) server . openConnection ();
connection . connect ();
in = connection . getInputStream ();
File file = new File ( dir + fileName );
if ( file . exists ()) {
return ;
}
out = new FileOutputStream ( file );
int b = in . read ();
while ( b != - 1 ) {
out . write ( b );
b = in . read ();
}
in . close ();
out . close ();
} catch ( Exception e ) {
}
}
【更新】使用jsoup和4.3的httpclient(用的fluent的jar)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
private static void initHttpClient () {
int timeoutSeconds = 10 ;
int poolSize = 20 ;
RequestConfig config = RequestConfig . custom ()
. setSocketTimeout ( timeoutSeconds * 1000 )
. setConnectTimeout ( timeoutSeconds * 1000 ). build ();
httpClient = HttpClientBuilder . create (). setMaxConnTotal ( poolSize )
. setMaxConnPerRoute ( poolSize ). setDefaultRequestConfig ( config )
. build ();
}
/**
* 根据URL获得所有的html信息
*/
public static String getContent ( String url ) {
String content = "" ;
Executor executor = Executor . newInstance ( httpClient );
try {
HttpResponse response = executor . execute ( Request . Get ( url ))
. returnResponse ();
int status = response . getStatusLine (). getStatusCode ();
if ( status >= HttpStatus . SC_BAD_REQUEST ) {
logger . error ( "error:" + status + ":" + url );
} else {
HttpEntity entity = response . getEntity ();
content = EntityUtils . toString ( entity );
logger . info ( "ok :" + status + ":" + url );
}
} catch ( Exception e ) {
logger . error ( e . getMessage () + "\n" + url );
}
return content ;
}
jsoup获取本博客 文章标题和链接
1
2
3
4
5
6
7
8
9
10
String html = getContent ( "http://wangchaoqun.com/blog/archives/" );
html = new String ( html . getBytes ( "ISO-8859-1" ), "UTF-8" );
if ( StringUtils . isNotBlank ( html )) {
Document doc = Jsoup . parse ( html );
Elements hrefs = doc . select ( "div#blog-archives>article>h1>a" );
for ( Element each : hrefs ) {
titles . add ( each . text ());
links . add ( each . attr ( "href" ));
}
}