网络爬虫中URLConnection的使用[以科学网为例]

加载中...

网络爬虫中URLConnection的使用[以科学网为例] | DataLearnerAI

package collectip;


import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/*
 * author:合肥工业大学 管院学院 钱洋 
 *1563178220@qq.com
 *博客地址:http://blog.csdn.net/qy20115549/
*/
/**
 * URL helpers for the sciencenet blog crawler: percent-encodes Chinese text as GBK and
 * rebuilds an absolute member-list URL from a relative link.
 */
public class UrlUtil {
	/** Charset name handed to {@link java.net.URLEncoder#encode(String, String)}. */
	private static final String ENCODE = "GBK";

	/** Fixed prefix of every URL produced by {@link #UrlTrans(String)}. */
	private static final String BASE_URL = "http://blog.sciencenet.cn/blog.php?mod=member";

	/** One or more consecutive CJK ideographs; compiled once instead of on every call. */
	private static final Pattern CHINESE_RUN = Pattern.compile("([\u4e00-\u9fa5]+)");

	/** The numeric value of the {@code catid} query parameter. */
	private static final Pattern CATID_PARAM = Pattern.compile("\\bcatid=(\\d+)");

	/** Any character that is not an ASCII digit (fallback id extraction). */
	private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

	/**
	 * Percent-encodes {@code str} using the GBK charset.
	 *
	 * @param str text to encode; may be {@code null}
	 * @return the encoded text, or an empty string when {@code str} is {@code null} or the
	 *         charset is not supported by the running JVM
	 */
	public static String getURLEncoderString(String str) {
		if (str == null) {
			return "";
		}
		try {
			return java.net.URLEncoder.encode(str, ENCODE);
		} catch (UnsupportedEncodingException e) {
			// Best-effort contract: report the problem and hand back an empty string.
			e.printStackTrace();
			return "";
		}
	}

	/** Small demo: converts one sample relative link and prints the result. */
	public static void main(String[] args) throws UnsupportedEncodingException {
		String str = "blog.php?mod=member&type=管理科学和管理思想史&realmmedium=管理科学与工程&realm=管理综合&catid=565";
		UrlTrans(str);

	}

	/**
	 * Builds the absolute member-list URL for a relative link. The first three runs of Chinese
	 * characters in {@code str} are GBK-encoded and used, in order, as the {@code type},
	 * {@code realmmedium} and {@code realm} parameters; the {@code catid} parameter is copied
	 * from the link. The resulting URL is also printed to standard output.
	 *
	 * @param str relative link such as
	 *            {@code blog.php?mod=member&type=..&realmmedium=..&realm=..&catid=565}
	 * @return the absolute URL, with no surrounding whitespace
	 * @throws IllegalArgumentException if {@code str} contains fewer than three runs of
	 *         Chinese characters
	 */
	public static String UrlTrans(String str) {
		// Collect every run of Chinese characters, already percent-encoded.
		List<String> encoded = new ArrayList<String>();
		Matcher chinese = CHINESE_RUN.matcher(str);
		while (chinese.find()) {
			encoded.add(getURLEncoderString(chinese.group(0)));
		}
		if (encoded.size() < 3) {
			throw new IllegalArgumentException(
					"expected 3 Chinese segments (type, realmmedium, realm) but found "
							+ encoded.size() + " in: " + str);
		}

		// Prefer the explicit catid parameter so digits elsewhere in the link cannot leak into
		// the id; fall back to "all digits in the link" when no catid parameter is present.
		Matcher catid = CATID_PARAM.matcher(str);
		String id = catid.find()
				? catid.group(1)
				: NON_DIGIT.matcher(str).replaceAll("").trim();

		String url = BASE_URL
				+ "&type=" + encoded.get(0)
				+ "&realmmedium=" + encoded.get(1)
				+ "&realm=" + encoded.get(2)
				+ "&catid=" + id;
		System.out.println(url);
		return url;
	}
}

package navi.main;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import collectip.UrlUtil;
/*
 * author:合肥工业大学 管院学院 钱洋 
 *1563178220@qq.com
 *博客地址:http://blog.csdn.net/qy20115549/
*/
/**
 * Two-level crawl of blog.sciencenet.cn member listings: pages are fetched with
 * {@link URLConnection}, parsed with Jsoup, and each user link found on the second-level
 * pages is printed as {@code id==name}.
 */
public class JsonTest {

	/** Connect timeout in milliseconds. */
	private static final int CONNECT_TIMEOUT_MS = 3000;

	/** Read timeout in milliseconds, so that a stalled server cannot block the crawl forever. */
	private static final int READ_TIMEOUT_MS = 10000;

	/** Any character that is not an ASCII digit; compiled once instead of per link. */
	private static final Pattern NON_DIGIT = Pattern.compile("[^0-9]");

	/**
	 * Fetches the first-level listing, follows every category link in its first
	 * {@code div.box_line}, and prints {@code id==name} for each user link on those pages.
	 */
	public static void main(String[] args) throws Exception {
		// First-level request: its links provide the URLs of the second-level requests.
		String html = getRawHtml("http://blog.sciencenet.cn/blog.php?mod=member&type=%B9%DC%C0%ED%D7%DB%BA%CF");
		// Parse the HTML with Jsoup.
		Document document=Jsoup.parse(html);
		Elements boxes=document.select("div[class=box_line]");
		if (boxes.isEmpty()) {
			// Happens when the fetch failed (empty html) or the page layout changed.
			System.err.println("No div[class=box_line] found in the first-level page; nothing to crawl.");
			return;
		}
		Elements elements=boxes.get(0).select("li").select("a");
		for (Element ele: elements) {
			// Second-level request: the page that lists the users.
			String html1 = getRawHtml(UrlUtil.UrlTrans(ele.attr("href")));
			Document document1=Jsoup.parse(html1);
			Elements elements2=document1.select("div[id=con_box]").select("p[class=potfont]").select("a");
			for (Element ele1: elements2) {
				// The id is whatever digits appear in the link target.
				String id=NON_DIGIT.matcher(ele1.attr("href")).replaceAll("").trim();
				// The link text is the user name.
				String name=ele1.text();
				System.out.println(id+"=="+name);
			}
		}
	}

	/**
	 * Downloads a page with {@link URLConnection} and returns its body decoded as GBK.
	 *
	 * @param personalUrl absolute URL to fetch
	 * @return the page content with line terminators removed, or an empty string when the
	 *         response stream could not be opened (the failure is reported on standard error)
	 * @throws IOException if the URL is malformed or the connection object cannot be created
	 * @throws InterruptedException never thrown by this implementation; kept for caller
	 *         compatibility
	 */
	public static String  getRawHtml(String personalUrl) throws InterruptedException,IOException {
		URL url = new URL(personalUrl);
		URLConnection conn = url.openConnection();
		conn.setConnectTimeout(CONNECT_TIMEOUT_MS);
		conn.setReadTimeout(READ_TIMEOUT_MS);
		InputStream in;
		try {
			in = conn.getInputStream();
		} catch (IOException e) {
			// Stay best-effort (one bad page must not abort the crawl) but say what failed.
			System.err.println("Failed to fetch " + personalUrl + ": " + e);
			return "";
		}
		// Turn the response stream into a String.
		return convertStreamToString(in);
	}

	/**
	 * Reads {@code is} to the end as GBK text and closes it.
	 *
	 * @param is stream to drain; may be {@code null}
	 * @return all lines concatenated without line terminators; an empty string when {@code is}
	 *         is {@code null}; whatever was read so far when a read error occurs
	 * @throws IOException if the GBK reader cannot be created
	 */
	public static String convertStreamToString(InputStream is) throws IOException {
		if (is == null) {
			return "";
		}
		BufferedReader reader;
		try {
			reader = new BufferedReader(new InputStreamReader(is,"gbk"));
		} catch (IOException e) {
			// Do not leak the stream when the charset is unavailable.
			try {
				is.close();
			} catch (IOException closeFailure) {
				closeFailure.printStackTrace();
			}
			throw e;
		}
		StringBuilder sb = new StringBuilder();
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				sb.append(line);
			}
		} catch (IOException e) {
			// Best-effort: keep the partial content and report the read failure.
			e.printStackTrace();
		} finally {
			try {
				// Closing the reader also closes the wrapped stream.
				reader.close();
			} catch (IOException e) {
				e.printStackTrace();
			}
		}
		return sb.toString();

	}
}

网络爬虫中URLConnection的使用[以科学网为例]

DataLearner 官方微信

热门博客