使用多线程爬取数据
之前的博客里介绍了多线程的使用以及如何用 Java 实现简单的爬虫,本文把二者做一个简单的结合:使用多线程同时爬取不同的网址,最后把结果合并
没有通过正则表达式来爬取不同的网址,直接写死4个网址来让不同的线程爬取
MultiThreadQueryUtil2
@Service
public class MultiThreadQueryUtil2 {

    public static void main(String[] args) {
        List<List> multiCombineResult = getMultiCombineResult();
        System.out.println(multiCombineResult);
    }

    /**
     * Submits one crawl task per target URL to a fixed-size thread pool,
     * waits for all of them, and merges every task's result list into a
     * single combined list.
     *
     * @return the merged results from all threads; empty if every task failed
     */
    public static List<List> getMultiCombineResult() {
        // Start timestamp for the elapsed-time report below.
        long start = System.currentTimeMillis();
        // Combined result across all tasks.
        List<List> result = new ArrayList<>();

        // Hard-coded target URLs; one crawl task (and pool thread) per URL.
        List<String> webs = new ArrayList<>();
        webs.add("https://taolitop.com/");
        webs.add("https://taolitop.com/jj.whtml");
        webs.add("https://taolitop.com/dzhrjsjykf.whtml");
        webs.add("https://taolitop.com/zxns.whtml");

        // Callable tasks produce each thread's partial result; iterating the
        // list directly keeps the task count in sync with the URL count.
        List<Callable<List>> tasks = new ArrayList<>();
        for (String web : webs) {
            tasks.add(new ThredQuery2(web));
        }

        // Fixed-size pool sized to the URL count bounds resource usage.
        ExecutorService executorService = Executors.newFixedThreadPool(webs.size());
        try {
            // invokeAll blocks until every task completes; the Futures carry
            // each task's result list in submission order.
            List<Future<List>> futures = executorService.invokeAll(tasks);
            for (Future<List> future : futures) {
                result.addAll(future.get());
            }
        } catch (InterruptedException e) {
            // Restore the interrupt flag so callers can observe the interruption.
            Thread.currentThread().interrupt();
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the pool's threads, even when a task fails —
            // shutting down inside the try block would leak them on exception.
            executorService.shutdown();
        }

        long end = System.currentTimeMillis();
        System.out.println("线程查询数据用时:"+(end-start)+"ms");
        return result;
    }
}
ThredQuery2
public class ThredQuery2 implements Callable<List> {

    // Page URL this task will fetch when executed on a worker thread.
    private final String url;

    /**
     * Stores the target URL only. The HTTP request is deferred to
     * {@link #call()} so the crawl actually runs on the executor's worker
     * thread — the original performed the whole fetch here, on the caller's
     * thread during task construction, which defeated the purpose of the
     * thread pool.
     *
     * @param url the page to crawl
     */
    public ThredQuery2(String url) {
        this.url = url;
    }

    /**
     * Fetches the page, parses it with Jsoup, and returns the absolute URL
     * of every {@code <img>} tag's {@code src} attribute.
     *
     * @return list of image URLs; empty when the request fails or the
     *         response status is not 200
     */
    @Override
    public List call() throws Exception {
        // 1. Create the HTTP client — like opening a browser.
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        // 2. Build the GET request — like typing the URL into the address bar.
        HttpGet request = new HttpGet(url);
        List<String> imgs = new ArrayList<>();
        try {
            // 3. Execute the request — like pressing Enter.
            response = httpClient.execute(request);
            // 4. Only parse the body on HTTP 200.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                // 5. Read the response body and collect every <img src>,
                // prefixing the site root to make relative paths absolute.
                HttpEntity httpEntity = response.getEntity();
                String html = EntityUtils.toString(httpEntity, "utf-8");
                Document doc = Jsoup.parse(html);
                Elements links = doc.getElementsByTag("img");
                for (Element link : links) {
                    imgs.add("https://taolitop.com" + link.attr("src"));
                }
            } else {
                // Non-200 (e.g. 404): log and fall through with an empty result.
                System.out.println("返回状态不是200");
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        } catch (IOException e) {
            // Covers ClientProtocolException too (it extends IOException).
            e.printStackTrace();
        } finally {
            // 6. Release the connection resources quietly in all cases.
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
        return imgs;
    }
}
测试结果