java多线程爬取多个网站的资源

发布于:2023-01-04 ⋅ 阅读:(416) ⋅ 点赞:(0)

使用多线程爬取数据

之前的博客里介绍了多线程的使用以及如何用 Java 编写简单的爬虫,本文把二者做一个简单的结合:使用多线程同时抓取不同的网址,最后把结果合并。

没有通过正则表达式来爬取不同的网址,直接写死4个网址来让不同的线程爬取

MultiThreadQueryUtil2

@Service
public class MultiThreadQueryUtil2 {

    public static void main(String[] args) {
        List<List> multiCombineResult = getMultiCombineResult();
        System.out.println(multiCombineResult);
    }

    /**
     * Crawls a fixed set of URLs concurrently and merges the per-URL results
     * into a single list.
     *
     * @return combined results from all crawler tasks; empty if every task failed
     */
    public static List<List> getMultiCombineResult() {

        // Start time, used to report total wall-clock duration.
        long start = System.currentTimeMillis();
        // Merged result returned to the caller.
        List<List> result = new ArrayList<>();

        List<String> webs = new ArrayList<>();
        webs.add("https://taolitop.com/");
        webs.add("https://taolitop.com/jj.whtml");
        webs.add("https://taolitop.com/dzhrjsjykf.whtml");
        webs.add("https://taolitop.com/zxns.whtml");

        // One Callable per URL; iterate the list instead of hardcoding its size
        // so adding/removing URLs requires no other change.
        List<Callable<List>> tasks = new ArrayList<>();
        for (String web : webs) {
            tasks.add(new ThredQuery2(web));
        }

        // Fixed-size pool sized to the number of tasks to bound thread count.
        ExecutorService executorService = Executors.newFixedThreadPool(tasks.size());
        try {
            // invokeAll blocks until every task has completed (or failed).
            List<Future<List>> futures = executorService.invokeAll(tasks);
            for (Future<List> future : futures) {
                // Future.get rethrows any exception raised inside call().
                result.addAll(future.get());
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Shut down in finally so the pool is released even when a task
            // throws; the original code leaked the pool on exception.
            executorService.shutdown();
        }

        long end = System.currentTimeMillis();
        System.out.println("线程查询数据用时:" + (end - start) + "ms");
        return result;
    }

}

ThredQuery2

public class ThredQuery2 implements Callable<List> {

    // URL this task will crawl; the fetch itself happens in call() so it
    // runs on the executor's worker thread, not on the submitting thread.
    private final String url;

    /**
     * Remembers the target URL. The original version performed the entire
     * HTTP request here, which means every page was fetched sequentially on
     * the main thread while tasks were being constructed — the thread pool
     * then only returned precomputed results. Deferring the work to call()
     * restores actual parallelism.
     *
     * @param url page to crawl for image links
     */
    public ThredQuery2(String url) {
        this.url = url;
    }

    /**
     * Fetches the page and extracts the src attribute of every img tag,
     * prefixing each with the site root to form an absolute URL.
     *
     * @return list of absolute image URLs; empty on non-200 status or error
     */
    @Override
    public List call() throws Exception {
        // 1. Create the HTTP client (like opening a browser).
        CloseableHttpClient httpClient = HttpClients.createDefault();
        CloseableHttpResponse response = null;
        // 2. Build the GET request (like typing the address).
        HttpGet request = new HttpGet(url);
        List<String> imgs = new ArrayList<>();
        try {
            // 3. Execute the request (like pressing Enter).
            response = httpClient.execute(request);
            // 4. Only parse on a 200 response.
            if (response.getStatusLine().getStatusCode() == HttpStatus.SC_OK) {
                // 5. Read the body and extract image links via Jsoup.
                HttpEntity httpEntity = response.getEntity();
                String html = EntityUtils.toString(httpEntity, "utf-8");
                Document doc = Jsoup.parse(html);
                Elements links = doc.getElementsByTag("img");
                for (Element link : links) {
                    String linkHref = link.attr("src");
                    // src attributes are site-relative; prepend the host.
                    linkHref = "https://taolitop.com" + linkHref;
                    imgs.add(linkHref);
                }
            } else {
                // Non-200 (e.g. 404): log and fall through with an empty list.
                System.out.println("返回状态不是200");
                System.out.println(EntityUtils.toString(response.getEntity(), "utf-8"));
            }
        } catch (ClientProtocolException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // 6. Release the response and client even on failure.
            HttpClientUtils.closeQuietly(response);
            HttpClientUtils.closeQuietly(httpClient);
        }
        return imgs;
    }
}

测试结果

image-20220827220854091

image-20220827220904501

image-20220827220912804

image-20220827220922257