HttpClient cookie爬虫记录

发布于:2024-06-01 ⋅ 阅读:(130) ⋅ 点赞:(0)

 记录一次java语言使用httpclient爬取网站接口数据的经历

需要用到的依赖:

httpclient和httpcore是Apache提供的HTTP请求库,封装了发送请求和处理响应的细节

jsoup可以解析返回的网页HTML,并通过选择器找到你需要的节点,很方便

	<!-- Apache HttpClient: HTTP request client used by the crawler class -->
	<dependency>
				<groupId>org.apache.httpcomponents</groupId>
				<artifactId>httpclient</artifactId>
				<version>4.5.13</version> <!-- check for and use the latest version -->
			</dependency>
			<!-- Apache HttpCore: low-level HTTP components that HttpClient builds on -->
			<dependency>
				<groupId>org.apache.httpcomponents</groupId>
				<artifactId>httpcore</artifactId>
				<version>4.4.14</version> <!-- check for and use the latest version -->
			</dependency>
		<!-- jsoup: HTML parser used to select nodes from the fetched pages -->
		<dependency>
			<groupId>org.jsoup</groupId>
			<artifactId>jsoup</artifactId>
			<version>1.13.1</version>
		</dependency>

 java类:

需要将网站请求中的cookie逐个配置到BasicClientCookie对象中,再加入CookieStore,这样发送请求时就会自动带上这些cookie;如何获取cookie见文章最后的截图

package com.utils;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.cookie.BasicClientCookie;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import java.io.IOException;
 
/**
 * Example crawler that replays a browser session's cookies through Apache HttpClient,
 * extracts every {@code <table>} element from each fetched page with jsoup and hands
 * the markup to {@code JDBCBean.executeUpdate} for storage.
 *
 * <p>NOTE(review): the cookie values below are placeholders copied from a browser
 * session. Real session ids or credentials should be loaded from configuration or the
 * environment rather than committed to source control.
 */
public class HttpClientWithCookieExample {

    /** Domain that every session cookie is scoped to. */
    private static final String COOKIE_DOMAIN = "abc.com";

    /** Path that every session cookie is scoped to. */
    private static final String COOKIE_PATH = "/";

    /** Charset used to decode a response body when the response does not declare one. */
    private static final String DEFAULT_CHARSET = "utf-8";

    /** Pause between two consecutive requests, to avoid hammering the site. */
    private static final long REQUEST_INTERVAL_MILLIS = 1000L;

    /** The only HTTP status whose body is treated as page data. */
    private static final int HTTP_OK = 200;

    /**
     * Builds the cookie store and the client, then crawls the area list.
     *
     * @param args unused
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public static void main(String[] args) throws InterruptedException {
        // One store can hold any number of cookies.
        CookieStore cookieStore = new BasicCookieStore();
        cookieStore.addCookie(newCookie("ASP.NET_SessionId", "mkuq512333ljwcqkfq4i"));
        cookieStore.addCookie(newCookie("Email", "abc@qq.com"));
        cookieStore.addCookie(newCookie("Password", "511B0D5F341BDDBD9A5348923B48D14C"));

        // The context carries the cookie store into every request executed with it.
        HttpClientContext context = HttpClientContext.create();
        context.setCookieStore(cookieStore);

        // try-with-resources closes the client (and its connection pool) when the crawl ends.
        try (CloseableHttpClient httpClient =
                     HttpClients.custom().setDefaultCookieStore(cookieStore).build()) {
            extracted_area(context, httpClient);
        } catch (IOException e) {
            // Only close() can raise this here; page failures are handled inside the crawl loop.
            System.out.println("关闭HttpClient失败: " + e);
        }
    }

    /**
     * Creates a cookie scoped to {@link #COOKIE_DOMAIN} and {@link #COOKIE_PATH}.
     *
     * @param name  cookie name
     * @param value cookie value
     * @return the configured cookie
     */
    private static BasicClientCookie newCookie(String name, String value) {
        BasicClientCookie cookie = new BasicClientCookie(name, value);
        cookie.setDomain(COOKIE_DOMAIN);
        cookie.setPath(COOKIE_PATH);
        return cookie;
    }

    /**
     * Crawls the area list, which is a single page without a page parameter.
     *
     * @param context    request context holding the cookie store
     * @param httpClient client used to send the request
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void extracted_area(HttpClientContext context, HttpClient httpClient) throws InterruptedException {
        crawlPages(context, httpClient, "https://abc.com/adminKdUser/GuanLi/AreaList.aspx", false, 1);
    }

    /**
     * Crawls pages 1 to 984 of the FanYi list.
     *
     * @param context    request context holding the cookie store
     * @param httpClient client used to send the requests
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void extracted_fanyi(HttpClientContext context, HttpClient httpClient) throws InterruptedException {
        crawlPages(context, httpClient, "https://abc.com/123/GuanLi/FanYiList.aspx", true, 984);
    }

    /**
     * Crawls pages 1 to 2415 of the GoodRecord list.
     *
     * @param context    request context holding the cookie store
     * @param httpClient client used to send the requests
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void extracted( HttpClientContext context, HttpClient httpClient) throws InterruptedException {
        crawlPages(context, httpClient, "https://abc.com/123/User/GoodRecordList.aspx", true, 2415);
    }

    /**
     * Fetches pages 1..{@code pageCount}, prints the tables found on each page and stores
     * them through {@code JDBCBean.executeUpdate}. The crawl stops at the first page that
     * contains no table; a page that fails with an {@link IOException} is reported and skipped.
     *
     * @param context    request context holding the cookie store
     * @param httpClient client used to send the requests
     * @param baseUrl    page address without the page parameter
     * @param paged      whether {@code ?page=<n>} must be appended to {@code baseUrl}
     * @param pageCount  number of the last page to fetch
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    private static void crawlPages(HttpClientContext context, HttpClient httpClient,
                                   String baseUrl, boolean paged, int pageCount) throws InterruptedException {
        for (int page = 1; page <= pageCount; page++) {
            String url = paged ? baseUrl + "?page=" + page : baseUrl;
            try {
                Elements tables = fetchTables(context, httpClient, url);
                // select() yields an empty list, never null, when nothing matches.
                if (tables.isEmpty()) {
                    System.out.println("第" + page + "页===终止");
                    break;
                }
                String tablesHtml = tables.html();
                System.out.println("第" + page + "页===" + tablesHtml);
                JDBCBean.executeUpdate(page, "<table>" + tablesHtml + "</table>");
            } catch (IOException e) {
                // One bad page must not abort the crawl, but the cause has to be visible.
                System.out.println(page + "解析失败: " + e);
            }
            // No need to wait once the last page has been processed.
            if (page < pageCount) {
                Thread.sleep(REQUEST_INTERVAL_MILLIS);
            }
        }
    }

    /**
     * Sends a GET request and returns every {@code <table>} element of the returned page.
     *
     * @param context    request context holding the cookie store
     * @param httpClient client used to send the request
     * @param url        address to fetch
     * @return the matching elements; empty when the page contains no table
     * @throws IOException if the request fails, the status is not 200 or the response has no body
     */
    private static Elements fetchTables(HttpClientContext context, HttpClient httpClient, String url)
            throws IOException {
        HttpResponse response = httpClient.execute(new HttpGet(url), context);
        try {
            int statusCode = response.getStatusLine().getStatusCode();
            HttpEntity entity = response.getEntity();
            // Do not store the markup of an error page, and do not pass a null entity on.
            if (statusCode != HTTP_OK || entity == null) {
                throw new IOException("unexpected response from " + url + ": status=" + statusCode);
            }
            String body = EntityUtils.toString(entity, DEFAULT_CHARSET);
            Document doc = Jsoup.parse(body);
            return doc.select("table");
        } finally {
            // Releases the connection on every path, including the error ones.
            EntityUtils.consumeQuietly(response.getEntity());
        }
    }
}

 此处不方便透露实际网站,就用百度来举例:在浏览器开发者工具(F12)的"网络"面板中选中请求,从请求标头里取出Cookie内容,再把其中每一项的名称和值分别填入一个BasicClientCookie对象即可