Java爬虫一般使用的是Jsoup和HttpClient这两个jar包[推荐使用Jsoup]
关于Jsoup的使用:Jsoup
效果图什么的我就懒得展示了,因为没啥可看的,代码里注释写的也很明白,看不懂的就copy代码,直接整一遍

正文

Server

首先得有个网站让你爬取,此处就自己用SpringBoot写几个接口来调用演示(Server)
具体创建步骤就不再赘述,直接上源码
注意:由于此篇文章内容仅演示如何使用,服务端并没有写拦截器;对于带有登陆拦截器的网站,先模拟登陆再进行后续操作,请求就不会被拦截了
ServerController.java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
package top.lete114.testserver.Controller;

import org.springframework.stereotype.Controller;
import org.springframework.web.bind.annotation.*;

/**
 * Demo server used as the crawl target in this tutorial: it serves a login
 * page, a hard-coded credential check, a profile page and two POST endpoints
 * that answer with small JSON strings.
 *
 * @author Lete乐特
 * @createDate 2021- 03-19 18:26
 */

@Controller
public class ServerController {

    /**
     * Serves the login page.
     *
     * @return the view name {@code index}
     */
    @RequestMapping({"/","index","index.html"})
    public String index(){
        return "index";
    }

    /**
     * Checks the submitted credentials against the hard-coded demo account.
     *
     * <p>On success the {@code info} view is returned directly (no redirect),
     * while a failure answers with a redirect to {@code /index}. A client that
     * does not follow redirects can therefore tell the two outcomes apart by
     * the response status; do not turn the success path into a redirect
     * without updating such clients.
     *
     * @param username submitted user name, may be {@code null}
     * @param password submitted password, may be {@code null}
     * @return {@code info} on success, {@code redirect:/index} otherwise
     */
    @PostMapping("/login")
    public String login(String username,String password){
        // Constant-first equals keeps a missing parameter (null) from throwing.
        if("admin".equals(username) && "111111".equals(password)){
            System.out.println("登陆成功!");
            return "info";
        }
        System.out.println("登陆失败!");
        return "redirect:/index";
    }

    /**
     * Serves the profile page.
     *
     * @return the view name {@code info}
     */
    @GetMapping("/info")
    public String info(){
        return "info";
    }

    /**
     * Simulated "check-in" action.
     *
     * @return a fixed JSON string written to the response body
     */
    @PostMapping("/QianDao")
    @ResponseBody
    public String QianDao(){
        System.out.println("签到成功!");
        return "{\"return\":\"签到成功!\"}";
    }

    /**
     * Echoes the two request parameters back inside a JSON string.
     *
     * @param name free-text request parameter; escaped before being embedded
     *             in the JSON response
     * @param age  numeric request parameter
     * @return a JSON string written to the response body
     */
    @PostMapping("/Param")
    @ResponseBody
    public String Param(String name,int age){
        System.out.println("带参请求成功!name="+name+",age="+age+"");
        // name comes straight from the request: escape it so quotes or
        // backslashes cannot break out of the JSON string literal.
        return "{\"return\":\"带参请求成功!name="+escapeJson(name)+",age="+age+"\"}";
    }

    /**
     * Escapes a value for use inside a double-quoted JSON string.
     *
     * @param value text to escape; {@code null} is rendered as {@code "null"},
     *              matching plain string concatenation
     * @return the escaped text, without surrounding quotes
     */
    private static String escapeJson(String value){
        String text = String.valueOf(value);
        StringBuilder out = new StringBuilder(text.length() + 8);
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            switch (c) {
                case '"':
                    out.append("\\\"");
                    break;
                case '\\':
                    out.append("\\\\");
                    break;
                case '\n':
                    out.append("\\n");
                    break;
                case '\r':
                    out.append("\\r");
                    break;
                case '\t':
                    out.append("\\t");
                    break;
                default:
                    if (c < 0x20) {
                        // Remaining control characters must be \\u-escaped in JSON.
                        out.append(String.format("\\u%04x", (int) c));
                    } else {
                        out.append(c);
                    }
            }
        }
        return out.toString();
    }
}

index.html

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>Index</title>
    <!-- Styles belong in <head>; a <style> element inside <body> is not conforming HTML. -->
    <style>
        .main{
            height: 300px;
            width: 300px;
        }
        input[type="submit"]{
            display: flex;
        }
    </style>
</head>
<body>
<!-- Login form: posts the "username" and "password" fields to /login. -->
<div class="main">
    <form action="/login" method="post">
        <input type="text" placeholder="请输入用户名" name="username">
        <input type="password" placeholder="请输入密码" name="password">
        <input type="submit" value="登陆" >
    </form>
</div>

</body>
</html>

info.html

1
2
3
4
5
6
7
8
9
10
11
12
13
<!DOCTYPE html>
<html lang="zh-CN">
    <head>
        <meta charset="UTF-8">
        <title>登陆成功</title>
    </head>
    <body>
        <!-- Profile heading with a fixed user name inside the <span>. -->
        <h1>用户名:<span>Lete乐特</span></h1>
        <!-- Check-in form: a bare POST to /QianDao with no fields. -->
        <form action="/QianDao" method="post">
            <input type="submit" value="签到">
        </form>
    </body>
</html>

Client

新建一个maven项目,并导入HttpClient依赖

1
2
3
4
5
6
<!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
<!-- Apache HttpClient 4.x: provides the org.apache.http.* classes used by the client example. -->
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.13</version>
</dependency>

直接不废话,新建main.java
登陆并获取用户昵称,并执行签到操作
main.java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
package top.lete114.testclient;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.*;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import javax.annotation.PostConstruct;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

/**
 * Client demo: logs in to the local test server, fetches the profile page,
 * checks that it contains the expected user name, then performs the
 * "check-in" POST.
 *
 * @author Lete乐特
 * @createDate 2021- 03-19 19:17
 */
public class main {

    /**
     * Runs the login / profile / check-in sequence against
     * {@code http://127.0.0.1:4000} and prints each response body.
     *
     * @param args unused
     * @throws IOException        if any request fails at the transport level
     * @throws URISyntaxException if the login URL cannot be parsed
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // One client is shared by all requests and closed on exit so its
        // connections are released.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpUriRequest login = RequestBuilder.post()
                    .setUri(new URI("http://127.0.0.1:4000/login")) // login URL
                    .setHeader("Upgrade-Insecure-Requests","1")
                    .setHeader("Accept","application/json")
                    .setHeader("Content-Type","application/x-www-form-urlencoded")
                    .setHeader("X-Requested-With","XMLHttpRequest")
                    .setHeader("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")
                    // user name / password form fields
                    .addParameter("username", "admin")
                    .addParameter("password", "111111")
                    .build();

            // Simulate the login; only the status code is needed, so the body
            // is drained and the response closed right away.
            int loginStatus;
            try (CloseableHttpResponse response = httpclient.execute(login)) {
                loginStatus = response.getStatusLine().getStatusCode();
                EntityUtils.consume(response.getEntity());
            }

            // Anything other than 200 is treated as a failed login.
            if (loginStatus != 200) {
                System.out.println("登陆失败,用户名或密码错误");
                return;
            }

            // Profile page: print it and check whether the user name is present.
            String body = fetchBody(httpclient, new HttpGet("http://127.0.0.1:4000/info"));
            System.out.println(body);
            System.out.println("是否存在Lete乐特:"+(body.contains("Lete乐特")));

            // Check-in request: print the server's answer.
            String body1 = fetchBody(httpclient, new HttpPost("http://127.0.0.1:4000/QianDao"));
            System.out.println(body1);
        }
    }

    /**
     * Executes a request, reads the whole response body as UTF-8 text and
     * closes the response.
     *
     * @param client  client used to execute the request
     * @param request request to send
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String fetchBody(CloseableHttpClient client, HttpUriRequest request) throws IOException {
        try (CloseableHttpResponse response = client.execute(request)) {
            return EntityUtils.toString(response.getEntity(), "utf-8");
        }
    }
}

在一些特殊情况下,执行某些请求时,网站需要传入参数,即如下代码,此处只演示post请求,因为get请求可以直接写参数在url后面
main.java

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
package top.lete114.testclient;

import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.*;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.util.EntityUtils;

import javax.annotation.PostConstruct;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.*;

/**
 * Client demo: logs in to the local test server, fetches the profile page,
 * performs the "check-in" POST and finally sends a POST that carries form
 * parameters.
 *
 * @author Lete乐特
 * @createDate 2021- 03-19 19:17
 */
public class main {

    /**
     * Runs the login / profile / check-in / parameterised-POST sequence
     * against {@code http://127.0.0.1:4000} and prints each response body.
     *
     * @param args unused
     * @throws IOException        if any request fails at the transport level
     * @throws URISyntaxException if the login URL cannot be parsed
     */
    public static void main(String[] args) throws IOException, URISyntaxException {
        // One client is shared by all requests and closed on exit so its
        // connections are released.
        try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
            HttpUriRequest login = RequestBuilder.post()
                    .setUri(new URI("http://127.0.0.1:4000/login")) // login URL
                    .setHeader("Upgrade-Insecure-Requests","1")
                    .setHeader("Accept","application/json")
                    .setHeader("Content-Type","application/x-www-form-urlencoded")
                    .setHeader("X-Requested-With","XMLHttpRequest")
                    .setHeader("User-Agent","Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36")
                    // user name / password form fields
                    .addParameter("username", "admin")
                    .addParameter("password", "111111")
                    .build();

            // Simulate the login; only the status code is needed, so the body
            // is drained and the response closed right away.
            int loginStatus;
            try (CloseableHttpResponse response = httpclient.execute(login)) {
                loginStatus = response.getStatusLine().getStatusCode();
                EntityUtils.consume(response.getEntity());
            }

            // Anything other than 200 is treated as a failed login.
            if (loginStatus != 200) {
                System.out.println("登陆失败,用户名或密码错误");
                return;
            }

            // Profile page: print it and check whether the user name is present.
            String body = fetchBody(httpclient, new HttpGet("http://127.0.0.1:4000/info"));
            System.out.println(body);
            System.out.println("是否存在Lete乐特:"+(body.contains("Lete乐特")));

            // Check-in request: print the server's answer.
            String body1 = fetchBody(httpclient, new HttpPost("http://127.0.0.1:4000/QianDao"));
            System.out.println(body1);

            // POST with parameters: the fields travel as a URL-encoded form body.
            HttpPost httpPost2 = new HttpPost("http://127.0.0.1:4000/Param");
            List<NameValuePair> list = new ArrayList<>();
            list.add(new BasicNameValuePair("name","Lete114"));
            list.add(new BasicNameValuePair("age","18"));
            StringEntity param = new UrlEncodedFormEntity(list, "utf-8");
            httpPost2.setEntity(param);

            String body2 = fetchBody(httpclient, httpPost2);
            System.out.println(body2);
        }
    }

    /**
     * Executes a request, reads the whole response body as UTF-8 text and
     * closes the response.
     *
     * @param client  client used to execute the request
     * @param request request to send
     * @return the response body decoded as UTF-8
     * @throws IOException if the request fails or the body cannot be read
     */
    private static String fetchBody(CloseableHttpClient client, HttpUriRequest request) throws IOException {
        try (CloseableHttpResponse response = client.execute(request)) {
            return EntityUtils.toString(response.getEntity(), "utf-8");
        }
    }
}

此篇内容就到此为止,你可以发挥你的脑洞进行更深入的使用,不仅仅可以爬取内容这么简单
如果你配合cron定时任务可以实现自动签到,自动打卡等效果