本文简述一下使用C++的libcurl库开发Web信息搜集工具的开发过程。 - 1024 快乐!!
0x00 libcurl的安装
sudo apt-get install libcurl # debian
sudo brew install libcurl # mac
一般安装了curl这个工具的话都应该自带了。
0x01 编译命令
g++ main.cpp curl_controller.cpp -o curl -lcurl -O3
0x02 源代码 - 暂未支持多线程,明天做
目录结构:
liyingzhes-MacBook-Pro:c_http liyingzhe$ tree -L 1
.
├── CMakeLists.txt
├── cmake-build-debug
├── curl
├── curl_controller.cpp
├── curl_controller.h
└── main.cpp
编译的时候只需要 curl_controller.cpp 、 curl_controller.h 、main.cpp
main.cpp
#include <iostream>
#include <curl/curl.h>
#include <string>
#include <regex>
#include "curl_controller.h"
#include <vector>
#define NUM 5 // 请求次数
int main() {
struct REGEX REG; // 正则结构体
/**
* 单次请求可以直接赋值
*/
HTTP_REQUEST_OPTIONS global_options;
global_options.URL = "http://127.0.0.1";
global_options.USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36";
global_options.TIME_OUT = 5;
std::vector <CURL *>curlist; // 发送请求队列
CURL_CONTROLLER * Controller = new CURL_CONTROLLER;
/**
* 设置正则
*/
REG.Title = "<title>([^<]+)</title>";
REG.Server = "Server:\\\\s(.*?)\\\\r";
/**
* 动态分配内存
* 用于存储响应信息
*/
struct ResponseInfo * Response = new ResponseInfo[NUM];
/**
* 动态分配内存
* 给每个请求都设置参数
*/
struct HTTP_REQUEST_OPTIONS * OPTIONS = new HTTP_REQUEST_OPTIONS[NUM];
for(int x =0 ; x < 5 ;x++){
OPTIONS[x].URL = global_options.URL;
OPTIONS[x].USER_AGENT = global_options.USER_AGENT;
OPTIONS[x].TIME_OUT = global_options.TIME_OUT;
}
/**
* 将指针放入数组,以便于多线程
*/
for(int x =0 ; x < NUM ;x++){
curlist.push_back(Controller->setOpt(&OPTIONS[x],&Response[x]));
}
/**
* 普通for请求
*/
for(int x = 0; x < NUM; x++ ){
Controller->RunCurl(curlist[x],&Response[x]);
}
/**
* 取得响应信息
*/
for(int x =0 ; x < NUM ;x++){
Controller->getResponse(&Response[x],®);
std::cout << "Code :" << Response[x].ResponseCode << "\tTitle :" << Response[x].ResponseTitle << std::endl;
std::cout << "Header :" << Response[x].ResponseHeader << std::endl;
}
/**
* 释放内存
*/
delete [] Response;
delete [] OPTIONS;
delete Controller;
return 0;
}
curl_controller.h
//
// Created by liyingzhe on 24/10/17.
//
#ifndef C_HTTP_CURL_CONTROLLER_H
#define C_HTTP_CURL_CONTROLLER_H
#include <iostream>
#include <regex>
#include <curl/curl.h>
/**
* 正则表达式
*/
struct REGEX{
std::string Title; // 匹配标题的正则
std::string Server; // 匹配Server的正则
std::string Powered; // 匹配Powered的正则
};
/**
* 设置必备参数
*/
struct HTTP_REQUEST_OPTIONS{
short TIME_OUT; // 连接超时时间
std::string URL; // 请求 URL
std::string USER_AGENT; // 浏览器标识
// 还可以扩展更多……
};
/**
* 响应内容结构体
*/
struct ResponseInfo{
std::string ResponseHeader; //响应头
std::string ResponseBody; // 响应主体
short int ResponseCode; // 响应码
std::string ResponseTitle; // 响应标题
std::string ResponseServerInfo; // 响应Server内容
};
class CURL_CONTROLLER{
public:
/**
* 初始化 CURL
*/
CURL_CONTROLLER();
/**
* 设置请求参数
* @param options
* @param Response
* @return
*/
CURL * setOpt(struct HTTP_REQUEST_OPTIONS * options,ResponseInfo * Response);
/**
* 获取响应内容
* @param Response
* @param REG
* @return
*/
ResponseInfo * getResponse(ResponseInfo * Response,REGEX * REG);
/**
* 执行请求
* @param curl
* @param Response
*/
void RunCurl(CURL * curl,ResponseInfo * Response);
/**
* 释放资源
*/
~CURL_CONTROLLER();
/**
* CURL指针
*/
CURL * curl;
private:
/**
* 回调函数
* @param ptr
* @param n
* @param m
* @param data
* @return
*/
static size_t callBackWrite(char * ptr,size_t n,size_t m,std::string * data);
};
#endif //C_HTTP_CURL_CONTROLLER_H
curl_controller.cpp
//
// Created by liyingzhe on 24/10/17.
//
#include "curl_controller.h"
/**
* 回调函数
* @param ptr
* @param n
* @param m
* @param data
* @return
*/
size_t CURL_CONTROLLER::callBackWrite(char *ptr, size_t n, size_t m, std::string *data) {
if (data == NULL) return 0;
data->append(ptr,n*m);
return n*m;
}
/**
* 初始化 CURL
*/
CURL_CONTROLLER::CURL_CONTROLLER() {
try{
curl = curl_easy_init();
}catch (std::exception &e){
std::cout << "CURL_CONTROLLER::CURL_CONTROLLER " <<e.what() << std::endl;
exit(-1);
}
}
/**
* 设置请求参数
* @param options
* @param Response
* @return
*/
CURL * CURL_CONTROLLER::setOpt(struct HTTP_REQUEST_OPTIONS * options,ResponseInfo * Response) {
try{
curl_easy_setopt(curl,CURLOPT_URL,options->URL.c_str());
curl_easy_setopt(curl,CURLOPT_HEADER,1);
curl_easy_setopt(curl,CURLOPT_TIMEOUT,options->TIME_OUT);
curl_easy_setopt(curl,CURLOPT_USERAGENT,options->USER_AGENT.c_str());
curl_easy_setopt(curl,CURLOPT_WRITEFUNCTION,callBackWrite);
curl_easy_setopt(curl,CURLOPT_WRITEDATA,&Response->ResponseBody);
curl_easy_setopt(curl,CURLOPT_WRITEHEADER,&Response->ResponseHeader); //header 头
curl_easy_setopt(curl,CURLOPT_SSL_VERIFYPEER,false);
curl_easy_setopt(curl,CURLOPT_SSL_VERIFYHOST,false);
RunCurl(curl,Response);
}catch (std::exception &e){
std::cout << "CURL_CONTROLLER::setOpt "<<e.what() << std::endl;
}
return curl;
}
/**
* 执行请求
* @param curl
* @param Response
*/
void CURL_CONTROLLER::RunCurl(CURL *curl, ResponseInfo *Response) {
try{
curl_easy_perform(curl);
curl_easy_getinfo(curl,CURLINFO_RESPONSE_CODE,&Response->ResponseCode);
}catch (std::exception &e){
std::cout << "CURL_CONTROLLER::RunCurl " <<e.what() << std::endl;
}
}
/**
* 获取响应内容
* @param Response
* @param REG
* @return
*/
ResponseInfo * CURL_CONTROLLER::getResponse(ResponseInfo * Response,REGEX * REG) {
std::smatch match_title;
std::regex title(REG->Title);
if (regex_search(Response->ResponseBody, match_title, title)) {
Response->ResponseTitle=match_title[1].str();
}
std::smatch match_server;
std::regex server(REG->Server);
if (regex_search(Response->ResponseHeader, match_server, server)) {
Response->ResponseServerInfo=match_title[1].str();
}
return Response;
}
/**
* 释放资源
*/
CURL_CONTROLLER::~CURL_CONTROLLER() {
curl_easy_cleanup(curl);
}
明天继续写吧~ 先去下厨了!