线程共享内存、具有独立堆栈 栈大小 多线程支付 宽松内存模型 处理器也是编译器
宽松内存模型 (Relaxed/Weak Memory Model)
宽松内存模型的目的是使单处理器的执行更高效。
x86 已经是市面上能买到的 “最强” 的内存模型了 😂
- 这也是 Intel 自己给自己加的包袱
- 看看 ARM/RISC-V 吧,根本就是个分布式系统
https://jyywiki.cn/OS/2022/slides/3.slides#/4/3
多处理器间即时可见性的丧失
满足单处理器 eventual memory consistency 的执行,在多处理器上可能无法序列化!
当 x≠y 时,对 x, y 的内存读写可以交换顺序
- 它们甚至可以在同一个周期里完成 (只要 load/store unit 支持)
- 如果写 x 发生 cache miss,可以让读 y 先执行
- 满足 “尽可能执行 μop” 的原则,最大化处理器性能
# <-----------+
movl $1, (x) # |
movl (y), %eax # --+
- 在多处理器上的表现
- 两个处理器分别看到 y=0 和 x=0
现代处理器:处理器也是 (动态) 编译器!
单个处理器把汇编代码 (用电路) “编译” 成更小的 μops
- RF[9] = load(RF[7] + 400)
- store(RF[12], RF[13])
- RF[3] = RF[4] + RF[5]
- 每个 μop 都有 Fetch, Issue, Execute, Commit 四个阶段
在任何时刻,处理器都维护一个 μop 的 “池子”
- 每一周期向池子补充尽可能多的 μop
- “多发射”
- 每一周期 (在不违反编译正确性的前提下) 执行尽可能多的 μop
- “乱序执行”、“按序提交”
- 这就是《计算机体系结构》 (剩下就是木桶效应,哪里短板补哪里)
画状态机
/* Shared variables of the store/load litmus test; both start at 0. */
int x = 0, y = 0;

/*
 * T1: store 1 to x, then load y and print it.
 *
 * The empty asm statement with a "memory" clobber is a compiler-only
 * barrier: it emits no instruction, but the compiler may not move memory
 * accesses across it, so the store to x stays before the load of y in the
 * generated code. It places no constraint on the processor itself.
 * (Spelled __asm__ so that it also compiles under strict -std=c11.)
 */
void T1() {
  x = 1;
  __asm__ volatile("" ::: "memory"); // compiler barrier
  printf("y = %d\n", y);
}

/* T2: mirror image of T1 -- store 1 to y, then load x and print it. */
void T2() {
  y = 1;
  __asm__ volatile("" ::: "memory"); // compiler barrier
  printf("x = %d\n", x);
}
编译器不要优化 compiler barrier
asm volatile("" ::: "memory");
0x5f5e101 = 100000001
实现源代码的按顺序翻译
在代码中插入 “优化不能穿越” 的 barrier
asm volatile ("" ::: "memory");
- Barrier 的含义是 “可以读写任何内存”
- 使用 volatile 变量
  - 保持 C 语义和汇编语义一致
gcc -c -O2 sum.c && objdump -d sum.o
0000000000000020 <Tsum>: 20: f3 0f 1e fa endbr64 24: 48 81 05 00 00 00 00 addq $0x5f5e100,0x0(%rip) # 2f <Tsum+0xf> 2b: 00 e1 f5 05 2f: c3 ret
gcc -c -O1 sum.c && objdump -d sum.o
000000000000001a <Tsum>: 1a: f3 0f 1e fa endbr64 1e: 48 8b 15 00 00 00 00 mov 0x0(%rip),%rdx # 25 <Tsum+0xb> 25: 48 8d 42 01 lea 0x1(%rdx),%rax 29: 48 81 c2 01 e1 f5 05 add $0x5f5e101,%rdx 30: 48 89 c1 mov %rax,%rcx 33: 48 83 c0 01 add $0x1,%rax 37: 48 39 d0 cmp %rdx,%rax 3a: 75 f4 jne 30 <Tsum+0x16> 3c: 48 89 0d 00 00 00 00 mov %rcx,0x0(%rip) # 43 <Tsum+0x29> 43: c3 ret
cat sum.c #include "thread.h" #define N 100000000 long sum = 0; void Tsum() { for (int i = 0; i < N; i++) { sum++; // asm volatile("add $1,%0": "+m"(sum)); } }; int main() { create(Tsum); create(Tsum); join(); printf("sum=%ld\n", sum);
编译器对内存访问 “eventually consistent” 的处理导致共享内存作为线程同步工具的失效。
eventually consistent 最终 一致
例如连续赋值 x=1 → x=2 → x=3,编译器可以只保留最后一次写入,直接生成 x=3
O1优化 O2优化
while true;do gcc -O1 sum.c -lpthread && ./a.out;done
sum=100000000
sum=100000000
sum=100000000
sum=100000000
while true;do gcc -O2 sum.c -lpthread && ./a.out;done
sum=200000000
sum=200000000
sum=200000000
输入man 3 printf 命令,再输入 /thread 过滤
man 3 printf
│printf(), fprintf(), │ Thread safety │ MT-Safe locale │
│sprintf(), snprintf(), │ │ │
│vprintf(), vfprintf(), │ │ │
│vsprintf(), vsnprintf() │ │ │
└────────────────────────┴───────────────┴────────────────┘
CONFORMING TO
fprintf(), printf(), sprintf(), vprintf(), vfprintf(), vsprintf():
POSIX.1-2001, POSIX.1-2008, C89, C99.
snprintf(), vsnprintf(): POSIX.1-2001, POSIX.1-2008, C99.
The dprintf() and vdprintf() functions were originally GNU extensions
that were later standardized in POSIX.1-2008.
Concerning the return value of snprintf(), SUSv2 and C99 contradict
each other: when snprintf() is called with size=0 then SUSv2 stipulates
an unspecified return value less than 1, while C99 allows str to be
NULL in this case, and gives the return value (as always) as the number
of characters that would have been written in case the output string
has been large enough. POSIX.1-2001 and later align their specifica‐
tion of snprintf() with C99.
glibc 2.1 adds length modifiers hh, j, t, and z and conversion charac‐
ters a and A.
glibc 2.2 adds the conversion character F with C99 semantics, and the
flag character I.
NOTES
Some programs imprudently rely on code such as the following
sprintf(buf, "%s some further text", buf);
/thread
printf 还能在多线程程序里调用吗?
/* Never returns: keeps writing the string "a" to stdout through printf. */
void thread1() {
  for (;;) {
    printf("a");
  }
}

/* Never returns: keeps writing the string "b" to stdout through printf. */
void thread2() {
  for (;;) {
    printf("b");
  }
}
我们都知道 printf 是有缓冲区的 (为什么?)
- 如果执行 buf[pos++] = ch (pos 共享) 不就 💥 了吗?
修改为:加入内联汇编
asm volatile("add $1,%0":"+m"(sum));
asm volatile("lock add $1,%0":"+m"(sum));
#include "thread.h"

#define N 100000000

/* Counter shared by every thread that runs Tsum. */
long sum = 0;

/*
 * Increments the shared counter N times.
 *
 * NOTE: `sum++` is a plain, unsynchronized read-modify-write. That is
 * deliberate: this program is the demo whose runs print totals below 2*N.
 */
void Tsum() {
  for (int i = 0; i < N; i++) {
    sum++;
  }
}

/*
 * Runs Tsum twice via create(), calls join(), then prints the total.
 * create()/join() come from thread.h (not shown here); presumably they
 * start a thread per call and wait for all of them -- confirm in thread.h.
 */
int main() {
  create(Tsum);
  create(Tsum);
  join();
  printf("sum=%ld\n", sum);
}
while true;do gcc sum.c -lpthread && ./a.out;done
sum=199055958
sum=196244234
sum=196224238
sum=191857318
sum=200000000
sum=197990013
sum=197888006
sum=200000000
sum=200000000
sum=198062222
sum=194003487
sum=197967435
sum=197983213
sum=198640457
2**64-100
18446744073709551516
while true;do ./a.out;done
Alipay_withdraw 单线程安全
gcc alipay.c -lpthread && ./a.out
balance = 18446744073709551516
#include "thread.h"

/* Account balance shared by all threads. */
unsigned long balance = 100;

/*
 * Withdraws amt from the balance if the balance covers it.
 *
 * NOTE: the check and the subtraction are two separate steps with a delay
 * in between and no synchronization. This is the deliberate flaw of the
 * demo: if two callers both pass the check before either subtracts, the
 * unsigned balance wraps around (100 - 100 - 100 == 2^64 - 100).
 */
void Alipay_withdraw(int amt) {
  if (balance >= amt) {
    usleep(1); // unexpected delays
    balance -= amt;
  }
}

/*
 * Thread body: withdraws 100 once. The id argument is unused here
 * (presumably the thread number supplied by create() -- confirm in thread.h).
 */
void Talipay(int id) {
  Alipay_withdraw(100);
}

/* Starts two withdrawals via create(), calls join(), prints the balance. */
int main() {
  create(Talipay);
  create(Talipay);
  join();
  printf("balance = %lu\n", balance);
}
山寨多线程支付
原子性
改写 thread.h 使线程拥有更大的栈
┌─────────────┬────────────────────┐
│Architecture │ Default stack size │
├─────────────┼────────────────────┤
│i386 │ 2 MB │
├─────────────┼────────────────────┤
│IA-64 │ 32 MB │
├─────────────┼────────────────────┤
│PowerPC │ 4 MB │
├─────────────┼────────────────────┤
│S/390 │ 2 MB │
├─────────────┼────────────────────┤
│Sparc-32 │ 2 MB │
├─────────────┼────────────────────┤
│Sparc-64 │ 4 MB │
├─────────────┼────────────────────┤
│x86_64 │ 2 MB │
└─────────────┴────────────────────┘
查看函数信息 man 3 pthread_create
clone是执行创建线程的系统调用
https://jyywiki.cn/OS/2022/slides/3.slides#/
多处理器编程
strace
- 程序 (源代码 S、二进制代码 C) = 状态机
- 编译器 C=compile(S)
- 应用视角的操作系统 = syscall 指令
##
CPU 使用率超过了 100%
https://jyywiki.cn/OS/2022/slides/3.slides#/1/4
Hello, Multi-threaded World!
#include "thread.h"
/* Thread body: never returns, keeps printing "a". */
void Ta() {
  for (;;) {
    printf("a");
  }
}

/* Thread body: never returns, keeps printing "b". */
void Tb() {
  for (;;) {
    printf("b");
  }
}

/*
 * Hands Ta and Tb to create() from thread.h and then returns.
 * NOTE(review): there is no explicit join() here; whether the threads are
 * waited for depends on thread.h, which is not shown -- confirm there.
 */
int main() {
  create(Ta);
  create(Tb);
}
利用 thread.h 就可以写出利用多处理器的程序!
- 操作系统会自动把线程放置在不同的处理器上
- 在后台运行,可以看到 CPU 使用率超过了 100%
CPU 使用率超过了 100%:指的是使用了超过1个CPU。
#include "thread.h"

/*
 * Per-thread state (one copy per thread because of __thread):
 *   base - address recorded when the thread enters Tprobe
 *   cur  - address recorded by the most recent stackoverflow() frame
 *   id   - the value Tprobe received as tid
 */
__thread char *base, *cur; // thread-local variables
__thread int id;

// objdump to see how thread-local variables are implemented.
// noinline keeps these accessors as real functions in the binary.
__attribute__((noinline)) void set_cur(void *ptr) { cur = ptr; }
__attribute__((noinline)) char *get_cur()         { return cur; }

/*
 * Recurses without a base case, recording the address of its own parameter
 * on every call. Every 1024 calls it prints how far that address is from
 * `base`, in KB, as a lower bound on this thread's stack size. The
 * recursion is meant to run until the stack is exhausted.
 */
void stackoverflow(int n) {
  set_cur(&n);
  if (n % 1024 == 0) {
    int sz = base - get_cur();
    printf("Stack size of T%d >= %d KB\n", id, sz / 1024);
  }
  stackoverflow(n + 1);
}

/*
 * Thread body: remembers the thread id and the address of its own
 * parameter as the starting point, then starts the probe.
 */
void Tprobe(int tid) {
  id = tid;
  base = (void *)&tid;
  stackoverflow(0);
}

/*
 * Makes stdout unbuffered, then passes Tprobe to create() four times.
 * create() comes from thread.h (not shown).
 */
int main() {
  setbuf(stdout, NULL);
  for (int i = 0; i < 4; i++) {
    create(Tprobe);
  }
}
__attribute__((noinline)) objdump 在二进制代码中查看线程局部变量的实现。
证明线程具有独立堆栈 (以及确定它们的范围)
https://jyywiki.cn/pages/OS/2022/demos/stack-probe.c
gcc stack-probe.c -lpthread &&./a.out | sort -nk 6 Stack size of T1 >= 0 KB Stack size of T2 >= 0 KB Stack size of T3 >= 0 KB Stack size of T4 >= 0 KB Stack size of T1 >= 64 KB Stack size of T2 >= 64 KB Stack size of T3 >= 64 KB Stack size of T4 >= 64 KB Stack size of T1 >= 128 KB Stack size of T2 >= 128 KB Stack size of T3 >= 128 KB Stack size of T4 >= 128 KB Stack size of T1 >= 192 KB Stack size of T2 >= 192 KB Stack size of T3 >= 192 KB Stack size of T4 >= 192 KB Stack size of T1 >= 256 KB Stack size of T2 >= 256 KB Stack size of T3 >= 256 KB Stack size of T4 >= 256 KB Stack size of T1 >= 320 KB Stack size of T2 >= 320 KB Stack size of T3 >= 320 KB Stack size of T4 >= 320 KB Stack size of T1 >= 384 KB Stack size of T2 >= 384 KB
查看程序执行的系统调用
strace ./a.out execve("./a.out", ["./a.out"], 0x7fff6c5d6c40 /* 24 vars */) = 0 brk(NULL) = 0x55a2d5f75000 arch_prctl(0x3001 /* ARCH_??? */, 0x7fff9e5ee140) = -1 EINVAL (Invalid argument) mmap(NULL, 8192, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa770566000 access("/etc/ld.so.preload", R_OK) = -1 ENOENT (No such file or directory) openat(AT_FDCWD, "/etc/ld.so.cache", O_RDONLY|O_CLOEXEC) = 3 newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=23743, ...}, AT_EMPTY_PATH) = 0 mmap(NULL, 23743, PROT_READ, MAP_PRIVATE, 3, 0) = 0x7fa770560000 close(3) = 0 openat(AT_FDCWD, "/lib/x86_64-linux-gnu/libc.so.6", O_RDONLY|O_CLOEXEC) = 3 read(3, "\177ELF\2\1\1\3\0\0\0\0\0\0\0\0\3\0>\0\1\0\0\0P\237\2\0\0\0\0\0"..., 832) = 832 pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784 pread64(3, "\4\0\0\0 \0\0\0\5\0\0\0GNU\0\2\0\0\300\4\0\0\0\3\0\0\0\0\0\0\0"..., 48, 848) = 48 pread64(3, "\4\0\0\0\24\0\0\0\3\0\0\0GNU\0i8\235HZ\227\223\333\350s\360\352,\223\340."..., 68, 896) = 68 newfstatat(3, "", {st_mode=S_IFREG|0644, st_size=2216304, ...}, AT_EMPTY_PATH) = 0 pread64(3, "\6\0\0\0\4\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0@\0\0\0\0\0\0\0"..., 784, 64) = 784 mmap(NULL, 2260560, PROT_READ, MAP_PRIVATE|MAP_DENYWRITE, 3, 0) = 0x7fa770338000 mmap(0x7fa770360000, 1658880, PROT_READ|PROT_EXEC, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x28000) = 0x7fa770360000 mmap(0x7fa7704f5000, 360448, PROT_READ, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x1bd000) = 0x7fa7704f5000 mmap(0x7fa77054d000, 24576, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_DENYWRITE, 3, 0x214000) = 0x7fa77054d000 mmap(0x7fa770553000, 52816, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0) = 0x7fa770553000 close(3) = 0 mmap(NULL, 12288, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0) = 0x7fa770335000 arch_prctl(ARCH_SET_FS, 0x7fa770335740) = 0 set_tid_address(0x7fa770335a10) = 18025 set_robust_list(0x7fa770335a20, 24) = 0 rseq(0x7fa7703360e0, 0x20, 0, 
0x53053053) = 0 mprotect(0x7fa77054d000, 16384, PROT_READ) = 0 mprotect(0x55a2d5573000, 4096, PROT_READ) = 0 mprotect(0x7fa7705a0000, 8192, PROT_READ) = 0 prlimit64(0, RLIMIT_STACK, NULL, {rlim_cur=8192*1024, rlim_max=RLIM64_INFINITY}) = 0 munmap(0x7fa770560000, 23743) = 0 rt_sigaction(SIGRT_1, {sa_handler=0x7fa7703c98f0, sa_mask=[], sa_flags=SA_RESTORER|SA_ONSTACK|SA_RESTART|SA_SIGINFO, sa_restorer=0x7fa77037a520}, NULL, 8) = 0 rt_sigprocmask(SIG_UNBLOCK, [RTMIN RT_1], NULL, 8) = 0 mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76fb34000 mprotect(0x7fa76fb35000, 8388608, PROT_READ|PROT_WRITE) = 0 getrandom("\xa9\x26\x87\x57\xa1\x6a\xdd\xbe", 8, GRND_NONBLOCK) = 8 brk(NULL) = 0x55a2d5f75000 brk(0x55a2d5f96000) = 0x55a2d5f96000 rt_sigprocmask(SIG_BLOCK, ~[], [], 8) = 0 clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa770334910, parent_tid=0x7fa770334910, exit_signal=0, stack=0x7fa76fb34000, stack_size=0x7fff00, tls=0x7fa770334640} => {parent_tid=[18026]}, 88) = 18026 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76f333000 mprotect(0x7fa76f334000, 8388608, PROT_READ|PROT_WRITE) = 0 rt_sigprocmask(SIG_BLOCK, ~[], [], 8) = 0 clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa76fb33910, parent_tid=0x7fa76fb33910, exit_signal=0, stack=0x7fa76f333000, stack_size=0x7fff00, tls=0x7fa76fb33640} => {parent_tid=[18027]}, 88) = 18027 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76eb32000 mprotect(0x7fa76eb33000, 8388608, PROT_READ|PROT_WRITE) = 0 rt_sigprocmask(SIG_BLOCK, ~[], [], 8) = 0 
clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa76f332910, parent_tid=0x7fa76f332910, exit_signal=0, stack=0x7fa76eb32000, stack_size=0x7fff00, tls=0x7fa76f332640} => {parent_tid=[18028]}, 88) = 18028 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 mmap(NULL, 8392704, PROT_NONE, MAP_PRIVATE|MAP_ANONYMOUS|MAP_STACK, -1, 0) = 0x7fa76e331000 mprotect(0x7fa76e332000, 8388608, PROT_READ|PROT_WRITE) = 0 rt_sigprocmask(SIG_BLOCK, ~[], [], 8) = 0 clone3({flags=CLONE_VM|CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD|CLONE_SYSVSEM|CLONE_SETTLS|CLONE_PARENT_SETTID|CLONE_CHILD_CLEARTID, child_tid=0x7fa76eb31910, parent_tid=0x7fa76eb31910, exit_signal=0, stack=0x7fa76e331000, stack_size=0x7fff00, tls=0x7fa76eb31640} => {parent_tid=[18029]}, 88) = 18029 rt_sigprocmask(SIG_SETMASK, [], NULL, 8) = 0 futex(0x7fa770334910, FUTEX_WAIT_BITSET|FUTEX_CLOCK_REALTIME, 18026, NULL, FUTEX_BITSET_MATCH_ANYStack size of T3 >= 0 KB Stack size of T3 >= 64 KB Stack size of T3 >= 128 KB Stack size of T3 >= 192 KB Stack size of T3 >= 256 KB Stack size of T3 >= 320 KB
__attribute__((noinline))