nptl/pthread_cond_common.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469

/* pthread_cond_common -- shared code for condition variable.
   Copyright (C) 2016-2021 Free Software Foundation, Inc.
   This file is part of the GNU C Library.

   The GNU C Library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2.1 of the License, or (at your option) any later version.

   The GNU C Library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with the GNU C Library; if not, see
   <https://www.gnu.org/licenses/>.  */

#include <atomic.h>
#include <stdint.h>
#include <pthread.h>

/* We need 3 least-significant bits on __wrefs for something else.  The
   maximum group size is therefore capped at 2^29.
   NOTE(review): this presumably assumes __wrefs is 32 bits wide
   (32 - 3 = 29) -- confirm against the pthread_cond_t definition.  */
#define __PTHREAD_COND_MAX_GROUP_SIZE ((unsigned) 1 << 29)

#if __HAVE_64B_ATOMICS == 1

/* Read COND's __wseq counter with a single relaxed-MO atomic load and
   return the value read.  Variant for targets with native 64b atomics.  */
static uint64_t __attribute__ ((unused))
__condvar_load_wseq_relaxed (pthread_cond_t *cond)
{
  uint64_t wseq = atomic_load_relaxed (&cond->__data.__wseq);
  return wseq;
}

/* Add VAL to COND's __wseq counter with one acquire-MO atomic
   fetch-and-add, and return the result of that fetch-and-add operation.
   Variant for targets with native 64b atomics.  */
static uint64_t __attribute__ ((unused))
__condvar_fetch_add_wseq_acquire (pthread_cond_t *cond, unsigned int val)
{
  uint64_t fetched = atomic_fetch_add_acquire (&cond->__data.__wseq, val);
  return fetched;
}

/* Xor VAL into COND's __wseq counter with one release-MO atomic
   fetch-and-xor, and return the result of that fetch-and-xor operation.
   Variant for targets with native 64b atomics.  */
static uint64_t __attribute__ ((unused))
__condvar_fetch_xor_wseq_release (pthread_cond_t *cond, unsigned int val)
{
  uint64_t fetched = atomic_fetch_xor_release (&cond->__data.__wseq, val);
  return fetched;
}

/* Read COND's __g1_start counter with a single relaxed-MO atomic load and
   return the value read.  Variant for targets with native 64b atomics.  */
static uint64_t __attribute__ ((unused))
__condvar_load_g1_start_relaxed (pthread_cond_t *cond)
{
  uint64_t g1_start = atomic_load_relaxed (&cond->__data.__g1_start);
  return g1_start;
}

/* Advance COND's __g1_start counter by VAL.  The update is a relaxed-MO
   load followed by a separate relaxed-MO store, not a single atomic
   read-modify-write, so it does not protect against other concurrent
   writers of __g1_start.
   NOTE(review): presumably callers serialize all writers of __g1_start --
   confirm against the call sites.  */
static void __attribute__ ((unused))
__condvar_add_g1_start_relaxed (pthread_cond_t *cond, unsigned int val)
{
  uint64_t g1_start = atomic_load_relaxed (&cond->__data.__g1_start);
  atomic_store_relaxed (&cond->__data.__g1_start, g1_start + val);
}

#else

/* We use two 64b counters: __wseq and __g1_start.  They are monotonically
   increasing and single-writer-multiple-readers counters, so we can implement
   load, fetch-and-add, and fetch-and-xor operations even when we just have
   32b atomics.  Values we add or xor are less than or equal to 1<<31 (*),
   so we only have to make overflow-and-addition atomic wrt. concurrent
   load operations and xor operations.  To do that, we split each counter into
   two 32b values of which we reserve the MSB of each to represent an
   overflow from the lower-order half to the higher-order half.

   In the common case, the state is (higher-order / lower-order half, and . is
   basically concatenation of the bits):
   0.h     / 0.l  = h.l

   When we add a value of x that overflows (i.e., 0.l + x == 1.L), we run the
   following steps S1-S4 (the values these represent are on the right-hand
   side):
   S1:  0.h     / 1.L == (h+1).L
   S2:  1.(h+1) / 1.L == (h+1).L
   S3:  1.(h+1) / 0.L == (h+1).L
   S4:  0.(h+1) / 0.L == (h+1).L
   If the MSB of the higher-order half is set, readers will ignore the
   overflow bit in the lower-order half.

   To get an atomic snapshot in load operations, we exploit that the
   higher-order half is monotonically increasing; if we load a value V from
   it, then read the lower-order half, and then read the higher-order half
   again and see the same value V, we know that both halves have existed in
   the sequence of values the full counter had.  This is similar to the
   validated reads in the time-based STMs in GCC's libitm (e.g.,
   method_ml_wt).

   The xor operation needs to be an atomic read-modify-write.  The write
   itself is not an issue as it affects just the lower-order half but not bits
   used in the add operation.  To make the full fetch-and-xor atomic, we
   exploit that concurrently, the value can increase by at most 1<<31 (*): The
   xor operation is only called while having acquired the lock, so not more
   than __PTHREAD_COND_MAX_GROUP_SIZE waiters can enter concurrently and thus
   increment __wseq.  Therefore, if the xor operation observes a value of
   __wseq, then the value it applies the modification to later on can be
   derived (see below).

   One benefit of this scheme is that this makes load operations
   obstruction-free because unlike if we would just lock the counter, readers
   can almost always interpret a snapshot of both halves.  Readers can be
   forced to read a new snapshot when the read is concurrent with an overflow.
   However, overflows will happen infrequently, so load operations are
   practically lock-free.

   (*) The highest value we add is __PTHREAD_COND_MAX_GROUP_SIZE << 2 to
   __g1_start (the two extra bits are for the lock in the two LSBs of
   __g1_start).  */

/* A 64b counter split into two 32b words, used when only 32b atomics are
   available.  Following the scheme described in the comment above, the MSB
   of each word is reserved as an overflow flag, so each word contributes 31
   bits to the counter value (the full value is (high << 31) | low once the
   flags are accounted for).  */
typedef struct
{
  /* Lower-order half; its MSB is set temporarily while an overflow into the
     higher-order half is being propagated.  */
  unsigned int low;
  /* Higher-order half; its MSB is set temporarily during the same
     overflow propagation.  */
  unsigned int high;
} _condvar_lohi;

/* Add OP to the split 64b counter LH and return the counter value from
   before the addition.  Each half holds 31 value bits; the MSB of each half
   is the overflow flag manipulated by steps S1-S4 of the scheme described
   in the comment above.  Only the addition to the lower-order half is an
   atomic read-modify-write; the higher-order half is read and then written
   with separate operations, so this relies on add operations not running
   concurrently with each other (see the happens-before note below).  The
   order of the three stores in the overflow path must not be changed: the
   load operation depends on it to detect an in-progress overflow.  */
static uint64_t
__condvar_fetch_add_64_relaxed (_condvar_lohi *lh, unsigned int op)
{
  /* S1. Note that this is an atomic read-modify-write so it extends the
     release sequence of release MO store at S3.  L receives the value of
     the lower-order half from before the addition.  */
  unsigned int l = atomic_fetch_add_relaxed (&lh->low, op);
  unsigned int h = atomic_load_relaxed (&lh->high);
  /* Pre-addition value.  The higher-order half is shifted by 31, not 32,
     because each half carries only 31 value bits.  */
  uint64_t result = ((uint64_t) h << 31) | l;
  /* Recompute locally what the lower-order half now holds; bit 31 set means
     the 31-bit value field overflowed.  */
  l += op;
  if ((l >> 31) > 0)
    {
      /* Overflow.  Need to increment higher-order half.  Note that all
	 add operations are ordered in happens-before.  */
      h++;
      /* S2. Release MO to synchronize with the loads of the higher-order half
	 in the load operation.  See __condvar_load_64_relaxed.  The new
	 higher-order value is published with its overflow flag set.  */
      atomic_store_release (&lh->high, h | ((unsigned int) 1 << 31));
      /* Bit 31 of L is known to be set here, so the xor clears the overflow
	 flag and leaves the 31 value bits.  */
      l ^= (unsigned int) 1 << 31;
      /* S3.  See __condvar_load_64_relaxed.  */
      atomic_store_release (&lh->low, l);
      /* S4.  Likewise.  Clears the overflow flag in the higher-order
	 half.  */
      atomic_store_release (&lh->high, h);
    }
  return result;
}

static uint64_t
__condvar_load_64_relaxed (_condvar_lohi *lh)
{
  unsigned int h, l, h2;
  do
    {
      /* This load and the second one below to the same location read from the
	 stores in the overflow handling of the add operation or the
	 initializing stores (which is a simple special case because
	 initialization always completely happens before further use).
	 Because no two stores to the higher-order half write the same value,
	 the loop ensures that if we continue to use the snapshot, this load
	 and the second one read from the same store operation.  All candidate
	 store operations have release MO.
	 If we read from S2 in the first load, then we will see the value of
	 S1 on the next load (because we synchronize with S2), or a value
	 later in modification order.  We correctly ignore the lower-half's
	 overflow bit in this case.  If we read from S4, then we will see the
	 value of S3 in the next load (or a later value), which does not have
	 the overflow bit set anymore.
	  */
      h = atomic_load_acquire (&lh->high);
      /* This will read from the release sequence of S3 (i.e, either the S3
	 store or the read-modify-writes at S1 following S3 in modification
	 order).  Thus, the read synchronizes with S3, and the following load
	 of the higher-order half will read from the matching S2 (or a later
	 value).
	 Thus, if we read a lower-half value here that already overflowed and
	 belongs to an increased higher-order half value, we will see the
	 latter and h and h2 will not be equal.  */
      l = atomic_load_acquire (&lh->low);
      /* See above.  */
      h2 = atomic_load_relaxed (&lh->high);
    }
  while (h != h2);
  if (((