libMesh
threads_pthread.h
Go to the documentation of this file.
1 // The libMesh Finite Element Library.
2 // Copyright (C) 2002-2017 Benjamin S. Kirk, John W. Peterson, Roy H. Stogner
3 
4 // This library is free software; you can redistribute it and/or
5 // modify it under the terms of the GNU Lesser General Public
6 // License as published by the Free Software Foundation; either
7 // version 2.1 of the License, or (at your option) any later version.
8 
9 // This library is distributed in the hope that it will be useful,
10 // but WITHOUT ANY WARRANTY; without even the implied warranty of
11 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 // Lesser General Public License for more details.
13 
14 // You should have received a copy of the GNU Lesser General Public
15 // License along with this library; if not, write to the Free Software
16 // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 
18 #ifndef LIBMESH_THREADS_PTHREAD_H
19 #define LIBMESH_THREADS_PTHREAD_H
20 
21 // Do not try to #include this header directly, it is designed to be
22 // #included directly by threads.h
23 #ifndef LIBMESH_SQUASH_HEADER_WARNING
24 # warning "This file is designed to be included through libmesh/threads.h"
25 #else
26 
27 #ifdef LIBMESH_HAVE_PTHREAD
28 
29 // C++ includes
30 #ifdef LIBMESH_HAVE_CXX11_THREAD
31 # include <thread>
32 #endif
33 
34 #include "libmesh/libmesh_logging.h"
35 #include <pthread.h>
36 #include <algorithm>
37 #include <vector>
38 
39 #ifdef __APPLE__
40 # ifdef __MAC_10_12
41 # include <os/lock.h>
42 #else
43 # include <libkern/OSAtomic.h>
44 # endif
45 #endif
46 
47 // Thread-Local-Storage macros
48 #ifdef LIBMESH_HAVE_CXX11_THREAD
49 # define LIBMESH_TLS_TYPE(type) thread_local type
50 # define LIBMESH_TLS_REF(value) (value)
51 #else // Maybe support gcc __thread eventually?
52 # define LIBMESH_TLS_TYPE(type) type
53 # define LIBMESH_TLS_REF(value) (value)
54 #endif
55 
56 namespace libMesh
57 {
58 
59 namespace Threads
60 {
61 
62 
63 #ifdef LIBMESH_HAVE_CXX11_THREAD
64 
67 typedef std::thread Thread;
68 
69 #else
70 
74 typedef NonConcurrentThread Thread;
75 
76 #endif // LIBMESH_HAVE_CXX11_THREAD
77 
78 
/**
 * Spin mutex.
 *
 * The underlying lock primitive is selected at compile time:
 *  - __APPLE__ with __MAC_10_12 defined: os_unfair_lock
 *  - __APPLE__ otherwise:                OSSpinLock
 *  - everything else:                    pthread_spinlock_t
 *
 * Only the primitive differs between platforms; the nested scoped_lock
 * helper is written once and shared by all three.
 */
class spin_mutex
{
public:
#if defined(__APPLE__) && defined(__MAC_10_12)
  spin_mutex() { ulock = OS_UNFAIR_LOCK_INIT; }

  void lock () { os_unfair_lock_lock(&ulock); }
  void unlock () { os_unfair_lock_unlock(&ulock); }
#elif defined(__APPLE__)
  spin_mutex() : slock(0) {} // The convention is that the lock being zero is _unlocked_

  void lock () { OSSpinLockLock(&slock); }
  void unlock () { OSSpinLockUnlock(&slock); }
#else
  // Might want to use PTHREAD_MUTEX_ADAPTIVE_NP on Linux, but it's not available on OSX.
  spin_mutex() { pthread_spin_init(&slock, PTHREAD_PROCESS_PRIVATE); }
  ~spin_mutex() { pthread_spin_destroy(&slock); }

  void lock () { pthread_spin_lock(&slock); }
  void unlock () { pthread_spin_unlock(&slock); }
#endif

  /**
   * Holds a spin_mutex for as long as the scoped_lock object lives.
   * The mutex is unlocked by release() or, at the latest, by the
   * destructor.
   */
  class scoped_lock
  {
  public:
    /// Creates a lock object that does not hold any mutex yet.
    scoped_lock () : smutex(nullptr) {}

    /// Locks \p in_smutex immediately.
    explicit scoped_lock ( spin_mutex & in_smutex ) : smutex(&in_smutex) { smutex->lock(); }

    /// Unlocks the held mutex, if any.
    ~scoped_lock () { release(); }

    /// Locks \p in_smutex and remembers it.  A mutex that is already
    /// held is NOT released first, so call release() beforehand.
    void acquire ( spin_mutex & in_smutex ) { smutex = &in_smutex; smutex->lock(); }

    /// Unlocks the held mutex, if any.  Safe to call more than once.
    void release () { if (smutex) smutex->unlock(); smutex = nullptr; }

  private:
    spin_mutex * smutex; // currently held mutex, or null when none is held
  };

private:
#if defined(__APPLE__) && defined(__MAC_10_12)
  os_unfair_lock ulock;
#elif defined(__APPLE__)
  OSSpinLock slock;
#else
  pthread_spinlock_t slock;
#endif
};
171 
172 
173 
/**
 * Recursive mutex, implemented as a pthread mutex created with the
 * PTHREAD_MUTEX_RECURSIVE type.
 */
class recursive_mutex
{
public:
  // Might want to use PTHREAD_MUTEX_ADAPTIVE_NP on Linux, but it's not available on OSX.
  recursive_mutex()
  {
    pthread_mutexattr_init(&attr);
    pthread_mutexattr_settype(&attr, PTHREAD_MUTEX_RECURSIVE);
    pthread_mutex_init(&mutex, &attr);
  }

  ~recursive_mutex()
  {
    pthread_mutex_destroy(&mutex);
    // Every pthread_mutexattr_init() needs a matching destroy.
    pthread_mutexattr_destroy(&attr);
  }

  void lock () { pthread_mutex_lock(&mutex); }
  void unlock () { pthread_mutex_unlock(&mutex); }

  /**
   * Holds a recursive_mutex for as long as the scoped_lock object
   * lives.  The mutex is unlocked by release() or, at the latest, by
   * the destructor.
   */
  class scoped_lock
  {
  public:
    /// Creates a lock object that does not hold any mutex yet.
    scoped_lock () : rmutex(nullptr) {}

    /// Locks \p in_rmutex immediately.
    explicit scoped_lock ( recursive_mutex & in_rmutex ) : rmutex(&in_rmutex) { rmutex->lock(); }

    /// Unlocks the held mutex, if any.
    ~scoped_lock () { release(); }

    /// Locks \p in_rmutex and remembers it.  A mutex that is already
    /// held is NOT released first, so call release() beforehand.
    void acquire ( recursive_mutex & in_rmutex ) { rmutex = &in_rmutex; rmutex->lock(); }

    /// Unlocks the held mutex, if any.  Safe to call more than once.
    void release () { if (rmutex) rmutex->unlock(); rmutex = nullptr; }

  private:
    recursive_mutex * rmutex; // currently held mutex, or null when none is held
  };

private:
  pthread_mutex_t mutex;
  pthread_mutexattr_t attr;
};
212 
213 template <typename Range>
214 unsigned int num_pthreads(Range & range)
215 {
216  unsigned int min = std::min((std::size_t)libMesh::n_threads(), range.size());
217  return min > 0 ? min : 1;
218 }
219 
/**
 * Bundles a range with the body that should process it, so that the
 * pair can be passed through the single void * argument of a pthread
 * start routine.  Neither pointer is owned by this object.
 */
template <typename Range, typename Body>
class RangeBody
{
public:
  Range * range;
  Body * body;
};

/**
 * Thread start routine: invokes the body on the range.
 *
 * \param args Must point to a RangeBody<Range, Body>.
 * \returns Always a null pointer; the result of the body is not
 * propagated.
 */
template <typename Range, typename Body>
void * run_body(void * args)
{
  RangeBody<Range, Body> * range_body = static_cast<RangeBody<Range, Body> *>(args);

  Body & body = *range_body->body;
  Range & range = *range_body->range;

  body(range);

  return nullptr;
}
240 
/**
 * Scheduler to manage threads.
 *
 * In this backend every member is an empty stub: the constructor,
 * initialize() and terminate() accept the same arguments as a real
 * scheduler would but do nothing.
 */
class task_scheduler_init
{
public:
  static const int automatic = -1;
  explicit task_scheduler_init (int = automatic) {}
  void initialize (int = automatic) {}
  void terminate () {}
};
252 
253 //-------------------------------------------------------------------
// Dummy "splitting object": an empty tag type used to distinguish
// splitting constructors from copy constructors.  parallel_reduce
// passes a Threads::split() as the second constructor argument when
// it creates the per-thread copies of a Body.
258 class split {};
259 
260 
261 
262 
263 //-------------------------------------------------------------------
268 template <typename Range, typename Body>
269 inline
270 void parallel_for (const Range & range, const Body & body)
271 {
273 
274 #ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
275  const bool logging_was_enabled = libMesh::perflog.logging_enabled();
276 
277  if (libMesh::n_threads() > 1)
279 #endif
280 
281  unsigned int n_threads = num_pthreads(range);
282 
283  std::vector<Range *> ranges(n_threads);
284  std::vector<RangeBody<const Range, const Body>> range_bodies(n_threads);
285  std::vector<pthread_t> threads(n_threads);
286 
287  // Create the ranges for each thread
288  unsigned int range_size = range.size() / n_threads;
289 
290  typename Range::const_iterator current_beginning = range.begin();
291 
292  for (unsigned int i=0; i<n_threads; i++)
293  {
294  unsigned int this_range_size = range_size;
295 
296  if (i+1 == n_threads)
297  this_range_size += range.size() % n_threads; // Give the last one the remaining work to do
298 
299  ranges[i] = new Range(range, current_beginning, current_beginning + this_range_size);
300 
301  current_beginning = current_beginning + this_range_size;
302  }
303 
304  // Create the RangeBody arguments
305  for (unsigned int i=0; i<n_threads; i++)
306  {
307  range_bodies[i].range = ranges[i];
308  range_bodies[i].body = &body;
309  }
310 
311  // Create the threads. It may seem redundant to wrap a pragma in
312  // #ifdefs... but GCC warns about an "unknown pragma" if it
313  // encounters this line of code when -fopenmp is not passed to the
314  // compiler.
315 #ifdef LIBMESH_HAVE_OPENMP
316 #pragma omp parallel for schedule (static)
317 #endif
318  for (unsigned int i=0; i<n_threads; i++)
319  {
320 #if !LIBMESH_HAVE_OPENMP
321  pthread_create(&threads[i], libmesh_nullptr, &run_body<Range, Body>, (void *)&range_bodies[i]);
322 #else
323  run_body<Range, Body>((void *)&range_bodies[i]);
324 #endif
325  }
326 
327 #if !LIBMESH_HAVE_OPENMP
328  // Wait for them to finish
329 
330  // The use of 'int' instead of unsigned for the iteration variable
331  // is deliberate here. This is an OpenMP loop, and some older
332  // compilers warn when you don't use int for the loop index. The
333  // reason has to do with signed vs. unsigned integer overflow
334  // behavior and optimization.
335  // http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html
336  for (int i=0; i<static_cast<int>(n_threads); i++)
337  pthread_join(threads[i], libmesh_nullptr);
338 #endif
339 
340  // Clean up
341  for (unsigned int i=0; i<n_threads; i++)
342  delete ranges[i];
343 
344 #ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
345  if (libMesh::n_threads() > 1 && logging_was_enabled)
346  libMesh::perflog.enable_logging();
347 #endif
348 }
349 
/**
 * Execute the provided function object in parallel on the specified
 * range.  The partitioner argument is accepted but not used: the call
 * is forwarded unchanged to the two-argument parallel_for.
 */
template <typename Range, typename Body, typename Partitioner>
inline
void parallel_for (const Range & range, const Body & body, const Partitioner & /*ignored*/)
{
  return parallel_for (range, body);
}
360 
365 template <typename Range, typename Body>
366 inline
367 void parallel_reduce (const Range & range, Body & body)
368 {
370 
371 #ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
372  const bool logging_was_enabled = libMesh::perflog.logging_enabled();
373 
374  if (libMesh::n_threads() > 1)
376 #endif
377 
378  unsigned int n_threads = num_pthreads(range);
379 
380  std::vector<Range *> ranges(n_threads);
381  std::vector<Body *> bodies(n_threads);
382  std::vector<RangeBody<Range, Body>> range_bodies(n_threads);
383 
384  // Create copies of the body for each thread
385  bodies[0] = &body; // Use the original body for the first one
386  for (unsigned int i=1; i<n_threads; i++)
387  bodies[i] = new Body(body, Threads::split());
388 
389  // Create the ranges for each thread
390  unsigned int range_size = range.size() / n_threads;
391 
392  typename Range::const_iterator current_beginning = range.begin();
393 
394  for (unsigned int i=0; i<n_threads; i++)
395  {
396  unsigned int this_range_size = range_size;
397 
398  if (i+1 == n_threads)
399  this_range_size += range.size() % n_threads; // Give the last one the remaining work to do
400 
401  ranges[i] = new Range(range, current_beginning, current_beginning + this_range_size);
402 
403  current_beginning = current_beginning + this_range_size;
404  }
405 
406  // Create the RangeBody arguments
407  for (unsigned int i=0; i<n_threads; i++)
408  {
409  range_bodies[i].range = ranges[i];
410  range_bodies[i].body = bodies[i];
411  }
412 
413  // Create the threads
414  std::vector<pthread_t> threads(n_threads);
415 
416  // It may seem redundant to wrap a pragma in #ifdefs... but GCC
417  // warns about an "unknown pragma" if it encounters this line of
418  // code when -fopenmp is not passed to the compiler.
419 #ifdef LIBMESH_HAVE_OPENMP
420 #pragma omp parallel for schedule (static)
421 #endif
422  // The use of 'int' instead of unsigned for the iteration variable
423  // is deliberate here. This is an OpenMP loop, and some older
424  // compilers warn when you don't use int for the loop index. The
425  // reason has to do with signed vs. unsigned integer overflow
426  // behavior and optimization.
427  // http://blog.llvm.org/2011/05/what-every-c-programmer-should-know.html
428  for (int i=0; i<static_cast<int>(n_threads); i++)
429  {
430 #if !LIBMESH_HAVE_OPENMP
431  pthread_create(&threads[i], libmesh_nullptr, &run_body<Range, Body>, (void *)&range_bodies[i]);
432 #else
433  run_body<Range, Body>((void *)&range_bodies[i]);
434 #endif
435  }
436 
437 #if !LIBMESH_HAVE_OPENMP
438  // Wait for them to finish
439  for (unsigned int i=0; i<n_threads; i++)
440  pthread_join(threads[i], libmesh_nullptr);
441 #endif
442 
443  // Join them all down to the original Body
444  for (unsigned int i=n_threads-1; i != 0; i--)
445  bodies[i-1]->join(*bodies[i]);
446 
447  // Clean up
448  for (unsigned int i=1; i<n_threads; i++)
449  delete bodies[i];
450  for (unsigned int i=0; i<n_threads; i++)
451  delete ranges[i];
452 
453 #ifdef LIBMESH_ENABLE_PERFORMANCE_LOGGING
454  if (libMesh::n_threads() > 1 && logging_was_enabled)
456 #endif
457 }
458 
/**
 * Execute the provided reduction operation in parallel on the
 * specified range.  The partitioner argument is accepted but not
 * used: the call is forwarded unchanged to the two-argument
 * parallel_reduce.
 */
template <typename Range, typename Body, typename Partitioner>
inline
void parallel_reduce (const Range & range, Body & body, const Partitioner & /*ignored*/)
{
  return parallel_reduce(range, body);
}
469 
470 
475 template <typename T>
476 class atomic
477 {
478 public:
479  atomic () : val(0) {}
480  operator T () { return val; }
481 
483  {
484  spin_mutex::scoped_lock lock(smutex);
485  val = value;
486  return val;
487  }
488 
490  {
491  spin_mutex::scoped_lock lock(smutex);
492  val = value;
493  return *this;
494  }
495 
496 
498  {
499  spin_mutex::scoped_lock lock(smutex);
500  val += value;
501  return val;
502  }
503 
505  {
506  spin_mutex::scoped_lock lock(smutex);
507  val -= value;
508  return val;
509  }
510 
512  {
513  spin_mutex::scoped_lock lock(smutex);
514  val++;
515  return val;
516  }
517 
518  T operator++(int)
519  {
520  spin_mutex::scoped_lock lock(smutex);
521  val++;
522  return val;
523  }
524 
526  {
527  spin_mutex::scoped_lock lock(smutex);
528  val--;
529  return val;
530  }
531 
532  T operator--(int)
533  {
534  spin_mutex::scoped_lock lock(smutex);
535  val--;
536  return val;
537  }
538 
539 private:
540  T val;
542 };
543 
544 } // namespace Threads
545 
546 } // namespace libMesh
547 
548 #endif // #ifdef LIBMESH_HAVE_PTHREAD
549 
550 #endif // LIBMESH_SQUASH_HEADER_WARNING
551 
552 #endif // LIBMESH_THREADS_PTHREAD_H
NonConcurrentThread Thread
Use the non-concurrent placeholder.
Definition: threads_none.h:43
unsigned int n_threads()
Definition: libmesh_base.h:125
void acquire(recursive_mutex &in_rmutex)
void parallel_for(const Range &range, const Body &body)
Execute the provided function object in parallel on the specified range.
Definition: threads_none.h:73
void enable_logging()
Enables performance logging for an active object.
Definition: perf_log.h:161
Dummy "splitting object" used to distinguish splitting constructors from copy constructors.
Definition: threads_none.h:63
void * run_body(void *args)
const class libmesh_nullptr_t libmesh_nullptr
The libMesh namespace provides an interface to certain functionality in the library.
bool in_threads
A boolean which is true iff we are in a Threads:: function. It may be useful to assert(!Threads::in_threads) ...
Definition: threads.C:31
The Partitioner class provides a uniform interface for partitioning algorithms.
Definition: partitioner.h:48
bool logging_enabled() const
Definition: perf_log.h:166
tbb::spin_mutex spin_mutex
Spin mutex.
Definition: threads_tbb.h:209
Defines atomic operations which can only be executed on a single thread at a time.
Definition: threads_none.h:172
We use a class to turn Threads::in_threads on and off, to be exception-safe.
Definition: threads.h:53
atomic< T > & operator=(const atomic< T > &value)
PerfLog perflog
A PerfLog object to log performance.
static const bool value
Definition: xdr_io.C:108
void parallel_reduce(const Range &range, Body &body)
Execute the provided reduction operation in parallel on the specified range.
Definition: threads_none.h:101
Scheduler to manage threads.
Definition: threads_none.h:48
unsigned int num_pthreads(Range &range)
long double min(long double a, double b)
void disable_logging()
Disables performance logging for an active object.
Definition: perf_log.h:156