diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index 8927bd404..f2e3c1152 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -523,9 +523,12 @@
 #if EIGEN_COMP_MSVC || EIGEN_COMP_ICC
 #define EIGEN_STRONG_INLINE __forceinline
+#elif EIGEN_COMP_CLANG
+// Under clang, request inlining explicitly via the always_inline attribute.
+#define EIGEN_STRONG_INLINE inline __attribute__((always_inline))
 #else
 #define EIGEN_STRONG_INLINE inline
 #endif
 #endif
 
 // EIGEN_ALWAYS_INLINE is the stronget, it has the effect of making the function inline and adding every possible
 // attribute to maximize inlining. This should only be used when really necessary: in particular,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index bb63baee2..1c3155b74 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -164,9 +164,9 @@ class TensorCostModel {
   static const int kDeviceCyclesPerComputeCycle = 1;
 
  // Costs in device cycles.
-  static const int kStartupCycles = 100000;
-  static const int kPerThreadCycles = 100000;
-  static const int kTaskSize = 40000;
+  static const int kStartupCycles = 5000;  // was 100000; NOTE(review): rationale for the lower value is not stated - confirm with benchmarks
+  static const int kPerThreadCycles = 5000;  // was 100000; NOTE(review): same caveat as kStartupCycles
+  static const int kTaskSize = 5000;  // was 40000; NOTE(review): same caveat as kStartupCycles
 
   // Returns the number of threads in [1:max_threads] to use for
   // evaluating an expression with the given output size and cost per
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index ca9ba402e..5e3fe21ad 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -234,6 +234,7 @@ struct ThreadPoolDevice {
       }
     }
 
+#if !defined(EIGEN_OPENMP)
     // Recursively divide size into halves until we reach block_size.
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
@@ -253,6 +254,22 @@ struct ThreadPoolDevice {
     };
     handleRange(0, n);
     barrier.Wait();
+#else
+    // OpenMP build: run one loop iteration per block instead of going through
+    // the thread pool. The counter keeps the type returned by divup() rather
+    // than being narrowed to unsigned int, so the block count cannot truncate
+    // and the index arithmetic below does not mix signed and unsigned operands
+    // (presumably that type is the signed Index; MSVC's OpenMP also requires a
+    // signed loop index).
+    auto blocks = divup(n, block_size);
+    #pragma omp parallel for
+    for (decltype(blocks) i = 0; i < blocks; ++i) {
+      const auto first = block_size * i;
+      // The last block may be shorter than block_size: clamp its end to n.
+      const auto last = (n <= first + block_size) ? n : first + block_size;
+      f(first, last);
+    }
+#endif  // !defined(EIGEN_OPENMP)
   }
 
   // Convenience wrapper for parallelFor that does not align blocks.