kudu-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From t...@apache.org
Subject [2/3] incubator-kudu git commit: codegen: force inlining of utility functions in JIT code
Date Fri, 22 Jan 2016 01:19:04 GMT
codegen: force inlining of utility functions in JIT code

In investigating why some benchmarks got slower with the switch to C++11, I
looked at some of the LLVM-generated code. We switched from LLVM 3.4 to 3.7
with the switch to C++11, so I thought we might have accidentally broken
optimizations.

It turns out we didn't accidentally disable optimizations, but llvm's inlining
heuristics must have changed. Looking at the generated code, I noticed that
we were not inlining the small utility functions from precompiled.cc, but instead
generating indirect calls.

I figured out that adding the 'always_inline' attribute to these functions
fixes the inlining and gave a pretty big performance improvement.

Before:
I0120 12:57:22.352180 25902 full_stack-insert-scan-test.cc:403] Time spent empty projection,
0 col: real 0.418s user 0.016s     sys 0.048s
I0120 12:57:22.889552 25902 full_stack-insert-scan-test.cc:403] Time spent key scan, 1 col:
real 0.479s user 0.020s     sys 0.008s
I0120 12:57:24.727413 25902 full_stack-insert-scan-test.cc:403] Time spent full schema scan,
10 col: real 1.770s        user 0.092s     sys 0.016s
I0120 12:57:25.588093 25902 full_stack-insert-scan-test.cc:403] Time spent String projection,
1 col: real 0.788s        user 0.064s     sys 0.000s
I0120 12:57:26.288879 25902 full_stack-insert-scan-test.cc:403] Time spent Int32 projection,
4 col: real 0.639s user 0.028s     sys 0.000s
I0120 12:57:27.042276 25902 full_stack-insert-scan-test.cc:403] Time spent Int64 projection,
4 col: real 0.693s user 0.028s     sys 0.008s

After:
I0120 12:56:01.236634 25319 full_stack-insert-scan-test.cc:310] Doing test scans on table
of 10000000 rows.
I0120 12:56:01.965637 25319 full_stack-insert-scan-test.cc:403] Time spent empty projection,
0 col: real 0.366s user 0.032s     sys 0.004s
I0120 12:56:02.491664 25319 full_stack-insert-scan-test.cc:403] Time spent key scan, 1 col:
real 0.459s user 0.024s     sys 0.004s
I0120 12:56:03.834020 25319 full_stack-insert-scan-test.cc:403] Time spent full schema scan,
10 col: real 1.250s        user 0.096s     sys 0.008s
I0120 12:56:04.681128 25319 full_stack-insert-scan-test.cc:403] Time spent String projection,
1 col: real 0.780s        user 0.060s     sys 0.000s
I0120 12:56:05.294754 25319 full_stack-insert-scan-test.cc:403] Time spent Int32 projection,
4 col: real 0.547s user 0.028s     sys 0.000s
I0120 12:56:05.988930 25319 full_stack-insert-scan-test.cc:403] Time spent Int64 projection,
4 col: real 0.633s user 0.036s     sys 0.000s

Comparing the machine code before and after for the projection with 4 int64s
shows the effect of inlining:

LLVM 3.4 (prior to our C++11 upgrade):
        movq    (%rsi), %rax
        movq    8(%rsi), %rcx
        movq    192(%rax), %rdx
        movq    (%rdx), %rdx
        movq    40(%rdi), %rsi
        movq    %rsi, (%rdx,%rcx,8)
        movq    192(%rax), %rdx
        movq    8(%rdx), %rdx
        movq    48(%rdi), %rsi
        movq    %rsi, (%rdx,%rcx,8)
        movq    192(%rax), %rdx
        movq    16(%rdx), %rdx
        movq    56(%rdi), %rsi
        movq    %rsi, (%rdx,%rcx,8)
        movq    192(%rax), %rax
        movq    24(%rax), %rax
        movq    64(%rdi), %rdx
        movq    %rdx, (%rax,%rcx,8)
        movb    $1, %al
        ret

After upgrade to LLVM 3.7:
        pushq   %r15
        pushq   %r14
        pushq   %rbx
        movq    %rsi, %r14
        movq    %rdi, %rbx
        leaq    40(%rbx), %rdi
        movabsq $140426221318144, %r15     <-- address of the 'PrecompiledCopyCell' function
        xorl    %edx, %edx
        callq   *%r15                      <-- indirect call for each cell (#1)
        leaq    48(%rbx), %rdi
        movl    $1, %edx
        movq    %r14, %rsi
        callq   *%r15                      <-- #2
        leaq    56(%rbx), %rdi
        movl    $2, %edx
        movq    %r14, %rsi
        callq   *%r15                      <-- #3
        addq    $64, %rbx
        movl    $3, %edx
        movq    %rbx, %rdi
        movq    %r14, %rsi
        callq   *%r15                      <-- #4
        movb    $1, %al
        popq    %rbx
        popq    %r14
        popq    %r15
        retq

After this patch:
        movq    (%rsi), %rax
        movq    8(%rsi), %rcx
        movq    192(%rax), %rdx
        movq    (%rdx), %rdx
        movq    40(%rdi), %rsi
        movq    %rsi, (%rdx,%rcx,8)
        movq    192(%rax), %rdx
        movq    8(%rdx), %rdx
        movq    48(%rdi), %rsi
        movq    %rsi, (%rdx,%rcx,8)
        movq    192(%rax), %rdx
        movq    16(%rdx), %rdx
        movq    56(%rdi), %rsi
        movq    %rsi, (%rdx,%rcx,8)
        movq    192(%rax), %rax
        movq    24(%rax), %rax
        movq    64(%rdi), %rdx
        movq    %rdx, (%rax,%rcx,8)
        movb    $1, %al
        retq

It looks like the 'after' still has some redundant loads which might be preventable,
but this should restore our previous performance.

Change-Id: I49c0eec360818c160db25bac7a91b5d00fff57ba
Reviewed-on: http://gerrit.cloudera.org:8080/1845
Reviewed-by: Adar Dembo <adar@cloudera.com>
Tested-by: Internal Jenkins


Project: http://git-wip-us.apache.org/repos/asf/incubator-kudu/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-kudu/commit/1c265653
Tree: http://git-wip-us.apache.org/repos/asf/incubator-kudu/tree/1c265653
Diff: http://git-wip-us.apache.org/repos/asf/incubator-kudu/diff/1c265653

Branch: refs/heads/master
Commit: 1c26565341d603e97956c95e7865036b7509c475
Parents: 21a6164
Author: Todd Lipcon <todd@apache.org>
Authored: Wed Jan 20 12:56:46 2016 -0800
Committer: Todd Lipcon <todd@apache.org>
Committed: Fri Jan 22 01:17:48 2016 +0000

----------------------------------------------------------------------
 src/kudu/codegen/precompiled.cc | 31 ++++++++++++++++++++++---------
 1 file changed, 22 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-kudu/blob/1c265653/src/kudu/codegen/precompiled.cc
----------------------------------------------------------------------
diff --git a/src/kudu/codegen/precompiled.cc b/src/kudu/codegen/precompiled.cc
index 7f2e20b..b073176 100644
--- a/src/kudu/codegen/precompiled.cc
+++ b/src/kudu/codegen/precompiled.cc
@@ -44,10 +44,22 @@
 // IR_BUILD because we use a fake static library target to workaround a cmake
 // dependencies bug. See 'ir_fake_target' in CMakeLists.txt.
 #ifdef IR_BUILD
+
+// This file uses the 'always_inline' attribute on a bunch of functions to force
+// the LLVM optimizer at runtime to inline them where it otherwise might not.
+// Because the functions themselves aren't marked 'inline', gcc is unhappy with this.
+// But, we can't mark them 'inline' or else they'll get optimized away and not even
+// included in the .ll file. So, instead, we just mark them as always_inline in
+// the IR_BUILD context.
+#define IR_ALWAYS_INLINE __attribute__((always_inline))
+
 // Workaround for an MCJIT deficiency where we see a link error when trying
 // to load the JITted library. See the following LLVM bug and suggested workaround.
 // https://llvm.org/bugs/show_bug.cgi?id=18062
 extern "C" void *__dso_handle __attribute__((__visibility__("hidden"))) = NULL;
+
+#else
+#define IR_ALWAYS_INLINE
 #endif
 
 namespace kudu {
@@ -55,8 +67,8 @@ namespace kudu {
 // Returns whether copy was successful (fails iff slice relocation fails,
 // which can only occur if is_string is true).
 // If arena is NULL, then no relocation occurs.
-static bool BasicCopyCell(uint64_t size, uint8_t* src, uint8_t* dst,
-                          bool is_string, Arena* arena) {
+IR_ALWAYS_INLINE static bool BasicCopyCell(
+    uint64_t size, uint8_t* src, uint8_t* dst, bool is_string, Arena* arena) {
   // Relocate indirect data
   if (is_string) {
     if (PREDICT_TRUE(arena != nullptr)) {
@@ -99,8 +111,9 @@ extern "C" {
 //   If arena is NULL then only the direct copy will occur.
 //   Returns whether successful. If not, out-of-memory during relocation of
 //   slices has occurred, which can only happen if is_string is true.
-bool _PrecompiledCopyCellToRowBlock(uint64_t size, uint8_t* src, RowBlockRow* dst,
-                                    uint64_t col, bool is_string, Arena* arena) {
+IR_ALWAYS_INLINE bool _PrecompiledCopyCellToRowBlock(
+    uint64_t size, uint8_t* src, RowBlockRow* dst,
+    uint64_t col, bool is_string, Arena* arena) {
 
   // We manually compute the destination cell pointer here, rather than
   // using dst->cell_ptr(), since we statically know the size of the column
@@ -125,9 +138,9 @@ bool _PrecompiledCopyCellToRowBlock(uint64_t size, uint8_t* src, RowBlockRow* ds
 //   bitmap indicates the cell itself is non-null).
 //   Returns whether successful. If not, out-of-memory during relocation of
 //   slices has occurred, which can only happen if is_string is true.
-bool _PrecompiledCopyCellToRowBlockNullable(
-  uint64_t size, uint8_t* src, RowBlockRow* dst, uint64_t col, bool is_string,
-  Arena* arena, uint8_t* src_bitmap, uint64_t bitmap_idx) {
+IR_ALWAYS_INLINE bool _PrecompiledCopyCellToRowBlockNullable(
+    uint64_t size, uint8_t* src, RowBlockRow* dst, uint64_t col, bool is_string,
+    Arena* arena, uint8_t* src_bitmap, uint64_t bitmap_idx) {
   // Using this method implies the nullability of the column.
   // Write whether the column is nullable to the RowBlock's ColumnBlock's bitmap
   bool is_null = BitmapTest(src_bitmap, bitmap_idx);
@@ -142,8 +155,8 @@ bool _PrecompiledCopyCellToRowBlockNullable(
 //
 //   Sets the cell at column 'col' for destination RowBlockRow 'dst'
 //   to be marked as 'is_null' (requires the column is nullable).
-void _PrecompiledCopyCellToRowBlockSetNull(
-  RowBlockRow* dst, uint64_t col, bool is_null) {
+IR_ALWAYS_INLINE void _PrecompiledCopyCellToRowBlockSetNull(
+    RowBlockRow* dst, uint64_t col, bool is_null) {
   dst->cell(col).set_null(is_null);
 }
 


Mime
View raw message