2006/09/15

Document Actions

グローバル変数は遅い？

ふとしたことから以下のような不思議な現象を発見してしまいました。

test.cpp:

#define N (64*500)
            
class wrap {
    int data[N];
public:     
    wrap() {
        int i;                      
        for (i = N - 1; i >= 0; i--)
            data[N - i + 1] = i;
    }
                 
    void doit() {
        int i, j, tmp;               
        for (i = 0; i < N - 1; i++) {    
            for (j = i + 1; j < N; j++) {
                if (data[i] > data[j]) {
                    tmp = data[i];
                    data[i] = data[j];
                    data[j] = tmp;
                }
            }
        }
    }
};
            
int main() {
    wrap a;  
    a.doit();
}

test.c:

#define N (64*500)
            
int data[N];

void init() {
    int i;
    for (i = N - 1; i >= 0; i--)    
        data[N - i + 1] = i;
}

void doit() {
    int i, j, tmp;
    for (i = 0; i < N - 1; i++) {    
        for (j = i + 1; j < N; j++) {    
            if (data[i] > data[j]) {
                tmp = data[i];
                data[i] = data[j];
                data[j] = tmp;
            }
        }
    }
}

int main() {
    init(); 
    doit();
}

% g++ test.cpp -o test.cpp.out
% time ./test.cpp.out
./test.cpp.out  5.01s user 0.00s system 94% cpu 5.311 total
% gcc test.c -o test.c.out
% time ./test.c.out
./test.c.out  3.84s user 0.00s system 95% cpu 4.048 total

classのオーバーヘッドがあるんでこれは当然なんですが、

% g++ -O3 test.cpp -o test.cpp.out
% time ./test.cpp.out
./test.cpp.out  1.14s user 0.00s system 92% cpu 1.233 total
% gcc -O3 test.c -o test.c.out
% time ./test.c.out
./test.c.out  1.32s user 0.00s system 93% cpu 1.412 total

奇妙ですね。

% g++ -O3 -c test.cpp -o test.cpp.o
% objdump -t test.cpp.o
SYMBOL TABLE:
00000000 l    df *ABS*  00000000 test.cpp
00000000 l    d  .text  00000000 .text
00000000 l    d  .data  00000000 .data
00000000 l    d  .bss   00000000 .bss
00000000 l    d  .note.GNU-stack        00000000 .note.GNU-stack
00000000 l    d  .comment       00000000 .comment
00000000 g     F .text  00000067 main
% gcc -O3 -c test.c -o test.c.o
% objdump -t test.c.o
00000000 l    df *ABS*  00000000 test.c
00000000 l    d  .text  00000000 .text
00000000 l    d  .data  00000000 .data
00000000 l    d  .bss   00000000 .bss
00000000 l    d  .note.GNU-stack        00000000 .note.GNU-stack
00000000 l    d  .comment       00000000 .comment
00000000 g     F .text  00000020 init
0001f400       O *COM*  00000020 data
00000020 g     F .text  0000004e doit
00000070 g     F .text  0000006e main

initとdoitがinlineになってないじゃん。それぞれ一回しか呼ばれていないので全然関係ないのですが、一応inline化しましょう。gccマニュアルによるとスタンドアロンなオブジェクトのインライン展開にはstatic inlineを使わないとだめだよとのことなので以下のように修正します。

test.c:

#define N (64*500)

int data[N];

static inline void init() {
    int i;
    for (i = N - 1; i >= 0; i--)
        data[N - i + 1] = i;
}

static inline void doit() {
    int i, j, tmp;
    for (i = 0; i < N - 1; i++) {
        for (j = i + 1; j < N; j++) {
            if (data[i] > data[j]) {
                tmp = data[i];
                data[i] = data[j];
                data[j] = tmp;
            }
        }
    }
}

int main() {
    init();
    doit();
}

% gcc -O3 -c test.c -o test.c.o
% objdump -t test.c.o
SYMBOL TABLE:
00000000 l    df *ABS*  00000000 test.c
00000000 l    d  .text  00000000 .text
00000000 l    d  .data  00000000 .data
00000000 l    d  .bss   00000000 .bss
00000000 l    d  .note.GNU-stack        00000000 .note.GNU-stack
00000000 l    d  .comment       00000000 .comment
00000000 g     F .text  0000006e main
0001f400       O *COM*  00000020 data

おし消えました（一応asmでも確認）。

% gcc -O3 test.c -o test.c.out
% time ./test.c.out
./test.c.out  1.33s user 0.00s system 94% cpu 1.411 total

まあ当然の結果です。以下にtest.cpp.outとtest.c.outのdesasmされた.textをのせます（gccの-Sでもいいのですが読みにくいので）。

odjdump -d test.cpp.o:

00000000 <main>:
   0:   55                      push   %ebp
   1:   89 e5                   mov    %esp,%ebp
   3:   57                      push   %edi
   4:   56                      push   %esi
   5:   53                      push   %ebx
   6:   81 ec 0c f4 01 00       sub    $0x1f40c,%esp
   c:   83 e4 f0                and    $0xfffffff0,%esp
   f:   83 ec 10                sub    $0x10,%esp
  12:   8d b5 e8 0b fe ff       lea    0xfffe0be8(%ebp),%esi
  18:   b8 ff 7c 00 00          mov    $0x7cff,%eax
  1d:   8d 95 f0 0b fe ff       lea    0xfffe0bf0(%ebp),%edx
  23:   90                      nop    
  24:   89 02                   mov    %eax,(%edx)
  26:   83 c2 04                add    $0x4,%edx
  29:   48                      dec    %eax
  2a:   79 f8                   jns    24 <main+0x24>
  2c:   31 db                   xor    %ebx,%ebx
  2e:   8d 7b 01                lea    0x1(%ebx),%edi
  31:   89 f8                   mov    %edi,%eax
  33:   81 ff ff 7c 00 00       cmp    $0x7cff,%edi
  39:   eb 16                   jmp    51 <main+0x51>
  3b:   8b 0c 9e                mov    (%esi,%ebx,4),%ecx
  3e:   8b 14 86                mov    (%esi,%eax,4),%edx
  41:   39 d1                   cmp    %edx,%ecx
  43:   7e 06                   jle    4b <main+0x4b>
  45:   89 14 9e                mov    %edx,(%esi,%ebx,4)
  48:   89 0c 86                mov    %ecx,(%esi,%eax,4)
  4b:   40                      inc    %eax
  4c:   3d ff 7c 00 00          cmp    $0x7cff,%eax
  51:   7e e8                   jle    3b <main+0x3b>
  53:   81 ff fe 7c 00 00       cmp    $0x7cfe,%edi
  59:   89 fb                   mov    %edi,%ebx
  5b:   7e d1                   jle    2e <main+0x2e>
  5d:   8d 65 f4                lea    0xfffffff4(%ebp),%esp
  60:   5b                      pop    %ebx
  61:   5e                      pop    %esi
  62:   31 c0                   xor    %eax,%eax
  64:   5f                      pop    %edi
  65:   c9                      leave  
  66:   c3                      ret

objdump -d test.c.o:

00000000 <main>:
   0:   55                      push   %ebp
   1:   89 e5                   mov    %esp,%ebp
   3:   56                      push   %esi
   4:   53                      push   %ebx
   5:   83 e4 f0                and    $0xfffffff0,%esp
   8:   83 ec 10                sub    $0x10,%esp
   b:   ba ff 7c 00 00          mov    $0x7cff,%edx
  10:   b9 01 7d 00 00          mov    $0x7d01,%ecx
  15:   8d 76 00                lea    0x0(%esi),%esi
  18:   89 c8                   mov    %ecx,%eax
  1a:   29 d0                   sub    %edx,%eax
  1c:   89 14 85 00 00 00 00    mov    %edx,0x0(,%eax,4)
  23:   4a                      dec    %edx
  24:   79 f2                   jns    18 <main+0x18>
  26:   31 db                   xor    %ebx,%ebx
  28:   8d 73 01                lea    0x1(%ebx),%esi
  2b:   89 f0                   mov    %esi,%eax
  2d:   81 fe ff 7c 00 00       cmp    $0x7cff,%esi
  33:   eb 26                   jmp    5b <main+0x5b>
  35:   8b 0c 9d 00 00 00 00    mov    0x0(,%ebx,4),%ecx
  3c:   8b 14 85 00 00 00 00    mov    0x0(,%eax,4),%edx
  43:   39 d1                   cmp    %edx,%ecx
  45:   7e 0e                   jle    55 <main+0x55>
  47:   89 14 9d 00 00 00 00    mov    %edx,0x0(,%ebx,4)
  4e:   89 0c 85 00 00 00 00    mov    %ecx,0x0(,%eax,4)
  55:   40                      inc    %eax
  56:   3d ff 7c 00 00          cmp    $0x7cff,%eax
  5b:   7e d8                   jle    35 <main+0x35>
  5d:   81 fe fe 7c 00 00       cmp    $0x7cfe,%esi
  63:   89 f3                   mov    %esi,%ebx
  65:   7e c1                   jle    28 <main+0x28>
  67:   8d 65 f8                lea    0xfffffff8(%ebp),%esp
  6a:   5b                      pop    %ebx
  6b:   5e                      pop    %esi
  6c:   c9                      leave  
  6d:   c3                      ret

test.c.oのほうはリンク時にdataのアドレスが0x0にrelocateされるんですが、それを除けば全く同じです。となればネックになっているのはdataのfetchだと考えるのが普通なんですが、ssもdsも速度的には同等であるはずですし、置かれるデータ構造が同じうえ、キャッシュはlinear addressでされるらしいので、どこで遅くなるのか見当もつきません。実効アドレッシングで差がついてるのか、あるいはちゃんとアドレッシングの原理を理解していないのか、COMMONなオブジェクトはなんらかの理由があって遅いのか、はたまた、等々。いまだ未解決。

以下の資料を読みあさっています。

http://www.intel.co.jp/jp/developer/download/index.htm#ia32

わかりしだい追記したいと思います。誰かわかる人いたら教えてください。この辺の知識はかなり疎くてしょうがないですもう。

追記。ValgrindにCachegrindというCPUのキャッシュ機構をシミュレーションしてキャッシュミス数等をプロファイルする機能があることを知ったので試してみました。なお（ちゃんと調べてないですが）、このマシンのL1 Data Cache Sizeは16KBらしいのでtest.cpp、test.cのdataのサイズをそれにあわせました。

% valgrind --tool=cachegrind ./test.cpp.out
==17087== I   refs:      1,209,567,234
==17087== I1  misses:            1,021
==17087== L2i misses:              584
==17087== I1  miss rate:          0.00%
==17087== L2i miss rate:          0.00%
==17087==                      
==17087== D   refs:        537,402,348  (268,898,767 rd + 268,503,581 wr)
==17087== D1  misses:        7,818,505  (  7,817,144 rd +       1,361 wr)
==17087== L2d misses:            6,058  (      4,759 rd +       1,299 wr)
==17087== D1  miss rate:           1.4% (        2.9%   +         0.0%  )
==17087== L2d miss rate:           0.0% (        0.0%   +         0.0%  )
==17087==                      
==17087== L2 refs:           7,819,526  (  7,818,165 rd +       1,361 wr)
==17087== L2 misses:             6,642  (      5,343 rd +       1,299 wr)
==17087== L2 miss rate:            0.0% (        0.0%   +         0.0%  )
% valgrind --tool=cachegrind ./test.c.out
==17094== I   refs:      1,208,158,555
==17094== I1  misses:              934
==17094== L2i misses:              524
==17094== I1  miss rate:          0.00%
==17094== L2i miss rate:          0.00%
==17094==
==17094== D   refs:        536,850,915  (268,466,100 rd + 268,384,815 wr)
==17094== D1  misses:        7,806,491  (  7,805,248 rd +       1,243 wr)
==17094== L2d misses:            2,210  (        997 rd +       1,213 wr)
==17094== D1  miss rate:           1.4% (        2.9%   +         0.0%  )
==17094== L2d miss rate:           0.0% (        0.0%   +         0.0%  )
==17094==
==17094== L2 refs:           7,807,425  (  7,806,182 rd +       1,243 wr)
==17094== L2 misses:             2,734  (      1,521 rd +       1,213 wr)
==17094== L2 miss rate:            0.0% (        0.0%   +         0.0%  )

この結果を見たかぎりではC版のほうが当然速くなるはずなのですが...どうやらキャッシュミスとかアラインメントの問題でもないようです（cachegrindを信用して）。再調査します。

Category(s): program; gcc

The URL to Trackback this entry is:: http://dev.ariel-networks.com/Members/matsuyama/30ed30fc30eb59096570306f90453044/tbping

Re:グローバル変数は遅い？

Posted by inoue at 2006-09-15 01:48

答えは分かりませんが、一応追試。同じくC++版の方が速いです。

AMD Athlon 650MHz
gcc/g++ 3.3.5 (Debian sarge)

CPUが遅い分、数値がはっきり出ます。C版は平均して5.5秒程度、C++版は平均して4秒程度です。

> mov %edx,0x0(,%eax,4)

事実だけ見れば、これ(および類する箇所)が遅い、としか言いようが無さそうです。

Re:グローバル変数は遅い？

Posted by matsuyama at 2006-09-15 11:12

AMDでもなるんですか。んじゃPentiumの特殊な最適化のせいではないってことですね。個人的には局所化の問題なような気がしてならないんですが、ちゃんと証明できないのでまだまだ調査中です。

Re:グローバル変数は遅い？

Posted by Sildenafil at 2010-08-14 18:49

int main() {
wrap a;
a.doit();
}

今回は私の注意！うーん

Add comment

You can add a comment by filling out the form below. Plain text formatting.

Author (Required)

EMail (Required)

URL (Required)

Title (Required)

Comment (Required)

Verification Code

This helps us prevent automated spamming.

Remember your information on cookie?

ありえるえりあ

Sections

Personal tools

Document Actions

グローバル変数は遅い？

Re:グローバル変数は遅い？

Re:グローバル変数は遅い？

Re:グローバル変数は遅い？