I can offer a counter-example, closely based on your code.
Code
#include "timer.h"
#include <stdio.h>
enum { M = 128 };
extern int SumByColRow (int matrix[M][M], int size);
extern int SumByRowCol (int matrix[M][M], int size);
int SumByColRow (int matrix[M][M], int size)
{
int Sum = 0;
for (int j = 0; j < size; j ++)
{
for (int i = 0; i < size; i ++)
Sum += matrix[i][j];
}
return Sum;
}
int SumByRowCol (int matrix[M][M], int size)
{
int Sum = 0;
for (int i = 0; i < size; i ++)
{
for (int j = 0; j < size; j ++)
Sum += matrix[i][j];
}
return Sum;
}
static inline int max(int i, int j) { return (i > j) ? i : j; }
int main(void)
{
int matrix[M][M];
for (int i = 0; i < M; i++)
for (int j = 0; j < M; j++)
matrix[i][j] = 1000*i + j;
Clock clk;
unsigned long long x[M];
char buffer[32];
unsigned long long sum;
clk_init(&clk);
clk_start(&clk);
for (int i = 0; i < M; i++)
x[i] = SumByColRow(matrix, max(M - i, 10));
clk_stop(&clk);
sum = 0;
for (int i = 0; i < M; i++)
sum += x[i];
printf("SumByColRow: value = %llu, time = %s\n", sum, clk_elapsed_us(&clk, buffer, sizeof(buffer)));
clk_start(&clk);
for (int i = 0; i < M; i++)
x[i] = SumByRowCol(matrix, max(M - i, 10));
clk_stop(&clk);
sum = 0;
for (int i = 0; i < M; i++)
sum += x[i];
printf("SumByRowCol: value = %llu, time = %s\n", sum, clk_elapsed_us(&clk, buffer, sizeof(buffer)));
return 0;
}
The two SumBy functions are substantially unchanged (minor notational tweaks, but nothing more). The timing harness stores a start time and a stop time in the Clock structure, and the clk_elapsed_us() function formats the elapsed time in microseconds into the string it is passed.
The messing around with x[i] and so on is to (try and) ensure that the compiler doesn't optimize everything away.
Output
Machine: Mac OS X 10.8.5, GCC (i686-apple-darwin11-llvm-gcc-4.2 (GCC) 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)), Intel Core 2 Duo at 2 GHz, 4 GB 1067 MHz DDR3 RAM (an 'Early 2009' Mac Mini).
SumByColRow: value = 33764046316, time = 0.002411
SumByRowCol: value = 33764046316, time = 0.000677
This shows the expected result — that the columns by rows computation is slower because the matrix is big enough to span pages (64 KiB). It is not yet clear from the question what size M is, nor what size is passed to the SumBy functions, but with a 'big enough' array and varying sizes, you can get the expected performance pattern.
Those times aren't big enough for comfort — I'd rather the lower time was of the order of a second or two. Adding a for (int j = 0; j < 1600; j++) loop in front of each of the timed loops in the main program yields:
SumByColRow: value = 33764046316, time = 2.529205
SumByRowCol: value = 33764046316, time = 1.022970
The ratio is smaller (3.56 vs 2.47), but still decidedly tilted in favour of SumByRowCol().
Initializing the matrix 'warms the cache' to the extent it can be warmed. Reversing the order of computation (SumByRowCol before SumByColRow) does not make a significant difference to the timings. The results are pretty consistent across multiple runs.
Assembler output
Compiled with gcc -O3 -std=c99 -S:
.section __TEXT,__text,regular,pure_instructions
.globl _SumByColRow
.align 4, 0x90
_SumByColRow:
Leh_func_begin1:
pushq %rbp
Ltmp0:
movq %rsp, %rbp
Ltmp1:
testl %esi, %esi
jg LBB1_5
xorl %eax, %eax
LBB1_2:
popq %rbp
ret
LBB1_5:
movl %esi, %ecx
xorl %eax, %eax
movq %rcx, %rdx
jmp LBB1_6
.align 4, 0x90
LBB1_3:
addl (%r8), %eax
addq $512, %r8
decq %rsi
jne LBB1_3
addq $4, %rdi
decq %rdx
je LBB1_2
LBB1_6:
movq %rcx, %rsi
movq %rdi, %r8
jmp LBB1_3
Leh_func_end1:
.globl _SumByRowCol
.align 4, 0x90
_SumByRowCol:
Leh_func_begin2:
pushq %rbp
Ltmp2:
movq %rsp, %rbp
Ltmp3:
testl %esi, %esi
jg LBB2_5
xorl %eax, %eax
LBB2_2:
popq %rbp
ret
LBB2_5:
movl %esi, %ecx
xorl %eax, %eax
movq %rcx, %rdx
jmp LBB2_6
.align 4, 0x90
LBB2_3:
addl (%r8), %eax
addq $4, %r8
decq %rsi
jne LBB2_3
addq $512, %rdi
decq %rdx
je LBB2_2
LBB2_6:
movq %rcx, %rsi
movq %rdi, %r8
jmp LBB2_3
Leh_func_end2: