Cache lab

# cache lab

提示

# Part A

提示

getopt (opens new window)
fscanf
malloc and free
LRU (least-recently used) replacement policy
- 会替换最后一次访问时间最久远的那一行
For this this lab, you should assume that memory accesses are aligned properly, such that a single memory access never crosses block boundaries. By making this assumption, you can ignore the request sizes in the valgrind traces.
For this lab, we are interested only in data cache performance, so your simulator should ignore all instruction cache accesses (lines starting with “I”). Recall that valgrind always puts “I” in the first column (with no preceding space), and “M”, “L”, and “S” in the second column (with a preceding space). This may help you parse the trace.

解题思路

通过 ./csim-ref 的运行结果，可以看出写不命中的策略的是写分配 (write allocate)

S 600a70,4 miss eviction # 写不命中，并驱逐块
S 7ff000398,8 miss       # 写不命中，不驱逐块

1
2

修改动作 M 先完成加载，然后再进行修改，其各种不同的动作如下

M 7ff000388,4 hit hit             # 修改命中，两次 hit
M 7ff000388,4 miss eviction hit   # 修改不命中，驱逐块，一次 hit
M 20,1 miss hit                   # 修改不命中， 不驱逐块，一次 hit

1
2
3

M 除了需要额外的加 1 次 hit 外，其余的处理逻辑 L, S, M 是一致的

答案

csim.c

#include <stdio.h>
#include <stdlib.h>
#include <getopt.h>
#include "cachelab.h"

#define TRUE 1
#define FALSE 0
#define ADDRESS_BIT 64

typedef struct LINE
{
  unsigned long tag;
  unsigned long last_used_times;
  int valid_flag;
}Line;

static int hit_count, miss_count, eviction_count;
static unsigned set_bits, block_bits;
static unsigned long S, E;
static unsigned long cache_visit_counter;
static char *file_name;
static int verbose_flag;

void print_help(){
  printf("Usage: ./csim [-hv] -s <num> -E <num> -b <num> -t <file>\n"\
         "Options:\n"\
         "  -h         Print this help message.\n"\
         "  -v         Optional verbose flag.\n"\
         "  -s <num>   Number of set index bits.\n"\
         "  -E <num>   Number of lines per set.\n"\
         "  -b <num>   Number of block offset bits.\n"\
         "  -t <file>  Trace file.\n\n"\
         "Examples:\n"\
         "linux>  ./csim -s 4 -E 1 -b 4 -t traces/yi.trace\n"\
         "linux>  ./csim -v -s 8 -E 2 -b 4 -t traces/yi.trace");
}

void get_opt(int argc, char **argv){
  int opt;
  while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1)
    switch (opt){
      case 'h':
        print_help();
        exit(EXIT_SUCCESS);
      case 'v':
        verbose_flag = TRUE;
        break;
      case 's':
        set_bits = atoi(optarg);
        break;
      case 'E':
        E = atoi(optarg);
        break;
      case 'b':
        block_bits = atoi(optarg);
        break;
      case 't':
        file_name = optarg;
        break;
      case '?':
        print_help();
        exit(EXIT_FAILURE);
      }
}

Line *init_cache(){
  Line *cache;
  //Make sure S * E doesn't overflow
  __uint128_t required_size = S * (__uint128_t)E;
  size_t request_size = (size_t)required_size;
  if (request_size != required_size)
  {
    perror("S * E is too big");
    exit(EXIT_FAILURE);
  }
  cache = calloc(request_size, sizeof(Line));
  if (cache == NULL)
    exit(EXIT_FAILURE);
  return cache;
}

void process_instruction(Line *cache, char instruction, unsigned long address){
  unsigned long tag, s, e, block_offset;
  unsigned tag_bits;
  enum {HIT, MISS, EVICTION} status;
  Line *line;
  Line *target;  // The target element for the input address.

  if(instruction != 'L' && instruction != 'S' && instruction != 'M')
    exit(EXIT_FAILURE);

  tag_bits = ADDRESS_BIT - (set_bits + block_bits);
  block_offset = address << (tag_bits + set_bits) >> (ADDRESS_BIT - block_bits);
  s = address << tag_bits >> (ADDRESS_BIT - set_bits);
  tag = address >> (ADDRESS_BIT - tag_bits);
  cache = cache + s * E;  // The s sets cache

  status = EVICTION;
  target = cache;  
  for(e = 0; e < E; e++){
    line = cache + e;

    if(line->valid_flag == FALSE){
      status = MISS;
      target = line;
      break;
    }

    if(line->tag == tag){
      status = HIT;
      target = line;
      break;
    }

    if (line->last_used_times < target->last_used_times)  // leatst recently used
      target = line;
  }

  target->last_used_times = ++cache_visit_counter;

  switch (status){
  case HIT:
    hit_count++;
    if(verbose_flag)
      printf(" hit");
    break;

  case MISS:
    target->tag = tag;
    target->valid_flag = TRUE;
    miss_count++;
    if(verbose_flag)
      printf(" miss");
    break;
  
  case EVICTION:
    target->tag = tag;
    miss_count++;
    eviction_count++;
    if(verbose_flag)
      printf(" miss eviction");
    break;
  
  default:
    break;
  }
  if(instruction == 'M'){
    hit_count++;
    if(verbose_flag)
      printf(" hit");
  }
  if(verbose_flag)
    // For part B use
    printf(" s = %lu, b= %lu\n", s, block_offset);
}

void sim_cache(Line *cache){
  FILE *fp;
  char instruction;
  unsigned long address;
  unsigned size;

  fp = fopen(file_name, "r");
  while(fscanf(fp, " %c %lx,%u", &instruction, &address, &size) != EOF){
    if(instruction != 'I'){
      if(verbose_flag == TRUE)
        printf("%c %lx,%u", instruction, address, size);
      process_instruction(cache, instruction, address);
    }
  }
  fclose(fp);
}

int main(int argc, char **argv){
  Line *cache;

  get_opt(argc, argv);
  if(set_bits==0 || E ==0 ||block_bits ==0 || file_name == NULL)
  {
    printf("Missing required arguments!\n");
    print_help();
    exit(EXIT_FAILURE);
  }

  S = 1 << set_bits;
  cache = init_cache();
  sim_cache(cache);
  free(cache);
  printSummary(hit_count, miss_count, eviction_count);
  return EXIT_SUCCESS;
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191

测试

make clean && make
# 具体的命令
./csim -s 1 -E 1 -b 1 -t traces/yi2.trace
./csim -s 4 -E 2 -b 4 -t traces/yi.trace
./csim -s 2 -E 1 -b 4 -t traces/dave.trace
./csim -s 2 -E 1 -b 3 -t traces/trans.trace
./csim -s 2 -E 2 -b 3 -t traces/trans.trace
./csim -s 2 -E 4 -b 3 -t traces/trans.trace
./csim -s 5 -E 1 -b 5 -t traces/trans.trace
./csim -s 5 -E 1 -b 5 -t traces/long.trace
# 自动测试
./test-csim

1
2
3
4
5
6
7
8
9
10
11
12

# Part B

根据题意，只用针对 32 * 32，64 * 64， 61 *67 这 3 种情况进行优化即可

所以，编写程序时只用考虑这 3 种情况即可，不用考虑程序的通用性
- 考虑了半天怎么只用通用程序解决这个问题，真是 🌞🐶

提示

It then evaluates each trace by running the reference simulator on a cache with parameters (s = 5, E = 1, b = 5).
- 总共有 32 个组，每个组可以放 32 个字节，相当于 8 个 int
  - 整个缓存可以存放 32 * 8 个 int 数据
Using Blocking to Increase Temporal Locality (opens new window)
Your transpose function may not modify array A. You may, however, do whatever you want with the contents of array B
You are allowed to define at most 12 local variables of type int per transpose function
- 8 * 8 分块的话，用于循环基本需要 4 个变量
  - 2 个用于循环遍历块，2 个用于循环遍历块中的元素
- 剩下的 8 个变量可以用于缓存从 A 中的块中读出的一行的数据
  - 正好对应一个 cache 中的分组

分块的基本使用

可以使用下列命令查看调用程序所分配的数组

make clean && make
./test-trans -M 61 -N 67
./csim -v -s 5 -E 1 -b 5 -t trace.f1 > output.txt

1
2
3

output.txt 中前一段内容如下

S 10e100,1 miss s = 8, b= 0
L 18e160,8 miss s = 11, b= 0
L 18e124,4 miss s = 9, b= 4
L 18e120,4 hit s = 9, b= 0

L 10e120,4 miss eviction s = 9, b= 0
S 14e120,4 miss eviction s = 9, b= 0
L 10e124,4 miss eviction s = 9, b= 4
S 14e22c,4 miss s = 17, b= 12
L 10e128,4 hit s = 9, b= 8
S 14e338,4 miss s = 25, b= 24
L 10e12c,4 hit s = 9, b= 12
S 14e444,4 miss s = 2, b= 4
...

1
2
3
4
5
6
7
8
9
10
11
12
13
14

开头的几行是程序的初始化，不同版本的 gcc 输出可能不同
这是程序默认的转置函数的输出
- 其中 s 和 b 的信息需要在 csim.c 中添加相应的输出语句
- 可以看出两个数组的开头均和缓存区对齐，而且两个数组的首个元素位于相同的分组
  - 这是后续进行优化的基础

由于一个分组储存 8 个 int ，所以一次从 A 中读取 8 个 int 是最高效的选择
- 关键是如何把从 A 中读取出的这 8 个元素高效的储存到数组 B 中
- 因此，基本的分块大小需要选为 8 * 8
- 将 B 矩阵尽可能多的划分为大小为 8 * 8 的块，每个块命名为
  - 对应于矩阵 B 的第行和第列的交汇处
- 将矩阵 A 也按照同样的方法分块，由矩阵转置的定义可以得到
  - 这是块的理论依据
  - 如果 B 能被整分为个块，则理论上的最优解是
    - 此时的不命中全部为冷不命中

先上最终的结果

make clean && make
./driver.py

1
2

Part B: Testing transpose function
Running ./test-trans -M 32 -N 32
Running ./test-trans -M 64 -N 64
Running ./test-trans -M 61 -N 67

Cache Lab summary:
                        Points   Max pts      Misses
Csim correctness          27.0        27
Trans perf 32x32           8.0         8         260
Trans perf 64x64           8.0         8        1028
Trans perf 61x67          10.0        10        1749
          Total points    53.0        53

1
2
3
4
5
6
7
8
9
10
11
12

可以看到除去函数调用产生的 4 次不命中之外，32 * 32 和 64 * 64 的最终结果分别达到了理论最优解 256 和 1024

32 * 32 的情况

位于非主对角线上的块，只需按行读取 A 中的数组，将其按列赋给 B 即可
- 分列中的同列元素间隔 32 的整数倍个 int ，对应于 4 的整数倍的分组
- 由于有 32 个分组，所以 A 和 B 各自分块中的 8 行不会存在冲突
当矩阵大小为 32 * 32 时，位于主对角线上的块会存在着冲突不命中
- 因为 A ，B 都是 32 * 32，首元素又位于相同的分组，在主对角线上，有
- A，和 B 的分块中的每一行在缓冲中都具有相同的分组
- 如果按照非主对角线上相同的情况处理，则按列写入 B 时，下一次读取 A 时会驱逐掉上一次 B 写入的 cache 中的数据，从而造成较多的不命中
  - 冲突不命中最直接的体验
- A 中的数据可以按行一次读完放入临时变量中，可以避免掉这种影响
- B 中的元素为了避免在读取 A 时被驱逐，可以借助其它块进行缓存
  - 此处，需要确保选择的用来缓存结果的块就是下一次会被使用的块，相当于是暖身(warmed up)

32 * 32 的代码

if(M == 32 && N == 32){
    for (i = 0; i < M; i += 8){
        // if i != 0
        //  [i, j] ~ [i + 8, j + 8] is the cache block for the main diagonal block
        // if i == 0
        // [i, j + 8] ~ [i + 8, j + 16] is the cache block for the main diagonal block
        j = i ? 0 : 8;
        for (ii = i; ii < i + 8; ii++)
            for (jj = i; jj < i + 8; jj++)
                B[ii][jj - i + j] = A[jj][ii];

        for (ii = i; ii < i + 8; ii++)
            for (jj = i; jj < i + 8; jj++)
                B[ii][jj] = B[ii][jj - i + j];

        /*
        ** Handle the block does not on the main diagonal
        */
        for (j = 0; j < N; j += 8)
            if (i != j)
                for (ii = i; ii < i + 8; ii++)
                    for (jj = j; jj < j + 8; jj++)
                        B[ii][jj] = A[jj][ii];
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25

64 * 64 的情况

位于非主对角线的其它块
- 分块中的同列元素间隔 64 的整数倍个 int，对应着 8 的整数倍的分组
- 由于总共只有 32 个分组，所以在分块中的第 i + 4 行会驱逐第 i 行的缓存，其中 i = 1, 2, 3, 4
- 此时需要将 8 * 8块分为 4 个 4 * 4 的子块进行处理，如下
  - 最终需要达到的效果是
  - 由于在数组 A 中进行操作的空间较小，所以 A 中的数组按行复制，对应到 B 中就要按列处理
    - 为了避免后 4 行会驱逐前 4 行，所以先对 B 中的前 4 行进行操作，再对后 4 行进行操作
  - 拷贝完 A 中的前 4 行后， B 中的结果如下
    - 其中 N 表示暂时没有数据
  - A 中后 4 行处理
    - 基本原理就是先处理 B 中的和块
      - 此时可以用 4 个临时变量先缓存中第 i 行的四个变量
      - 再从中复制中第 i 列的四个变量，放入中中第 i 行的位置
      - 然后将 4 个临时变量放入中第 i 行的位置
        此时会驱逐掉中第 i 行的数据，但此时第 i 行已经完成了处理
      - i 会 1 遍历到 4 后，最终的效果如下
  - 最后再将替换为即可
    - 此时，中所有对应的行都已经在缓存中，不会存在不命中的情况
位于主对角线的块
- 除了会有非主对角线上块的问题之外
- 与 32 * 32 主对角线上的情况相同，分块 B 中每行的分组会与分块 A 中每行的分组冲突
- 因此跟 32 * 32 块一样，需要借助 B 中的其它块进行处理，为了避免增加不命中，仍然需要确保选择的用来缓存的块在其被使用之前不会被驱逐，相当于是暖身(warmed up)
  - 假设借用的块为矩阵 B 中的块 C
    - 为了避免 C 的后 4 行驱逐前 4 行的情况
    - 以及为了能够借用非对角线上块
    - 只能借助块 C 中的前 4 行
      - 因为非对角线上的块进行处理时，先处理的是前 4 行
  - 为了简化处理逻辑，需要借助两个缓存块，这两个缓存块均选为同行的非主对线上的两个块，设为 C ， D
- 具体的处理步骤如下，细节见代码
  - 按行复制 A 中的数据到缓存块 C 和 D 中，同时对其进行转置处理
  - 再将 C 和 D 中的数据复制到 B 的主对角线中的块中

64 * 64 的代码

if (M == 64 && N == 64){
    for(i = 0; i < M; i += 8){
        /*
        ** handle the block on the main diagonal
        */

        tmp0 = (i == 0) ? 8: 0;
        tmp1 = (i >  8) ? 8: 16;

        for(ii = 0; ii < 8; ii++)
            for (jj = 0; jj < 4; jj++){
                B[i + jj][tmp0 + ii] = A[i + ii][i + jj];
                B[i + jj][tmp1 + ii] = A[i + ii][i + jj + 4];
          }


        for(ii = 0; ii < 4; ii++){
            for(jj = 0; jj < 8; jj++)
                B[i + ii][i + jj] = B[i + ii][tmp0 + jj];
            for(jj = 0; jj < 8; jj++)
                B[i + ii + 4][i + jj] = B[i + ii][tmp1 + jj];
        }

        /*
        ** handle the block not on the main diagonal 
        */
        for(j = 0; j < N; j += 8){
            if (i == j)
                continue;
            // B = [ A11^T, A12^T
            //       N21, N22 ]
            for(ii = 0; ii < 4; ii++){
                for(jj = 0; jj < 4; jj++){
                    B[i + jj][j + ii]     = A[j + ii][i + jj];
                    B[i + jj][j + ii + 4] = A[j + ii][i + jj + 4];
                }
            }


            for(ii = 0; ii < 4; ii++){
                tmp2 = B[i + ii][j + 4];
                tmp3 = B[i + ii][j + 5];
                tmp4 = B[i + ii][j + 6];
                tmp5 = B[i + ii][j + 7];

                // A12^T to A21^T
                for(jj = 4; jj < 8; jj++){
                    B[i + ii][j + jj] = A[j + jj][i + ii];
                }

                // N21 to A12^T
                B[i + 4 + ii][j]     = tmp2;
                B[i + 4 + ii][j + 1] = tmp3;
                B[i + 4 + ii][j + 2] = tmp4;
                B[i + 4 + ii][j + 3] = tmp5;
                
                // N22 to A22^T
                for(jj = 4; jj < 8; jj++){
                    B[i + 4 + ii][j + jj] = A[j + jj][i + ii + 4];
                }
            }
        }
    }
}

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64

61 * 67 的情况

由于无法规则的分块，所以无法保证分块中的每一行中的 8 个 int 均位于同一个分组，这个就很难像之前两种情况下进行具体的分析
- 过于具体的分析会使代码过于庞大和复杂，不够优雅，所以只大致的将能优化的尽量优化到
只能将 B 按列分为 8 个块，按行分为 7 个块，总共 56 个块
- 由于每行有 61 个元素，每列有 67 个元素，均为素数，所以不会存在着之前那种特别明显的冲突不命中的情况
- 但是为了尽可能降低块被驱逐的影响，每次尽可能多的从 A 中读取数据，然后将其存入临时变量中
为了尽可能减少不命中，将 B 中每行分块的末尾剩余的 3 (67 - 8 * 8 = 3) 列 (即 3 * 8 的块) 紧接着 8 * 8 的分块的处理
- 由于 A 中是连续读取变量，所以不命中造成的驱逐块对 B 的影响更大，所以要尽可能减小 B 中的不命中
对于最后 5 (61 -7 * 8 = 5)行，则分为 8 个 5 * 8 的块进行处理，能进一步减少不命中
- 最后一个 5 * 3 的块单独处理

61 * 67 的代码

if (M == 61 && N == 67){
    // 8 * 8 blocks
    for (i = 0; i < 56; i += 8){
        for (j = 0; j < 64; j += 8)
            for(ii = 0; ii < 8; ii++){
                tmp0 = A[j + ii][i];     tmp1 = A[j + ii][i + 1];
                tmp2 = A[j + ii][i + 2]; tmp3 = A[j + ii][i + 3];
                tmp4 = A[j + ii][i + 4]; tmp5 = A[j + ii][i + 5];
                tmp6 = A[j + ii][i + 6]; tmp7 = A[j + ii][i + 7];

                B[i][j + ii] = tmp0;     B[i + 1][j + ii] = tmp1; 
                B[i + 2][j + ii] = tmp2; B[i + 3][j + ii] = tmp3;
                B[i + 4][j + ii] = tmp4; B[i + 5][j + ii] = tmp5;
                B[i + 6][j + ii] = tmp6; B[i + 7][j + ii] = tmp7;
            }
        // 3 * 8 blocks
        for(; j < 67; j++){
            for(ii = 0; ii < 8; ii++){
                B[i + ii][j] = A[j][i + ii];
            }
        }
        
    }

    // 5 * 8 blocks
    for(j = 0; j < 64; j += 8){
        for(ii = j; ii < j + 8; ii++){
            tmp0 = A[ii][i];     tmp1 = A[ii][i + 1]; tmp2 = A[ii][i + 2];
            tmp3 = A[ii][i + 3]; tmp4 = A[ii][i + 4]; 

            B[i][ii] = tmp0;      B[i + 1][ii] = tmp1; B[i + 2][ii] = tmp2;
            B[i + 3][ii] = tmp3;  B[i + 4][ii] = tmp4;
        }
    }

    //  the last 5 * 3 block
    for(ii = 64; ii < 67; ii++){
        tmp0 = A[ii][i];     tmp1 = A[ii][i + 1]; tmp2 = A[ii][i + 2];
        tmp3 = A[ii][i + 3]; tmp4 = A[ii][i + 4]; 

        B[i][ii] = tmp0;      B[i + 1][ii] = tmp1; B[i + 2][ii] = tmp2;
        B[i + 3][ii] = tmp3;  B[i + 4][ii] = tmp4;
    } 
  }

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44

完整的 trans.c

/* 
 * trans.c - Matrix transpose B = A^T
 *
 * Each transpose function must have a prototype of the form:
 * void trans(int M, int N, int A[N][M], int B[M][N]);
 *
 * A transpose function is evaluated by counting the number of misses
 * on a 1KB direct mapped cache with a block size of 32 bytes.
 */ 
#include <stdio.h>
#include "cachelab.h"

int is_transpose(int M, int N, int A[N][M], int B[M][N]);

/* 
 * transpose_submit - This is the solution transpose function that you
 *     will be graded on for Part B of the assignment. Do not change
 *     the description string "Transpose submission", as the driver
 *     searches for that string to identify the transpose function to
 *     be graded. 
 */

char transpose_submit_desc[] = "Transpose submission";
void transpose_submit(int M, int N, int A[N][M], int B[M][N])
{
    register int i, j, ii, jj;
    register int tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    if(M == 32 && N == 32){
      for (i = 0; i < M; i += 8){
        // if i != 0
        //  [i, j] ~ [i + 8, j + 8] is the cache block for the main diagonal block
        // if i == 0
        // [i, j + 8] ~ [i + 8, j + 16] is the cache block for the main diagonal block
        j = i ? 0 : 8;
        for (ii = i; ii < i + 8; ii++)
            for (jj = i; jj < i + 8; jj++)
                B[ii][jj - i + j] = A[jj][ii];

        for (ii = i; ii < i + 8; ii++)
            for (jj = i; jj < i + 8; jj++)
                B[ii][jj] = B[ii][jj - i + j];

          /*
          ** Handle the block does not on the main diagonal
          */
          for (j = 0; j < N; j += 8)
              if (i != j)
                  for (ii = i; ii < i + 8; ii++)
                      for (jj = j; jj < j + 8; jj++)
                          B[ii][jj] = A[jj][ii];
      }
    }
    else if (M == 64 && N == 64){
        for(i = 0; i < M; i += 8){
            /*
            ** handle the block on the main diagonal
            */

            tmp0 = (i == 0) ? 8: 0;
            tmp1 = (i >  8) ? 8: 16;

            for(ii = 0; ii < 8; ii++)
                for (jj = 0; jj < 4; jj++){
                    B[i + jj][tmp0 + ii] = A[i + ii][i + jj];
                    B[i + jj][tmp1 + ii] = A[i + ii][i + jj + 4];
              }


            for(ii = 0; ii < 4; ii++){
                for(jj = 0; jj < 8; jj++)
                    B[i + ii][i + jj] = B[i + ii][tmp0 + jj];
                for(jj = 0; jj < 8; jj++)
                    B[i + ii + 4][i + jj] = B[i + ii][tmp1 + jj];
            }

            /*
            ** handle the block not on the main diagonal 
            */
            for(j = 0; j < N; j += 8){
                if (i == j)
                    continue;
                // B = [ A11^T, A12^T
                //       N21, N22 ]
                for(ii = 0; ii < 4; ii++){
                    for(jj = 0; jj < 4; jj++){
                        B[i + jj][j + ii]     = A[j + ii][i + jj];
                        B[i + jj][j + ii + 4] = A[j + ii][i + jj + 4];
                    }
                }


                for(ii = 0; ii < 4; ii++){
                    tmp2 = B[i + ii][j + 4];
                    tmp3 = B[i + ii][j + 5];
                    tmp4 = B[i + ii][j + 6];
                    tmp5 = B[i + ii][j + 7];

                    // A12^T to A21^T
                    for(jj = 4; jj < 8; jj++){
                        B[i + ii][j + jj] = A[j + jj][i + ii];
                    }

                    // N21 to A12^T
                    B[i + 4 + ii][j]     = tmp2;
                    B[i + 4 + ii][j + 1] = tmp3;
                    B[i + 4 + ii][j + 2] = tmp4;
                    B[i + 4 + ii][j + 3] = tmp5;
                    
                    // N22 to A22^T
                    for(jj = 4; jj < 8; jj++){
                        B[i + 4 + ii][j + jj] = A[j + jj][i + ii + 4];
                    }
                }
            }
        }
    }
    else if (M == 61 && N == 67){
        // 8 * 8 blocks
        for (i = 0; i < 56; i += 8){
            for (j = 0; j < 64; j += 8)
                for(ii = 0; ii < 8; ii++){
                    tmp0 = A[j + ii][i];     tmp1 = A[j + ii][i + 1];
                    tmp2 = A[j + ii][i + 2]; tmp3 = A[j + ii][i + 3];
                    tmp4 = A[j + ii][i + 4]; tmp5 = A[j + ii][i + 5];
                    tmp6 = A[j + ii][i + 6]; tmp7 = A[j + ii][i + 7];

                    B[i][j + ii] = tmp0;     B[i + 1][j + ii] = tmp1;
                    B[i + 2][j + ii] = tmp2; B[i + 3][j + ii] = tmp3;
                    B[i + 4][j + ii] = tmp4; B[i + 5][j + ii] = tmp5;
                    B[i + 6][j + ii] = tmp6; B[i + 7][j + ii] = tmp7;
                }
            // 3 * 8 blocks
            for(; j < 67; j++){
                for(ii = 0; ii < 8; ii++){
                    B[i + ii][j] = A[j][i + ii];
                }
            }
            
        }

        // 5 * 8 blocks
        for(j = 0; j < 64; j += 8){
            for(ii = j; ii < j + 8; ii++){
                tmp0 = A[ii][i];     tmp1 = A[ii][i + 1]; tmp2 = A[ii][i + 2];
                tmp3 = A[ii][i + 3]; tmp4 = A[ii][i + 4]; 

                B[i][ii] = tmp0;      B[i + 1][ii] = tmp1; B[i + 2][ii] = tmp2;
                B[i + 3][ii] = tmp3;  B[i + 4][ii] = tmp4;
            }
        }
    
        //  the last 5 * 3 block
        for(ii = 64; ii < 67; ii++){
            tmp0 = A[ii][i];     tmp1 = A[ii][i + 1]; tmp2 = A[ii][i + 2];
            tmp3 = A[ii][i + 3]; tmp4 = A[ii][i + 4]; 

            B[i][ii] = tmp0;      B[i + 1][ii] = tmp1; B[i + 2][ii] = tmp2;
            B[i + 3][ii] = tmp3;  B[i + 4][ii] = tmp4;
        } 
    }
}
/* 
 * You can define additional transpose functions below. We've defined
 * a simple one below to help you get started. 
 */ 

/* 
 * trans - A simple baseline transpose function, not optimized for the cache.
 */
char trans_desc[] = "Simple row-wise scan transpose";
void trans(int M, int N, int A[N][M], int B[M][N])
{
    int i, j, tmp;

    for (i = 0; i < N; i++) {
        for (j = 0; j < M; j++) {
            tmp = A[i][j];
            B[j][i] = tmp;
        }
    }    

}

/*
 * registerFunctions - This function registers your transpose
 *     functions with the driver.  At runtime, the driver will
 *     evaluate each of the registered functions and summarize their
 *     performance. This is a handy way to experiment with different
 *     transpose strategies.
 */
// with the original operation (without blocks-shifting and lazy-transposing), 1176 = 35 * 8 + 16 * 56
// with block-shifting and lazy-transposing, it reaches theoratical limit: 1024 = 16 * 64

void registerFunctions()
{
    /* Register your solution function */
    registerTransFunction(transpose_submit, transpose_submit_desc);

    /* Register any additional transpose functions */
    registerTransFunction(trans, trans_desc); 

}

/* 
 * is_transpose - This helper function checks if B is the transpose of
 *     A. You can check the correctness of your transpose by calling
 *     it before returning from the transpose function.
 */
int is_transpose(int M, int N, int A[N][M], int B[M][N])
{
    int i, j;

    for (i = 0; i < N; i++) {
        for (j = 0; j < M; ++j) {
            if (A[i][j] != B[j][i]) {
                return 0;
            }
        }
    }
    return 1;
}

程序中应避免使用 switch 语句

因为 switch 语句会生成跳转表，会造成额外的内存操作
gcc 中使用 -fno-jump-tables 选项可以禁止生成跳转表

编辑

#lab

← Architecture lab Shell lab→