Architecture lab
# Architecture lab
每个 Linux 命令块开始时的文件夹默认为 sim
提示
图形化的模拟运行过程
# Build the .yo object file for the sample program
cd ./y86-code
make asum.yo
# Run it in the sequential (non-pipelined) simulator with the GUI
cd ../seq
make clean && make
./ssim -g ../y86-code/asum.yo
# Run it in the pipelined simulator with the GUI
cd ../pipe
make clean && make
./psim -g ../y86-code/asum.yo
1
2
3
4
5
6
7
8
9
10
11
# 实验环境准备
实验环境配置
需要安装 Tcl/tk 这两个软件:
# The GUI simulators are compiled against Tcl/Tk, so the development
# packages (headers and link libraries) are needed, not just the runtimes.
sudo apt install tcl tcl-dev tk tk-dev
1manjaro
使用如下命令sudo pacman -S tcl tk
1因为 Makefile 里写的 tcl 的版本较低,需要修改一下 Makefile:
# Point the Makefile at the installed Tcl version (dot escaped so it only matches "8.5")
sed -i "s/tcl8\.5/tcl8.6/g" Makefile
# Define USE_INTERP_RESULT so the sources may still access interp->result
sed -i "s/CFLAGS=/CFLAGS=-DUSE_INTERP_RESULT /g" Makefile
1
2- 需要注意
seq
和pipe
两个文件夹下的 Makefile 也需要修改
- 需要注意
-
由于相关软件更新,在
ssim.c
和psim.c
中需要注释掉以下两条语句:
extern int matherr();
int *tclDummyMathPtr = (int *) matherr;
1
2
较新版本 gcc 下 Makefile 中 gcc 命令需要添加
-fcommon
参数
# Part A
根据 Y86 的相关指令进行编写即可
sum.ys
# sum.ys - iteratively sum the values stored in a linked list.
# Execution begins at address 0
	.pos 0
	irmovq stack, %rsp	# Set up stack pointer
	irmovq ele1, %rdi	# Argument: pointer to the first list node
	call sum_list		# sum_list(ele1)
	halt			# Terminate program

# Sample linked list: three nodes, each { value, next }
	.align 8
ele1:
	.quad 0x00a
	.quad ele2
ele2:
	.quad 0x0b0
	.quad ele3
ele3:
	.quad 0xc00
	.quad 0

# long sum_list(list_ptr ls)
# ls in %rdi, result returned in %rax
sum_list:
	xorq %rax, %rax		# val = 0
	andq %rdi, %rdi		# set CC from ls
	je done			# empty list: return 0
loop:
	mrmovq (%rdi), %rsi	# get ls->val
	addq %rsi, %rax		# val += ls->val
	mrmovq 0x8(%rdi), %rdi	# ls = ls->next
	andq %rdi, %rdi		# set CC from ls
	jne loop		# continue while ls != NULL
done:
	ret

# Stack starts here and grows to lower addresses
	.pos 0x200
stack:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
rsum.ys
# rsum.ys - recursively sum the values stored in a linked list.
# Execution begins at address 0
	.pos 0
	irmovq stack, %rsp	# Set up stack pointer
	irmovq ele1, %rdi	# Argument: pointer to the first list node
	call rsum_list		# rsum_list(ele1)
	halt			# Terminate program

# Sample linked list: three nodes, each { value, next }
	.align 8
ele1:
	.quad 0x00a
	.quad ele2
ele2:
	.quad 0x0b0
	.quad ele3
ele3:
	.quad 0xc00
	.quad 0

# long rsum_list(list_ptr ls)
# ls in %rdi, result returned in %rax
rsum_list:
	xorq %rax, %rax		# val = 0 (result for the empty list)
	andq %rdi, %rdi		# set CC from ls
	je done			# base case: ls == NULL
	mrmovq (%rdi), %rsi	# get ls->val
	pushq %rsi		# save ls->val; the recursive call overwrites %rsi
	mrmovq 0x8(%rdi), %rdi	# ls = ls->next
	call rsum_list		# %rax = rsum_list(ls->next)
	popq %rsi		# restore this node's value
	addq %rsi, %rax		# val = ls->val + rest
done:
	ret

# Stack starts here and grows to lower addresses
	.pos 0x200
stack:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
copy.ys
# copy.ys - copy a block of words and return the XOR checksum of the words copied.
# Execution begins at address 0
	.pos 0
	irmovq stack, %rsp	# Set up stack pointer
	irmovq src, %rdi
	irmovq dest, %rsi
	irmovq $3, %rdx
	call copy_block		# copy_block(src, dest, len)
	halt			# Terminate program

	.align 8
# Source block
src:
	.quad 0x00a
	.quad 0x0b0
	.quad 0xc00
# Destination block
dest:
	.quad 0x111
	.quad 0x222
	.quad 0x333

# long copy_block(src, dest, len)
# src in %rdi, dest in %rsi, len in %rdx, result returned in %rax
copy_block:
	xorq %rax, %rax		# result = 0; done first so that len <= 0 returns 0
	andq %rdx, %rdx		# set CC from len
	jle done		# nothing to copy
	irmovq $8, %r8		# constant: word size
	irmovq $1, %r9		# constant: 1
loop:
	mrmovq (%rdi), %r10	# val = *src
	rmmovq %r10, (%rsi)	# *dest = val
	xorq %r10, %rax		# result ^= val
	addq %r8, %rdi		# src++
	addq %r8, %rsi		# dest++
	subq %r9, %rdx		# len-- and set CC
	jne loop		# continue while len != 0
done:
	ret

# Stack starts here and grows to lower addresses
	.pos 0x200
stack:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
运行下述命令检查代码正确性
# Build the assembler (yas) and the instruction-level simulator (yis)
cd misc
make clean && make
# Assemble each Part A program and run the resulting object file
./yas sum.ys && ./yis sum.yo
./yas rsum.ys && ./yis rsum.yo
./yas copy.ys && ./yis copy.yo
1
2
3
4
5
2
3
4
5
# Part B
参考
练习 4.51 4.52 及 4.3
图 4.18
解答
iaddq
的计算过程如下
阶段 | iaddq V, rB |
---|---|
取指 | icode:ifun ← M1[PC] rA:rB ← M1[PC+1] valC ← M8[PC+2] valP ← PC + 10 |
译码 | valB ← R[rB] |
执行 | valE ← valB + valC |
访存 | |
写回 | R[rB] ← valE |
更新PC | PC ← valP |
修改后的 seq-full.hcl
- 修改的地方主要是在各个信号中添加
IIADDQ
, 以达成上表中的逻辑行为
#/* $begin seq-all-hcl */
####################################################################
#  HCL Description of Control for Single Cycle Y86-64 Processor SEQ   #
#  Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2010       #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work
##
## Summary of the change: IIADDQ is added to instr_valid, need_regids,
## need_valC, srcB, dstE, aluA, aluB and set_cc so that the instruction
## computes R[rB] <- R[rB] + valC and updates the condition codes.

####################################################################
#    C Include's.  Don't alter these                               #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'word_t gen_pc(){return 0;}'
quote 'int main(int argc, char *argv[])'
quote ' {plusmode=0;return sim_main(argc,argv);}'

####################################################################
#    Declarations.  Do not change/remove/delete any of these       #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 	'I_NOP'
wordsig IHALT	'I_HALT'
wordsig IRRMOVQ	'I_RRMOVQ'
wordsig IIRMOVQ	'I_IRMOVQ'
wordsig IRMMOVQ	'I_RMMOVQ'
wordsig IMRMOVQ	'I_MRMOVQ'
wordsig IOPQ	'I_ALU'
wordsig IJXX	'I_JMP'
wordsig ICALL	'I_CALL'
wordsig IRET	'I_RET'
wordsig IPUSHQ	'I_PUSHQ'
wordsig IPOPQ	'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ	'I_IADDQ'

##### Symbolic representations of Y86-64 function codes #####
wordsig FNONE    'F_NONE'        # Default function code

##### Symbolic representation of Y86-64 Registers referenced explicitly #####
wordsig RRSP     'REG_RSP'    	# Stack Pointer
wordsig RNONE    'REG_NONE'   	# Special value indicating "no register"

##### ALU Functions referenced explicitly #####
wordsig ALUADD	'A_ADD'		# ALU should add its arguments

##### Possible instruction status values #####
wordsig SAOK	'STAT_AOK'	# Normal execution
wordsig SADR	'STAT_ADR'	# Invalid memory address
wordsig SINS	'STAT_INS'	# Invalid instruction
wordsig SHLT	'STAT_HLT'	# Halt instruction encountered

##### Signals that can be referenced by control logic ####################

##### Fetch stage inputs #####
wordsig pc 'pc'				# Program counter
##### Fetch stage computations #####
wordsig imem_icode 'imem_icode'		# icode field from instruction memory
wordsig imem_ifun  'imem_ifun' 		# ifun field from instruction memory
wordsig icode	  'icode'		# Instruction control code
wordsig ifun	  'ifun'		# Instruction function
wordsig rA	  'ra'			# rA field from instruction
wordsig rB	  'rb'			# rB field from instruction
wordsig valC	  'valc'		# Constant from instruction
wordsig valP	  'valp'		# Address of following instruction
boolsig imem_error 'imem_error'		# Error signal from instruction memory
boolsig instr_valid 'instr_valid'	# Is fetched instruction valid?

##### Decode stage computations #####
wordsig valA	'vala'			# Value from register A port
wordsig valB	'valb'			# Value from register B port

##### Execute stage computations #####
wordsig valE	'vale'			# Value computed by ALU
boolsig Cnd	'cond'			# Branch test

##### Memory stage computations #####
wordsig valM	'valm'			# Value read from memory
boolsig dmem_error 'dmem_error'		# Error signal from data memory


####################################################################
#    Control Signal Definitions.                                   #
####################################################################

################ Fetch Stage     ###################################

# Determine instruction code
word icode = [
	imem_error: INOP;
	1: imem_icode;		# Default: get from instruction memory
];

# Determine instruction function
word ifun = [
	imem_error: FNONE;
	1: imem_ifun;		# Default: get from instruction memory
];

# iaddq: recognised as a valid instruction
bool instr_valid = icode in
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	       IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };

# Does fetched instruction require a regid byte?
# iaddq: yes, it names rB
bool need_regids =
	icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
		     IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ};

# Does fetched instruction require a constant word?
# iaddq: yes, the immediate operand
bool need_valC =
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

################ Decode Stage    ###################################

## What register should be used as the A source?
word srcA = [
	icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ  } : rA;
	icode in { IPOPQ, IRET } : RRSP;
	1 : RNONE; # Don't need register
];

## What register should be used as the B source?
## iaddq: reads rB
word srcB = [
	icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ  } : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];

## What register should be used as the E destination?
## iaddq: writes the ALU result back to rB
word dstE = [
	icode in { IRRMOVQ } && Cnd : rB;
	icode in { IIRMOVQ, IOPQ, IIADDQ } : rB;
	icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];

## What register should be used as the M destination?
word dstM = [
	icode in { IMRMOVQ, IPOPQ } : rA;
	1 : RNONE;  # Don't write any register
];

################ Execute Stage   ###################################

## Select input A to ALU
## iaddq: the immediate valC
word aluA = [
	icode in { IRRMOVQ, IOPQ } : valA;
	icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : valC;
	icode in { ICALL, IPUSHQ } : -8;
	icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];

## Select input B to ALU
## iaddq: the register value valB
word aluB = [
	icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
		      IPUSHQ, IRET, IPOPQ, IIADDQ } : valB;
	icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];

## Set the ALU function
## iaddq falls into the default case and therefore adds
word alufun = [
	icode == IOPQ : ifun;
	1 : ALUADD;
];

## Should the condition codes be updated?
## iaddq: yes
bool set_cc = icode in { IOPQ, IIADDQ };

################ Memory Stage    ###################################

## Set read control signal
bool mem_read = icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = icode in { IRMMOVQ, IPUSHQ, ICALL };

## Select memory address
word mem_addr = [
	icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : valE;
	icode in { IPOPQ, IRET } : valA;
	# Other instructions don't need address
];

## Select memory input data
word mem_data = [
	# Value from register
	icode in { IRMMOVQ, IPUSHQ } : valA;
	# Return PC
	icode == ICALL : valP;
	# Default: Don't write anything
];

## Determine instruction status
word Stat = [
	imem_error || dmem_error : SADR;
	!instr_valid: SINS;
	icode == IHALT : SHLT;
	1 : SAOK;
];

################ Program Counter Update ############################

## What address should instruction be fetched at

word new_pc = [
	# Call.  Use instruction constant
	icode == ICALL : valC;
	# Taken branch.  Use instruction constant
	icode == IJXX && Cnd : valC;
	# Completion of RET instruction.  Use value from stack
	icode == IRET : valM;
	# Default: Use incremented PC
	1 : valP;
];
#/* $end seq-all-hcl */
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224- 修改的地方主要是在各个信号中添加
运行下述命令检查代码正确性
# Rebuild the sequential simulator from seq-full.hcl
cd seq
make clean && make VERSION=full
# Run the regression tests against ssim; TFLAGS=-i passes the -i flag to the test scripts
cd ../ptest; make SIM=../seq/ssim TFLAGS=-i
1
2
3
2
3
# Part C
解题思路
添加
iaddq
指令可以减少指令的条数,例如以下两条指令即可合并为一条指令:
irmovq $1, %r10
subq %r10, %rdx
1
2可合并为
iaddq $-1, %rdx
1
采用条件传送指令(cmovXX)不能改善性能
andq %r10, %r10 # val <= 0? jle Npos # if so, goto Npos: iaddq $1, %rax # count++
1
2
3
- 此段代码如果换为条件传送指令(cmovXX),所需要的指令条数为 4 条
- 而条件跳转当前采用的是总是跳转(AT)的预测策略:预测正确(发生跳转)时只需执行 2 条指令,耗时 2 个周期;预测错误(不跳转)时,除 3 条指令外还需要插入两个气泡,因此总共需要 5 个周期
- 本例中数据是随机生成的,因此预测正确的概率约为 50%,平均耗时为 3.5 个周期
- 因此条件跳转指令的性能要优于条件传送指令
内存读写时,要避免数据的加载/使用冒险
如下面的代码,即存在着数据的加载/使用冒险,会导致产生 1 个气泡
- 需要向其中加入一条其它不使用
%r10
的指令来避免此种情况
mrmovq (%rdi), %r10 # read val from src... rmmovq %r10, (%rsi) # ...and store it to dst
1
2- 需要向其中加入一条其它不使用
最终的优化策略是循环展开和避免数据的加载/使用冒险
- 循环展开可以减少程序条件跳转的次数,从而提升 CPE
- 实际编写代码时发现,只采用循环展开的效果并不是很好
- 经分析是由于循环展开的余数部分仍然会有较多的条件跳转判断
- 因此,将余数部分也展开,从而进一步提升 CPE
- 判断余数时使用二分法,可以减少判断的次数
- 余数部分展开时,较大的余数可以利用较小的余数部分的代码
- 通过将余数为 1 时的内存数据提前移入寄存器
%r8
可以巧妙地避免数据的加载/使用冒险
- 对应于 9 次展开第 86 行的代码
- 要注意余数为 0 时,不应再对内存进行写入操作
- 对余数为 0 和 余数为 1 的部分单独进行处理,可以进一步的提升 CPE
- 对应于 9 次展开 99 ~ 102 行的代码
- 设展开次数为
K
,则对于余数K-1
, 可以不采用跳转操作,而是直接接着代码顺序执行
- 能直接
ret
的就直接ret
,而不是跳转到Done
,可以节省一条标签 - 对于此程序, 省略
xorq %rax, %rax
这条语句也可以通过测试程序
- 这个改动需要函数调用者的配合(依赖进入函数时 %rax 已经为 0), 不知道 CMU 的最优答案是否省略了这条语句
- 最终发现展开次数为
9
时,代码的长度为 985 bytes,再往前展开,则会超出代码长度的限制。- 展开次数为
9
时,平均 CPE 为 7.48,达到了 guide 中的最好水平- 通过测试,展开次数为
8
时,平均 CPE 为 7.5
- 通过测试,展开次数为
- 展开次数为
最终答案
pipe-full.hcl
- 修改的地方主要是在各个信号中添加
IIADDQ
#/* $begin pipe-all-hcl */
####################################################################
#    HCL Description of Control for Pipelined Y86-64 Processor     #
#    Copyright (C) Randal E. Bryant, David R. O'Hallaron, 2014     #
####################################################################

## Your task is to implement the iaddq instruction
## The file contains a declaration of the icodes
## for iaddq (IIADDQ)
## Your job is to add the rest of the logic to make it work
##
## Summary of the change: IIADDQ is added to instr_valid, need_regids,
## need_valC, d_srcB, d_dstE, aluA, aluB and set_cc so that the instruction
## computes R[rB] <- R[rB] + valC and updates the condition codes.

####################################################################
#    C Include's.  Don't alter these                               #
####################################################################

quote '#include <stdio.h>'
quote '#include "isa.h"'
quote '#include "pipeline.h"'
quote '#include "stages.h"'
quote '#include "sim.h"'
quote 'int sim_main(int argc, char *argv[]);'
quote 'int main(int argc, char *argv[]){return sim_main(argc,argv);}'

####################################################################
#    Declarations.  Do not change/remove/delete any of these       #
####################################################################

##### Symbolic representation of Y86-64 Instruction Codes #############
wordsig INOP 	'I_NOP'
wordsig IHALT	'I_HALT'
wordsig IRRMOVQ	'I_RRMOVQ'
wordsig IIRMOVQ	'I_IRMOVQ'
wordsig IRMMOVQ	'I_RMMOVQ'
wordsig IMRMOVQ	'I_MRMOVQ'
wordsig IOPQ	'I_ALU'
wordsig IJXX	'I_JMP'
wordsig ICALL	'I_CALL'
wordsig IRET	'I_RET'
wordsig IPUSHQ	'I_PUSHQ'
wordsig IPOPQ	'I_POPQ'
# Instruction code for iaddq instruction
wordsig IIADDQ	'I_IADDQ'

##### Symbolic representations of Y86-64 function codes #####
wordsig FNONE    'F_NONE'        # Default function code

##### Symbolic representation of Y86-64 Registers referenced #####
wordsig RRSP     'REG_RSP'    	     # Stack Pointer
wordsig RNONE    'REG_NONE'   	     # Special value indicating "no register"

##### ALU Functions referenced explicitly ##########################
wordsig ALUADD	'A_ADD'		     # ALU should add its arguments

##### Possible instruction status values #####
wordsig SBUB	'STAT_BUB'	# Bubble in stage
wordsig SAOK	'STAT_AOK'	# Normal execution
wordsig SADR	'STAT_ADR'	# Invalid memory address
wordsig SINS	'STAT_INS'	# Invalid instruction
wordsig SHLT	'STAT_HLT'	# Halt instruction encountered

##### Signals that can be referenced by control logic ##############

##### Pipeline Register F ##########################################

wordsig F_predPC 'pc_curr->pc'	     # Predicted value of PC

##### Intermediate Values in Fetch Stage ###########################

wordsig imem_icode  'imem_icode'      # icode field from instruction memory
wordsig imem_ifun   'imem_ifun'       # ifun  field from instruction memory
wordsig f_icode	'if_id_next->icode'  # (Possibly modified) instruction code
wordsig f_ifun	'if_id_next->ifun'   # Fetched instruction function
wordsig f_valC	'if_id_next->valc'   # Constant data of fetched instruction
wordsig f_valP	'if_id_next->valp'   # Address of following instruction
boolsig imem_error 'imem_error'	     # Error signal from instruction memory
boolsig instr_valid 'instr_valid'    # Is fetched instruction valid?

##### Pipeline Register D ##########################################
wordsig D_icode 'if_id_curr->icode'   # Instruction code
wordsig D_rA 'if_id_curr->ra'	     # rA field from instruction
wordsig D_rB 'if_id_curr->rb'	     # rB field from instruction
wordsig D_valP 'if_id_curr->valp'     # Incremented PC

##### Intermediate Values in Decode Stage  #########################

wordsig d_srcA	 'id_ex_next->srca'  # srcA from decoded instruction
wordsig d_srcB	 'id_ex_next->srcb'  # srcB from decoded instruction
wordsig d_rvalA 'd_regvala'	     # valA read from register file
wordsig d_rvalB 'd_regvalb'	     # valB read from register file

##### Pipeline Register E ##########################################
wordsig E_icode 'id_ex_curr->icode'   # Instruction code
wordsig E_ifun  'id_ex_curr->ifun'    # Instruction function
wordsig E_valC  'id_ex_curr->valc'    # Constant data
wordsig E_srcA  'id_ex_curr->srca'    # Source A register ID
wordsig E_valA  'id_ex_curr->vala'    # Source A value
wordsig E_srcB  'id_ex_curr->srcb'    # Source B register ID
wordsig E_valB  'id_ex_curr->valb'    # Source B value
wordsig E_dstE 'id_ex_curr->deste'    # Destination E register ID
wordsig E_dstM 'id_ex_curr->destm'    # Destination M register ID

##### Intermediate Values in Execute Stage #########################
wordsig e_valE 'ex_mem_next->vale'	# valE generated by ALU
boolsig e_Cnd 'ex_mem_next->takebranch' # Does condition hold?
wordsig e_dstE 'ex_mem_next->deste'      # dstE (possibly modified to be RNONE)

##### Pipeline Register M                  #########################
wordsig M_stat 'ex_mem_curr->status'     # Instruction status
wordsig M_icode 'ex_mem_curr->icode'	# Instruction code
wordsig M_ifun  'ex_mem_curr->ifun'	# Instruction function
wordsig M_valA  'ex_mem_curr->vala'      # Source A value
wordsig M_dstE 'ex_mem_curr->deste'	# Destination E register ID
wordsig M_valE  'ex_mem_curr->vale'      # ALU E value
wordsig M_dstM 'ex_mem_curr->destm'	# Destination M register ID
boolsig M_Cnd 'ex_mem_curr->takebranch'	# Condition flag
boolsig dmem_error 'dmem_error'	        # Error signal from instruction memory

##### Intermediate Values in Memory Stage ##########################
wordsig m_valM 'mem_wb_next->valm'	# valM generated by memory
wordsig m_stat 'mem_wb_next->status'	# stat (possibly modified to be SADR)

##### Pipeline Register W ##########################################
wordsig W_stat 'mem_wb_curr->status'     # Instruction status
wordsig W_icode 'mem_wb_curr->icode'	# Instruction code
wordsig W_dstE 'mem_wb_curr->deste'	# Destination E register ID
wordsig W_valE  'mem_wb_curr->vale'      # ALU E value
wordsig W_dstM 'mem_wb_curr->destm'	# Destination M register ID
wordsig W_valM  'mem_wb_curr->valm'	# Memory M value

####################################################################
#    Control Signal Definitions.                                   #
####################################################################

################ Fetch Stage     ###################################

## What address should instruction be fetched at
word f_pc = [
	# Mispredicted branch.  Fetch at incremented PC
	M_icode == IJXX && !M_Cnd : M_valA;
	# Completion of RET instruction
	W_icode == IRET : W_valM;
	# Default: Use predicted value of PC
	1 : F_predPC;
];

## Determine icode of fetched instruction
word f_icode = [
	imem_error : INOP;
	1: imem_icode;
];

# Determine ifun
word f_ifun = [
	imem_error : FNONE;
	1: imem_ifun;
];

# Is instruction valid?
# iaddq: recognised as a valid instruction
bool instr_valid = f_icode in
	{ INOP, IHALT, IRRMOVQ, IIRMOVQ, IRMMOVQ, IMRMOVQ,
	  IOPQ, IJXX, ICALL, IRET, IPUSHQ, IPOPQ, IIADDQ };

# Determine status code for fetched instruction
word f_stat = [
	imem_error: SADR;
	!instr_valid : SINS;
	f_icode == IHALT : SHLT;
	1 : SAOK;
];

# Does fetched instruction require a regid byte?
# iaddq: yes, it names rB
bool need_regids =
	f_icode in { IRRMOVQ, IOPQ, IPUSHQ, IPOPQ,
		     IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ };

# Does fetched instruction require a constant word?
# iaddq: yes, the immediate operand
bool need_valC =
	f_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IJXX, ICALL, IIADDQ };

# Predict next value of PC
word f_predPC = [
	f_icode in { IJXX, ICALL } : f_valC;
	1 : f_valP;
];

################ Decode Stage ######################################

## What register should be used as the A source?
word d_srcA = [
	D_icode in { IRRMOVQ, IRMMOVQ, IOPQ, IPUSHQ  } : D_rA;
	D_icode in { IPOPQ, IRET } : RRSP;
	1 : RNONE; # Don't need register
];

## What register should be used as the B source?
## iaddq: reads rB
word d_srcB = [
	D_icode in { IOPQ, IRMMOVQ, IMRMOVQ, IIADDQ  } : D_rB;
	D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't need register
];

## What register should be used as the E destination?
## iaddq: writes the ALU result back to rB
word d_dstE = [
	D_icode in { IRRMOVQ, IIRMOVQ, IOPQ, IIADDQ } : D_rB;
	D_icode in { IPUSHQ, IPOPQ, ICALL, IRET } : RRSP;
	1 : RNONE;  # Don't write any register
];

## What register should be used as the M destination?
word d_dstM = [
	D_icode in { IMRMOVQ, IPOPQ } : D_rA;
	1 : RNONE;  # Don't write any register
];

## What should be the A value?
## Forward into decode stage for valA
word d_valA = [
	D_icode in { ICALL, IJXX } : D_valP; # Use incremented PC
	d_srcA == e_dstE : e_valE;    # Forward valE from execute
	d_srcA == M_dstM : m_valM;    # Forward valM from memory
	d_srcA == M_dstE : M_valE;    # Forward valE from memory
	d_srcA == W_dstM : W_valM;    # Forward valM from write back
	d_srcA == W_dstE : W_valE;    # Forward valE from write back
	1 : d_rvalA;  # Use value read from register file
];

word d_valB = [
	d_srcB == e_dstE : e_valE;    # Forward valE from execute
	d_srcB == M_dstM : m_valM;    # Forward valM from memory
	d_srcB == M_dstE : M_valE;    # Forward valE from memory
	d_srcB == W_dstM : W_valM;    # Forward valM from write back
	d_srcB == W_dstE : W_valE;    # Forward valE from write back
	1 : d_rvalB;  # Use value read from register file
];

################ Execute Stage #####################################

## Select input A to ALU
## iaddq: the immediate valC
word aluA = [
	E_icode in { IRRMOVQ, IOPQ } : E_valA;
	E_icode in { IIRMOVQ, IRMMOVQ, IMRMOVQ, IIADDQ } : E_valC;
	E_icode in { ICALL, IPUSHQ } : -8;
	E_icode in { IRET, IPOPQ } : 8;
	# Other instructions don't need ALU
];

## Select input B to ALU
## iaddq: the register value valB
word aluB = [
	E_icode in { IRMMOVQ, IMRMOVQ, IOPQ, ICALL,
		     IPUSHQ, IRET, IPOPQ, IIADDQ } : E_valB;
	E_icode in { IRRMOVQ, IIRMOVQ } : 0;
	# Other instructions don't need ALU
];

## Set the ALU function
## iaddq falls into the default case and therefore adds
word alufun = [
	E_icode == IOPQ : E_ifun;
	1 : ALUADD;
];

## Should the condition codes be updated?
## iaddq: yes
bool set_cc = (E_icode == IOPQ || E_icode == IIADDQ) &&
	# State changes only during normal operation
	!m_stat in { SADR, SINS, SHLT } && !W_stat in { SADR, SINS, SHLT };

## Generate valA in execute stage
word e_valA = E_valA;    # Pass valA through stage

## Set dstE to RNONE in event of not-taken conditional move
word e_dstE = [
	E_icode == IRRMOVQ && !e_Cnd : RNONE;
	1 : E_dstE;
];

################ Memory Stage ######################################

## Select memory address
word mem_addr = [
	M_icode in { IRMMOVQ, IPUSHQ, ICALL, IMRMOVQ } : M_valE;
	M_icode in { IPOPQ, IRET } : M_valA;
	# Other instructions don't need address
];

## Set read control signal
bool mem_read = M_icode in { IMRMOVQ, IPOPQ, IRET };

## Set write control signal
bool mem_write = M_icode in { IRMMOVQ, IPUSHQ, ICALL };

#/* $begin pipe-m_stat-hcl */
## Update the status
word m_stat = [
	dmem_error : SADR;
	1 : M_stat;
];
#/* $end pipe-m_stat-hcl */

## Set E port register ID
word w_dstE = W_dstE;

## Set E port value
word w_valE = W_valE;

## Set M port register ID
word w_dstM = W_dstM;

## Set M port value
word w_valM = W_valM;

## Update processor status
word Stat = [
	W_stat == SBUB : SAOK;
	1 : W_stat;
];

################ Pipeline Register Control #########################

# Should I stall or inject a bubble into Pipeline Register F?
# At most one of these can be true.
bool F_bubble = 0;
bool F_stall =
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB } ||
	# Stalling at fetch while ret passes through pipeline
	IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register D?
# At most one of these can be true.
bool D_stall =
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB };

bool D_bubble =
	# Mispredicted branch
	(E_icode == IJXX && !e_Cnd) ||
	# Stalling at fetch while ret passes through pipeline
	# but not condition for a load/use hazard
	!(E_icode in { IMRMOVQ, IPOPQ } && E_dstM in { d_srcA, d_srcB }) &&
	  IRET in { D_icode, E_icode, M_icode };

# Should I stall or inject a bubble into Pipeline Register E?
# At most one of these can be true.
bool E_stall = 0;
bool E_bubble =
	# Mispredicted branch
	(E_icode == IJXX && !e_Cnd) ||
	# Conditions for a load/use hazard
	E_icode in { IMRMOVQ, IPOPQ } &&
	 E_dstM in { d_srcA, d_srcB};

# Should I stall or inject a bubble into Pipeline Register M?
# At most one of these can be true.
bool M_stall = 0;
# Start injecting bubbles as soon as exception passes through memory stage
bool M_bubble = m_stat in { SADR, SINS, SHLT } || W_stat in { SADR, SINS, SHLT };

# Should I stall or inject a bubble into Pipeline Register W?
bool W_stall = W_stat in { SADR, SINS, SHLT };
bool W_bubble = 0;
#/* $end pipe-all-hcl */
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363- 修改的地方主要是在各个信号中添加
ncopy.ys
8 次展开, CPE 为 7.5
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# How and why the baseline code was modified:
#  - iaddq replaces irmovq+OPq pairs for all constant arithmetic.
#  - The main loop is unrolled 8 times to cut down loop-control overhead.
#  - Every load is separated from the store that uses it by at least one
#    other instruction, so no load/use bubble is inserted.
#  - The remainder (0..7 elements) is dispatched with a binary decision
#    tree and handled by fall-through code: the code for remainder k
#    falls into the code for remainder k-1.
#  - Element 0 is preloaded into %r8 while the remainder is being
#    classified; it is stored only when the remainder is non-zero.
#  - "xorq %rax,%rax" is intentionally left out: the routine relies on
#    %rax being 0 on entry.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
	# Loop header
	# xorq %rax,%rax		# count = 0;
	iaddq $-8, %rdx		# len -= 8
	jge K_Loop		# at least 8 elements left
	jmp Remainder

K_Loop:
	mrmovq (%rdi), %r8
	mrmovq 0x08(%rdi), %r9
	andq %r8, %r8
	rmmovq %r8, (%rsi)
	jle Npos1
	iaddq $1, %rax
Npos1:
	rmmovq %r9, 0x08(%rsi)
	andq %r9, %r9
	mrmovq 0x10(%rdi), %r8	# loads do not change CC, so jle still tests the previous element
	jle Npos2
	iaddq $1, %rax
Npos2:
	rmmovq %r8, 0x10(%rsi)
	andq %r8, %r8
	mrmovq 0x18(%rdi), %r8
	jle Npos3
	iaddq $1, %rax
Npos3:
	rmmovq %r8, 0x18(%rsi)
	andq %r8, %r8
	mrmovq 0x20(%rdi), %r8
	jle Npos4
	iaddq $1, %rax
Npos4:
	rmmovq %r8, 0x20(%rsi)
	andq %r8, %r8
	mrmovq 0x28(%rdi), %r8
	jle Npos5
	iaddq $1, %rax
Npos5:
	rmmovq %r8, 0x28(%rsi)
	andq %r8, %r8
	mrmovq 0x30(%rdi), %r8
	jle Npos6
	iaddq $1, %rax
Npos6:
	rmmovq %r8, 0x30(%rsi)
	andq %r8, %r8
	mrmovq 0x38(%rdi), %r8
	jle Npos7
	iaddq $1, %rax
Npos7:
	andq %r8, %r8
	rmmovq %r8, 0x38(%rsi)
	jle Npos8
	iaddq $1, %rax
Npos8:
	iaddq $0x40, %rdi	# src += 8
	iaddq $0x40, %rsi	# dst += 8
	iaddq $-8, %rdx		# len -= 8
	jge K_Loop

# Here %rdx = remainder - 8, remainder in 0..7
Remainder:
	iaddq $4, %rdx		# %rdx = remainder - 4
	mrmovq (%rdi), %r8	# preload element 0 (CC unchanged)
	jl Remainder_0_3
	rmmovq %r8, (%rsi)	# remainder >= 4: element 0 is certainly valid
	jg Remainder_5_7
	jmp Remainder_4

Remainder_0_3:
	iaddq $2, %rdx		# %rdx = remainder - 2
	jl Remainder_0_1
	rmmovq %r8, (%rsi)	# remainder is 2 or 3: store element 0
	je Remainder_2
	jg Remainder_3

Remainder_0_1:
	iaddq $1, %rdx		# %rdx = remainder - 1
	je Remainder_1
	ret			# remainder == 0: nothing to store

Remainder_5_7:
	iaddq $-2, %rdx		# %rdx = remainder - 6
	jl Remainder_5
	je Remainder_6

# Each stage tests the element currently in %r8, then loads and stores the next one
Remainder_7:
	andq %r8, %r8
	mrmovq 48(%rdi), %r8
	jle Remainder_Nop7
	iaddq $1, %rax
Remainder_Nop7:
	rmmovq %r8, 48(%rsi)

Remainder_6:
	andq %r8, %r8
	mrmovq 40(%rdi), %r8
	jle Remainder_Nop6
	iaddq $1, %rax
Remainder_Nop6:
	rmmovq %r8, 40(%rsi)

Remainder_5:
	andq %r8, %r8
	mrmovq 32(%rdi), %r8
	jle Remainder_Nop5
	iaddq $1, %rax
Remainder_Nop5:
	rmmovq %r8, 32(%rsi)

Remainder_4:
	andq %r8, %r8
	mrmovq 24(%rdi), %r8
	jle Remainder_Nop4
	iaddq $1, %rax
Remainder_Nop4:
	rmmovq %r8, 24(%rsi)

Remainder_3:
	andq %r8, %r8
	mrmovq 16(%rdi), %r8
	jle Remainder_Nop3
	iaddq $1, %rax
Remainder_Nop3:
	rmmovq %r8, 16(%rsi)

Remainder_2:
	andq %r8, %r8
	mrmovq 8(%rdi), %r8
	jle Remainder_Nop2
	iaddq $1, %rax
Remainder_Nop2:
	rmmovq %r8, 8(%rsi)
	andq %r8, %r8		# test the last element loaded
	jle Done
	iaddq $1, %rax
	ret

Remainder_1:
	rmmovq %r8, (%rsi)
	andq %r8, %r8
	jle Done
	iaddq $1, %rax
	ret

##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
9 次展开, CPE 为 7.48
#/* $begin ncopy-ys */
##################################################################
# ncopy.ys - Copy a src block of len words to dst.
# Return the number of positive words (>0) contained in src.
#
# Include your name and ID here.
#
# How and why the baseline code was modified:
#  - iaddq replaces irmovq+OPq pairs for all constant arithmetic.
#  - The main loop is unrolled 9 times to cut down loop-control overhead.
#  - Every load is separated from the store that uses it by at least one
#    other instruction, so no load/use bubble is inserted.
#  - The remainder (0..8 elements) is dispatched with a binary decision
#    tree and handled by fall-through code: the code for remainder k
#    falls into the code for remainder k-1.
#  - Element 0 is preloaded into %r8 while the remainder is being
#    classified; it is stored only when the remainder is non-zero.
#  - "xorq %rax,%rax" is intentionally left out: the routine relies on
#    %rax being 0 on entry.
#
##################################################################
# Do not modify this portion
# Function prologue.
# %rdi = src, %rsi = dst, %rdx = len
ncopy:

##################################################################
# You can modify this portion
	# Loop header
	# xorq %rax,%rax		# count = 0;
	iaddq $-9, %rdx		# len -= 9
	jge K_Loop		# at least 9 elements left
	jmp Remainder

K_Loop:
	mrmovq (%rdi), %r8
	mrmovq 0x08(%rdi), %r9
	andq %r8, %r8
	rmmovq %r8, (%rsi)
	jle Npos1
	iaddq $1, %rax
Npos1:
	rmmovq %r9, 0x08(%rsi)
	andq %r9, %r9
	mrmovq 0x10(%rdi), %r8	# loads do not change CC, so jle still tests the previous element
	jle Npos2
	iaddq $1, %rax
Npos2:
	rmmovq %r8, 0x10(%rsi)
	andq %r8, %r8
	mrmovq 0x18(%rdi), %r8
	jle Npos3
	iaddq $1, %rax
Npos3:
	rmmovq %r8, 0x18(%rsi)
	andq %r8, %r8
	mrmovq 0x20(%rdi), %r8
	jle Npos4
	iaddq $1, %rax
Npos4:
	rmmovq %r8, 0x20(%rsi)
	andq %r8, %r8
	mrmovq 0x28(%rdi), %r8
	jle Npos5
	iaddq $1, %rax
Npos5:
	rmmovq %r8, 0x28(%rsi)
	andq %r8, %r8
	mrmovq 0x30(%rdi), %r8
	jle Npos6
	iaddq $1, %rax
Npos6:
	rmmovq %r8, 0x30(%rsi)
	andq %r8, %r8
	mrmovq 0x38(%rdi), %r8
	jle Npos7
	iaddq $1, %rax
Npos7:
	rmmovq %r8, 0x38(%rsi)
	andq %r8, %r8
	mrmovq 0x40(%rdi), %r8
	jle Npos8
	iaddq $1, %rax
Npos8:
	rmmovq %r8, 0x40(%rsi)
	andq %r8, %r8
	jle Npos9
	iaddq $1, %rax
Npos9:
	iaddq $0x48, %rdi	# src += 9
	iaddq $0x48, %rsi	# dst += 9
	iaddq $-9, %rdx		# len -= 9
	jge K_Loop

# Here %rdx = remainder - 9, remainder in 0..8
Remainder:
	iaddq $5, %rdx		# %rdx = remainder - 4
	mrmovq (%rdi), %r8	# preload element 0 (CC unchanged)
	jl Remainder_0_3
	rmmovq %r8, (%rsi)	# remainder >= 4: element 0 is certainly valid
	jg Remainder_5_8
	jmp Remainder_4

Remainder_0_3:
	iaddq $2, %rdx		# %rdx = remainder - 2
	jl Remainder_0_1
	rmmovq %r8, (%rsi)	# remainder is 2 or 3: store element 0
	je Remainder_2
	jg Remainder_3

Remainder_0_1:
	iaddq $1, %rdx		# %rdx = remainder - 1
	je Remainder_1
	ret			# remainder == 0: nothing to store

Remainder_5_8:
	iaddq $-2, %rdx		# %rdx = remainder - 6
	jg Remainder_7_8
	jl Remainder_5
	je Remainder_6

Remainder_7_8:
	iaddq $-1, %rdx		# %rdx = remainder - 7
	je Remainder_7

# Each stage tests the element currently in %r8, then loads and stores the next one
Remainder_8:
	andq %r8, %r8
	mrmovq 56(%rdi), %r8
	jle Remainder_Nop8
	iaddq $1, %rax
Remainder_Nop8:
	rmmovq %r8, 56(%rsi)

Remainder_7:
	andq %r8, %r8
	mrmovq 48(%rdi), %r8
	jle Remainder_Nop7
	iaddq $1, %rax
Remainder_Nop7:
	rmmovq %r8, 48(%rsi)

Remainder_6:
	andq %r8, %r8
	mrmovq 40(%rdi), %r8
	jle Remainder_Nop6
	iaddq $1, %rax
Remainder_Nop6:
	rmmovq %r8, 40(%rsi)

Remainder_5:
	andq %r8, %r8
	mrmovq 32(%rdi), %r8
	jle Remainder_Nop5
	iaddq $1, %rax
Remainder_Nop5:
	rmmovq %r8, 32(%rsi)

Remainder_4:
	andq %r8, %r8
	mrmovq 24(%rdi), %r8
	jle Remainder_Nop4
	iaddq $1, %rax
Remainder_Nop4:
	rmmovq %r8, 24(%rsi)

Remainder_3:
	andq %r8, %r8
	mrmovq 16(%rdi), %r8
	jle Remainder_Nop3
	iaddq $1, %rax
Remainder_Nop3:
	rmmovq %r8, 16(%rsi)

Remainder_2:
	andq %r8, %r8
	mrmovq 8(%rdi), %r8
	jle Remainder_Nop2
	iaddq $1, %rax
Remainder_Nop2:
	rmmovq %r8, 8(%rsi)
	andq %r8, %r8		# test the last element loaded
	jle Done
	iaddq $1, %rax
	ret

Remainder_1:
	rmmovq %r8, (%rsi)
	andq %r8, %r8
	jle Done
	iaddq $1, %rax
	ret

##################################################################
# Do not modify the following section of code
# Function epilogue.
Done:
	ret
##################################################################
# Keep the following label at the end of your function
End:
#/* $end ncopy-ys */
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
答案验证
cd pipe
# Build the pipelined simulator with iaddq support
make clean && make VERSION=full
# Test the correctness of the iaddq instruction
cd ../ptest; make SIM=../pipe/psim TFLAGS=-i
# Check ncopy.ys
cd ../pipe
# Check the code length
make ncopy.yo
./check-len.pl < ncopy.yo
# Check the code correctness
./correctness.pl
# Check the code performance
./benchmark.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
2
3
4
5
6
7
8
9
10
11
12
13
14
编辑 (opens new window)