Timings for 100,000,000 instructions at 200MHz ARM clock speed, measured in seconds and listed below as ARM clocks (cycles):

cfadd mvd0,mvd0,mvd0	8 ARM clocks
cfadd mvd0,mvd1,mvd2	2 ARM clocks
Interlace 2 0=0+2/1=1+2	4 ARM clocks
Interlace 3		3 ARM clocks
Interlace 4		2 ARM clocks 
0=0+0 					8 ARM clocks per group
0=0+0 1=1+1				8 ARM clocks per group
0=0+0 1=1+1 2=2+2			8 ARM clocks per group
0=0+0 1=1+1 2=2+2 3=3+3			8 ARM clocks per group
0=0+0 1=1+1 2=2+2 3=3+3	4=4+4		10 ARM clocks per group
0=0+0 1=1+1 2=2+2 3=3+3	4=4+4 5=5+5	12 ARM clocks per group
0=0*0 					8 ARM clocks per group
0=0*0 1=1*1				8 ARM clocks per group
0=0*0 1=1*1 2=2*2			12 ARM clocks per group

# Others: cfcpyd, cfcvt64d, cfsh64 #0
0=0					8
0=0 1=1 2=2 3=3				8
0=0 1=1 2=2 3=3 4=4			10

# Moves to ARM regs
r0=mv0l				2 cycles
r0=mv0l r1=mv1l			2 cycles
r0=mv0l r1=mv1l r2=mv2l		3 cycles
r0=mv0l r0=mv1l			4 cycles
r0=mv0l r1=mv0l			2 cycles
# So writing twice to the same ARM register causes a stall: 4 cycles instead of 2 (unlikely!)
r0=mv0l r1=r0			3 cycles
r0=mv0l r0=r1			2 cycles
r0=mv0l r1=r1			2 cycles

# Moves from ARM regs
mv0l=r0					4 cycles
mv0l=r0 mv1l=r1				4 cycles
mv0l=r0 mv0l=r1				8 cycles
mv0l=r0 mv0h=r1				8 cycles
mv0l=r0 mv1l=r1 mv2l=r2			4 cycles
mv0l=r0 mv1l=r1 mv2l=r2 mv3l=r3		4 cycles
mv0l=r0 mv1l=r1 mv2l=r2 mv3l=r3 mv4l=r4	5 cycles
mv0l=r0 mv1l=r1 mv2l=r2 mv2l=r3		8 cycles
# the classic
mv0l=r0 mv0h=r1 mv1l=r2 mv1h=r3		10 cycles
# interleaved
mv0l=r0 mv1l=r2 mv0h=r1 mv1h=r3		8 cycles
mv0l=r0 mv1l=r2 mv2l=r4 mv0h=r1 mv1h=r3 mv2h=r5	8 cycles
mv0l=r0 mv1l=r2 mv2l=r4 mv3l=r6 mv0h=r1 mv1h=r3 mv2h=r5 mv3h=r7	8 cycles
[5 interleaved pairs]			10 cycles
# So you can do one transfer per cycle unless it's to the same mav reg again,
# which has a 4-cycle latency (likely in the 2-arm-to-one-mav register load).

# Write-use latency of mav regs
mv0l=r0 mv1=mv0				6 cycles
mv0l=r0 mv0=mv1				10 cycles # waits longer to overwrite!
mv0l=r0 mv2=mv1				4 cycles
mv0l=r0 mv2=mv1	mv4=mv3			4 cycles
mv0l=r0 mv2=mv1	mv4=mv3 mv6=mv5		6 cycles
mv0l=r0 mv1l=r1 mv2=mv3 mv4=mv5		4 cycles
# so move from arm to maverick occupies the LS pipeline for 2 cycles
# and maverick writes go through the data pipeline (4 cycles) before they are
# writable again or readable?

# Simultaneous read and write to/from ARM/Mav regs
mv0l=r0 				4
mv0l=r0 r1=mv1l				4
mv0l=r0 r1=mv1l r2=mv2l			4
mv0l=r0 r1=mv1l r2=mv2l	r3=mv3l		4
mv0l=r0 mv1l=r1				4
mv0l=r0 mv1l=r1				4

# Loads from cached memory
mv0f=[r0]				4
mv0f=[r0] mv1f=[r1] mv2f=[r2] mv3f=[r3]	4
mv0f=[r0] mv1f=[r1] mv2f=[r2] mv3f=[r3]	4
4 double loads				8
3 double loads				6
2 double loads				5
1 double load				5

# Stores to memory
1 double store				40 cycles	17 on simone
2 double stores				40 cycles	21 on simone
3 double stores				64 cycles	38 on simone
4 double stores				64 cycles	42 on simone
5 double stores				89 cycles	59 on simone
5 double stores to the same location	200 cycles	84.5 on simone

1 arm reg to the same location		13 cycles
2 arm regs to the same location		37 cycles
2 arm regs to adjacent locations	39 cycles	17 on simone
