ARM assembler pack: profiler-assisted optimizations and NEON support.

This commit is contained in:
Andy Polyakov 2011-04-01 20:58:34 +00:00
parent d8d958323b
commit 1e86318091
13 changed files with 715 additions and 224 deletions

View File

@ -398,8 +398,9 @@ my %table=(
"linux-alpha-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
"linux-alpha+bwx-ccc","ccc:-fast -readonly_strings -DL_ENDIAN -DTERMIO::-D_REENTRANT:::SIXTY_FOUR_BIT_LONG RC4_CHAR RC4_CHUNK DES_INT DES_PTR DES_RISC1 DES_UNROLL:${alpha_asm}",
# Android: linux-armv4 but without -DTERMIO and pointers to headers and libs.
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
# Android: linux-* but without -DTERMIO and pointers to headers and libs.
"android","gcc:-mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${no_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
"android-armv7","gcc:-march=armv7-a -mandroid -I\$(ANDROID_DEV)/include -B\$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall::-D_REENTRANT::-ldl:BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR:${armv4_asm}:dlfcn:linux-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",
#### *BSD [do see comment about ${BSDthreads} above!]
"BSD-generic32","gcc:-DTERMIOS -O3 -fomit-frame-pointer -Wall::${BSDthreads}:::BN_LLONG RC2_CHAR RC4_INDEX DES_INT DES_UNROLL:${no_asm}:dlfcn:bsd-gcc-shared:-fPIC::.so.\$(SHLIB_MAJOR).\$(SHLIB_MINOR)",

32
TABLE
View File

@ -1001,6 +1001,38 @@ $sys_id =
$lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj =
$bn_obj =
$des_obj =
$aes_obj =
$bf_obj =
$md5_obj =
$sha1_obj =
$cast_obj =
$rc4_obj =
$rmd160_obj =
$rc5_obj =
$wp_obj =
$cmll_obj =
$modes_obj =
$perlasm_scheme = void
$dso_scheme = dlfcn
$shared_target= linux-shared
$shared_cflag = -fPIC
$shared_ldflag =
$shared_extension = .so.$(SHLIB_MAJOR).$(SHLIB_MINOR)
$ranlib =
$arflags =
$multilib =
*** android-armv7
$cc = gcc
$cflags = -march=armv7-a -mandroid -I$(ANDROID_DEV)/include -B$(ANDROID_DEV)/lib -O3 -fomit-frame-pointer -Wall
$unistd =
$thread_cflag = -D_REENTRANT
$sys_id =
$lflags = -ldl
$bn_ops = BN_LLONG RC4_CHAR RC4_CHUNK DES_INT DES_UNROLL BF_PTR
$cpuid_obj =
$bn_obj = bn_asm.o armv4-mont.o
$des_obj =
$aes_obj = aes_cbc.o aes-armv4.o

1
config
View File

@ -821,6 +821,7 @@ case "$GUESSOS" in
beos-*) OUT="$GUESSOS" ;;
x86pc-*-qnx6) OUT="QNX6-i386" ;;
*-*-qnx6) OUT="QNX6" ;;
armv[7-9]*-*-android) OUT="android-armv7" ;;
*) OUT=`echo $GUESSOS | awk -F- '{print $3}'`;;
esac

View File

@ -68,7 +68,8 @@ aes-parisc.s: asm/aes-parisc.pl
$(PERL) asm/aes-parisc.pl $(PERLASM_SCHEME) $@
# GNU make "catch all"
aes-%.s: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
aes-%.S: asm/aes-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
aes-armv4.o: aes-armv4.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

View File

@ -27,6 +27,11 @@
# Rescheduling for dual-issue pipeline resulted in 12% improvement on
# Cortex A8 core and ~25 cycles per byte processed with 128-bit key.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~21.5 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@ -46,6 +51,7 @@ $key="r11";
$rounds="r12";
$code=<<___;
#include "arm_arch.h"
.text
.code 32
@ -166,7 +172,7 @@ AES_encrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_encrypt-AES_Te @ Te
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@ -195,10 +201,33 @@ AES_encrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
bl _armv4_AES_encrypt
ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@ -227,11 +256,15 @@ AES_encrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_encrypt,.-AES_encrypt
.type _armv4_AES_encrypt,%function
@ -271,11 +304,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$t3,$i3,ror#8
and $i3,lr,$s2
eor $s1,$s1,$t1,ror#24
ldr $i1,[$tbl,$i1,lsl#2] @ Te2[s2>>8]
eor $s1,$s1,$t1,ror#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te1[s2>>16]
ldr $i3,[$tbl,$i3,lsl#2] @ Te3[s2>>0]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Te0[s2>>24]
@ -284,16 +317,16 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,ror#16
and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$s2,$t2,ror#16
ldr $i1,[$tbl,$i1,lsl#2] @ Te3[s3>>0]
eor $s2,$s2,$t2,ror#16
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Te2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Te1[s3>>16]
eor $s0,$s0,$i1,ror#24
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s1,$s1,$i2,ror#16
ldr $i1,[$key],#16
eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Te0[s3>>24]
eor $s2,$s2,$i3,ror#8
ldr $t1,[$key,#-12]
eor $s3,$s3,$t3,ror#8
@ -333,11 +366,11 @@ _armv4_AES_encrypt:
and $i2,lr,$s2,lsr#16 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s2
eor $s1,$t1,$s1,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s2>>8]
eor $s1,$t1,$s1,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
mov $s2,$s2,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s2>>16]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s2>>0]
eor $s0,$i1,$s0,lsl#8
ldrb $s2,[$tbl,$s2,lsl#2] @ Te4[s2>>24]
@ -346,15 +379,15 @@ _armv4_AES_encrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,lsl#8
and $i3,lr,$s3,lsr#16 @ i2
eor $s2,$t2,$s2,lsl#24
ldrb $i1,[$tbl,$i1,lsl#2] @ Te4[s3>>0]
eor $s2,$t2,$s2,lsl#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
mov $s3,$s3,lsr#24
ldrb $i2,[$tbl,$i2,lsl#2] @ Te4[s3>>8]
ldrb $i3,[$tbl,$i3,lsl#2] @ Te4[s3>>16]
eor $s0,$i1,$s0,lsl#8
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
ldr $i1,[$key,#0]
ldrb $s3,[$tbl,$s3,lsl#2] @ Te4[s3>>24]
eor $s1,$s1,$i2,lsl#8
ldr $t1,[$key,#4]
eor $s2,$s2,$i3,lsl#16
@ -398,6 +431,7 @@ AES_set_encrypt_key:
mov lr,r1 @ bits
mov $key,r2 @ key
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@ -430,6 +464,22 @@ AES_set_encrypt_key:
orr $s3,$s3,$t3,lsl#24
str $s2,[$key,#-8]
str $s3,[$key,#-4]
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$key],#16
str $s1,[$key,#-12]
str $s2,[$key,#-8]
str $s3,[$key,#-4]
#endif
teq lr,#128
bne .Lnot128
@ -466,6 +516,7 @@ AES_set_encrypt_key:
b .Ldone
.Lnot128:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#19]
ldrb $t1,[$rounds,#18]
ldrb $t2,[$rounds,#17]
@ -482,6 +533,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#16]
ldr $i3,[$rounds,#20]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
teq lr,#192
bne .Lnot192
@ -526,6 +587,7 @@ AES_set_encrypt_key:
b .L192_loop
.Lnot192:
#if __ARM_ARCH__<7
ldrb $i2,[$rounds,#27]
ldrb $t1,[$rounds,#26]
ldrb $t2,[$rounds,#25]
@ -542,6 +604,16 @@ AES_set_encrypt_key:
str $i2,[$key],#8
orr $i3,$i3,$t3,lsl#24
str $i3,[$key,#-4]
#else
ldr $i2,[$rounds,#24]
ldr $i3,[$rounds,#28]
#ifdef __ARMEL__
rev $i2,$i2
rev $i3,$i3
#endif
str $i2,[$key],#8
str $i3,[$key,#-4]
#endif
mov $rounds,#14
str $rounds,[$key,#240-32]
@ -692,10 +764,14 @@ $code.=<<___;
bne .Lmix
mov r0,#0
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_set_decrypt_key,.-AES_set_decrypt_key
.type AES_Td,%object
@ -811,7 +887,7 @@ AES_decrypt:
mov $rounds,r0 @ inp
mov $key,r2
sub $tbl,r3,#AES_decrypt-AES_Td @ Td
#if __ARM_ARCH__<7
ldrb $s0,[$rounds,#3] @ load input data in endian-neutral
ldrb $t1,[$rounds,#2] @ manner...
ldrb $t2,[$rounds,#1]
@ -840,10 +916,33 @@ AES_decrypt:
orr $s3,$s3,$t1,lsl#8
orr $s3,$s3,$t2,lsl#16
orr $s3,$s3,$t3,lsl#24
#else
ldr $s0,[$rounds,#0]
ldr $s1,[$rounds,#4]
ldr $s2,[$rounds,#8]
ldr $s3,[$rounds,#12]
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
#endif
bl _armv4_AES_decrypt
ldr $rounds,[sp],#4 @ pop out
#if __ARM_ARCH__>=7
#ifdef __ARMEL__
rev $s0,$s0
rev $s1,$s1
rev $s2,$s2
rev $s3,$s3
#endif
str $s0,[$rounds,#0]
str $s1,[$rounds,#4]
str $s2,[$rounds,#8]
str $s3,[$rounds,#12]
#else
mov $t1,$s0,lsr#24 @ write output in endian-neutral
mov $t2,$s0,lsr#16 @ manner...
mov $t3,$s0,lsr#8
@ -872,11 +971,15 @@ AES_decrypt:
strb $t2,[$rounds,#13]
strb $t3,[$rounds,#14]
strb $s3,[$rounds,#15]
#endif
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size AES_decrypt,.-AES_decrypt
.type _armv4_AES_decrypt,%function
@ -916,11 +1019,11 @@ _armv4_AES_decrypt:
and $i2,lr,$s2 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s2,lsr#16
eor $s1,$s1,$t1,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td2[s2>>8]
eor $s1,$s1,$t1,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
mov $s2,$s2,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td3[s2>>0]
ldr $i3,[$tbl,$i3,lsl#2] @ Td1[s2>>16]
eor $s0,$s0,$i1,ror#16
ldr $s2,[$tbl,$s2,lsl#2] @ Td0[s2>>24]
@ -929,22 +1032,22 @@ _armv4_AES_decrypt:
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$i3,$t3,ror#8
and $i3,lr,$s3 @ i2
eor $s2,$s2,$t2,ror#8
ldr $i1,[$tbl,$i1,lsl#2] @ Td1[s3>>16]
eor $s2,$s2,$t2,ror#8
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
mov $s3,$s3,lsr#24
ldr $i2,[$tbl,$i2,lsl#2] @ Td2[s3>>8]
ldr $i3,[$tbl,$i3,lsl#2] @ Td3[s3>>0]
eor $s0,$s0,$i1,ror#8
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s1,$s1,$i2,ror#16
eor $s2,$s2,$i3,ror#24
ldr $i1,[$key],#16
eor $s3,$s3,$t3,ror#8
eor $s1,$s1,$i2,ror#16
ldr $s3,[$tbl,$s3,lsl#2] @ Td0[s3>>24]
eor $s2,$s2,$i3,ror#24
ldr $t1,[$key,#-12]
ldr $t2,[$key,#-8]
eor $s0,$s0,$i1
ldr $t2,[$key,#-8]
eor $s3,$s3,$t3,ror#8
ldr $t3,[$key,#-4]
and $i1,lr,$s0,lsr#16
eor $s1,$s1,$t1
@ -985,11 +1088,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s2,lsr#8 @ i0
eor $t2,$t2,$i2,lsl#8
and $i2,lr,$s2 @ i1
eor $t3,$t3,$i3,lsl#8
ldrb $i1,[$tbl,$i1] @ Td4[s2>>8]
eor $t3,$t3,$i3,lsl#8
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
and $i3,lr,$s2,lsr#16
ldrb $i2,[$tbl,$i2] @ Td4[s2>>0]
ldrb $s2,[$tbl,$s2,lsr#24] @ Td4[s2>>24]
eor $s0,$s0,$i1,lsl#8
ldrb $i3,[$tbl,$i3] @ Td4[s2>>16]
@ -997,11 +1100,11 @@ _armv4_AES_decrypt:
and $i1,lr,$s3,lsr#16 @ i0
eor $s2,$t2,$s2,lsl#16
and $i2,lr,$s3,lsr#8 @ i1
eor $t3,$t3,$i3,lsl#16
ldrb $i1,[$tbl,$i1] @ Td4[s3>>16]
eor $t3,$t3,$i3,lsl#16
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
and $i3,lr,$s3 @ i2
ldrb $i2,[$tbl,$i2] @ Td4[s3>>8]
ldrb $i3,[$tbl,$i3] @ Td4[s3>>0]
ldrb $s3,[$tbl,$s3,lsr#24] @ Td4[s3>>24]
eor $s0,$s0,$i1,lsl#16

View File

@ -57,7 +57,9 @@ ghash-parisc.s: asm/ghash-parisc.pl
$(PERL) asm/ghash-parisc.pl $(PERLASM_SCHEME) $@
# GNU make "catch all"
ghash-%.s: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
ghash-%.S: asm/ghash-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
ghash-armv4.o: ghash-armv4.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

View File

@ -25,6 +25,18 @@
# Cortex A8 core and ~25 cycles per processed byte (which was observed
# to be ~3 times faster than gcc-generated code:-)
#
# February 2011
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Cortex A8 core and ~23.5 cycles per byte.
#
# March 2011
#
# Add NEON implementation featuring polynomial multiplication, i.e. no
# lookup tables involved. On Cortex A8 it was measured to process one
# byte in 15 cycles or 55% faster than integer-only code.
# ====================================================================
# Note about "528B" variant. In ARM case it makes lesser sense to
# implement it for following reasons:
#
@ -52,6 +64,7 @@ $Xi="r0"; # argument block
$Htbl="r1";
$inp="r2";
$len="r3";
$Zll="r4"; # variables
$Zlh="r5";
$Zhl="r6";
@ -72,8 +85,13 @@ sub Zsmash() {
my $i=12;
my @args=@_;
for ($Zll,$Zlh,$Zhl,$Zhh) {
# can be reduced to single "str $_,[$Xi,$i]" on big-endian platforms
$code.=<<___;
#if __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $_,$_
str $_,[$Xi,#$i]
#elif defined(__ARMEB__)
str $_,[$Xi,#$i]
#else
mov $Tlh,$_,lsr#8
strb $_,[$Xi,#$i+3]
mov $Thl,$_,lsr#16
@ -81,6 +99,7 @@ sub Zsmash() {
mov $Thh,$_,lsr#24
strb $Thl,[$Xi,#$i+1]
strb $Thh,[$Xi,#$i]
#endif
___
$code.="\t".shift(@args)."\n";
$i-=4;
@ -88,6 +107,8 @@ ___
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
@ -149,41 +170,41 @@ gcm_ghash_4bit:
and $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16
.Loop:
.Linner:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[sp,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$inp,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
ldrplb $nlo,[$inp,$cnt]
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[sp,$nhi] @ rem_4bit[rem]
ldrplb $Tll,[$Xi,$cnt]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrplb $nhi,[$Xi,$cnt]
ldrh $Tlh,[sp,$nhi]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
eorpl $nlo,$nlo,$nhi
eorpl $nlo,$nlo,$Tll
eor $Zhh,$Thh,$Zhh,lsr#4
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop
eor $Zhh,$Zhh,$Tlh,lsl#16 @ ^= rem_4bit[rem]
bpl .Linner
ldr $len,[sp,#32] @ re-load $len/end
add $inp,$inp,#16
@ -194,10 +215,14 @@ $code.=<<___;
bne .Louter
add sp,sp,#36
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_ghash_4bit,.-gcm_ghash_4bit
.global gcm_gmult_4bit
@ -231,31 +256,31 @@ gcm_gmult_4bit:
eor $Zhh,$Zhh,$Tll,lsl#16
and $nlo,$nlo,#0x0f
.Loop2:
.Loop:
add $Thh,$Htbl,$nlo,lsl#4
subs $cnt,$cnt,#1
and $nlo,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
subs $cnt,$cnt,#1
add $nlo,$nlo,$nlo
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nlo]
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
eor $Zlh,$Zlh,$Zhl,lsl#28
ldrh $Tll,[$rem_4bit,$nlo] @ rem_4bit[rem]
eor $Zhl,$Thl,$Zhl,lsr#4
ldrplb $nlo,[$Xi,$cnt]
eor $Zhl,$Zhl,$Zhh,lsl#28
eor $Zhh,$Thh,$Zhh,lsr#4
ldrplb $nlo,[$Xi,$cnt]
add $Thh,$Htbl,$nhi
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
and $nhi,$Zll,#0xf @ rem
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
add $nhi,$nhi,$nhi
ldmia $Thh,{$Tll-$Thh} @ load Htbl[nhi]
eor $Zll,$Tll,$Zll,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zll,$Zll,$Zlh,lsl#28
eor $Zlh,$Tlh,$Zlh,lsr#4
ldrh $Tll,[$rem_4bit,$nhi] @ rem_4bit[rem]
eor $Zlh,$Zlh,$Zhl,lsl#28
eor $Zhl,$Thl,$Zhl,lsr#4
eor $Zhl,$Zhl,$Zhh,lsl#28
@ -263,16 +288,138 @@ gcm_gmult_4bit:
andpl $nhi,$nlo,#0xf0
andpl $nlo,$nlo,#0x0f
eor $Zhh,$Zhh,$Tll,lsl#16 @ ^= rem_4bit[rem]
bpl .Loop2
bpl .Loop
___
&Zsmash();
$code.=<<___;
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size gcm_gmult_4bit,.-gcm_gmult_4bit
.asciz "GHASH for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
___
{
my $cnt=$Htbl; # $Htbl is used once in the very beginning
my ($Hhi, $Hlo, $Zo, $T, $xi, $mod) = map("d$_",(0..7));
my ($Qhi, $Qlo, $Z, $R, $zero, $Qpost, $IN) = map("q$_",(8..15));
# Z:Zo keeps 128-bit result shifted by 1 to the right, with bottom bit
# in Zo. Or should I say "top bit", because GHASH is specified in
# reverse bit order? Otherwise straightforward 128-bt H by one input
# byte multiplication and modulo-reduction, times 16.
sub Dlo() { shift=~m|q([1]?[0-9])|?"d".($1*2):""; }
sub Dhi() { shift=~m|q([1]?[0-9])|?"d".($1*2+1):""; }
sub Q() { shift=~m|d([1-3]?[02468])|?"q".($1/2):""; }
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.global gcm_gmult_neon
.type gcm_gmult_neon,%function
.align 4
gcm_gmult_neon:
sub $Htbl,#16 @ point at H in GCM128_CTX
vld1.64 `&Dhi("$IN")`,[$Xi,:64]!@ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$IN")`,[$Xi,:64]!
vshr.u64 $mod,#32
vldmia $Htbl,{$Hhi-$Hlo} @ load H
veor $zero,$zero
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Qpost,$Qpost
veor $R,$R
mov $cnt,#16
veor $Z,$Z
mov $len,#16
veor $Zo,$Zo
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
b .Linner_neon
.size gcm_gmult_neon,.-gcm_gmult_neon
.global gcm_ghash_neon
.type gcm_ghash_neon,%function
.align 4
gcm_ghash_neon:
vld1.64 `&Dhi("$Z")`,[$Xi,:64]! @ load Xi
vmov.i32 $mod,#0xe1 @ our irreducible polynomial
vld1.64 `&Dlo("$Z")`,[$Xi,:64]!
vshr.u64 $mod,#32
vldmia $Xi,{$Hhi-$Hlo} @ load H
veor $zero,$zero
nop
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
.Louter_neon:
vld1.64 `&Dhi($IN)`,[$inp]! @ load inp
veor $Qpost,$Qpost
vld1.64 `&Dlo($IN)`,[$inp]!
veor $R,$R
mov $cnt,#16
#ifdef __ARMEL__
vrev64.8 $IN,$IN
#endif
veor $Zo,$Zo
veor $IN,$Z @ inp^=Xi
veor $Z,$Z
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
.Linner_neon:
subs $cnt,$cnt,#1
vmull.p8 $Qlo,$Hlo,$xi @ H.lo·Xi[i]
vmull.p8 $Qhi,$Hhi,$xi @ H.hi·Xi[i]
vext.8 $IN,$zero,#1 @ IN>>=8
veor $Z,$Qpost @ modulo-scheduled part
vshl.i64 `&Dlo("$R")`,#48
vdup.8 $xi,`&Dlo("$IN")`[0] @ broadcast lowest byte
veor $T,`&Dlo("$Qlo")`,`&Dlo("$Z")`
veor `&Dhi("$Z")`,`&Dlo("$R")`
vuzp.8 $Qlo,$Qhi
vsli.8 $Zo,$T,#1 @ compose the "carry" byte
vext.8 $Z,$zero,#1 @ Z>>=8
vmull.p8 $R,$Zo,$mod @ "carry"·0xe1
vshr.u8 $Zo,$T,#7 @ save Z's bottom bit
vext.8 $Qpost,$Qlo,$zero,#1 @ Qlo>>=8
veor $Z,$Qhi
bne .Linner_neon
veor $Z,$Qpost @ modulo-scheduled artefact
vshl.i64 `&Dlo("$R")`,#48
veor `&Dhi("$Z")`,`&Dlo("$R")`
@ finalization, normalize Z:Zo
vand $Zo,$mod @ suffices to mask the bit
vshr.u64 `&Dhi(&Q("$Zo"))`,`&Dlo("$Z")`,#63
vshl.i64 $Z,#1
subs $len,#16
vorr $Z,`&Q("$Zo")` @ Z=Z:Zo<<1
bne .Louter_neon
#ifdef __ARMEL__
vrev64.8 $Z,$Z
#endif
sub $Xi,#16
vst1.64 `&Dhi("$Z")`,[$Xi,:64]! @ write out Xi
vst1.64 `&Dlo("$Z")`,[$Xi,:64]
bx lr
.size gcm_ghash_neon,.-gcm_ghash_neon
#endif
___
}
$code.=<<___;
.asciz "GHASH for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
___

View File

@ -642,27 +642,38 @@ static void gcm_gmult_1bit(u64 Xi[2],const u64 H[2])
#endif
#if TABLE_BITS==4 && defined(GHASH_ASM) && !defined(I386_ONLY) && \
#if TABLE_BITS==4 && defined(GHASH_ASM)
# if !defined(I386_ONLY) && \
(defined(__i386) || defined(__i386__) || \
defined(__x86_64) || defined(__x86_64__) || \
defined(_M_IX86) || defined(_M_AMD64) || defined(_M_X64))
# define GHASH_ASM_X86_OR_64
# define GHASH_ASM_X86_OR_64
# define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_ia32cap_P[2];
void gcm_init_clmul(u128 Htable[16],const u64 Xi[2]);
void gcm_gmult_clmul(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_clmul(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define GHASH_ASM_X86
# if defined(__i386) || defined(__i386__) || defined(_M_IX86)
# define GHASH_ASM_X86
void gcm_gmult_4bit_mmx(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_mmx(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
void gcm_gmult_4bit_x86(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_4bit_x86(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# endif
# elif defined(__arm__) || defined(__arm)
# include "arm_arch.h"
# if __ARM_ARCH__>=7
# define GHASH_ASM_ARM
# define GCM_FUNCREF_4BIT
extern unsigned int OPENSSL_armcap;
# define GCM_FUNCREF_4BIT
void gcm_gmult_neon(u64 Xi[2],const u128 Htable[16]);
void gcm_ghash_neon(u64 Xi[2],const u128 Htable[16],const u8 *inp,size_t len);
# endif
# endif
#endif
void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
@ -715,6 +726,15 @@ void CRYPTO_gcm128_init(GCM128_CONTEXT *ctx,void *key,block128_f block)
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
# endif
# elif defined(GHASH_ASM_ARM)
if (OPENSSL_armcap & 1) {
ctx->gmult = gcm_gmult_neon;
ctx->ghash = gcm_ghash_neon;
} else {
gcm_init_4bit(ctx->Htable,ctx->H.u);
ctx->gmult = gcm_gmult_4bit;
ctx->ghash = gcm_ghash_4bit;
}
# else
gcm_init_4bit(ctx->Htable,ctx->H.u);
# endif

View File

@ -56,8 +56,8 @@ sha256-ia64.s: asm/sha512-ia64.pl
sha512-ia64.s: asm/sha512-ia64.pl
(cd asm; $(PERL) sha512-ia64.pl ../$@ $(CFLAGS))
sha256-armv4.s: asm/sha256-armv4.pl
$(PERL) $< $@
sha256-armv4.S: asm/sha256-armv4.pl
$(PERL) $< $(PERLASM_SCHEME) $@
sha1-alpha.s: asm/sha1-alpha.pl
$(PERL) $< | $(CC) -E - | tee $@ > /dev/null
@ -83,9 +83,13 @@ sha256-mips.s: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME)
sha512-mips.s: asm/sha512-mips.pl; $(PERL) asm/sha512-mips.pl $(PERLASM_SCHEME) $@
# GNU make "catch all"
sha1-%.s: asm/sha1-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha256-%.s: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha512-%.s: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha1-%.S: asm/sha1-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha256-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha512-%.S: asm/sha512-%.pl; $(PERL) $< $(PERLASM_SCHEME) $@
sha1-armv4-large.o: sha1-armv4-large.S
sha256-armv4.o: sha256-armv4.S
sha512-armv4.o: sha512-armv4.S
files:
$(PERL) $(TOP)/util/files.pl Makefile >> $(TOP)/MINFO

View File

@ -47,6 +47,10 @@
# Cortex A8 core and in absolute terms ~870 cycles per input block
# [or 13.6 cycles per byte].
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 10%
# improvement on Cortex A8 core and 12.2 cycles per byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
@ -76,31 +80,41 @@ $code.=<<___;
add $e,$K,$e,ror#2 @ E+=K_xx_xx
ldr $t3,[$Xi,#2*4]
eor $t0,$t0,$t1
eor $t2,$t2,$t3
eor $t2,$t2,$t3 @ 1 cycle stall
eor $t1,$c,$d @ F_xx_xx
mov $t0,$t0,ror#31
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
eor $t0,$t0,$t2,ror#31
str $t0,[$Xi,#-4]!
$opt1 @ F_xx_xx
$opt2 @ F_xx_xx
add $e,$e,$t0 @ E+=X[i]
str $t0,[$Xi,#-4]!
___
}
sub BODY_00_15 {
my ($a,$b,$c,$d,$e)=@_;
$code.=<<___;
#if __ARM_ARCH__<7
ldrb $t1,[$inp,#2]
ldrb $t0,[$inp,#3]
ldrb $t2,[$inp,#1]
add $e,$K,$e,ror#2 @ E+=K_00_19
ldrb $t3,[$inp],#4
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t1,lsl#8
eor $t1,$c,$d @ F_xx_xx
orr $t0,$t0,$t2,lsl#16
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
orr $t0,$t0,$t3,lsl#24
#else
ldr $t0,[$inp],#4 @ handles unaligned
add $e,$K,$e,ror#2 @ E+=K_00_19
eor $t1,$c,$d @ F_xx_xx
add $e,$e,$a,ror#27 @ E+=ROR(A,27)
#ifdef __ARMEL__
rev $t0,$t0 @ byte swap
#endif
#endif
and $t1,$b,$t1,ror#2
add $e,$e,$t0 @ E+=X[i]
eor $t1,$t1,$d,ror#2 @ F_00_19(B,C,D)
@ -136,6 +150,8 @@ ___
}
$code=<<___;
#include "arm_arch.h"
.text
.global sha1_block_data_order
@ -209,10 +225,14 @@ $code.=<<___;
teq $inp,$len
bne .Lloop @ [+18], total 1307
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.align 2
.LK_00_19: .word 0x5a827999
.LK_20_39: .word 0x6ed9eba1

View File

@ -18,11 +18,16 @@
# Rescheduling for dual-issue pipeline resulted in 22% improvement on
# Cortex A8 core and ~20 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 16%
# improvement on Cortex A8 core and ~17 cycles per processed byte.
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0"; $t0="r0";
$inp="r1";
$inp="r1"; $t3="r1";
$len="r2"; $t1="r2";
$T1="r3";
$A="r4";
@ -46,6 +51,9 @@ sub BODY_00_15 {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___ if ($i<16);
#if __ARM_ARCH__>=7
ldr $T1,[$inp],#4
#else
ldrb $T1,[$inp,#3] @ $i
ldrb $t2,[$inp,#2]
ldrb $t1,[$inp,#1]
@ -53,16 +61,24 @@ $code.=<<___ if ($i<16);
orr $T1,$T1,$t2,lsl#8
orr $T1,$T1,$t1,lsl#16
orr $T1,$T1,$t0,lsl#24
`"str $inp,[sp,#17*4]" if ($i==15)`
#endif
___
$code.=<<___;
ldr $t2,[$Ktbl],#4 @ *K256++
mov $t0,$e,ror#$Sigma1[0]
str $T1,[sp,#`$i%16`*4]
ldr $t2,[$Ktbl],#4 @ *K256++
eor $t0,$t0,$e,ror#$Sigma1[1]
eor $t1,$f,$g
#if $i>=16
add $T1,$T1,$t3 @ from BODY_16_xx
#elif __ARM_ARCH__>=7 && defined(__ARMEL__)
rev $T1,$T1
#endif
#if $i==15
str $inp,[sp,#17*4] @ leave room for $t3
#endif
eor $t0,$t0,$e,ror#$Sigma1[2] @ Sigma1(e)
and $t1,$t1,$e
str $T1,[sp,#`$i%16`*4]
add $T1,$T1,$t0
eor $t1,$t1,$g @ Ch(e,f,g)
add $T1,$T1,$h
@ -71,6 +87,9 @@ $code.=<<___;
eor $h,$h,$a,ror#$Sigma0[1]
add $T1,$T1,$t2
eor $h,$h,$a,ror#$Sigma0[2] @ Sigma0(a)
#if $i>=15
ldr $t3,[sp,#`($i+2)%16`*4] @ from BODY_16_xx
#endif
orr $t0,$a,$b
and $t1,$a,$b
and $t0,$t0,$c
@ -85,24 +104,26 @@ sub BODY_16_XX {
my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
$code.=<<___;
ldr $t1,[sp,#`($i+1)%16`*4] @ $i
@ ldr $t3,[sp,#`($i+1)%16`*4] @ $i
ldr $t2,[sp,#`($i+14)%16`*4]
mov $t0,$t3,ror#$sigma0[0]
ldr $T1,[sp,#`($i+0)%16`*4]
mov $t0,$t1,ror#$sigma0[0]
ldr $inp,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t1,ror#$sigma0[1]
eor $t0,$t0,$t1,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t1,$t2,ror#$sigma1[0]
eor $t0,$t0,$t3,ror#$sigma0[1]
ldr $t1,[sp,#`($i+9)%16`*4]
eor $t0,$t0,$t3,lsr#$sigma0[2] @ sigma0(X[i+1])
mov $t3,$t2,ror#$sigma1[0]
add $T1,$T1,$t0
eor $t1,$t1,$t2,ror#$sigma1[1]
add $T1,$T1,$inp
eor $t1,$t1,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
eor $t3,$t3,$t2,ror#$sigma1[1]
add $T1,$T1,$t1
eor $t3,$t3,$t2,lsr#$sigma1[2] @ sigma1(X[i+14])
@ add $T1,$T1,$t3
___
&BODY_00_15(@_);
}
$code=<<___;
#include "arm_arch.h"
.text
.code 32
@ -132,7 +153,7 @@ K256:
sha256_block_data_order:
sub r3,pc,#8 @ sha256_block_data_order
add $len,$inp,$len,lsl#6 @ len to point at the end of inp
stmdb sp!,{$ctx,$inp,$len,r4-r12,lr}
stmdb sp!,{$ctx,$inp,$len,r4-r11,lr}
ldmia $ctx,{$A,$B,$C,$D,$E,$F,$G,$H}
sub $Ktbl,r3,#256 @ K256
sub sp,sp,#16*4 @ alloca(X[16])
@ -171,10 +192,14 @@ $code.=<<___;
bne .Loop
add sp,sp,#`16+3`*4 @ destroy frame
ldmia sp!,{r4-r12,lr}
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r11,pc}
#else
ldmia sp!,{r4-r11,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
#endif
.size sha256_block_data_order,.-sha256_block_data_order
.asciz "SHA256 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
.align 2

View File

@ -18,22 +18,33 @@
# Rescheduling for dual-issue pipeline resulted in 6% improvement on
# Cortex A8 core and ~40 cycles per processed byte.
# February 2011.
#
# Profiler-assisted and platform-specific optimization resulted in 7%
# improvement on Coxtex A8 core and ~38 cycles per byte.
# March 2011.
#
# Add NEON implementation. On Cortex A8 it was measured to process
# one byte in 25.5 cycles or 47% faster than integer-only code.
# Byte order [in]dependence. =========================================
#
# Caller is expected to maintain specific *dword* order in h[0-7],
# namely with most significant dword at *lower* address, which is
# reflected in below two parameters. *Byte* order within these dwords
# in turn is whatever *native* byte order on current platform.
$hi=0;
$lo=4;
# Originally caller was expected to maintain specific *dword* order in
# h[0-7], namely with most significant dword at *lower* address, which
# was reflected in below two parameters as 0 and 4. Now caller is
# expected to maintain native byte order for whole 64-bit values.
$hi="HI";
$lo="LO";
# ====================================================================
while (($output=shift) && ($output!~/^\w[\w\-]*\.\w+$/)) {}
open STDOUT,">$output";
$ctx="r0";
$ctx="r0"; # parameter block
$inp="r1";
$len="r2";
$Tlo="r3";
$Thi="r4";
$Alo="r5";
@ -61,15 +72,17 @@ $Xoff=8*8;
sub BODY_00_15() {
my $magic = shift;
$code.=<<___;
ldr $t2,[sp,#$Hoff+0] @ h.lo
ldr $t3,[sp,#$Hoff+4] @ h.hi
@ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41))
@ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23
@ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23
mov $t0,$Elo,lsr#14
str $Tlo,[sp,#$Xoff+0]
mov $t1,$Ehi,lsr#14
str $Thi,[sp,#$Xoff+4]
eor $t0,$t0,$Ehi,lsl#18
ldr $t2,[sp,#$Hoff+0] @ h.lo
eor $t1,$t1,$Elo,lsl#18
ldr $t3,[sp,#$Hoff+4] @ h.hi
eor $t0,$t0,$Elo,lsr#18
eor $t1,$t1,$Ehi,lsr#18
eor $t0,$t0,$Ehi,lsl#14
@ -96,25 +109,24 @@ $code.=<<___;
and $t1,$t1,$Ehi
str $Ahi,[sp,#$Aoff+4]
eor $t0,$t0,$t2
ldr $t2,[$Ktbl,#4] @ K[i].lo
ldr $t2,[$Ktbl,#$lo] @ K[i].lo
eor $t1,$t1,$t3 @ Ch(e,f,g)
ldr $t3,[$Ktbl,#0] @ K[i].hi
ldr $t3,[$Ktbl,#$hi] @ K[i].hi
adds $Tlo,$Tlo,$t0
ldr $Elo,[sp,#$Doff+0] @ d.lo
adc $Thi,$Thi,$t1 @ T += Ch(e,f,g)
ldr $Ehi,[sp,#$Doff+4] @ d.hi
adds $Tlo,$Tlo,$t2
and $t0,$t2,#0xff
adc $Thi,$Thi,$t3 @ T += K[i]
adds $Elo,$Elo,$Tlo
adc $Ehi,$Ehi,$Thi @ d += T
and $t0,$t2,#0xff
teq $t0,#$magic
orreq $Ktbl,$Ktbl,#1
ldr $t2,[sp,#$Boff+0] @ b.lo
adc $Ehi,$Ehi,$Thi @ d += T
teq $t0,#$magic
ldr $t3,[sp,#$Coff+0] @ c.lo
orreq $Ktbl,$Ktbl,#1
@ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39))
@ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25
@ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25
@ -131,80 +143,100 @@ $code.=<<___;
eor $t0,$t0,$Alo,lsl#25
eor $t1,$t1,$Ahi,lsl#25 @ Sigma0(a)
adds $Tlo,$Tlo,$t0
and $t0,$Alo,$t2
adc $Thi,$Thi,$t1 @ T += Sigma0(a)
and $t0,$Alo,$t2
orr $Alo,$Alo,$t2
ldr $t1,[sp,#$Boff+4] @ b.hi
orr $Alo,$Alo,$t2
ldr $t2,[sp,#$Coff+4] @ c.hi
and $Alo,$Alo,$t3
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $t3,$Ahi,$t1
orr $Ahi,$Ahi,$t1
orr $Alo,$Alo,$t0 @ Maj(a,b,c).lo
and $Ahi,$Ahi,$t2
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
adds $Alo,$Alo,$Tlo
adc $Ahi,$Ahi,$Thi @ h += T
orr $Ahi,$Ahi,$t3 @ Maj(a,b,c).hi
sub sp,sp,#8
adc $Ahi,$Ahi,$Thi @ h += T
tst $Ktbl,#1
add $Ktbl,$Ktbl,#8
___
}
$code=<<___;
#include "arm_arch.h"
#ifdef __ARMEL__
# define LO 0
# define HI 4
# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1
#else
# define HI 0
# define LO 4
# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1
#endif
.text
.code 32
.type K512,%object
.align 5
K512:
.word 0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd
.word 0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc
.word 0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019
.word 0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118
.word 0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe
.word 0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2
.word 0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1
.word 0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694
.word 0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3
.word 0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65
.word 0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483
.word 0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5
.word 0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210
.word 0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4
.word 0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725
.word 0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70
.word 0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926
.word 0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df
.word 0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8
.word 0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b
.word 0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001
.word 0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30
.word 0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910
.word 0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8
.word 0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53
.word 0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8
.word 0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb
.word 0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3
.word 0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60
.word 0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec
.word 0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9
.word 0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b
.word 0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207
.word 0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178
.word 0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6
.word 0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b
.word 0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493
.word 0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c
.word 0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a
.word 0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817
WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd)
WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc)
WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019)
WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118)
WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe)
WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2)
WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1)
WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694)
WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3)
WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65)
WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483)
WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5)
WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210)
WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4)
WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725)
WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70)
WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926)
WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df)
WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8)
WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b)
WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001)
WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30)
WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910)
WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8)
WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53)
WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8)
WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb)
WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3)
WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60)
WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec)
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9)
WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b)
WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207)
WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178)
WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6)
WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b)
WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493)
WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c)
WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a)
WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817)
.size K512,.-K512
.LOPENSSL_armcap:
.word OPENSSL_armcap-sha512_block_data_order
.skip 32-4
.global sha512_block_data_order
.type sha512_block_data_order,%function
sha512_block_data_order:
sub r3,pc,#8 @ sha512_block_data_order
add $len,$inp,$len,lsl#7 @ len to point at the end of inp
#if __ARM_ARCH__>=7
ldr r12,.LOPENSSL_armcap
ldr r12,[r3,r12] @ OPENSSL_armcap
tst r12,#1
bne .LNEON
#endif
stmdb sp!,{r4-r12,lr}
sub $Ktbl,r3,#640 @ K512
sub $Ktbl,r3,#672 @ K512
sub sp,sp,#9*8
ldr $Elo,[$ctx,#$Eoff+$lo]
@ -238,6 +270,7 @@ sha512_block_data_order:
str $Thi,[sp,#$Foff+4]
.L00_15:
#if __ARM_ARCH__<7
ldrb $Tlo,[$inp,#7]
ldrb $t0, [$inp,#6]
ldrb $t1, [$inp,#5]
@ -252,26 +285,30 @@ sha512_block_data_order:
orr $Thi,$Thi,$t3,lsl#8
orr $Thi,$Thi,$t0,lsl#16
orr $Thi,$Thi,$t1,lsl#24
str $Tlo,[sp,#$Xoff+0]
str $Thi,[sp,#$Xoff+4]
#else
ldr $Tlo,[$inp,#4]
ldr $Thi,[$inp],#8
#ifdef __ARMEL__
rev $Tlo,$Tlo
rev $Thi,$Thi
#endif
#endif
___
&BODY_00_15(0x94);
$code.=<<___;
tst $Ktbl,#1
beq .L00_15
bic $Ktbl,$Ktbl,#1
.L16_79:
ldr $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldr $t1,[sp,#`$Xoff+8*(16-1)`+4]
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
bic $Ktbl,$Ktbl,#1
.L16_79:
@ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
@ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25
@ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7
mov $Tlo,$t0,lsr#1
ldr $t2,[sp,#`$Xoff+8*(16-14)`+0]
mov $Thi,$t1,lsr#1
ldr $t3,[sp,#`$Xoff+8*(16-14)`+4]
eor $Tlo,$Tlo,$t1,lsl#31
eor $Thi,$Thi,$t0,lsl#31
eor $Tlo,$Tlo,$t0,lsr#8
@ -295,25 +332,24 @@ $code.=<<___;
eor $t1,$t1,$t3,lsl#3
eor $t0,$t0,$t2,lsr#6
eor $t1,$t1,$t3,lsr#6
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
eor $t0,$t0,$t3,lsl#26
ldr $t2,[sp,#`$Xoff+8*(16-9)`+0]
ldr $t3,[sp,#`$Xoff+8*(16-9)`+4]
adds $Tlo,$Tlo,$t0
ldr $t0,[sp,#`$Xoff+8*16`+0]
adc $Thi,$Thi,$t1
ldr $t0,[sp,#`$Xoff+8*16`+0]
ldr $t1,[sp,#`$Xoff+8*16`+4]
adds $Tlo,$Tlo,$t2
adc $Thi,$Thi,$t3
adds $Tlo,$Tlo,$t0
adc $Thi,$Thi,$t1
str $Tlo,[sp,#$Xoff+0]
str $Thi,[sp,#$Xoff+4]
___
&BODY_00_15(0x17);
$code.=<<___;
tst $Ktbl,#1
ldreq $t0,[sp,#`$Xoff+8*(16-1)`+0]
ldreq $t1,[sp,#`$Xoff+8*(16-1)`+4]
beq .L16_79
bic $Ktbl,$Ktbl,#1
@ -324,12 +360,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Boff+$lo]
ldr $t3, [$ctx,#$Boff+$hi]
adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Aoff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Aoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Boff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Boff+$hi]
ldr $Alo,[sp,#$Coff+0]
@ -341,12 +377,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Doff+$lo]
ldr $t3, [$ctx,#$Doff+$hi]
adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Coff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Coff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Doff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Doff+$hi]
ldr $Tlo,[sp,#$Foff+0]
@ -356,12 +392,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Foff+$lo]
ldr $t3, [$ctx,#$Foff+$hi]
adds $Elo,$Elo,$t0
adc $Ehi,$Ehi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $Elo,[$ctx,#$Eoff+$lo]
adc $Ehi,$Ehi,$t1
str $Ehi,[$ctx,#$Eoff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Foff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Foff+$hi]
ldr $Alo,[sp,#$Goff+0]
@ -373,12 +409,12 @@ $code.=<<___;
ldr $t2, [$ctx,#$Hoff+$lo]
ldr $t3, [$ctx,#$Hoff+$hi]
adds $t0,$Alo,$t0
adc $t1,$Ahi,$t1
adds $t2,$Tlo,$t2
adc $t3,$Thi,$t3
str $t0, [$ctx,#$Goff+$lo]
adc $t1,$Ahi,$t1
str $t1, [$ctx,#$Goff+$hi]
adds $t2,$Tlo,$t2
str $t2, [$ctx,#$Hoff+$lo]
adc $t3,$Thi,$t3
str $t3, [$ctx,#$Hoff+$hi]
add sp,sp,#640
@ -388,13 +424,156 @@ $code.=<<___;
bne .Loop
add sp,sp,#8*9 @ destroy frame
#if __ARM_ARCH__>=5
ldmia sp!,{r4-r12,pc}
#else
ldmia sp!,{r4-r12,lr}
tst lr,#1
moveq pc,lr @ be binary compatible with V4, yet
bx lr @ interoperable with Thumb ISA:-)
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4, CRYPTOGAMS by <appro\@openssl.org>"
#endif
___
{
my @Sigma0=(28,34,39);
my @Sigma1=(14,18,41);
my @sigma0=(1, 8, 7);
my @sigma1=(19,61,6);
my $Ktbl="r3";
my $cnt="r12"; # volatile register known as ip, intra-procedure-call scratch
my @X=map("d$_",(0..15));
my @V=($A,$B,$C,$D,$E,$F,$G,$H)=map("d$_",(16..23));
sub NEON_00_15() {
my $i=shift;
my ($a,$b,$c,$d,$e,$f,$g,$h)=@_;
my ($t0,$t1,$t2,$T1,$K,$Ch,$Maj)=map("d$_",(24..31)); # temps
$code.=<<___ if ($i<16 || $i&1);
vshr.u64 $t0,$e,#@Sigma1[0] @ $i
#if $i<16
vld1.64 {@X[$i%16]},[$inp]! @ handles unaligned
#endif
vshr.u64 $t1,$e,#@Sigma1[1]
vshr.u64 $t2,$e,#@Sigma1[2]
___
$code.=<<___;
vld1.64 {$K},[$Ktbl,:64]! @ K[i++]
vsli.64 $t0,$e,#`64-@Sigma1[0]`
vsli.64 $t1,$e,#`64-@Sigma1[1]`
vsli.64 $t2,$e,#`64-@Sigma1[2]`
#if $i<16 && defined(__ARMEL__)
vrev64.8 @X[$i],@X[$i]
#endif
vadd.i64 $T1,$K,$h
veor $Ch,$f,$g
veor $t0,$t1
vand $Ch,$e
veor $t0,$t2 @ Sigma1(e)
veor $Ch,$g @ Ch(e,f,g)
vadd.i64 $T1,$t0
vshr.u64 $t0,$a,#@Sigma0[0]
vadd.i64 $T1,$Ch
vshr.u64 $t1,$a,#@Sigma0[1]
vshr.u64 $t2,$a,#@Sigma0[2]
vsli.64 $t0,$a,#`64-@Sigma0[0]`
vsli.64 $t1,$a,#`64-@Sigma0[1]`
vsli.64 $t2,$a,#`64-@Sigma0[2]`
vadd.i64 $T1,@X[$i%16]
vorr $Maj,$a,$c
vand $Ch,$a,$c
veor $h,$t0,$t1
vand $Maj,$b
veor $h,$t2 @ Sigma0(a)
vorr $Maj,$Ch @ Maj(a,b,c)
vadd.i64 $h,$T1
vadd.i64 $d,$T1
vadd.i64 $h,$Maj
___
}
sub NEON_16_79() {
my $i=shift;
if ($i&1) { &NEON_00_15($i,@_); return; }
# 2x-vectorized, therefore runs every 2nd round
my @X=map("q$_",(0..7)); # view @X as 128-bit vector
my ($t0,$t1,$s0,$s1) = map("q$_",(12..15)); # temps
my ($d0,$d1,$d2) = map("d$_",(24..26)); # temps from NEON_00_15
my $e=@_[4]; # $e from NEON_00_15
$i /= 2;
$code.=<<___;
vshr.u64 $t0,@X[($i+7)%8],#@sigma1[0]
vshr.u64 $t1,@X[($i+7)%8],#@sigma1[1]
vshr.u64 $s1,@X[($i+7)%8],#@sigma1[2]
vsli.64 $t0,@X[($i+7)%8],#`64-@sigma1[0]`
vext.8 $s0,@X[$i%8],@X[($i+1)%8],#8 @ X[i+1]
vsli.64 $t1,@X[($i+7)%8],#`64-@sigma1[1]`
veor $s1,$t0
vshr.u64 $t0,$s0,#@sigma0[0]
veor $s1,$t1 @ sigma1(X[i+14])
vshr.u64 $t1,$s0,#@sigma0[1]
vadd.i64 @X[$i%8],$s1
vshr.u64 $s1,$s0,#@sigma0[2]
vsli.64 $t0,$s0,#`64-@sigma0[0]`
vsli.64 $t1,$s0,#`64-@sigma0[1]`
vext.8 $s0,@X[($i+4)%8],@X[($i+5)%8],#8 @ X[i+9]
veor $s1,$t0
vshr.u64 $d0,$e,#@Sigma1[0] @ from NEON_00_15
vadd.i64 @X[$i%8],$s0
vshr.u64 $d1,$e,#@Sigma1[1] @ from NEON_00_15
veor $s1,$t1 @ sigma0(X[i+1])
vshr.u64 $d2,$e,#@Sigma1[2] @ from NEON_00_15
vadd.i64 @X[$i%8],$s1
___
&NEON_00_15(2*$i,@_);
}
$code.=<<___;
#if __ARM_ARCH__>=7
.fpu neon
.align 4
.LNEON:
dmb @ errata #451034 on early Cortex A8
vstmdb sp!,{d8-d15} @ ABI specification says so
sub $Ktbl,r3,#672 @ K512
vldmia $ctx,{$A-$H} @ load context
.Loop_neon:
___
for($i=0;$i<16;$i++) { &NEON_00_15($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
mov $cnt,#4
.L16_79_neon:
subs $cnt,#1
___
for(;$i<32;$i++) { &NEON_16_79($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
bne .L16_79_neon
vldmia $ctx,{d24-d31} @ load context to temp
vadd.i64 q8,q12 @ vectorized accumulate
vadd.i64 q9,q13
vadd.i64 q10,q14
vadd.i64 q11,q15
vstmia $ctx,{$A-$H} @ save context
teq $inp,$len
sub $Ktbl,#640 @ rewind K512
bne .Loop_neon
vldmia sp!,{d8-d15} @ epilogue
bx lr
#endif
___
}
$code.=<<___;
.size sha512_block_data_order,.-sha512_block_data_order
.asciz "SHA512 block transform for ARMv4/NEON, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
.comm OPENSSL_armcap,4,4
___
$code =~ s/\`([^\`]*)\`/eval $1/gem;

View File

@ -61,19 +61,6 @@ const char SHA512_version[]="SHA-512" OPENSSL_VERSION_PTEXT;
int SHA384_Init (SHA512_CTX *c)
{
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0xcbbb9d5d; h[1] = 0xc1059ed8;
h[2] = 0x629a292a; h[3] = 0x367cd507;
h[4] = 0x9159015a; h[5] = 0x3070dd17;
h[6] = 0x152fecd8; h[7] = 0xf70e5939;
h[8] = 0x67332667; h[9] = 0xffc00b31;
h[10] = 0x8eb44a87; h[11] = 0x68581511;
h[12] = 0xdb0c2e0d; h[13] = 0x64f98fa7;
h[14] = 0x47b5481d; h[15] = 0xbefa4fa4;
#else
c->h[0]=U64(0xcbbb9d5dc1059ed8);
c->h[1]=U64(0x629a292a367cd507);
c->h[2]=U64(0x9159015a3070dd17);
@ -82,7 +69,7 @@ int SHA384_Init (SHA512_CTX *c)
c->h[5]=U64(0x8eb44a8768581511);
c->h[6]=U64(0xdb0c2e0d64f98fa7);
c->h[7]=U64(0x47b5481dbefa4fa4);
#endif
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA384_DIGEST_LENGTH;
return 1;
@ -90,19 +77,6 @@ int SHA384_Init (SHA512_CTX *c)
int SHA512_Init (SHA512_CTX *c)
{
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* maintain dword order required by assembler module */
unsigned int *h = (unsigned int *)c->h;
h[0] = 0x6a09e667; h[1] = 0xf3bcc908;
h[2] = 0xbb67ae85; h[3] = 0x84caa73b;
h[4] = 0x3c6ef372; h[5] = 0xfe94f82b;
h[6] = 0xa54ff53a; h[7] = 0x5f1d36f1;
h[8] = 0x510e527f; h[9] = 0xade682d1;
h[10] = 0x9b05688c; h[11] = 0x2b3e6c1f;
h[12] = 0x1f83d9ab; h[13] = 0xfb41bd6b;
h[14] = 0x5be0cd19; h[15] = 0x137e2179;
#else
c->h[0]=U64(0x6a09e667f3bcc908);
c->h[1]=U64(0xbb67ae8584caa73b);
c->h[2]=U64(0x3c6ef372fe94f82b);
@ -111,7 +85,7 @@ int SHA512_Init (SHA512_CTX *c)
c->h[5]=U64(0x9b05688c2b3e6c1f);
c->h[6]=U64(0x1f83d9abfb41bd6b);
c->h[7]=U64(0x5be0cd19137e2179);
#endif
c->Nl=0; c->Nh=0;
c->num=0; c->md_len=SHA512_DIGEST_LENGTH;
return 1;
@ -160,24 +134,6 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
if (md==0) return 0;
#if defined(SHA512_ASM) && (defined(__arm__) || defined(__arm))
/* recall assembler dword order... */
n = c->md_len;
if (n == SHA384_DIGEST_LENGTH || n == SHA512_DIGEST_LENGTH)
{
unsigned int *h = (unsigned int *)c->h, t;
for (n/=4;n;n--)
{
t = *(h++);
*(md++) = (unsigned char)(t>>24);
*(md++) = (unsigned char)(t>>16);
*(md++) = (unsigned char)(t>>8);
*(md++) = (unsigned char)(t);
}
}
else return 0;
#else
switch (c->md_len)
{
/* Let compiler decide if it's appropriate to unroll... */
@ -214,7 +170,7 @@ int SHA512_Final (unsigned char *md, SHA512_CTX *c)
/* ... as well as make sure md_len is not abused. */
default: return 0;
}
#endif
return 1;
}