am 241194ca: Merge "[MIPS] Benchmark test for MIPS memset16/memset32"
# By Duane Sand # Via Duane Sand (1) and Gerrit Code Review (1) * commit '241194cad4b27ffd1cb574cb10c4c6941ef3882f': [MIPS] Benchmark test for MIPS memset16/memset32
This commit is contained in:
commit
90dd140294
8 changed files with 766 additions and 0 deletions
|
|
@ -154,3 +154,5 @@ LOCAL_SRC_FILES := str_parms.c hashmap.c memory.c
|
||||||
LOCAL_SHARED_LIBRARIES := liblog
|
LOCAL_SHARED_LIBRARIES := liblog
|
||||||
LOCAL_MODULE_TAGS := optional
|
LOCAL_MODULE_TAGS := optional
|
||||||
include $(BUILD_EXECUTABLE)
|
include $(BUILD_EXECUTABLE)
|
||||||
|
|
||||||
|
include $(call all-makefiles-under,$(LOCAL_PATH))
|
||||||
|
|
|
||||||
1
libcutils/tests/Android.mk
Normal file
1
libcutils/tests/Android.mk
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
include $(all-subdir-makefiles)
|
||||||
23
libcutils/tests/memset_mips/Android.mk
Normal file
23
libcutils/tests/memset_mips/Android.mk
Normal file
|
|
@ -0,0 +1,23 @@
|
||||||
|
# Copyright 2012 The Android Open Source Project
|
||||||
|
|
||||||
|
ifeq ($(TARGET_ARCH),mips)
|
||||||
|
|
||||||
|
LOCAL_PATH:= $(call my-dir)
|
||||||
|
include $(CLEAR_VARS)
|
||||||
|
|
||||||
|
LOCAL_SRC_FILES:= \
|
||||||
|
test_memset.c \
|
||||||
|
android_memset_dumb.S \
|
||||||
|
android_memset_test.S \
|
||||||
|
memset_cmips.S \
|
||||||
|
memset_omips.S
|
||||||
|
|
||||||
|
LOCAL_MODULE:= test_memset
|
||||||
|
|
||||||
|
LOCAL_FORCE_STATIC_EXECUTABLE := true
|
||||||
|
LOCAL_STATIC_LIBRARIES := libcutils libc
|
||||||
|
LOCAL_MODULE_TAGS := tests
|
||||||
|
|
||||||
|
include $(BUILD_EXECUTABLE)
|
||||||
|
|
||||||
|
endif
|
||||||
36
libcutils/tests/memset_mips/android_memset_dumb.S
Normal file
36
libcutils/tests/memset_mips/android_memset_dumb.S
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
.global android_memset16_dumb
|
||||||
|
.type android_memset16_dumb, @function
|
||||||
|
android_memset16_dumb:
|
||||||
|
.ent android_memset16_dumb
|
||||||
|
|
||||||
|
.set noreorder
|
||||||
|
beqz $a2,9f
|
||||||
|
srl $a2,1
|
||||||
|
|
||||||
|
1: sh $a1,($a0)
|
||||||
|
subu $a2,1
|
||||||
|
bnez $a2,1b
|
||||||
|
addu $a0,2
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
9: j $ra
|
||||||
|
.end android_memset16_dumb
|
||||||
|
.size android_memset16_dumb,.-android_memset16_dumb
|
||||||
|
|
||||||
|
.global android_memset32_dumb
|
||||||
|
.type android_memset32_dumb, @function
|
||||||
|
android_memset32_dumb:
|
||||||
|
.ent android_memset32_dumb
|
||||||
|
.set noreorder
|
||||||
|
beqz $a2,9f
|
||||||
|
srl $a2,2
|
||||||
|
|
||||||
|
1: sw $a1,($a0)
|
||||||
|
subu $a2,1
|
||||||
|
bnez $a2,1b
|
||||||
|
addu $a0,4
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
9: j $ra
|
||||||
|
.end android_memset32_dumb
|
||||||
|
.size android_memset32_dumb,.-android_memset32_dumb
|
||||||
152
libcutils/tests/memset_mips/android_memset_test.S
Normal file
152
libcutils/tests/memset_mips/android_memset_test.S
Normal file
|
|
@ -0,0 +1,152 @@
|
||||||
|
/*
|
||||||
|
* Copyright (C) 2006 The android Open Source Project
|
||||||
|
*
|
||||||
|
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
* you may not use this file except in compliance with the License.
|
||||||
|
* You may obtain a copy of the License at
|
||||||
|
*
|
||||||
|
* http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
*
|
||||||
|
* Unless required by applicable law or agreed to in writing, software
|
||||||
|
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
* See the License for the specific language governing permissions and
|
||||||
|
* limitations under the License.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#ifdef NDEBUG
|
||||||
|
#define DBG #
|
||||||
|
#else
|
||||||
|
#define DBG
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.text
|
||||||
|
.align
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Optimized memset16 for MIPS
|
||||||
|
*
|
||||||
|
* void android_memset16_test(uint16_t* dst, uint16_t value, size_t size);
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
.global android_memset16_test
|
||||||
|
.type android_memset16_test, @function
|
||||||
|
android_memset16_test:
|
||||||
|
.ent android_memset16_test
|
||||||
|
.set noreorder
|
||||||
|
|
||||||
|
/* Check parameters */
|
||||||
|
DBG andi $t0,$a0,1 /* $a0 must be halfword aligned */
|
||||||
|
DBG tne $t0
|
||||||
|
DBG lui $t1,0xffff /* $a1 must be 16bits */
|
||||||
|
DBG and $t1,$a1
|
||||||
|
DBG tne $t1
|
||||||
|
DBG andi $t2,$a2,1 /* $a2 must be even */
|
||||||
|
DBG tne $t2
|
||||||
|
|
||||||
|
#if (__mips==32) && (__mips_isa_rev>=2)
|
||||||
|
ins $a2,$0,0,1
|
||||||
|
#else
|
||||||
|
li $t0,~1
|
||||||
|
and $a2,$t0
|
||||||
|
#endif
|
||||||
|
|
||||||
|
move $t8,$ra
|
||||||
|
blez $a2,9f /* Anything to do? */
|
||||||
|
andi $t0,$a0,2 /* Check dst alignment */
|
||||||
|
/* Expand value to 32 bits and check destination alignment */
|
||||||
|
#if (__mips==32) && (__mips_isa_rev>=2)
|
||||||
|
beqz $t0,.Laligned32 /* dst is 32 bit aligned */
|
||||||
|
ins $a1,$a1,16,16
|
||||||
|
#else
|
||||||
|
sll $t2,$a1,16
|
||||||
|
beqz $t0,.Laligned32 /* dst is 32 bit aligned */
|
||||||
|
or $a1,$t2
|
||||||
|
#endif
|
||||||
|
sh $a1,($a0) /* do one halfword to get aligned */
|
||||||
|
subu $a2,2
|
||||||
|
addu $a0,2
|
||||||
|
|
||||||
|
.Laligned32:
|
||||||
|
and $t1,$a2,63 /* is there enough left to do a full 64 byte loop? */
|
||||||
|
beq $a2,$t1,1f
|
||||||
|
subu $t2,$a2,$t1 /* $t2 is the number of bytes to do in loop64 */
|
||||||
|
addu $t3,$a0,$t2 /* $t3 is the end marker for loop64 */
|
||||||
|
subu $a2,$t2
|
||||||
|
.Lloop64:
|
||||||
|
addu $a0,64
|
||||||
|
sw $a1,-64($a0)
|
||||||
|
sw $a1,-60($a0)
|
||||||
|
sw $a1,-56($a0)
|
||||||
|
sw $a1,-52($a0)
|
||||||
|
sw $a1,-48($a0)
|
||||||
|
sw $a1,-44($a0)
|
||||||
|
sw $a1,-40($a0)
|
||||||
|
sw $a1,-36($a0)
|
||||||
|
sw $a1,-32($a0)
|
||||||
|
sw $a1,-28($a0)
|
||||||
|
sw $a1,-24($a0)
|
||||||
|
sw $a1,-20($a0)
|
||||||
|
sw $a1,-16($a0)
|
||||||
|
sw $a1,-12($a0)
|
||||||
|
sw $a1,-8($a0)
|
||||||
|
bne $a0,$t3,.Lloop64
|
||||||
|
sw $a1,-4($a0)
|
||||||
|
|
||||||
|
/* Do the last 0..62 bytes */
|
||||||
|
1: li $t0,64+12
|
||||||
|
andi $t1,$a2,0x3c /* $t1 how many bytes to store using sw */
|
||||||
|
bal 1f
|
||||||
|
subu $t0,$t1 /* 64+12-$t0 is offset to jump from 1f */
|
||||||
|
1: addu $ra,$t0
|
||||||
|
j $ra
|
||||||
|
subu $a2,$t1
|
||||||
|
2: sw $a1,60($a0)
|
||||||
|
sw $a1,56($a0)
|
||||||
|
sw $a1,52($a0)
|
||||||
|
sw $a1,48($a0)
|
||||||
|
sw $a1,44($a0)
|
||||||
|
sw $a1,40($a0)
|
||||||
|
sw $a1,36($a0)
|
||||||
|
sw $a1,32($a0)
|
||||||
|
sw $a1,28($a0)
|
||||||
|
sw $a1,24($a0)
|
||||||
|
sw $a1,20($a0)
|
||||||
|
sw $a1,16($a0)
|
||||||
|
sw $a1,12($a0)
|
||||||
|
sw $a1,8($a0)
|
||||||
|
sw $a1,4($a0)
|
||||||
|
sw $a1,0($a0)
|
||||||
|
|
||||||
|
beqz $a2,9f
|
||||||
|
addu $a0,$t1
|
||||||
|
sh $a1,($a0)
|
||||||
|
|
||||||
|
9: j $t8
|
||||||
|
nop
|
||||||
|
.end android_memset16_test
|
||||||
|
.size android_memset16_test,.-android_memset16_test
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Optimized memset32 for MIPS
|
||||||
|
*
|
||||||
|
* void android_memset32_test(uint32_t* dst, uint32_t value, size_t size);
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
.global android_memset32_test
|
||||||
|
.type android_memset32_test, @function
|
||||||
|
android_memset32_test:
|
||||||
|
.ent android_memset32_test
|
||||||
|
.set noreorder
|
||||||
|
|
||||||
|
/* Check parameters */
|
||||||
|
DBG andi $t0,$a0,3 /* $a0 must be word aligned */
|
||||||
|
DBG tne $t0
|
||||||
|
DBG andi $t2,$a2,3 /* $a2 must be a multiple of 4 bytes */
|
||||||
|
DBG tne $t2
|
||||||
|
|
||||||
|
b .Laligned32
|
||||||
|
move $t8,$ra
|
||||||
|
.end android_memset32_test
|
||||||
|
.size android_memset32_test,.-android_memset32_test
|
||||||
227
libcutils/tests/memset_mips/memset_cmips.S
Normal file
227
libcutils/tests/memset_mips/memset_cmips.S
Normal file
|
|
@ -0,0 +1,227 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2009
|
||||||
|
* MIPS Technologies, Inc., California.
|
||||||
|
*
|
||||||
|
* Redistribution and use in source and binary forms, with or without
|
||||||
|
* modification, are permitted provided that the following conditions
|
||||||
|
* are met:
|
||||||
|
* 1. Redistributions of source code must retain the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer.
|
||||||
|
* 2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
* notice, this list of conditions and the following disclaimer in the
|
||||||
|
* documentation and/or other materials provided with the distribution.
|
||||||
|
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
|
||||||
|
* contributors may be used to endorse or promote products derived from
|
||||||
|
* this software without specific prior written permission.
|
||||||
|
*
|
||||||
|
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
|
||||||
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
|
||||||
|
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
||||||
|
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
||||||
|
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
||||||
|
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
||||||
|
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
||||||
|
* SUCH DAMAGE.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/************************************************************************
|
||||||
|
*
|
||||||
|
* memset.S, version "64h" with 1 cache line horizon for "pref 30" and 14 nops
|
||||||
|
* Version: "043009"
|
||||||
|
*
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
|
||||||
|
/************************************************************************
|
||||||
|
* Include files
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
|
#include "machine/asm.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This routine could be optimized for MIPS64. The current code only
|
||||||
|
* uses MIPS32 instructions.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(__MIPSEB__)
|
||||||
|
# define SWHI swl /* high part is left in big-endian */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__MIPSEL__)
|
||||||
|
# define SWHI swr /* high part is right in little-endian */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !(defined(XGPROF) || defined(XPROF))
|
||||||
|
#undef SETUP_GP
|
||||||
|
#define SETUP_GP
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LEAF(memset_cmips,0)
|
||||||
|
|
||||||
|
.set noreorder
|
||||||
|
.set noat
|
||||||
|
|
||||||
|
addu t0,a0,a2 # t0 is the "past the end" address
|
||||||
|
slti AT,a2,4 # is a2 less than 4?
|
||||||
|
bne AT,zero,.Llast4 # if yes, go to last4
|
||||||
|
move v0,a0 # memset returns the dst pointer
|
||||||
|
|
||||||
|
beq a1,zero,.Lset0
|
||||||
|
subu v1,zero,a0
|
||||||
|
|
||||||
|
# smear byte into 32 bit word
|
||||||
|
#if (__mips==32) && (__mips_isa_rev>=2)
|
||||||
|
ins a1, a1, 8, 8 # Replicate fill byte into half-word.
|
||||||
|
ins a1, a1, 16, 16 # Replicate fill byte into word.
|
||||||
|
#else
|
||||||
|
and a1,0xff
|
||||||
|
sll AT,a1,8
|
||||||
|
or a1,AT
|
||||||
|
sll AT,a1,16
|
||||||
|
or a1,AT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
.Lset0: andi v1,v1,0x3 # word-unaligned address?
|
||||||
|
beq v1,zero,.Laligned # v1 is the unalignment count
|
||||||
|
subu a2,a2,v1
|
||||||
|
SWHI a1,0(a0)
|
||||||
|
addu a0,a0,v1
|
||||||
|
|
||||||
|
# Here we have the "word-aligned" a0 (until the "last4")
|
||||||
|
.Laligned:
|
||||||
|
andi t8,a2,0x3f # any 64-byte chunks?
|
||||||
|
# t8 is the byte count past 64-byte chunks
|
||||||
|
beq a2,t8,.Lchk8w # when a2==t8, no 64-byte chunks
|
||||||
|
# There will be at most 1 32-byte chunk then
|
||||||
|
subu a3,a2,t8 # subtract from a2 the reminder
|
||||||
|
# Here a3 counts bytes in 16w chunks
|
||||||
|
addu a3,a0,a3 # Now a3 is the final dst after 64-byte chunks
|
||||||
|
|
||||||
|
# Find out, if there are any 64-byte chunks after which will be still at least
|
||||||
|
# 96 bytes left. The value "96" is calculated as needed buffer for
|
||||||
|
# "pref 30,64(a0)" prefetch, which can be used as "pref 30,0(a0)" after
|
||||||
|
# incrementing "a0" by 64.
|
||||||
|
# For "a2" below 160 there will be no such "pref 30 safe" 64-byte chunk.
|
||||||
|
#
|
||||||
|
sltiu v1,a2,160
|
||||||
|
bgtz v1,.Lloop16w_nopref30 # skip "pref 30,0(a0)"
|
||||||
|
subu t7,a2,96 # subtract "pref 30 unsafe" region
|
||||||
|
# below we have at least 1 64-byte chunk which is "pref 30 safe"
|
||||||
|
andi t6,t7,0x3f # t6 is past "64-byte safe chunks" reminder
|
||||||
|
subu t5,t7,t6 # subtract from t7 the reminder
|
||||||
|
# Here t5 counts bytes in 16w "safe" chunks
|
||||||
|
addu t4,a0,t5 # Now t4 is the dst after 64-byte "safe" chunks
|
||||||
|
|
||||||
|
# Don't use "pref 30,0(a0)" for a0 in a "middle" of a cache line
|
||||||
|
# pref 30,0(a0)
|
||||||
|
# Here we are in the region, where it is safe to use "pref 30,64(a0)"
|
||||||
|
.Lloop16w:
|
||||||
|
addiu a0,a0,64
|
||||||
|
pref 30,-32(a0) # continue setting up the dest, addr 64-32
|
||||||
|
sw a1,-64(a0)
|
||||||
|
sw a1,-60(a0)
|
||||||
|
sw a1,-56(a0)
|
||||||
|
sw a1,-52(a0)
|
||||||
|
sw a1,-48(a0)
|
||||||
|
sw a1,-44(a0)
|
||||||
|
sw a1,-40(a0)
|
||||||
|
sw a1,-36(a0)
|
||||||
|
nop
|
||||||
|
nop # the extra nop instructions help to balance
|
||||||
|
nop # cycles needed for "store" + "fill" + "evict"
|
||||||
|
nop # For 64byte store there are needed 8 fill
|
||||||
|
nop # and 8 evict cycles, i.e. at least 32 instr.
|
||||||
|
nop
|
||||||
|
nop
|
||||||
|
pref 30,0(a0) # continue setting up the dest, addr 64-0
|
||||||
|
sw a1,-32(a0)
|
||||||
|
sw a1,-28(a0)
|
||||||
|
sw a1,-24(a0)
|
||||||
|
sw a1,-20(a0)
|
||||||
|
sw a1,-16(a0)
|
||||||
|
sw a1,-12(a0)
|
||||||
|
sw a1,-8(a0)
|
||||||
|
sw a1,-4(a0)
|
||||||
|
nop
|
||||||
|
nop
|
||||||
|
nop
|
||||||
|
nop # NOTE: adding 14 nop-s instead of 12 nop-s
|
||||||
|
nop # gives better results for "fast" memory
|
||||||
|
nop
|
||||||
|
bne a0,t4,.Lloop16w
|
||||||
|
nop
|
||||||
|
|
||||||
|
beq a0,a3,.Lchk8w # maybe no more 64-byte chunks?
|
||||||
|
nop # this "delayed slot" is useless ...
|
||||||
|
|
||||||
|
.Lloop16w_nopref30: # there could be up to 3 "64-byte nopref30" chunks
|
||||||
|
addiu a0,a0,64
|
||||||
|
sw a1,-64(a0)
|
||||||
|
sw a1,-60(a0)
|
||||||
|
sw a1,-56(a0)
|
||||||
|
sw a1,-52(a0)
|
||||||
|
sw a1,-48(a0)
|
||||||
|
sw a1,-44(a0)
|
||||||
|
sw a1,-40(a0)
|
||||||
|
sw a1,-36(a0)
|
||||||
|
sw a1,-32(a0)
|
||||||
|
sw a1,-28(a0)
|
||||||
|
sw a1,-24(a0)
|
||||||
|
sw a1,-20(a0)
|
||||||
|
sw a1,-16(a0)
|
||||||
|
sw a1,-12(a0)
|
||||||
|
sw a1,-8(a0)
|
||||||
|
bne a0,a3,.Lloop16w_nopref30
|
||||||
|
sw a1,-4(a0)
|
||||||
|
|
||||||
|
.Lchk8w: # t8 here is the byte count past 64-byte chunks
|
||||||
|
|
||||||
|
andi t7,t8,0x1f # is there a 32-byte chunk?
|
||||||
|
# the t7 is the reminder count past 32-bytes
|
||||||
|
beq t8,t7,.Lchk1w # when t8==t7, no 32-byte chunk
|
||||||
|
move a2,t7
|
||||||
|
|
||||||
|
sw a1,0(a0)
|
||||||
|
sw a1,4(a0)
|
||||||
|
sw a1,8(a0)
|
||||||
|
sw a1,12(a0)
|
||||||
|
sw a1,16(a0)
|
||||||
|
sw a1,20(a0)
|
||||||
|
sw a1,24(a0)
|
||||||
|
sw a1,28(a0)
|
||||||
|
addiu a0,a0,32
|
||||||
|
|
||||||
|
.Lchk1w:
|
||||||
|
andi t8,a2,0x3 # now t8 is the reminder past 1w chunks
|
||||||
|
beq a2,t8,.Llast4
|
||||||
|
subu a3,a2,t8 # a3 is the count of bytes in 1w chunks
|
||||||
|
addu a3,a0,a3 # now a3 is the dst address past the 1w chunks
|
||||||
|
|
||||||
|
# copying in words (4-byte chunks)
|
||||||
|
.LwordCopy_loop:
|
||||||
|
addiu a0,a0,4
|
||||||
|
bne a0,a3,.LwordCopy_loop
|
||||||
|
sw a1,-4(a0)
|
||||||
|
|
||||||
|
.Llast4:beq a0,t0,.Llast4e
|
||||||
|
.Llast4l:addiu a0,a0,1
|
||||||
|
bne a0,t0,.Llast4l
|
||||||
|
sb a1,-1(a0)
|
||||||
|
|
||||||
|
.Llast4e:
|
||||||
|
j ra
|
||||||
|
nop
|
||||||
|
|
||||||
|
.set at
|
||||||
|
.set reorder
|
||||||
|
|
||||||
|
END(memset_cmips)
|
||||||
|
|
||||||
|
|
||||||
|
/************************************************************************
|
||||||
|
* Implementation : Static functions
|
||||||
|
************************************************************************/
|
||||||
|
|
||||||
90
libcutils/tests/memset_mips/memset_omips.S
Normal file
90
libcutils/tests/memset_mips/memset_omips.S
Normal file
|
|
@ -0,0 +1,90 @@
|
||||||
|
/* Copyright (C) 2002, 2003 Free Software Foundation, Inc.
|
||||||
|
This file is part of the GNU C Library.
|
||||||
|
Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
|
||||||
|
|
||||||
|
The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
modify it under the terms of the GNU Lesser General Public
|
||||||
|
License as published by the Free Software Foundation; either
|
||||||
|
version 2.1 of the License, or (at your option) any later version.
|
||||||
|
|
||||||
|
The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
Lesser General Public License for more details.
|
||||||
|
|
||||||
|
You should have received a copy of the GNU Lesser General Public
|
||||||
|
License along with the GNU C Library; if not, write to the Free
|
||||||
|
Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
|
||||||
|
02111-1307 USA. */
|
||||||
|
|
||||||
|
/* void *memset_omips(void *s, int c, size_t n). */
|
||||||
|
|
||||||
|
#include "machine/asm.h"
|
||||||
|
|
||||||
|
#ifdef __mips64
|
||||||
|
#error mips32 code being compiled for mips64!
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if defined(__MIPSEB__)
|
||||||
|
#error big-endian is not supported in Broadcom MIPS Android platform
|
||||||
|
# define SWHI swl /* high part is left in big-endian */
|
||||||
|
#else
|
||||||
|
# define SWHI swr /* high part is right in little-endian */
|
||||||
|
#endif
|
||||||
|
|
||||||
|
LEAF (memset_omips,0)
|
||||||
|
.set noreorder
|
||||||
|
|
||||||
|
slti t1, a2, 8 # Less than 8?
|
||||||
|
bne t1, zero, .Llast8
|
||||||
|
move v0, a0 # Setup exit value before too late
|
||||||
|
|
||||||
|
beq a1, zero, .Lueven # If zero pattern, no need to extend
|
||||||
|
andi a1, 0xff # Avoid problems with bogus arguments
|
||||||
|
sll t0, a1, 8
|
||||||
|
or a1, t0
|
||||||
|
sll t0, a1, 16
|
||||||
|
or a1, t0 # a1 is now pattern in full word
|
||||||
|
|
||||||
|
.Lueven:
|
||||||
|
subu t0, zero, a0 # Unaligned address?
|
||||||
|
andi t0, 0x3
|
||||||
|
beq t0, zero, .Lchkw
|
||||||
|
subu a2, t0
|
||||||
|
SWHI a1, 0(a0) # Yes, handle first unaligned part
|
||||||
|
addu a0, t0 # Now both a0 and a2 are updated
|
||||||
|
|
||||||
|
.Lchkw:
|
||||||
|
andi t0, a2, 0x7 # Enough left for one loop iteration?
|
||||||
|
beq t0, a2, .Lchkl
|
||||||
|
subu a3, a2, t0
|
||||||
|
addu a3, a0 # a3 is last loop address +1
|
||||||
|
move a2, t0 # a2 is now # of bytes left after loop
|
||||||
|
.Lloopw:
|
||||||
|
addiu a0, 8 # Handle 2 words pr. iteration
|
||||||
|
sw a1, -8(a0)
|
||||||
|
bne a0, a3, .Lloopw
|
||||||
|
sw a1, -4(a0)
|
||||||
|
|
||||||
|
.Lchkl:
|
||||||
|
andi t0, a2, 0x4 # Check if there is at least a full
|
||||||
|
beq t0, zero, .Llast8 # word remaining after the loop
|
||||||
|
subu a2, t0
|
||||||
|
sw a1, 0(a0) # Yes...
|
||||||
|
addiu a0, 4
|
||||||
|
|
||||||
|
.Llast8:
|
||||||
|
blez a2, .Lexit # Handle last 8 bytes (if cnt>0)
|
||||||
|
addu a3, a2, a0 # a3 is last address +1
|
||||||
|
.Llst8l:
|
||||||
|
addiu a0, 1
|
||||||
|
bne a0, a3, .Llst8l
|
||||||
|
sb a1, -1(a0)
|
||||||
|
.Lexit:
|
||||||
|
j ra # Bye, bye
|
||||||
|
nop
|
||||||
|
|
||||||
|
.set reorder
|
||||||
|
END (memset_omips)
|
||||||
|
|
||||||
|
|
||||||
235
libcutils/tests/memset_mips/test_memset.c
Normal file
235
libcutils/tests/memset_mips/test_memset.c
Normal file
|
|
@ -0,0 +1,235 @@
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <assert.h>
|
||||||
|
#include <cutils/memory.h>
|
||||||
|
#include <time.h>
|
||||||
|
|
||||||
|
/*
|
||||||
|
* All systems must implement or emulate the rdhwr instruction to read
|
||||||
|
* the userlocal register. Systems that emulate also return teh count register
|
||||||
|
* when accessing register $2 so this should work on most systems
|
||||||
|
*/
|
||||||
|
#define USE_RDHWR
|
||||||
|
|
||||||
|
#ifdef USE_RDHWR
|
||||||
|
#define UNITS "cycles"
|
||||||
|
#define SCALE 2 /* Most CPU's */
|
||||||
|
static inline uint32_t
|
||||||
|
get_count(void)
|
||||||
|
{
|
||||||
|
uint32_t res;
|
||||||
|
asm volatile (".set push; .set mips32r2; rdhwr %[res],$2; .set pop" : [res] "=r" (res) : : "memory");
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
#define UNITS "ns"
|
||||||
|
#define SCALE 1
|
||||||
|
static inline uint32_t
|
||||||
|
get_count(void)
|
||||||
|
{
|
||||||
|
struct timespec now;
|
||||||
|
uint32_t res;
|
||||||
|
clock_gettime(CLOCK_REALTIME, &now);
|
||||||
|
res = (uint32_t)(now.tv_sec * 1000000000LL + now.tv_nsec);
|
||||||
|
// printf ("now=%d.%09d res=%d\n", (int)now.tv_sec, (int)now.tv_nsec, res);
|
||||||
|
return res;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
uint32_t overhead;
|
||||||
|
void
|
||||||
|
measure_overhead(void)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
uint32_t start, stop, delta;
|
||||||
|
for (i = 0; i < 32; i++) {
|
||||||
|
start = get_count();
|
||||||
|
stop = get_count();
|
||||||
|
delta = stop - start;
|
||||||
|
if (overhead == 0 || delta < overhead)
|
||||||
|
overhead = delta;
|
||||||
|
}
|
||||||
|
printf("overhead is %d"UNITS"\n", overhead);
|
||||||
|
}
|
||||||
|
|
||||||
|
uint32_t
|
||||||
|
timeone(void (*fn)(), void *d, uint32_t val, uint32_t bytes)
|
||||||
|
{
|
||||||
|
uint32_t start, stop, delta;
|
||||||
|
start = get_count();
|
||||||
|
(*fn)(d, val, bytes);
|
||||||
|
stop = get_count();
|
||||||
|
delta = stop - start - overhead;
|
||||||
|
// printf ("start=0x%08x stop=0x%08x delta=0x%08x\n", start, stop, delta);
|
||||||
|
return delta * SCALE;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* define VERIFY to check that memset only touches the bytes it's supposed to */
|
||||||
|
/*#define VERIFY*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Using a big arena means that memset will most likely miss in the cache
|
||||||
|
* NB Enabling verification effectively warms up the cache...
|
||||||
|
*/
|
||||||
|
#define ARENASIZE 0x1000000
|
||||||
|
#ifdef VERIFY
|
||||||
|
char arena[ARENASIZE+8]; /* Allow space for guard words */
|
||||||
|
#else
|
||||||
|
char arena[ARENASIZE];
|
||||||
|
#endif
|
||||||
|
|
||||||
|
void
|
||||||
|
testone(char *tag, void (*fn)(), int trials, int minbytes, int maxbytes, int size, int threshold)
|
||||||
|
{
|
||||||
|
int offset;
|
||||||
|
void *d;
|
||||||
|
void *p;
|
||||||
|
uint32_t v, notv = 0;
|
||||||
|
uint32_t n;
|
||||||
|
int i, units;
|
||||||
|
int totalunits = 0, totalbytes = 0, samples = 0;
|
||||||
|
|
||||||
|
/* Reset RNG to ensure each test uses same random values */
|
||||||
|
srand(0); /* FIXME should be able to use some other seed than 0 */
|
||||||
|
|
||||||
|
for (i = 0; i < trials; i++) {
|
||||||
|
n = minbytes + (rand() % (maxbytes-minbytes)); /* How many bytes to do */
|
||||||
|
offset = ((rand() % (ARENASIZE-n))); /* Where to start */
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
offset += 4; /* Allow space for guard word at beginning */
|
||||||
|
#endif
|
||||||
|
v = rand();
|
||||||
|
|
||||||
|
/* Adjust alignment and sizes based on transfer size */
|
||||||
|
switch (size) {
|
||||||
|
case 1:
|
||||||
|
v &= 0xff;
|
||||||
|
notv = ~v & 0xff;
|
||||||
|
break;
|
||||||
|
case 2:
|
||||||
|
v &= 0xffff;
|
||||||
|
notv = ~v & 0xffff;
|
||||||
|
offset &= ~1;
|
||||||
|
n &= ~1;
|
||||||
|
break;
|
||||||
|
case 4:
|
||||||
|
notv = ~v;
|
||||||
|
offset &= ~3;
|
||||||
|
n &= ~3;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
d = &arena[offset];
|
||||||
|
|
||||||
|
#ifdef VERIFY
|
||||||
|
/* Initialise the area and guard words */
|
||||||
|
for (p = &arena[offset-4]; p < (void *)&arena[offset+n+4]; p = (void *)((uint32_t)p + size)) {
|
||||||
|
if (size == 1)
|
||||||
|
*(uint8_t *)p = notv;
|
||||||
|
else if (size == 2)
|
||||||
|
*(uint16_t *)p = notv;
|
||||||
|
else if (size == 4)
|
||||||
|
*(uint32_t *)p = notv;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
units = timeone(fn, d, v, n);
|
||||||
|
#ifdef VERIFY
|
||||||
|
/* Check the area and guard words */
|
||||||
|
for (p = &arena[offset-4]; p < (void *)&arena[offset+n+4]; p = (void *)((uint32_t)p + size)) {
|
||||||
|
uint32_t got = 0;
|
||||||
|
if (size == 1)
|
||||||
|
got = *(uint8_t *)p;
|
||||||
|
else if (size == 2)
|
||||||
|
got = *(uint16_t *)p;
|
||||||
|
else if (size == 4)
|
||||||
|
got = *(uint32_t *)p;
|
||||||
|
if (p < (void *)&arena[offset]) {
|
||||||
|
if (got != notv)
|
||||||
|
printf ("%s: verify failure: preguard:%p d=%p v=%08x got=%08x n=%d\n", tag, p, d, v, got, n);
|
||||||
|
}
|
||||||
|
else if (p < (void *)&arena[offset+n]) {
|
||||||
|
if (got != v)
|
||||||
|
printf ("%s: verify failure: arena:%p d=%p v=%08x got=%08x n=%d\n", tag, p, d, v, n);
|
||||||
|
}
|
||||||
|
else {
|
||||||
|
if (got != notv)
|
||||||
|
printf ("%s: verify failure: postguard:%p d=%p v=%08x got=%08x n=%d\n", tag, p, d, v, n);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
|
/* If the cycle count looks reasonable include it in the statistics */
|
||||||
|
if (units < threshold) {
|
||||||
|
totalbytes += n;
|
||||||
|
totalunits += units;
|
||||||
|
samples++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("%s: samples=%d avglen=%d avg" UNITS "=%d bp"UNITS"=%g\n",
|
||||||
|
tag, samples, totalbytes/samples, totalunits/samples, (double)totalbytes/(double)totalunits);
|
||||||
|
}
|
||||||
|
|
||||||
|
extern void android_memset32_dumb(uint32_t* dst, uint32_t value, size_t size);
|
||||||
|
extern void android_memset16_dumb(uint32_t* dst, uint16_t value, size_t size);
|
||||||
|
extern void android_memset32_test(uint32_t* dst, uint32_t value, size_t size);
|
||||||
|
extern void android_memset16_test(uint32_t* dst, uint16_t value, size_t size);
|
||||||
|
extern void memset_cmips(void* dst, int value, size_t size);
|
||||||
|
extern void memset_omips(void* dst, int value, size_t size);
|
||||||
|
|
||||||
|
int
|
||||||
|
main(int argc, char **argv)
|
||||||
|
{
|
||||||
|
int i;
|
||||||
|
struct {
|
||||||
|
char *type;
|
||||||
|
int trials;
|
||||||
|
int minbytes, maxbytes;
|
||||||
|
} *pp, params[] = {
|
||||||
|
{"small", 10000, 0, 64},
|
||||||
|
{"medium", 10000, 64, 512},
|
||||||
|
{"large", 10000, 512, 1280},
|
||||||
|
{"varied", 10000, 0, 1280},
|
||||||
|
};
|
||||||
|
#define NPARAMS (sizeof(params)/sizeof(params[0]))
|
||||||
|
struct {
|
||||||
|
char *name;
|
||||||
|
void (*fn)();
|
||||||
|
int size;
|
||||||
|
} *fp, functions[] = {
|
||||||
|
{"dmemset16", (void (*)())android_memset16_dumb, 2},
|
||||||
|
{"tmemset16", (void (*)())android_memset16_test, 2},
|
||||||
|
{"lmemset16", (void (*)())android_memset16, 2},
|
||||||
|
|
||||||
|
{"dmemset32", (void (*)())android_memset32_dumb, 4},
|
||||||
|
{"tmemset32", (void (*)())android_memset32_test, 4},
|
||||||
|
{"lmemset32", (void (*)())android_memset32, 4},
|
||||||
|
|
||||||
|
{"cmemset", (void (*)())memset_cmips, 1},
|
||||||
|
{"omemset", (void (*)())memset_omips, 1},
|
||||||
|
{"lmemset", (void (*)())memset, 1},
|
||||||
|
};
|
||||||
|
#define NFUNCTIONS (sizeof(functions)/sizeof(functions[0]))
|
||||||
|
char tag[40];
|
||||||
|
int threshold;
|
||||||
|
|
||||||
|
measure_overhead();
|
||||||
|
|
||||||
|
/* Warm up the page cache */
|
||||||
|
memset(arena, 0xff, ARENASIZE); /* use 0xff now to avoid COW later */
|
||||||
|
|
||||||
|
for (fp = functions; fp < &functions[NFUNCTIONS]; fp++) {
|
||||||
|
(fp->fn)(arena, 0xffffffff, ARENASIZE); /* one call to get the code into Icache */
|
||||||
|
for (pp = params; pp < ¶ms[NPARAMS]; pp++) {
|
||||||
|
sprintf(tag, "%10s: %7s %4d-%4d", fp->name, pp->type, pp->minbytes, pp->maxbytes);
|
||||||
|
|
||||||
|
/* Set the cycle threshold */
|
||||||
|
threshold = pp->maxbytes * 4 * 10; /* reasonable for cycles and ns */
|
||||||
|
testone(tag, fp->fn, pp->trials, pp->minbytes, pp->maxbytes, fp->size, threshold);
|
||||||
|
}
|
||||||
|
printf ("\n");
|
||||||
|
}
|
||||||
|
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Loading…
Add table
Reference in a new issue