mirror of
https://github.com/php/php-src.git
synced 2025-08-16 05:58:45 +02:00
opcache: Patch SSE based fast_memcpy() implementation
Use _mm_store_si128() instead of _mm_stream_si128(). This ensures that the copied memory is kept in the data cache, which is beneficial because the interpreter will start using this data right away without needing to go back to memory. _mm_stream* is intended for stores where we want to avoid reading data into the cache and polluting it; in our scenario, preserving the data in the cache appears to have a positive impact. Tests on WordPress 4.1 show a ~1% performance increase with fast_memcpy() in place versus standard memcpy() when running php-cgi -T10000 wordpress/index.php. I also updated the software prefetching on the target memory, but its contribution is almost negligible: the address being prefetched will be used within a couple of cycles (at the next iteration), while the data from memory only becomes available after >100 cycles.
This commit is contained in:
parent
4e66cce87c
commit
68185bafbe
1 changed file with 5 additions and 4 deletions
|
@ -658,16 +658,17 @@ static zend_always_inline void fast_memcpy(void *dest, const void *src, size_t s
|
|||
|
||||
do {
|
||||
_mm_prefetch(dqsrc + 4, _MM_HINT_NTA);
|
||||
_mm_prefetch(dqdest + 4, _MM_HINT_T0);
|
||||
|
||||
__m128i xmm0 = _mm_load_si128(dqsrc + 0);
|
||||
__m128i xmm1 = _mm_load_si128(dqsrc + 1);
|
||||
__m128i xmm2 = _mm_load_si128(dqsrc + 2);
|
||||
__m128i xmm3 = _mm_load_si128(dqsrc + 3);
|
||||
dqsrc += 4;
|
||||
_mm_stream_si128(dqdest + 0, xmm0);
|
||||
_mm_stream_si128(dqdest + 1, xmm1);
|
||||
_mm_stream_si128(dqdest + 2, xmm2);
|
||||
_mm_stream_si128(dqdest + 3, xmm3);
|
||||
_mm_store_si128(dqdest + 0, xmm0);
|
||||
_mm_store_si128(dqdest + 1, xmm1);
|
||||
_mm_store_si128(dqdest + 2, xmm2);
|
||||
_mm_store_si128(dqdest + 3, xmm3);
|
||||
dqdest += 4;
|
||||
} while (dqsrc != end);
|
||||
}
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue