mirror of
https://github.com/openjdk/jdk.git
synced 2025-09-24 04:54:40 +02:00
8078497: C2's superword optimization causes unaligned memory accesses
Prevent vectorization of memory operations with different invariant offsets if unaligned memory accesses are not allowed. Reviewed-by: kvn
This commit is contained in:
parent
58a1361125
commit
43a5abc876
3 changed files with 204 additions and 21 deletions
|
@ -293,6 +293,13 @@ void SuperWord::find_adjacent_refs() {
|
||||||
// if unaligned memory access is not allowed because number of
|
// if unaligned memory access is not allowed because number of
|
||||||
// iterations in pre-loop will be not enough to align it.
|
// iterations in pre-loop will be not enough to align it.
|
||||||
create_pack = false;
|
create_pack = false;
|
||||||
|
} else {
|
||||||
|
SWPointer p2(best_align_to_mem_ref, this);
|
||||||
|
if (align_to_ref_p.invar() != p2.invar()) {
|
||||||
|
// Do not vectorize memory accesses with different invariants
|
||||||
|
// if unaligned memory accesses are not allowed.
|
||||||
|
create_pack = false;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
|
@ -516,24 +523,50 @@ bool SuperWord::ref_is_alignable(SWPointer& p) {
|
||||||
if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
|
if (ABS(span) == mem_size && (ABS(offset) % mem_size) == 0) {
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
// If initial offset from start of object is computable,
|
// If the initial offset from start of the object is computable,
|
||||||
// compute alignment within the vector.
|
// check if the pre-loop can align the final offset accordingly.
|
||||||
|
//
|
||||||
|
// In other words: Can we find an i such that the offset
|
||||||
|
// after i pre-loop iterations is aligned to vw?
|
||||||
|
// (init_offset + pre_loop) % vw == 0 (1)
|
||||||
|
// where
|
||||||
|
// pre_loop = i * span
|
||||||
|
// is the number of bytes added to the offset by i pre-loop iterations.
|
||||||
|
//
|
||||||
|
// For this to hold we need pre_loop to increase init_offset by
|
||||||
|
// pre_loop = vw - (init_offset % vw)
|
||||||
|
//
|
||||||
|
// This is only possible if pre_loop is divisible by span because each
|
||||||
|
// pre-loop iteration increases the initial offset by 'span' bytes:
|
||||||
|
// (vw - (init_offset % vw)) % span == 0
|
||||||
|
//
|
||||||
int vw = vector_width_in_bytes(p.mem());
|
int vw = vector_width_in_bytes(p.mem());
|
||||||
assert(vw > 1, "sanity");
|
assert(vw > 1, "sanity");
|
||||||
if (vw % span == 0) {
|
Node* init_nd = pre_end->init_trip();
|
||||||
Node* init_nd = pre_end->init_trip();
|
if (init_nd->is_Con() && p.invar() == NULL) {
|
||||||
if (init_nd->is_Con() && p.invar() == NULL) {
|
int init = init_nd->bottom_type()->is_int()->get_con();
|
||||||
int init = init_nd->bottom_type()->is_int()->get_con();
|
int init_offset = init * p.scale_in_bytes() + offset;
|
||||||
|
assert(init_offset >= 0, "positive offset from object start");
|
||||||
int init_offset = init * p.scale_in_bytes() + offset;
|
if (vw % span == 0) {
|
||||||
assert(init_offset >= 0, "positive offset from object start");
|
// If vm is a multiple of span, we use formula (1).
|
||||||
|
|
||||||
if (span > 0) {
|
if (span > 0) {
|
||||||
return (vw - (init_offset % vw)) % span == 0;
|
return (vw - (init_offset % vw)) % span == 0;
|
||||||
} else {
|
} else {
|
||||||
assert(span < 0, "nonzero stride * scale");
|
assert(span < 0, "nonzero stride * scale");
|
||||||
return (init_offset % vw) % -span == 0;
|
return (init_offset % vw) % -span == 0;
|
||||||
}
|
}
|
||||||
|
} else if (span % vw == 0) {
|
||||||
|
// If span is a multiple of vw, we can simplify formula (1) to:
|
||||||
|
// (init_offset + i * span) % vw == 0
|
||||||
|
// =>
|
||||||
|
// (init_offset % vw) + ((i * span) % vw) == 0
|
||||||
|
// =>
|
||||||
|
// init_offset % vw == 0
|
||||||
|
//
|
||||||
|
// Because we add a multiple of vw to the initial offset, the final
|
||||||
|
// offset is a multiple of vw if and only if init_offset is a multiple.
|
||||||
|
//
|
||||||
|
return (init_offset % vw) == 0;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
|
@ -545,17 +578,23 @@ int SuperWord::get_iv_adjustment(MemNode* mem_ref) {
|
||||||
SWPointer align_to_ref_p(mem_ref, this);
|
SWPointer align_to_ref_p(mem_ref, this);
|
||||||
int offset = align_to_ref_p.offset_in_bytes();
|
int offset = align_to_ref_p.offset_in_bytes();
|
||||||
int scale = align_to_ref_p.scale_in_bytes();
|
int scale = align_to_ref_p.scale_in_bytes();
|
||||||
|
int elt_size = align_to_ref_p.memory_size();
|
||||||
int vw = vector_width_in_bytes(mem_ref);
|
int vw = vector_width_in_bytes(mem_ref);
|
||||||
assert(vw > 1, "sanity");
|
assert(vw > 1, "sanity");
|
||||||
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
|
int iv_adjustment;
|
||||||
// At least one iteration is executed in pre-loop by default. As result
|
if (scale != 0) {
|
||||||
// several iterations are needed to align memory operations in main-loop even
|
int stride_sign = (scale * iv_stride()) > 0 ? 1 : -1;
|
||||||
// if offset is 0.
|
// At least one iteration is executed in pre-loop by default. As result
|
||||||
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
|
// several iterations are needed to align memory operations in main-loop even
|
||||||
int elt_size = align_to_ref_p.memory_size();
|
// if offset is 0.
|
||||||
assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
|
int iv_adjustment_in_bytes = (stride_sign * vw - (offset % vw));
|
||||||
err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
|
assert(((ABS(iv_adjustment_in_bytes) % elt_size) == 0),
|
||||||
int iv_adjustment = iv_adjustment_in_bytes/elt_size;
|
err_msg_res("(%d) should be divisible by (%d)", iv_adjustment_in_bytes, elt_size));
|
||||||
|
iv_adjustment = iv_adjustment_in_bytes/elt_size;
|
||||||
|
} else {
|
||||||
|
// This memory op is not dependent on iv (scale == 0)
|
||||||
|
iv_adjustment = 0;
|
||||||
|
}
|
||||||
|
|
||||||
#ifndef PRODUCT
|
#ifndef PRODUCT
|
||||||
if (TraceSuperWord)
|
if (TraceSuperWord)
|
||||||
|
|
|
@ -40,7 +40,7 @@
|
||||||
// Exploiting SuperWord Level Parallelism with
|
// Exploiting SuperWord Level Parallelism with
|
||||||
// Multimedia Instruction Sets
|
// Multimedia Instruction Sets
|
||||||
// by
|
// by
|
||||||
// Samuel Larsen and Saman Amarasighe
|
// Samuel Larsen and Saman Amarasinghe
|
||||||
// MIT Laboratory for Computer Science
|
// MIT Laboratory for Computer Science
|
||||||
// date
|
// date
|
||||||
// May 2000
|
// May 2000
|
||||||
|
@ -466,7 +466,7 @@ class SWPointer VALUE_OBJ_CLASS_SPEC {
|
||||||
|
|
||||||
Node* _base; // NULL if unsafe nonheap reference
|
Node* _base; // NULL if unsafe nonheap reference
|
||||||
Node* _adr; // address pointer
|
Node* _adr; // address pointer
|
||||||
jint _scale; // multipler for iv (in bytes), 0 if no loop iv
|
jint _scale; // multiplier for iv (in bytes), 0 if no loop iv
|
||||||
jint _offset; // constant offset (in bytes)
|
jint _offset; // constant offset (in bytes)
|
||||||
Node* _invar; // invariant offset (in bytes), NULL if none
|
Node* _invar; // invariant offset (in bytes), NULL if none
|
||||||
bool _negate_invar; // if true then use: (0 - _invar)
|
bool _negate_invar; // if true then use: (0 - _invar)
|
||||||
|
|
|
@ -0,0 +1,144 @@
|
||||||
|
/*
|
||||||
|
* Copyright (c) 2015, Oracle and/or its affiliates. All rights reserved.
|
||||||
|
* DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
|
||||||
|
*
|
||||||
|
* This code is free software; you can redistribute it and/or modify it
|
||||||
|
* under the terms of the GNU General Public License version 2 only, as
|
||||||
|
* published by the Free Software Foundation.
|
||||||
|
*
|
||||||
|
* This code is distributed in the hope that it will be useful, but WITHOUT
|
||||||
|
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
|
||||||
|
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||||||
|
* version 2 for more details (a copy is included in the LICENSE file that
|
||||||
|
* accompanied this code).
|
||||||
|
*
|
||||||
|
* You should have received a copy of the GNU General Public License version
|
||||||
|
* 2 along with this work; if not, write to the Free Software Foundation,
|
||||||
|
* Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
|
||||||
|
*
|
||||||
|
* Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
|
||||||
|
* or visit www.oracle.com if you need additional information or have any
|
||||||
|
* questions.
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
import com.oracle.java.testlibrary.*;
|
||||||
|
import sun.misc.Unsafe;
|
||||||
|
|
||||||
|
/**
|
||||||
|
* @test
|
||||||
|
* @bug 8078497
|
||||||
|
* @summary Tests correct alignment of vectors with loop invariant offset.
|
||||||
|
* @library /testlibrary
|
||||||
|
* @run main TestVectorizationWithInvariant
|
||||||
|
*/
|
||||||
|
public class TestVectorizationWithInvariant {
|
||||||
|
|
||||||
|
private static Unsafe unsafe;
|
||||||
|
private static final long BYTE_ARRAY_OFFSET;
|
||||||
|
private static final long CHAR_ARRAY_OFFSET;
|
||||||
|
|
||||||
|
static {
|
||||||
|
unsafe = Utils.getUnsafe();
|
||||||
|
BYTE_ARRAY_OFFSET = unsafe.arrayBaseOffset(byte[].class);
|
||||||
|
CHAR_ARRAY_OFFSET = unsafe.arrayBaseOffset(char[].class);
|
||||||
|
}
|
||||||
|
|
||||||
|
public static void main(String[] args) throws Exception {
|
||||||
|
byte[] byte_array1 = new byte[1000];
|
||||||
|
byte[] byte_array2 = new byte[1000];
|
||||||
|
char[] char_array = new char[1000];
|
||||||
|
|
||||||
|
for (int i = 0; i < 20_000; ++i) {
|
||||||
|
copyByteToChar(byte_array1, byte_array2, char_array, 1);
|
||||||
|
copyCharToByte(char_array, byte_array1, 1);
|
||||||
|
copyCharToByteAligned(char_array, byte_array1);
|
||||||
|
copyCharToByteUnaligned(char_array, byte_array1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copy multiple consecutive chars from a byte array to a given offset in a char array
|
||||||
|
* to trigger C2's superword optimization. The offset in the byte array is independent
|
||||||
|
* of the loop induction variable and can be set to an arbitrary value. It may then not
|
||||||
|
* be possible to both align the LoadUS and the StoreC operations. Therefore, vectorization
|
||||||
|
* should only be done in this case if unaligned memory accesses are allowed.
|
||||||
|
*/
|
||||||
|
public static void copyByteToChar(byte[] src1, byte[] src2, char[] dst, int off) {
|
||||||
|
off = (int) BYTE_ARRAY_OFFSET + (off << 1);
|
||||||
|
byte[] src = src1;
|
||||||
|
for (int i = (int) CHAR_ARRAY_OFFSET; i < 100; i = i + 8) {
|
||||||
|
// Copy 8 chars from src to dst
|
||||||
|
unsafe.putChar(dst, i + 0, unsafe.getChar(src, off + 0));
|
||||||
|
unsafe.putChar(dst, i + 2, unsafe.getChar(src, off + 2));
|
||||||
|
unsafe.putChar(dst, i + 4, unsafe.getChar(src, off + 4));
|
||||||
|
unsafe.putChar(dst, i + 6, unsafe.getChar(src, off + 6));
|
||||||
|
unsafe.putChar(dst, i + 8, unsafe.getChar(src, off + 8));
|
||||||
|
unsafe.putChar(dst, i + 10, unsafe.getChar(src, off + 10));
|
||||||
|
unsafe.putChar(dst, i + 12, unsafe.getChar(src, off + 12));
|
||||||
|
unsafe.putChar(dst, i + 14, unsafe.getChar(src, off + 14));
|
||||||
|
|
||||||
|
// Prevent loop invariant code motion of char read.
|
||||||
|
src = (src == src1) ? src2 : src1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Copy multiple consecutive chars from a char array to a given offset in a byte array
|
||||||
|
* to trigger C2's superword optimization. Checks for similar problems as 'copyByteToChar'.
|
||||||
|
*/
|
||||||
|
public static void copyCharToByte(char[] src, byte[] dst, int off) {
|
||||||
|
off = (int) BYTE_ARRAY_OFFSET + (off << 1);
|
||||||
|
for (int i = 0; i < 100; i = i + 8) {
|
||||||
|
// Copy 8 chars from src to dst
|
||||||
|
unsafe.putChar(dst, off + 0, src[i + 0]);
|
||||||
|
unsafe.putChar(dst, off + 2, src[i + 1]);
|
||||||
|
unsafe.putChar(dst, off + 4, src[i + 2]);
|
||||||
|
unsafe.putChar(dst, off + 6, src[i + 3]);
|
||||||
|
unsafe.putChar(dst, off + 8, src[i + 4]);
|
||||||
|
unsafe.putChar(dst, off + 10, src[i + 5]);
|
||||||
|
unsafe.putChar(dst, off + 12, src[i + 6]);
|
||||||
|
unsafe.putChar(dst, off + 14, src[i + 7]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Variant of copyCharToByte with a constant destination array offset.
|
||||||
|
* The loop should always be vectorized because both the LoadUS and StoreC
|
||||||
|
* operations can be aligned.
|
||||||
|
*/
|
||||||
|
public static void copyCharToByteAligned(char[] src, byte[] dst) {
|
||||||
|
final int off = (int) BYTE_ARRAY_OFFSET;
|
||||||
|
for (int i = 8; i < 100; i = i + 8) {
|
||||||
|
// Copy 8 chars from src to dst
|
||||||
|
unsafe.putChar(dst, off + 0, src[i + 0]);
|
||||||
|
unsafe.putChar(dst, off + 2, src[i + 1]);
|
||||||
|
unsafe.putChar(dst, off + 4, src[i + 2]);
|
||||||
|
unsafe.putChar(dst, off + 6, src[i + 3]);
|
||||||
|
unsafe.putChar(dst, off + 8, src[i + 4]);
|
||||||
|
unsafe.putChar(dst, off + 10, src[i + 5]);
|
||||||
|
unsafe.putChar(dst, off + 12, src[i + 6]);
|
||||||
|
unsafe.putChar(dst, off + 14, src[i + 7]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Variant of copyCharToByte with a constant destination array offset. The
|
||||||
|
* loop should only be vectorized if unaligned memory operations are allowed
|
||||||
|
* because not both the LoadUS and the StoreC can be aligned.
|
||||||
|
*/
|
||||||
|
public static void copyCharToByteUnaligned(char[] src, byte[] dst) {
|
||||||
|
final int off = (int) BYTE_ARRAY_OFFSET + 2;
|
||||||
|
for (int i = 0; i < 100; i = i + 8) {
|
||||||
|
// Copy 8 chars from src to dst
|
||||||
|
unsafe.putChar(dst, off + 0, src[i + 0]);
|
||||||
|
unsafe.putChar(dst, off + 2, src[i + 1]);
|
||||||
|
unsafe.putChar(dst, off + 4, src[i + 2]);
|
||||||
|
unsafe.putChar(dst, off + 6, src[i + 3]);
|
||||||
|
unsafe.putChar(dst, off + 8, src[i + 4]);
|
||||||
|
unsafe.putChar(dst, off + 10, src[i + 5]);
|
||||||
|
unsafe.putChar(dst, off + 12, src[i + 6]);
|
||||||
|
unsafe.putChar(dst, off + 14, src[i + 7]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
Loading…
Add table
Add a link
Reference in a new issue