diff --git a/src/libcore/lib.rs b/src/libcore/lib.rs
index d21cfbcdfce..e226e9fa154 100644
--- a/src/libcore/lib.rs
+++ b/src/libcore/lib.rs
@@ -79,7 +79,7 @@
 #![feature(reflect)]
 #![feature(rustc_attrs)]
 #![cfg_attr(stage0, feature(simd))]
-#![cfg_attr(not(stage0), feature(repr_simd))]
+#![cfg_attr(not(stage0), feature(repr_simd, platform_intrinsics))]
 #![feature(staged_api)]
 #![feature(unboxed_closures)]
 
diff --git a/src/libcore/simd.rs b/src/libcore/simd.rs
index d58d0c50a89..fb39b3accc3 100644
--- a/src/libcore/simd.rs
+++ b/src/libcore/simd.rs
@@ -10,25 +10,12 @@
 
 //! SIMD vectors.
 //!
-//! These types can be used for accessing basic SIMD operations. Each of them
-//! implements the standard arithmetic operator traits (Add, Sub, Mul, Div,
-//! Rem, Shl, Shr) through compiler magic, rather than explicitly. Currently
+//! These types can be used for accessing basic SIMD operations. Currently
 //! comparison operators are not implemented. To use SSE3+, you must enable
 //! the features, like `-C target-feature=sse3,sse4.1,sse4.2`, or a more
 //! specific `target-cpu`. No other SIMD intrinsics or high-level wrappers are
 //! provided beyond this module.
 //!
-//! ```rust
-//! #![feature(core_simd)]
-//!
-//! fn main() {
-//!     use std::simd::f32x4;
-//!     let a = f32x4(40.0, 41.0, 42.0, 43.0);
-//!     let b = f32x4(1.0, 1.1, 3.4, 9.8);
-//!     println!("{:?}", a + b);
-//! }
-//! ```
-//!
 //! # Stability Note
 //!
 //! These are all experimental. The interface may change entirely, without
@@ -44,6 +31,30 @@
 #![allow(missing_docs)]
 #![allow(deprecated)]
 
+use ops::{Add, Sub, Mul, Div, Shl, Shr, BitAnd, BitOr, BitXor};
+
+// FIXME(stage0): the contents of this macro can be inlined.
+// ABIs are verified as valid as soon as they are parsed, i.e. before
+// `cfg` stripping. The `platform-intrinsic` ABI is new, so stage0
+// doesn't know about it and errors out as soon as it parses it
+// (despite this being in a `cfg(not(stage0))` module).
+macro_rules! argh {
+    () => {
+        extern "platform-intrinsic" {
+            fn simd_add<T>(x: T, y: T) -> T; // backs `Add` below
+            fn simd_sub<T>(x: T, y: T) -> T; // backs `Sub` below
+            fn simd_mul<T>(x: T, y: T) -> T; // backs `Mul` below
+            fn simd_div<T>(x: T, y: T) -> T; // backs `Div` below
+            fn simd_shl<T>(x: T, y: T) -> T; // backs `Shl` below
+            fn simd_shr<T>(x: T, y: T) -> T; // backs `Shr` below
+            fn simd_and<T>(x: T, y: T) -> T; // backs `BitAnd` below
+            fn simd_or<T>(x: T, y: T) -> T; // backs `BitOr` below
+            fn simd_xor<T>(x: T, y: T) -> T; // backs `BitXor` below
+        }
+    }
+}
+argh!(); // expands the intrinsic declarations into this module
+
 #[repr(simd)]
 #[derive(Copy, Clone, Debug)]
 #[repr(C)]
@@ -101,3 +112,32 @@ pub struct f32x4(pub f32, pub f32, pub f32, pub f32);
 #[derive(Copy, Clone, Debug)]
 #[repr(C)]
 pub struct f64x2(pub f64, pub f64);
+
+// For every `Trait, method, intrinsic: types;` row, implements `Trait` for each type via `intrinsic`.
+macro_rules! impl_traits {
+    ($($trayt: ident, $method: ident, $func: ident: $($ty: ty),*;)*) => {
+        $($(
+            impl $trayt<$ty> for $ty {
+                type Output = Self;
+                #[inline]
+                fn $method(self, other: Self) -> Self {
+                    unsafe { $func(self, other) }
+                }
+            }
+            )*)*
+    }
+}
+
+impl_traits! { // each row: `Trait, method, intrinsic: types...;`
+    Add, add, simd_add: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2, f32x4, f64x2;
+    Sub, sub, simd_sub: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2, f32x4, f64x2;
+    Mul, mul, simd_mul: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2, f32x4, f64x2;
+    // Division is only implemented for the floating-point vectors.
+    Div, div, simd_div: f32x4, f64x2;
+    // Shifts and bitwise operators are only implemented for the integer vectors.
+    Shl, shl, simd_shl: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2;
+    Shr, shr, simd_shr: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2;
+    BitAnd, bitand, simd_and: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2;
+    BitOr, bitor, simd_or: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2;
+    BitXor, bitxor, simd_xor: u8x16, u16x8, u32x4, u64x2, i8x16, i16x8, i32x4, i64x2;
+}
diff --git a/src/test/bench/shootout-spectralnorm.rs b/src/test/bench/shootout-spectralnorm.rs
index b3591477022..a6c77eaf7c6 100644
--- a/src/test/bench/shootout-spectralnorm.rs
+++ b/src/test/bench/shootout-spectralnorm.rs
@@ -91,7 +91,7 @@ fn mult<F>(v: &[f64], out: &mut [f64], start: usize, a: F)
         for (j, chunk) in v.chunks(2).enumerate().map(|(j, s)| (2 * j, s)) {
             let top = f64x2(chunk[0], chunk[1]);
             let bot = f64x2(a(i, j), a(i, j + 1));
-            sum += top / bot;
+            sum = sum + top / bot;
         }
         let f64x2(a, b) = sum;
         *slot = a + b;