diff options
Diffstat (limited to 'arch/x86')
208 files changed, 11361 insertions, 1378 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f49767716c70..cb9a1044a771 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -194,9 +194,6 @@ config NEED_PER_CPU_EMBED_FIRST_CHUNK config NEED_PER_CPU_PAGE_FIRST_CHUNK def_bool y -config HAVE_CPUMASK_OF_CPU_MAP - def_bool X86_64_SMP - config ARCH_HIBERNATION_POSSIBLE def_bool y @@ -1454,6 +1451,15 @@ config ARCH_USES_PG_UNCACHED def_bool y depends on X86_PAT +config ARCH_RANDOM + def_bool y + prompt "x86 architectural random number generator" if EXPERT + ---help--- + Enable the x86 architectural RDRAND instruction + (Intel Bull Mountain technology) to generate random numbers. + If supported, this is a high bandwidth, cryptographically + secure hardware random number generator. + config EFI bool "EFI runtime service support" depends on ACPI @@ -2066,6 +2072,20 @@ config OLPC_XO15_SCI - AC adapter status updates - Battery status updates +config ALIX + bool "PCEngines ALIX System Support (LED setup)" + select GPIOLIB + ---help--- + This option enables system support for the PCEngines ALIX. + At present this just sets up LEDs for GPIO control on + ALIX2/3/6 boards. However, other system specific setup should + get added here. + + Note: You must still enable the drivers for GPIO and LED support + (GPIO_CS5535 & LEDS_GPIO) to actually use the LEDs + + Note: You have to set alix.force=1 for boards with Award BIOS. + endif # X86_32 config AMD_NB diff --git a/arch/x86/Makefile.um b/arch/x86/Makefile.um new file mode 100644 index 000000000000..36ddec6a41c9 --- /dev/null +++ b/arch/x86/Makefile.um @@ -0,0 +1,61 @@ +core-y += arch/x86/crypto/ + +ifeq ($(CONFIG_X86_32),y) +START := 0x8048000 + +LDFLAGS += -m elf_i386 +ELF_ARCH := i386 +ELF_FORMAT := elf32-i386 +CHECKFLAGS += -D__i386__ + +ifeq ("$(origin SUBARCH)", "command line") +ifneq ("$(shell uname -m | sed -e s/i.86/i386/)", "$(SUBARCH)") +KBUILD_CFLAGS += $(call cc-option,-m32) +KBUILD_AFLAGS += $(call cc-option,-m32) +LINK-y += $(call cc-option,-m32) + +export LDFLAGS +endif +endif + +# First of all, tune CFLAGS for the specific CPU. This actually sets cflags-y. +include $(srctree)/arch/x86/Makefile_32.cpu + +# prevent gcc from keeping the stack 16 byte aligned. Taken from i386. +cflags-y += $(call cc-option,-mpreferred-stack-boundary=2) + +# Prevent sprintf in nfsd from being converted to strcpy and resulting in +# an unresolved reference. +cflags-y += -ffreestanding + +# Disable unit-at-a-time mode on pre-gcc-4.0 compilers, it makes gcc use +# a lot more stack due to the lack of sharing of stacklots. Also, gcc +# 4.3.0 needs -funit-at-a-time for extern inline functions. +KBUILD_CFLAGS += $(shell if [ $(call cc-version) -lt 0400 ] ; then \ + echo $(call cc-option,-fno-unit-at-a-time); \ + else echo $(call cc-option,-funit-at-a-time); fi ;) + +KBUILD_CFLAGS += $(cflags-y) + +else + +START := 0x60000000 + +KBUILD_CFLAGS += -fno-builtin -m64 + +CHECKFLAGS += -m64 -D__x86_64__ +KBUILD_AFLAGS += -m64 +LDFLAGS += -m elf_x86_64 +KBUILD_CPPFLAGS += -m64 + +ELF_ARCH := i386:x86-64 +ELF_FORMAT := elf64-x86-64 + +# Not on all 64-bit distros /lib is a symlink to /lib64. PLD is an example. + +LINK-$(CONFIG_LD_SCRIPT_DYN) += -Wl,-rpath,/lib64 +LINK-y += -m64 + +# Do unit-at-a-time unconditionally on x86_64, following the host +KBUILD_CFLAGS += $(call cc-option,-funit-at-a-time) +endif diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile index c04f1b7a9139..3537d4b91f74 100644 --- a/arch/x86/crypto/Makefile +++ b/arch/x86/crypto/Makefile @@ -7,21 +7,33 @@ obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o +obj-$(CONFIG_CRYPTO_BLOWFISH_X86_64) += blowfish-x86_64.o obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o +obj-$(CONFIG_CRYPTO_TWOFISH_X86_64_3WAY) += twofish-x86_64-3way.o obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o +obj-$(CONFIG_CRYPTO_SHA1_SSSE3) += sha1-ssse3.o aes-i586-y := aes-i586-asm_32.o aes_glue.o twofish-i586-y := twofish-i586-asm_32.o twofish_glue.o salsa20-i586-y := salsa20-i586-asm_32.o salsa20_glue.o aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o +blowfish-x86_64-y := blowfish-x86_64-asm_64.o blowfish_glue.o twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o +twofish-x86_64-3way-y := twofish-x86_64-asm_64-3way.o twofish_glue_3way.o salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o + +# enable AVX support only when $(AS) can actually assemble the instructions +ifeq ($(call as-instr,vpxor %xmm0$(comma)%xmm1$(comma)%xmm2,yes,no),yes) +AFLAGS_sha1_ssse3_asm.o += -DSHA1_ENABLE_AVX_SUPPORT +CFLAGS_sha1_ssse3_glue.o += -DSHA1_ENABLE_AVX_SUPPORT +endif +sha1-ssse3-y := sha1_ssse3_asm.o sha1_ssse3_glue.o diff --git a/arch/x86/crypto/aes_glue.c b/arch/x86/crypto/aes_glue.c index 49ae9fe32b22..8efcf42a9d7e 100644 --- a/arch/x86/crypto/aes_glue.c +++ b/arch/x86/crypto/aes_glue.c @@ -3,7 +3,9 @@ * */ +#include <linux/module.h> #include <crypto/aes.h> +#include <asm/aes.h> asmlinkage void aes_enc_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); asmlinkage void aes_dec_blk(struct crypto_aes_ctx *ctx, u8 *out, const u8 *in); diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index feee8ff1d05e..545d0ce59818 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -22,6 +22,7 @@ #include <linux/hardirq.h> #include <linux/types.h> #include <linux/crypto.h> +#include <linux/module.h> #include <linux/err.h> #include <crypto/algapi.h> #include <crypto/aes.h> diff --git a/arch/x86/crypto/blowfish-x86_64-asm_64.S b/arch/x86/crypto/blowfish-x86_64-asm_64.S new file mode 100644 index 000000000000..391d245dc086 --- /dev/null +++ b/arch/x86/crypto/blowfish-x86_64-asm_64.S @@ -0,0 +1,390 @@ +/* + * Blowfish Cipher Algorithm (x86_64) + * + * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "blowfish-x86_64-asm.S" +.text + +/* structure of crypto context */ +#define p 0 +#define s0 ((16 + 2) * 4) +#define s1 ((16 + 2 + (1 * 256)) * 4) +#define s2 ((16 + 2 + (2 * 256)) * 4) +#define s3 ((16 + 2 + (3 * 256)) * 4) + +/* register macros */ +#define CTX %rdi +#define RIO %rsi + +#define RX0 %rax +#define RX1 %rbx +#define RX2 %rcx +#define RX3 %rdx + +#define RX0d %eax +#define RX1d %ebx +#define RX2d %ecx +#define RX3d %edx + +#define RX0bl %al +#define RX1bl %bl +#define RX2bl %cl +#define RX3bl %dl + +#define RX0bh %ah +#define RX1bh %bh +#define RX2bh %ch +#define RX3bh %dh + +#define RT0 %rbp +#define RT1 %rsi +#define RT2 %r8 +#define RT3 %r9 + +#define RT0d %ebp +#define RT1d %esi +#define RT2d %r8d +#define RT3d %r9d + +#define RKEY %r10 + +/*********************************************************************** + * 1-way blowfish + ***********************************************************************/ +#define F() \ + rorq $16, RX0; \ + movzbl RX0bh, RT0d; \ + movzbl RX0bl, RT1d; \ + rolq $16, RX0; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT1,4), RT0d; \ + movzbl RX0bh, RT1d; \ + movzbl RX0bl, RT2d; \ + rolq $32, RX0; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT2,4), RT0d; \ + xorq RT0, RX0; + +#define add_roundkey_enc(n) \ + xorq p+4*(n)(CTX), RX0; + +#define round_enc(n) \ + add_roundkey_enc(n); \ + \ + F(); \ + F(); + +#define add_roundkey_dec(n) \ + movq p+4*(n-1)(CTX), RT0; \ + rorq $32, RT0; \ + xorq RT0, RX0; + +#define round_dec(n) \ + add_roundkey_dec(n); \ + \ + F(); \ + F(); \ + +#define read_block() \ + movq (RIO), RX0; \ + rorq $32, RX0; \ + bswapq RX0; + +#define write_block() \ + bswapq RX0; \ + movq RX0, (RIO); + +#define xor_block() \ + bswapq RX0; \ + xorq RX0, (RIO); + +.align 8 +.global __blowfish_enc_blk +.type __blowfish_enc_blk,@function; + +__blowfish_enc_blk: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + movq %rbp, %r11; + + movq %rsi, %r10; + movq %rdx, RIO; + + read_block(); + + round_enc(0); + round_enc(2); + round_enc(4); + round_enc(6); + round_enc(8); + round_enc(10); + round_enc(12); + round_enc(14); + add_roundkey_enc(16); + + movq %r11, %rbp; + + movq %r10, RIO; + test %cl, %cl; + jnz __enc_xor; + + write_block(); + ret; +__enc_xor: + xor_block(); + ret; + +.align 8 +.global blowfish_dec_blk +.type blowfish_dec_blk,@function; + +blowfish_dec_blk: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + movq %rbp, %r11; + + movq %rsi, %r10; + movq %rdx, RIO; + + read_block(); + + round_dec(17); + round_dec(15); + round_dec(13); + round_dec(11); + round_dec(9); + round_dec(7); + round_dec(5); + round_dec(3); + add_roundkey_dec(1); + + movq %r10, RIO; + write_block(); + + movq %r11, %rbp; + + ret; + +/********************************************************************** + 4-way blowfish, four blocks parallel + **********************************************************************/ + +/* F() for 4-way. Slower when used alone/1-way, but faster when used + * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330). + */ +#define F4(x) \ + movzbl x ## bh, RT1d; \ + movzbl x ## bl, RT3d; \ + rorq $16, x; \ + movzbl x ## bh, RT0d; \ + movzbl x ## bl, RT2d; \ + rorq $16, x; \ + movl s0(CTX,RT0,4), RT0d; \ + addl s1(CTX,RT2,4), RT0d; \ + xorl s2(CTX,RT1,4), RT0d; \ + addl s3(CTX,RT3,4), RT0d; \ + xorq RT0, x; + +#define add_preloaded_roundkey4() \ + xorq RKEY, RX0; \ + xorq RKEY, RX1; \ + xorq RKEY, RX2; \ + xorq RKEY, RX3; + +#define preload_roundkey_enc(n) \ + movq p+4*(n)(CTX), RKEY; + +#define add_roundkey_enc4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_enc(n + 2); + +#define round_enc4(n) \ + add_roundkey_enc4(n); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); + +#define preload_roundkey_dec(n) \ + movq p+4*((n)-1)(CTX), RKEY; \ + rorq $32, RKEY; + +#define add_roundkey_dec4(n) \ + add_preloaded_roundkey4(); \ + preload_roundkey_dec(n - 2); + +#define round_dec4(n) \ + add_roundkey_dec4(n); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); \ + \ + F4(RX0); \ + F4(RX1); \ + F4(RX2); \ + F4(RX3); + +#define read_block4() \ + movq (RIO), RX0; \ + rorq $32, RX0; \ + bswapq RX0; \ + \ + movq 8(RIO), RX1; \ + rorq $32, RX1; \ + bswapq RX1; \ + \ + movq 16(RIO), RX2; \ + rorq $32, RX2; \ + bswapq RX2; \ + \ + movq 24(RIO), RX3; \ + rorq $32, RX3; \ + bswapq RX3; + +#define write_block4() \ + bswapq RX0; \ + movq RX0, (RIO); \ + \ + bswapq RX1; \ + movq RX1, 8(RIO); \ + \ + bswapq RX2; \ + movq RX2, 16(RIO); \ + \ + bswapq RX3; \ + movq RX3, 24(RIO); + +#define xor_block4() \ + bswapq RX0; \ + xorq RX0, (RIO); \ + \ + bswapq RX1; \ + xorq RX1, 8(RIO); \ + \ + bswapq RX2; \ + xorq RX2, 16(RIO); \ + \ + bswapq RX3; \ + xorq RX3, 24(RIO); + +.align 8 +.global __blowfish_enc_blk_4way +.type __blowfish_enc_blk_4way,@function; + +__blowfish_enc_blk_4way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + * %rcx: bool, if true: xor output + */ + pushq %rbp; + pushq %rbx; + pushq %rcx; + + preload_roundkey_enc(0); + + movq %rsi, %r11; + movq %rdx, RIO; + + read_block4(); + + round_enc4(0); + round_enc4(2); + round_enc4(4); + round_enc4(6); + round_enc4(8); + round_enc4(10); + round_enc4(12); + round_enc4(14); + add_preloaded_roundkey4(); + + popq %rbp; + movq %r11, RIO; + + test %bpl, %bpl; + jnz __enc_xor4; + + write_block4(); + + popq %rbx; + popq %rbp; + ret; + +__enc_xor4: + xor_block4(); + + popq %rbx; + popq %rbp; + ret; + +.align 8 +.global blowfish_dec_blk_4way +.type blowfish_dec_blk_4way,@function; + +blowfish_dec_blk_4way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src + */ + pushq %rbp; + pushq %rbx; + preload_roundkey_dec(17); + + movq %rsi, %r11; + movq %rdx, RIO; + + read_block4(); + + round_dec4(17); + round_dec4(15); + round_dec4(13); + round_dec4(11); + round_dec4(9); + round_dec4(7); + round_dec4(5); + round_dec4(3); + add_preloaded_roundkey4(); + + movq %r11, RIO; + write_block4(); + + popq %rbx; + popq %rbp; + + ret; + diff --git a/arch/x86/crypto/blowfish_glue.c b/arch/x86/crypto/blowfish_glue.c new file mode 100644 index 000000000000..b05aa163d55a --- /dev/null +++ b/arch/x86/crypto/blowfish_glue.c @@ -0,0 +1,492 @@ +/* + * Glue Code for assembler optimized version of Blowfish + * + * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include <crypto/blowfish.h> +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <crypto/algapi.h> + +/* regular block cipher functions */ +asmlinkage void __blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src, + bool xor); +asmlinkage void blowfish_dec_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src); + +/* 4-way parallel cipher functions */ +asmlinkage void __blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void blowfish_dec_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void blowfish_enc_blk(struct bf_ctx *ctx, u8 *dst, const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk(ctx, dst, src, true); +} + +static inline void blowfish_enc_blk_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, false); +} + +static inline void blowfish_enc_blk_xor_4way(struct bf_ctx *ctx, u8 *dst, + const u8 *src) +{ + __blowfish_enc_blk_4way(ctx, dst, src, true); +} + +static void blowfish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + blowfish_enc_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static void blowfish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) +{ + blowfish_dec_blk(crypto_tfm_ctx(tfm), dst, src); +} + +static struct crypto_alg bf_alg = { + .cra_name = "blowfish", + .cra_driver_name = "blowfish-asm", + .cra_priority = 200, + .cra_flags = CRYPTO_ALG_TYPE_CIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 3, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(bf_alg.cra_list), + .cra_u = { + .cipher = { + .cia_min_keysize = BF_MIN_KEY_SIZE, + .cia_max_keysize = BF_MAX_KEY_SIZE, + .cia_setkey = blowfish_setkey, + .cia_encrypt = blowfish_encrypt, + .cia_decrypt = blowfish_decrypt, + } + } +}; + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + void (*fn)(struct bf_ctx *, u8 *, const u8 *), + void (*fn_4way)(struct bf_ctx *, u8 *, const u8 *)) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + /* Process four block batch */ + if (nbytes >= bsize * 4) { + do { + fn_4way(ctx, wdst, wsrc); + + wsrc += bsize * 4; + wdst += bsize * 4; + nbytes -= bsize * 4; + } while (nbytes >= bsize * 4); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + fn(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, blowfish_enc_blk, blowfish_enc_blk_4way); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, blowfish_dec_blk, blowfish_dec_blk_4way); +} + +static struct crypto_alg blk_ecb_alg = { + .cra_name = "ecb(blowfish)", + .cra_driver_name = "ecb-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .setkey = blowfish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}; + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 *iv = (u64 *)walk->iv; + + do { + *dst = *src ^ *iv; + blowfish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + *(u64 *)walk->iv = *iv; + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ivs[4 - 1]; + u64 last_iv; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process four block batch */ + if (nbytes >= bsize * 4) { + do { + nbytes -= bsize * 4 - bsize; + src -= 4 - 1; + dst -= 4 - 1; + + ivs[0] = src[0]; + ivs[1] = src[1]; + ivs[2] = src[2]; + + blowfish_dec_blk_4way(ctx, (u8 *)dst, (u8 *)src); + + dst[1] ^= ivs[0]; + dst[2] ^= ivs[1]; + dst[3] ^= ivs[2]; + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 4); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + blowfish_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + *dst ^= *(src - 1); + src -= 1; + dst -= 1; + } + +done: + *dst ^= *(u64 *)walk->iv; + *(u64 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static struct crypto_alg blk_cbc_alg = { + .cra_name = "cbc(blowfish)", + .cra_driver_name = "cbc-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = BF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .ivsize = BF_BLOCK_SIZE, + .setkey = blowfish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}; + +static void ctr_crypt_final(struct bf_ctx *ctx, struct blkcipher_walk *walk) +{ + u8 *ctrblk = walk->iv; + u8 keystream[BF_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + blowfish_enc_blk(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, BF_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct bf_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = BF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u64 *src = (u64 *)walk->src.virt.addr; + u64 *dst = (u64 *)walk->dst.virt.addr; + u64 ctrblk = be64_to_cpu(*(__be64 *)walk->iv); + __be64 ctrblocks[4]; + + /* Process four block batch */ + if (nbytes >= bsize * 4) { + do { + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + dst[3] = src[3]; + } + + /* create ctrblks for parallel encrypt */ + ctrblocks[0] = cpu_to_be64(ctrblk++); + ctrblocks[1] = cpu_to_be64(ctrblk++); + ctrblocks[2] = cpu_to_be64(ctrblk++); + ctrblocks[3] = cpu_to_be64(ctrblk++); + + blowfish_enc_blk_xor_4way(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += 4; + dst += 4; + } while ((nbytes -= bsize * 4) >= bsize * 4); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + ctrblocks[0] = cpu_to_be64(ctrblk++); + + blowfish_enc_blk_xor(ctx, (u8 *)dst, (u8 *)ctrblocks); + + src += 1; + dst += 1; + } while ((nbytes -= bsize) >= bsize); + +done: + *(__be64 *)walk->iv = cpu_to_be64(ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, BF_BLOCK_SIZE); + + while ((nbytes = walk.nbytes) >= BF_BLOCK_SIZE) { + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + if (walk.nbytes) { + ctr_crypt_final(crypto_blkcipher_ctx(desc->tfm), &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "ctr(blowfish)", + .cra_driver_name = "ctr-blowfish-asm", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct bf_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = BF_MIN_KEY_SIZE, + .max_keysize = BF_MAX_KEY_SIZE, + .ivsize = BF_BLOCK_SIZE, + .setkey = blowfish_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + +static int __init init(void) +{ + int err; + + err = crypto_register_alg(&bf_alg); + if (err) + goto bf_err; + err = crypto_register_alg(&blk_ecb_alg); + if (err) + goto ecb_err; + err = crypto_register_alg(&blk_cbc_alg); + if (err) + goto cbc_err; + err = crypto_register_alg(&blk_ctr_alg); + if (err) + goto ctr_err; + + return 0; + +ctr_err: + crypto_unregister_alg(&blk_cbc_alg); +cbc_err: + crypto_unregister_alg(&blk_ecb_alg); +ecb_err: + crypto_unregister_alg(&bf_alg); +bf_err: + return err; +} + +static void __exit fini(void) +{ + crypto_unregister_alg(&blk_ctr_alg); + crypto_unregister_alg(&blk_cbc_alg); + crypto_unregister_alg(&blk_ecb_alg); + crypto_unregister_alg(&bf_alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Blowfish Cipher Algorithm, asm optimized"); +MODULE_ALIAS("blowfish"); +MODULE_ALIAS("blowfish-asm"); diff --git a/arch/x86/crypto/sha1_ssse3_asm.S b/arch/x86/crypto/sha1_ssse3_asm.S new file mode 100644 index 000000000000..b2c2f57d70e8 --- /dev/null +++ b/arch/x86/crypto/sha1_ssse3_asm.S @@ -0,0 +1,558 @@ +/* + * This is a SIMD SHA-1 implementation. It requires the Intel(R) Supplemental + * SSE3 instruction set extensions introduced in Intel Core Microarchitecture + * processors. CPUs supporting Intel(R) AVX extensions will get an additional + * boost. + * + * This work was inspired by the vectorized implementation of Dean Gaudet. + * Additional information on it can be found at: + * http://www.arctic.org/~dean/crypto/sha1.html + * + * It was improved upon with more efficient vectorization of the message + * scheduling. This implementation has also been optimized for all current and + * several future generations of Intel CPUs. + * + * See this article for more information about the implementation details: + * http://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1/ + * + * Copyright (C) 2010, Intel Corp. + * Authors: Maxim Locktyukhin <maxim.locktyukhin@intel.com> + * Ronen Zohar <ronen.zohar@intel.com> + * + * Converted to AT&T syntax and adapted for inclusion in the Linux kernel: + * Author: Mathias Krause <minipli@googlemail.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#define CTX %rdi // arg1 +#define BUF %rsi // arg2 +#define CNT %rdx // arg3 + +#define REG_A %ecx +#define REG_B %esi +#define REG_C %edi +#define REG_D %ebp +#define REG_E %edx + +#define REG_T1 %eax +#define REG_T2 %ebx + +#define K_BASE %r8 +#define HASH_PTR %r9 +#define BUFFER_PTR %r10 +#define BUFFER_END %r11 + +#define W_TMP1 %xmm0 +#define W_TMP2 %xmm9 + +#define W0 %xmm1 +#define W4 %xmm2 +#define W8 %xmm3 +#define W12 %xmm4 +#define W16 %xmm5 +#define W20 %xmm6 +#define W24 %xmm7 +#define W28 %xmm8 + +#define XMM_SHUFB_BSWAP %xmm10 + +/* we keep window of 64 w[i]+K pre-calculated values in a circular buffer */ +#define WK(t) (((t) & 15) * 4)(%rsp) +#define W_PRECALC_AHEAD 16 + +/* + * This macro implements the SHA-1 function's body for single 64-byte block + * param: function's name + */ +.macro SHA1_VECTOR_ASM name + .global \name + .type \name, @function + .align 32 +\name: + push %rbx + push %rbp + push %r12 + + mov %rsp, %r12 + sub $64, %rsp # allocate workspace + and $~15, %rsp # align stack + + mov CTX, HASH_PTR + mov BUF, BUFFER_PTR + + shl $6, CNT # multiply by 64 + add BUF, CNT + mov CNT, BUFFER_END + + lea K_XMM_AR(%rip), K_BASE + xmm_mov BSWAP_SHUFB_CTL(%rip), XMM_SHUFB_BSWAP + + SHA1_PIPELINED_MAIN_BODY + + # cleanup workspace + mov $8, %ecx + mov %rsp, %rdi + xor %rax, %rax + rep stosq + + mov %r12, %rsp # deallocate workspace + + pop %r12 + pop %rbp + pop %rbx + ret + + .size \name, .-\name +.endm + +/* + * This macro implements 80 rounds of SHA-1 for one 64-byte block + */ +.macro SHA1_PIPELINED_MAIN_BODY + INIT_REGALLOC + + mov (HASH_PTR), A + mov 4(HASH_PTR), B + mov 8(HASH_PTR), C + mov 12(HASH_PTR), D + mov 16(HASH_PTR), E + + .set i, 0 + .rept W_PRECALC_AHEAD + W_PRECALC i + .set i, (i+1) + .endr + +.align 4 +1: + RR F1,A,B,C,D,E,0 + RR F1,D,E,A,B,C,2 + RR F1,B,C,D,E,A,4 + RR F1,E,A,B,C,D,6 + RR F1,C,D,E,A,B,8 + + RR F1,A,B,C,D,E,10 + RR F1,D,E,A,B,C,12 + RR F1,B,C,D,E,A,14 + RR F1,E,A,B,C,D,16 + RR F1,C,D,E,A,B,18 + + RR F2,A,B,C,D,E,20 + RR F2,D,E,A,B,C,22 + RR F2,B,C,D,E,A,24 + RR F2,E,A,B,C,D,26 + RR F2,C,D,E,A,B,28 + + RR F2,A,B,C,D,E,30 + RR F2,D,E,A,B,C,32 + RR F2,B,C,D,E,A,34 + RR F2,E,A,B,C,D,36 + RR F2,C,D,E,A,B,38 + + RR F3,A,B,C,D,E,40 + RR F3,D,E,A,B,C,42 + RR F3,B,C,D,E,A,44 + RR F3,E,A,B,C,D,46 + RR F3,C,D,E,A,B,48 + + RR F3,A,B,C,D,E,50 + RR F3,D,E,A,B,C,52 + RR F3,B,C,D,E,A,54 + RR F3,E,A,B,C,D,56 + RR F3,C,D,E,A,B,58 + + add $64, BUFFER_PTR # move to the next 64-byte block + cmp BUFFER_END, BUFFER_PTR # if the current is the last one use + cmovae K_BASE, BUFFER_PTR # dummy source to avoid buffer overrun + + RR F4,A,B,C,D,E,60 + RR F4,D,E,A,B,C,62 + RR F4,B,C,D,E,A,64 + RR F4,E,A,B,C,D,66 + RR F4,C,D,E,A,B,68 + + RR F4,A,B,C,D,E,70 + RR F4,D,E,A,B,C,72 + RR F4,B,C,D,E,A,74 + RR F4,E,A,B,C,D,76 + RR F4,C,D,E,A,B,78 + + UPDATE_HASH (HASH_PTR), A + UPDATE_HASH 4(HASH_PTR), B + UPDATE_HASH 8(HASH_PTR), C + UPDATE_HASH 12(HASH_PTR), D + UPDATE_HASH 16(HASH_PTR), E + + RESTORE_RENAMED_REGS + cmp K_BASE, BUFFER_PTR # K_BASE means, we reached the end + jne 1b +.endm + +.macro INIT_REGALLOC + .set A, REG_A + .set B, REG_B + .set C, REG_C + .set D, REG_D + .set E, REG_E + .set T1, REG_T1 + .set T2, REG_T2 +.endm + +.macro RESTORE_RENAMED_REGS + # order is important (REG_C is where it should be) + mov B, REG_B + mov D, REG_D + mov A, REG_A + mov E, REG_E +.endm + +.macro SWAP_REG_NAMES a, b + .set _T, \a + .set \a, \b + .set \b, _T +.endm + +.macro F1 b, c, d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + xor \d, T1 + and \b, T1 + xor \d, T1 +.endm + +.macro F2 b, c, d + mov \d, T1 + SWAP_REG_NAMES \d, T1 + xor \c, T1 + xor \b, T1 +.endm + +.macro F3 b, c ,d + mov \c, T1 + SWAP_REG_NAMES \c, T1 + mov \b, T2 + or \b, T1 + and \c, T2 + and \d, T1 + or T2, T1 +.endm + +.macro F4 b, c, d + F2 \b, \c, \d +.endm + +.macro UPDATE_HASH hash, val + add \hash, \val + mov \val, \hash +.endm + +/* + * RR does two rounds of SHA-1 back to back with W[] pre-calc + * t1 = F(b, c, d); e += w(i) + * e += t1; b <<= 30; d += w(i+1); + * t1 = F(a, b, c); + * d += t1; a <<= 5; + * e += a; + * t1 = e; a >>= 7; + * t1 <<= 5; + * d += t1; + */ +.macro RR F, a, b, c, d, e, round + add WK(\round), \e + \F \b, \c, \d # t1 = F(b, c, d); + W_PRECALC (\round + W_PRECALC_AHEAD) + rol $30, \b + add T1, \e + add WK(\round + 1), \d + + \F \a, \b, \c + W_PRECALC (\round + W_PRECALC_AHEAD + 1) + rol $5, \a + add \a, \e + add T1, \d + ror $7, \a # (a <<r 5) >>r 7) => a <<r 30) + + mov \e, T1 + SWAP_REG_NAMES \e, T1 + + rol $5, T1 + add T1, \d + + # write: \a, \b + # rotate: \a<=\d, \b<=\e, \c<=\a, \d<=\b, \e<=\c +.endm + +.macro W_PRECALC r + .set i, \r + + .if (i < 20) + .set K_XMM, 0 + .elseif (i < 40) + .set K_XMM, 16 + .elseif (i < 60) + .set K_XMM, 32 + .elseif (i < 80) + .set K_XMM, 48 + .endif + + .if ((i < 16) || ((i >= 80) && (i < (80 + W_PRECALC_AHEAD)))) + .set i, ((\r) % 80) # pre-compute for the next iteration + .if (i == 0) + W_PRECALC_RESET + .endif + W_PRECALC_00_15 + .elseif (i<32) + W_PRECALC_16_31 + .elseif (i < 80) // rounds 32-79 + W_PRECALC_32_79 + .endif +.endm + +.macro W_PRECALC_RESET + .set W, W0 + .set W_minus_04, W4 + .set W_minus_08, W8 + .set W_minus_12, W12 + .set W_minus_16, W16 + .set W_minus_20, W20 + .set W_minus_24, W24 + .set W_minus_28, W28 + .set W_minus_32, W +.endm + +.macro W_PRECALC_ROTATE + .set W_minus_32, W_minus_28 + .set W_minus_28, W_minus_24 + .set W_minus_24, W_minus_20 + .set W_minus_20, W_minus_16 + .set W_minus_16, W_minus_12 + .set W_minus_12, W_minus_08 + .set W_minus_08, W_minus_04 + .set W_minus_04, W + .set W, W_minus_32 +.endm + +.macro W_PRECALC_SSSE3 + +.macro W_PRECALC_00_15 + W_PRECALC_00_15_SSSE3 +.endm +.macro W_PRECALC_16_31 + W_PRECALC_16_31_SSSE3 +.endm +.macro W_PRECALC_32_79 + W_PRECALC_32_79_SSSE3 +.endm + +/* message scheduling pre-compute for rounds 0-15 */ +.macro W_PRECALC_00_15_SSSE3 + .if ((i & 3) == 0) + movdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + pshufb XMM_SHUFB_BSWAP, W_TMP1 + movdqa W_TMP1, W + .elseif ((i & 3) == 2) + paddd (K_BASE), W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 16-31 + * + * - calculating last 32 w[i] values in 8 XMM registers + * - pre-calculate K+w[i] values and store to mem, for later load by ALU add + * instruction + * + * some "heavy-lifting" vectorization for rounds 16-31 due to w[i]->w[i-3] + * dependency, but improves for 32-79 + */ +.macro W_PRECALC_16_31_SSSE3 + # blended scheduling of vector and scalar instruction streams, one 4-wide + # vector iteration / 4 scalar rounds + .if ((i & 3) == 0) + movdqa W_minus_12, W + palignr $8, W_minus_16, W # w[i-14] + movdqa W_minus_04, W_TMP1 + psrldq $4, W_TMP1 # w[i-3] + pxor W_minus_08, W + .elseif ((i & 3) == 1) + pxor W_minus_16, W_TMP1 + pxor W_TMP1, W + movdqa W, W_TMP2 + movdqa W, W_TMP1 + pslldq $12, W_TMP2 + .elseif ((i & 3) == 2) + psrld $31, W + pslld $1, W_TMP1 + por W, W_TMP1 + movdqa W_TMP2, W + psrld $30, W_TMP2 + pslld $2, W + .elseif ((i & 3) == 3) + pxor W, W_TMP1 + pxor W_TMP2, W_TMP1 + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +/* message scheduling pre-compute for rounds 32-79 + * + * in SHA-1 specification: w[i] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) rol 1 + * instead we do equal: w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]) rol 2 + * allows more efficient vectorization since w[i]=>w[i-3] dependency is broken + */ +.macro W_PRECALC_32_79_SSSE3 + .if ((i & 3) == 0) + movdqa W_minus_04, W_TMP1 + pxor W_minus_28, W # W is W_minus_32 before xor + palignr $8, W_minus_08, W_TMP1 + .elseif ((i & 3) == 1) + pxor W_minus_16, W + pxor W_TMP1, W + movdqa W, W_TMP1 + .elseif ((i & 3) == 2) + psrld $30, W + pslld $2, W_TMP1 + por W, W_TMP1 + .elseif ((i & 3) == 3) + movdqa W_TMP1, W + paddd K_XMM(K_BASE), W_TMP1 + movdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_SSSE3 + + +#define K1 0x5a827999 +#define K2 0x6ed9eba1 +#define K3 0x8f1bbcdc +#define K4 0xca62c1d6 + +.section .rodata +.align 16 + +K_XMM_AR: + .long K1, K1, K1, K1 + .long K2, K2, K2, K2 + .long K3, K3, K3, K3 + .long K4, K4, K4, K4 + +BSWAP_SHUFB_CTL: + .long 0x00010203 + .long 0x04050607 + .long 0x08090a0b + .long 0x0c0d0e0f + + +.section .text + +W_PRECALC_SSSE3 +.macro xmm_mov a, b + movdqu \a,\b +.endm + +/* SSSE3 optimized implementation: + * extern "C" void sha1_transform_ssse3(u32 *digest, const char *data, u32 *ws, + * unsigned int rounds); + */ +SHA1_VECTOR_ASM sha1_transform_ssse3 + +#ifdef SHA1_ENABLE_AVX_SUPPORT + +.macro W_PRECALC_AVX + +.purgem W_PRECALC_00_15 +.macro W_PRECALC_00_15 + W_PRECALC_00_15_AVX +.endm +.purgem W_PRECALC_16_31 +.macro W_PRECALC_16_31 + W_PRECALC_16_31_AVX +.endm +.purgem W_PRECALC_32_79 +.macro W_PRECALC_32_79 + W_PRECALC_32_79_AVX +.endm + +.macro W_PRECALC_00_15_AVX + .if ((i & 3) == 0) + vmovdqu (i*4)(BUFFER_PTR), W_TMP1 + .elseif ((i & 3) == 1) + vpshufb XMM_SHUFB_BSWAP, W_TMP1, W + .elseif ((i & 3) == 2) + vpaddd (K_BASE), W, W_TMP1 + .elseif ((i & 3) == 3) + vmovdqa W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_16_31_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_16, W_minus_12, W # w[i-14] + vpsrldq $4, W_minus_04, W_TMP1 # w[i-3] + vpxor W_minus_08, W, W + vpxor W_minus_16, W_TMP1, W_TMP1 + .elseif ((i & 3) == 1) + vpxor W_TMP1, W, W + vpslldq $12, W, W_TMP2 + vpslld $1, W, W_TMP1 + .elseif ((i & 3) == 2) + vpsrld $31, W, W + vpor W, W_TMP1, W_TMP1 + vpslld $2, W_TMP2, W + vpsrld $30, W_TMP2, W_TMP2 + .elseif ((i & 3) == 3) + vpxor W, W_TMP1, W_TMP1 + vpxor W_TMP2, W_TMP1, W + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.macro W_PRECALC_32_79_AVX + .if ((i & 3) == 0) + vpalignr $8, W_minus_08, W_minus_04, W_TMP1 + vpxor W_minus_28, W, W # W is W_minus_32 before xor + .elseif ((i & 3) == 1) + vpxor W_minus_16, W_TMP1, W_TMP1 + vpxor W_TMP1, W, W + .elseif ((i & 3) == 2) + vpslld $2, W, W_TMP1 + vpsrld $30, W, W + vpor W, W_TMP1, W + .elseif ((i & 3) == 3) + vpaddd K_XMM(K_BASE), W, W_TMP1 + vmovdqu W_TMP1, WK(i&~3) + W_PRECALC_ROTATE + .endif +.endm + +.endm // W_PRECALC_AVX + +W_PRECALC_AVX +.purgem xmm_mov +.macro xmm_mov a, b + vmovdqu \a,\b +.endm + + +/* AVX optimized implementation: + * extern "C" void sha1_transform_avx(u32 *digest, const char *data, u32 *ws, + * unsigned int rounds); + */ +SHA1_VECTOR_ASM sha1_transform_avx + +#endif diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c new file mode 100644 index 000000000000..f916499d0abe --- /dev/null +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -0,0 +1,240 @@ +/* + * Cryptographic API. + * + * Glue code for the SHA1 Secure Hash Algorithm assembler implementation using + * Supplemental SSE3 instructions. + * + * This file is based on sha1_generic.c + * + * Copyright (c) Alan Smithee. + * Copyright (c) Andrew McDonald <andrew@mcdonald.org.uk> + * Copyright (c) Jean-Francois Dive <jef@linuxbe.org> + * Copyright (c) Mathias Krause <minipli@googlemail.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <crypto/internal/hash.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/mm.h> +#include <linux/cryptohash.h> +#include <linux/types.h> +#include <crypto/sha.h> +#include <asm/byteorder.h> +#include <asm/i387.h> +#include <asm/xcr.h> +#include <asm/xsave.h> + + +asmlinkage void sha1_transform_ssse3(u32 *digest, const char *data, + unsigned int rounds); +#ifdef SHA1_ENABLE_AVX_SUPPORT +asmlinkage void sha1_transform_avx(u32 *digest, const char *data, + unsigned int rounds); +#endif + +static asmlinkage void (*sha1_transform_asm)(u32 *, const char *, unsigned int); + + +static int sha1_ssse3_init(struct shash_desc *desc) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + *sctx = (struct sha1_state){ + .state = { SHA1_H0, SHA1_H1, SHA1_H2, SHA1_H3, SHA1_H4 }, + }; + + return 0; +} + +static int __sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len, unsigned int partial) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int done = 0; + + sctx->count += len; + + if (partial) { + done = SHA1_BLOCK_SIZE - partial; + memcpy(sctx->buffer + partial, data, done); + sha1_transform_asm(sctx->state, sctx->buffer, 1); + } + + if (len - done >= SHA1_BLOCK_SIZE) { + const unsigned int rounds = (len - done) / SHA1_BLOCK_SIZE; + + sha1_transform_asm(sctx->state, data + done, rounds); + done += rounds * SHA1_BLOCK_SIZE; + } + + memcpy(sctx->buffer, data + done, len - done); + + return 0; +} + +static int sha1_ssse3_update(struct shash_desc *desc, const u8 *data, + unsigned int len) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int partial = sctx->count % SHA1_BLOCK_SIZE; + int res; + + /* Handle the fast case right here */ + if (partial + len < SHA1_BLOCK_SIZE) { + sctx->count += len; + memcpy(sctx->buffer + partial, data, len); + + return 0; + } + + if (!irq_fpu_usable()) { + res = crypto_sha1_update(desc, data, len); + } else { + kernel_fpu_begin(); + res = __sha1_ssse3_update(desc, data, len, partial); + kernel_fpu_end(); + } + + return res; +} + + +/* Add padding and return the message digest. */ +static int sha1_ssse3_final(struct shash_desc *desc, u8 *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + unsigned int i, index, padlen; + __be32 *dst = (__be32 *)out; + __be64 bits; + static const u8 padding[SHA1_BLOCK_SIZE] = { 0x80, }; + + bits = cpu_to_be64(sctx->count << 3); + + /* Pad out to 56 mod 64 and append length */ + index = sctx->count % SHA1_BLOCK_SIZE; + padlen = (index < 56) ? (56 - index) : ((SHA1_BLOCK_SIZE+56) - index); + if (!irq_fpu_usable()) { + crypto_sha1_update(desc, padding, padlen); + crypto_sha1_update(desc, (const u8 *)&bits, sizeof(bits)); + } else { + kernel_fpu_begin(); + /* We need to fill a whole block for __sha1_ssse3_update() */ + if (padlen <= 56) { + sctx->count += padlen; + memcpy(sctx->buffer + index, padding, padlen); + } else { + __sha1_ssse3_update(desc, padding, padlen, index); + } + __sha1_ssse3_update(desc, (const u8 *)&bits, sizeof(bits), 56); + kernel_fpu_end(); + } + + /* Store state in digest */ + for (i = 0; i < 5; i++) + dst[i] = cpu_to_be32(sctx->state[i]); + + /* Wipe context */ + memset(sctx, 0, sizeof(*sctx)); + + return 0; +} + +static int sha1_ssse3_export(struct shash_desc *desc, void *out) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(out, sctx, sizeof(*sctx)); + + return 0; +} + +static int sha1_ssse3_import(struct shash_desc *desc, const void *in) +{ + struct sha1_state *sctx = shash_desc_ctx(desc); + + memcpy(sctx, in, sizeof(*sctx)); + + return 0; +} + +static struct shash_alg alg = { + .digestsize = SHA1_DIGEST_SIZE, + .init = sha1_ssse3_init, + .update = sha1_ssse3_update, + .final = sha1_ssse3_final, + .export = sha1_ssse3_export, + .import = sha1_ssse3_import, + .descsize = sizeof(struct sha1_state), + .statesize = sizeof(struct sha1_state), + .base = { + .cra_name = "sha1", + .cra_driver_name= "sha1-ssse3", + .cra_priority = 150, + .cra_flags = CRYPTO_ALG_TYPE_SHASH, + .cra_blocksize = SHA1_BLOCK_SIZE, + .cra_module = THIS_MODULE, + } +}; + +#ifdef SHA1_ENABLE_AVX_SUPPORT +static bool __init avx_usable(void) +{ + u64 xcr0; + + if (!cpu_has_avx || !cpu_has_osxsave) + return false; + + xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); + if ((xcr0 & (XSTATE_SSE | XSTATE_YMM)) != (XSTATE_SSE | XSTATE_YMM)) { + pr_info("AVX detected but unusable.\n"); + + return false; + } + + return true; +} +#endif + +static int __init sha1_ssse3_mod_init(void) +{ + /* test for SSSE3 first */ + if (cpu_has_ssse3) + sha1_transform_asm = sha1_transform_ssse3; + +#ifdef SHA1_ENABLE_AVX_SUPPORT + /* allow AVX to override SSSE3, it's a little faster */ + if (avx_usable()) + sha1_transform_asm = sha1_transform_avx; +#endif + + if (sha1_transform_asm) { + pr_info("Using %s optimized SHA-1 implementation\n", + sha1_transform_asm == sha1_transform_ssse3 ? "SSSE3" + : "AVX"); + return crypto_register_shash(&alg); + } + pr_info("Neither AVX nor SSSE3 is available/usable.\n"); + + return -ENODEV; +} + +static void __exit sha1_ssse3_mod_fini(void) +{ + crypto_unregister_shash(&alg); +} + +module_init(sha1_ssse3_mod_init); +module_exit(sha1_ssse3_mod_fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("SHA1 Secure Hash Algorithm, Supplemental SSE3 accelerated"); + +MODULE_ALIAS("sha1"); diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S index 575331cb2a8a..658af4bb35c9 100644 --- a/arch/x86/crypto/twofish-i586-asm_32.S +++ b/arch/x86/crypto/twofish-i586-asm_32.S @@ -26,7 +26,7 @@ #define in_blk 12 /* input byte array address parameter*/ #define out_blk 8 /* output byte array address parameter*/ -#define tfm 4 /* Twofish context structure */ +#define ctx 4 /* Twofish context structure */ #define a_offset 0 #define b_offset 4 @@ -229,8 +229,8 @@ twofish_enc_blk: push %esi push %edi - mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base + * pointer to the ctx address */ mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax @@ -285,8 +285,8 @@ twofish_dec_blk: push %edi - mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ - add $crypto_tfm_ctx_offset, %ebp /* ctx address */ + mov ctx + 16(%esp), %ebp /* abuse the base pointer: set new base + * pointer to the ctx address */ mov in_blk+16(%esp),%edi /* input address in edi */ mov (%edi), %eax diff --git a/arch/x86/crypto/twofish-x86_64-asm_64-3way.S b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S new file mode 100644 index 000000000000..5b012a2c5119 --- /dev/null +++ b/arch/x86/crypto/twofish-x86_64-asm_64-3way.S @@ -0,0 +1,316 @@ +/* + * Twofish Cipher 3-way parallel algorithm (x86_64) + * + * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +.file "twofish-x86_64-asm-3way.S" +.text + +/* structure of crypto context */ +#define s0 0 +#define s1 1024 +#define s2 2048 +#define s3 3072 +#define w 4096 +#define k 4128 + +/********************************************************************** + 3-way twofish + **********************************************************************/ +#define CTX %rdi +#define RIO %rdx + +#define RAB0 %rax +#define RAB1 %rbx +#define RAB2 %rcx + +#define RAB0d %eax +#define RAB1d %ebx +#define RAB2d %ecx + +#define RAB0bh %ah +#define RAB1bh %bh +#define RAB2bh %ch + +#define RAB0bl %al +#define RAB1bl %bl +#define RAB2bl %cl + +#define RCD0 %r8 +#define RCD1 %r9 +#define RCD2 %r10 + +#define RCD0d %r8d +#define RCD1d %r9d +#define RCD2d %r10d + +#define RX0 %rbp +#define RX1 %r11 +#define RX2 %r12 + +#define RX0d %ebp +#define RX1d %r11d +#define RX2d %r12d + +#define RY0 %r13 +#define RY1 %r14 +#define RY2 %r15 + +#define RY0d %r13d +#define RY1d %r14d +#define RY2d %r15d + +#define RT0 %rdx +#define RT1 %rsi + +#define RT0d %edx +#define RT1d %esi + +#define do16bit_ror(rot, op1, op2, T0, T1, tmp1, tmp2, ab, dst) \ + movzbl ab ## bl, tmp2 ## d; \ + movzbl ab ## bh, tmp1 ## d; \ + rorq $(rot), ab; \ + op1##l T0(CTX, tmp2, 4), dst ## d; \ + op2##l T1(CTX, tmp1, 4), dst ## d; + +/* + * Combined G1 & G2 function. Reordered with help of rotates to have moves + * at begining. + */ +#define g1g2_3(ab, cd, Tx0, Tx1, Tx2, Tx3, Ty0, Ty1, Ty2, Ty3, x, y) \ + /* G1,1 && G2,1 */ \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 0, ab ## 0, x ## 0); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 0, ab ## 0, y ## 0); \ + \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 1, ab ## 1, x ## 1); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 1, ab ## 1, y ## 1); \ + \ + do16bit_ror(32, mov, xor, Tx0, Tx1, RT0, x ## 2, ab ## 2, x ## 2); \ + do16bit_ror(48, mov, xor, Ty1, Ty2, RT0, y ## 2, ab ## 2, y ## 2); \ + \ + /* G1,2 && G2,2 */ \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 0, x ## 0); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 0, y ## 0); \ + xchgq cd ## 0, ab ## 0; \ + \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 1, x ## 1); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 1, y ## 1); \ + xchgq cd ## 1, ab ## 1; \ + \ + do16bit_ror(32, xor, xor, Tx2, Tx3, RT0, RT1, ab ## 2, x ## 2); \ + do16bit_ror(16, xor, xor, Ty3, Ty0, RT0, RT1, ab ## 2, y ## 2); \ + xchgq cd ## 2, ab ## 2; + +#define enc_round_end(ab, x, y, n) \ + addl y ## d, x ## d; \ + addl x ## d, y ## d; \ + addl k+4*(2*(n))(CTX), x ## d; \ + xorl ab ## d, x ## d; \ + addl k+4*(2*(n)+1)(CTX), y ## d; \ + shrq $32, ab; \ + roll $1, ab ## d; \ + xorl y ## d, ab ## d; \ + shlq $32, ab; \ + rorl $1, x ## d; \ + orq x, ab; + +#define dec_round_end(ba, x, y, n) \ + addl y ## d, x ## d; \ + addl x ## d, y ## d; \ + addl k+4*(2*(n))(CTX), x ## d; \ + addl k+4*(2*(n)+1)(CTX), y ## d; \ + xorl ba ## d, y ## d; \ + shrq $32, ba; \ + roll $1, ba ## d; \ + xorl x ## d, ba ## d; \ + shlq $32, ba; \ + rorl $1, y ## d; \ + orq y, ba; + +#define encrypt_round3(ab, cd, n) \ + g1g2_3(ab, cd, s0, s1, s2, s3, s0, s1, s2, s3, RX, RY); \ + \ + enc_round_end(ab ## 0, RX0, RY0, n); \ + enc_round_end(ab ## 1, RX1, RY1, n); \ + enc_round_end(ab ## 2, RX2, RY2, n); + +#define decrypt_round3(ba, dc, n) \ + g1g2_3(ba, dc, s1, s2, s3, s0, s3, s0, s1, s2, RY, RX); \ + \ + dec_round_end(ba ## 0, RX0, RY0, n); \ + dec_round_end(ba ## 1, RX1, RY1, n); \ + dec_round_end(ba ## 2, RX2, RY2, n); + +#define encrypt_cycle3(ab, cd, n) \ + encrypt_round3(ab, cd, n*2); \ + encrypt_round3(ab, cd, (n*2)+1); + +#define decrypt_cycle3(ba, dc, n) \ + decrypt_round3(ba, dc, (n*2)+1); \ + decrypt_round3(ba, dc, (n*2)); + +#define inpack3(in, n, xy, m) \ + movq 4*(n)(in), xy ## 0; \ + xorq w+4*m(CTX), xy ## 0; \ + \ + movq 4*(4+(n))(in), xy ## 1; \ + xorq w+4*m(CTX), xy ## 1; \ + \ + movq 4*(8+(n))(in), xy ## 2; \ + xorq w+4*m(CTX), xy ## 2; + +#define outunpack3(op, out, n, xy, m) \ + xorq w+4*m(CTX), xy ## 0; \ + op ## q xy ## 0, 4*(n)(out); \ + \ + xorq w+4*m(CTX), xy ## 1; \ + op ## q xy ## 1, 4*(4+(n))(out); \ + \ + xorq w+4*m(CTX), xy ## 2; \ + op ## q xy ## 2, 4*(8+(n))(out); + +#define inpack_enc3() \ + inpack3(RIO, 0, RAB, 0); \ + inpack3(RIO, 2, RCD, 2); + +#define outunpack_enc3(op) \ + outunpack3(op, RIO, 2, RAB, 6); \ + outunpack3(op, RIO, 0, RCD, 4); + +#define inpack_dec3() \ + inpack3(RIO, 0, RAB, 4); \ + rorq $32, RAB0; \ + rorq $32, RAB1; \ + rorq $32, RAB2; \ + inpack3(RIO, 2, RCD, 6); \ + rorq $32, RCD0; \ + rorq $32, RCD1; \ + rorq $32, RCD2; + +#define outunpack_dec3() \ + rorq $32, RCD0; \ + rorq $32, RCD1; \ + rorq $32, RCD2; \ + outunpack3(mov, RIO, 0, RCD, 0); \ + rorq $32, RAB0; \ + rorq $32, RAB1; \ + rorq $32, RAB2; \ + outunpack3(mov, RIO, 2, RAB, 2); + +.align 8 +.global __twofish_enc_blk_3way +.type __twofish_enc_blk_3way,@function; + +__twofish_enc_blk_3way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src, RIO + * %rcx: bool, if true: xor output + */ + pushq %r15; + pushq %r14; + pushq %r13; + pushq %r12; + pushq %rbp; + pushq %rbx; + + pushq %rcx; /* bool xor */ + pushq %rsi; /* dst */ + + inpack_enc3(); + + encrypt_cycle3(RAB, RCD, 0); + encrypt_cycle3(RAB, RCD, 1); + encrypt_cycle3(RAB, RCD, 2); + encrypt_cycle3(RAB, RCD, 3); + encrypt_cycle3(RAB, RCD, 4); + encrypt_cycle3(RAB, RCD, 5); + encrypt_cycle3(RAB, RCD, 6); + encrypt_cycle3(RAB, RCD, 7); + + popq RIO; /* dst */ + popq %rbp; /* bool xor */ + + testb %bpl, %bpl; + jnz __enc_xor3; + + outunpack_enc3(mov); + + popq %rbx; + popq %rbp; + popq %r12; + popq %r13; + popq %r14; + popq %r15; + ret; + +__enc_xor3: + outunpack_enc3(xor); + + popq %rbx; + popq %rbp; + popq %r12; + popq %r13; + popq %r14; + popq %r15; + ret; + +.global twofish_dec_blk_3way +.type twofish_dec_blk_3way,@function; + +twofish_dec_blk_3way: + /* input: + * %rdi: ctx, CTX + * %rsi: dst + * %rdx: src, RIO + */ + pushq %r15; + pushq %r14; + pushq %r13; + pushq %r12; + pushq %rbp; + pushq %rbx; + + pushq %rsi; /* dst */ + + inpack_dec3(); + + decrypt_cycle3(RAB, RCD, 7); + decrypt_cycle3(RAB, RCD, 6); + decrypt_cycle3(RAB, RCD, 5); + decrypt_cycle3(RAB, RCD, 4); + decrypt_cycle3(RAB, RCD, 3); + decrypt_cycle3(RAB, RCD, 2); + decrypt_cycle3(RAB, RCD, 1); + decrypt_cycle3(RAB, RCD, 0); + + popq RIO; /* dst */ + + outunpack_dec3(); + + popq %rbx; + popq %rbp; + popq %r12; + popq %r13; + popq %r14; + popq %r15; + ret; + diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S index 573aa102542e..7bcf3fcc3668 100644 --- a/arch/x86/crypto/twofish-x86_64-asm_64.S +++ b/arch/x86/crypto/twofish-x86_64-asm_64.S @@ -221,10 +221,9 @@ twofish_enc_blk: pushq R1 - /* %rdi contains the crypto tfm address */ + /* %rdi contains the ctx address */ /* %rsi contains the output address */ /* %rdx contains the input address */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 @@ -274,10 +273,9 @@ twofish_enc_blk: twofish_dec_blk: pushq R1 - /* %rdi contains the crypto tfm address */ + /* %rdi contains the ctx address */ /* %rsi contains the output address */ /* %rdx contains the input address */ - add $crypto_tfm_ctx_offset, %rdi /* set ctx address */ /* ctx address is moved to free one non-rex register as target for the 8bit high operations */ mov %rdi, %r11 diff --git a/arch/x86/crypto/twofish_glue.c b/arch/x86/crypto/twofish_glue.c index cefaf8b9aa18..dc6b3fb817fc 100644 --- a/arch/x86/crypto/twofish_glue.c +++ b/arch/x86/crypto/twofish_glue.c @@ -44,17 +44,21 @@ #include <linux/module.h> #include <linux/types.h> -asmlinkage void twofish_enc_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); -asmlinkage void twofish_dec_blk(struct crypto_tfm *tfm, u8 *dst, const u8 *src); +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(twofish_enc_blk); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +EXPORT_SYMBOL_GPL(twofish_dec_blk); static void twofish_encrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - twofish_enc_blk(tfm, dst, src); + twofish_enc_blk(crypto_tfm_ctx(tfm), dst, src); } static void twofish_decrypt(struct crypto_tfm *tfm, u8 *dst, const u8 *src) { - twofish_dec_blk(tfm, dst, src); + twofish_dec_blk(crypto_tfm_ctx(tfm), dst, src); } static struct crypto_alg alg = { diff --git a/arch/x86/crypto/twofish_glue_3way.c b/arch/x86/crypto/twofish_glue_3way.c new file mode 100644 index 000000000000..5ede9c444c3e --- /dev/null +++ b/arch/x86/crypto/twofish_glue_3way.c @@ -0,0 +1,472 @@ +/* + * Glue Code for 3-way parallel assembler optimized version of Twofish + * + * Copyright (c) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi> + * + * CBC & ECB parts based on code (crypto/cbc.c,ecb.c) by: + * Copyright (c) 2006 Herbert Xu <herbert@gondor.apana.org.au> + * CTR part based on code (crypto/ctr.c) by: + * (C) Copyright IBM Corp. 2007 - Joy Latten <latten@us.ibm.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 + * USA + * + */ + +#include <linux/crypto.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/types.h> +#include <crypto/algapi.h> +#include <crypto/twofish.h> +#include <crypto/b128ops.h> + +/* regular block cipher functions from twofish_x86_64 module */ +asmlinkage void twofish_enc_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); +asmlinkage void twofish_dec_blk(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +/* 3-way parallel cipher functions */ +asmlinkage void __twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src, bool xor); +asmlinkage void twofish_dec_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src); + +static inline void twofish_enc_blk_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_3way(ctx, dst, src, false); +} + +static inline void twofish_enc_blk_xor_3way(struct twofish_ctx *ctx, u8 *dst, + const u8 *src) +{ + __twofish_enc_blk_3way(ctx, dst, src, true); +} + +static int ecb_crypt(struct blkcipher_desc *desc, struct blkcipher_walk *walk, + void (*fn)(struct twofish_ctx *, u8 *, const u8 *), + void (*fn_3way)(struct twofish_ctx *, u8 *, const u8 *)) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes; + int err; + + err = blkcipher_walk_virt(desc, walk); + + while ((nbytes = walk->nbytes)) { + u8 *wsrc = walk->src.virt.addr; + u8 *wdst = walk->dst.virt.addr; + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + fn_3way(ctx, wdst, wsrc); + + wsrc += bsize * 3; + wdst += bsize * 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + fn(ctx, wdst, wsrc); + + wsrc += bsize; + wdst += bsize; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + err = blkcipher_walk_done(desc, walk, nbytes); + } + + return err; +} + +static int ecb_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, twofish_enc_blk, twofish_enc_blk_3way); +} + +static int ecb_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + + blkcipher_walk_init(&walk, dst, src, nbytes); + return ecb_crypt(desc, &walk, twofish_dec_blk, twofish_dec_blk_3way); +} + +static struct crypto_alg blk_ecb_alg = { + .cra_name = "ecb(twofish)", + .cra_driver_name = "ecb-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ecb_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .setkey = twofish_setkey, + .encrypt = ecb_encrypt, + .decrypt = ecb_decrypt, + }, + }, +}; + +static unsigned int __cbc_encrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 *iv = (u128 *)walk->iv; + + do { + u128_xor(dst, src, iv); + twofish_enc_blk(ctx, (u8 *)dst, (u8 *)dst); + iv = dst; + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + + u128_xor((u128 *)walk->iv, (u128 *)walk->iv, iv); + return nbytes; +} + +static int cbc_encrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_encrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static unsigned int __cbc_decrypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ivs[3 - 1]; + u128 last_iv; + + /* Start of the last block. */ + src += nbytes / bsize - 1; + dst += nbytes / bsize - 1; + + last_iv = *src; + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + nbytes -= bsize * (3 - 1); + src -= 3 - 1; + dst -= 3 - 1; + + ivs[0] = src[0]; + ivs[1] = src[1]; + + twofish_dec_blk_3way(ctx, (u8 *)dst, (u8 *)src); + + u128_xor(dst + 1, dst + 1, ivs + 0); + u128_xor(dst + 2, dst + 2, ivs + 1); + + nbytes -= bsize; + if (nbytes < bsize) + goto done; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + for (;;) { + twofish_dec_blk(ctx, (u8 *)dst, (u8 *)src); + + nbytes -= bsize; + if (nbytes < bsize) + break; + + u128_xor(dst, dst, src - 1); + src -= 1; + dst -= 1; + } + +done: + u128_xor(dst, dst, (u128 *)walk->iv); + *(u128 *)walk->iv = last_iv; + + return nbytes; +} + +static int cbc_decrypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt(desc, &walk); + + while ((nbytes = walk.nbytes)) { + nbytes = __cbc_decrypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + return err; +} + +static struct crypto_alg blk_cbc_alg = { + .cra_name = "cbc(twofish)", + .cra_driver_name = "cbc-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = TF_BLOCK_SIZE, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_cbc_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = cbc_encrypt, + .decrypt = cbc_decrypt, + }, + }, +}; + +static inline void u128_to_be128(be128 *dst, const u128 *src) +{ + dst->a = cpu_to_be64(src->a); + dst->b = cpu_to_be64(src->b); +} + +static inline void be128_to_u128(u128 *dst, const be128 *src) +{ + dst->a = be64_to_cpu(src->a); + dst->b = be64_to_cpu(src->b); +} + +static inline void u128_inc(u128 *i) +{ + i->b++; + if (!i->b) + i->a++; +} + +static void ctr_crypt_final(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + u8 *ctrblk = walk->iv; + u8 keystream[TF_BLOCK_SIZE]; + u8 *src = walk->src.virt.addr; + u8 *dst = walk->dst.virt.addr; + unsigned int nbytes = walk->nbytes; + + twofish_enc_blk(ctx, keystream, ctrblk); + crypto_xor(keystream, src, nbytes); + memcpy(dst, keystream, nbytes); + + crypto_inc(ctrblk, TF_BLOCK_SIZE); +} + +static unsigned int __ctr_crypt(struct blkcipher_desc *desc, + struct blkcipher_walk *walk) +{ + struct twofish_ctx *ctx = crypto_blkcipher_ctx(desc->tfm); + unsigned int bsize = TF_BLOCK_SIZE; + unsigned int nbytes = walk->nbytes; + u128 *src = (u128 *)walk->src.virt.addr; + u128 *dst = (u128 *)walk->dst.virt.addr; + u128 ctrblk; + be128 ctrblocks[3]; + + be128_to_u128(&ctrblk, (be128 *)walk->iv); + + /* Process three block batch */ + if (nbytes >= bsize * 3) { + do { + if (dst != src) { + dst[0] = src[0]; + dst[1] = src[1]; + dst[2] = src[2]; + } + + /* create ctrblks for parallel encrypt */ + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + u128_to_be128(&ctrblocks[1], &ctrblk); + u128_inc(&ctrblk); + u128_to_be128(&ctrblocks[2], &ctrblk); + u128_inc(&ctrblk); + + twofish_enc_blk_xor_3way(ctx, (u8 *)dst, + (u8 *)ctrblocks); + + src += 3; + dst += 3; + nbytes -= bsize * 3; + } while (nbytes >= bsize * 3); + + if (nbytes < bsize) + goto done; + } + + /* Handle leftovers */ + do { + if (dst != src) + *dst = *src; + + u128_to_be128(&ctrblocks[0], &ctrblk); + u128_inc(&ctrblk); + + twofish_enc_blk(ctx, (u8 *)ctrblocks, (u8 *)ctrblocks); + u128_xor(dst, dst, (u128 *)ctrblocks); + + src += 1; + dst += 1; + nbytes -= bsize; + } while (nbytes >= bsize); + +done: + u128_to_be128((be128 *)walk->iv, &ctrblk); + return nbytes; +} + +static int ctr_crypt(struct blkcipher_desc *desc, struct scatterlist *dst, + struct scatterlist *src, unsigned int nbytes) +{ + struct blkcipher_walk walk; + int err; + + blkcipher_walk_init(&walk, dst, src, nbytes); + err = blkcipher_walk_virt_block(desc, &walk, TF_BLOCK_SIZE); + + while ((nbytes = walk.nbytes) >= TF_BLOCK_SIZE) { + nbytes = __ctr_crypt(desc, &walk); + err = blkcipher_walk_done(desc, &walk, nbytes); + } + + if (walk.nbytes) { + ctr_crypt_final(desc, &walk); + err = blkcipher_walk_done(desc, &walk, 0); + } + + return err; +} + +static struct crypto_alg blk_ctr_alg = { + .cra_name = "ctr(twofish)", + .cra_driver_name = "ctr-twofish-3way", + .cra_priority = 300, + .cra_flags = CRYPTO_ALG_TYPE_BLKCIPHER, + .cra_blocksize = 1, + .cra_ctxsize = sizeof(struct twofish_ctx), + .cra_alignmask = 0, + .cra_type = &crypto_blkcipher_type, + .cra_module = THIS_MODULE, + .cra_list = LIST_HEAD_INIT(blk_ctr_alg.cra_list), + .cra_u = { + .blkcipher = { + .min_keysize = TF_MIN_KEY_SIZE, + .max_keysize = TF_MAX_KEY_SIZE, + .ivsize = TF_BLOCK_SIZE, + .setkey = twofish_setkey, + .encrypt = ctr_crypt, + .decrypt = ctr_crypt, + }, + }, +}; + +int __init init(void) +{ + int err; + + err = crypto_register_alg(&blk_ecb_alg); + if (err) + goto ecb_err; + err = crypto_register_alg(&blk_cbc_alg); + if (err) + goto cbc_err; + err = crypto_register_alg(&blk_ctr_alg); + if (err) + goto ctr_err; + + return 0; + +ctr_err: + crypto_unregister_alg(&blk_cbc_alg); +cbc_err: + crypto_unregister_alg(&blk_ecb_alg); +ecb_err: + return err; +} + +void __exit fini(void) +{ + crypto_unregister_alg(&blk_ctr_alg); + crypto_unregister_alg(&blk_cbc_alg); + crypto_unregister_alg(&blk_ecb_alg); +} + +module_init(init); +module_exit(fini); + +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Twofish Cipher Algorithm, 3-way parallel asm optimized"); +MODULE_ALIAS("twofish"); +MODULE_ALIAS("twofish-asm"); diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 54edb207ff3a..a6253ec1b284 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -850,4 +850,6 @@ ia32_sys_call_table: .quad sys_syncfs .quad compat_sys_sendmmsg /* 345 */ .quad sys_setns + .quad compat_sys_process_vm_readv + .quad compat_sys_process_vm_writev ia32_syscall_end: diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h index 67f87f257611..8e41071704a5 100644 --- a/arch/x86/include/asm/amd_nb.h +++ b/arch/x86/include/asm/amd_nb.h @@ -19,9 +19,15 @@ extern int amd_numa_init(void); extern int amd_get_subcaches(int); extern int amd_set_subcaches(int, int); +struct amd_l3_cache { + unsigned indices; + u8 subcaches[4]; +}; + struct amd_northbridge { struct pci_dev *misc; struct pci_dev *link; + struct amd_l3_cache l3_cache; }; struct amd_northbridge_info { diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h index 34595d5e1038..3925d8007864 100644 --- a/arch/x86/include/asm/apicdef.h +++ b/arch/x86/include/asm/apicdef.h @@ -100,7 +100,9 @@ #define APIC_TIMER_BASE_CLKIN 0x0 #define APIC_TIMER_BASE_TMBASE 0x1 #define APIC_TIMER_BASE_DIV 0x2 +#define APIC_LVT_TIMER_ONESHOT (0 << 17) #define APIC_LVT_TIMER_PERIODIC (1 << 17) +#define APIC_LVT_TIMER_TSCDEADLINE (2 << 17) #define APIC_LVT_MASKED (1 << 16) #define APIC_LVT_LEVEL_TRIGGER (1 << 15) #define APIC_LVT_REMOTE_IRR (1 << 14) diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h new file mode 100644 index 000000000000..0d9ec770f2f8 --- /dev/null +++ b/arch/x86/include/asm/archrandom.h @@ -0,0 +1,75 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu <fenghua.yu@intel.com>, + * H. Peter Anvin <hpa@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#ifndef ASM_X86_ARCHRANDOM_H +#define ASM_X86_ARCHRANDOM_H + +#include <asm/processor.h> +#include <asm/cpufeature.h> +#include <asm/alternative.h> +#include <asm/nops.h> + +#define RDRAND_RETRY_LOOPS 10 + +#define RDRAND_INT ".byte 0x0f,0xc7,0xf0" +#ifdef CONFIG_X86_64 +# define RDRAND_LONG ".byte 0x48,0x0f,0xc7,0xf0" +#else +# define RDRAND_LONG RDRAND_INT +#endif + +#ifdef CONFIG_ARCH_RANDOM + +#define GET_RANDOM(name, type, rdrand, nop) \ +static inline int name(type *v) \ +{ \ + int ok; \ + alternative_io("movl $0, %0\n\t" \ + nop, \ + "\n1: " rdrand "\n\t" \ + "jc 2f\n\t" \ + "decl %0\n\t" \ + "jnz 1b\n\t" \ + "2:", \ + X86_FEATURE_RDRAND, \ + ASM_OUTPUT2("=r" (ok), "=a" (*v)), \ + "0" (RDRAND_RETRY_LOOPS)); \ + return ok; \ +} + +#ifdef CONFIG_X86_64 + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP5); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP4); + +#else + +GET_RANDOM(arch_get_random_long, unsigned long, RDRAND_LONG, ASM_NOP3); +GET_RANDOM(arch_get_random_int, unsigned int, RDRAND_INT, ASM_NOP3); + +#endif /* CONFIG_X86_64 */ + +#endif /* CONFIG_ARCH_RANDOM */ + +extern void x86_init_rdrand(struct cpuinfo_x86 *c); + +#endif /* ASM_X86_ARCHRANDOM_H */ diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index 10572e309ab2..58cb6d4085f7 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -172,18 +172,14 @@ static inline int atomic_add_negative(int i, atomic_t *v) */ static inline int atomic_add_return(int i, atomic_t *v) { - int __i; #ifdef CONFIG_M386 + int __i; unsigned long flags; if (unlikely(boot_cpu_data.x86 <= 3)) goto no_xadd; #endif /* Modern 486+ processor */ - __i = i; - asm volatile(LOCK_PREFIX "xaddl %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; + return i + xadd(&v->counter, i); #ifdef CONFIG_M386 no_xadd: /* Legacy 386 processor */ diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 017594d403f6..0e1cbfc8ee06 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -170,11 +170,7 @@ static inline int atomic64_add_negative(long i, atomic64_t *v) */ static inline long atomic64_add_return(long i, atomic64_t *v) { - long __i = i; - asm volatile(LOCK_PREFIX "xaddq %0, %1;" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; + return i + xadd(&v->counter, i); } static inline long atomic64_sub_return(long i, atomic64_t *v) diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index a460fa088d4c..5d3acdf5a7a6 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -1,5 +1,210 @@ +#ifndef ASM_X86_CMPXCHG_H +#define ASM_X86_CMPXCHG_H + +#include <linux/compiler.h> +#include <asm/alternative.h> /* Provides LOCK_PREFIX */ + +/* + * Non-existant functions to indicate usage errors at link time + * (or compile-time if the compiler implements __compiletime_error(). + */ +extern void __xchg_wrong_size(void) + __compiletime_error("Bad argument size for xchg"); +extern void __cmpxchg_wrong_size(void) + __compiletime_error("Bad argument size for cmpxchg"); +extern void __xadd_wrong_size(void) + __compiletime_error("Bad argument size for xadd"); + +/* + * Constants for operation sizes. On 32-bit, the 64-bit size it set to + * -1 because sizeof will never return -1, thereby making those switch + * case statements guaranteeed dead code which the compiler will + * eliminate, and allowing the "missing symbol in the default case" to + * indicate a usage error. + */ +#define __X86_CASE_B 1 +#define __X86_CASE_W 2 +#define __X86_CASE_L 4 +#ifdef CONFIG_64BIT +#define __X86_CASE_Q 8 +#else +#define __X86_CASE_Q -1 /* sizeof will never return -1 */ +#endif + +/* + * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. + * Since this is generally used to protect other memory information, we + * use "asm volatile" and "memory" clobbers to prevent gcc from moving + * information around. + */ +#define __xchg(x, ptr, size) \ +({ \ + __typeof(*(ptr)) __x = (x); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile("xchgb %0,%1" \ + : "=q" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile("xchgw %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile("xchgl %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile("xchgq %0,%1" \ + : "=r" (__x), "+m" (*__ptr) \ + : "0" (__x) \ + : "memory"); \ + break; \ + } \ + default: \ + __xchg_wrong_size(); \ + } \ + __x; \ +}) + +#define xchg(ptr, v) \ + __xchg((v), (ptr), sizeof(*ptr)) + +/* + * Atomic compare and exchange. Compare OLD with MEM, if identical, + * store NEW in MEM. Return the initial value in MEM. Success is + * indicated by comparing RETURN with OLD. + */ +#define __raw_cmpxchg(ptr, old, new, size, lock) \ +({ \ + __typeof__(*(ptr)) __ret; \ + __typeof__(*(ptr)) __old = (old); \ + __typeof__(*(ptr)) __new = (new); \ + switch (size) { \ + case __X86_CASE_B: \ + { \ + volatile u8 *__ptr = (volatile u8 *)(ptr); \ + asm volatile(lock "cmpxchgb %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "q" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_W: \ + { \ + volatile u16 *__ptr = (volatile u16 *)(ptr); \ + asm volatile(lock "cmpxchgw %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_L: \ + { \ + volatile u32 *__ptr = (volatile u32 *)(ptr); \ + asm volatile(lock "cmpxchgl %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + case __X86_CASE_Q: \ + { \ + volatile u64 *__ptr = (volatile u64 *)(ptr); \ + asm volatile(lock "cmpxchgq %2,%1" \ + : "=a" (__ret), "+m" (*__ptr) \ + : "r" (__new), "0" (__old) \ + : "memory"); \ + break; \ + } \ + default: \ + __cmpxchg_wrong_size(); \ + } \ + __ret; \ +}) + +#define __cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) + +#define __sync_cmpxchg(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") + +#define __cmpxchg_local(ptr, old, new, size) \ + __raw_cmpxchg((ptr), (old), (new), (size), "") + #ifdef CONFIG_X86_32 # include "cmpxchg_32.h" #else # include "cmpxchg_64.h" #endif + +#ifdef __HAVE_ARCH_CMPXCHG +#define cmpxchg(ptr, old, new) \ + __cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define sync_cmpxchg(ptr, old, new) \ + __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) + +#define cmpxchg_local(ptr, old, new) \ + __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) +#endif + +#define __xadd(ptr, inc, lock) \ + ({ \ + __typeof__ (*(ptr)) __ret = (inc); \ + switch (sizeof(*(ptr))) { \ + case __X86_CASE_B: \ + asm volatile (lock "xaddb %b0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_W: \ + asm volatile (lock "xaddw %w0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_L: \ + asm volatile (lock "xaddl %0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + case __X86_CASE_Q: \ + asm volatile (lock "xaddq %q0, %1\n" \ + : "+r" (__ret), "+m" (*(ptr)) \ + : : "memory", "cc"); \ + break; \ + default: \ + __xadd_wrong_size(); \ + } \ + __ret; \ + }) + +/* + * xadd() adds "inc" to "*ptr" and atomically returns the previous + * value of "*ptr". + * + * xadd() is locked when multiple CPUs are online + * xadd_sync() is always locked + * xadd_local() is never locked + */ +#define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) +#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") +#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") + +#endif /* ASM_X86_CMPXCHG_H */ diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h index 3deb7250624c..fbebb07dd80b 100644 --- a/arch/x86/include/asm/cmpxchg_32.h +++ b/arch/x86/include/asm/cmpxchg_32.h @@ -1,61 +1,11 @@ #ifndef _ASM_X86_CMPXCHG_32_H #define _ASM_X86_CMPXCHG_32_H -#include <linux/bitops.h> /* for LOCK_PREFIX */ - /* * Note: if you use set64_bit(), __cmpxchg64(), or their variants, you * you need to test for the feature in boot_cpu_data. */ -extern void __xchg_wrong_size(void); - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. - * Since this is generally used to protect other memory information, we - * use "asm volatile" and "memory" clobbers to prevent gcc from moving - * information around. - */ -#define __xchg(x, ptr, size) \ -({ \ - __typeof(*(ptr)) __x = (x); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile("xchgb %0,%1" \ - : "=q" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile("xchgw %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - default: \ - __xchg_wrong_size(); \ - } \ - __x; \ -}) - -#define xchg(ptr, v) \ - __xchg((v), (ptr), sizeof(*ptr)) - /* * CMPXCHG8B only writes to the target if we had the previous * value in registers, otherwise it acts as a read and gives us the @@ -84,72 +34,8 @@ static inline void set_64bit(volatile u64 *ptr, u64 value) : "memory"); } -extern void __cmpxchg_wrong_size(void); - -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - */ -#define __raw_cmpxchg(ptr, old, new, size, lock) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (old); \ - __typeof__(*(ptr)) __new = (new); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile(lock "cmpxchgb %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "q" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile(lock "cmpxchgw %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile(lock "cmpxchgl %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - default: \ - __cmpxchg_wrong_size(); \ - } \ - __ret; \ -}) - -#define __cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) - -#define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") - -#define __cmpxchg_local(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "") - #ifdef CONFIG_X86_CMPXCHG #define __HAVE_ARCH_CMPXCHG 1 - -#define cmpxchg(ptr, old, new) \ - __cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define sync_cmpxchg(ptr, old, new) \ - __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define cmpxchg_local(ptr, old, new) \ - __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) #endif #ifdef CONFIG_X86_CMPXCHG64 diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h index 7cf5c0a24434..285da02c38fa 100644 --- a/arch/x86/include/asm/cmpxchg_64.h +++ b/arch/x86/include/asm/cmpxchg_64.h @@ -1,144 +1,13 @@ #ifndef _ASM_X86_CMPXCHG_64_H #define _ASM_X86_CMPXCHG_64_H -#include <asm/alternative.h> /* Provides LOCK_PREFIX */ - static inline void set_64bit(volatile u64 *ptr, u64 val) { *ptr = val; } -extern void __xchg_wrong_size(void); -extern void __cmpxchg_wrong_size(void); - -/* - * Note: no "lock" prefix even on SMP: xchg always implies lock anyway. - * Since this is generally used to protect other memory information, we - * use "asm volatile" and "memory" clobbers to prevent gcc from moving - * information around. - */ -#define __xchg(x, ptr, size) \ -({ \ - __typeof(*(ptr)) __x = (x); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile("xchgb %0,%1" \ - : "=q" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile("xchgw %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile("xchgl %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - case 8: \ - { \ - volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm volatile("xchgq %0,%1" \ - : "=r" (__x), "+m" (*__ptr) \ - : "0" (__x) \ - : "memory"); \ - break; \ - } \ - default: \ - __xchg_wrong_size(); \ - } \ - __x; \ -}) - -#define xchg(ptr, v) \ - __xchg((v), (ptr), sizeof(*ptr)) - #define __HAVE_ARCH_CMPXCHG 1 -/* - * Atomic compare and exchange. Compare OLD with MEM, if identical, - * store NEW in MEM. Return the initial value in MEM. Success is - * indicated by comparing RETURN with OLD. - */ -#define __raw_cmpxchg(ptr, old, new, size, lock) \ -({ \ - __typeof__(*(ptr)) __ret; \ - __typeof__(*(ptr)) __old = (old); \ - __typeof__(*(ptr)) __new = (new); \ - switch (size) { \ - case 1: \ - { \ - volatile u8 *__ptr = (volatile u8 *)(ptr); \ - asm volatile(lock "cmpxchgb %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "q" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 2: \ - { \ - volatile u16 *__ptr = (volatile u16 *)(ptr); \ - asm volatile(lock "cmpxchgw %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 4: \ - { \ - volatile u32 *__ptr = (volatile u32 *)(ptr); \ - asm volatile(lock "cmpxchgl %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - case 8: \ - { \ - volatile u64 *__ptr = (volatile u64 *)(ptr); \ - asm volatile(lock "cmpxchgq %2,%1" \ - : "=a" (__ret), "+m" (*__ptr) \ - : "r" (__new), "0" (__old) \ - : "memory"); \ - break; \ - } \ - default: \ - __cmpxchg_wrong_size(); \ - } \ - __ret; \ -}) - -#define __cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX) - -#define __sync_cmpxchg(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "lock; ") - -#define __cmpxchg_local(ptr, old, new, size) \ - __raw_cmpxchg((ptr), (old), (new), (size), "") - -#define cmpxchg(ptr, old, new) \ - __cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define sync_cmpxchg(ptr, old, new) \ - __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr)) - -#define cmpxchg_local(ptr, old, new) \ - __cmpxchg_local((ptr), (old), (new), sizeof(*ptr)) - #define cmpxchg64(ptr, o, n) \ ({ \ BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 1d9cd27c2920..30d737ef2a42 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -108,7 +108,8 @@ struct compat_statfs { compat_fsid_t f_fsid; int f_namelen; /* SunOS ignores this field. */ int f_frsize; - int f_spare[5]; + int f_flags; + int f_spare[4]; }; #define COMPAT_RLIM_OLD_INFINITY 0x7fffffff diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 88b23a43f340..f3444f700f36 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -114,12 +114,14 @@ #define X86_FEATURE_CX16 (4*32+13) /* CMPXCHG16B */ #define X86_FEATURE_XTPR (4*32+14) /* Send Task Priority Messages */ #define X86_FEATURE_PDCM (4*32+15) /* Performance Capabilities */ +#define X86_FEATURE_PCID (4*32+17) /* Process Context Identifiers */ #define X86_FEATURE_DCA (4*32+18) /* Direct Cache Access */ #define X86_FEATURE_XMM4_1 (4*32+19) /* "sse4_1" SSE-4.1 */ #define X86_FEATURE_XMM4_2 (4*32+20) /* "sse4_2" SSE-4.2 */ #define X86_FEATURE_X2APIC (4*32+21) /* x2APIC */ #define X86_FEATURE_MOVBE (4*32+22) /* MOVBE instruction */ #define X86_FEATURE_POPCNT (4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER (4*32+24) /* Tsc deadline timer */ #define X86_FEATURE_AES (4*32+25) /* AES instructions */ #define X86_FEATURE_XSAVE (4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ #define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ @@ -257,7 +259,9 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm boot_cpu_has(X86_FEATURE_XMM) #define cpu_has_xmm2 boot_cpu_has(X86_FEATURE_XMM2) #define cpu_has_xmm3 boot_cpu_has(X86_FEATURE_XMM3) +#define cpu_has_ssse3 boot_cpu_has(X86_FEATURE_SSSE3) #define cpu_has_aes boot_cpu_has(X86_FEATURE_AES) +#define cpu_has_avx boot_cpu_has(X86_FEATURE_AVX) #define cpu_has_ht boot_cpu_has(X86_FEATURE_HT) #define cpu_has_mp boot_cpu_has(X86_FEATURE_MP) #define cpu_has_nx boot_cpu_has(X86_FEATURE_NX) @@ -285,6 +289,7 @@ extern const char * const x86_power_flags[32]; #define cpu_has_xmm4_2 boot_cpu_has(X86_FEATURE_XMM4_2) #define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) #define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) +#define cpu_has_osxsave boot_cpu_has(X86_FEATURE_OSXSAVE) #define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) #define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) #define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index f2ad2163109d..5f962df30d0f 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -4,6 +4,7 @@ /* * ELF register definitions.. */ +#include <linux/thread_info.h> #include <asm/ptrace.h> #include <asm/user.h> @@ -320,4 +321,34 @@ extern int syscall32_setup_pages(struct linux_binprm *, int exstack); extern unsigned long arch_randomize_brk(struct mm_struct *mm); #define arch_randomize_brk arch_randomize_brk +/* + * True on X86_32 or when emulating IA32 on X86_64 + */ +static inline int mmap_is_ia32(void) +{ +#ifdef CONFIG_X86_32 + return 1; +#endif +#ifdef CONFIG_IA32_EMULATION + if (test_thread_flag(TIF_IA32)) + return 1; +#endif + return 0; +} + +/* The first two values are special, do not change. See align_addr() */ +enum align_flags { + ALIGN_VA_32 = BIT(0), + ALIGN_VA_64 = BIT(1), + ALIGN_VDSO = BIT(2), + ALIGN_TOPDOWN = BIT(3), +}; + +struct va_alignment { + int flags; + unsigned long mask; +} ____cacheline_aligned; + +extern struct va_alignment va_align; +extern unsigned long align_addr(unsigned long, struct file *, enum align_flags); #endif /* _ASM_X86_ELF_H */ diff --git a/arch/x86/include/asm/intel_scu_ipc.h b/arch/x86/include/asm/intel_scu_ipc.h index 29f66793cc55..4420993acc47 100644 --- a/arch/x86/include/asm/intel_scu_ipc.h +++ b/arch/x86/include/asm/intel_scu_ipc.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_INTEL_SCU_IPC_H_ #define _ASM_X86_INTEL_SCU_IPC_H_ +#include <linux/notifier.h> + #define IPCMSG_VRTC 0xFA /* Set vRTC device */ /* Command id associated with message IPCMSG_VRTC */ @@ -44,4 +46,24 @@ int intel_scu_ipc_i2c_cntrl(u32 addr, u32 *data); /* Update FW version */ int intel_scu_ipc_fw_update(u8 *buffer, u32 length); +extern struct blocking_notifier_head intel_scu_notifier; + +static inline void intel_scu_notifier_add(struct notifier_block *nb) +{ + blocking_notifier_chain_register(&intel_scu_notifier, nb); +} + +static inline void intel_scu_notifier_remove(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&intel_scu_notifier, nb); +} + +static inline int intel_scu_notifier_post(unsigned long v, void *p) +{ + return blocking_notifier_call_chain(&intel_scu_notifier, v, p); +} + +#define SCU_AVAILABLE 1 +#define SCU_DOWN 2 + #endif diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h index 6040d115ef51..a026507893e9 100644 --- a/arch/x86/include/asm/kvm_emulate.h +++ b/arch/x86/include/asm/kvm_emulate.h @@ -262,7 +262,7 @@ struct x86_emulate_ctxt { struct operand dst; bool has_seg_override; u8 seg_override; - unsigned int d; + u64 d; int (*execute)(struct x86_emulate_ctxt *ctxt); int (*check_perm)(struct x86_emulate_ctxt *ctxt); /* modrm */ @@ -275,6 +275,8 @@ struct x86_emulate_ctxt { unsigned long _eip; /* Fields above regs are cleared together. */ unsigned long regs[NR_VCPU_REGS]; + struct operand memop; + struct operand *memopp; struct fetch_cache fetch; struct read_cache io_read; struct read_cache mem_read; diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index dd51c83aa5de..b4973f4dab98 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -26,7 +26,8 @@ #include <asm/mtrr.h> #include <asm/msr-index.h> -#define KVM_MAX_VCPUS 64 +#define KVM_MAX_VCPUS 254 +#define KVM_SOFT_MAX_VCPUS 64 #define KVM_MEMORY_SLOTS 32 /* memory slots that does not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 4 @@ -264,6 +265,7 @@ struct kvm_mmu { void (*new_cr3)(struct kvm_vcpu *vcpu); void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root); unsigned long (*get_cr3)(struct kvm_vcpu *vcpu); + u64 (*get_pdptr)(struct kvm_vcpu *vcpu, int index); int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err, bool prefault); void (*inject_page_fault)(struct kvm_vcpu *vcpu, @@ -411,8 +413,9 @@ struct kvm_vcpu_arch { u32 tsc_catchup_mult; s8 tsc_catchup_shift; - bool nmi_pending; - bool nmi_injected; + atomic_t nmi_queued; /* unprocessed asynchronous NMIs */ + unsigned nmi_pending; /* NMI queued after currently running handler */ + bool nmi_injected; /* Trying to inject an NMI this entry */ struct mtrr_state_type mtrr_state; u32 pat; @@ -628,14 +631,13 @@ struct kvm_x86_ops { void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc); + u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu); void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); int (*check_intercept)(struct kvm_vcpu *vcpu, struct x86_instruction_info *info, enum x86_intercept_stage stage); - - const struct trace_print_flags *exit_reasons_str; }; struct kvm_arch_async_pf { @@ -672,6 +674,8 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn); extern bool tdp_enabled; +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu); + /* control of guest tsc rate supported? */ extern bool kvm_has_tsc_control; /* minimum supported tsc_khz for guests */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index d52609aeeab8..a6962d9161a0 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -229,6 +229,8 @@ #define MSR_IA32_APICBASE_ENABLE (1<<11) #define MSR_IA32_APICBASE_BASE (0xfffff<<12) +#define MSR_IA32_TSCDEADLINE 0x000006e0 + #define MSR_IA32_UCODE_WRITE 0x00000079 #define MSR_IA32_UCODE_REV 0x0000008b diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h index 704526734bef..e38197806853 100644 --- a/arch/x86/include/asm/pci_x86.h +++ b/arch/x86/include/asm/pci_x86.h @@ -99,10 +99,10 @@ struct pci_raw_ops { int reg, int len, u32 val); }; -extern struct pci_raw_ops *raw_pci_ops; -extern struct pci_raw_ops *raw_pci_ext_ops; +extern const struct pci_raw_ops *raw_pci_ops; +extern const struct pci_raw_ops *raw_pci_ext_ops; -extern struct pci_raw_ops pci_direct_conf1; +extern const struct pci_raw_ops pci_direct_conf1; extern bool port_cf9_safe; /* arch_initcall level */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 0d1171c97729..b650435ffb53 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -111,6 +111,7 @@ struct cpuinfo_x86 { /* Index into per_cpu list: */ u16 cpu_index; #endif + u32 microcode; } __attribute__((__aligned__(SMP_CACHE_BYTES))); #define X86_VENDOR_INTEL 0 @@ -179,7 +180,8 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx, "=b" (*ebx), "=c" (*ecx), "=d" (*edx) - : "0" (*eax), "2" (*ecx)); + : "0" (*eax), "2" (*ecx) + : "memory"); } static inline void load_cr3(pgd_t *pgdir) diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index df4cd32b4cc6..2dbe4a721ce5 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -204,13 +204,7 @@ static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem) */ static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem) { - long tmp = delta; - - asm volatile(LOCK_PREFIX "xadd %0,%1" - : "+r" (tmp), "+m" (sem->count) - : : "memory"); - - return tmp + delta; + return delta + xadd(&sem->count, delta); } #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index ee67edf86fdd..972c260919a3 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -49,109 +49,49 @@ * issues and should be optimal for the uncontended case. Note the tail must be * in the high part, because a wide xadd increment of the low part would carry * up and contaminate the high part. - * - * With fewer than 2^8 possible CPUs, we can use x86's partial registers to - * save some instructions and make the code more elegant. There really isn't - * much between them in performance though, especially as locks are out of line. */ -#if (NR_CPUS < 256) -#define TICKET_SHIFT 8 - static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) { - short inc = 0x0100; - - asm volatile ( - LOCK_PREFIX "xaddw %w0, %1\n" - "1:\t" - "cmpb %h0, %b0\n\t" - "je 2f\n\t" - "rep ; nop\n\t" - "movb %1, %b0\n\t" - /* don't need lfence here, because loads are in-order */ - "jmp 1b\n" - "2:" - : "+Q" (inc), "+m" (lock->slock) - : - : "memory", "cc"); + register struct __raw_tickets inc = { .tail = 1 }; + + inc = xadd(&lock->tickets, inc); + + for (;;) { + if (inc.head == inc.tail) + break; + cpu_relax(); + inc.head = ACCESS_ONCE(lock->tickets.head); + } + barrier(); /* make sure nothing creeps before the lock is taken */ } static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) { - int tmp, new; - - asm volatile("movzwl %2, %0\n\t" - "cmpb %h0,%b0\n\t" - "leal 0x100(%" REG_PTR_MODE "0), %1\n\t" - "jne 1f\n\t" - LOCK_PREFIX "cmpxchgw %w1,%2\n\t" - "1:" - "sete %b1\n\t" - "movzbl %b1,%0\n\t" - : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) - : - : "memory", "cc"); + arch_spinlock_t old, new; + + old.tickets = ACCESS_ONCE(lock->tickets); + if (old.tickets.head != old.tickets.tail) + return 0; + + new.head_tail = old.head_tail + (1 << TICKET_SHIFT); - return tmp; + /* cmpxchg is a full barrier, so nothing can move before it */ + return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; } +#if (NR_CPUS < 256) static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incb %0" - : "+m" (lock->slock) + : "+m" (lock->head_tail) : : "memory", "cc"); } #else -#define TICKET_SHIFT 16 - -static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock) -{ - int inc = 0x00010000; - int tmp; - - asm volatile(LOCK_PREFIX "xaddl %0, %1\n" - "movzwl %w0, %2\n\t" - "shrl $16, %0\n\t" - "1:\t" - "cmpl %0, %2\n\t" - "je 2f\n\t" - "rep ; nop\n\t" - "movzwl %1, %2\n\t" - /* don't need lfence here, because loads are in-order */ - "jmp 1b\n" - "2:" - : "+r" (inc), "+m" (lock->slock), "=&r" (tmp) - : - : "memory", "cc"); -} - -static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock) -{ - int tmp; - int new; - - asm volatile("movl %2,%0\n\t" - "movl %0,%1\n\t" - "roll $16, %0\n\t" - "cmpl %0,%1\n\t" - "leal 0x00010000(%" REG_PTR_MODE "0), %1\n\t" - "jne 1f\n\t" - LOCK_PREFIX "cmpxchgl %1,%2\n\t" - "1:" - "sete %b1\n\t" - "movzbl %b1,%0\n\t" - : "=&a" (tmp), "=&q" (new), "+m" (lock->slock) - : - : "memory", "cc"); - - return tmp; -} - static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) { asm volatile(UNLOCK_LOCK_PREFIX "incw %0" - : "+m" (lock->slock) + : "+m" (lock->head_tail) : : "memory", "cc"); } @@ -159,16 +99,16 @@ static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock) static inline int __ticket_spin_is_locked(arch_spinlock_t *lock) { - int tmp = ACCESS_ONCE(lock->slock); + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); + return !!(tmp.tail ^ tmp.head); } static inline int __ticket_spin_is_contended(arch_spinlock_t *lock) { - int tmp = ACCESS_ONCE(lock->slock); + struct __raw_tickets tmp = ACCESS_ONCE(lock->tickets); - return (((tmp >> TICKET_SHIFT) - tmp) & ((1 << TICKET_SHIFT) - 1)) > 1; + return ((tmp.tail - tmp.head) & TICKET_MASK) > 1; } #ifndef CONFIG_PARAVIRT_SPINLOCKS diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 7c7a486fcb68..8ebd5df7451e 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -5,11 +5,29 @@ # error "please don't include this file directly" #endif +#include <linux/types.h> + +#if (CONFIG_NR_CPUS < 256) +typedef u8 __ticket_t; +typedef u16 __ticketpair_t; +#else +typedef u16 __ticket_t; +typedef u32 __ticketpair_t; +#endif + +#define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#define TICKET_MASK ((__ticket_t)((1 << TICKET_SHIFT) - 1)) + typedef struct arch_spinlock { - unsigned int slock; + union { + __ticketpair_t head_tail; + struct __raw_tickets { + __ticket_t head, tail; + } tickets; + }; } arch_spinlock_t; -#define __ARCH_SPIN_LOCK_UNLOCKED { 0 } +#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } #include <asm/rwlock.h> diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 593485b38ab3..599c77d38f33 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -352,10 +352,12 @@ #define __NR_syncfs 344 #define __NR_sendmmsg 345 #define __NR_setns 346 +#define __NR_process_vm_readv 347 +#define __NR_process_vm_writev 348 #ifdef __KERNEL__ -#define NR_syscalls 347 +#define NR_syscalls 349 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 0a6ba337a2eb..0431f193c3f2 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg) __SYSCALL(__NR_setns, sys_setns) #define __NR_getcpu 309 __SYSCALL(__NR_getcpu, sys_getcpu) +#define __NR_process_vm_readv 310 +__SYSCALL(__NR_process_vm_readv, sys_process_vm_readv) +#define __NR_process_vm_writev 311 +__SYSCALL(__NR_process_vm_writev, sys_process_vm_writev) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h index 37d369859c8e..8e862aaf0d90 100644 --- a/arch/x86/include/asm/uv/uv_bau.h +++ b/arch/x86/include/asm/uv/uv_bau.h @@ -55,6 +55,7 @@ #define UV_BAU_TUNABLES_DIR "sgi_uv" #define UV_BAU_TUNABLES_FILE "bau_tunables" #define WHITESPACE " \t\n" +#define uv_mmask ((1UL << uv_hub_info->m_val) - 1) #define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) #define cpubit_isset(cpu, bau_local_cpumask) \ test_bit((cpu), (bau_local_cpumask).bits) @@ -656,11 +657,7 @@ static inline int atomic_read_short(const struct atomic_short *v) */ static inline int atom_asr(short i, struct atomic_short *v) { - short __i = i; - asm volatile(LOCK_PREFIX "xaddw %0, %1" - : "+r" (i), "+m" (v->counter) - : : "memory"); - return i + __i; + return i + xadd(&v->counter, i); } /* diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h index f26544a15214..54a13aaebc40 100644 --- a/arch/x86/include/asm/uv/uv_hub.h +++ b/arch/x86/include/asm/uv/uv_hub.h @@ -46,6 +46,13 @@ * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant * of the nasid for socket usage. * + * GPA - (global physical address) a socket physical address converted + * so that it can be used by the GRU as a global address. Socket + * physical addresses 1) need additional NASID (node) bits added + * to the high end of the address, and 2) unaliased if the + * partition does not have a physical address 0. In addition, on + * UV2 rev 1, GPAs need the gnode left shifted to bits 39 or 40. + * * * NumaLink Global Physical Address Format: * +--------------------------------+---------------------+ @@ -141,6 +148,8 @@ struct uv_hub_info_s { unsigned int gnode_extra; unsigned char hub_revision; unsigned char apic_pnode_shift; + unsigned char m_shift; + unsigned char n_lshift; unsigned long gnode_upper; unsigned long lowmem_remap_top; unsigned long lowmem_remap_base; @@ -177,6 +186,16 @@ static inline int is_uv2_hub(void) return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE; } +static inline int is_uv2_1_hub(void) +{ + return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE; +} + +static inline int is_uv2_2_hub(void) +{ + return uv_hub_info->hub_revision == UV2_HUB_REVISION_BASE + 1; +} + union uvh_apicid { unsigned long v; struct uvh_apicid_s { @@ -276,7 +295,10 @@ static inline unsigned long uv_soc_phys_ram_to_gpa(unsigned long paddr) { if (paddr < uv_hub_info->lowmem_remap_top) paddr |= uv_hub_info->lowmem_remap_base; - return paddr | uv_hub_info->gnode_upper; + paddr |= uv_hub_info->gnode_upper; + paddr = ((paddr << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((paddr >> uv_hub_info->m_val) << uv_hub_info->n_lshift); + return paddr; } @@ -300,16 +322,19 @@ static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa) unsigned long remap_base = uv_hub_info->lowmem_remap_base; unsigned long remap_top = uv_hub_info->lowmem_remap_top; + gpa = ((gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift) | + ((gpa >> uv_hub_info->n_lshift) << uv_hub_info->m_val); + gpa = gpa & uv_hub_info->gpa_mask; if (paddr >= remap_base && paddr < remap_base + remap_top) paddr -= remap_base; return paddr; } -/* gnode -> pnode */ +/* gpa -> pnode */ static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) { - return gpa >> uv_hub_info->m_val; + return gpa >> uv_hub_info->n_lshift; } /* gpa -> pnode */ @@ -320,6 +345,12 @@ static inline int uv_gpa_to_pnode(unsigned long gpa) return uv_gpa_to_gnode(gpa) & n_mask; } +/* gpa -> node offset*/ +static inline unsigned long uv_gpa_to_offset(unsigned long gpa) +{ + return (gpa << uv_hub_info->m_shift) >> uv_hub_info->m_shift; +} + /* pnode, offset --> socket virtual */ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset) { diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index 2caf290e9895..31f180c21ce9 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -350,6 +350,18 @@ enum vmcs_field { #define DEBUG_REG_ACCESS_REG(eq) (((eq) >> 8) & 0xf) /* 11:8, general purpose reg. */ +/* + * Exit Qualifications for APIC-Access + */ +#define APIC_ACCESS_OFFSET 0xfff /* 11:0, offset within the APIC page */ +#define APIC_ACCESS_TYPE 0xf000 /* 15:12, access type */ +#define TYPE_LINEAR_APIC_INST_READ (0 << 12) +#define TYPE_LINEAR_APIC_INST_WRITE (1 << 12) +#define TYPE_LINEAR_APIC_INST_FETCH (2 << 12) +#define TYPE_LINEAR_APIC_EVENT (3 << 12) +#define TYPE_PHYSICAL_APIC_EVENT (10 << 12) +#define TYPE_PHYSICAL_APIC_INST (15 << 12) + /* segment AR */ #define SEGMENT_AR_L_MASK (1 << 13) diff --git a/arch/x86/include/asm/xen/grant_table.h b/arch/x86/include/asm/xen/grant_table.h deleted file mode 100644 index fdbbb45767a6..000000000000 --- a/arch/x86/include/asm/xen/grant_table.h +++ /dev/null @@ -1,7 +0,0 @@ -#ifndef _ASM_X86_XEN_GRANT_TABLE_H -#define _ASM_X86_XEN_GRANT_TABLE_H - -#define xen_alloc_vm_area(size) alloc_vm_area(size) -#define xen_free_vm_area(area) free_vm_area(area) - -#endif /* _ASM_X86_XEN_GRANT_TABLE_H */ diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h index 417777de5a40..5728852fb90f 100644 --- a/arch/x86/include/asm/xen/hypercall.h +++ b/arch/x86/include/asm/xen/hypercall.h @@ -47,6 +47,7 @@ #include <xen/interface/xen.h> #include <xen/interface/sched.h> #include <xen/interface/physdev.h> +#include <xen/interface/platform.h> /* * The hypercall asms have to meet several constraints: @@ -301,6 +302,13 @@ HYPERVISOR_set_timer_op(u64 timeout) } static inline int +HYPERVISOR_dom0_op(struct xen_platform_op *platform_op) +{ + platform_op->interface_version = XENPF_INTERFACE_VERSION; + return _hypercall1(int, dom0_op, platform_op); +} + +static inline int HYPERVISOR_set_debugreg(int reg, unsigned long value) { return _hypercall2(int, set_debugreg, reg, value); diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h index 5d4922ad4b9b..a1f2db5f1170 100644 --- a/arch/x86/include/asm/xen/interface.h +++ b/arch/x86/include/asm/xen/interface.h @@ -55,6 +55,7 @@ DEFINE_GUEST_HANDLE(char); DEFINE_GUEST_HANDLE(int); DEFINE_GUEST_HANDLE(long); DEFINE_GUEST_HANDLE(void); +DEFINE_GUEST_HANDLE(uint64_t); #endif #ifndef HYPERVISOR_VIRT_START diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 75be00ecfff2..62ae3001ae02 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -820,6 +820,10 @@ void __init uv_system_init(void) uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision; + uv_cpu_hub_info(cpu)->m_shift = 64 - m_val; + uv_cpu_hub_info(cpu)->n_lshift = is_uv2_1_hub() ? + (m_val == 40 ? 40 : 39) : m_val; + pnode = uv_apicid_to_pnode(apicid); blade = boot_pnode_to_blade(pnode); lcpu = uv_blade_info[blade].nr_possible_cpus; @@ -850,8 +854,7 @@ void __init uv_system_init(void) if (uv_node_to_blade[nid] >= 0) continue; paddr = node_start_pfn(nid) << PAGE_SHIFT; - paddr = uv_soc_phys_ram_to_gpa(paddr); - pnode = (paddr >> m_val) & pnode_mask; + pnode = uv_gpa_to_pnode(uv_soc_phys_ram_to_gpa(paddr)); blade = boot_pnode_to_blade(pnode); uv_node_to_blade[nid] = blade; } diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index fe6eb197f848..25f24dccdcfa 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -15,6 +15,7 @@ CFLAGS_common.o := $(nostackp) obj-y := intel_cacheinfo.o scattered.o topology.o obj-y += proc.o capflags.o powerflags.o common.o obj-y += vmware.o hypervisor.o sched.o mshyperv.o +obj-y += rdrand.o obj-$(CONFIG_X86_32) += bugs.o obj-$(CONFIG_X86_64) += bugs_64.o diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index b13ed393dfce..c7e46cb35327 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1,5 +1,7 @@ +#include <linux/export.h> #include <linux/init.h> #include <linux/bitops.h> +#include <linux/elf.h> #include <linux/mm.h> #include <linux/io.h> @@ -410,8 +412,38 @@ static void __cpuinit early_init_amd_mc(struct cpuinfo_x86 *c) #endif } +static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) +{ + if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { + + if (c->x86 > 0x10 || + (c->x86 == 0x10 && c->x86_model >= 0x2)) { + u64 val; + + rdmsrl(MSR_K7_HWCR, val); + if (!(val & BIT(24))) + printk(KERN_WARNING FW_BUG "TSC doesn't count " + "with P0 frequency!\n"); + } + } + + if (c->x86 == 0x15) { + unsigned long upperbit; + u32 cpuid, assoc; + + cpuid = cpuid_edx(0x80000005); + assoc = cpuid >> 16 & 0xff; + upperbit = ((cpuid >> 24) << 10) / assoc; + + va_align.mask = (upperbit - 1) & PAGE_MASK; + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + } +} + static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + early_init_amd_mc(c); /* @@ -442,22 +474,7 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) } #endif - /* We need to do the following only once */ - if (c != &boot_cpu_data) - return; - - if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) { - - if (c->x86 > 0x10 || - (c->x86 == 0x10 && c->x86_model >= 0x2)) { - u64 val; - - rdmsrl(MSR_K7_HWCR, val); - if (!(val & BIT(24))) - printk(KERN_WARNING FW_BUG "TSC doesn't count " - "with P0 frequency!\n"); - } - } + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -679,6 +696,7 @@ static const struct cpu_dev __cpuinitconst amd_cpu_dev = { .c_size_cache = amd_size_cache, #endif .c_early_init = early_init_amd, + .c_bsp_init = bsp_init_amd, .c_init = init_amd, .c_x86_vendor = X86_VENDOR_AMD, }; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62184390a601..aa003b13a831 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -15,6 +15,7 @@ #include <asm/stackprotector.h> #include <asm/perf_event.h> #include <asm/mmu_context.h> +#include <asm/archrandom.h> #include <asm/hypervisor.h> #include <asm/processor.h> #include <asm/sections.h> @@ -681,6 +682,9 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) filter_cpuid_features(c, false); setup_smep(c); + + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); } void __init early_cpu_init(void) @@ -857,6 +861,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) #endif init_hypervisor(c); + x86_init_rdrand(c); /* * Clear/Set all flags overriden by options, need do it diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index e765633f210e..1b22dcc51af4 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -18,6 +18,7 @@ struct cpu_dev { struct cpu_model_info c_models[4]; void (*c_early_init)(struct cpuinfo_x86 *); + void (*c_bsp_init)(struct cpuinfo_x86 *); void (*c_init)(struct cpuinfo_x86 *); void (*c_identify)(struct cpuinfo_x86 *); unsigned int (*c_size_cache)(struct cpuinfo_x86 *, unsigned int); diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index ed6086eedf1d..523131213f08 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -47,6 +47,15 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) (c->x86 == 0x6 && c->x86_model >= 0x0e)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); + if (c->x86 >= 6 && !cpu_has(c, X86_FEATURE_IA64)) { + unsigned lower_word; + + wrmsr(MSR_IA32_UCODE_REV, 0, 0); + /* Required by the SDM */ + sync_core(); + rdmsr(MSR_IA32_UCODE_REV, lower_word, c->microcode); + } + /* * Atom erratum AAE44/AAF40/AAG38/AAH41: * @@ -55,17 +64,10 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) * need the microcode to have already been loaded... so if it is * not, recommend a BIOS update and disable large pages. */ - if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) { - u32 ucode, junk; - - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - sync_core(); - rdmsr(MSR_IA32_UCODE_REV, junk, ucode); - - if (ucode < 0x20e) { - printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); - clear_cpu_cap(c, X86_FEATURE_PSE); - } + if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2 && + c->microcode < 0x20e) { + printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n"); + clear_cpu_cap(c, X86_FEATURE_PSE); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index c105c533ed94..a3b0811693c9 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -151,28 +151,17 @@ union _cpuid4_leaf_ecx { u32 full; }; -struct amd_l3_cache { - struct amd_northbridge *nb; - unsigned indices; - u8 subcaches[4]; -}; - -struct _cpuid4_info { +struct _cpuid4_info_regs { union _cpuid4_leaf_eax eax; union _cpuid4_leaf_ebx ebx; union _cpuid4_leaf_ecx ecx; unsigned long size; - struct amd_l3_cache *l3; - DECLARE_BITMAP(shared_cpu_map, NR_CPUS); + struct amd_northbridge *nb; }; -/* subset of above _cpuid4_info w/o shared_cpu_map */ -struct _cpuid4_info_regs { - union _cpuid4_leaf_eax eax; - union _cpuid4_leaf_ebx ebx; - union _cpuid4_leaf_ecx ecx; - unsigned long size; - struct amd_l3_cache *l3; +struct _cpuid4_info { + struct _cpuid4_info_regs base; + DECLARE_BITMAP(shared_cpu_map, NR_CPUS); }; unsigned short num_cache_leaves; @@ -314,16 +303,23 @@ struct _cache_attr { /* * L3 cache descriptors */ -static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) +static void __cpuinit amd_calc_l3_indices(struct amd_northbridge *nb) { + struct amd_l3_cache *l3 = &nb->l3_cache; unsigned int sc0, sc1, sc2, sc3; u32 val = 0; - pci_read_config_dword(l3->nb->misc, 0x1C4, &val); + pci_read_config_dword(nb->misc, 0x1C4, &val); /* calculate subcache sizes */ l3->subcaches[0] = sc0 = !(val & BIT(0)); l3->subcaches[1] = sc1 = !(val & BIT(4)); + + if (boot_cpu_data.x86 == 0x15) { + l3->subcaches[0] = sc0 += !(val & BIT(1)); + l3->subcaches[1] = sc1 += !(val & BIT(5)); + } + l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); @@ -333,33 +329,16 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) { - static struct amd_l3_cache *__cpuinitdata l3_caches; int node; /* only for L3, and not in virtualized environments */ - if (index < 3 || amd_nb_num() == 0) + if (index < 3) return; - /* - * Strictly speaking, the amount in @size below is leaked since it is - * never freed but this is done only on shutdown so it doesn't matter. - */ - if (!l3_caches) { - int size = amd_nb_num() * sizeof(struct amd_l3_cache); - - l3_caches = kzalloc(size, GFP_ATOMIC); - if (!l3_caches) - return; - } - node = amd_get_nb_id(smp_processor_id()); - - if (!l3_caches[node].nb) { - l3_caches[node].nb = node_to_amd_nb(node); - amd_calc_l3_indices(&l3_caches[node]); - } - - this_leaf->l3 = &l3_caches[node]; + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); } /* @@ -369,11 +348,11 @@ static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, * * @returns: the disabled index if used or negative value if slot free. */ -int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot) +int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) { unsigned int reg = 0; - pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, ®); + pci_read_config_dword(nb->misc, 0x1BC + slot * 4, ®); /* check whether this slot is activated already */ if (reg & (3UL << 30)) @@ -387,11 +366,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, { int index; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; - index = amd_get_l3_disable_slot(this_leaf->l3, slot); + index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -408,7 +386,7 @@ show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ SHOW_CACHE_DISABLE(0) SHOW_CACHE_DISABLE(1) -static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, +static void amd_l3_disable_index(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long idx) { int i; @@ -421,10 +399,10 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, for (i = 0; i < 4; i++) { u32 reg = idx | (i << 20); - if (!l3->subcaches[i]) + if (!nb->l3_cache.subcaches[i]) continue; - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); /* * We need to WBINVD on a core on the node containing the L3 @@ -434,7 +412,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, wbinvd_on_cpu(cpu); reg |= BIT(31); - pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg); + pci_write_config_dword(nb->misc, 0x1BC + slot * 4, reg); } } @@ -448,24 +426,24 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu, * * @return: 0 on success, error status on failure */ -int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot, +int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, unsigned long index) { int ret = 0; /* check if @slot is already used or the index is already disabled */ - ret = amd_get_l3_disable_slot(l3, slot); + ret = amd_get_l3_disable_slot(nb, slot); if (ret >= 0) return -EINVAL; - if (index > l3->indices) + if (index > nb->l3_cache.indices) return -EINVAL; /* check whether the other slot has disabled the same index already */ - if (index == amd_get_l3_disable_slot(l3, !slot)) + if (index == amd_get_l3_disable_slot(nb, !slot)) return -EINVAL; - amd_l3_disable_index(l3, cpu, slot, index); + amd_l3_disable_index(nb, cpu, slot, index); return 0; } @@ -480,8 +458,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || - !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) return -EINVAL; cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); @@ -489,7 +466,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->l3, cpu, slot, val); + err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); if (err) { if (err == -EEXIST) printk(KERN_WARNING "L3 disable slot %d in use!\n", @@ -518,7 +495,7 @@ static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, static ssize_t show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); @@ -533,7 +510,7 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return -EINVAL; if (strict_strtoul(buf, 16, &val) < 0) @@ -769,7 +746,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) return; } this_leaf = CPUID4_INFO_IDX(cpu, index); - num_threads_sharing = 1 + this_leaf->eax.split.num_threads_sharing; + num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; if (num_threads_sharing == 1) cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); @@ -820,29 +797,19 @@ static void __cpuinit free_cache_attributes(unsigned int cpu) for (i = 0; i < num_cache_leaves; i++) cache_remove_shared_cpu_map(cpu, i); - kfree(per_cpu(ici_cpuid4_info, cpu)->l3); kfree(per_cpu(ici_cpuid4_info, cpu)); per_cpu(ici_cpuid4_info, cpu) = NULL; } -static int -__cpuinit cpuid4_cache_lookup(int index, struct _cpuid4_info *this_leaf) -{ - struct _cpuid4_info_regs *leaf_regs = - (struct _cpuid4_info_regs *)this_leaf; - - return cpuid4_cache_lookup_regs(index, leaf_regs); -} - static void __cpuinit get_cpu_leaves(void *_retval) { int j, *retval = _retval, cpu = smp_processor_id(); /* Do cpuid and store the results */ for (j = 0; j < num_cache_leaves; j++) { - struct _cpuid4_info *this_leaf; - this_leaf = CPUID4_INFO_IDX(cpu, j); - *retval = cpuid4_cache_lookup(j, this_leaf); + struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + + *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); if (unlikely(*retval < 0)) { int i; @@ -900,16 +867,16 @@ static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ } -show_one_plus(level, eax.split.level, 0); -show_one_plus(coherency_line_size, ebx.split.coherency_line_size, 1); -show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1); -show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); -show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); +show_one_plus(level, base.eax.split.level, 0); +show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); +show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); +show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); +show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - return sprintf(buf, "%luK\n", this_leaf->size / 1024); + return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); } static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, @@ -946,7 +913,7 @@ static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) { - switch (this_leaf->eax.split.type) { + switch (this_leaf->base.eax.split.type) { case CACHE_TYPE_DATA: return sprintf(buf, "Data\n"); case CACHE_TYPE_INST: @@ -1135,7 +1102,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) ktype_cache.default_attrs = default_attrs; #ifdef CONFIG_AMD_NB - if (this_leaf->l3) + if (this_leaf->base.nb) ktype_cache.default_attrs = amd_l3_attrs(); #endif retval = kobject_init_and_add(&(this_object->kobj), diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c index 83930deec3c6..507ea58688e2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-apei.c +++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c @@ -28,6 +28,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include <linux/export.h> #include <linux/kernel.h> #include <linux/acpi.h> #include <linux/cper.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index fce51ad1f362..362056aefeb4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -36,8 +36,8 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/debugfs.h> -#include <linux/edac_mce.h> #include <linux/irq_work.h> +#include <linux/export.h> #include <asm/processor.h> #include <asm/mce.h> @@ -144,23 +144,20 @@ static struct mce_log mcelog = { void mce_log(struct mce *mce) { unsigned next, entry; + int ret = 0; /* Emit the trace record: */ trace_mce_record(mce); + ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, mce); + if (ret == NOTIFY_STOP) + return; + mce->finished = 0; wmb(); for (;;) { entry = rcu_dereference_check_mce(mcelog.next); for (;;) { - /* - * If edac_mce is enabled, it will check the error type - * and will process it, if it is a known error. - * Otherwise, the error will be sent through mcelog - * interface - */ - if (edac_mce_parse(mce)) - return; /* * When the buffer fills up discard new entries. @@ -217,8 +214,13 @@ static void print_mce(struct mce *m) pr_cont("MISC %llx ", m->misc); pr_cont("\n"); - pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x\n", - m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid); + /* + * Note this output is parsed by external tools and old fields + * should not be changed. + */ + pr_emerg(HW_ERR "PROCESSOR %u:%x TIME %llu SOCKET %u APIC %x microcode %x\n", + m->cpuvendor, m->cpuid, m->time, m->socketid, m->apicid, + cpu_data(m->extcpu).microcode); /* * Print out human-readable details about the MCE error, @@ -551,10 +553,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { + if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) mce_log(&m); - atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); - } /* * Clear state for this bank. diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 27c625178bf1..787e06c84ea6 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -18,6 +18,7 @@ #include <linux/jiffies.h> #include <linux/kernel.h> #include <linux/percpu.h> +#include <linux/export.h> #include <linux/sysdev.h> #include <linux/types.h> #include <linux/init.h> diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index d944bf6c50e9..0a630dd4b620 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -11,6 +11,8 @@ */ #include <linux/types.h> +#include <linux/time.h> +#include <linux/clocksource.h> #include <linux/module.h> #include <asm/processor.h> #include <asm/hypervisor.h> @@ -36,6 +38,25 @@ static bool __init ms_hyperv_platform(void) !memcmp("Microsoft Hv", hyp_signature, 12); } +static cycle_t read_hv_clock(struct clocksource *arg) +{ + cycle_t current_tick; + /* + * Read the partition counter to get the current tick count. This count + * is set to 0 when the partition is created and is incremented in + * 100 nanosecond units. + */ + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); + return current_tick; +} + +static struct clocksource hyperv_cs = { + .name = "hyperv_clocksource", + .rating = 400, /* use this when running on Hyperv*/ + .read = read_hv_clock, + .mask = CLOCKSOURCE_MASK(64), +}; + static void __init ms_hyperv_init_platform(void) { /* @@ -46,6 +67,8 @@ static void __init ms_hyperv_init_platform(void) printk(KERN_INFO "HyperV: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + + clocksource_register_hz(&hyperv_cs, NSEC_PER_SEC/100); } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index e09ca20e86ee..2be5ebe99872 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -9,6 +9,7 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/slab.h> +#include <linux/export.h> #include <asm/hardirq.h> #include <asm/apic.h> diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 62ac8cb6ba27..14b23140e81f 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -85,6 +85,8 @@ static int show_cpuinfo(struct seq_file *m, void *v) seq_printf(m, "stepping\t: %d\n", c->x86_mask); else seq_printf(m, "stepping\t: unknown\n"); + if (c->microcode) + seq_printf(m, "microcode\t: 0x%x\n", c->microcode); if (cpu_has(c, X86_FEATURE_TSC)) { unsigned int freq = cpufreq_quick_get(cpu); diff --git a/arch/x86/kernel/cpu/rdrand.c b/arch/x86/kernel/cpu/rdrand.c new file mode 100644 index 000000000000..feca286c2bb4 --- /dev/null +++ b/arch/x86/kernel/cpu/rdrand.c @@ -0,0 +1,73 @@ +/* + * This file is part of the Linux kernel. + * + * Copyright (c) 2011, Intel Corporation + * Authors: Fenghua Yu <fenghua.yu@intel.com>, + * H. Peter Anvin <hpa@linux.intel.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + * + */ + +#include <asm/processor.h> +#include <asm/archrandom.h> +#include <asm/sections.h> + +static int __init x86_rdrand_setup(char *s) +{ + setup_clear_cpu_cap(X86_FEATURE_RDRAND); + return 1; +} +__setup("nordrand", x86_rdrand_setup); + +/* We can't use arch_get_random_long() here since alternatives haven't run */ +static inline int rdrand_long(unsigned long *v) +{ + int ok; + asm volatile("1: " RDRAND_LONG "\n\t" + "jc 2f\n\t" + "decl %0\n\t" + "jnz 1b\n\t" + "2:" + : "=r" (ok), "=a" (*v) + : "0" (RDRAND_RETRY_LOOPS)); + return ok; +} + +/* + * Force a reseed cycle; we are architecturally guaranteed a reseed + * after no more than 512 128-bit chunks of random data. This also + * acts as a test of the CPU capability. + */ +#define RESEED_LOOP ((512*128)/sizeof(unsigned long)) + +void __cpuinit x86_init_rdrand(struct cpuinfo_x86 *c) +{ +#ifdef CONFIG_ARCH_RANDOM + unsigned long tmp; + int i, count, ok; + + if (!cpu_has(c, X86_FEATURE_RDRAND)) + return; /* Nothing to do */ + + for (count = i = 0; i < RESEED_LOOP; i++) { + ok = rdrand_long(&tmp); + if (ok) + count++; + } + + if (count != RESEED_LOOP) + clear_cpu_cap(c, X86_FEATURE_RDRAND); +#endif +} diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index a621f3427685..52821799a702 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -2,6 +2,7 @@ * Architecture specific OF callbacks. */ #include <linux/bootmem.h> +#include <linux/export.h> #include <linux/io.h> #include <linux/interrupt.h> #include <linux/list.h> diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3e2ef8425316..303a0e48f076 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -12,6 +12,7 @@ #include <linux/types.h> #include <linux/init.h> #include <linux/crash_dump.h> +#include <linux/export.h> #include <linux/bootmem.h> #include <linux/pfn.h> #include <linux/suspend.h> diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 4aecc54236a9..b946a9eac7d9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1,6 +1,7 @@ #include <linux/clocksource.h> #include <linux/clockchips.h> #include <linux/interrupt.h> +#include <linux/export.h> #include <linux/sysdev.h> #include <linux/delay.h> #include <linux/errno.h> diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 6c0802eb2f7f..429e0c92924e 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -9,6 +9,7 @@ #include <linux/smp.h> #include <linux/ftrace.h> #include <linux/delay.h> +#include <linux/export.h> #include <asm/apic.h> #include <asm/io_apic.h> diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index cacdd46d184d..ea9d5f2f13ef 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -24,8 +24,9 @@ union jump_code_union { } __attribute__((packed)); }; -void arch_jump_label_transform(struct jump_entry *entry, - enum jump_label_type type) +static void __jump_label_transform(struct jump_entry *entry, + enum jump_label_type type, + void *(*poker)(void *, const void *, size_t)) { union jump_code_union code; @@ -35,17 +36,24 @@ void arch_jump_label_transform(struct jump_entry *entry, (entry->code + JUMP_LABEL_NOP_SIZE); } else memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE); + + (*poker)((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); +} + +void arch_jump_label_transform(struct jump_entry *entry, + enum jump_label_type type) +{ get_online_cpus(); mutex_lock(&text_mutex); - text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); + __jump_label_transform(entry, type, text_poke_smp); mutex_unlock(&text_mutex); put_online_cpus(); } -void __init_or_module arch_jump_label_text_poke_early(jump_label_t addr) +void arch_jump_label_transform_static(struct jump_entry *entry, + enum jump_label_type type) { - text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5], - JUMP_LABEL_NOP_SIZE); + __jump_label_transform(entry, type, text_poke_early); } #endif diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 591be0ee1934..d494799aafcd 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -74,14 +74,13 @@ static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); - u32 dummy; if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { pr_warning("CPU%d: family %d not supported\n", cpu, c->x86); return -1; } - rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); + csig->rev = c->microcode; pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev); return 0; @@ -130,6 +129,7 @@ static int apply_microcode_amd(int cpu) int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; struct microcode_amd *mc_amd = uci->mc; + struct cpuinfo_x86 *c = &cpu_data(cpu); /* We should bind the task to the CPU */ BUG_ON(cpu_num != cpu); @@ -150,6 +150,7 @@ static int apply_microcode_amd(int cpu) pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev); uci->cpu_sig.rev = rev; + c->microcode = rev; return 0; } diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f9242800bc84..f2d2a664e797 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -483,7 +483,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; - case CPU_DEAD: + + /* + * When a CPU goes offline, don't free up or invalidate the copy of + * the microcode in kernel memory, so that we can reuse it when the + * CPU comes back online without unnecessarily requesting the userspace + * for it again. + */ case CPU_UP_CANCELED_FROZEN: /* The CPU refused to come up during a system resume */ microcode_fini_cpu(cpu); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 1a1b606d3e92..3ca42d0e43a2 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -161,12 +161,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ - sync_core(); - /* get the current revision from MSR 0x8B */ - rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - + csig->rev = c->microcode; pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", cpu_num, csig->sig, csig->pf, csig->rev); @@ -299,9 +294,9 @@ static int apply_microcode(int cpu) struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; unsigned int val[2]; - int cpu_num; + int cpu_num = raw_smp_processor_id(); + struct cpuinfo_x86 *c = &cpu_data(cpu_num); - cpu_num = raw_smp_processor_id(); uci = ucode_cpu_info + cpu; mc_intel = uci->mc; @@ -317,7 +312,7 @@ static int apply_microcode(int cpu) (unsigned long) mc_intel->bits >> 16 >> 16); wrmsr(MSR_IA32_UCODE_REV, 0, 0); - /* see notes above for revision 1.07. Apparent chip bug */ + /* As documented in the SDM: Do a CPUID 1 here */ sync_core(); /* get the current revision from MSR 0x8B */ @@ -335,6 +330,7 @@ static int apply_microcode(int cpu) (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + c->microcode = val[1]; return 0; } diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index 7ec5bd140b87..b9c8628974af 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -17,6 +17,7 @@ #include <linux/delay.h> #include <linux/hardirq.h> #include <linux/slab.h> +#include <linux/export.h> #include <linux/mca.h> diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c index 622872054fbe..80dc793b3f63 100644 --- a/arch/x86/kernel/pci-dma.c +++ b/arch/x86/kernel/pci-dma.c @@ -1,6 +1,7 @@ #include <linux/dma-mapping.h> #include <linux/dma-debug.h> #include <linux/dmar.h> +#include <linux/export.h> #include <linux/bootmem.h> #include <linux/gfp.h> #include <linux/pci.h> diff --git a/arch/x86/kernel/probe_roms.c b/arch/x86/kernel/probe_roms.c index 63228035f9d7..34e06e84ce31 100644 --- a/arch/x86/kernel/probe_roms.c +++ b/arch/x86/kernel/probe_roms.c @@ -10,9 +10,9 @@ #include <linux/dmi.h> #include <linux/pfn.h> #include <linux/pci.h> -#include <asm/pci-direct.h> - +#include <linux/export.h> +#include <asm/pci-direct.h> #include <asm/e820.h> #include <asm/mmzone.h> #include <asm/setup.h> diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index ccdbc16b8941..348ce016a835 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -5,6 +5,7 @@ #include <linux/mc146818rtc.h> #include <linux/acpi.h> #include <linux/bcd.h> +#include <linux/export.h> #include <linux/pnp.h> #include <linux/of.h> diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index 013e7eba83bb..16204dc15484 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -16,6 +16,7 @@ #include <linux/mm.h> #include <linux/delay.h> #include <linux/spinlock.h> +#include <linux/export.h> #include <linux/kernel_stat.h> #include <linux/mc146818rtc.h> #include <linux/cache.h> diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index ff14a5044ce6..051489082d59 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -14,10 +14,73 @@ #include <linux/personality.h> #include <linux/random.h> #include <linux/uaccess.h> +#include <linux/elf.h> #include <asm/ia32.h> #include <asm/syscalls.h> +/* + * Align a virtual address to avoid aliasing in the I$ on AMD F15h. + * + * @flags denotes the allocation direction - bottomup or topdown - + * or vDSO; see call sites below. + */ +unsigned long align_addr(unsigned long addr, struct file *filp, + enum align_flags flags) +{ + unsigned long tmp_addr; + + /* handle 32- and 64-bit case with a single conditional */ + if (va_align.flags < 0 || !(va_align.flags & (2 - mmap_is_ia32()))) + return addr; + + if (!(current->flags & PF_RANDOMIZE)) + return addr; + + if (!((flags & ALIGN_VDSO) || filp)) + return addr; + + tmp_addr = addr; + + /* + * We need an address which is <= than the original + * one only when in topdown direction. + */ + if (!(flags & ALIGN_TOPDOWN)) + tmp_addr += va_align.mask; + + tmp_addr &= ~va_align.mask; + + return tmp_addr; +} + +static int __init control_va_addr_alignment(char *str) +{ + /* guard against enabling this on other CPU families */ + if (va_align.flags < 0) + return 1; + + if (*str == 0) + return 1; + + if (*str == '=') + str++; + + if (!strcmp(str, "32")) + va_align.flags = ALIGN_VA_32; + else if (!strcmp(str, "64")) + va_align.flags = ALIGN_VA_64; + else if (!strcmp(str, "off")) + va_align.flags = 0; + else if (!strcmp(str, "on")) + va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + else + return 0; + + return 1; +} +__setup("align_va_addr", control_va_addr_alignment); + SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len, unsigned long, prot, unsigned long, flags, unsigned long, fd, unsigned long, off) @@ -92,6 +155,9 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, start_addr = addr; full_search: + + addr = align_addr(addr, filp, 0); + for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { /* At this point: (!vma || addr < vma->vm_end). */ if (end - len < addr) { @@ -117,6 +183,7 @@ full_search: mm->cached_hole_size = vma->vm_start - addr; addr = vma->vm_end; + addr = align_addr(addr, filp, 0); } } @@ -161,10 +228,13 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, /* make sure it can fit in the remaining address space */ if (addr > len) { - vma = find_vma(mm, addr-len); - if (!vma || addr <= vma->vm_start) + unsigned long tmp_addr = align_addr(addr - len, filp, + ALIGN_TOPDOWN); + + vma = find_vma(mm, tmp_addr); + if (!vma || tmp_addr + len <= vma->vm_start) /* remember the address as a hint for next time */ - return mm->free_area_cache = addr-len; + return mm->free_area_cache = tmp_addr; } if (mm->mmap_base < len) @@ -173,6 +243,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, addr = mm->mmap_base-len; do { + addr = align_addr(addr, filp, ALIGN_TOPDOWN); + /* * Lookup failure means no vma is above this address, * else if new region fits below vma->vm_start, diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index bc19be332bc9..9a0e31293920 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -346,3 +346,5 @@ ENTRY(sys_call_table) .long sys_syncfs .long sys_sendmmsg /* 345 */ .long sys_setns + .long sys_process_vm_readv + .long sys_process_vm_writev diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e07a2fc876b9..e2410e27f97e 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -22,6 +22,7 @@ #include <linux/dma_remapping.h> #include <linux/init_task.h> #include <linux/spinlock.h> +#include <linux/export.h> #include <linux/delay.h> #include <linux/sched.h> #include <linux/init.h> diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 5a64d057be57..dd5fbf4101fc 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -13,6 +13,7 @@ #include <linux/interrupt.h> #include <linux/i8253.h> #include <linux/time.h> +#include <linux/export.h> #include <linux/mca.h> #include <asm/vsyscall.h> diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c index 8927486a4649..76ee97709a00 100644 --- a/arch/x86/kernel/topology.c +++ b/arch/x86/kernel/topology.c @@ -26,6 +26,7 @@ * Send feedback to <colpatch@us.ibm.com> */ #include <linux/nodemask.h> +#include <linux/export.h> #include <linux/mmzone.h> #include <linux/init.h> #include <linux/smp.h> diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index b56c65de384d..e4d4a22e8b94 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -25,6 +25,7 @@ #include <linux/seqlock.h> #include <linux/jiffies.h> #include <linux/sysctl.h> +#include <linux/topology.h> #include <linux/clocksource.h> #include <linux/getcpu.h> #include <linux/cpu.h> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 8b4cc5f067de..f1e3be18a08f 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -29,6 +29,39 @@ #include "tss.h" /* + * Operand types + */ +#define OpNone 0ull +#define OpImplicit 1ull /* No generic decode */ +#define OpReg 2ull /* Register */ +#define OpMem 3ull /* Memory */ +#define OpAcc 4ull /* Accumulator: AL/AX/EAX/RAX */ +#define OpDI 5ull /* ES:DI/EDI/RDI */ +#define OpMem64 6ull /* Memory, 64-bit */ +#define OpImmUByte 7ull /* Zero-extended 8-bit immediate */ +#define OpDX 8ull /* DX register */ +#define OpCL 9ull /* CL register (for shifts) */ +#define OpImmByte 10ull /* 8-bit sign extended immediate */ +#define OpOne 11ull /* Implied 1 */ +#define OpImm 12ull /* Sign extended immediate */ +#define OpMem16 13ull /* Memory operand (16-bit). */ +#define OpMem32 14ull /* Memory operand (32-bit). */ +#define OpImmU 15ull /* Immediate operand, zero extended */ +#define OpSI 16ull /* SI/ESI/RSI */ +#define OpImmFAddr 17ull /* Immediate far address */ +#define OpMemFAddr 18ull /* Far address in memory */ +#define OpImmU16 19ull /* Immediate operand, 16 bits, zero extended */ +#define OpES 20ull /* ES */ +#define OpCS 21ull /* CS */ +#define OpSS 22ull /* SS */ +#define OpDS 23ull /* DS */ +#define OpFS 24ull /* FS */ +#define OpGS 25ull /* GS */ + +#define OpBits 5 /* Width of operand field */ +#define OpMask ((1ull << OpBits) - 1) + +/* * Opcode effective-address decode tables. * Note that we only emulate instructions that have at least one memory * operand (excluding implicit stack references). We assume that stack @@ -40,37 +73,35 @@ /* Operand sizes: 8-bit operands or specified/overridden size. */ #define ByteOp (1<<0) /* 8-bit operands. */ /* Destination operand type. */ -#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ -#define DstReg (2<<1) /* Register operand. */ -#define DstMem (3<<1) /* Memory operand. */ -#define DstAcc (4<<1) /* Destination Accumulator */ -#define DstDI (5<<1) /* Destination is in ES:(E)DI */ -#define DstMem64 (6<<1) /* 64bit memory operand */ -#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */ -#define DstDX (8<<1) /* Destination is in DX register */ -#define DstMask (0xf<<1) +#define DstShift 1 +#define ImplicitOps (OpImplicit << DstShift) +#define DstReg (OpReg << DstShift) +#define DstMem (OpMem << DstShift) +#define DstAcc (OpAcc << DstShift) +#define DstDI (OpDI << DstShift) +#define DstMem64 (OpMem64 << DstShift) +#define DstImmUByte (OpImmUByte << DstShift) +#define DstDX (OpDX << DstShift) +#define DstMask (OpMask << DstShift) /* Source operand type. */ -#define SrcNone (0<<5) /* No source operand. */ -#define SrcReg (1<<5) /* Register operand. */ -#define SrcMem (2<<5) /* Memory operand. */ -#define SrcMem16 (3<<5) /* Memory operand (16-bit). */ -#define SrcMem32 (4<<5) /* Memory operand (32-bit). */ -#define SrcImm (5<<5) /* Immediate operand. */ -#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */ -#define SrcOne (7<<5) /* Implied '1' */ -#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */ -#define SrcImmU (9<<5) /* Immediate operand, unsigned */ -#define SrcSI (0xa<<5) /* Source is in the DS:RSI */ -#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */ -#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */ -#define SrcAcc (0xd<<5) /* Source Accumulator */ -#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */ -#define SrcDX (0xf<<5) /* Source is in DX register */ -#define SrcMask (0xf<<5) -/* Generic ModRM decode. */ -#define ModRM (1<<9) -/* Destination is only written; never read. */ -#define Mov (1<<10) +#define SrcShift 6 +#define SrcNone (OpNone << SrcShift) +#define SrcReg (OpReg << SrcShift) +#define SrcMem (OpMem << SrcShift) +#define SrcMem16 (OpMem16 << SrcShift) +#define SrcMem32 (OpMem32 << SrcShift) +#define SrcImm (OpImm << SrcShift) +#define SrcImmByte (OpImmByte << SrcShift) +#define SrcOne (OpOne << SrcShift) +#define SrcImmUByte (OpImmUByte << SrcShift) +#define SrcImmU (OpImmU << SrcShift) +#define SrcSI (OpSI << SrcShift) +#define SrcImmFAddr (OpImmFAddr << SrcShift) +#define SrcMemFAddr (OpMemFAddr << SrcShift) +#define SrcAcc (OpAcc << SrcShift) +#define SrcImmU16 (OpImmU16 << SrcShift) +#define SrcDX (OpDX << SrcShift) +#define SrcMask (OpMask << SrcShift) #define BitOp (1<<11) #define MemAbs (1<<12) /* Memory operand is absolute displacement */ #define String (1<<13) /* String instruction (rep capable) */ @@ -81,6 +112,10 @@ #define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */ #define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */ #define Sse (1<<18) /* SSE Vector instruction */ +/* Generic ModRM decode. */ +#define ModRM (1<<19) +/* Destination is only written; never read. */ +#define Mov (1<<20) /* Misc flags */ #define Prot (1<<21) /* instruction generates #UD if not in prot-mode */ #define VendorSpecific (1<<22) /* Vendor specific instruction */ @@ -91,12 +126,19 @@ #define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ #define No64 (1<<28) /* Source 2 operand type */ -#define Src2None (0<<29) -#define Src2CL (1<<29) -#define Src2ImmByte (2<<29) -#define Src2One (3<<29) -#define Src2Imm (4<<29) -#define Src2Mask (7<<29) +#define Src2Shift (29) +#define Src2None (OpNone << Src2Shift) +#define Src2CL (OpCL << Src2Shift) +#define Src2ImmByte (OpImmByte << Src2Shift) +#define Src2One (OpOne << Src2Shift) +#define Src2Imm (OpImm << Src2Shift) +#define Src2ES (OpES << Src2Shift) +#define Src2CS (OpCS << Src2Shift) +#define Src2SS (OpSS << Src2Shift) +#define Src2DS (OpDS << Src2Shift) +#define Src2FS (OpFS << Src2Shift) +#define Src2GS (OpGS << Src2Shift) +#define Src2Mask (OpMask << Src2Shift) #define X2(x...) x, x #define X3(x...) X2(x), x @@ -108,8 +150,8 @@ #define X16(x...) X8(x), X8(x) struct opcode { - u32 flags; - u8 intercept; + u64 flags : 56; + u64 intercept : 8; union { int (*execute)(struct x86_emulate_ctxt *ctxt); struct opcode *group; @@ -205,105 +247,100 @@ struct gprefix { #define ON64(x) #endif -#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \ +#define ____emulate_2op(ctxt, _op, _x, _y, _suffix, _dsttype) \ do { \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "4", "2") \ _op _suffix " %"_x"3,%1; " \ _POST_EFLAGS("0", "4", "2") \ - : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\ + : "=m" ((ctxt)->eflags), \ + "+q" (*(_dsttype*)&(ctxt)->dst.val), \ "=&r" (_tmp) \ - : _y ((_src).val), "i" (EFLAGS_MASK)); \ + : _y ((ctxt)->src.val), "i" (EFLAGS_MASK)); \ } while (0) /* Raw emulation: instruction has two explicit operands. */ -#define __emulate_2op_nobyte(_op,_src,_dst,_eflags,_wx,_wy,_lx,_ly,_qx,_qy) \ +#define __emulate_2op_nobyte(ctxt,_op,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ unsigned long _tmp; \ \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 2: \ - ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\ + ____emulate_2op(ctxt,_op,_wx,_wy,"w",u16); \ break; \ case 4: \ - ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\ + ____emulate_2op(ctxt,_op,_lx,_ly,"l",u32); \ break; \ case 8: \ - ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \ + ON64(____emulate_2op(ctxt,_op,_qx,_qy,"q",u64)); \ break; \ } \ } while (0) -#define __emulate_2op(_op,_src,_dst,_eflags,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ +#define __emulate_2op(ctxt,_op,_bx,_by,_wx,_wy,_lx,_ly,_qx,_qy) \ do { \ unsigned long _tmp; \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 1: \ - ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \ + ____emulate_2op(ctxt,_op,_bx,_by,"b",u8); \ break; \ default: \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ + __emulate_2op_nobyte(ctxt, _op, \ _wx, _wy, _lx, _ly, _qx, _qy); \ break; \ } \ } while (0) /* Source operand is byte-sized and may be restricted to just %cl. */ -#define emulate_2op_SrcB(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "c", "b", "c", "b", "c", "b", "c") +#define emulate_2op_SrcB(ctxt, _op) \ + __emulate_2op(ctxt, _op, "b", "c", "b", "c", "b", "c", "b", "c") /* Source operand is byte, word, long or quad sized. */ -#define emulate_2op_SrcV(_op, _src, _dst, _eflags) \ - __emulate_2op(_op, _src, _dst, _eflags, \ - "b", "q", "w", "r", _LO32, "r", "", "r") +#define emulate_2op_SrcV(ctxt, _op) \ + __emulate_2op(ctxt, _op, "b", "q", "w", "r", _LO32, "r", "", "r") /* Source operand is word, long or quad sized. */ -#define emulate_2op_SrcV_nobyte(_op, _src, _dst, _eflags) \ - __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ - "w", "r", _LO32, "r", "", "r") +#define emulate_2op_SrcV_nobyte(ctxt, _op) \ + __emulate_2op_nobyte(ctxt, _op, "w", "r", _LO32, "r", "", "r") /* Instruction has three operands and one operand is stored in ECX register */ -#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ +#define __emulate_2op_cl(ctxt, _op, _suffix, _type) \ do { \ unsigned long _tmp; \ - _type _clv = (_cl).val; \ - _type _srcv = (_src).val; \ - _type _dstv = (_dst).val; \ + _type _clv = (ctxt)->src2.val; \ + _type _srcv = (ctxt)->src.val; \ + _type _dstv = (ctxt)->dst.val; \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "2") \ _op _suffix " %4,%1 \n" \ _POST_EFLAGS("0", "5", "2") \ - : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ + : "=m" ((ctxt)->eflags), "+r" (_dstv), "=&r" (_tmp) \ : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ ); \ \ - (_cl).val = (unsigned long) _clv; \ - (_src).val = (unsigned long) _srcv; \ - (_dst).val = (unsigned long) _dstv; \ + (ctxt)->src2.val = (unsigned long) _clv; \ + (ctxt)->src2.val = (unsigned long) _srcv; \ + (ctxt)->dst.val = (unsigned long) _dstv; \ } while (0) -#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ +#define emulate_2op_cl(ctxt, _op) \ do { \ - switch ((_dst).bytes) { \ + switch ((ctxt)->dst.bytes) { \ case 2: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "w", unsigned short); \ + __emulate_2op_cl(ctxt, _op, "w", u16); \ break; \ case 4: \ - __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "l", unsigned int); \ + __emulate_2op_cl(ctxt, _op, "l", u32); \ break; \ case 8: \ - ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ - "q", unsigned long)); \ + ON64(__emulate_2op_cl(ctxt, _op, "q", ulong)); \ break; \ } \ } while (0) -#define __emulate_1op(_op, _dst, _eflags, _suffix) \ +#define __emulate_1op(ctxt, _op, _suffix) \ do { \ unsigned long _tmp; \ \ @@ -311,39 +348,27 @@ struct gprefix { _PRE_EFLAGS("0", "3", "2") \ _op _suffix " %1; " \ _POST_EFLAGS("0", "3", "2") \ - : "=m" (_eflags), "+m" ((_dst).val), \ + : "=m" ((ctxt)->eflags), "+m" ((ctxt)->dst.val), \ "=&r" (_tmp) \ : "i" (EFLAGS_MASK)); \ } while (0) /* Instruction has only one explicit operand (no source operand). */ -#define emulate_1op(_op, _dst, _eflags) \ +#define emulate_1op(ctxt, _op) \ do { \ - switch ((_dst).bytes) { \ - case 1: __emulate_1op(_op, _dst, _eflags, "b"); break; \ - case 2: __emulate_1op(_op, _dst, _eflags, "w"); break; \ - case 4: __emulate_1op(_op, _dst, _eflags, "l"); break; \ - case 8: ON64(__emulate_1op(_op, _dst, _eflags, "q")); break; \ + switch ((ctxt)->dst.bytes) { \ + case 1: __emulate_1op(ctxt, _op, "b"); break; \ + case 2: __emulate_1op(ctxt, _op, "w"); break; \ + case 4: __emulate_1op(ctxt, _op, "l"); break; \ + case 8: ON64(__emulate_1op(ctxt, _op, "q")); break; \ } \ } while (0) -#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \ - do { \ - unsigned long _tmp; \ - \ - __asm__ __volatile__ ( \ - _PRE_EFLAGS("0", "4", "1") \ - _op _suffix " %5; " \ - _POST_EFLAGS("0", "4", "1") \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ - } while (0) - -#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \ +#define __emulate_1op_rax_rdx(ctxt, _op, _suffix, _ex) \ do { \ unsigned long _tmp; \ + ulong *rax = &(ctxt)->regs[VCPU_REGS_RAX]; \ + ulong *rdx = &(ctxt)->regs[VCPU_REGS_RDX]; \ \ __asm__ __volatile__ ( \ _PRE_EFLAGS("0", "5", "1") \ @@ -356,53 +381,27 @@ struct gprefix { "jmp 2b \n\t" \ ".popsection \n\t" \ _ASM_EXTABLE(1b, 3b) \ - : "=m" (_eflags), "=&r" (_tmp), \ - "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \ - : "i" (EFLAGS_MASK), "m" ((_src).val), \ - "a" (_rax), "d" (_rdx)); \ + : "=m" ((ctxt)->eflags), "=&r" (_tmp), \ + "+a" (*rax), "+d" (*rdx), "+qm"(_ex) \ + : "i" (EFLAGS_MASK), "m" ((ctxt)->src.val), \ + "a" (*rax), "d" (*rdx)); \ } while (0) /* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ -#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ +#define emulate_1op_rax_rdx(ctxt, _op, _ex) \ do { \ - switch((_src).bytes) { \ + switch((ctxt)->src.bytes) { \ case 1: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "b"); \ + __emulate_1op_rax_rdx(ctxt, _op, "b", _ex); \ break; \ case 2: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "w"); \ + __emulate_1op_rax_rdx(ctxt, _op, "w", _ex); \ break; \ case 4: \ - __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "l"); \ - break; \ - case 8: \ - ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \ - _eflags, "q")); \ - break; \ - } \ - } while (0) - -#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \ - do { \ - switch((_src).bytes) { \ - case 1: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "b", _ex); \ - break; \ - case 2: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "w", _ex); \ - break; \ - case 4: \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "l", _ex); \ + __emulate_1op_rax_rdx(ctxt, _op, "l", _ex); \ break; \ case 8: ON64( \ - __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \ - _eflags, "q", _ex)); \ + __emulate_1op_rax_rdx(ctxt, _op, "q", _ex)); \ break; \ } \ } while (0) @@ -651,41 +650,50 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt, return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception); } -static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, - unsigned long eip, u8 *dest) +/* + * Fetch the next byte of the instruction being emulated which is pointed to + * by ctxt->_eip, then increment ctxt->_eip. + * + * Also prefetch the remaining bytes of the instruction without crossing page + * boundary if they are not in fetch_cache yet. + */ +static int do_insn_fetch_byte(struct x86_emulate_ctxt *ctxt, u8 *dest) { struct fetch_cache *fc = &ctxt->fetch; int rc; int size, cur_size; - if (eip == fc->end) { + if (ctxt->_eip == fc->end) { unsigned long linear; - struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip}; + struct segmented_address addr = { .seg = VCPU_SREG_CS, + .ea = ctxt->_eip }; cur_size = fc->end - fc->start; - size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); + size = min(15UL - cur_size, + PAGE_SIZE - offset_in_page(ctxt->_eip)); rc = __linearize(ctxt, addr, size, false, true, &linear); - if (rc != X86EMUL_CONTINUE) + if (unlikely(rc != X86EMUL_CONTINUE)) return rc; rc = ctxt->ops->fetch(ctxt, linear, fc->data + cur_size, size, &ctxt->exception); - if (rc != X86EMUL_CONTINUE) + if (unlikely(rc != X86EMUL_CONTINUE)) return rc; fc->end += size; } - *dest = fc->data[eip - fc->start]; + *dest = fc->data[ctxt->_eip - fc->start]; + ctxt->_eip++; return X86EMUL_CONTINUE; } static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, - unsigned long eip, void *dest, unsigned size) + void *dest, unsigned size) { int rc; /* x86 instructions are limited to 15 bytes. */ - if (eip + size - ctxt->eip > 15) + if (unlikely(ctxt->_eip + size - ctxt->eip > 15)) return X86EMUL_UNHANDLEABLE; while (size--) { - rc = do_insn_fetch_byte(ctxt, eip++, dest++); + rc = do_insn_fetch_byte(ctxt, dest++); if (rc != X86EMUL_CONTINUE) return rc; } @@ -693,20 +701,18 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt, } /* Fetch next part of the instruction being emulated. */ -#define insn_fetch(_type, _size, _eip) \ +#define insn_fetch(_type, _ctxt) \ ({ unsigned long _x; \ - rc = do_insn_fetch(ctxt, (_eip), &_x, (_size)); \ + rc = do_insn_fetch(_ctxt, &_x, sizeof(_type)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_eip) += (_size); \ (_type)_x; \ }) -#define insn_fetch_arr(_arr, _size, _eip) \ -({ rc = do_insn_fetch(ctxt, (_eip), _arr, (_size)); \ +#define insn_fetch_arr(_arr, _size, _ctxt) \ +({ rc = do_insn_fetch(_ctxt, _arr, (_size)); \ if (rc != X86EMUL_CONTINUE) \ goto done; \ - (_eip) += (_size); \ }) /* @@ -894,7 +900,7 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, ctxt->modrm_rm = base_reg = (ctxt->rex_prefix & 1) << 3; /* REG.B */ } - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, ctxt); ctxt->modrm_mod |= (ctxt->modrm & 0xc0) >> 6; ctxt->modrm_reg |= (ctxt->modrm & 0x38) >> 3; ctxt->modrm_rm |= (ctxt->modrm & 0x07); @@ -928,13 +934,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 6) - modrm_ea += insn_fetch(u16, 2, ctxt->_eip); + modrm_ea += insn_fetch(u16, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt->_eip); + modrm_ea += insn_fetch(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(u16, 2, ctxt->_eip); + modrm_ea += insn_fetch(u16, ctxt); break; } switch (ctxt->modrm_rm) { @@ -971,13 +977,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, } else { /* 32/64-bit ModR/M decode. */ if ((ctxt->modrm_rm & 7) == 4) { - sib = insn_fetch(u8, 1, ctxt->_eip); + sib = insn_fetch(u8, ctxt); index_reg |= (sib >> 3) & 7; base_reg |= sib & 7; scale = sib >> 6; if ((base_reg & 7) == 5 && ctxt->modrm_mod == 0) - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, ctxt); else modrm_ea += ctxt->regs[base_reg]; if (index_reg != 4) @@ -990,13 +996,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt, switch (ctxt->modrm_mod) { case 0: if (ctxt->modrm_rm == 5) - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, ctxt); break; case 1: - modrm_ea += insn_fetch(s8, 1, ctxt->_eip); + modrm_ea += insn_fetch(s8, ctxt); break; case 2: - modrm_ea += insn_fetch(s32, 4, ctxt->_eip); + modrm_ea += insn_fetch(s32, ctxt); break; } } @@ -1013,13 +1019,13 @@ static int decode_abs(struct x86_emulate_ctxt *ctxt, op->type = OP_MEM; switch (ctxt->ad_bytes) { case 2: - op->addr.mem.ea = insn_fetch(u16, 2, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u16, ctxt); break; case 4: - op->addr.mem.ea = insn_fetch(u32, 4, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u32, ctxt); break; case 8: - op->addr.mem.ea = insn_fetch(u64, 8, ctxt->_eip); + op->addr.mem.ea = insn_fetch(u64, ctxt); break; } done: @@ -1452,15 +1458,18 @@ static int em_popf(struct x86_emulate_ctxt *ctxt) return emulate_popf(ctxt, &ctxt->dst.val, ctxt->op_bytes); } -static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static int em_push_sreg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; + ctxt->src.val = get_segment_selector(ctxt, seg); return em_push(ctxt); } -static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, int seg) +static int em_pop_sreg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; unsigned long selector; int rc; @@ -1674,64 +1683,74 @@ static int em_grp2(struct x86_emulate_ctxt *ctxt) { switch (ctxt->modrm_reg) { case 0: /* rol */ - emulate_2op_SrcB("rol", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rol"); break; case 1: /* ror */ - emulate_2op_SrcB("ror", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "ror"); break; case 2: /* rcl */ - emulate_2op_SrcB("rcl", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rcl"); break; case 3: /* rcr */ - emulate_2op_SrcB("rcr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "rcr"); break; case 4: /* sal/shl */ case 6: /* sal/shl */ - emulate_2op_SrcB("sal", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "sal"); break; case 5: /* shr */ - emulate_2op_SrcB("shr", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "shr"); break; case 7: /* sar */ - emulate_2op_SrcB("sar", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcB(ctxt, "sar"); break; } return X86EMUL_CONTINUE; } -static int em_grp3(struct x86_emulate_ctxt *ctxt) +static int em_not(struct x86_emulate_ctxt *ctxt) +{ + ctxt->dst.val = ~ctxt->dst.val; + return X86EMUL_CONTINUE; +} + +static int em_neg(struct x86_emulate_ctxt *ctxt) +{ + emulate_1op(ctxt, "neg"); + return X86EMUL_CONTINUE; +} + +static int em_mul_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 ex = 0; + + emulate_1op_rax_rdx(ctxt, "mul", ex); + return X86EMUL_CONTINUE; +} + +static int em_imul_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 ex = 0; + + emulate_1op_rax_rdx(ctxt, "imul", ex); + return X86EMUL_CONTINUE; +} + +static int em_div_ex(struct x86_emulate_ctxt *ctxt) { - unsigned long *rax = &ctxt->regs[VCPU_REGS_RAX]; - unsigned long *rdx = &ctxt->regs[VCPU_REGS_RDX]; u8 de = 0; - switch (ctxt->modrm_reg) { - case 0 ... 1: /* test */ - emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); - break; - case 2: /* not */ - ctxt->dst.val = ~ctxt->dst.val; - break; - case 3: /* neg */ - emulate_1op("neg", ctxt->dst, ctxt->eflags); - break; - case 4: /* mul */ - emulate_1op_rax_rdx("mul", ctxt->src, *rax, *rdx, ctxt->eflags); - break; - case 5: /* imul */ - emulate_1op_rax_rdx("imul", ctxt->src, *rax, *rdx, ctxt->eflags); - break; - case 6: /* div */ - emulate_1op_rax_rdx_ex("div", ctxt->src, *rax, *rdx, - ctxt->eflags, de); - break; - case 7: /* idiv */ - emulate_1op_rax_rdx_ex("idiv", ctxt->src, *rax, *rdx, - ctxt->eflags, de); - break; - default: - return X86EMUL_UNHANDLEABLE; - } + emulate_1op_rax_rdx(ctxt, "div", de); + if (de) + return emulate_de(ctxt); + return X86EMUL_CONTINUE; +} + +static int em_idiv_ex(struct x86_emulate_ctxt *ctxt) +{ + u8 de = 0; + + emulate_1op_rax_rdx(ctxt, "idiv", de); if (de) return emulate_de(ctxt); return X86EMUL_CONTINUE; @@ -1743,10 +1762,10 @@ static int em_grp45(struct x86_emulate_ctxt *ctxt) switch (ctxt->modrm_reg) { case 0: /* inc */ - emulate_1op("inc", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "inc"); break; case 1: /* dec */ - emulate_1op("dec", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "dec"); break; case 2: /* call near abs */ { long int old_eip; @@ -1812,8 +1831,9 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt) return rc; } -static int emulate_load_segment(struct x86_emulate_ctxt *ctxt, int seg) +static int em_lseg(struct x86_emulate_ctxt *ctxt) { + int seg = ctxt->src2.val; unsigned short sel; int rc; @@ -2452,7 +2472,7 @@ static int em_das(struct x86_emulate_ctxt *ctxt) ctxt->src.type = OP_IMM; ctxt->src.val = 0; ctxt->src.bytes = 1; - emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "or"); ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF); if (cf) ctxt->eflags |= X86_EFLAGS_CF; @@ -2502,49 +2522,49 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) static int em_add(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "add"); return X86EMUL_CONTINUE; } static int em_or(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("or", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "or"); return X86EMUL_CONTINUE; } static int em_adc(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("adc", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "adc"); return X86EMUL_CONTINUE; } static int em_sbb(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("sbb", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "sbb"); return X86EMUL_CONTINUE; } static int em_and(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("and", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "and"); return X86EMUL_CONTINUE; } static int em_sub(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("sub", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "sub"); return X86EMUL_CONTINUE; } static int em_xor(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("xor", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "xor"); return X86EMUL_CONTINUE; } static int em_cmp(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "cmp"); /* Disable writeback. */ ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; @@ -2552,7 +2572,9 @@ static int em_cmp(struct x86_emulate_ctxt *ctxt) static int em_test(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV("test", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "test"); + /* Disable writeback. */ + ctxt->dst.type = OP_NONE; return X86EMUL_CONTINUE; } @@ -2570,7 +2592,7 @@ static int em_xchg(struct x86_emulate_ctxt *ctxt) static int em_imul(struct x86_emulate_ctxt *ctxt) { - emulate_2op_SrcV_nobyte("imul", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "imul"); return X86EMUL_CONTINUE; } @@ -3025,9 +3047,14 @@ static struct opcode group1A[] = { }; static struct opcode group3[] = { - D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM), - D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock), - X4(D(SrcMem | ModRM)), + I(DstMem | SrcImm | ModRM, em_test), + I(DstMem | SrcImm | ModRM, em_test), + I(DstMem | SrcNone | ModRM | Lock, em_not), + I(DstMem | SrcNone | ModRM | Lock, em_neg), + I(SrcMem | ModRM, em_mul_ex), + I(SrcMem | ModRM, em_imul_ex), + I(SrcMem | ModRM, em_div_ex), + I(SrcMem | ModRM, em_idiv_ex), }; static struct opcode group4[] = { @@ -3090,16 +3117,20 @@ static struct gprefix pfx_0f_6f_0f_7f = { static struct opcode opcode_table[256] = { /* 0x00 - 0x07 */ I6ALU(Lock, em_add), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + I(ImplicitOps | Stack | No64 | Src2ES, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2ES, em_pop_sreg), /* 0x08 - 0x0F */ I6ALU(Lock, em_or), - D(ImplicitOps | Stack | No64), N, + I(ImplicitOps | Stack | No64 | Src2CS, em_push_sreg), + N, /* 0x10 - 0x17 */ I6ALU(Lock, em_adc), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + I(ImplicitOps | Stack | No64 | Src2SS, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2SS, em_pop_sreg), /* 0x18 - 0x1F */ I6ALU(Lock, em_sbb), - D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), + I(ImplicitOps | Stack | No64 | Src2DS, em_push_sreg), + I(ImplicitOps | Stack | No64 | Src2DS, em_pop_sreg), /* 0x20 - 0x27 */ I6ALU(Lock, em_and), N, N, /* 0x28 - 0x2F */ @@ -3167,7 +3198,8 @@ static struct opcode opcode_table[256] = { D2bv(DstMem | SrcImmByte | ModRM), I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm), I(ImplicitOps | Stack, em_ret), - D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64), + I(DstReg | SrcMemFAddr | ModRM | No64 | Src2ES, em_lseg), + I(DstReg | SrcMemFAddr | ModRM | No64 | Src2DS, em_lseg), G(ByteOp, group11), G(0, group11), /* 0xC8 - 0xCF */ N, N, N, I(ImplicitOps | Stack, em_ret_far), @@ -3242,20 +3274,22 @@ static struct opcode twobyte_table[256] = { /* 0x90 - 0x9F */ X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), /* 0xA0 - 0xA7 */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), + I(Stack | Src2FS, em_push_sreg), I(Stack | Src2FS, em_pop_sreg), DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), N, N, /* 0xA8 - 0xAF */ - D(ImplicitOps | Stack), D(ImplicitOps | Stack), + I(Stack | Src2GS, em_push_sreg), I(Stack | Src2GS, em_pop_sreg), DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock), D(DstMem | SrcReg | Src2ImmByte | ModRM), D(DstMem | SrcReg | Src2CL | ModRM), D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), /* 0xB0 - 0xB7 */ D2bv(DstMem | SrcReg | ModRM | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock), - D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM), + I(DstReg | SrcMemFAddr | ModRM | Src2SS, em_lseg), + D(DstMem | SrcReg | ModRM | BitOp | Lock), + I(DstReg | SrcMemFAddr | ModRM | Src2FS, em_lseg), + I(DstReg | SrcMemFAddr | ModRM | Src2GS, em_lseg), D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov), /* 0xB8 - 0xBF */ N, N, @@ -3309,13 +3343,13 @@ static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op, /* NB. Immediates are sign-extended as necessary. */ switch (op->bytes) { case 1: - op->val = insn_fetch(s8, 1, ctxt->_eip); + op->val = insn_fetch(s8, ctxt); break; case 2: - op->val = insn_fetch(s16, 2, ctxt->_eip); + op->val = insn_fetch(s16, ctxt); break; case 4: - op->val = insn_fetch(s32, 4, ctxt->_eip); + op->val = insn_fetch(s32, ctxt); break; } if (!sign_extension) { @@ -3335,6 +3369,125 @@ done: return rc; } +static int decode_operand(struct x86_emulate_ctxt *ctxt, struct operand *op, + unsigned d) +{ + int rc = X86EMUL_CONTINUE; + + switch (d) { + case OpReg: + decode_register_operand(ctxt, op, + op == &ctxt->dst && + ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); + break; + case OpImmUByte: + rc = decode_imm(ctxt, op, 1, false); + break; + case OpMem: + ctxt->memop.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + mem_common: + *op = ctxt->memop; + ctxt->memopp = op; + if ((ctxt->d & BitOp) && op == &ctxt->dst) + fetch_bit_operand(ctxt); + op->orig_val = op->val; + break; + case OpMem64: + ctxt->memop.bytes = 8; + goto mem_common; + case OpAcc: + op->type = OP_REG; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.reg = &ctxt->regs[VCPU_REGS_RAX]; + fetch_register_operand(op); + op->orig_val = op->val; + break; + case OpDI: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); + op->addr.mem.seg = VCPU_SREG_ES; + op->val = 0; + break; + case OpDX: + op->type = OP_REG; + op->bytes = 2; + op->addr.reg = &ctxt->regs[VCPU_REGS_RDX]; + fetch_register_operand(op); + break; + case OpCL: + op->bytes = 1; + op->val = ctxt->regs[VCPU_REGS_RCX] & 0xff; + break; + case OpImmByte: + rc = decode_imm(ctxt, op, 1, true); + break; + case OpOne: + op->bytes = 1; + op->val = 1; + break; + case OpImm: + rc = decode_imm(ctxt, op, imm_size(ctxt), true); + break; + case OpMem16: + ctxt->memop.bytes = 2; + goto mem_common; + case OpMem32: + ctxt->memop.bytes = 4; + goto mem_common; + case OpImmU16: + rc = decode_imm(ctxt, op, 2, false); + break; + case OpImmU: + rc = decode_imm(ctxt, op, imm_size(ctxt), false); + break; + case OpSI: + op->type = OP_MEM; + op->bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; + op->addr.mem.ea = + register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); + op->addr.mem.seg = seg_override(ctxt); + op->val = 0; + break; + case OpImmFAddr: + op->type = OP_IMM; + op->addr.mem.ea = ctxt->_eip; + op->bytes = ctxt->op_bytes + 2; + insn_fetch_arr(op->valptr, op->bytes, ctxt); + break; + case OpMemFAddr: + ctxt->memop.bytes = ctxt->op_bytes + 2; + goto mem_common; + case OpES: + op->val = VCPU_SREG_ES; + break; + case OpCS: + op->val = VCPU_SREG_CS; + break; + case OpSS: + op->val = VCPU_SREG_SS; + break; + case OpDS: + op->val = VCPU_SREG_DS; + break; + case OpFS: + op->val = VCPU_SREG_FS; + break; + case OpGS: + op->val = VCPU_SREG_GS; + break; + case OpImplicit: + /* Special instructions do their own operand decoding. */ + default: + op->type = OP_NONE; /* Disable writeback. */ + break; + } + +done: + return rc; +} + int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) { int rc = X86EMUL_CONTINUE; @@ -3342,8 +3495,9 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) int def_op_bytes, def_ad_bytes, goffset, simd_prefix; bool op_prefix = false; struct opcode opcode; - struct operand memop = { .type = OP_NONE }, *memopp = NULL; + ctxt->memop.type = OP_NONE; + ctxt->memopp = NULL; ctxt->_eip = ctxt->eip; ctxt->fetch.start = ctxt->_eip; ctxt->fetch.end = ctxt->fetch.start + insn_len; @@ -3366,7 +3520,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) break; #endif default: - return -1; + return EMULATION_FAILED; } ctxt->op_bytes = def_op_bytes; @@ -3374,7 +3528,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len) /* Legacy prefixes. */ for (;;) { - switch (ctxt->b = insn_fetch(u8, 1, ctxt->_eip)) { + switch (ctxt->b = insn_fetch(u8, ctxt)) { case 0x66: /* operand-size override */ op_prefix = true; /* switch between 2/4 bytes */ @@ -3430,7 +3584,7 @@ done_prefixes: /* Two-byte opcode? */ if (ctxt->b == 0x0f) { ctxt->twobyte = 1; - ctxt->b = insn_fetch(u8, 1, ctxt->_eip); + ctxt->b = insn_fetch(u8, ctxt); opcode = twobyte_table[ctxt->b]; } ctxt->d = opcode.flags; @@ -3438,13 +3592,13 @@ done_prefixes: while (ctxt->d & GroupMask) { switch (ctxt->d & GroupMask) { case Group: - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; opcode = opcode.u.group[goffset]; break; case GroupDual: - ctxt->modrm = insn_fetch(u8, 1, ctxt->_eip); + ctxt->modrm = insn_fetch(u8, ctxt); --ctxt->_eip; goffset = (ctxt->modrm >> 3) & 7; if ((ctxt->modrm >> 6) == 3) @@ -3458,7 +3612,7 @@ done_prefixes: break; case Prefix: if (ctxt->rep_prefix && op_prefix) - return X86EMUL_UNHANDLEABLE; + return EMULATION_FAILED; simd_prefix = op_prefix ? 0x66 : ctxt->rep_prefix; switch (simd_prefix) { case 0x00: opcode = opcode.u.gprefix->pfx_no; break; @@ -3468,10 +3622,10 @@ done_prefixes: } break; default: - return X86EMUL_UNHANDLEABLE; + return EMULATION_FAILED; } - ctxt->d &= ~GroupMask; + ctxt->d &= ~(u64)GroupMask; ctxt->d |= opcode.flags; } @@ -3481,10 +3635,10 @@ done_prefixes: /* Unrecognised? */ if (ctxt->d == 0 || (ctxt->d & Undefined)) - return -1; + return EMULATION_FAILED; if (!(ctxt->d & VendorSpecific) && ctxt->only_vendor_specific_insn) - return -1; + return EMULATION_FAILED; if (mode == X86EMUL_MODE_PROT64 && (ctxt->d & Stack)) ctxt->op_bytes = 8; @@ -3501,96 +3655,27 @@ done_prefixes: /* ModRM and SIB bytes. */ if (ctxt->d & ModRM) { - rc = decode_modrm(ctxt, &memop); + rc = decode_modrm(ctxt, &ctxt->memop); if (!ctxt->has_seg_override) set_seg_override(ctxt, ctxt->modrm_seg); } else if (ctxt->d & MemAbs) - rc = decode_abs(ctxt, &memop); + rc = decode_abs(ctxt, &ctxt->memop); if (rc != X86EMUL_CONTINUE) goto done; if (!ctxt->has_seg_override) set_seg_override(ctxt, VCPU_SREG_DS); - memop.addr.mem.seg = seg_override(ctxt); + ctxt->memop.addr.mem.seg = seg_override(ctxt); - if (memop.type == OP_MEM && ctxt->ad_bytes != 8) - memop.addr.mem.ea = (u32)memop.addr.mem.ea; + if (ctxt->memop.type == OP_MEM && ctxt->ad_bytes != 8) + ctxt->memop.addr.mem.ea = (u32)ctxt->memop.addr.mem.ea; /* * Decode and fetch the source operand: register, memory * or immediate. */ - switch (ctxt->d & SrcMask) { - case SrcNone: - break; - case SrcReg: - decode_register_operand(ctxt, &ctxt->src, 0); - break; - case SrcMem16: - memop.bytes = 2; - goto srcmem_common; - case SrcMem32: - memop.bytes = 4; - goto srcmem_common; - case SrcMem: - memop.bytes = (ctxt->d & ByteOp) ? 1 : - ctxt->op_bytes; - srcmem_common: - ctxt->src = memop; - memopp = &ctxt->src; - break; - case SrcImmU16: - rc = decode_imm(ctxt, &ctxt->src, 2, false); - break; - case SrcImm: - rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), true); - break; - case SrcImmU: - rc = decode_imm(ctxt, &ctxt->src, imm_size(ctxt), false); - break; - case SrcImmByte: - rc = decode_imm(ctxt, &ctxt->src, 1, true); - break; - case SrcImmUByte: - rc = decode_imm(ctxt, &ctxt->src, 1, false); - break; - case SrcAcc: - ctxt->src.type = OP_REG; - ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; - fetch_register_operand(&ctxt->src); - break; - case SrcOne: - ctxt->src.bytes = 1; - ctxt->src.val = 1; - break; - case SrcSI: - ctxt->src.type = OP_MEM; - ctxt->src.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->src.addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RSI]); - ctxt->src.addr.mem.seg = seg_override(ctxt); - ctxt->src.val = 0; - break; - case SrcImmFAddr: - ctxt->src.type = OP_IMM; - ctxt->src.addr.mem.ea = ctxt->_eip; - ctxt->src.bytes = ctxt->op_bytes + 2; - insn_fetch_arr(ctxt->src.valptr, ctxt->src.bytes, ctxt->_eip); - break; - case SrcMemFAddr: - memop.bytes = ctxt->op_bytes + 2; - goto srcmem_common; - break; - case SrcDX: - ctxt->src.type = OP_REG; - ctxt->src.bytes = 2; - ctxt->src.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - fetch_register_operand(&ctxt->src); - break; - } - + rc = decode_operand(ctxt, &ctxt->src, (ctxt->d >> SrcShift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; @@ -3598,85 +3683,18 @@ done_prefixes: * Decode and fetch the second source operand: register, memory * or immediate. */ - switch (ctxt->d & Src2Mask) { - case Src2None: - break; - case Src2CL: - ctxt->src2.bytes = 1; - ctxt->src2.val = ctxt->regs[VCPU_REGS_RCX] & 0xff; - break; - case Src2ImmByte: - rc = decode_imm(ctxt, &ctxt->src2, 1, true); - break; - case Src2One: - ctxt->src2.bytes = 1; - ctxt->src2.val = 1; - break; - case Src2Imm: - rc = decode_imm(ctxt, &ctxt->src2, imm_size(ctxt), true); - break; - } - + rc = decode_operand(ctxt, &ctxt->src2, (ctxt->d >> Src2Shift) & OpMask); if (rc != X86EMUL_CONTINUE) goto done; /* Decode and fetch the destination operand: register or memory. */ - switch (ctxt->d & DstMask) { - case DstReg: - decode_register_operand(ctxt, &ctxt->dst, - ctxt->twobyte && (ctxt->b == 0xb6 || ctxt->b == 0xb7)); - break; - case DstImmUByte: - ctxt->dst.type = OP_IMM; - ctxt->dst.addr.mem.ea = ctxt->_eip; - ctxt->dst.bytes = 1; - ctxt->dst.val = insn_fetch(u8, 1, ctxt->_eip); - break; - case DstMem: - case DstMem64: - ctxt->dst = memop; - memopp = &ctxt->dst; - if ((ctxt->d & DstMask) == DstMem64) - ctxt->dst.bytes = 8; - else - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - if (ctxt->d & BitOp) - fetch_bit_operand(ctxt); - ctxt->dst.orig_val = ctxt->dst.val; - break; - case DstAcc: - ctxt->dst.type = OP_REG; - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RAX]; - fetch_register_operand(&ctxt->dst); - ctxt->dst.orig_val = ctxt->dst.val; - break; - case DstDI: - ctxt->dst.type = OP_MEM; - ctxt->dst.bytes = (ctxt->d & ByteOp) ? 1 : ctxt->op_bytes; - ctxt->dst.addr.mem.ea = - register_address(ctxt, ctxt->regs[VCPU_REGS_RDI]); - ctxt->dst.addr.mem.seg = VCPU_SREG_ES; - ctxt->dst.val = 0; - break; - case DstDX: - ctxt->dst.type = OP_REG; - ctxt->dst.bytes = 2; - ctxt->dst.addr.reg = &ctxt->regs[VCPU_REGS_RDX]; - fetch_register_operand(&ctxt->dst); - break; - case ImplicitOps: - /* Special instructions do their own operand decoding. */ - default: - ctxt->dst.type = OP_NONE; /* Disable writeback. */ - break; - } + rc = decode_operand(ctxt, &ctxt->dst, (ctxt->d >> DstShift) & OpMask); done: - if (memopp && memopp->type == OP_MEM && ctxt->rip_relative) - memopp->addr.mem.ea += ctxt->_eip; + if (ctxt->memopp && ctxt->memopp->type == OP_MEM && ctxt->rip_relative) + ctxt->memopp->addr.mem.ea += ctxt->_eip; - return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; + return (rc != X86EMUL_CONTINUE) ? EMULATION_FAILED : EMULATION_OK; } static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) @@ -3825,32 +3843,11 @@ special_insn: goto twobyte_insn; switch (ctxt->b) { - case 0x06: /* push es */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_ES); - break; - case 0x07: /* pop es */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_ES); - break; - case 0x0e: /* push cs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_CS); - break; - case 0x16: /* push ss */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_SS); - break; - case 0x17: /* pop ss */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_SS); - break; - case 0x1e: /* push ds */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_DS); - break; - case 0x1f: /* pop ds */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_DS); - break; case 0x40 ... 0x47: /* inc r16/r32 */ - emulate_1op("inc", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "inc"); break; case 0x48 ... 0x4f: /* dec r16/r32 */ - emulate_1op("dec", ctxt->dst, ctxt->eflags); + emulate_1op(ctxt, "dec"); break; case 0x63: /* movsxd */ if (ctxt->mode != X86EMUL_MODE_PROT64) @@ -3891,12 +3888,6 @@ special_insn: case 0xc0 ... 0xc1: rc = em_grp2(ctxt); break; - case 0xc4: /* les */ - rc = emulate_load_segment(ctxt, VCPU_SREG_ES); - break; - case 0xc5: /* lds */ - rc = emulate_load_segment(ctxt, VCPU_SREG_DS); - break; case 0xcc: /* int3 */ rc = emulate_int(ctxt, 3); break; @@ -3953,9 +3944,6 @@ special_insn: /* complement carry flag from eflags reg */ ctxt->eflags ^= EFLG_CF; break; - case 0xf6 ... 0xf7: /* Grp3 */ - rc = em_grp3(ctxt); - break; case 0xf8: /* clc */ ctxt->eflags &= ~EFLG_CF; break; @@ -4103,36 +4091,24 @@ twobyte_insn: case 0x90 ... 0x9f: /* setcc r/m8 */ ctxt->dst.val = test_cc(ctxt->b, ctxt->eflags); break; - case 0xa0: /* push fs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_FS); - break; - case 0xa1: /* pop fs */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_FS); - break; case 0xa3: bt: /* bt */ ctxt->dst.type = OP_NONE; /* only subword offset */ ctxt->src.val &= (ctxt->dst.bytes << 3) - 1; - emulate_2op_SrcV_nobyte("bt", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "bt"); break; case 0xa4: /* shld imm8, r, r/m */ case 0xa5: /* shld cl, r, r/m */ - emulate_2op_cl("shld", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); - break; - case 0xa8: /* push gs */ - rc = emulate_push_sreg(ctxt, VCPU_SREG_GS); - break; - case 0xa9: /* pop gs */ - rc = emulate_pop_sreg(ctxt, VCPU_SREG_GS); + emulate_2op_cl(ctxt, "shld"); break; case 0xab: bts: /* bts */ - emulate_2op_SrcV_nobyte("bts", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "bts"); break; case 0xac: /* shrd imm8, r, r/m */ case 0xad: /* shrd cl, r, r/m */ - emulate_2op_cl("shrd", ctxt->src2, ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_cl(ctxt, "shrd"); break; case 0xae: /* clflush */ break; @@ -4143,7 +4119,7 @@ twobyte_insn: */ ctxt->src.orig_val = ctxt->src.val; ctxt->src.val = ctxt->regs[VCPU_REGS_RAX]; - emulate_2op_SrcV("cmp", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "cmp"); if (ctxt->eflags & EFLG_ZF) { /* Success: write back to memory. */ ctxt->dst.val = ctxt->src.orig_val; @@ -4153,18 +4129,9 @@ twobyte_insn: ctxt->dst.addr.reg = (unsigned long *)&ctxt->regs[VCPU_REGS_RAX]; } break; - case 0xb2: /* lss */ - rc = emulate_load_segment(ctxt, VCPU_SREG_SS); - break; case 0xb3: btr: /* btr */ - emulate_2op_SrcV_nobyte("btr", ctxt->src, ctxt->dst, ctxt->eflags); - break; - case 0xb4: /* lfs */ - rc = emulate_load_segment(ctxt, VCPU_SREG_FS); - break; - case 0xb5: /* lgs */ - rc = emulate_load_segment(ctxt, VCPU_SREG_GS); + emulate_2op_SrcV_nobyte(ctxt, "btr"); break; case 0xb6 ... 0xb7: /* movzx */ ctxt->dst.bytes = ctxt->op_bytes; @@ -4185,7 +4152,7 @@ twobyte_insn: break; case 0xbb: btc: /* btc */ - emulate_2op_SrcV_nobyte("btc", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV_nobyte(ctxt, "btc"); break; case 0xbc: { /* bsf */ u8 zf; @@ -4217,7 +4184,7 @@ twobyte_insn: (s16) ctxt->src.val; break; case 0xc0 ... 0xc1: /* xadd */ - emulate_2op_SrcV("add", ctxt->src, ctxt->dst, ctxt->eflags); + emulate_2op_SrcV(ctxt, "add"); /* Write back the register source. */ ctxt->src.val = ctxt->dst.orig_val; write_register_operand(&ctxt->src); diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index efad72385058..76e3f1cd0369 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -713,14 +713,16 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); kvm_iodevice_init(&pit->dev, &pit_dev_ops); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, KVM_PIT_BASE_ADDRESS, + KVM_PIT_MEM_LENGTH, &pit->dev); if (ret < 0) goto fail; if (flags & KVM_PIT_SPEAKER_DUMMY) { kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, - &pit->speaker_dev); + KVM_SPEAKER_BASE_ADDRESS, 4, + &pit->speaker_dev); if (ret < 0) goto fail_unregister; } diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c index 19fe855e7953..cac4746d7ffb 100644 --- a/arch/x86/kvm/i8259.c +++ b/arch/x86/kvm/i8259.c @@ -34,6 +34,9 @@ #include <linux/kvm_host.h> #include "trace.h" +#define pr_pic_unimpl(fmt, ...) \ + pr_err_ratelimited("kvm: pic: " fmt, ## __VA_ARGS__) + static void pic_irq_request(struct kvm *kvm, int level); static void pic_lock(struct kvm_pic *s) @@ -306,10 +309,10 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val) } s->init_state = 1; if (val & 0x02) - printk(KERN_ERR "single mode not supported"); + pr_pic_unimpl("single mode not supported"); if (val & 0x08) - printk(KERN_ERR - "level sensitive irq not supported"); + pr_pic_unimpl( + "level sensitive irq not supported"); } else if (val & 0x08) { if (val & 0x04) s->poll = 1; @@ -459,22 +462,15 @@ static int picdev_in_range(gpa_t addr) } } -static inline struct kvm_pic *to_pic(struct kvm_io_device *dev) -{ - return container_of(dev, struct kvm_pic, dev); -} - -static int picdev_write(struct kvm_io_device *this, +static int picdev_write(struct kvm_pic *s, gpa_t addr, int len, const void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = *(unsigned char *)val; if (!picdev_in_range(addr)) return -EOPNOTSUPP; if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte write\n"); + pr_pic_unimpl("non byte write\n"); return 0; } pic_lock(s); @@ -494,17 +490,15 @@ static int picdev_write(struct kvm_io_device *this, return 0; } -static int picdev_read(struct kvm_io_device *this, +static int picdev_read(struct kvm_pic *s, gpa_t addr, int len, void *val) { - struct kvm_pic *s = to_pic(this); unsigned char data = 0; if (!picdev_in_range(addr)) return -EOPNOTSUPP; if (len != 1) { - if (printk_ratelimit()) - printk(KERN_ERR "PIC: non byte read\n"); + pr_pic_unimpl("non byte read\n"); return 0; } pic_lock(s); @@ -525,6 +519,48 @@ static int picdev_read(struct kvm_io_device *this, return 0; } +static int picdev_master_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_master_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_master), + addr, len, val); +} + +static int picdev_slave_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_slave_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_slave), + addr, len, val); +} + +static int picdev_eclr_write(struct kvm_io_device *dev, + gpa_t addr, int len, const void *val) +{ + return picdev_write(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + +static int picdev_eclr_read(struct kvm_io_device *dev, + gpa_t addr, int len, void *val) +{ + return picdev_read(container_of(dev, struct kvm_pic, dev_eclr), + addr, len, val); +} + /* * callback when PIC0 irq status changed */ @@ -537,9 +573,19 @@ static void pic_irq_request(struct kvm *kvm, int level) s->output = level; } -static const struct kvm_io_device_ops picdev_ops = { - .read = picdev_read, - .write = picdev_write, +static const struct kvm_io_device_ops picdev_master_ops = { + .read = picdev_master_read, + .write = picdev_master_write, +}; + +static const struct kvm_io_device_ops picdev_slave_ops = { + .read = picdev_slave_read, + .write = picdev_slave_write, +}; + +static const struct kvm_io_device_ops picdev_eclr_ops = { + .read = picdev_eclr_read, + .write = picdev_eclr_write, }; struct kvm_pic *kvm_create_pic(struct kvm *kvm) @@ -560,16 +606,39 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm) /* * Initialize PIO device */ - kvm_iodevice_init(&s->dev, &picdev_ops); + kvm_iodevice_init(&s->dev_master, &picdev_master_ops); + kvm_iodevice_init(&s->dev_slave, &picdev_slave_ops); + kvm_iodevice_init(&s->dev_eclr, &picdev_eclr_ops); mutex_lock(&kvm->slots_lock); - ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev); + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x20, 2, + &s->dev_master); + if (ret < 0) + goto fail_unlock; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0xa0, 2, &s->dev_slave); + if (ret < 0) + goto fail_unreg_2; + + ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, 0x4d0, 2, &s->dev_eclr); + if (ret < 0) + goto fail_unreg_1; + mutex_unlock(&kvm->slots_lock); - if (ret < 0) { - kfree(s); - return NULL; - } return s; + +fail_unreg_1: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_slave); + +fail_unreg_2: + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &s->dev_master); + +fail_unlock: + mutex_unlock(&kvm->slots_lock); + + kfree(s); + + return NULL; } void kvm_destroy_pic(struct kvm *kvm) @@ -577,7 +646,9 @@ void kvm_destroy_pic(struct kvm *kvm) struct kvm_pic *vpic = kvm->arch.vpic; if (vpic) { - kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev_eclr); kvm->arch.vpic = NULL; kfree(vpic); } diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h index 53e2d084bffb..2086f2bfba33 100644 --- a/arch/x86/kvm/irq.h +++ b/arch/x86/kvm/irq.h @@ -66,7 +66,9 @@ struct kvm_pic { struct kvm *kvm; struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ int output; /* intr from master PIC */ - struct kvm_io_device dev; + struct kvm_io_device dev_master; + struct kvm_io_device dev_slave; + struct kvm_io_device dev_eclr; void (*ack_notifier)(void *opaque, int irq); unsigned long irq_states[16]; }; diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h index 3377d53fcd36..544076c4f44b 100644 --- a/arch/x86/kvm/kvm_cache_regs.h +++ b/arch/x86/kvm/kvm_cache_regs.h @@ -45,13 +45,6 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index) return vcpu->arch.walk_mmu->pdptrs[index]; } -static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index) -{ - load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu)); - - return mmu->pdptrs[index]; -} - static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) { ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS; diff --git a/arch/x86/kvm/kvm_timer.h b/arch/x86/kvm/kvm_timer.h index 64bc6ea78d90..497dbaa366d4 100644 --- a/arch/x86/kvm/kvm_timer.h +++ b/arch/x86/kvm/kvm_timer.h @@ -2,6 +2,8 @@ struct kvm_timer { struct hrtimer timer; s64 period; /* unit: ns */ + u32 timer_mode_mask; + u64 tscdeadline; atomic_t pending; /* accumulated triggered timers */ bool reinject; struct kvm_timer_ops *t_ops; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 57dcbd4308fa..54abb40199d6 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -68,6 +68,9 @@ #define VEC_POS(v) ((v) & (32 - 1)) #define REG_POS(v) (((v) >> 5) << 4) +static unsigned int min_timer_period_us = 500; +module_param(min_timer_period_us, uint, S_IRUGO | S_IWUSR); + static inline u32 apic_get_reg(struct kvm_lapic *apic, int reg_off) { return *((u32 *) (apic->regs + reg_off)); @@ -135,9 +138,23 @@ static inline int apic_lvt_vector(struct kvm_lapic *apic, int lvt_type) return apic_get_reg(apic, lvt_type) & APIC_VECTOR_MASK; } +static inline int apic_lvtt_oneshot(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_ONESHOT); +} + static inline int apic_lvtt_period(struct kvm_lapic *apic) { - return apic_get_reg(apic, APIC_LVTT) & APIC_LVT_TIMER_PERIODIC; + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == APIC_LVT_TIMER_PERIODIC); +} + +static inline int apic_lvtt_tscdeadline(struct kvm_lapic *apic) +{ + return ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) == + APIC_LVT_TIMER_TSCDEADLINE); } static inline int apic_lvt_nmi_mode(u32 lvt_val) @@ -166,7 +183,7 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic) } static unsigned int apic_lvt_mask[APIC_LVT_NUM] = { - LVT_MASK | APIC_LVT_TIMER_PERIODIC, /* LVTT */ + LVT_MASK , /* part LVTT mask, timer mode mask added at runtime */ LVT_MASK | APIC_MODE_MASK, /* LVTTHMR */ LVT_MASK | APIC_MODE_MASK, /* LVTPC */ LINT_MASK, LINT_MASK, /* LVT0-1 */ @@ -316,8 +333,8 @@ int kvm_apic_match_logical_addr(struct kvm_lapic *apic, u8 mda) result = 1; break; default: - printk(KERN_WARNING "Bad DFR vcpu %d: %08x\n", - apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); + apic_debug("Bad DFR vcpu %d: %08x\n", + apic->vcpu->vcpu_id, apic_get_reg(apic, APIC_DFR)); break; } @@ -354,8 +371,8 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source, result = (target != source); break; default: - printk(KERN_WARNING "Bad dest shorthand value %x\n", - short_hand); + apic_debug("kvm: apic: Bad dest shorthand value %x\n", + short_hand); break; } @@ -401,11 +418,11 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode, break; case APIC_DM_REMRD: - printk(KERN_DEBUG "Ignoring delivery mode 3\n"); + apic_debug("Ignoring delivery mode 3\n"); break; case APIC_DM_SMI: - printk(KERN_DEBUG "Ignoring guest SMI\n"); + apic_debug("Ignoring guest SMI\n"); break; case APIC_DM_NMI: @@ -565,11 +582,13 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset) val = kvm_apic_id(apic) << 24; break; case APIC_ARBPRI: - printk(KERN_WARNING "Access APIC ARBPRI register " - "which is for P6\n"); + apic_debug("Access APIC ARBPRI register which is for P6\n"); break; case APIC_TMCCT: /* Timer CCR */ + if (apic_lvtt_tscdeadline(apic)) + return 0; + val = apic_get_tmcct(apic); break; @@ -664,29 +683,40 @@ static void update_divide_count(struct kvm_lapic *apic) static void start_apic_timer(struct kvm_lapic *apic) { - ktime_t now = apic->lapic_timer.timer.base->get_time(); - - apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) * - APIC_BUS_CYCLE_NS * apic->divide_count; + ktime_t now; atomic_set(&apic->lapic_timer.pending, 0); - if (!apic->lapic_timer.period) - return; - /* - * Do not allow the guest to program periodic timers with small - * interval, since the hrtimers are not throttled by the host - * scheduler. - */ - if (apic_lvtt_period(apic)) { - if (apic->lapic_timer.period < NSEC_PER_MSEC/2) - apic->lapic_timer.period = NSEC_PER_MSEC/2; - } + if (apic_lvtt_period(apic) || apic_lvtt_oneshot(apic)) { + /* lapic timer in oneshot or peroidic mode */ + now = apic->lapic_timer.timer.base->get_time(); + apic->lapic_timer.period = (u64)apic_get_reg(apic, APIC_TMICT) + * APIC_BUS_CYCLE_NS * apic->divide_count; + + if (!apic->lapic_timer.period) + return; + /* + * Do not allow the guest to program periodic timers with small + * interval, since the hrtimers are not throttled by the host + * scheduler. + */ + if (apic_lvtt_period(apic)) { + s64 min_period = min_timer_period_us * 1000LL; + + if (apic->lapic_timer.period < min_period) { + pr_info_ratelimited( + "kvm: vcpu %i: requested %lld ns " + "lapic timer period limited to %lld ns\n", + apic->vcpu->vcpu_id, + apic->lapic_timer.period, min_period); + apic->lapic_timer.period = min_period; + } + } - hrtimer_start(&apic->lapic_timer.timer, - ktime_add_ns(now, apic->lapic_timer.period), - HRTIMER_MODE_ABS); + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, apic->lapic_timer.period), + HRTIMER_MODE_ABS); - apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" + apic_debug("%s: bus cycle is %" PRId64 "ns, now 0x%016" PRIx64 ", " "timer initial count 0x%x, period %lldns, " "expire @ 0x%016" PRIx64 ".\n", __func__, @@ -695,6 +725,30 @@ static void start_apic_timer(struct kvm_lapic *apic) apic->lapic_timer.period, ktime_to_ns(ktime_add_ns(now, apic->lapic_timer.period))); + } else if (apic_lvtt_tscdeadline(apic)) { + /* lapic timer in tsc deadline mode */ + u64 guest_tsc, tscdeadline = apic->lapic_timer.tscdeadline; + u64 ns = 0; + struct kvm_vcpu *vcpu = apic->vcpu; + unsigned long this_tsc_khz = vcpu_tsc_khz(vcpu); + unsigned long flags; + + if (unlikely(!tscdeadline || !this_tsc_khz)) + return; + + local_irq_save(flags); + + now = apic->lapic_timer.timer.base->get_time(); + guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); + if (likely(tscdeadline > guest_tsc)) { + ns = (tscdeadline - guest_tsc) * 1000000ULL; + do_div(ns, this_tsc_khz); + } + hrtimer_start(&apic->lapic_timer.timer, + ktime_add_ns(now, ns), HRTIMER_MODE_ABS); + + local_irq_restore(flags); + } } static void apic_manage_nmi_watchdog(struct kvm_lapic *apic, u32 lvt0_val) @@ -782,7 +836,6 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_LVT0: apic_manage_nmi_watchdog(apic, val); - case APIC_LVTT: case APIC_LVTTHMR: case APIC_LVTPC: case APIC_LVT1: @@ -796,7 +849,22 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) break; + case APIC_LVTT: + if ((apic_get_reg(apic, APIC_LVTT) & + apic->lapic_timer.timer_mode_mask) != + (val & apic->lapic_timer.timer_mode_mask)) + hrtimer_cancel(&apic->lapic_timer.timer); + + if (!apic_sw_enabled(apic)) + val |= APIC_LVT_MASKED; + val &= (apic_lvt_mask[0] | apic->lapic_timer.timer_mode_mask); + apic_set_reg(apic, APIC_LVTT, val); + break; + case APIC_TMICT: + if (apic_lvtt_tscdeadline(apic)) + break; + hrtimer_cancel(&apic->lapic_timer.timer); apic_set_reg(apic, APIC_TMICT, val); start_apic_timer(apic); @@ -804,14 +872,14 @@ static int apic_reg_write(struct kvm_lapic *apic, u32 reg, u32 val) case APIC_TDCR: if (val & 4) - printk(KERN_ERR "KVM_WRITE:TDCR %x\n", val); + apic_debug("KVM_WRITE:TDCR %x\n", val); apic_set_reg(apic, APIC_TDCR, val); update_divide_count(apic); break; case APIC_ESR: if (apic_x2apic_mode(apic) && val != 0) { - printk(KERN_ERR "KVM_WRITE:ESR not zero %x\n", val); + apic_debug("KVM_WRITE:ESR not zero %x\n", val); ret = 1; } break; @@ -864,6 +932,15 @@ static int apic_mmio_write(struct kvm_io_device *this, return 0; } +void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + + if (apic) + apic_reg_write(vcpu->arch.apic, APIC_EOI, 0); +} +EXPORT_SYMBOL_GPL(kvm_lapic_set_eoi); + void kvm_free_lapic(struct kvm_vcpu *vcpu) { if (!vcpu->arch.apic) @@ -883,6 +960,32 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu) *---------------------------------------------------------------------- */ +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return 0; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return 0; + + return apic->lapic_timer.tscdeadline; +} + +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data) +{ + struct kvm_lapic *apic = vcpu->arch.apic; + if (!apic) + return; + + if (apic_lvtt_oneshot(apic) || apic_lvtt_period(apic)) + return; + + hrtimer_cancel(&apic->lapic_timer.timer); + apic->lapic_timer.tscdeadline = data; + start_apic_timer(apic); +} + void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8) { struct kvm_lapic *apic = vcpu->arch.apic; diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index 52c9e6b9e725..138e8cc6fea6 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -26,6 +26,7 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu); void kvm_lapic_reset(struct kvm_vcpu *vcpu); u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu); void kvm_lapic_set_tpr(struct kvm_vcpu *vcpu, unsigned long cr8); +void kvm_lapic_set_eoi(struct kvm_vcpu *vcpu); void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value); u64 kvm_lapic_get_base(struct kvm_vcpu *vcpu); void kvm_apic_set_version(struct kvm_vcpu *vcpu); @@ -41,6 +42,9 @@ int kvm_lapic_enabled(struct kvm_vcpu *vcpu); bool kvm_apic_present(struct kvm_vcpu *vcpu); int kvm_lapic_find_highest_irr(struct kvm_vcpu *vcpu); +u64 kvm_get_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu); +void kvm_set_lapic_tscdeadline_msr(struct kvm_vcpu *vcpu, u64 data); + void kvm_lapic_set_vapic_addr(struct kvm_vcpu *vcpu, gpa_t vapic_addr); void kvm_lapic_sync_from_vapic(struct kvm_vcpu *vcpu); void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu); diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 8e8da7960dbe..f1b36cf3e3d0 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -2770,7 +2770,7 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) ASSERT(!VALID_PAGE(root)); if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { - pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i); + pdptr = vcpu->arch.mmu.get_pdptr(vcpu, i); if (!is_present_gpte(pdptr)) { vcpu->arch.mmu.pae_root[i] = 0; continue; @@ -3318,6 +3318,7 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) context->direct_map = true; context->set_cr3 = kvm_x86_ops->set_tdp_cr3; context->get_cr3 = get_cr3; + context->get_pdptr = kvm_pdptr_read; context->inject_page_fault = kvm_inject_page_fault; context->nx = is_nx(vcpu); @@ -3376,6 +3377,7 @@ static int init_kvm_softmmu(struct kvm_vcpu *vcpu) vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3; vcpu->arch.walk_mmu->get_cr3 = get_cr3; + vcpu->arch.walk_mmu->get_pdptr = kvm_pdptr_read; vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault; return r; @@ -3386,6 +3388,7 @@ static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu) struct kvm_mmu *g_context = &vcpu->arch.nested_mmu; g_context->get_cr3 = get_cr3; + g_context->get_pdptr = kvm_pdptr_read; g_context->inject_page_fault = kvm_inject_page_fault; /* diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c index 2460a265be23..746ec259d024 100644 --- a/arch/x86/kvm/mmu_audit.c +++ b/arch/x86/kvm/mmu_audit.c @@ -121,16 +121,16 @@ static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level) static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) { + static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10); unsigned long *rmapp; struct kvm_mmu_page *rev_sp; gfn_t gfn; - rev_sp = page_header(__pa(sptep)); gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt); if (!gfn_to_memslot(kvm, gfn)) { - if (!printk_ratelimit()) + if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no memslot for gfn %llx\n", gfn); audit_printk(kvm, "index %ld of sp (gfn=%llx)\n", @@ -141,7 +141,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level); if (!*rmapp) { - if (!printk_ratelimit()) + if (!__ratelimit(&ratelimit_state)) return; audit_printk(kvm, "no rmap for writable spte %llx\n", *sptep); diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 507e2b844cfa..92994100638b 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -147,7 +147,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker, gfn_t table_gfn; unsigned index, pt_access, uninitialized_var(pte_access); gpa_t pte_gpa; - bool eperm; + bool eperm, last_gpte; int offset; const int write_fault = access & PFERR_WRITE_MASK; const int user_fault = access & PFERR_USER_MASK; @@ -163,7 +163,7 @@ retry_walk: #if PTTYPE == 64 if (walker->level == PT32E_ROOT_LEVEL) { - pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3); + pte = mmu->get_pdptr(vcpu, (addr >> 30) & 3); trace_kvm_mmu_paging_element(pte, walker->level); if (!is_present_gpte(pte)) goto error; @@ -221,6 +221,17 @@ retry_walk: eperm = true; #endif + last_gpte = FNAME(is_last_gpte)(walker, vcpu, mmu, pte); + if (last_gpte) { + pte_access = pt_access & + FNAME(gpte_access)(vcpu, pte, true); + /* check if the kernel is fetching from user page */ + if (unlikely(pte_access & PT_USER_MASK) && + kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) + if (fetch_fault && !user_fault) + eperm = true; + } + if (!eperm && unlikely(!(pte & PT_ACCESSED_MASK))) { int ret; trace_kvm_mmu_set_accessed_bit(table_gfn, index, @@ -238,18 +249,12 @@ retry_walk: walker->ptes[walker->level - 1] = pte; - if (FNAME(is_last_gpte)(walker, vcpu, mmu, pte)) { + if (last_gpte) { int lvl = walker->level; gpa_t real_gpa; gfn_t gfn; u32 ac; - /* check if the kernel is fetching from user page */ - if (unlikely(pte_access & PT_USER_MASK) && - kvm_read_cr4_bits(vcpu, X86_CR4_SMEP)) - if (fetch_fault && !user_fault) - eperm = true; - gfn = gpte_to_gfn_lvl(pte, lvl); gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT; @@ -295,7 +300,6 @@ retry_walk: walker->ptes[walker->level - 1] = pte; } - pte_access = pt_access & FNAME(gpte_access)(vcpu, pte, true); walker->pt_access = pt_access; walker->pte_access = pte_access; pgprintk("%s: pte %llx pte_access %x pt_access %x\n", diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 475d1c948501..e32243eac2f4 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1084,7 +1084,6 @@ static void init_vmcb(struct vcpu_svm *svm) if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl = 1; - clr_intercept(svm, INTERCEPT_TASK_SWITCH); clr_intercept(svm, INTERCEPT_INVLPG); clr_exception_intercept(svm, PF_VECTOR); clr_cr_intercept(svm, INTERCEPT_CR3_READ); @@ -1844,6 +1843,20 @@ static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu) return svm->nested.nested_cr3; } +static u64 nested_svm_get_tdp_pdptr(struct kvm_vcpu *vcpu, int index) +{ + struct vcpu_svm *svm = to_svm(vcpu); + u64 cr3 = svm->nested.nested_cr3; + u64 pdpte; + int ret; + + ret = kvm_read_guest_page(vcpu->kvm, gpa_to_gfn(cr3), &pdpte, + offset_in_page(cr3) + index * 8, 8); + if (ret) + return 0; + return pdpte; +} + static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root) { @@ -1875,6 +1888,7 @@ static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu) vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3; vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3; + vcpu->arch.mmu.get_pdptr = nested_svm_get_tdp_pdptr; vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit; vcpu->arch.mmu.shadow_root_level = get_npt_level(); vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu; @@ -2182,7 +2196,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm) vmcb->control.exit_info_1, vmcb->control.exit_info_2, vmcb->control.exit_int_info, - vmcb->control.exit_int_info_err); + vmcb->control.exit_int_info_err, + KVM_ISA_SVM); nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, &page); if (!nested_vmcb) @@ -2894,15 +2909,20 @@ static int cr8_write_interception(struct vcpu_svm *svm) return 0; } +u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu) +{ + struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu)); + return vmcb->control.tsc_offset + + svm_scale_tsc(vcpu, native_read_tsc()); +} + static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data) { struct vcpu_svm *svm = to_svm(vcpu); switch (ecx) { case MSR_IA32_TSC: { - struct vmcb *vmcb = get_host_vmcb(svm); - - *data = vmcb->control.tsc_offset + + *data = svm->vmcb->control.tsc_offset + svm_scale_tsc(vcpu, native_read_tsc()); break; @@ -3314,8 +3334,6 @@ static int handle_exit(struct kvm_vcpu *vcpu) struct kvm_run *kvm_run = vcpu->run; u32 exit_code = svm->vmcb->control.exit_code; - trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM); - if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE)) vcpu->arch.cr0 = svm->vmcb->save.cr0; if (npt_enabled) @@ -3335,7 +3353,8 @@ static int handle_exit(struct kvm_vcpu *vcpu) svm->vmcb->control.exit_info_1, svm->vmcb->control.exit_info_2, svm->vmcb->control.exit_int_info, - svm->vmcb->control.exit_int_info_err); + svm->vmcb->control.exit_int_info_err, + KVM_ISA_SVM); vmexit = nested_svm_exit_special(svm); @@ -3768,6 +3787,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; + trace_kvm_exit(svm->vmcb->control.exit_code, vcpu, KVM_ISA_SVM); + if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) kvm_before_handle_nmi(&svm->vcpu); @@ -3897,60 +3918,6 @@ static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) } } -static const struct trace_print_flags svm_exit_reasons_str[] = { - { SVM_EXIT_READ_CR0, "read_cr0" }, - { SVM_EXIT_READ_CR3, "read_cr3" }, - { SVM_EXIT_READ_CR4, "read_cr4" }, - { SVM_EXIT_READ_CR8, "read_cr8" }, - { SVM_EXIT_WRITE_CR0, "write_cr0" }, - { SVM_EXIT_WRITE_CR3, "write_cr3" }, - { SVM_EXIT_WRITE_CR4, "write_cr4" }, - { SVM_EXIT_WRITE_CR8, "write_cr8" }, - { SVM_EXIT_READ_DR0, "read_dr0" }, - { SVM_EXIT_READ_DR1, "read_dr1" }, - { SVM_EXIT_READ_DR2, "read_dr2" }, - { SVM_EXIT_READ_DR3, "read_dr3" }, - { SVM_EXIT_WRITE_DR0, "write_dr0" }, - { SVM_EXIT_WRITE_DR1, "write_dr1" }, - { SVM_EXIT_WRITE_DR2, "write_dr2" }, - { SVM_EXIT_WRITE_DR3, "write_dr3" }, - { SVM_EXIT_WRITE_DR5, "write_dr5" }, - { SVM_EXIT_WRITE_DR7, "write_dr7" }, - { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, - { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, - { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, - { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, - { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, - { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, - { SVM_EXIT_INTR, "interrupt" }, - { SVM_EXIT_NMI, "nmi" }, - { SVM_EXIT_SMI, "smi" }, - { SVM_EXIT_INIT, "init" }, - { SVM_EXIT_VINTR, "vintr" }, - { SVM_EXIT_CPUID, "cpuid" }, - { SVM_EXIT_INVD, "invd" }, - { SVM_EXIT_HLT, "hlt" }, - { SVM_EXIT_INVLPG, "invlpg" }, - { SVM_EXIT_INVLPGA, "invlpga" }, - { SVM_EXIT_IOIO, "io" }, - { SVM_EXIT_MSR, "msr" }, - { SVM_EXIT_TASK_SWITCH, "task_switch" }, - { SVM_EXIT_SHUTDOWN, "shutdown" }, - { SVM_EXIT_VMRUN, "vmrun" }, - { SVM_EXIT_VMMCALL, "hypercall" }, - { SVM_EXIT_VMLOAD, "vmload" }, - { SVM_EXIT_VMSAVE, "vmsave" }, - { SVM_EXIT_STGI, "stgi" }, - { SVM_EXIT_CLGI, "clgi" }, - { SVM_EXIT_SKINIT, "skinit" }, - { SVM_EXIT_WBINVD, "wbinvd" }, - { SVM_EXIT_MONITOR, "monitor" }, - { SVM_EXIT_MWAIT, "mwait" }, - { SVM_EXIT_XSETBV, "xsetbv" }, - { SVM_EXIT_NPF, "npf" }, - { -1, NULL } -}; - static int svm_get_lpage_level(void) { return PT_PDPE_LEVEL; @@ -4223,7 +4190,6 @@ static struct kvm_x86_ops svm_x86_ops = { .get_mt_mask = svm_get_mt_mask, .get_exit_info = svm_get_exit_info, - .exit_reasons_str = svm_exit_reasons_str, .get_lpage_level = svm_get_lpage_level, @@ -4239,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = { .write_tsc_offset = svm_write_tsc_offset, .adjust_tsc_offset = svm_adjust_tsc_offset, .compute_tsc_offset = svm_compute_tsc_offset, + .read_l1_tsc = svm_read_l1_tsc, .set_tdp_cr3 = set_tdp_cr3, diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h index 3ff898c104f7..911d2641f14c 100644 --- a/arch/x86/kvm/trace.h +++ b/arch/x86/kvm/trace.h @@ -2,6 +2,8 @@ #define _TRACE_KVM_H #include <linux/tracepoint.h> +#include <asm/vmx.h> +#include <asm/svm.h> #undef TRACE_SYSTEM #define TRACE_SYSTEM kvm @@ -181,6 +183,95 @@ TRACE_EVENT(kvm_apic, #define KVM_ISA_VMX 1 #define KVM_ISA_SVM 2 +#define VMX_EXIT_REASONS \ + { EXIT_REASON_EXCEPTION_NMI, "EXCEPTION_NMI" }, \ + { EXIT_REASON_EXTERNAL_INTERRUPT, "EXTERNAL_INTERRUPT" }, \ + { EXIT_REASON_TRIPLE_FAULT, "TRIPLE_FAULT" }, \ + { EXIT_REASON_PENDING_INTERRUPT, "PENDING_INTERRUPT" }, \ + { EXIT_REASON_NMI_WINDOW, "NMI_WINDOW" }, \ + { EXIT_REASON_TASK_SWITCH, "TASK_SWITCH" }, \ + { EXIT_REASON_CPUID, "CPUID" }, \ + { EXIT_REASON_HLT, "HLT" }, \ + { EXIT_REASON_INVLPG, "INVLPG" }, \ + { EXIT_REASON_RDPMC, "RDPMC" }, \ + { EXIT_REASON_RDTSC, "RDTSC" }, \ + { EXIT_REASON_VMCALL, "VMCALL" }, \ + { EXIT_REASON_VMCLEAR, "VMCLEAR" }, \ + { EXIT_REASON_VMLAUNCH, "VMLAUNCH" }, \ + { EXIT_REASON_VMPTRLD, "VMPTRLD" }, \ + { EXIT_REASON_VMPTRST, "VMPTRST" }, \ + { EXIT_REASON_VMREAD, "VMREAD" }, \ + { EXIT_REASON_VMRESUME, "VMRESUME" }, \ + { EXIT_REASON_VMWRITE, "VMWRITE" }, \ + { EXIT_REASON_VMOFF, "VMOFF" }, \ + { EXIT_REASON_VMON, "VMON" }, \ + { EXIT_REASON_CR_ACCESS, "CR_ACCESS" }, \ + { EXIT_REASON_DR_ACCESS, "DR_ACCESS" }, \ + { EXIT_REASON_IO_INSTRUCTION, "IO_INSTRUCTION" }, \ + { EXIT_REASON_MSR_READ, "MSR_READ" }, \ + { EXIT_REASON_MSR_WRITE, "MSR_WRITE" }, \ + { EXIT_REASON_MWAIT_INSTRUCTION, "MWAIT_INSTRUCTION" }, \ + { EXIT_REASON_MONITOR_INSTRUCTION, "MONITOR_INSTRUCTION" }, \ + { EXIT_REASON_PAUSE_INSTRUCTION, "PAUSE_INSTRUCTION" }, \ + { EXIT_REASON_MCE_DURING_VMENTRY, "MCE_DURING_VMENTRY" }, \ + { EXIT_REASON_TPR_BELOW_THRESHOLD, "TPR_BELOW_THRESHOLD" }, \ + { EXIT_REASON_APIC_ACCESS, "APIC_ACCESS" }, \ + { EXIT_REASON_EPT_VIOLATION, "EPT_VIOLATION" }, \ + { EXIT_REASON_EPT_MISCONFIG, "EPT_MISCONFIG" }, \ + { EXIT_REASON_WBINVD, "WBINVD" } + +#define SVM_EXIT_REASONS \ + { SVM_EXIT_READ_CR0, "read_cr0" }, \ + { SVM_EXIT_READ_CR3, "read_cr3" }, \ + { SVM_EXIT_READ_CR4, "read_cr4" }, \ + { SVM_EXIT_READ_CR8, "read_cr8" }, \ + { SVM_EXIT_WRITE_CR0, "write_cr0" }, \ + { SVM_EXIT_WRITE_CR3, "write_cr3" }, \ + { SVM_EXIT_WRITE_CR4, "write_cr4" }, \ + { SVM_EXIT_WRITE_CR8, "write_cr8" }, \ + { SVM_EXIT_READ_DR0, "read_dr0" }, \ + { SVM_EXIT_READ_DR1, "read_dr1" }, \ + { SVM_EXIT_READ_DR2, "read_dr2" }, \ + { SVM_EXIT_READ_DR3, "read_dr3" }, \ + { SVM_EXIT_WRITE_DR0, "write_dr0" }, \ + { SVM_EXIT_WRITE_DR1, "write_dr1" }, \ + { SVM_EXIT_WRITE_DR2, "write_dr2" }, \ + { SVM_EXIT_WRITE_DR3, "write_dr3" }, \ + { SVM_EXIT_WRITE_DR5, "write_dr5" }, \ + { SVM_EXIT_WRITE_DR7, "write_dr7" }, \ + { SVM_EXIT_EXCP_BASE + DB_VECTOR, "DB excp" }, \ + { SVM_EXIT_EXCP_BASE + BP_VECTOR, "BP excp" }, \ + { SVM_EXIT_EXCP_BASE + UD_VECTOR, "UD excp" }, \ + { SVM_EXIT_EXCP_BASE + PF_VECTOR, "PF excp" }, \ + { SVM_EXIT_EXCP_BASE + NM_VECTOR, "NM excp" }, \ + { SVM_EXIT_EXCP_BASE + MC_VECTOR, "MC excp" }, \ + { SVM_EXIT_INTR, "interrupt" }, \ + { SVM_EXIT_NMI, "nmi" }, \ + { SVM_EXIT_SMI, "smi" }, \ + { SVM_EXIT_INIT, "init" }, \ + { SVM_EXIT_VINTR, "vintr" }, \ + { SVM_EXIT_CPUID, "cpuid" }, \ + { SVM_EXIT_INVD, "invd" }, \ + { SVM_EXIT_HLT, "hlt" }, \ + { SVM_EXIT_INVLPG, "invlpg" }, \ + { SVM_EXIT_INVLPGA, "invlpga" }, \ + { SVM_EXIT_IOIO, "io" }, \ + { SVM_EXIT_MSR, "msr" }, \ + { SVM_EXIT_TASK_SWITCH, "task_switch" }, \ + { SVM_EXIT_SHUTDOWN, "shutdown" }, \ + { SVM_EXIT_VMRUN, "vmrun" }, \ + { SVM_EXIT_VMMCALL, "hypercall" }, \ + { SVM_EXIT_VMLOAD, "vmload" }, \ + { SVM_EXIT_VMSAVE, "vmsave" }, \ + { SVM_EXIT_STGI, "stgi" }, \ + { SVM_EXIT_CLGI, "clgi" }, \ + { SVM_EXIT_SKINIT, "skinit" }, \ + { SVM_EXIT_WBINVD, "wbinvd" }, \ + { SVM_EXIT_MONITOR, "monitor" }, \ + { SVM_EXIT_MWAIT, "mwait" }, \ + { SVM_EXIT_XSETBV, "xsetbv" }, \ + { SVM_EXIT_NPF, "npf" } + /* * Tracepoint for kvm guest exit: */ @@ -205,8 +296,9 @@ TRACE_EVENT(kvm_exit, ), TP_printk("reason %s rip 0x%lx info %llx %llx", - ftrace_print_symbols_seq(p, __entry->exit_reason, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_reason, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_reason, SVM_EXIT_REASONS), __entry->guest_rip, __entry->info1, __entry->info2) ); @@ -486,9 +578,9 @@ TRACE_EVENT(kvm_nested_intercepts, TRACE_EVENT(kvm_nested_vmexit, TP_PROTO(__u64 rip, __u32 exit_code, __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err), + __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(rip, exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err), + exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( __field( __u64, rip ) @@ -497,6 +589,7 @@ TRACE_EVENT(kvm_nested_vmexit, __field( __u64, exit_info2 ) __field( __u32, exit_int_info ) __field( __u32, exit_int_info_err ) + __field( __u32, isa ) ), TP_fast_assign( @@ -506,12 +599,14 @@ TRACE_EVENT(kvm_nested_vmexit, __entry->exit_info2 = exit_info2; __entry->exit_int_info = exit_int_info; __entry->exit_int_info_err = exit_int_info_err; + __entry->isa = isa; ), TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", __entry->rip, - ftrace_print_symbols_seq(p, __entry->exit_code, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); @@ -522,9 +617,9 @@ TRACE_EVENT(kvm_nested_vmexit, TRACE_EVENT(kvm_nested_vmexit_inject, TP_PROTO(__u32 exit_code, __u64 exit_info1, __u64 exit_info2, - __u32 exit_int_info, __u32 exit_int_info_err), + __u32 exit_int_info, __u32 exit_int_info_err, __u32 isa), TP_ARGS(exit_code, exit_info1, exit_info2, - exit_int_info, exit_int_info_err), + exit_int_info, exit_int_info_err, isa), TP_STRUCT__entry( __field( __u32, exit_code ) @@ -532,6 +627,7 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __field( __u64, exit_info2 ) __field( __u32, exit_int_info ) __field( __u32, exit_int_info_err ) + __field( __u32, isa ) ), TP_fast_assign( @@ -540,12 +636,14 @@ TRACE_EVENT(kvm_nested_vmexit_inject, __entry->exit_info2 = exit_info2; __entry->exit_int_info = exit_int_info; __entry->exit_int_info_err = exit_int_info_err; + __entry->isa = isa; ), TP_printk("reason: %s ext_inf1: 0x%016llx " "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x", - ftrace_print_symbols_seq(p, __entry->exit_code, - kvm_x86_ops->exit_reasons_str), + (__entry->isa == KVM_ISA_VMX) ? + __print_symbolic(__entry->exit_code, VMX_EXIT_REASONS) : + __print_symbolic(__entry->exit_code, SVM_EXIT_REASONS), __entry->exit_info1, __entry->exit_info2, __entry->exit_int_info, __entry->exit_int_info_err) ); diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index e65a158dee64..a0d6bd9ad442 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -71,6 +71,9 @@ module_param(vmm_exclusive, bool, S_IRUGO); static int __read_mostly yield_on_hlt = 1; module_param(yield_on_hlt, bool, S_IRUGO); +static int __read_mostly fasteoi = 1; +module_param(fasteoi, bool, S_IRUGO); + /* * If nested=1, nested virtualization is supported, i.e., guests may use * VMX and be a hypervisor for its own guests. If nested=0, guests may not @@ -1748,6 +1751,21 @@ static u64 guest_read_tsc(void) } /* + * Like guest_read_tsc, but always returns L1's notion of the timestamp + * counter, even if a nested guest (L2) is currently running. + */ +u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu) +{ + u64 host_tsc, tsc_offset; + + rdtscll(host_tsc); + tsc_offset = is_guest_mode(vcpu) ? + to_vmx(vcpu)->nested.vmcs01_tsc_offset : + vmcs_read64(TSC_OFFSET); + return host_tsc + tsc_offset; +} + +/* * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ * ioctl. In this case the call-back should update internal vmx state to make * the changes effective. @@ -1762,15 +1780,23 @@ static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) */ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) { - vmcs_write64(TSC_OFFSET, offset); - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu)) { /* - * We're here if L1 chose not to trap the TSC MSR. Since - * prepare_vmcs12() does not copy tsc_offset, we need to also - * set the vmcs12 field here. + * We're here if L1 chose not to trap WRMSR to TSC. According + * to the spec, this should set L1's TSC; The offset that L1 + * set for L2 remains unchanged, and still needs to be added + * to the newly set TSC to get L2's TSC. */ - get_vmcs12(vcpu)->tsc_offset = offset - - to_vmx(vcpu)->nested.vmcs01_tsc_offset; + struct vmcs12 *vmcs12; + to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset; + /* recalculate vmcs02.TSC_OFFSET: */ + vmcs12 = get_vmcs12(vcpu); + vmcs_write64(TSC_OFFSET, offset + + (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETING) ? + vmcs12->tsc_offset : 0)); + } else { + vmcs_write64(TSC_OFFSET, offset); + } } static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment) @@ -2736,8 +2762,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu) guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { - printk(KERN_DEBUG "%s: tss fixup for long mode. \n", - __func__); + pr_debug_ratelimited("%s: tss fixup for long mode. \n", + __func__); vmcs_write32(GUEST_TR_AR_BYTES, (guest_tr_ar & ~AR_TYPE_MASK) | AR_TYPE_BUSY_64_TSS); @@ -4115,8 +4141,7 @@ static int handle_exception(struct kvm_vcpu *vcpu) error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); if (is_page_fault(intr_info)) { /* EPT won't cause page fault directly */ - if (enable_ept) - BUG(); + BUG_ON(enable_ept); cr2 = vmcs_readl(EXIT_QUALIFICATION); trace_kvm_page_fault(cr2, error_code); @@ -4518,6 +4543,24 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu) static int handle_apic_access(struct kvm_vcpu *vcpu) { + if (likely(fasteoi)) { + unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); + int access_type, offset; + + access_type = exit_qualification & APIC_ACCESS_TYPE; + offset = exit_qualification & APIC_ACCESS_OFFSET; + /* + * Sane guest uses MOV to write EOI, with written value + * not cared. So make a short-circuit here by avoiding + * heavy instruction emulation. + */ + if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) && + (offset == APIC_EOI)) { + kvm_lapic_set_eoi(vcpu); + skip_emulated_instruction(vcpu); + return 1; + } + } return emulate_instruction(vcpu, 0) == EMULATE_DONE; } @@ -5591,8 +5634,8 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu) return 0; if (unlikely(vmx->fail)) { - printk(KERN_INFO "%s failed vm entry %x\n", - __func__, vmcs_read32(VM_INSTRUCTION_ERROR)); + pr_info_ratelimited("%s failed vm entry %x\n", __func__, + vmcs_read32(VM_INSTRUCTION_ERROR)); return 1; } @@ -5696,8 +5739,6 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu) u32 exit_reason = vmx->exit_reason; u32 vectoring_info = vmx->idt_vectoring_info; - trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX); - /* If guest state is invalid, start emulating */ if (vmx->emulation_required && emulate_invalid_guest_state) return handle_invalid_guest_state(vcpu); @@ -6101,6 +6142,7 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->launched = 1; vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); + trace_kvm_exit(vmx->exit_reason, vcpu, KVM_ISA_VMX); vmx_complete_atomic_exit(vmx); vmx_recover_nmi_blocking(vmx); @@ -6241,49 +6283,6 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) return ret; } -#define _ER(x) { EXIT_REASON_##x, #x } - -static const struct trace_print_flags vmx_exit_reasons_str[] = { - _ER(EXCEPTION_NMI), - _ER(EXTERNAL_INTERRUPT), - _ER(TRIPLE_FAULT), - _ER(PENDING_INTERRUPT), - _ER(NMI_WINDOW), - _ER(TASK_SWITCH), - _ER(CPUID), - _ER(HLT), - _ER(INVLPG), - _ER(RDPMC), - _ER(RDTSC), - _ER(VMCALL), - _ER(VMCLEAR), - _ER(VMLAUNCH), - _ER(VMPTRLD), - _ER(VMPTRST), - _ER(VMREAD), - _ER(VMRESUME), - _ER(VMWRITE), - _ER(VMOFF), - _ER(VMON), - _ER(CR_ACCESS), - _ER(DR_ACCESS), - _ER(IO_INSTRUCTION), - _ER(MSR_READ), - _ER(MSR_WRITE), - _ER(MWAIT_INSTRUCTION), - _ER(MONITOR_INSTRUCTION), - _ER(PAUSE_INSTRUCTION), - _ER(MCE_DURING_VMENTRY), - _ER(TPR_BELOW_THRESHOLD), - _ER(APIC_ACCESS), - _ER(EPT_VIOLATION), - _ER(EPT_MISCONFIG), - _ER(WBINVD), - { -1, NULL } -}; - -#undef _ER - static int vmx_get_lpage_level(void) { if (enable_ept && !cpu_has_vmx_ept_1g_page()) @@ -6514,8 +6513,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) set_cr4_guest_host_mask(vmx); - vmcs_write64(TSC_OFFSET, - vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING) + vmcs_write64(TSC_OFFSET, + vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset); + else + vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); if (enable_vpid) { /* @@ -6610,9 +6612,8 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch) if (vmcs12->vm_entry_msr_load_count > 0 || vmcs12->vm_exit_msr_load_count > 0 || vmcs12->vm_exit_msr_store_count > 0) { - if (printk_ratelimit()) - printk(KERN_WARNING - "%s: VMCS MSR_{LOAD,STORE} unsupported\n", __func__); + pr_warn_ratelimited("%s: VMCS MSR_{LOAD,STORE} unsupported\n", + __func__); nested_vmx_failValid(vcpu, VMXERR_ENTRY_INVALID_CONTROL_FIELD); return 1; } @@ -6922,7 +6923,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu) load_vmcs12_host_state(vcpu, vmcs12); - /* Update TSC_OFFSET if vmx_adjust_tsc_offset() was used while L2 ran */ + /* Update TSC_OFFSET if TSC was changed while L2 ran */ vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset); /* This is needed for same reason as it was needed in prepare_vmcs02 */ @@ -7039,7 +7040,6 @@ static struct kvm_x86_ops vmx_x86_ops = { .get_mt_mask = vmx_get_mt_mask, .get_exit_info = vmx_get_exit_info, - .exit_reasons_str = vmx_exit_reasons_str, .get_lpage_level = vmx_get_lpage_level, @@ -7055,6 +7055,7 @@ static struct kvm_x86_ops vmx_x86_ops = { .write_tsc_offset = vmx_write_tsc_offset, .adjust_tsc_offset = vmx_adjust_tsc_offset, .compute_tsc_offset = vmx_compute_tsc_offset, + .read_l1_tsc = vmx_read_l1_tsc, .set_tdp_cr3 = vmx_set_cr3, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 84a28ea45fa4..c38efd7b792e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -44,6 +44,7 @@ #include <linux/perf_event.h> #include <linux/uaccess.h> #include <linux/hash.h> +#include <linux/pci.h> #include <trace/events/kvm.h> #define CREATE_TRACE_POINTS @@ -83,6 +84,7 @@ static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE); static void update_cr8_intercept(struct kvm_vcpu *vcpu); static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid, struct kvm_cpuid_entry2 __user *entries); +static void process_nmi(struct kvm_vcpu *vcpu); struct kvm_x86_ops *kvm_x86_ops; EXPORT_SYMBOL_GPL(kvm_x86_ops); @@ -359,8 +361,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) void kvm_inject_nmi(struct kvm_vcpu *vcpu) { - kvm_make_request(KVM_REQ_EVENT, vcpu); - vcpu->arch.nmi_pending = 1; + atomic_inc(&vcpu->arch.nmi_queued); + kvm_make_request(KVM_REQ_NMI, vcpu); } EXPORT_SYMBOL_GPL(kvm_inject_nmi); @@ -599,6 +601,8 @@ static bool guest_cpuid_has_fsgsbase(struct kvm_vcpu *vcpu) static void update_cpuid(struct kvm_vcpu *vcpu) { struct kvm_cpuid_entry2 *best; + struct kvm_lapic *apic = vcpu->arch.apic; + u32 timer_mode_mask; best = kvm_find_cpuid_entry(vcpu, 1, 0); if (!best) @@ -610,6 +614,16 @@ static void update_cpuid(struct kvm_vcpu *vcpu) if (kvm_read_cr4_bits(vcpu, X86_CR4_OSXSAVE)) best->ecx |= bit(X86_FEATURE_OSXSAVE); } + + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + best->function == 0x1) { + best->ecx |= bit(X86_FEATURE_TSC_DEADLINE_TIMER); + timer_mode_mask = 3 << 17; + } else + timer_mode_mask = 1 << 17; + + if (apic) + apic->lapic_timer.timer_mode_mask = timer_mode_mask; } int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) @@ -825,6 +839,7 @@ static u32 msrs_to_save[] = { static unsigned num_msrs_to_save; static u32 emulated_msrs[] = { + MSR_IA32_TSCDEADLINE, MSR_IA32_MISC_ENABLE, MSR_IA32_MCG_STATUS, MSR_IA32_MCG_CTL, @@ -1000,7 +1015,7 @@ static inline int kvm_tsc_changes_freq(void) return ret; } -static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) +u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu) { if (vcpu->arch.virtual_tsc_khz) return vcpu->arch.virtual_tsc_khz; @@ -1098,7 +1113,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v) /* Keep irq disabled to prevent changes to the clock */ local_irq_save(flags); - kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); + tsc_timestamp = kvm_x86_ops->read_l1_tsc(v); kernel_ns = get_kernel_ns(); this_tsc_khz = vcpu_tsc_khz(v); if (unlikely(this_tsc_khz == 0)) { @@ -1564,6 +1579,9 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) break; case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_write(vcpu, msr, data); + case MSR_IA32_TSCDEADLINE: + kvm_set_lapic_tscdeadline_msr(vcpu, data); + break; case MSR_IA32_MISC_ENABLE: vcpu->arch.ia32_misc_enable_msr = data; break; @@ -1825,6 +1843,9 @@ static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata); case HV_X64_MSR_TPR: return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata); + case HV_X64_MSR_APIC_ASSIST_PAGE: + data = vcpu->arch.hv_vapic; + break; default: pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr); return 1; @@ -1839,7 +1860,6 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) switch (msr) { case MSR_IA32_PLATFORM_ID: - case MSR_IA32_UCODE_REV: case MSR_IA32_EBL_CR_POWERON: case MSR_IA32_DEBUGCTLMSR: case MSR_IA32_LASTBRANCHFROMIP: @@ -1860,6 +1880,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case MSR_FAM10H_MMIO_CONF_BASE: data = 0; break; + case MSR_IA32_UCODE_REV: + data = 0x100000000ULL; + break; case MSR_MTRRcap: data = 0x500 | KVM_NR_VAR_MTRR; break; @@ -1888,6 +1911,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) case APIC_BASE_MSR ... APIC_BASE_MSR + 0x3ff: return kvm_x2apic_msr_read(vcpu, msr, pdata); break; + case MSR_IA32_TSCDEADLINE: + data = kvm_get_lapic_tscdeadline_msr(vcpu); + break; case MSR_IA32_MISC_ENABLE: data = vcpu->arch.ia32_misc_enable_msr; break; @@ -2086,6 +2112,9 @@ int kvm_dev_ioctl_check_extension(long ext) r = !kvm_x86_ops->cpu_has_accelerated_tpr(); break; case KVM_CAP_NR_VCPUS: + r = KVM_SOFT_MAX_VCPUS; + break; + case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; break; case KVM_CAP_NR_MEMSLOTS: @@ -2095,7 +2124,7 @@ int kvm_dev_ioctl_check_extension(long ext) r = 0; break; case KVM_CAP_IOMMU: - r = iommu_found(); + r = iommu_present(&pci_bus_type); break; case KVM_CAP_MCE: r = KVM_MAX_MCE_BANKS; @@ -2210,7 +2239,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) s64 tsc_delta; u64 tsc; - kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc); + tsc = kvm_x86_ops->read_l1_tsc(vcpu); tsc_delta = !vcpu->arch.last_guest_tsc ? 0 : tsc - vcpu->arch.last_guest_tsc; @@ -2234,7 +2263,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { kvm_x86_ops->vcpu_put(vcpu); kvm_put_guest_fpu(vcpu); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); } static int is_efer_nx(void) @@ -2819,6 +2848,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, struct kvm_vcpu_events *events) { + process_nmi(vcpu); events->exception.injected = vcpu->arch.exception.pending && !kvm_exception_is_soft(vcpu->arch.exception.nr); @@ -2836,7 +2866,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, KVM_X86_SHADOW_INT_MOV_SS | KVM_X86_SHADOW_INT_STI); events->nmi.injected = vcpu->arch.nmi_injected; - events->nmi.pending = vcpu->arch.nmi_pending; + events->nmi.pending = vcpu->arch.nmi_pending != 0; events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); events->nmi.pad = 0; @@ -2856,6 +2886,7 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, | KVM_VCPUEVENT_VALID_SHADOW)) return -EINVAL; + process_nmi(vcpu); vcpu->arch.exception.pending = events->exception.injected; vcpu->arch.exception.nr = events->exception.nr; vcpu->arch.exception.has_error_code = events->exception.has_error_code; @@ -3556,7 +3587,11 @@ long kvm_arch_vm_ioctl(struct file *filp, if (r) { mutex_lock(&kvm->slots_lock); kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, - &vpic->dev); + &vpic->dev_master); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_slave); + kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, + &vpic->dev_eclr); mutex_unlock(&kvm->slots_lock); kfree(vpic); goto create_irqchip_unlock; @@ -4045,84 +4080,105 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, return 0; } -static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - void *val, - unsigned int bytes, - struct x86_exception *exception) +int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, + const void *val, int bytes) { - struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); - gpa_t gpa; - int handled, ret; + int ret; + ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); + if (ret < 0) + return 0; + kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); + return 1; +} + +struct read_write_emulator_ops { + int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, + int bytes); + int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + int bytes, void *val); + int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes); + bool write; +}; + +static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) +{ if (vcpu->mmio_read_completed) { memcpy(val, vcpu->mmio_data, bytes); trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, vcpu->mmio_phys_addr, *(u64 *)val); vcpu->mmio_read_completed = 0; - return X86EMUL_CONTINUE; + return 1; } - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, false); - - if (ret < 0) - return X86EMUL_PROPAGATE_FAULT; - - if (ret) - goto mmio; - - if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception) - == X86EMUL_CONTINUE) - return X86EMUL_CONTINUE; + return 0; +} -mmio: - /* - * Is this MMIO handled locally? - */ - handled = vcpu_mmio_read(vcpu, gpa, bytes, val); +static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return !kvm_read_guest(vcpu->kvm, gpa, val, bytes); +} - if (handled == bytes) - return X86EMUL_CONTINUE; +static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ + return emulator_write_phys(vcpu, gpa, val, bytes); +} - gpa += handled; - bytes -= handled; - val += handled; +static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) +{ + trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); + return vcpu_mmio_write(vcpu, gpa, bytes, val); +} +static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) +{ trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); - - vcpu->mmio_needed = 1; - vcpu->run->exit_reason = KVM_EXIT_MMIO; - vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; - vcpu->mmio_size = bytes; - vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; - vcpu->mmio_index = 0; - return X86EMUL_IO_NEEDED; } -int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, - const void *val, int bytes) +static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, + void *val, int bytes) { - int ret; - - ret = kvm_write_guest(vcpu->kvm, gpa, val, bytes); - if (ret < 0) - return 0; - kvm_mmu_pte_write(vcpu, gpa, val, bytes, 1); - return 1; + memcpy(vcpu->mmio_data, val, bytes); + memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + return X86EMUL_CONTINUE; } -static int emulator_write_emulated_onepage(unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception, - struct kvm_vcpu *vcpu) +static struct read_write_emulator_ops read_emultor = { + .read_write_prepare = read_prepare, + .read_write_emulate = read_emulate, + .read_write_mmio = vcpu_mmio_read, + .read_write_exit_mmio = read_exit_mmio, +}; + +static struct read_write_emulator_ops write_emultor = { + .read_write_emulate = write_emulate, + .read_write_mmio = write_mmio, + .read_write_exit_mmio = write_exit_mmio, + .write = true, +}; + +static int emulator_read_write_onepage(unsigned long addr, void *val, + unsigned int bytes, + struct x86_exception *exception, + struct kvm_vcpu *vcpu, + struct read_write_emulator_ops *ops) { gpa_t gpa; int handled, ret; + bool write = ops->write; - ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, true); + if (ops->read_write_prepare && + ops->read_write_prepare(vcpu, val, bytes)) + return X86EMUL_CONTINUE; + + ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write); if (ret < 0) return X86EMUL_PROPAGATE_FAULT; @@ -4131,15 +4187,14 @@ static int emulator_write_emulated_onepage(unsigned long addr, if (ret) goto mmio; - if (emulator_write_phys(vcpu, gpa, val, bytes)) + if (ops->read_write_emulate(vcpu, gpa, val, bytes)) return X86EMUL_CONTINUE; mmio: - trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, *(u64 *)val); /* * Is this MMIO handled locally? */ - handled = vcpu_mmio_write(vcpu, gpa, bytes, val); + handled = ops->read_write_mmio(vcpu, gpa, bytes, val); if (handled == bytes) return X86EMUL_CONTINUE; @@ -4148,23 +4203,20 @@ mmio: val += handled; vcpu->mmio_needed = 1; - memcpy(vcpu->mmio_data, val, bytes); vcpu->run->exit_reason = KVM_EXIT_MMIO; vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; vcpu->mmio_size = bytes; vcpu->run->mmio.len = min(vcpu->mmio_size, 8); - vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; - memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8); + vcpu->run->mmio.is_write = vcpu->mmio_is_write = write; vcpu->mmio_index = 0; - return X86EMUL_CONTINUE; + return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); } -int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, - unsigned long addr, - const void *val, - unsigned int bytes, - struct x86_exception *exception) +int emulator_read_write(struct x86_emulate_ctxt *ctxt, unsigned long addr, + void *val, unsigned int bytes, + struct x86_exception *exception, + struct read_write_emulator_ops *ops) { struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); @@ -4173,16 +4225,38 @@ int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, int rc, now; now = -addr & ~PAGE_MASK; - rc = emulator_write_emulated_onepage(addr, val, now, exception, - vcpu); + rc = emulator_read_write_onepage(addr, val, now, exception, + vcpu, ops); + if (rc != X86EMUL_CONTINUE) return rc; addr += now; val += now; bytes -= now; } - return emulator_write_emulated_onepage(addr, val, bytes, exception, - vcpu); + + return emulator_read_write_onepage(addr, val, bytes, exception, + vcpu, ops); +} + +static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, val, bytes, + exception, &read_emultor); +} + +int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, + unsigned long addr, + const void *val, + unsigned int bytes, + struct x86_exception *exception) +{ + return emulator_read_write(ctxt, addr, (void *)val, bytes, + exception, &write_emultor); } #define CMPXCHG_TYPE(t, ptr, old, new) \ @@ -4712,7 +4786,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) kvm_set_rflags(vcpu, ctxt->eflags); if (irq == NMI_VECTOR) - vcpu->arch.nmi_pending = false; + vcpu->arch.nmi_pending = 0; else vcpu->arch.interrupt.pending = false; @@ -4788,7 +4862,7 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu, trace_kvm_emulate_insn_start(vcpu); ++vcpu->stat.insn_emulation; - if (r) { + if (r != EMULATION_OK) { if (emulation_type & EMULTYPE_TRAP_UD) return EMULATE_FAIL; if (reexecute_instruction(vcpu, cr2)) @@ -5521,7 +5595,7 @@ static void inject_pending_event(struct kvm_vcpu *vcpu) /* try to inject new event if pending */ if (vcpu->arch.nmi_pending) { if (kvm_x86_ops->nmi_allowed(vcpu)) { - vcpu->arch.nmi_pending = false; + --vcpu->arch.nmi_pending; vcpu->arch.nmi_injected = true; kvm_x86_ops->set_nmi(vcpu); } @@ -5553,10 +5627,26 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu) } } +static void process_nmi(struct kvm_vcpu *vcpu) +{ + unsigned limit = 2; + + /* + * x86 is limited to one NMI running, and one NMI pending after it. + * If an NMI is already in progress, limit further NMIs to just one. + * Otherwise, allow two (and we'll inject the first one immediately). + */ + if (kvm_x86_ops->get_nmi_mask(vcpu) || vcpu->arch.nmi_injected) + limit = 1; + + vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); + kvm_make_request(KVM_REQ_EVENT, vcpu); +} + static int vcpu_enter_guest(struct kvm_vcpu *vcpu) { int r; - bool nmi_pending; bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && vcpu->run->request_interrupt_window; @@ -5596,6 +5686,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) record_steal_time(vcpu); + if (kvm_check_request(KVM_REQ_NMI, vcpu)) + process_nmi(vcpu); } @@ -5603,19 +5695,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (unlikely(r)) goto out; - /* - * An NMI can be injected between local nmi_pending read and - * vcpu->arch.nmi_pending read inside inject_pending_event(). - * But in that case, KVM_REQ_EVENT will be set, which makes - * the race described above benign. - */ - nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending); - if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { inject_pending_event(vcpu); /* enable NMI/IRQ window open exits if needed */ - if (nmi_pending) + if (vcpu->arch.nmi_pending) kvm_x86_ops->enable_nmi_window(vcpu); else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) kvm_x86_ops->enable_irq_window(vcpu); @@ -5678,7 +5762,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) if (hw_breakpoint_active()) hw_breakpoint_restore(); - kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc); + vcpu->arch.last_guest_tsc = kvm_x86_ops->read_l1_tsc(vcpu); vcpu->mode = OUTSIDE_GUEST_MODE; smp_wmb(); @@ -6323,7 +6407,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu) { - vcpu->arch.nmi_pending = false; + atomic_set(&vcpu->arch.nmi_queued, 0); + vcpu->arch.nmi_pending = 0; vcpu->arch.nmi_injected = false; vcpu->arch.switch_db_regs = 0; @@ -6598,7 +6683,7 @@ int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) !vcpu->arch.apf.halted) || !list_empty_careful(&vcpu->async_pf.done) || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED - || vcpu->arch.nmi_pending || + || atomic_read(&vcpu->arch.nmi_queued) || (kvm_arch_interrupt_allowed(vcpu) && kvm_cpu_has_interrupt(vcpu)); } diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 13ee258442ae..cf4603ba866f 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -56,6 +56,7 @@ #include <linux/lguest_launcher.h> #include <linux/virtio_console.h> #include <linux/pm.h> +#include <linux/export.h> #include <asm/apic.h> #include <asm/lguest.h> #include <asm/paravirt.h> @@ -70,6 +71,7 @@ #include <asm/i387.h> #include <asm/stackprotector.h> #include <asm/reboot.h> /* for struct machine_ops */ +#include <asm/kvm_para.h> /*G:010 * Welcome to the Guest! @@ -455,6 +457,15 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + + /* + * This is used to detect if we're running under KVM. We might be, + * but that's a Host matter, not us. So say we're not. + */ + case KVM_CPUID_SIGNATURE: + *bx = *cx = *dx = 0; + break; + /* * 0x80000000 returns the highest Extended Function, so we futureproof * like we do above by limiting it to known fields. diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9c7378df740a..5db0490deb07 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -17,7 +17,7 @@ #include <asm/traps.h> /* dotraplinkage, ... */ #include <asm/pgalloc.h> /* pgd_*(), ... */ #include <asm/kmemcheck.h> /* kmemcheck_*(), ... */ -#include <asm/vsyscall.h> +#include <asm/fixmap.h> /* VSYSCALL_START */ /* * Page fault error code bits: diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index dbe34b931374..ea305856151c 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -108,16 +108,6 @@ static inline void get_head_page_multiple(struct page *page, int nr) SetPageReferenced(page); } -static inline void get_huge_page_tail(struct page *page) -{ - /* - * __split_huge_page_refcount() cannot run - * from under us. - */ - VM_BUG_ON(atomic_read(&page->_count) < 0); - atomic_inc(&page->_count); -} - static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 1dab5194fd9d..4b5ba85eb5c9 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -31,6 +31,10 @@ #include <linux/sched.h> #include <asm/elf.h> +struct __read_mostly va_alignment va_align = { + .flags = -1, +}; + static unsigned int stack_maxrandom_size(void) { unsigned int max = 0; @@ -42,7 +46,6 @@ static unsigned int stack_maxrandom_size(void) return max; } - /* * Top of mmap area (just below the process stack). * @@ -51,21 +54,6 @@ static unsigned int stack_maxrandom_size(void) #define MIN_GAP (128*1024*1024UL + stack_maxrandom_size()) #define MAX_GAP (TASK_SIZE/6*5) -/* - * True on X86_32 or when emulating IA32 on X86_64 - */ -static int mmap_is_ia32(void) -{ -#ifdef CONFIG_X86_32 - return 1; -#endif -#ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) - return 1; -#endif - return 0; -} - static int mmap_is_legacy(void) { if (current->personality & ADDR_COMPAT_LAYOUT) diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c index 99176094500b..41bd2a2d2c50 100644 --- a/arch/x86/pci/ce4100.c +++ b/arch/x86/pci/ce4100.c @@ -304,7 +304,7 @@ static int ce4100_conf_write(unsigned int seg, unsigned int bus, return pci_direct_conf1.write(seg, bus, devfn, reg, len, value); } -struct pci_raw_ops ce4100_pci_conf = { +static const struct pci_raw_ops ce4100_pci_conf = { .read = ce4100_conf_read, .write = ce4100_conf_write, }; diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c index 92df322e0b57..7962ccb4d9b2 100644 --- a/arch/x86/pci/common.c +++ b/arch/x86/pci/common.c @@ -33,8 +33,8 @@ int noioapicreroute = 1; int pcibios_last_bus = -1; unsigned long pirq_table_addr; struct pci_bus *pci_root_bus; -struct pci_raw_ops *raw_pci_ops; -struct pci_raw_ops *raw_pci_ext_ops; +const struct pci_raw_ops *__read_mostly raw_pci_ops; +const struct pci_raw_ops *__read_mostly raw_pci_ext_ops; int raw_pci_read(unsigned int domain, unsigned int bus, unsigned int devfn, int reg, int len, u32 *val) diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c index 4f2c70439d7f..15460590b8c5 100644 --- a/arch/x86/pci/direct.c +++ b/arch/x86/pci/direct.c @@ -79,7 +79,7 @@ static int pci_conf1_write(unsigned int seg, unsigned int bus, #undef PCI_CONF1_ADDRESS -struct pci_raw_ops pci_direct_conf1 = { +const struct pci_raw_ops pci_direct_conf1 = { .read = pci_conf1_read, .write = pci_conf1_write, }; @@ -175,7 +175,7 @@ static int pci_conf2_write(unsigned int seg, unsigned int bus, #undef PCI_CONF2_ADDRESS -struct pci_raw_ops pci_direct_conf2 = { +static const struct pci_raw_ops pci_direct_conf2 = { .read = pci_conf2_read, .write = pci_conf2_write, }; @@ -191,7 +191,7 @@ struct pci_raw_ops pci_direct_conf2 = { * This should be close to trivial, but it isn't, because there are buggy * chipsets (yes, you guessed it, by Intel and Compaq) that have no class ID. */ -static int __init pci_sanity_check(struct pci_raw_ops *o) +static int __init pci_sanity_check(const struct pci_raw_ops *o) { u32 x = 0; int year, devfn; diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c index 494f2e7ea2b4..794b092d01ae 100644 --- a/arch/x86/pci/i386.c +++ b/arch/x86/pci/i386.c @@ -26,6 +26,7 @@ #include <linux/types.h> #include <linux/kernel.h> +#include <linux/export.h> #include <linux/pci.h> #include <linux/init.h> #include <linux/ioport.h> diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c index c89266be6048..2c2aeabc2609 100644 --- a/arch/x86/pci/legacy.c +++ b/arch/x86/pci/legacy.c @@ -2,6 +2,7 @@ * legacy.c - traditional, old school PCI bus probing */ #include <linux/init.h> +#include <linux/export.h> #include <linux/pci.h> #include <asm/pci_x86.h> diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c index a3d9c54792ae..5372e86834c0 100644 --- a/arch/x86/pci/mmconfig_32.c +++ b/arch/x86/pci/mmconfig_32.c @@ -117,7 +117,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, return 0; } -static struct pci_raw_ops pci_mmcfg = { +static const struct pci_raw_ops pci_mmcfg = { .read = pci_mmcfg_read, .write = pci_mmcfg_write, }; diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c index e783841bd1d7..915a493502cb 100644 --- a/arch/x86/pci/mmconfig_64.c +++ b/arch/x86/pci/mmconfig_64.c @@ -81,7 +81,7 @@ static int pci_mmcfg_write(unsigned int seg, unsigned int bus, return 0; } -static struct pci_raw_ops pci_mmcfg = { +static const struct pci_raw_ops pci_mmcfg = { .read = pci_mmcfg_read, .write = pci_mmcfg_write, }; diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c index 512a88c41501..51abf02f9226 100644 --- a/arch/x86/pci/numaq_32.c +++ b/arch/x86/pci/numaq_32.c @@ -110,7 +110,7 @@ static int pci_conf1_mq_write(unsigned int seg, unsigned int bus, #undef PCI_CONF1_MQ_ADDRESS -static struct pci_raw_ops pci_direct_conf1_mq = { +static const struct pci_raw_ops pci_direct_conf1_mq = { .read = pci_conf1_mq_read, .write = pci_conf1_mq_write }; diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c index 5262603b04d9..7043a4f0e98a 100644 --- a/arch/x86/pci/olpc.c +++ b/arch/x86/pci/olpc.c @@ -301,7 +301,7 @@ static int pci_olpc_write(unsigned int seg, unsigned int bus, return 0; } -static struct pci_raw_ops pci_olpc_conf = { +static const struct pci_raw_ops pci_olpc_conf = { .read = pci_olpc_read, .write = pci_olpc_write, }; diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c index f68553551467..db0e9a51e611 100644 --- a/arch/x86/pci/pcbios.c +++ b/arch/x86/pci/pcbios.c @@ -303,7 +303,7 @@ static int pci_bios_write(unsigned int seg, unsigned int bus, * Function table for BIOS32 access */ -static struct pci_raw_ops pci_bios_access = { +static const struct pci_raw_ops pci_bios_access = { .read = pci_bios_read, .write = pci_bios_write }; @@ -312,7 +312,7 @@ static struct pci_raw_ops pci_bios_access = { * Try to find PCI BIOS. */ -static struct pci_raw_ops * __devinit pci_find_bios(void) +static const struct pci_raw_ops * __devinit pci_find_bios(void) { union bios32 *check; unsigned char sum; diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile index 021eee91c056..8d874396cb29 100644 --- a/arch/x86/platform/Makefile +++ b/arch/x86/platform/Makefile @@ -1,6 +1,7 @@ # Platform specific code goes here obj-y += ce4100/ obj-y += efi/ +obj-y += geode/ obj-y += iris/ obj-y += mrst/ obj-y += olpc/ diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c index 28071bb31db7..4c61b52191eb 100644 --- a/arch/x86/platform/ce4100/ce4100.c +++ b/arch/x86/platform/ce4100/ce4100.c @@ -109,7 +109,7 @@ static __init void sdv_serial_fixup(void) } #else -static inline void sdv_serial_fixup(void); +static inline void sdv_serial_fixup(void) {}; #endif static void __init sdv_arch_setup(void) diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c index 3ae4128013e6..37718f0f053d 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -29,6 +29,7 @@ #include <linux/kernel.h> #include <linux/init.h> #include <linux/efi.h> +#include <linux/export.h> #include <linux/bootmem.h> #include <linux/memblock.h> #include <linux/spinlock.h> diff --git a/arch/x86/platform/efi/efi_32.c b/arch/x86/platform/efi/efi_32.c index 5cab48ee61a4..e36bf714cb77 100644 --- a/arch/x86/platform/efi/efi_32.c +++ b/arch/x86/platform/efi/efi_32.c @@ -25,6 +25,7 @@ #include <linux/efi.h> #include <asm/io.h> +#include <asm/desc.h> #include <asm/page.h> #include <asm/pgtable.h> #include <asm/tlbflush.h> diff --git a/arch/x86/platform/geode/Makefile b/arch/x86/platform/geode/Makefile new file mode 100644 index 000000000000..07c9cd05021a --- /dev/null +++ b/arch/x86/platform/geode/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_ALIX) += alix.o diff --git a/arch/x86/platform/geode/alix.c b/arch/x86/platform/geode/alix.c new file mode 100644 index 000000000000..ca1973699d3d --- /dev/null +++ b/arch/x86/platform/geode/alix.c @@ -0,0 +1,142 @@ +/* + * System Specific setup for PCEngines ALIX. + * At the moment this means setup of GPIO control of LEDs + * on Alix.2/3/6 boards. + * + * + * Copyright (C) 2008 Constantin Baranov <const@mimas.ru> + * Copyright (C) 2011 Ed Wildgoose <kernel@wildgooses.com> + * + * TODO: There are large similarities with leds-net5501.c + * by Alessandro Zummo <a.zummo@towertech.it> + * In the future leds-net5501.c should be migrated over to platform + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/io.h> +#include <linux/string.h> +#include <linux/module.h> +#include <linux/leds.h> +#include <linux/platform_device.h> +#include <linux/gpio.h> + +#include <asm/geode.h> + +static int force = 0; +module_param(force, bool, 0444); +/* FIXME: Award bios is not automatically detected as Alix platform */ +MODULE_PARM_DESC(force, "Force detection as ALIX.2/ALIX.3 platform"); + +static struct gpio_led alix_leds[] = { + { + .name = "alix:1", + .gpio = 6, + .default_trigger = "default-on", + .active_low = 1, + }, + { + .name = "alix:2", + .gpio = 25, + .default_trigger = "default-off", + .active_low = 1, + }, + { + .name = "alix:3", + .gpio = 27, + .default_trigger = "default-off", + .active_low = 1, + }, +}; + +static struct gpio_led_platform_data alix_leds_data = { + .num_leds = ARRAY_SIZE(alix_leds), + .leds = alix_leds, +}; + +static struct platform_device alix_leds_dev = { + .name = "leds-gpio", + .id = -1, + .dev.platform_data = &alix_leds_data, +}; + +static void __init register_alix(void) +{ + /* Setup LED control through leds-gpio driver */ + platform_device_register(&alix_leds_dev); +} + +static int __init alix_present(unsigned long bios_phys, + const char *alix_sig, + size_t alix_sig_len) +{ + const size_t bios_len = 0x00010000; + const char *bios_virt; + const char *scan_end; + const char *p; + char name[64]; + + if (force) { + printk(KERN_NOTICE "%s: forced to skip BIOS test, " + "assume system is ALIX.2/ALIX.3\n", + KBUILD_MODNAME); + return 1; + } + + bios_virt = phys_to_virt(bios_phys); + scan_end = bios_virt + bios_len - (alix_sig_len + 2); + for (p = bios_virt; p < scan_end; p++) { + const char *tail; + char *a; + + if (memcmp(p, alix_sig, alix_sig_len) != 0) + continue; + + memcpy(name, p, sizeof(name)); + + /* remove the first \0 character from string */ + a = strchr(name, '\0'); + if (a) + *a = ' '; + + /* cut the string at a newline */ + a = strchr(name, '\r'); + if (a) + *a = '\0'; + + tail = p + alix_sig_len; + if ((tail[0] == '2' || tail[0] == '3')) { + printk(KERN_INFO + "%s: system is recognized as \"%s\"\n", + KBUILD_MODNAME, name); + return 1; + } + } + + return 0; +} + +static int __init alix_init(void) +{ + const char tinybios_sig[] = "PC Engines ALIX."; + const char coreboot_sig[] = "PC Engines\0ALIX."; + + if (!is_geode()) + return 0; + + if (alix_present(0xf0000, tinybios_sig, sizeof(tinybios_sig) - 1) || + alix_present(0x500, coreboot_sig, sizeof(coreboot_sig) - 1)) + register_alix(); + + return 0; +} + +module_init(alix_init); + +MODULE_AUTHOR("Ed Wildgoose <kernel@wildgooses.com>"); +MODULE_DESCRIPTION("PCEngines ALIX System Setup"); +MODULE_LICENSE("GPL"); diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c index e6379526675b..541020df0da6 100644 --- a/arch/x86/platform/mrst/mrst.c +++ b/arch/x86/platform/mrst/mrst.c @@ -26,6 +26,8 @@ #include <linux/platform_device.h> #include <linux/irq.h> #include <linux/module.h> +#include <linux/notifier.h> +#include <linux/mfd/intel_msic.h> #include <asm/setup.h> #include <asm/mpspec_def.h> @@ -483,7 +485,130 @@ static void __init *no_platform_data(void *info) return NULL; } +static struct resource msic_resources[] = { + { + .start = INTEL_MSIC_IRQ_PHYS_BASE, + .end = INTEL_MSIC_IRQ_PHYS_BASE + 64 - 1, + .flags = IORESOURCE_MEM, + }, +}; + +static struct intel_msic_platform_data msic_pdata; + +static struct platform_device msic_device = { + .name = "intel_msic", + .id = -1, + .dev = { + .platform_data = &msic_pdata, + }, + .num_resources = ARRAY_SIZE(msic_resources), + .resource = msic_resources, +}; + +static inline bool mrst_has_msic(void) +{ + return mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL; +} + +static int msic_scu_status_change(struct notifier_block *nb, + unsigned long code, void *data) +{ + if (code == SCU_DOWN) { + platform_device_unregister(&msic_device); + return 0; + } + + return platform_device_register(&msic_device); +} + +static int __init msic_init(void) +{ + static struct notifier_block msic_scu_notifier = { + .notifier_call = msic_scu_status_change, + }; + + /* + * We need to be sure that the SCU IPC is ready before MSIC device + * can be registered. + */ + if (mrst_has_msic()) + intel_scu_notifier_add(&msic_scu_notifier); + + return 0; +} +arch_initcall(msic_init); + +/* + * msic_generic_platform_data - sets generic platform data for the block + * @info: pointer to the SFI device table entry for this block + * @block: MSIC block + * + * Function sets IRQ number from the SFI table entry for given device to + * the MSIC platform data. + */ +static void *msic_generic_platform_data(void *info, enum intel_msic_block block) +{ + struct sfi_device_table_entry *entry = info; + + BUG_ON(block < 0 || block >= INTEL_MSIC_BLOCK_LAST); + msic_pdata.irq[block] = entry->irq; + + return no_platform_data(info); +} + +static void *msic_battery_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_BATTERY); +} + +static void *msic_gpio_platform_data(void *info) +{ + static struct intel_msic_gpio_pdata pdata; + int gpio = get_gpio_by_name("msic_gpio_base"); + + if (gpio < 0) + return NULL; + + pdata.gpio_base = gpio; + msic_pdata.gpio = &pdata; + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_GPIO); +} + +static void *msic_audio_platform_data(void *info) +{ + struct platform_device *pdev; + + pdev = platform_device_register_simple("sst-platform", -1, NULL, 0); + if (IS_ERR(pdev)) { + pr_err("failed to create audio platform device\n"); + return NULL; + } + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_AUDIO); +} + +static void *msic_power_btn_platform_data(void *info) +{ + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_POWER_BTN); +} + +static void *msic_ocd_platform_data(void *info) +{ + static struct intel_msic_ocd_pdata pdata; + int gpio = get_gpio_by_name("ocd_gpio"); + + if (gpio < 0) + return NULL; + + pdata.gpio = gpio; + msic_pdata.ocd = &pdata; + + return msic_generic_platform_data(info, INTEL_MSIC_BLOCK_OCD); +} + static const struct devs_id __initconst device_ids[] = { + {"bma023", SFI_DEV_TYPE_I2C, 1, &no_platform_data}, {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data}, {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data}, {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data}, @@ -491,7 +616,14 @@ static const struct devs_id __initconst device_ids[] = { {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data}, {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data}, {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, - {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data}, + + /* MSIC subdevices */ + {"msic_battery", SFI_DEV_TYPE_IPC, 1, &msic_battery_platform_data}, + {"msic_gpio", SFI_DEV_TYPE_IPC, 1, &msic_gpio_platform_data}, + {"msic_audio", SFI_DEV_TYPE_IPC, 1, &msic_audio_platform_data}, + {"msic_power_btn", SFI_DEV_TYPE_IPC, 1, &msic_power_btn_platform_data}, + {"msic_ocd", SFI_DEV_TYPE_IPC, 1, &msic_ocd_platform_data}, + {}, }; @@ -558,6 +690,9 @@ static void __init intel_scu_i2c_device_register(int bus, i2c_devs[i2c_next_dev++] = new_dev; } +BLOCKING_NOTIFIER_HEAD(intel_scu_notifier); +EXPORT_SYMBOL_GPL(intel_scu_notifier); + /* Called by IPC driver */ void intel_scu_devices_create(void) { @@ -582,6 +717,7 @@ void intel_scu_devices_create(void) } else i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1); } + intel_scu_notifier_post(SCU_AVAILABLE, 0L); } EXPORT_SYMBOL_GPL(intel_scu_devices_create); @@ -590,6 +726,8 @@ void intel_scu_devices_destroy(void) { int i; + intel_scu_notifier_post(SCU_DOWN, 0L); + for (i = 0; i < ipc_next_dev; i++) platform_device_del(ipc_devs[i]); } @@ -606,19 +744,37 @@ static void __init install_irq_resource(struct platform_device *pdev, int irq) platform_device_add_resources(pdev, &res, 1); } -static void __init sfi_handle_ipc_dev(struct platform_device *pdev) +static void __init sfi_handle_ipc_dev(struct sfi_device_table_entry *entry) { const struct devs_id *dev = device_ids; + struct platform_device *pdev; void *pdata = NULL; while (dev->name[0]) { if (dev->type == SFI_DEV_TYPE_IPC && - !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) { - pdata = dev->get_platform_data(pdev); + !strncmp(dev->name, entry->name, SFI_NAME_LEN)) { + pdata = dev->get_platform_data(entry); break; } dev++; } + + /* + * On Medfield the platform device creation is handled by the MSIC + * MFD driver so we don't need to do it here. + */ + if (mrst_has_msic()) + return; + + /* ID as IRQ is a hack that will go away */ + pdev = platform_device_alloc(entry->name, entry->irq); + if (pdev == NULL) { + pr_err("out of memory for SFI platform device '%s'.\n", + entry->name); + return; + } + install_irq_resource(pdev, entry->irq); + pdev->dev.platform_data = pdata; intel_scu_device_register(pdev); } @@ -671,7 +827,6 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) struct sfi_device_table_entry *pentry; struct spi_board_info spi_info; struct i2c_board_info i2c_info; - struct platform_device *pdev; int num, i, bus; int ioapic; struct io_apic_irq_attr irq_attr; @@ -699,17 +854,9 @@ static int __init sfi_parse_devs(struct sfi_table_header *table) switch (pentry->type) { case SFI_DEV_TYPE_IPC: - /* ID as IRQ is a hack that will go away */ - pdev = platform_device_alloc(pentry->name, irq); - if (pdev == NULL) { - pr_err("out of memory for SFI platform device '%s'.\n", - pentry->name); - continue; - } - install_irq_resource(pdev, irq); pr_debug("info[%2d]: IPC bus, name = %16.16s, " - "irq = 0x%2x\n", i, pentry->name, irq); - sfi_handle_ipc_dev(pdev); + "irq = 0x%2x\n", i, pentry->name, pentry->irq); + sfi_handle_ipc_dev(pentry); break; case SFI_DEV_TYPE_SPI: memset(&spi_info, 0, sizeof(spi_info)); diff --git a/arch/x86/platform/mrst/pmu.c b/arch/x86/platform/mrst/pmu.c index 9281da7d91bd..c0ac06da57ac 100644 --- a/arch/x86/platform/mrst/pmu.c +++ b/arch/x86/platform/mrst/pmu.c @@ -70,7 +70,7 @@ static struct mrst_device mrst_devs[] = { /* 24 */ { 0x4110, 0 }, /* Lincroft */ }; -/* n.b. We ignore PCI-id 0x815 in LSS9 b/c MeeGo has no driver for it */ +/* n.b. We ignore PCI-id 0x815 in LSS9 b/c Linux has no driver for it */ static u16 mrst_lss9_pci_ids[] = {0x080a, 0x0814, 0}; static u16 mrst_lss10_pci_ids[] = {0x0800, 0x0801, 0x0802, 0x0803, 0x0804, 0x0805, 0x080f, 0}; diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c index 6d5dbcdd444a..225bd0f0f675 100644 --- a/arch/x86/platform/mrst/vrtc.c +++ b/arch/x86/platform/mrst/vrtc.c @@ -18,6 +18,7 @@ */ #include <linux/kernel.h> +#include <linux/export.h> #include <linux/init.h> #include <linux/sfi.h> #include <linux/platform_device.h> @@ -75,8 +76,8 @@ unsigned long vrtc_get_time(void) spin_unlock_irqrestore(&rtc_lock, flags); - /* vRTC YEAR reg contains the offset to 1960 */ - year += 1960; + /* vRTC YEAR reg contains the offset to 1972 */ + year += 1972; printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d " "mon: %d year: %d\n", sec, min, hour, mday, mon, year); diff --git a/arch/x86/platform/olpc/olpc-xo1-pm.c b/arch/x86/platform/olpc/olpc-xo1-pm.c index 6f3855a5a2f7..0ce8616c88ae 100644 --- a/arch/x86/platform/olpc/olpc-xo1-pm.c +++ b/arch/x86/platform/olpc/olpc-xo1-pm.c @@ -14,6 +14,7 @@ #include <linux/cs5535.h> #include <linux/platform_device.h> +#include <linux/export.h> #include <linux/pm.h> #include <linux/mfd/core.h> #include <linux/suspend.h> diff --git a/arch/x86/platform/uv/bios_uv.c b/arch/x86/platform/uv/bios_uv.c index 8bc57baaa9ad..766612137a62 100644 --- a/arch/x86/platform/uv/bios_uv.c +++ b/arch/x86/platform/uv/bios_uv.c @@ -20,6 +20,7 @@ */ #include <linux/efi.h> +#include <linux/export.h> #include <asm/efi.h> #include <linux/io.h> #include <asm/uv/bios.h> diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c index db8b915f54bc..5b552198f774 100644 --- a/arch/x86/platform/uv/tlb_uv.c +++ b/arch/x86/platform/uv/tlb_uv.c @@ -115,9 +115,6 @@ early_param("nobau", setup_nobau); /* base pnode in this partition */ static int uv_base_pnode __read_mostly; -/* position of pnode (which is nasid>>1): */ -static int uv_nshift __read_mostly; -static unsigned long uv_mmask __read_mostly; static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); @@ -1435,7 +1432,7 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) { int i; int cpu; - unsigned long pa; + unsigned long gpa; unsigned long m; unsigned long n; size_t dsize; @@ -1451,9 +1448,9 @@ static void activation_descriptor_init(int node, int pnode, int base_pnode) bau_desc = kmalloc_node(dsize, GFP_KERNEL, node); BUG_ON(!bau_desc); - pa = uv_gpa(bau_desc); /* need the real nasid*/ - n = pa >> uv_nshift; - m = pa & uv_mmask; + gpa = uv_gpa(bau_desc); + n = uv_gpa_to_gnode(gpa); + m = uv_gpa_to_offset(gpa); /* the 14-bit pnode */ write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m)); @@ -1525,9 +1522,9 @@ static void pq_init(int node, int pnode) bcp->queue_last = pqp + (DEST_Q_SIZE - 1); } /* - * need the pnode of where the memory was really allocated + * need the gnode of where the memory was really allocated */ - pn = uv_gpa(pqp) >> uv_nshift; + pn = uv_gpa_to_gnode(uv_gpa(pqp)); first = uv_physnodeaddr(pqp); pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first; last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)); @@ -1837,8 +1834,6 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu)); } - uv_nshift = uv_hub_info->m_val; - uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); spin_lock_init(&disable_lock); congested_cycles = usec_2_cycles(congested_respns_us); diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 87bb35e34ef1..f10c0afa1cb4 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -9,6 +9,7 @@ */ #include <linux/suspend.h> +#include <linux/export.h> #include <linux/smp.h> #include <asm/pgtable.h> diff --git a/arch/x86/um/Kconfig b/arch/x86/um/Kconfig new file mode 100644 index 000000000000..1d97bd84b6fb --- /dev/null +++ b/arch/x86/um/Kconfig @@ -0,0 +1,67 @@ +mainmenu "User Mode Linux/$SUBARCH $KERNELVERSION Kernel Configuration" + +source "arch/um/Kconfig.common" + +menu "UML-specific options" + +menu "Host processor type and features" + +config CMPXCHG_LOCAL + bool + default n + +config CMPXCHG_DOUBLE + bool + default n + +source "arch/x86/Kconfig.cpu" + +endmenu + +config UML_X86 + def_bool y + select GENERIC_FIND_FIRST_BIT + +config 64BIT + bool + default SUBARCH = "x86_64" + +config X86_32 + def_bool !64BIT + select HAVE_AOUT + +config X86_64 + def_bool 64BIT + +config RWSEM_XCHGADD_ALGORITHM + def_bool X86_XADD && 64BIT + +config RWSEM_GENERIC_SPINLOCK + def_bool !RWSEM_XCHGADD_ALGORITHM + +config 3_LEVEL_PGTABLES + bool "Three-level pagetables (EXPERIMENTAL)" if !64BIT + default 64BIT + depends on EXPERIMENTAL + help + Three-level pagetables will let UML have more than 4G of physical + memory. All the memory that can't be mapped directly will be treated + as high memory. + + However, this it experimental on 32-bit architectures, so if unsure say + N (on x86-64 it's automatically enabled, instead, as it's safe there). + +config ARCH_HAS_SC_SIGNALS + def_bool !64BIT + +config ARCH_REUSE_HOST_VSYSCALL_AREA + def_bool !64BIT + +config GENERIC_HWEIGHT + def_bool y + +source "arch/um/Kconfig.um" + +endmenu + +source "arch/um/Kconfig.rest" diff --git a/arch/x86/um/Makefile b/arch/x86/um/Makefile new file mode 100644 index 000000000000..8fb58400e415 --- /dev/null +++ b/arch/x86/um/Makefile @@ -0,0 +1,45 @@ +# +# Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# + +ifeq ($(CONFIG_X86_32),y) + BITS := 32 +else + BITS := 64 +endif + +obj-y = bug.o bugs_$(BITS).o delay.o fault.o ksyms.o ldt.o \ + ptrace_$(BITS).o ptrace_user.o setjmp_$(BITS).o signal.o \ + stub_$(BITS).o stub_segv.o syscalls_$(BITS).o \ + sys_call_table_$(BITS).o sysrq_$(BITS).o tls_$(BITS).o \ + mem_$(BITS).o subarch.o os-$(OS)/ + +ifeq ($(CONFIG_X86_32),y) + +obj-y += checksum_32.o +obj-$(CONFIG_BINFMT_ELF) += elfcore.o + +subarch-y = ../lib/string_32.o ../lib/atomic64_32.o ../lib/atomic64_cx8_32.o +subarch-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += ../lib/rwsem.o +subarch-$(CONFIG_HIGHMEM) += ../mm/highmem_32.o + +else + +obj-y += vdso/ + +subarch-y = ../lib/csum-partial_64.o ../lib/memcpy_64.o ../lib/thunk_64.o \ + ../lib/rwsem.o + +endif + +subarch-$(CONFIG_MODULES) += ../kernel/module.o + +USER_OBJS := bugs_$(BITS).o ptrace_user.o fault.o + +extra-y += user-offsets.s +$(obj)/user-offsets.s: c_flags = -Wp,-MD,$(depfile) $(USER_CFLAGS) + +UNPROFILE_OBJS := stub_segv.o +CFLAGS_stub_segv.o := $(CFLAGS_NO_HARDENING) + +include arch/um/scripts/Makefile.rules diff --git a/arch/x86/um/asm/apic.h b/arch/x86/um/asm/apic.h new file mode 100644 index 000000000000..876dee84ab11 --- /dev/null +++ b/arch/x86/um/asm/apic.h @@ -0,0 +1,4 @@ +#ifndef __UM_APIC_H +#define __UM_APIC_H + +#endif diff --git a/arch/x86/um/asm/arch_hweight.h b/arch/x86/um/asm/arch_hweight.h new file mode 100644 index 000000000000..c656cf443f4a --- /dev/null +++ b/arch/x86/um/asm/arch_hweight.h @@ -0,0 +1,6 @@ +#ifndef _ASM_UM_HWEIGHT_H +#define _ASM_UM_HWEIGHT_H + +#include <asm-generic/bitops/arch_hweight.h> + +#endif diff --git a/arch/x86/um/asm/archparam.h b/arch/x86/um/asm/archparam.h new file mode 100644 index 000000000000..c17cf68dda0f --- /dev/null +++ b/arch/x86/um/asm/archparam.h @@ -0,0 +1,20 @@ +/* + * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) + * Copyright 2003 PathScale, Inc. + * Licensed under the GPL + */ + +#ifndef __UM_ARCHPARAM_H +#define __UM_ARCHPARAM_H + +#ifdef CONFIG_X86_32 + +#ifdef CONFIG_X86_PAE +#define LAST_PKMAP 512 +#else +#define LAST_PKMAP 1024 +#endif + +#endif + +#endif diff --git a/arch/x86/um/asm/checksum.h b/arch/x86/um/asm/checksum.h new file mode 100644 index 000000000000..b6efe2381b5d --- /dev/null +++ b/arch/x86/um/asm/checksum.h @@ -0,0 +1,10 @@ +#ifndef __UM_CHECKSUM_H +#define __UM_CHECKSUM_H + +#ifdef CONFIG_X86_32 +# include "checksum_32.h" +#else +# include "checksum_64.h" +#endif + +#endif diff --git a/arch/x86/um/asm/checksum_32.h b/arch/x86/um/asm/checksum_32.h new file mode 100644 index 000000000000..caab74252e27 --- /dev/null +++ b/arch/x86/um/asm/checksum_32.h @@ -0,0 +1,201 @@ +/* + * Licensed under the GPL + */ + +#ifndef __UM_SYSDEP_CHECKSUM_H +#define __UM_SYSDEP_CHECKSUM_H + +#include "linux/in6.h" +#include "linux/string.h" + +/* + * computes the checksum of a memory block at buff, length len, + * and adds in "sum" (32-bit) + * + * returns a 32-bit number suitable for feeding into itself + * or csum_tcpudp_magic + * + * this function must be called with even lengths, except + * for the last fragment, which may be odd + * + * it's best to have buff aligned on a 32-bit boundary + */ +__wsum csum_partial(const void *buff, int len, __wsum sum); + +/* + * Note: when you get a NULL pointer exception here this means someone + * passed in an incorrect kernel address to one of these functions. + * + * If you use these functions directly please don't forget the + * access_ok(). + */ + +static __inline__ +__wsum csum_partial_copy_nocheck(const void *src, void *dst, + int len, __wsum sum) +{ + memcpy(dst, src, len); + return csum_partial(dst, len, sum); +} + +/* + * the same as csum_partial, but copies from src while it + * checksums, and handles user-space pointer exceptions correctly, when needed. + * + * here even more important to align src and dst on a 32-bit (or even + * better 64-bit) boundary + */ + +static __inline__ +__wsum csum_partial_copy_from_user(const void __user *src, void *dst, + int len, __wsum sum, int *err_ptr) +{ + if (copy_from_user(dst, src, len)) { + *err_ptr = -EFAULT; + return (__force __wsum)-1; + } + + return csum_partial(dst, len, sum); +} + +/* + * This is a version of ip_compute_csum() optimized for IP headers, + * which always checksum on 4 octet boundaries. + * + * By Jorge Cwik <jorge@laser.satlink.net>, adapted for linux by + * Arnt Gulbrandsen. + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned int sum; + + __asm__ __volatile__( + "movl (%1), %0 ;\n" + "subl $4, %2 ;\n" + "jbe 2f ;\n" + "addl 4(%1), %0 ;\n" + "adcl 8(%1), %0 ;\n" + "adcl 12(%1), %0 ;\n" +"1: adcl 16(%1), %0 ;\n" + "lea 4(%1), %1 ;\n" + "decl %2 ;\n" + "jne 1b ;\n" + "adcl $0, %0 ;\n" + "movl %0, %2 ;\n" + "shrl $16, %0 ;\n" + "addw %w2, %w0 ;\n" + "adcl $0, %0 ;\n" + "notl %0 ;\n" +"2: ;\n" + /* Since the input registers which are loaded with iph and ipl + are modified, we must also specify them as outputs, or gcc + will assume they contain their original values. */ + : "=r" (sum), "=r" (iph), "=r" (ihl) + : "1" (iph), "2" (ihl) + : "memory"); + return (__force __sum16)sum; +} + +/* + * Fold a partial checksum + */ + +static inline __sum16 csum_fold(__wsum sum) +{ + __asm__( + "addl %1, %0 ;\n" + "adcl $0xffff, %0 ;\n" + : "=r" (sum) + : "r" ((__force u32)sum << 16), + "0" ((__force u32)sum & 0xffff0000) + ); + return (__force __sum16)(~(__force u32)sum >> 16); +} + +static inline __wsum csum_tcpudp_nofold(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + __asm__( + "addl %1, %0 ;\n" + "adcl %2, %0 ;\n" + "adcl %3, %0 ;\n" + "adcl $0, %0 ;\n" + : "=r" (sum) + : "g" (daddr), "g"(saddr), "g"((len + proto) << 8), "0"(sum)); + return sum; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); +} + +/* + * this routine is used for miscellaneous IP-like checksums, mainly + * in icmp.c + */ + +static inline __sum16 ip_compute_csum(const void *buff, int len) +{ + return csum_fold (csum_partial(buff, len, 0)); +} + +#define _HAVE_ARCH_IPV6_CSUM +static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, + const struct in6_addr *daddr, + __u32 len, unsigned short proto, + __wsum sum) +{ + __asm__( + "addl 0(%1), %0 ;\n" + "adcl 4(%1), %0 ;\n" + "adcl 8(%1), %0 ;\n" + "adcl 12(%1), %0 ;\n" + "adcl 0(%2), %0 ;\n" + "adcl 4(%2), %0 ;\n" + "adcl 8(%2), %0 ;\n" + "adcl 12(%2), %0 ;\n" + "adcl %3, %0 ;\n" + "adcl %4, %0 ;\n" + "adcl $0, %0 ;\n" + : "=&r" (sum) + : "r" (saddr), "r" (daddr), + "r"(htonl(len)), "r"(htonl(proto)), "0"(sum)); + + return csum_fold(sum); +} + +/* + * Copy and checksum to user + */ +#define HAVE_CSUM_COPY_USER +static __inline__ __wsum csum_and_copy_to_user(const void *src, + void __user *dst, + int len, __wsum sum, int *err_ptr) +{ + if (access_ok(VERIFY_WRITE, dst, len)) { + if (copy_to_user(dst, src, len)) { + *err_ptr = -EFAULT; + return (__force __wsum)-1; + } + + return csum_partial(src, len, sum); + } + + if (len) + *err_ptr = -EFAULT; + + return (__force __wsum)-1; /* invalid checksum */ +} + +#endif + diff --git a/arch/x86/um/asm/checksum_64.h b/arch/x86/um/asm/checksum_64.h new file mode 100644 index 000000000000..a5be9031ea85 --- /dev/null +++ b/arch/x86/um/asm/checksum_64.h @@ -0,0 +1,144 @@ +/* + * Licensed under the GPL + */ + +#ifndef __UM_SYSDEP_CHECKSUM_H +#define __UM_SYSDEP_CHECKSUM_H + +#include "linux/string.h" +#include "linux/in6.h" +#include "asm/uaccess.h" + +extern __wsum csum_partial(const void *buff, int len, __wsum sum); + +/* + * Note: when you get a NULL pointer exception here this means someone + * passed in an incorrect kernel address to one of these functions. + * + * If you use these functions directly please don't forget the + * access_ok(). + */ + +static __inline__ +__wsum csum_partial_copy_nocheck(const void *src, void *dst, + int len, __wsum sum) +{ + memcpy(dst, src, len); + return(csum_partial(dst, len, sum)); +} + +static __inline__ +__wsum csum_partial_copy_from_user(const void __user *src, + void *dst, int len, __wsum sum, + int *err_ptr) +{ + if (copy_from_user(dst, src, len)) { + *err_ptr = -EFAULT; + return (__force __wsum)-1; + } + return csum_partial(dst, len, sum); +} + +/** + * csum_fold - Fold and invert a 32bit checksum. + * sum: 32bit unfolded sum + * + * Fold a 32bit running checksum to 16bit and invert it. This is usually + * the last step before putting a checksum into a packet. + * Make sure not to mix with 64bit checksums. + */ +static inline __sum16 csum_fold(__wsum sum) +{ + __asm__( + " addl %1,%0\n" + " adcl $0xffff,%0" + : "=r" (sum) + : "r" ((__force u32)sum << 16), + "0" ((__force u32)sum & 0xffff0000) + ); + return (__force __sum16)(~(__force u32)sum >> 16); +} + +/** + * csum_tcpup_nofold - Compute an IPv4 pseudo header checksum. + * @saddr: source address + * @daddr: destination address + * @len: length of packet + * @proto: ip protocol of packet + * @sum: initial sum to be added in (32bit unfolded) + * + * Returns the pseudo header checksum the input data. Result is + * 32bit unfolded. + */ +static inline __wsum +csum_tcpudp_nofold(__be32 saddr, __be32 daddr, unsigned short len, + unsigned short proto, __wsum sum) +{ + asm(" addl %1, %0\n" + " adcl %2, %0\n" + " adcl %3, %0\n" + " adcl $0, %0\n" + : "=r" (sum) + : "g" (daddr), "g" (saddr), "g" ((len + proto) << 8), "0" (sum)); + return sum; +} + +/* + * computes the checksum of the TCP/UDP pseudo-header + * returns a 16-bit checksum, already complemented + */ +static inline __sum16 csum_tcpudp_magic(__be32 saddr, __be32 daddr, + unsigned short len, + unsigned short proto, + __wsum sum) +{ + return csum_fold(csum_tcpudp_nofold(saddr,daddr,len,proto,sum)); +} + +/** + * ip_fast_csum - Compute the IPv4 header checksum efficiently. + * iph: ipv4 header + * ihl: length of header / 4 + */ +static inline __sum16 ip_fast_csum(const void *iph, unsigned int ihl) +{ + unsigned int sum; + + asm( " movl (%1), %0\n" + " subl $4, %2\n" + " jbe 2f\n" + " addl 4(%1), %0\n" + " adcl 8(%1), %0\n" + " adcl 12(%1), %0\n" + "1: adcl 16(%1), %0\n" + " lea 4(%1), %1\n" + " decl %2\n" + " jne 1b\n" + " adcl $0, %0\n" + " movl %0, %2\n" + " shrl $16, %0\n" + " addw %w2, %w0\n" + " adcl $0, %0\n" + " notl %0\n" + "2:" + /* Since the input registers which are loaded with iph and ipl + are modified, we must also specify them as outputs, or gcc + will assume they contain their original values. */ + : "=r" (sum), "=r" (iph), "=r" (ihl) + : "1" (iph), "2" (ihl) + : "memory"); + return (__force __sum16)sum; +} + +static inline unsigned add32_with_carry(unsigned a, unsigned b) +{ + asm("addl %2,%0\n\t" + "adcl $0,%0" + : "=r" (a) + : "0" (a), "r" (b)); + return a; +} + +extern __sum16 ip_compute_csum(const void *buff, int len); + +#endif diff --git a/arch/x86/um/asm/desc.h b/arch/x86/um/asm/desc.h new file mode 100644 index 000000000000..4ec34a51b62c --- /dev/null +++ b/arch/x86/um/asm/desc.h @@ -0,0 +1,16 @@ +#ifndef __UM_DESC_H +#define __UM_DESC_H + +/* Taken from asm-i386/desc.h, it's the only thing we need. The rest wouldn't + * compile, and has never been used. */ +#define LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +#endif diff --git a/arch/x86/um/asm/elf.h b/arch/x86/um/asm/elf.h new file mode 100644 index 000000000000..f3b0633b69a1 --- /dev/null +++ b/arch/x86/um/asm/elf.h @@ -0,0 +1,221 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ +#ifndef __UM_ELF_X86_H +#define __UM_ELF_X86_H + +#include <asm/user.h> +#include "skas.h" + +#ifdef CONFIG_X86_32 + +#define R_386_NONE 0 +#define R_386_32 1 +#define R_386_PC32 2 +#define R_386_GOT32 3 +#define R_386_PLT32 4 +#define R_386_COPY 5 +#define R_386_GLOB_DAT 6 +#define R_386_JMP_SLOT 7 +#define R_386_RELATIVE 8 +#define R_386_GOTOFF 9 +#define R_386_GOTPC 10 +#define R_386_NUM 11 + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch(x) \ + (((x)->e_machine == EM_386) || ((x)->e_machine == EM_486)) + +#define ELF_CLASS ELFCLASS32 +#define ELF_DATA ELFDATA2LSB +#define ELF_ARCH EM_386 + +#define ELF_PLAT_INIT(regs, load_addr) do { \ + PT_REGS_EBX(regs) = 0; \ + PT_REGS_ECX(regs) = 0; \ + PT_REGS_EDX(regs) = 0; \ + PT_REGS_ESI(regs) = 0; \ + PT_REGS_EDI(regs) = 0; \ + PT_REGS_EBP(regs) = 0; \ + PT_REGS_EAX(regs) = 0; \ +} while (0) + +/* Shamelessly stolen from include/asm-i386/elf.h */ + +#define ELF_CORE_COPY_REGS(pr_reg, regs) do { \ + pr_reg[0] = PT_REGS_EBX(regs); \ + pr_reg[1] = PT_REGS_ECX(regs); \ + pr_reg[2] = PT_REGS_EDX(regs); \ + pr_reg[3] = PT_REGS_ESI(regs); \ + pr_reg[4] = PT_REGS_EDI(regs); \ + pr_reg[5] = PT_REGS_EBP(regs); \ + pr_reg[6] = PT_REGS_EAX(regs); \ + pr_reg[7] = PT_REGS_DS(regs); \ + pr_reg[8] = PT_REGS_ES(regs); \ + /* fake once used fs and gs selectors? */ \ + pr_reg[9] = PT_REGS_DS(regs); \ + pr_reg[10] = PT_REGS_DS(regs); \ + pr_reg[11] = PT_REGS_SYSCALL_NR(regs); \ + pr_reg[12] = PT_REGS_IP(regs); \ + pr_reg[13] = PT_REGS_CS(regs); \ + pr_reg[14] = PT_REGS_EFLAGS(regs); \ + pr_reg[15] = PT_REGS_SP(regs); \ + pr_reg[16] = PT_REGS_SS(regs); \ +} while (0); + +extern char * elf_aux_platform; +#define ELF_PLATFORM (elf_aux_platform) + +extern unsigned long vsyscall_ehdr; +extern unsigned long vsyscall_end; +extern unsigned long __kernel_vsyscall; + +/* + * This is the range that is readable by user mode, and things + * acting like user mode such as get_user_pages. + */ +#define FIXADDR_USER_START vsyscall_ehdr +#define FIXADDR_USER_END vsyscall_end + + +/* + * Architecture-neutral AT_ values in 0-17, leave some room + * for more of them, start the x86-specific ones at 32. + */ +#define AT_SYSINFO 32 +#define AT_SYSINFO_EHDR 33 + +#define ARCH_DLINFO \ +do { \ + if ( vsyscall_ehdr ) { \ + NEW_AUX_ENT(AT_SYSINFO, __kernel_vsyscall); \ + NEW_AUX_ENT(AT_SYSINFO_EHDR, vsyscall_ehdr); \ + } \ +} while (0) + +#else + +/* x86-64 relocation types, taken from asm-x86_64/elf.h */ +#define R_X86_64_NONE 0 /* No reloc */ +#define R_X86_64_64 1 /* Direct 64 bit */ +#define R_X86_64_PC32 2 /* PC relative 32 bit signed */ +#define R_X86_64_GOT32 3 /* 32 bit GOT entry */ +#define R_X86_64_PLT32 4 /* 32 bit PLT address */ +#define R_X86_64_COPY 5 /* Copy symbol at runtime */ +#define R_X86_64_GLOB_DAT 6 /* Create GOT entry */ +#define R_X86_64_JUMP_SLOT 7 /* Create PLT entry */ +#define R_X86_64_RELATIVE 8 /* Adjust by program base */ +#define R_X86_64_GOTPCREL 9 /* 32 bit signed pc relative + offset to GOT */ +#define R_X86_64_32 10 /* Direct 32 bit zero extended */ +#define R_X86_64_32S 11 /* Direct 32 bit sign extended */ +#define R_X86_64_16 12 /* Direct 16 bit zero extended */ +#define R_X86_64_PC16 13 /* 16 bit sign extended pc relative */ +#define R_X86_64_8 14 /* Direct 8 bit sign extended */ +#define R_X86_64_PC8 15 /* 8 bit sign extended pc relative */ + +#define R_X86_64_NUM 16 + +/* + * This is used to ensure we don't load something for the wrong architecture. + */ +#define elf_check_arch(x) \ + ((x)->e_machine == EM_X86_64) + +#define ELF_CLASS ELFCLASS64 +#define ELF_DATA ELFDATA2LSB +#define ELF_ARCH EM_X86_64 + +#define ELF_PLAT_INIT(regs, load_addr) do { \ + PT_REGS_RBX(regs) = 0; \ + PT_REGS_RCX(regs) = 0; \ + PT_REGS_RDX(regs) = 0; \ + PT_REGS_RSI(regs) = 0; \ + PT_REGS_RDI(regs) = 0; \ + PT_REGS_RBP(regs) = 0; \ + PT_REGS_RAX(regs) = 0; \ + PT_REGS_R8(regs) = 0; \ + PT_REGS_R9(regs) = 0; \ + PT_REGS_R10(regs) = 0; \ + PT_REGS_R11(regs) = 0; \ + PT_REGS_R12(regs) = 0; \ + PT_REGS_R13(regs) = 0; \ + PT_REGS_R14(regs) = 0; \ + PT_REGS_R15(regs) = 0; \ +} while (0) + +#define ELF_CORE_COPY_REGS(pr_reg, _regs) \ + (pr_reg)[0] = (_regs)->regs.gp[0]; \ + (pr_reg)[1] = (_regs)->regs.gp[1]; \ + (pr_reg)[2] = (_regs)->regs.gp[2]; \ + (pr_reg)[3] = (_regs)->regs.gp[3]; \ + (pr_reg)[4] = (_regs)->regs.gp[4]; \ + (pr_reg)[5] = (_regs)->regs.gp[5]; \ + (pr_reg)[6] = (_regs)->regs.gp[6]; \ + (pr_reg)[7] = (_regs)->regs.gp[7]; \ + (pr_reg)[8] = (_regs)->regs.gp[8]; \ + (pr_reg)[9] = (_regs)->regs.gp[9]; \ + (pr_reg)[10] = (_regs)->regs.gp[10]; \ + (pr_reg)[11] = (_regs)->regs.gp[11]; \ + (pr_reg)[12] = (_regs)->regs.gp[12]; \ + (pr_reg)[13] = (_regs)->regs.gp[13]; \ + (pr_reg)[14] = (_regs)->regs.gp[14]; \ + (pr_reg)[15] = (_regs)->regs.gp[15]; \ + (pr_reg)[16] = (_regs)->regs.gp[16]; \ + (pr_reg)[17] = (_regs)->regs.gp[17]; \ + (pr_reg)[18] = (_regs)->regs.gp[18]; \ + (pr_reg)[19] = (_regs)->regs.gp[19]; \ + (pr_reg)[20] = (_regs)->regs.gp[20]; \ + (pr_reg)[21] = current->thread.arch.fs; \ + (pr_reg)[22] = 0; \ + (pr_reg)[23] = 0; \ + (pr_reg)[24] = 0; \ + (pr_reg)[25] = 0; \ + (pr_reg)[26] = 0; + +#define ELF_PLATFORM "x86_64" + +/* No user-accessible fixmap addresses, i.e. vsyscall */ +#define FIXADDR_USER_START 0 +#define FIXADDR_USER_END 0 + +#define ARCH_HAS_SETUP_ADDITIONAL_PAGES 1 +struct linux_binprm; +extern int arch_setup_additional_pages(struct linux_binprm *bprm, + int uses_interp); + +extern unsigned long um_vdso_addr; +#define AT_SYSINFO_EHDR 33 +#define ARCH_DLINFO NEW_AUX_ENT(AT_SYSINFO_EHDR, um_vdso_addr) + +#endif + +typedef unsigned long elf_greg_t; + +#define ELF_NGREG (sizeof (struct user_regs_struct) / sizeof(elf_greg_t)) +typedef elf_greg_t elf_gregset_t[ELF_NGREG]; + +typedef struct user_i387_struct elf_fpregset_t; + +#define task_pt_regs(t) (&(t)->thread.regs) + +struct task_struct; + +extern int elf_core_copy_fpregs(struct task_struct *t, elf_fpregset_t *fpu); + +#define ELF_CORE_COPY_FPREGS(t, fpu) elf_core_copy_fpregs(t, fpu) + +#define ELF_EXEC_PAGESIZE 4096 + +#define ELF_ET_DYN_BASE (2 * TASK_SIZE / 3) + +extern long elf_aux_hwcap; +#define ELF_HWCAP (elf_aux_hwcap) + +#define SET_PERSONALITY(ex) do ; while(0) +#define __HAVE_ARCH_GATE_AREA 1 + +#endif diff --git a/arch/x86/um/asm/irq_vectors.h b/arch/x86/um/asm/irq_vectors.h new file mode 100644 index 000000000000..272a81e0ce14 --- /dev/null +++ b/arch/x86/um/asm/irq_vectors.h @@ -0,0 +1,10 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __UM_IRQ_VECTORS_H +#define __UM_IRQ_VECTORS_H + +#endif + diff --git a/arch/x86/um/asm/mm_context.h b/arch/x86/um/asm/mm_context.h new file mode 100644 index 000000000000..4a73d63e4760 --- /dev/null +++ b/arch/x86/um/asm/mm_context.h @@ -0,0 +1,72 @@ +/* + * Copyright (C) 2004 Fujitsu Siemens Computers GmbH + * Licensed under the GPL + * + * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com> + */ + +#ifndef __ASM_LDT_H +#define __ASM_LDT_H + +#include <linux/mutex.h> +#include <asm/ldt.h> + +extern void ldt_host_info(void); + +#define LDT_PAGES_MAX \ + ((LDT_ENTRIES * LDT_ENTRY_SIZE)/PAGE_SIZE) +#define LDT_ENTRIES_PER_PAGE \ + (PAGE_SIZE/LDT_ENTRY_SIZE) +#define LDT_DIRECT_ENTRIES \ + ((LDT_PAGES_MAX*sizeof(void *))/LDT_ENTRY_SIZE) + +struct ldt_entry { + __u32 a; + __u32 b; +}; + +typedef struct uml_ldt { + int entry_count; + struct mutex lock; + union { + struct ldt_entry * pages[LDT_PAGES_MAX]; + struct ldt_entry entries[LDT_DIRECT_ENTRIES]; + } u; +} uml_ldt_t; + +#define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + +#define LDT_entry_b(info) \ + (((info)->base_addr & 0xff000000) | \ + (((info)->base_addr & 0x00ff0000) >> 16) | \ + ((info)->limit & 0xf0000) | \ + (((info)->read_exec_only ^ 1) << 9) | \ + ((info)->contents << 10) | \ + (((info)->seg_not_present ^ 1) << 15) | \ + ((info)->seg_32bit << 22) | \ + ((info)->limit_in_pages << 23) | \ + ((info)->useable << 20) | \ + 0x7000) + +#define _LDT_empty(info) (\ + (info)->base_addr == 0 && \ + (info)->limit == 0 && \ + (info)->contents == 0 && \ + (info)->read_exec_only == 1 && \ + (info)->seg_32bit == 0 && \ + (info)->limit_in_pages == 0 && \ + (info)->seg_not_present == 1 && \ + (info)->useable == 0 ) + +#ifdef CONFIG_X86_64 +#define LDT_empty(info) (_LDT_empty(info) && ((info)->lm == 0)) +#else +#define LDT_empty(info) (_LDT_empty(info)) +#endif + +struct uml_arch_mm_context { + uml_ldt_t ldt; +}; + +#endif diff --git a/arch/x86/um/asm/module.h b/arch/x86/um/asm/module.h new file mode 100644 index 000000000000..61af80e932eb --- /dev/null +++ b/arch/x86/um/asm/module.h @@ -0,0 +1,23 @@ +#ifndef __UM_MODULE_H +#define __UM_MODULE_H + +/* UML is simple */ +struct mod_arch_specific +{ +}; + +#ifdef CONFIG_X86_32 + +#define Elf_Shdr Elf32_Shdr +#define Elf_Sym Elf32_Sym +#define Elf_Ehdr Elf32_Ehdr + +#else + +#define Elf_Shdr Elf64_Shdr +#define Elf_Sym Elf64_Sym +#define Elf_Ehdr Elf64_Ehdr + +#endif + +#endif diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h new file mode 100644 index 000000000000..118c143a9cb4 --- /dev/null +++ b/arch/x86/um/asm/processor.h @@ -0,0 +1,22 @@ +#ifndef __UM_PROCESSOR_H +#define __UM_PROCESSOR_H + +/* include faultinfo structure */ +#include <sysdep/faultinfo.h> + +#ifdef CONFIG_X86_32 +# include "processor_32.h" +#else +# include "processor_64.h" +#endif + +#define KSTK_EIP(tsk) KSTK_REG(tsk, HOST_IP) +#define KSTK_ESP(tsk) KSTK_REG(tsk, HOST_IP) +#define KSTK_EBP(tsk) KSTK_REG(tsk, HOST_BP) + +#define ARCH_IS_STACKGROW(address) \ + (address + 65536 + 32 * sizeof(unsigned long) >= UPT_SP(¤t->thread.regs.regs)) + +#include <asm/processor-generic.h> + +#endif diff --git a/arch/x86/um/asm/processor_32.h b/arch/x86/um/asm/processor_32.h new file mode 100644 index 000000000000..018f732704dd --- /dev/null +++ b/arch/x86/um/asm/processor_32.h @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __UM_PROCESSOR_I386_H +#define __UM_PROCESSOR_I386_H + +#include <linux/string.h> +#include <asm/segment.h> +#include <asm/ldt.h> + +extern int host_has_cmov; + +struct uml_tls_struct { + struct user_desc tls; + unsigned flushed:1; + unsigned present:1; +}; + +struct arch_thread { + struct uml_tls_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; + unsigned long debugregs[8]; + int debugregs_seq; + struct faultinfo faultinfo; +}; + +#define INIT_ARCH_THREAD { \ + .tls_array = { [ 0 ... GDT_ENTRY_TLS_ENTRIES - 1 ] = \ + { .present = 0, .flushed = 0 } }, \ + .debugregs = { [ 0 ... 7 ] = 0 }, \ + .debugregs_seq = 0, \ + .faultinfo = { 0, 0, 0 } \ +} + +static inline void arch_flush_thread(struct arch_thread *thread) +{ + /* Clear any TLS still hanging */ + memset(&thread->tls_array, 0, sizeof(thread->tls_array)); +} + +static inline void arch_copy_thread(struct arch_thread *from, + struct arch_thread *to) +{ + memcpy(&to->tls_array, &from->tls_array, sizeof(from->tls_array)); +} + +#include <asm/user.h> + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +#define cpu_relax() rep_nop() + +/* + * Default implementation of macro that returns current + * instruction pointer ("program counter"). Stolen + * from asm-i386/processor.h + */ +#define current_text_addr() \ + ({ void *pc; __asm__("movl $1f,%0\n1:":"=g" (pc)); pc; }) + +#endif diff --git a/arch/x86/um/asm/processor_64.h b/arch/x86/um/asm/processor_64.h new file mode 100644 index 000000000000..61de92d916c3 --- /dev/null +++ b/arch/x86/um/asm/processor_64.h @@ -0,0 +1,45 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#ifndef __UM_PROCESSOR_X86_64_H +#define __UM_PROCESSOR_X86_64_H + +struct arch_thread { + unsigned long debugregs[8]; + int debugregs_seq; + unsigned long fs; + struct faultinfo faultinfo; +}; + +/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */ +static inline void rep_nop(void) +{ + __asm__ __volatile__("rep;nop": : :"memory"); +} + +#define cpu_relax() rep_nop() + +#define INIT_ARCH_THREAD { .debugregs = { [ 0 ... 7 ] = 0 }, \ + .debugregs_seq = 0, \ + .fs = 0, \ + .faultinfo = { 0, 0, 0 } } + +static inline void arch_flush_thread(struct arch_thread *thread) +{ +} + +static inline void arch_copy_thread(struct arch_thread *from, + struct arch_thread *to) +{ + to->fs = from->fs; +} + +#include <asm/user.h> + +#define current_text_addr() \ + ({ void *pc; __asm__("movq $1f,%0\n1:":"=g" (pc)); pc; }) + +#endif diff --git a/arch/x86/um/asm/ptrace.h b/arch/x86/um/asm/ptrace.h new file mode 100644 index 000000000000..c8aca8c501b0 --- /dev/null +++ b/arch/x86/um/asm/ptrace.h @@ -0,0 +1,5 @@ +#ifdef CONFIG_X86_32 +# include "ptrace_32.h" +#else +# include "ptrace_64.h" +#endif diff --git a/arch/x86/um/asm/ptrace_32.h b/arch/x86/um/asm/ptrace_32.h new file mode 100644 index 000000000000..5d2a59112537 --- /dev/null +++ b/arch/x86/um/asm/ptrace_32.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#ifndef __UM_PTRACE_I386_H +#define __UM_PTRACE_I386_H + +#define HOST_AUDIT_ARCH AUDIT_ARCH_I386 + +#include "linux/compiler.h" +#include "asm/ptrace-generic.h" + +#define PT_REGS_EAX(r) UPT_EAX(&(r)->regs) +#define PT_REGS_EBX(r) UPT_EBX(&(r)->regs) +#define PT_REGS_ECX(r) UPT_ECX(&(r)->regs) +#define PT_REGS_EDX(r) UPT_EDX(&(r)->regs) +#define PT_REGS_ESI(r) UPT_ESI(&(r)->regs) +#define PT_REGS_EDI(r) UPT_EDI(&(r)->regs) +#define PT_REGS_EBP(r) UPT_EBP(&(r)->regs) + +#define PT_REGS_CS(r) UPT_CS(&(r)->regs) +#define PT_REGS_SS(r) UPT_SS(&(r)->regs) +#define PT_REGS_DS(r) UPT_DS(&(r)->regs) +#define PT_REGS_ES(r) UPT_ES(&(r)->regs) +#define PT_REGS_FS(r) UPT_FS(&(r)->regs) +#define PT_REGS_GS(r) UPT_GS(&(r)->regs) + +#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) + +#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_EAX(r) +#define PT_REGS_SYSCALL_RET(r) PT_REGS_EAX(r) +#define PT_FIX_EXEC_STACK(sp) do ; while(0) + +#define profile_pc(regs) PT_REGS_IP(regs) + +#define user_mode(r) UPT_IS_USER(&(r)->regs) + +/* + * Forward declaration to avoid including sysdep/tls.h, which causes a + * circular include, and compilation failures. + */ +struct user_desc; + +extern int ptrace_get_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc); + +extern int ptrace_set_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc); + +#endif diff --git a/arch/x86/um/asm/ptrace_64.h b/arch/x86/um/asm/ptrace_64.h new file mode 100644 index 000000000000..706a0d80545c --- /dev/null +++ b/arch/x86/um/asm/ptrace_64.h @@ -0,0 +1,72 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#ifndef __UM_PTRACE_X86_64_H +#define __UM_PTRACE_X86_64_H + +#include "linux/compiler.h" +#include "asm/errno.h" + +#define __FRAME_OFFSETS /* Needed to get the R* macros */ +#include "asm/ptrace-generic.h" + +#define HOST_AUDIT_ARCH AUDIT_ARCH_X86_64 + +#define PT_REGS_RBX(r) UPT_RBX(&(r)->regs) +#define PT_REGS_RCX(r) UPT_RCX(&(r)->regs) +#define PT_REGS_RDX(r) UPT_RDX(&(r)->regs) +#define PT_REGS_RSI(r) UPT_RSI(&(r)->regs) +#define PT_REGS_RDI(r) UPT_RDI(&(r)->regs) +#define PT_REGS_RBP(r) UPT_RBP(&(r)->regs) +#define PT_REGS_RAX(r) UPT_RAX(&(r)->regs) +#define PT_REGS_R8(r) UPT_R8(&(r)->regs) +#define PT_REGS_R9(r) UPT_R9(&(r)->regs) +#define PT_REGS_R10(r) UPT_R10(&(r)->regs) +#define PT_REGS_R11(r) UPT_R11(&(r)->regs) +#define PT_REGS_R12(r) UPT_R12(&(r)->regs) +#define PT_REGS_R13(r) UPT_R13(&(r)->regs) +#define PT_REGS_R14(r) UPT_R14(&(r)->regs) +#define PT_REGS_R15(r) UPT_R15(&(r)->regs) + +#define PT_REGS_FS(r) UPT_FS(&(r)->regs) +#define PT_REGS_GS(r) UPT_GS(&(r)->regs) +#define PT_REGS_DS(r) UPT_DS(&(r)->regs) +#define PT_REGS_ES(r) UPT_ES(&(r)->regs) +#define PT_REGS_SS(r) UPT_SS(&(r)->regs) +#define PT_REGS_CS(r) UPT_CS(&(r)->regs) + +#define PT_REGS_ORIG_RAX(r) UPT_ORIG_RAX(&(r)->regs) +#define PT_REGS_RIP(r) UPT_IP(&(r)->regs) +#define PT_REGS_SP(r) UPT_SP(&(r)->regs) + +#define PT_REGS_EFLAGS(r) UPT_EFLAGS(&(r)->regs) + +/* XXX */ +#define user_mode(r) UPT_IS_USER(&(r)->regs) +#define PT_REGS_ORIG_SYSCALL(r) PT_REGS_RAX(r) +#define PT_REGS_SYSCALL_RET(r) PT_REGS_RAX(r) + +#define PT_FIX_EXEC_STACK(sp) do ; while(0) + +#define profile_pc(regs) PT_REGS_IP(regs) + +struct user_desc; + +static inline int ptrace_get_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + return -ENOSYS; +} + +static inline int ptrace_set_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + return -ENOSYS; +} + +extern long arch_prctl(struct task_struct *task, int code, + unsigned long __user *addr); +#endif diff --git a/arch/x86/um/asm/required-features.h b/arch/x86/um/asm/required-features.h new file mode 100644 index 000000000000..dfb967b2d2f3 --- /dev/null +++ b/arch/x86/um/asm/required-features.h @@ -0,0 +1,9 @@ +#ifndef __UM_REQUIRED_FEATURES_H +#define __UM_REQUIRED_FEATURES_H + +/* + * Nothing to see, just need something for the i386 and x86_64 asm + * headers to include. + */ + +#endif diff --git a/arch/x86/um/asm/segment.h b/arch/x86/um/asm/segment.h new file mode 100644 index 000000000000..45183fcd10b6 --- /dev/null +++ b/arch/x86/um/asm/segment.h @@ -0,0 +1,10 @@ +#ifndef __UM_SEGMENT_H +#define __UM_SEGMENT_H + +extern int host_gdt_entry_tls_min; + +#define GDT_ENTRY_TLS_ENTRIES 3 +#define GDT_ENTRY_TLS_MIN host_gdt_entry_tls_min +#define GDT_ENTRY_TLS_MAX (GDT_ENTRY_TLS_MIN + GDT_ENTRY_TLS_ENTRIES - 1) + +#endif diff --git a/arch/x86/um/asm/system.h b/arch/x86/um/asm/system.h new file mode 100644 index 000000000000..a459fd9b7598 --- /dev/null +++ b/arch/x86/um/asm/system.h @@ -0,0 +1,135 @@ +#ifndef _ASM_X86_SYSTEM_H_ +#define _ASM_X86_SYSTEM_H_ + +#include <asm/asm.h> +#include <asm/segment.h> +#include <asm/cpufeature.h> +#include <asm/cmpxchg.h> +#include <asm/nops.h> + +#include <linux/kernel.h> +#include <linux/irqflags.h> + +/* entries in ARCH_DLINFO: */ +#ifdef CONFIG_IA32_EMULATION +# define AT_VECTOR_SIZE_ARCH 2 +#else +# define AT_VECTOR_SIZE_ARCH 1 +#endif + +extern unsigned long arch_align_stack(unsigned long sp); + +void default_idle(void); + +/* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking + * to devices. + */ +#ifdef CONFIG_X86_32 +/* + * Some non-Intel clones support out of order store. wmb() ceases to be a + * nop for these. + */ +#define mb() alternative("lock; addl $0,0(%%esp)", "mfence", X86_FEATURE_XMM2) +#define rmb() alternative("lock; addl $0,0(%%esp)", "lfence", X86_FEATURE_XMM2) +#define wmb() alternative("lock; addl $0,0(%%esp)", "sfence", X86_FEATURE_XMM) +#else +#define mb() asm volatile("mfence":::"memory") +#define rmb() asm volatile("lfence":::"memory") +#define wmb() asm volatile("sfence" ::: "memory") +#endif + +/** + * read_barrier_depends - Flush all pending reads that subsequents reads + * depend on. + * + * No data-dependent reads from memory-like regions are ever reordered + * over this barrier. All reads preceding this primitive are guaranteed + * to access memory (but not necessarily other CPUs' caches) before any + * reads following this primitive that depend on the data return by + * any of the preceding reads. This primitive is much lighter weight than + * rmb() on most CPUs, and is never heavier weight than is + * rmb(). + * + * These ordering constraints are respected by both the local CPU + * and the compiler. + * + * Ordering is not guaranteed by anything other than these primitives, + * not even by data dependencies. See the documentation for + * memory_barrier() for examples and URLs to more information. + * + * For example, the following code would force ordering (the initial + * value of "a" is zero, "b" is one, and "p" is "&a"): + * + * <programlisting> + * CPU 0 CPU 1 + * + * b = 2; + * memory_barrier(); + * p = &b; q = p; + * read_barrier_depends(); + * d = *q; + * </programlisting> + * + * because the read of "*q" depends on the read of "p" and these + * two reads are separated by a read_barrier_depends(). However, + * the following code, with the same initial values for "a" and "b": + * + * <programlisting> + * CPU 0 CPU 1 + * + * a = 2; + * memory_barrier(); + * b = 3; y = b; + * read_barrier_depends(); + * x = a; + * </programlisting> + * + * does not enforce ordering, since there is no data dependency between + * the read of "a" and the read of "b". Therefore, on some CPUs, such + * as Alpha, "y" could be set to 3 and "x" to 0. Use rmb() + * in cases like this where there are no data dependencies. + **/ + +#define read_barrier_depends() do { } while (0) + +#ifdef CONFIG_SMP +#define smp_mb() mb() +#ifdef CONFIG_X86_PPRO_FENCE +# define smp_rmb() rmb() +#else +# define smp_rmb() barrier() +#endif +#ifdef CONFIG_X86_OOSTORE +# define smp_wmb() wmb() +#else +# define smp_wmb() barrier() +#endif +#define smp_read_barrier_depends() read_barrier_depends() +#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#else +#define smp_mb() barrier() +#define smp_rmb() barrier() +#define smp_wmb() barrier() +#define smp_read_barrier_depends() do { } while (0) +#define set_mb(var, value) do { var = value; barrier(); } while (0) +#endif + +/* + * Stop RDTSC speculation. This is needed when you need to use RDTSC + * (or get_cycles or vread that possibly accesses the TSC) in a defined + * code region. + * + * (Could use an alternative three way for this if there was one.) + */ +static inline void rdtsc_barrier(void) +{ + alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); + alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); +} + +extern void *_switch_to(void *prev, void *next, void *last); +#define switch_to(prev, next, last) prev = _switch_to(prev, next, last) + +#endif diff --git a/arch/x86/um/asm/vm-flags.h b/arch/x86/um/asm/vm-flags.h new file mode 100644 index 000000000000..7c297e9e2413 --- /dev/null +++ b/arch/x86/um/asm/vm-flags.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Copyright 2003 PathScale, Inc. + * Licensed under the GPL + */ + +#ifndef __VM_FLAGS_X86_H +#define __VM_FLAGS_X86_H + +#ifdef CONFIG_X86_32 + +#define VM_DATA_DEFAULT_FLAGS \ + (VM_READ | VM_WRITE | \ + ((current->personality & READ_IMPLIES_EXEC) ? VM_EXEC : 0 ) | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#else + +#define VM_DATA_DEFAULT_FLAGS (VM_READ | VM_WRITE | VM_EXEC | \ + VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) +#define VM_STACK_DEFAULT_FLAGS (VM_GROWSDOWN | VM_READ | VM_WRITE | \ + VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC) + +#endif +#endif diff --git a/arch/x86/um/bug.c b/arch/x86/um/bug.c new file mode 100644 index 000000000000..e8034e363d83 --- /dev/null +++ b/arch/x86/um/bug.c @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2006 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL V2 + */ + +#include <linux/uaccess.h> + +/* + * Mostly copied from i386/x86_86 - eliminated the eip < PAGE_OFFSET because + * that's not relevant in skas mode. + */ + +int is_valid_bugaddr(unsigned long eip) +{ + unsigned short ud2; + + if (probe_kernel_address((unsigned short __user *)eip, ud2)) + return 0; + + return ud2 == 0x0b0f; +} diff --git a/arch/x86/um/bugs_32.c b/arch/x86/um/bugs_32.c new file mode 100644 index 000000000000..a1fba5fb9dbe --- /dev/null +++ b/arch/x86/um/bugs_32.c @@ -0,0 +1,76 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include <signal.h> +#include "kern_util.h" +#include "longjmp.h" +#include "sysdep/ptrace.h" +#include <generated/asm-offsets.h> + +/* Set during early boot */ +static int host_has_cmov = 1; +static jmp_buf cmov_test_return; + +#define TASK_PID(task) *((int *) &(((char *) (task))[HOST_TASK_PID])) + +static void cmov_sigill_test_handler(int sig) +{ + host_has_cmov = 0; + longjmp(cmov_test_return, 1); +} + +void arch_check_bugs(void) +{ + struct sigaction old, new; + + printk(UM_KERN_INFO "Checking for host processor cmov support..."); + new.sa_handler = cmov_sigill_test_handler; + + /* Make sure that SIGILL is enabled after the handler longjmps back */ + new.sa_flags = SA_NODEFER; + sigemptyset(&new.sa_mask); + sigaction(SIGILL, &new, &old); + + if (setjmp(cmov_test_return) == 0) { + unsigned long foo = 0; + __asm__ __volatile__("cmovz %0, %1" : "=r" (foo) : "0" (foo)); + printk(UM_KERN_CONT "Yes\n"); + } else + printk(UM_KERN_CONT "No\n"); + + sigaction(SIGILL, &old, &new); +} + +void arch_examine_signal(int sig, struct uml_pt_regs *regs) +{ + unsigned char tmp[2]; + + /* + * This is testing for a cmov (0x0f 0x4x) instruction causing a + * SIGILL in init. + */ + if ((sig != SIGILL) || (TASK_PID(get_current()) != 1)) + return; + + if (copy_from_user_proc(tmp, (void *) UPT_IP(regs), 2)) { + printk(UM_KERN_ERR "SIGILL in init, could not read " + "instructions!\n"); + return; + } + + if ((tmp[0] != 0x0f) || ((tmp[1] & 0xf0) != 0x40)) + return; + + if (host_has_cmov == 0) + printk(UM_KERN_ERR "SIGILL caused by cmov, which this " + "processor doesn't implement. Boot a filesystem " + "compiled for older processors"); + else if (host_has_cmov == 1) + printk(UM_KERN_ERR "SIGILL caused by cmov, which this " + "processor claims to implement"); + else + printk(UM_KERN_ERR "Bad value for host_has_cmov (%d)", + host_has_cmov); +} diff --git a/arch/x86/um/bugs_64.c b/arch/x86/um/bugs_64.c new file mode 100644 index 000000000000..44e02ba2a265 --- /dev/null +++ b/arch/x86/um/bugs_64.c @@ -0,0 +1,15 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#include "sysdep/ptrace.h" + +void arch_check_bugs(void) +{ +} + +void arch_examine_signal(int sig, struct uml_pt_regs *regs) +{ +} diff --git a/arch/x86/um/checksum_32.S b/arch/x86/um/checksum_32.S new file mode 100644 index 000000000000..f058d2f82e18 --- /dev/null +++ b/arch/x86/um/checksum_32.S @@ -0,0 +1,458 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * IP/TCP/UDP checksumming routines + * + * Authors: Jorge Cwik, <jorge@laser.satlink.net> + * Arnt Gulbrandsen, <agulbra@nvg.unit.no> + * Tom May, <ftom@netcom.com> + * Pentium Pro/II routines: + * Alexander Kjeldaas <astor@guardian.no> + * Finn Arne Gangstad <finnag@guardian.no> + * Lots of code moved from tcp.c and ip.c; see those files + * for more names. + * + * Changes: Ingo Molnar, converted csum_partial_copy() to 2.1 exception + * handling. + * Andi Kleen, add zeroing on error + * converted to pure assembler + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <asm/errno.h> + +/* + * computes a partial checksum, e.g. for TCP/UDP fragments + */ + +/* +unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) + */ + +.text +.align 4 +.globl csum_partial + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + + /* + * Experiments with Ethernet and SLIP connections show that buff + * is aligned on either a 2-byte or 4-byte boundary. We get at + * least a twofold speedup on 486 and Pentium if it is 4-byte aligned. + * Fortunately, it is easy to convert 2-byte alignment to 4-byte + * alignment for the unrolled loop. + */ +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: unsigned char *buff + testl $2, %esi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +1: movw (%esi), %bx + addl $2, %esi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, %edx + shrl $5, %ecx + jz 2f + testl %esi, %esi +1: movl (%esi), %ebx + adcl %ebx, %eax + movl 4(%esi), %ebx + adcl %ebx, %eax + movl 8(%esi), %ebx + adcl %ebx, %eax + movl 12(%esi), %ebx + adcl %ebx, %eax + movl 16(%esi), %ebx + adcl %ebx, %eax + movl 20(%esi), %ebx + adcl %ebx, %eax + movl 24(%esi), %ebx + adcl %ebx, %eax + movl 28(%esi), %ebx + adcl %ebx, %eax + lea 32(%esi), %esi + dec %ecx + jne 1b + adcl $0, %eax +2: movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +3: adcl (%esi), %eax + lea 4(%esi), %esi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f + movw (%esi),%cx + leal 2(%esi),%esi + je 6f + shll $16,%ecx +5: movb (%esi),%cl +6: addl %ecx,%eax + adcl $0, %eax +7: + popl %ebx + popl %esi + ret + +#else + +/* Version for PentiumII/PPro */ + +csum_partial: + pushl %esi + pushl %ebx + movl 20(%esp),%eax # Function arg: unsigned int sum + movl 16(%esp),%ecx # Function arg: int len + movl 12(%esp),%esi # Function arg: const unsigned char *buf + + testl $2, %esi + jnz 30f +10: + movl %ecx, %edx + movl %ecx, %ebx + andl $0x7c, %ebx + shrl $7, %ecx + addl %ebx,%esi + shrl $2, %ebx + negl %ebx + lea 45f(%ebx,%ebx,2), %ebx + testl %esi, %esi + jmp *%ebx + + # Handle 2-byte-aligned regions +20: addw (%esi), %ax + lea 2(%esi), %esi + adcl $0, %eax + jmp 10b + +30: subl $2, %ecx + ja 20b + je 32f + movzbl (%esi),%ebx # csumming 1 byte, 2-aligned + addl %ebx, %eax + adcl $0, %eax + jmp 80f +32: + addw (%esi), %ax # csumming 2 bytes, 2-aligned + adcl $0, %eax + jmp 80f + +40: + addl -128(%esi), %eax + adcl -124(%esi), %eax + adcl -120(%esi), %eax + adcl -116(%esi), %eax + adcl -112(%esi), %eax + adcl -108(%esi), %eax + adcl -104(%esi), %eax + adcl -100(%esi), %eax + adcl -96(%esi), %eax + adcl -92(%esi), %eax + adcl -88(%esi), %eax + adcl -84(%esi), %eax + adcl -80(%esi), %eax + adcl -76(%esi), %eax + adcl -72(%esi), %eax + adcl -68(%esi), %eax + adcl -64(%esi), %eax + adcl -60(%esi), %eax + adcl -56(%esi), %eax + adcl -52(%esi), %eax + adcl -48(%esi), %eax + adcl -44(%esi), %eax + adcl -40(%esi), %eax + adcl -36(%esi), %eax + adcl -32(%esi), %eax + adcl -28(%esi), %eax + adcl -24(%esi), %eax + adcl -20(%esi), %eax + adcl -16(%esi), %eax + adcl -12(%esi), %eax + adcl -8(%esi), %eax + adcl -4(%esi), %eax +45: + lea 128(%esi), %esi + adcl $0, %eax + dec %ecx + jge 40b + movl %edx, %ecx +50: andl $3, %ecx + jz 80f + + # Handle the last 1-3 bytes without jumping + notl %ecx # 1->2, 2->1, 3->0, higher bits are masked + movl $0xffffff,%ebx # by the shll and shrl instructions + shll $3,%ecx + shrl %cl,%ebx + andl -128(%esi),%ebx # esi is 4-aligned so should be ok + addl %ebx,%eax + adcl $0,%eax +80: + popl %ebx + popl %esi + ret + +#endif + +/* +unsigned int csum_partial_copy_generic (const char *src, char *dst, + int len, int sum, int *src_err_ptr, int *dst_err_ptr) + */ + +/* + * Copy from ds while checksumming, otherwise like csum_partial + * + * The macros SRC and DST specify the type of access for the instruction. + * thus we can call a custom exception handler for all access types. + * + * FIXME: could someone double-check whether I haven't mixed up some SRC and + * DST definitions? It's damn hard to trigger all cases. I hope I got + * them all but there's no guarantee. + */ + +#define SRC(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6001f ; \ + .previous + +#define DST(y...) \ + 9999: y; \ + .section __ex_table, "a"; \ + .long 9999b, 6002f ; \ + .previous + +.align 4 + +#ifndef CONFIG_X86_USE_PPRO_CHECKSUM + +#define ARGBASE 16 +#define FP 12 + +csum_partial_copy_generic_i386: + subl $4,%esp + pushl %edi + pushl %esi + pushl %ebx + movl ARGBASE+16(%esp),%eax # sum + movl ARGBASE+12(%esp),%ecx # len + movl ARGBASE+4(%esp),%esi # src + movl ARGBASE+8(%esp),%edi # dst + + testl $2, %edi # Check alignment. + jz 2f # Jump if alignment is ok. + subl $2, %ecx # Alignment uses up two bytes. + jae 1f # Jump if we had at least two bytes. + addl $2, %ecx # ecx was < 2. Deal with it. + jmp 4f +SRC(1: movw (%esi), %bx ) + addl $2, %esi +DST( movw %bx, (%edi) ) + addl $2, %edi + addw %bx, %ax + adcl $0, %eax +2: + movl %ecx, FP(%esp) + shrl $5, %ecx + jz 2f + testl %esi, %esi +SRC(1: movl (%esi), %ebx ) +SRC( movl 4(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + adcl %edx, %eax +DST( movl %edx, 4(%edi) ) + +SRC( movl 8(%esi), %ebx ) +SRC( movl 12(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 8(%edi) ) + adcl %edx, %eax +DST( movl %edx, 12(%edi) ) + +SRC( movl 16(%esi), %ebx ) +SRC( movl 20(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 16(%edi) ) + adcl %edx, %eax +DST( movl %edx, 20(%edi) ) + +SRC( movl 24(%esi), %ebx ) +SRC( movl 28(%esi), %edx ) + adcl %ebx, %eax +DST( movl %ebx, 24(%edi) ) + adcl %edx, %eax +DST( movl %edx, 28(%edi) ) + + lea 32(%esi), %esi + lea 32(%edi), %edi + dec %ecx + jne 1b + adcl $0, %eax +2: movl FP(%esp), %edx + movl %edx, %ecx + andl $0x1c, %edx + je 4f + shrl $2, %edx # This clears CF +SRC(3: movl (%esi), %ebx ) + adcl %ebx, %eax +DST( movl %ebx, (%edi) ) + lea 4(%esi), %esi + lea 4(%edi), %edi + dec %edx + jne 3b + adcl $0, %eax +4: andl $3, %ecx + jz 7f + cmpl $2, %ecx + jb 5f +SRC( movw (%esi), %cx ) + leal 2(%esi), %esi +DST( movw %cx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%ecx +SRC(5: movb (%esi), %cl ) +DST( movb %cl, (%edi) ) +6: addl %ecx, %eax + adcl $0, %eax +7: +5000: + +# Exception handler: +.section .fixup, "ax" + +6001: + movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + + # zero the complete destination - computing the rest + # is too much work + movl ARGBASE+8(%esp), %edi # dst + movl ARGBASE+12(%esp), %ecx # len + xorl %eax,%eax + rep ; stosb + + jmp 5000b + +6002: + movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT,(%ebx) + jmp 5000b + +.previous + + popl %ebx + popl %esi + popl %edi + popl %ecx # equivalent to addl $4,%esp + ret + +#else + +/* Version for PentiumII/PPro */ + +#define ROUND1(x) \ + SRC(movl x(%esi), %ebx ) ; \ + addl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ROUND(x) \ + SRC(movl x(%esi), %ebx ) ; \ + adcl %ebx, %eax ; \ + DST(movl %ebx, x(%edi) ) ; + +#define ARGBASE 12 + +csum_partial_copy_generic_i386: + pushl %ebx + pushl %edi + pushl %esi + movl ARGBASE+4(%esp),%esi #src + movl ARGBASE+8(%esp),%edi #dst + movl ARGBASE+12(%esp),%ecx #len + movl ARGBASE+16(%esp),%eax #sum +# movl %ecx, %edx + movl %ecx, %ebx + movl %esi, %edx + shrl $6, %ecx + andl $0x3c, %ebx + negl %ebx + subl %ebx, %esi + subl %ebx, %edi + lea -1(%esi),%edx + andl $-32,%edx + lea 3f(%ebx,%ebx), %ebx + testl %esi, %esi + jmp *%ebx +1: addl $64,%esi + addl $64,%edi + SRC(movb -32(%edx),%bl) ; SRC(movb (%edx),%bl) + ROUND1(-64) ROUND(-60) ROUND(-56) ROUND(-52) + ROUND (-48) ROUND(-44) ROUND(-40) ROUND(-36) + ROUND (-32) ROUND(-28) ROUND(-24) ROUND(-20) + ROUND (-16) ROUND(-12) ROUND(-8) ROUND(-4) +3: adcl $0,%eax + addl $64, %edx + dec %ecx + jge 1b +4: movl ARGBASE+12(%esp),%edx #len + andl $3, %edx + jz 7f + cmpl $2, %edx + jb 5f +SRC( movw (%esi), %dx ) + leal 2(%esi), %esi +DST( movw %dx, (%edi) ) + leal 2(%edi), %edi + je 6f + shll $16,%edx +5: +SRC( movb (%esi), %dl ) +DST( movb %dl, (%edi) ) +6: addl %edx, %eax + adcl $0, %eax +7: +.section .fixup, "ax" +6001: movl ARGBASE+20(%esp), %ebx # src_err_ptr + movl $-EFAULT, (%ebx) + # zero the complete destination (computing the rest is too much work) + movl ARGBASE+8(%esp),%edi # dst + movl ARGBASE+12(%esp),%ecx # len + xorl %eax,%eax + rep; stosb + jmp 7b +6002: movl ARGBASE+24(%esp), %ebx # dst_err_ptr + movl $-EFAULT, (%ebx) + jmp 7b +.previous + + popl %esi + popl %edi + popl %ebx + ret + +#undef ROUND +#undef ROUND1 + +#endif diff --git a/arch/x86/um/delay.c b/arch/x86/um/delay.c new file mode 100644 index 000000000000..f3fe1a688f7e --- /dev/null +++ b/arch/x86/um/delay.c @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> + * Mostly copied from arch/x86/lib/delay.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/delay.h> +#include <asm/param.h> + +void __delay(unsigned long loops) +{ + asm volatile( + "test %0,%0\n" + "jz 3f\n" + "jmp 1f\n" + + ".align 16\n" + "1: jmp 2f\n" + + ".align 16\n" + "2: dec %0\n" + " jnz 2b\n" + "3: dec %0\n" + + : /* we don't need output */ + : "a" (loops) + ); +} +EXPORT_SYMBOL(__delay); + +inline void __const_udelay(unsigned long xloops) +{ + int d0; + + xloops *= 4; + asm("mull %%edx" + : "=d" (xloops), "=&a" (d0) + : "1" (xloops), "0" + (loops_per_jiffy * (HZ/4))); + + __delay(++xloops); +} +EXPORT_SYMBOL(__const_udelay); + +void __udelay(unsigned long usecs) +{ + __const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */ +} +EXPORT_SYMBOL(__udelay); + +void __ndelay(unsigned long nsecs) +{ + __const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */ +} +EXPORT_SYMBOL(__ndelay); diff --git a/arch/x86/um/elfcore.c b/arch/x86/um/elfcore.c new file mode 100644 index 000000000000..6bb49b687c97 --- /dev/null +++ b/arch/x86/um/elfcore.c @@ -0,0 +1,83 @@ +#include <linux/elf.h> +#include <linux/coredump.h> +#include <linux/fs.h> +#include <linux/mm.h> + +#include <asm/elf.h> + + +Elf32_Half elf_core_extra_phdrs(void) +{ + return vsyscall_ehdr ? (((struct elfhdr *)vsyscall_ehdr)->e_phnum) : 0; +} + +int elf_core_write_extra_phdrs(struct file *file, loff_t offset, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + Elf32_Off ofs = 0; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + struct elf_phdr phdr = phdrp[i]; + + if (phdr.p_type == PT_LOAD) { + ofs = phdr.p_offset = offset; + offset += phdr.p_filesz; + } else { + phdr.p_offset += ofs; + } + phdr.p_paddr = 0; /* match other core phdrs */ + *size += sizeof(phdr); + if (*size > limit + || !dump_write(file, &phdr, sizeof(phdr))) + return 0; + } + } + return 1; +} + +int elf_core_write_extra_data(struct file *file, size_t *size, + unsigned long limit) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *) vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) { + if (phdrp[i].p_type == PT_LOAD) { + void *addr = (void *) phdrp[i].p_vaddr; + size_t filesz = phdrp[i].p_filesz; + + *size += filesz; + if (*size > limit + || !dump_write(file, addr, filesz)) + return 0; + } + } + } + return 1; +} + +size_t elf_core_extra_data_size(void) +{ + if ( vsyscall_ehdr ) { + const struct elfhdr *const ehdrp = + (struct elfhdr *)vsyscall_ehdr; + const struct elf_phdr *const phdrp = + (const struct elf_phdr *) (vsyscall_ehdr + ehdrp->e_phoff); + int i; + + for (i = 0; i < ehdrp->e_phnum; ++i) + if (phdrp[i].p_type == PT_LOAD) + return (size_t) phdrp[i].p_filesz; + } + return 0; +} diff --git a/arch/x86/um/fault.c b/arch/x86/um/fault.c new file mode 100644 index 000000000000..d670f68532f4 --- /dev/null +++ b/arch/x86/um/fault.c @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "sysdep/ptrace.h" + +/* These two are from asm-um/uaccess.h and linux/module.h, check them. */ +struct exception_table_entry +{ + unsigned long insn; + unsigned long fixup; +}; + +const struct exception_table_entry *search_exception_tables(unsigned long add); + +/* Compare this to arch/i386/mm/extable.c:fixup_exception() */ +int arch_fixup(unsigned long address, struct uml_pt_regs *regs) +{ + const struct exception_table_entry *fixup; + + fixup = search_exception_tables(address); + if (fixup != 0) { + UPT_IP(regs) = fixup->fixup; + return 1; + } + return 0; +} diff --git a/arch/x86/um/ksyms.c b/arch/x86/um/ksyms.c new file mode 100644 index 000000000000..2e8f43ec6214 --- /dev/null +++ b/arch/x86/um/ksyms.c @@ -0,0 +1,13 @@ +#include <linux/module.h> +#include <asm/string.h> +#include <asm/checksum.h> + +#ifndef CONFIG_X86_32 +/*XXX: we need them because they would be exported by x86_64 */ +#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4 +EXPORT_SYMBOL(memcpy); +#else +EXPORT_SYMBOL(__memcpy); +#endif +#endif +EXPORT_SYMBOL(csum_partial); diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c new file mode 100644 index 000000000000..26b0e39d2ce9 --- /dev/null +++ b/arch/x86/um/ldt.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2001 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/slab.h> +#include <asm/unistd.h> +#include "os.h" +#include "proc_mm.h" +#include "skas.h" +#include "skas_ptrace.h" +#include "sysdep/tls.h" + +extern int modify_ldt(int func, void *ptr, unsigned long bytecount); + +static long write_ldt_entry(struct mm_id *mm_idp, int func, + struct user_desc *desc, void **addr, int done) +{ + long res; + + if (proc_mm) { + /* + * This is a special handling for the case, that the mm to + * modify isn't current->active_mm. + * If this is called directly by modify_ldt, + * (current->active_mm->context.skas.u == mm_idp) + * will be true. So no call to __switch_mm(mm_idp) is done. + * If this is called in case of init_new_ldt or PTRACE_LDT, + * mm_idp won't belong to current->active_mm, but child->mm. + * So we need to switch child's mm into our userspace, then + * later switch back. + * + * Note: I'm unsure: should interrupts be disabled here? + */ + if (!current->active_mm || current->active_mm == &init_mm || + mm_idp != ¤t->active_mm->context.id) + __switch_mm(mm_idp); + } + + if (ptrace_ldt) { + struct ptrace_ldt ldt_op = (struct ptrace_ldt) { + .func = func, + .ptr = desc, + .bytecount = sizeof(*desc)}; + u32 cpu; + int pid; + + if (!proc_mm) + pid = mm_idp->u.pid; + else { + cpu = get_cpu(); + pid = userspace_pid[cpu]; + } + + res = os_ptrace_ldt(pid, 0, (unsigned long) &ldt_op); + + if (proc_mm) + put_cpu(); + } + else { + void *stub_addr; + res = syscall_stub_data(mm_idp, (unsigned long *)desc, + (sizeof(*desc) + sizeof(long) - 1) & + ~(sizeof(long) - 1), + addr, &stub_addr); + if (!res) { + unsigned long args[] = { func, + (unsigned long)stub_addr, + sizeof(*desc), + 0, 0, 0 }; + res = run_syscall_stub(mm_idp, __NR_modify_ldt, args, + 0, addr, done); + } + } + + if (proc_mm) { + /* + * This is the second part of special handling, that makes + * PTRACE_LDT possible to implement. + */ + if (current->active_mm && current->active_mm != &init_mm && + mm_idp != ¤t->active_mm->context.id) + __switch_mm(¤t->active_mm->context.id); + } + + return res; +} + +static long read_ldt_from_host(void __user * ptr, unsigned long bytecount) +{ + int res, n; + struct ptrace_ldt ptrace_ldt = (struct ptrace_ldt) { + .func = 0, + .bytecount = bytecount, + .ptr = kmalloc(bytecount, GFP_KERNEL)}; + u32 cpu; + + if (ptrace_ldt.ptr == NULL) + return -ENOMEM; + + /* + * This is called from sys_modify_ldt only, so userspace_pid gives + * us the right number + */ + + cpu = get_cpu(); + res = os_ptrace_ldt(userspace_pid[cpu], 0, (unsigned long) &ptrace_ldt); + put_cpu(); + if (res < 0) + goto out; + + n = copy_to_user(ptr, ptrace_ldt.ptr, res); + if (n != 0) + res = -EFAULT; + + out: + kfree(ptrace_ldt.ptr); + + return res; +} + +/* + * In skas mode, we hold our own ldt data in UML. + * Thus, the code implementing sys_modify_ldt_skas + * is very similar to (and mostly stolen from) sys_modify_ldt + * for arch/i386/kernel/ldt.c + * The routines copied and modified in part are: + * - read_ldt + * - read_default_ldt + * - write_ldt + * - sys_modify_ldt_skas + */ + +static int read_ldt(void __user * ptr, unsigned long bytecount) +{ + int i, err = 0; + unsigned long size; + uml_ldt_t *ldt = ¤t->mm->context.arch.ldt; + + if (!ldt->entry_count) + goto out; + if (bytecount > LDT_ENTRY_SIZE*LDT_ENTRIES) + bytecount = LDT_ENTRY_SIZE*LDT_ENTRIES; + err = bytecount; + + if (ptrace_ldt) + return read_ldt_from_host(ptr, bytecount); + + mutex_lock(&ldt->lock); + if (ldt->entry_count <= LDT_DIRECT_ENTRIES) { + size = LDT_ENTRY_SIZE*LDT_DIRECT_ENTRIES; + if (size > bytecount) + size = bytecount; + if (copy_to_user(ptr, ldt->u.entries, size)) + err = -EFAULT; + bytecount -= size; + ptr += size; + } + else { + for (i=0; i<ldt->entry_count/LDT_ENTRIES_PER_PAGE && bytecount; + i++) { + size = PAGE_SIZE; + if (size > bytecount) + size = bytecount; + if (copy_to_user(ptr, ldt->u.pages[i], size)) { + err = -EFAULT; + break; + } + bytecount -= size; + ptr += size; + } + } + mutex_unlock(&ldt->lock); + + if (bytecount == 0 || err == -EFAULT) + goto out; + + if (clear_user(ptr, bytecount)) + err = -EFAULT; + +out: + return err; +} + +static int read_default_ldt(void __user * ptr, unsigned long bytecount) +{ + int err; + + if (bytecount > 5*LDT_ENTRY_SIZE) + bytecount = 5*LDT_ENTRY_SIZE; + + err = bytecount; + /* + * UML doesn't support lcall7 and lcall27. + * So, we don't really have a default ldt, but emulate + * an empty ldt of common host default ldt size. + */ + if (clear_user(ptr, bytecount)) + err = -EFAULT; + + return err; +} + +static int write_ldt(void __user * ptr, unsigned long bytecount, int func) +{ + uml_ldt_t *ldt = ¤t->mm->context.arch.ldt; + struct mm_id * mm_idp = ¤t->mm->context.id; + int i, err; + struct user_desc ldt_info; + struct ldt_entry entry0, *ldt_p; + void *addr = NULL; + + err = -EINVAL; + if (bytecount != sizeof(ldt_info)) + goto out; + err = -EFAULT; + if (copy_from_user(&ldt_info, ptr, sizeof(ldt_info))) + goto out; + + err = -EINVAL; + if (ldt_info.entry_number >= LDT_ENTRIES) + goto out; + if (ldt_info.contents == 3) { + if (func == 1) + goto out; + if (ldt_info.seg_not_present == 0) + goto out; + } + + if (!ptrace_ldt) + mutex_lock(&ldt->lock); + + err = write_ldt_entry(mm_idp, func, &ldt_info, &addr, 1); + if (err) + goto out_unlock; + else if (ptrace_ldt) { + /* With PTRACE_LDT available, this is used as a flag only */ + ldt->entry_count = 1; + goto out; + } + + if (ldt_info.entry_number >= ldt->entry_count && + ldt_info.entry_number >= LDT_DIRECT_ENTRIES) { + for (i=ldt->entry_count/LDT_ENTRIES_PER_PAGE; + i*LDT_ENTRIES_PER_PAGE <= ldt_info.entry_number; + i++) { + if (i == 0) + memcpy(&entry0, ldt->u.entries, + sizeof(entry0)); + ldt->u.pages[i] = (struct ldt_entry *) + __get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!ldt->u.pages[i]) { + err = -ENOMEM; + /* Undo the change in host */ + memset(&ldt_info, 0, sizeof(ldt_info)); + write_ldt_entry(mm_idp, 1, &ldt_info, &addr, 1); + goto out_unlock; + } + if (i == 0) { + memcpy(ldt->u.pages[0], &entry0, + sizeof(entry0)); + memcpy(ldt->u.pages[0]+1, ldt->u.entries+1, + sizeof(entry0)*(LDT_DIRECT_ENTRIES-1)); + } + ldt->entry_count = (i + 1) * LDT_ENTRIES_PER_PAGE; + } + } + if (ldt->entry_count <= ldt_info.entry_number) + ldt->entry_count = ldt_info.entry_number + 1; + + if (ldt->entry_count <= LDT_DIRECT_ENTRIES) + ldt_p = ldt->u.entries + ldt_info.entry_number; + else + ldt_p = ldt->u.pages[ldt_info.entry_number/LDT_ENTRIES_PER_PAGE] + + ldt_info.entry_number%LDT_ENTRIES_PER_PAGE; + + if (ldt_info.base_addr == 0 && ldt_info.limit == 0 && + (func == 1 || LDT_empty(&ldt_info))) { + ldt_p->a = 0; + ldt_p->b = 0; + } + else{ + if (func == 1) + ldt_info.useable = 0; + ldt_p->a = LDT_entry_a(&ldt_info); + ldt_p->b = LDT_entry_b(&ldt_info); + } + err = 0; + +out_unlock: + mutex_unlock(&ldt->lock); +out: + return err; +} + +static long do_modify_ldt_skas(int func, void __user *ptr, + unsigned long bytecount) +{ + int ret = -ENOSYS; + + switch (func) { + case 0: + ret = read_ldt(ptr, bytecount); + break; + case 1: + case 0x11: + ret = write_ldt(ptr, bytecount, func); + break; + case 2: + ret = read_default_ldt(ptr, bytecount); + break; + } + return ret; +} + +static DEFINE_SPINLOCK(host_ldt_lock); +static short dummy_list[9] = {0, -1}; +static short * host_ldt_entries = NULL; + +static void ldt_get_host_info(void) +{ + long ret; + struct ldt_entry * ldt; + short *tmp; + int i, size, k, order; + + spin_lock(&host_ldt_lock); + + if (host_ldt_entries != NULL) { + spin_unlock(&host_ldt_lock); + return; + } + host_ldt_entries = dummy_list+1; + + spin_unlock(&host_ldt_lock); + + for (i = LDT_PAGES_MAX-1, order=0; i; i>>=1, order++) + ; + + ldt = (struct ldt_entry *) + __get_free_pages(GFP_KERNEL|__GFP_ZERO, order); + if (ldt == NULL) { + printk(KERN_ERR "ldt_get_host_info: couldn't allocate buffer " + "for host ldt\n"); + return; + } + + ret = modify_ldt(0, ldt, (1<<order)*PAGE_SIZE); + if (ret < 0) { + printk(KERN_ERR "ldt_get_host_info: couldn't read host ldt\n"); + goto out_free; + } + if (ret == 0) { + /* default_ldt is active, simply write an empty entry 0 */ + host_ldt_entries = dummy_list; + goto out_free; + } + + for (i=0, size=0; i<ret/LDT_ENTRY_SIZE; i++) { + if (ldt[i].a != 0 || ldt[i].b != 0) + size++; + } + + if (size < ARRAY_SIZE(dummy_list)) + host_ldt_entries = dummy_list; + else { + size = (size + 1) * sizeof(dummy_list[0]); + tmp = kmalloc(size, GFP_KERNEL); + if (tmp == NULL) { + printk(KERN_ERR "ldt_get_host_info: couldn't allocate " + "host ldt list\n"); + goto out_free; + } + host_ldt_entries = tmp; + } + + for (i=0, k=0; i<ret/LDT_ENTRY_SIZE; i++) { + if (ldt[i].a != 0 || ldt[i].b != 0) + host_ldt_entries[k++] = i; + } + host_ldt_entries[k] = -1; + +out_free: + free_pages((unsigned long)ldt, order); +} + +long init_new_ldt(struct mm_context *new_mm, struct mm_context *from_mm) +{ + struct user_desc desc; + short * num_p; + int i; + long page, err=0; + void *addr = NULL; + struct proc_mm_op copy; + + + if (!ptrace_ldt) + mutex_init(&new_mm->arch.ldt.lock); + + if (!from_mm) { + memset(&desc, 0, sizeof(desc)); + /* + * We have to initialize a clean ldt. + */ + if (proc_mm) { + /* + * If the new mm was created using proc_mm, host's + * default-ldt currently is assigned, which normally + * contains the call-gates for lcall7 and lcall27. + * To remove these gates, we simply write an empty + * entry as number 0 to the host. + */ + err = write_ldt_entry(&new_mm->id, 1, &desc, &addr, 1); + } + else{ + /* + * Now we try to retrieve info about the ldt, we + * inherited from the host. All ldt-entries found + * will be reset in the following loop + */ + ldt_get_host_info(); + for (num_p=host_ldt_entries; *num_p != -1; num_p++) { + desc.entry_number = *num_p; + err = write_ldt_entry(&new_mm->id, 1, &desc, + &addr, *(num_p + 1) == -1); + if (err) + break; + } + } + new_mm->arch.ldt.entry_count = 0; + + goto out; + } + + if (proc_mm) { + /* + * We have a valid from_mm, so we now have to copy the LDT of + * from_mm to new_mm, because using proc_mm an new mm with + * an empty/default LDT was created in new_mm() + */ + copy = ((struct proc_mm_op) { .op = MM_COPY_SEGMENTS, + .u = + { .copy_segments = + from_mm->id.u.mm_fd } } ); + i = os_write_file(new_mm->id.u.mm_fd, ©, sizeof(copy)); + if (i != sizeof(copy)) + printk(KERN_ERR "new_mm : /proc/mm copy_segments " + "failed, err = %d\n", -i); + } + + if (!ptrace_ldt) { + /* + * Our local LDT is used to supply the data for + * modify_ldt(READLDT), if PTRACE_LDT isn't available, + * i.e., we have to use the stub for modify_ldt, which + * can't handle the big read buffer of up to 64kB. + */ + mutex_lock(&from_mm->arch.ldt.lock); + if (from_mm->arch.ldt.entry_count <= LDT_DIRECT_ENTRIES) + memcpy(new_mm->arch.ldt.u.entries, from_mm->arch.ldt.u.entries, + sizeof(new_mm->arch.ldt.u.entries)); + else { + i = from_mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; + while (i-->0) { + page = __get_free_page(GFP_KERNEL|__GFP_ZERO); + if (!page) { + err = -ENOMEM; + break; + } + new_mm->arch.ldt.u.pages[i] = + (struct ldt_entry *) page; + memcpy(new_mm->arch.ldt.u.pages[i], + from_mm->arch.ldt.u.pages[i], PAGE_SIZE); + } + } + new_mm->arch.ldt.entry_count = from_mm->arch.ldt.entry_count; + mutex_unlock(&from_mm->arch.ldt.lock); + } + + out: + return err; +} + + +void free_ldt(struct mm_context *mm) +{ + int i; + + if (!ptrace_ldt && mm->arch.ldt.entry_count > LDT_DIRECT_ENTRIES) { + i = mm->arch.ldt.entry_count / LDT_ENTRIES_PER_PAGE; + while (i-- > 0) + free_page((long) mm->arch.ldt.u.pages[i]); + } + mm->arch.ldt.entry_count = 0; +} + +int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount) +{ + return do_modify_ldt_skas(func, ptr, bytecount); +} diff --git a/arch/x86/um/mem_32.c b/arch/x86/um/mem_32.c new file mode 100644 index 000000000000..639900a6fde9 --- /dev/null +++ b/arch/x86/um/mem_32.c @@ -0,0 +1,62 @@ +/* + * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/mm.h> +#include <asm/page.h> +#include <asm/mman.h> + +static struct vm_area_struct gate_vma; + +static int __init gate_vma_init(void) +{ + if (!FIXADDR_USER_START) + return 0; + + gate_vma.vm_mm = NULL; + gate_vma.vm_start = FIXADDR_USER_START; + gate_vma.vm_end = FIXADDR_USER_END; + gate_vma.vm_flags = VM_READ | VM_MAYREAD | VM_EXEC | VM_MAYEXEC; + gate_vma.vm_page_prot = __P101; + + /* + * Make sure the vDSO gets into every core dump. + * Dumping its contents makes post-mortem fully interpretable later + * without matching up the same kernel and hardware config to see + * what PC values meant. + */ + gate_vma.vm_flags |= VM_ALWAYSDUMP; + + return 0; +} +__initcall(gate_vma_init); + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return FIXADDR_USER_START ? &gate_vma : NULL; +} + +int in_gate_area_no_mm(unsigned long addr) +{ + if (!FIXADDR_USER_START) + return 0; + + if ((addr >= FIXADDR_USER_START) && (addr < FIXADDR_USER_END)) + return 1; + + return 0; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + struct vm_area_struct *vma = get_gate_vma(mm); + + if (!vma) + return 0; + + return (addr >= vma->vm_start) && (addr < vma->vm_end); +} diff --git a/arch/x86/um/mem_64.c b/arch/x86/um/mem_64.c new file mode 100644 index 000000000000..546518727a73 --- /dev/null +++ b/arch/x86/um/mem_64.c @@ -0,0 +1,26 @@ +#include "linux/mm.h" +#include "asm/page.h" +#include "asm/mman.h" + +const char *arch_vma_name(struct vm_area_struct *vma) +{ + if (vma->vm_mm && vma->vm_start == um_vdso_addr) + return "[vdso]"; + + return NULL; +} + +struct vm_area_struct *get_gate_vma(struct mm_struct *mm) +{ + return NULL; +} + +int in_gate_area(struct mm_struct *mm, unsigned long addr) +{ + return 0; +} + +int in_gate_area_no_mm(unsigned long addr) +{ + return 0; +} diff --git a/arch/x86/um/os-Linux/Makefile b/arch/x86/um/os-Linux/Makefile new file mode 100644 index 000000000000..253bfb8cb702 --- /dev/null +++ b/arch/x86/um/os-Linux/Makefile @@ -0,0 +1,13 @@ +# +# Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) +# Licensed under the GPL +# + +obj-y = registers.o task_size.o mcontext.o + +obj-$(CONFIG_X86_32) += tls.o +obj-$(CONFIG_64BIT) += prctl.o + +USER_OBJS := $(obj-y) + +include arch/um/scripts/Makefile.rules diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c new file mode 100644 index 000000000000..1d33d72c6284 --- /dev/null +++ b/arch/x86/um/os-Linux/mcontext.c @@ -0,0 +1,31 @@ +#include <sys/ucontext.h> +#define __FRAME_OFFSETS +#include <asm/ptrace.h> +#include <sysdep/ptrace.h> + +void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) +{ +#ifdef __i386__ +#define COPY2(X,Y) regs->gp[X] = mc->gregs[REG_##Y] +#define COPY(X) regs->gp[X] = mc->gregs[REG_##X] +#define COPY_SEG(X) regs->gp[X] = mc->gregs[REG_##X] & 0xffff; +#define COPY_SEG_CPL3(X) regs->gp[X] = (mc->gregs[REG_##X] & 0xffff) | 3; + COPY_SEG(GS); COPY_SEG(FS); COPY_SEG(ES); COPY_SEG(DS); + COPY(EDI); COPY(ESI); COPY(EBP); + COPY2(UESP, ESP); /* sic */ + COPY(EBX); COPY(EDX); COPY(ECX); COPY(EAX); + COPY(EIP); COPY_SEG_CPL3(CS); COPY(EFL); COPY_SEG_CPL3(SS); +#else +#define COPY2(X,Y) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##Y] +#define COPY(X) regs->gp[X/sizeof(unsigned long)] = mc->gregs[REG_##X] + COPY(R8); COPY(R9); COPY(R10); COPY(R11); + COPY(R12); COPY(R13); COPY(R14); COPY(R15); + COPY(RDI); COPY(RSI); COPY(RBP); COPY(RBX); + COPY(RDX); COPY(RAX); COPY(RCX); COPY(RSP); + COPY(RIP); + COPY2(EFLAGS, EFL); + COPY2(CS, CSGSFS); + regs->gp[CS / sizeof(unsigned long)] &= 0xffff; + regs->gp[CS / sizeof(unsigned long)] |= 3; +#endif +} diff --git a/arch/x86/um/os-Linux/prctl.c b/arch/x86/um/os-Linux/prctl.c new file mode 100644 index 000000000000..9d34eddb517f --- /dev/null +++ b/arch/x86/um/os-Linux/prctl.c @@ -0,0 +1,12 @@ +/* + * Copyright (C) 2007 Jeff Dike (jdike@{addtoit.com,linux.intel.com}) + * Licensed under the GPL + */ + +#include <sys/ptrace.h> +#include <linux/ptrace.h> + +int os_arch_prctl(int pid, int code, unsigned long *addr) +{ + return ptrace(PTRACE_ARCH_PRCTL, pid, (unsigned long) addr, code); +} diff --git a/arch/x86/um/os-Linux/registers.c b/arch/x86/um/os-Linux/registers.c new file mode 100644 index 000000000000..0cdbb86b012b --- /dev/null +++ b/arch/x86/um/os-Linux/registers.c @@ -0,0 +1,113 @@ +/* + * Copyright (C) 2004 PathScale, Inc + * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include <errno.h> +#include <sys/ptrace.h> +#ifdef __i386__ +#include <sys/user.h> +#endif +#include "longjmp.h" +#include "sysdep/ptrace_user.h" + +int save_fp_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_GETFPREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +int restore_fp_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_SETFPREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +#ifdef __i386__ +int have_fpx_regs = 1; +int save_fpx_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_GETFPXREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +int restore_fpx_registers(int pid, unsigned long *fp_regs) +{ + if (ptrace(PTRACE_SETFPXREGS, pid, 0, fp_regs) < 0) + return -errno; + return 0; +} + +int get_fp_registers(int pid, unsigned long *regs) +{ + if (have_fpx_regs) + return save_fpx_registers(pid, regs); + else + return save_fp_registers(pid, regs); +} + +int put_fp_registers(int pid, unsigned long *regs) +{ + if (have_fpx_regs) + return restore_fpx_registers(pid, regs); + else + return restore_fp_registers(pid, regs); +} + +void arch_init_registers(int pid) +{ + struct user_fpxregs_struct fpx_regs; + int err; + + err = ptrace(PTRACE_GETFPXREGS, pid, 0, &fpx_regs); + if (!err) + return; + + if (errno != EIO) + panic("check_ptrace : PTRACE_GETFPXREGS failed, errno = %d", + errno); + + have_fpx_regs = 0; +} +#else + +int get_fp_registers(int pid, unsigned long *regs) +{ + return save_fp_registers(pid, regs); +} + +int put_fp_registers(int pid, unsigned long *regs) +{ + return restore_fp_registers(pid, regs); +} + +#endif + +unsigned long get_thread_reg(int reg, jmp_buf *buf) +{ + switch (reg) { +#ifdef __i386__ + case HOST_IP: + return buf[0]->__eip; + case HOST_SP: + return buf[0]->__esp; + case HOST_BP: + return buf[0]->__ebp; +#else + case HOST_IP: + return buf[0]->__rip; + case HOST_SP: + return buf[0]->__rsp; + case HOST_BP: + return buf[0]->__rbp; +#endif + default: + printk(UM_KERN_ERR "get_thread_regs - unknown register %d\n", + reg); + return 0; + } +} diff --git a/arch/x86/um/os-Linux/task_size.c b/arch/x86/um/os-Linux/task_size.c new file mode 100644 index 000000000000..efb16c5c9bcf --- /dev/null +++ b/arch/x86/um/os-Linux/task_size.c @@ -0,0 +1,150 @@ +#include <stdio.h> +#include <stdlib.h> +#include <signal.h> +#include <sys/mman.h> +#include "longjmp.h" + +#ifdef __i386__ + +static jmp_buf buf; + +static void segfault(int sig) +{ + longjmp(buf, 1); +} + +static int page_ok(unsigned long page) +{ + unsigned long *address = (unsigned long *) (page << UM_KERN_PAGE_SHIFT); + unsigned long n = ~0UL; + void *mapped = NULL; + int ok = 0; + + /* + * First see if the page is readable. If it is, it may still + * be a VDSO, so we go on to see if it's writable. If not + * then try mapping memory there. If that fails, then we're + * still in the kernel area. As a sanity check, we'll fail if + * the mmap succeeds, but gives us an address different from + * what we wanted. + */ + if (setjmp(buf) == 0) + n = *address; + else { + mapped = mmap(address, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE, + MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (mapped == MAP_FAILED) + return 0; + if (mapped != address) + goto out; + } + + /* + * Now, is it writeable? If so, then we're in user address + * space. If not, then try mprotecting it and try the write + * again. + */ + if (setjmp(buf) == 0) { + *address = n; + ok = 1; + goto out; + } else if (mprotect(address, UM_KERN_PAGE_SIZE, + PROT_READ | PROT_WRITE) != 0) + goto out; + + if (setjmp(buf) == 0) { + *address = n; + ok = 1; + } + + out: + if (mapped != NULL) + munmap(mapped, UM_KERN_PAGE_SIZE); + return ok; +} + +unsigned long os_get_top_address(void) +{ + struct sigaction sa, old; + unsigned long bottom = 0; + /* + * A 32-bit UML on a 64-bit host gets confused about the VDSO at + * 0xffffe000. It is mapped, is readable, can be reprotected writeable + * and written. However, exec discovers later that it can't be + * unmapped. So, just set the highest address to be checked to just + * below it. This might waste some address space on 4G/4G 32-bit + * hosts, but shouldn't hurt otherwise. + */ + unsigned long top = 0xffffd000 >> UM_KERN_PAGE_SHIFT; + unsigned long test, original; + + printf("Locating the bottom of the address space ... "); + fflush(stdout); + + /* + * We're going to be longjmping out of the signal handler, so + * SA_DEFER needs to be set. + */ + sa.sa_handler = segfault; + sigemptyset(&sa.sa_mask); + sa.sa_flags = SA_NODEFER; + if (sigaction(SIGSEGV, &sa, &old)) { + perror("os_get_top_address"); + exit(1); + } + + /* Manually scan the address space, bottom-up, until we find + * the first valid page (or run out of them). + */ + for (bottom = 0; bottom < top; bottom++) { + if (page_ok(bottom)) + break; + } + + /* If we've got this far, we ran out of pages. */ + if (bottom == top) { + fprintf(stderr, "Unable to determine bottom of address " + "space.\n"); + exit(1); + } + + printf("0x%x\n", bottom << UM_KERN_PAGE_SHIFT); + printf("Locating the top of the address space ... "); + fflush(stdout); + + original = bottom; + + /* This could happen with a 4G/4G split */ + if (page_ok(top)) + goto out; + + do { + test = bottom + (top - bottom) / 2; + if (page_ok(test)) + bottom = test; + else + top = test; + } while (top - bottom > 1); + +out: + /* Restore the old SIGSEGV handling */ + if (sigaction(SIGSEGV, &old, NULL)) { + perror("os_get_top_address"); + exit(1); + } + top <<= UM_KERN_PAGE_SHIFT; + printf("0x%x\n", top); + + return top; +} + +#else + +unsigned long os_get_top_address(void) +{ + /* The old value of CONFIG_TOP_ADDR */ + return 0x7fc0000000; +} + +#endif diff --git a/arch/x86/um/os-Linux/tls.c b/arch/x86/um/os-Linux/tls.c new file mode 100644 index 000000000000..82276b6071af --- /dev/null +++ b/arch/x86/um/os-Linux/tls.c @@ -0,0 +1,67 @@ +#include <errno.h> +#include <linux/unistd.h> + +#include <sys/ptrace.h> +#include <sys/syscall.h> +#include <unistd.h> + +#include "sysdep/tls.h" + +#ifndef PTRACE_GET_THREAD_AREA +#define PTRACE_GET_THREAD_AREA 25 +#endif + +#ifndef PTRACE_SET_THREAD_AREA +#define PTRACE_SET_THREAD_AREA 26 +#endif + +/* Checks whether host supports TLS, and sets *tls_min according to the value + * valid on the host. + * i386 host have it == 6; x86_64 host have it == 12, for i386 emulation. */ +void check_host_supports_tls(int *supports_tls, int *tls_min) +{ + /* Values for x86 and x86_64.*/ + int val[] = {GDT_ENTRY_TLS_MIN_I386, GDT_ENTRY_TLS_MIN_X86_64}; + int i; + + for (i = 0; i < ARRAY_SIZE(val); i++) { + user_desc_t info; + info.entry_number = val[i]; + + if (syscall(__NR_get_thread_area, &info) == 0) { + *tls_min = val[i]; + *supports_tls = 1; + return; + } else { + if (errno == EINVAL) + continue; + else if (errno == ENOSYS) + *supports_tls = 0; + return; + } + } + + *supports_tls = 0; +} + +int os_set_thread_area(user_desc_t *info, int pid) +{ + int ret; + + ret = ptrace(PTRACE_SET_THREAD_AREA, pid, info->entry_number, + (unsigned long) info); + if (ret < 0) + ret = -errno; + return ret; +} + +int os_get_thread_area(user_desc_t *info, int pid) +{ + int ret; + + ret = ptrace(PTRACE_GET_THREAD_AREA, pid, info->entry_number, + (unsigned long) info); + if (ret < 0) + ret = -errno; + return ret; +} diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c new file mode 100644 index 000000000000..3b949daa095c --- /dev/null +++ b/arch/x86/um/ptrace_32.c @@ -0,0 +1,273 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "linux/mm.h" +#include "linux/sched.h" +#include "asm/uaccess.h" +#include "skas.h" + +extern int arch_switch_tls(struct task_struct *to); + +void arch_switch_to(struct task_struct *to) +{ + int err = arch_switch_tls(to); + if (!err) + return; + + if (err != -EINVAL) + printk(KERN_WARNING "arch_switch_tls failed, errno %d, " + "not EINVAL\n", -err); + else + printk(KERN_WARNING "arch_switch_tls failed, errno = EINVAL\n"); +} + +int is_syscall(unsigned long addr) +{ + unsigned short instr; + int n; + + n = copy_from_user(&instr, (void __user *) addr, sizeof(instr)); + if (n) { + /* access_process_vm() grants access to vsyscall and stub, + * while copy_from_user doesn't. Maybe access_process_vm is + * slow, but that doesn't matter, since it will be called only + * in case of singlestepping, if copy_from_user failed. + */ + n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + if (n != sizeof(instr)) { + printk(KERN_ERR "is_syscall : failed to read " + "instruction from 0x%lx\n", addr); + return 1; + } + } + /* int 0x80 or sysenter */ + return (instr == 0x80cd) || (instr == 0x340f); +} + +/* determines which flags the user has access to. */ +/* 1 = access 0 = no access */ +#define FLAG_MASK 0x00044dd5 + +static const int reg_offsets[] = { + [EBX] = HOST_BX, + [ECX] = HOST_CX, + [EDX] = HOST_DX, + [ESI] = HOST_SI, + [EDI] = HOST_DI, + [EBP] = HOST_BP, + [EAX] = HOST_AX, + [DS] = HOST_DS, + [ES] = HOST_ES, + [FS] = HOST_FS, + [GS] = HOST_GS, + [EIP] = HOST_IP, + [CS] = HOST_CS, + [EFL] = HOST_EFLAGS, + [UESP] = HOST_SP, + [SS] = HOST_SS, +}; + +int putreg(struct task_struct *child, int regno, unsigned long value) +{ + regno >>= 2; + switch (regno) { + case EBX: + case ECX: + case EDX: + case ESI: + case EDI: + case EBP: + case EAX: + case EIP: + case UESP: + break; + case FS: + if (value && (value & 3) != 3) + return -EIO; + break; + case GS: + if (value && (value & 3) != 3) + return -EIO; + break; + case DS: + case ES: + if (value && (value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case SS: + case CS: + if ((value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + case EFL: + value &= FLAG_MASK; + child->thread.regs.regs.gp[HOST_EFLAGS] |= value; + return 0; + case ORIG_EAX: + child->thread.regs.regs.syscall = value; + return 0; + default : + panic("Bad register in putreg() : %d\n", regno); + } + child->thread.regs.regs.gp[reg_offsets[regno]] = value; + return 0; +} + +int poke_user(struct task_struct *child, long addr, long data) +{ + if ((addr & 3) || addr < 0) + return -EIO; + + if (addr < MAX_REG_OFFSET) + return putreg(child, addr, data); + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + if ((addr == 4) || (addr == 5)) + return -EIO; + child->thread.arch.debugregs[addr] = data; + return 0; + } + return -EIO; +} + +unsigned long getreg(struct task_struct *child, int regno) +{ + unsigned long mask = ~0UL; + + regno >>= 2; + switch (regno) { + case ORIG_EAX: + return child->thread.regs.regs.syscall; + case FS: + case GS: + case DS: + case ES: + case SS: + case CS: + mask = 0xffff; + break; + case EIP: + case UESP: + case EAX: + case EBX: + case ECX: + case EDX: + case ESI: + case EDI: + case EBP: + case EFL: + break; + default: + panic("Bad register in getreg() : %d\n", regno); + } + return mask & child->thread.regs.regs.gp[reg_offsets[regno]]; +} + +/* read the word at location addr in the USER area. */ +int peek_user(struct task_struct *child, long addr, long data) +{ + unsigned long tmp; + + if ((addr & 3) || addr < 0) + return -EIO; + + tmp = 0; /* Default return condition */ + if (addr < MAX_REG_OFFSET) { + tmp = getreg(child, addr); + } + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + tmp = child->thread.arch.debugregs[addr]; + } + return put_user(tmp, (unsigned long __user *) data); +} + +static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_i387_struct fpregs; + + err = save_fp_registers(userspace_pid[cpu], (unsigned long *) &fpregs); + if (err) + return err; + + n = copy_to_user(buf, &fpregs, sizeof(fpregs)); + if(n > 0) + return -EFAULT; + + return n; +} + +static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_i387_struct fpregs; + + n = copy_from_user(&fpregs, buf, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return restore_fp_registers(userspace_pid[cpu], + (unsigned long *) &fpregs); +} + +static int get_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) +{ + int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_fxsr_struct fpregs; + + err = save_fpx_registers(userspace_pid[cpu], (unsigned long *) &fpregs); + if (err) + return err; + + n = copy_to_user(buf, &fpregs, sizeof(fpregs)); + if(n > 0) + return -EFAULT; + + return n; +} + +static int set_fpxregs(struct user_fxsr_struct __user *buf, struct task_struct *child) +{ + int n, cpu = ((struct thread_info *) child->stack)->cpu; + struct user_fxsr_struct fpregs; + + n = copy_from_user(&fpregs, buf, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return restore_fpx_registers(userspace_pid[cpu], + (unsigned long *) &fpregs); +} + +long subarch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret = -EIO; + void __user *datap = (void __user *) data; + switch (request) { + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + ret = get_fpregs(datap, child); + break; + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + ret = set_fpregs(datap, child); + break; + case PTRACE_GETFPXREGS: /* Get the child FPU state. */ + ret = get_fpxregs(datap, child); + break; + case PTRACE_SETFPXREGS: /* Set the child FPU state. */ + ret = set_fpxregs(datap, child); + break; + default: + ret = -EIO; + } + return ret; +} diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c new file mode 100644 index 000000000000..3b52bf0b418a --- /dev/null +++ b/arch/x86/um/ptrace_64.c @@ -0,0 +1,271 @@ +/* + * Copyright 2003 PathScale, Inc. + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * + * Licensed under the GPL + */ + +#include <linux/mm.h> +#include <linux/sched.h> +#include <linux/errno.h> +#define __FRAME_OFFSETS +#include <asm/ptrace.h> +#include <asm/uaccess.h> + +/* + * determines which flags the user has access to. + * 1 = access 0 = no access + */ +#define FLAG_MASK 0x44dd5UL + +static const int reg_offsets[] = +{ + [R8 >> 3] = HOST_R8, + [R9 >> 3] = HOST_R9, + [R10 >> 3] = HOST_R10, + [R11 >> 3] = HOST_R11, + [R12 >> 3] = HOST_R12, + [R13 >> 3] = HOST_R13, + [R14 >> 3] = HOST_R14, + [R15 >> 3] = HOST_R15, + [RIP >> 3] = HOST_IP, + [RSP >> 3] = HOST_SP, + [RAX >> 3] = HOST_AX, + [RBX >> 3] = HOST_BX, + [RCX >> 3] = HOST_CX, + [RDX >> 3] = HOST_DX, + [RSI >> 3] = HOST_SI, + [RDI >> 3] = HOST_DI, + [RBP >> 3] = HOST_BP, + [CS >> 3] = HOST_CS, + [SS >> 3] = HOST_SS, + [FS_BASE >> 3] = HOST_FS_BASE, + [GS_BASE >> 3] = HOST_GS_BASE, + [DS >> 3] = HOST_DS, + [ES >> 3] = HOST_ES, + [FS >> 3] = HOST_FS, + [GS >> 3] = HOST_GS, + [EFLAGS >> 3] = HOST_EFLAGS, + [ORIG_RAX >> 3] = HOST_ORIG_AX, +}; + +int putreg(struct task_struct *child, int regno, unsigned long value) +{ +#ifdef TIF_IA32 + /* + * Some code in the 64bit emulation may not be 64bit clean. + * Don't take any chances. + */ + if (test_tsk_thread_flag(child, TIF_IA32)) + value &= 0xffffffff; +#endif + switch (regno) { + case R8: + case R9: + case R10: + case R11: + case R12: + case R13: + case R14: + case R15: + case RIP: + case RSP: + case RAX: + case RBX: + case RCX: + case RDX: + case RSI: + case RDI: + case RBP: + case ORIG_RAX: + break; + + case FS: + case GS: + case DS: + case ES: + case SS: + case CS: + if (value && (value & 3) != 3) + return -EIO; + value &= 0xffff; + break; + + case FS_BASE: + case GS_BASE: + if (!((value >> 48) == 0 || (value >> 48) == 0xffff)) + return -EIO; + break; + + case EFLAGS: + value &= FLAG_MASK; + child->thread.regs.regs.gp[HOST_EFLAGS] |= value; + return 0; + + default: + panic("Bad register in putreg(): %d\n", regno); + } + + child->thread.regs.regs.gp[reg_offsets[regno >> 3]] = value; + return 0; +} + +int poke_user(struct task_struct *child, long addr, long data) +{ + if ((addr & 3) || addr < 0) + return -EIO; + + if (addr < MAX_REG_OFFSET) + return putreg(child, addr, data); + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + if ((addr == 4) || (addr == 5)) + return -EIO; + child->thread.arch.debugregs[addr] = data; + return 0; + } + return -EIO; +} + +unsigned long getreg(struct task_struct *child, int regno) +{ + unsigned long mask = ~0UL; +#ifdef TIF_IA32 + if (test_tsk_thread_flag(child, TIF_IA32)) + mask = 0xffffffff; +#endif + switch (regno) { + case R8: + case R9: + case R10: + case R11: + case R12: + case R13: + case R14: + case R15: + case RIP: + case RSP: + case RAX: + case RBX: + case RCX: + case RDX: + case RSI: + case RDI: + case RBP: + case ORIG_RAX: + case EFLAGS: + case FS_BASE: + case GS_BASE: + break; + case FS: + case GS: + case DS: + case ES: + case SS: + case CS: + mask = 0xffff; + break; + default: + panic("Bad register in getreg: %d\n", regno); + } + return mask & child->thread.regs.regs.gp[reg_offsets[regno >> 3]]; +} + +int peek_user(struct task_struct *child, long addr, long data) +{ + /* read the word at location addr in the USER area. */ + unsigned long tmp; + + if ((addr & 3) || addr < 0) + return -EIO; + + tmp = 0; /* Default return condition */ + if (addr < MAX_REG_OFFSET) + tmp = getreg(child, addr); + else if ((addr >= offsetof(struct user, u_debugreg[0])) && + (addr <= offsetof(struct user, u_debugreg[7]))) { + addr -= offsetof(struct user, u_debugreg[0]); + addr = addr >> 2; + tmp = child->thread.arch.debugregs[addr]; + } + return put_user(tmp, (unsigned long *) data); +} + +/* XXX Mostly copied from sys-i386 */ +int is_syscall(unsigned long addr) +{ + unsigned short instr; + int n; + + n = copy_from_user(&instr, (void __user *) addr, sizeof(instr)); + if (n) { + /* + * access_process_vm() grants access to vsyscall and stub, + * while copy_from_user doesn't. Maybe access_process_vm is + * slow, but that doesn't matter, since it will be called only + * in case of singlestepping, if copy_from_user failed. + */ + n = access_process_vm(current, addr, &instr, sizeof(instr), 0); + if (n != sizeof(instr)) { + printk("is_syscall : failed to read instruction from " + "0x%lx\n", addr); + return 1; + } + } + /* sysenter */ + return instr == 0x050f; +} + +static int get_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int err, n, cpu = ((struct thread_info *) child->stack)->cpu; + long fpregs[HOST_FP_SIZE]; + + BUG_ON(sizeof(*buf) != sizeof(fpregs)); + err = save_fp_registers(userspace_pid[cpu], fpregs); + if (err) + return err; + + n = copy_to_user(buf, fpregs, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return n; +} + +static int set_fpregs(struct user_i387_struct __user *buf, struct task_struct *child) +{ + int n, cpu = ((struct thread_info *) child->stack)->cpu; + long fpregs[HOST_FP_SIZE]; + + BUG_ON(sizeof(*buf) != sizeof(fpregs)); + n = copy_from_user(fpregs, buf, sizeof(fpregs)); + if (n > 0) + return -EFAULT; + + return restore_fp_registers(userspace_pid[cpu], fpregs); +} + +long subarch_ptrace(struct task_struct *child, long request, + unsigned long addr, unsigned long data) +{ + int ret = -EIO; + void __user *datap = (void __user *) data; + + switch (request) { + case PTRACE_GETFPREGS: /* Get the child FPU state. */ + ret = get_fpregs(datap, child); + break; + case PTRACE_SETFPREGS: /* Set the child FPU state. */ + ret = set_fpregs(datap, child); + break; + case PTRACE_ARCH_PRCTL: + /* XXX Calls ptrace on the host - needs some SMP thinking */ + ret = arch_prctl(child, data, (void __user *) addr); + break; + } + + return ret; +} diff --git a/arch/x86/um/ptrace_user.c b/arch/x86/um/ptrace_user.c new file mode 100644 index 000000000000..3960ca1dd35a --- /dev/null +++ b/arch/x86/um/ptrace_user.c @@ -0,0 +1,21 @@ +/* + * Copyright (C) 2002 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include <errno.h> +#include "ptrace_user.h" + +int ptrace_getregs(long pid, unsigned long *regs_out) +{ + if (ptrace(PTRACE_GETREGS, pid, 0, regs_out) < 0) + return -errno; + return 0; +} + +int ptrace_setregs(long pid, unsigned long *regs) +{ + if (ptrace(PTRACE_SETREGS, pid, 0, regs) < 0) + return -errno; + return 0; +} diff --git a/arch/x86/um/setjmp_32.S b/arch/x86/um/setjmp_32.S new file mode 100644 index 000000000000..b766792c9933 --- /dev/null +++ b/arch/x86/um/setjmp_32.S @@ -0,0 +1,58 @@ +# +# arch/i386/setjmp.S +# +# setjmp/longjmp for the i386 architecture +# + +# +# The jmp_buf is assumed to contain the following, in order: +# %ebx +# %esp +# %ebp +# %esi +# %edi +# <return address> +# + + .text + .align 4 + .globl setjmp + .type setjmp, @function +setjmp: +#ifdef _REGPARM + movl %eax,%edx +#else + movl 4(%esp),%edx +#endif + popl %ecx # Return address, and adjust the stack + xorl %eax,%eax # Return value + movl %ebx,(%edx) + movl %esp,4(%edx) # Post-return %esp! + pushl %ecx # Make the call/return stack happy + movl %ebp,8(%edx) + movl %esi,12(%edx) + movl %edi,16(%edx) + movl %ecx,20(%edx) # Return address + ret + + .size setjmp,.-setjmp + + .text + .align 4 + .globl longjmp + .type longjmp, @function +longjmp: +#ifdef _REGPARM + xchgl %eax,%edx +#else + movl 4(%esp),%edx # jmp_ptr address + movl 8(%esp),%eax # Return value +#endif + movl (%edx),%ebx + movl 4(%edx),%esp + movl 8(%edx),%ebp + movl 12(%edx),%esi + movl 16(%edx),%edi + jmp *20(%edx) + + .size longjmp,.-longjmp diff --git a/arch/x86/um/setjmp_64.S b/arch/x86/um/setjmp_64.S new file mode 100644 index 000000000000..45f547b4043e --- /dev/null +++ b/arch/x86/um/setjmp_64.S @@ -0,0 +1,54 @@ +# +# arch/x86_64/setjmp.S +# +# setjmp/longjmp for the x86-64 architecture +# + +# +# The jmp_buf is assumed to contain the following, in order: +# %rbx +# %rsp (post-return) +# %rbp +# %r12 +# %r13 +# %r14 +# %r15 +# <return address> +# + + .text + .align 4 + .globl setjmp + .type setjmp, @function +setjmp: + pop %rsi # Return address, and adjust the stack + xorl %eax,%eax # Return value + movq %rbx,(%rdi) + movq %rsp,8(%rdi) # Post-return %rsp! + push %rsi # Make the call/return stack happy + movq %rbp,16(%rdi) + movq %r12,24(%rdi) + movq %r13,32(%rdi) + movq %r14,40(%rdi) + movq %r15,48(%rdi) + movq %rsi,56(%rdi) # Return address + ret + + .size setjmp,.-setjmp + + .text + .align 4 + .globl longjmp + .type longjmp, @function +longjmp: + movl %esi,%eax # Return value (int) + movq (%rdi),%rbx + movq 8(%rdi),%rsp + movq 16(%rdi),%rbp + movq 24(%rdi),%r12 + movq 32(%rdi),%r13 + movq 40(%rdi),%r14 + movq 48(%rdi),%r15 + jmp *56(%rdi) + + .size longjmp,.-longjmp diff --git a/arch/x86/um/shared/sysdep/archsetjmp.h b/arch/x86/um/shared/sysdep/archsetjmp.h new file mode 100644 index 000000000000..ff7766d28226 --- /dev/null +++ b/arch/x86/um/shared/sysdep/archsetjmp.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "archsetjmp_32.h" +#else +#include "archsetjmp_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/archsetjmp_32.h b/arch/x86/um/shared/sysdep/archsetjmp_32.h new file mode 100644 index 000000000000..0f312085ce1d --- /dev/null +++ b/arch/x86/um/shared/sysdep/archsetjmp_32.h @@ -0,0 +1,22 @@ +/* + * arch/um/include/sysdep-i386/archsetjmp.h + */ + +#ifndef _KLIBC_ARCHSETJMP_H +#define _KLIBC_ARCHSETJMP_H + +struct __jmp_buf { + unsigned int __ebx; + unsigned int __esp; + unsigned int __ebp; + unsigned int __esi; + unsigned int __edi; + unsigned int __eip; +}; + +typedef struct __jmp_buf jmp_buf[1]; + +#define JB_IP __eip +#define JB_SP __esp + +#endif /* _SETJMP_H */ diff --git a/arch/x86/um/shared/sysdep/archsetjmp_64.h b/arch/x86/um/shared/sysdep/archsetjmp_64.h new file mode 100644 index 000000000000..2af8f12ca161 --- /dev/null +++ b/arch/x86/um/shared/sysdep/archsetjmp_64.h @@ -0,0 +1,24 @@ +/* + * arch/um/include/sysdep-x86_64/archsetjmp.h + */ + +#ifndef _KLIBC_ARCHSETJMP_H +#define _KLIBC_ARCHSETJMP_H + +struct __jmp_buf { + unsigned long __rbx; + unsigned long __rsp; + unsigned long __rbp; + unsigned long __r12; + unsigned long __r13; + unsigned long __r14; + unsigned long __r15; + unsigned long __rip; +}; + +typedef struct __jmp_buf jmp_buf[1]; + +#define JB_IP __rip +#define JB_SP __rsp + +#endif /* _SETJMP_H */ diff --git a/arch/x86/um/shared/sysdep/faultinfo.h b/arch/x86/um/shared/sysdep/faultinfo.h new file mode 100644 index 000000000000..862ecb1c7781 --- /dev/null +++ b/arch/x86/um/shared/sysdep/faultinfo.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "faultinfo_32.h" +#else +#include "faultinfo_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_32.h b/arch/x86/um/shared/sysdep/faultinfo_32.h new file mode 100644 index 000000000000..a26086b8a800 --- /dev/null +++ b/arch/x86/um/shared/sysdep/faultinfo_32.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2004 Fujitsu Siemens Computers GmbH + * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com> + * Licensed under the GPL + */ + +#ifndef __FAULTINFO_I386_H +#define __FAULTINFO_I386_H + +/* this structure contains the full arch-specific faultinfo + * from the traps. + * On i386, ptrace_faultinfo unfortunately doesn't provide + * all the info, since trap_no is missing. + * All common elements are defined at the same position in + * both structures, thus making it easy to copy the + * contents without knowledge about the structure elements. + */ +struct faultinfo { + int error_code; /* in ptrace_faultinfo misleadingly called is_write */ + unsigned long cr2; /* in ptrace_faultinfo called addr */ + int trap_no; /* missing in ptrace_faultinfo */ +}; + +#define FAULT_WRITE(fi) ((fi).error_code & 2) +#define FAULT_ADDRESS(fi) ((fi).cr2) + +/* This is Page Fault */ +#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) + +/* SKAS3 has no trap_no on i386, but get_skas_faultinfo() sets it to 0. */ +#define SEGV_MAYBE_FIXABLE(fi) ((fi)->trap_no == 0 && ptrace_faultinfo) + +#define PTRACE_FULL_FAULTINFO 0 + +#endif diff --git a/arch/x86/um/shared/sysdep/faultinfo_64.h b/arch/x86/um/shared/sysdep/faultinfo_64.h new file mode 100644 index 000000000000..f811cbe15d62 --- /dev/null +++ b/arch/x86/um/shared/sysdep/faultinfo_64.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2004 Fujitsu Siemens Computers GmbH + * Author: Bodo Stroesser <bstroesser@fujitsu-siemens.com> + * Licensed under the GPL + */ + +#ifndef __FAULTINFO_X86_64_H +#define __FAULTINFO_X86_64_H + +/* this structure contains the full arch-specific faultinfo + * from the traps. + * On i386, ptrace_faultinfo unfortunately doesn't provide + * all the info, since trap_no is missing. + * All common elements are defined at the same position in + * both structures, thus making it easy to copy the + * contents without knowledge about the structure elements. + */ +struct faultinfo { + int error_code; /* in ptrace_faultinfo misleadingly called is_write */ + unsigned long cr2; /* in ptrace_faultinfo called addr */ + int trap_no; /* missing in ptrace_faultinfo */ +}; + +#define FAULT_WRITE(fi) ((fi).error_code & 2) +#define FAULT_ADDRESS(fi) ((fi).cr2) + +/* This is Page Fault */ +#define SEGV_IS_FIXABLE(fi) ((fi)->trap_no == 14) + +/* No broken SKAS API, which doesn't pass trap_no, here. */ +#define SEGV_MAYBE_FIXABLE(fi) 0 + +#define PTRACE_FULL_FAULTINFO 1 + +#endif diff --git a/arch/x86/um/shared/sysdep/kernel-offsets.h b/arch/x86/um/shared/sysdep/kernel-offsets.h new file mode 100644 index 000000000000..5868526b5eef --- /dev/null +++ b/arch/x86/um/shared/sysdep/kernel-offsets.h @@ -0,0 +1,21 @@ +#include <linux/stddef.h> +#include <linux/sched.h> +#include <linux/elf.h> +#include <linux/crypto.h> +#include <asm/mman.h> + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define STR(x) #x +#define DEFINE_STR(sym, val) asm volatile("\n->" #sym " " STR(val) " " #val: : ) + +#define BLANK() asm volatile("\n->" : : ) + +#define OFFSET(sym, str, mem) \ + DEFINE(sym, offsetof(struct str, mem)); + +void foo(void) +{ +#include <common-offsets.h> +} diff --git a/arch/x86/um/shared/sysdep/mcontext.h b/arch/x86/um/shared/sysdep/mcontext.h new file mode 100644 index 000000000000..b724c54da316 --- /dev/null +++ b/arch/x86/um/shared/sysdep/mcontext.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#ifndef __SYS_SIGCONTEXT_X86_H +#define __SYS_SIGCONTEXT_X86_H + +extern void get_regs_from_mc(struct uml_pt_regs *, mcontext_t *); + +#ifdef __i386__ + +#define GET_FAULTINFO_FROM_MC(fi, mc) \ + { \ + (fi).cr2 = (mc)->cr2; \ + (fi).error_code = (mc)->gregs[REG_ERR]; \ + (fi).trap_no = (mc)->gregs[REG_TRAPNO]; \ + } + +#else + +#define GET_FAULTINFO_FROM_MC(fi, mc) \ + { \ + (fi).cr2 = (mc)->gregs[REG_CR2]; \ + (fi).error_code = (mc)->gregs[REG_ERR]; \ + (fi).trap_no = (mc)->gregs[REG_TRAPNO]; \ + } + +#endif + +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace.h b/arch/x86/um/shared/sysdep/ptrace.h new file mode 100644 index 000000000000..711b1621747f --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "ptrace_32.h" +#else +#include "ptrace_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_32.h b/arch/x86/um/shared/sysdep/ptrace_32.h new file mode 100644 index 000000000000..befd1df32ed0 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_32.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_I386_PTRACE_H +#define __SYSDEP_I386_PTRACE_H + +#include <generated/user_constants.h> +#include "sysdep/faultinfo.h" + +#define MAX_REG_NR (UM_FRAME_SIZE / sizeof(unsigned long)) +#define MAX_REG_OFFSET (UM_FRAME_SIZE) + +static inline void update_debugregs(int seq) {} + +/* syscall emulation path in ptrace */ + +#ifndef PTRACE_SYSEMU +#define PTRACE_SYSEMU 31 +#endif + +void set_using_sysemu(int value); +int get_using_sysemu(void); +extern int sysemu_supported; + +#define REGS_IP(r) ((r)[HOST_IP]) +#define REGS_SP(r) ((r)[HOST_SP]) +#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) +#define REGS_EAX(r) ((r)[HOST_AX]) +#define REGS_EBX(r) ((r)[HOST_BX]) +#define REGS_ECX(r) ((r)[HOST_CX]) +#define REGS_EDX(r) ((r)[HOST_DX]) +#define REGS_ESI(r) ((r)[HOST_SI]) +#define REGS_EDI(r) ((r)[HOST_DI]) +#define REGS_EBP(r) ((r)[HOST_BP]) +#define REGS_CS(r) ((r)[HOST_CS]) +#define REGS_SS(r) ((r)[HOST_SS]) +#define REGS_DS(r) ((r)[HOST_DS]) +#define REGS_ES(r) ((r)[HOST_ES]) +#define REGS_FS(r) ((r)[HOST_FS]) +#define REGS_GS(r) ((r)[HOST_GS]) + +#define REGS_SET_SYSCALL_RETURN(r, res) REGS_EAX(r) = (res) + +#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) +#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) + +#ifndef PTRACE_SYSEMU_SINGLESTEP +#define PTRACE_SYSEMU_SINGLESTEP 32 +#endif + +struct uml_pt_regs { + unsigned long gp[MAX_REG_NR]; + unsigned long fp[HOST_FPX_SIZE]; + struct faultinfo faultinfo; + long syscall; + int is_user; +}; + +#define EMPTY_UML_PT_REGS { } + +#define UPT_IP(r) REGS_IP((r)->gp) +#define UPT_SP(r) REGS_SP((r)->gp) +#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) +#define UPT_EAX(r) REGS_EAX((r)->gp) +#define UPT_EBX(r) REGS_EBX((r)->gp) +#define UPT_ECX(r) REGS_ECX((r)->gp) +#define UPT_EDX(r) REGS_EDX((r)->gp) +#define UPT_ESI(r) REGS_ESI((r)->gp) +#define UPT_EDI(r) REGS_EDI((r)->gp) +#define UPT_EBP(r) REGS_EBP((r)->gp) +#define UPT_ORIG_EAX(r) ((r)->syscall) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_SS(r) REGS_SS((r)->gp) +#define UPT_DS(r) REGS_DS((r)->gp) +#define UPT_ES(r) REGS_ES((r)->gp) +#define UPT_FS(r) REGS_FS((r)->gp) +#define UPT_GS(r) REGS_GS((r)->gp) + +#define UPT_SYSCALL_ARG1(r) UPT_EBX(r) +#define UPT_SYSCALL_ARG2(r) UPT_ECX(r) +#define UPT_SYSCALL_ARG3(r) UPT_EDX(r) +#define UPT_SYSCALL_ARG4(r) UPT_ESI(r) +#define UPT_SYSCALL_ARG5(r) UPT_EDI(r) +#define UPT_SYSCALL_ARG6(r) UPT_EBP(r) + +extern int user_context(unsigned long sp); + +#define UPT_IS_USER(r) ((r)->is_user) + +struct syscall_args { + unsigned long args[6]; +}; + +#define SYSCALL_ARGS(r) ((struct syscall_args) \ + { .args = { UPT_SYSCALL_ARG1(r), \ + UPT_SYSCALL_ARG2(r), \ + UPT_SYSCALL_ARG3(r), \ + UPT_SYSCALL_ARG4(r), \ + UPT_SYSCALL_ARG5(r), \ + UPT_SYSCALL_ARG6(r) } } ) + +#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) + +#define UPT_ORIG_SYSCALL(r) UPT_EAX(r) +#define UPT_SYSCALL_NR(r) UPT_ORIG_EAX(r) +#define UPT_SYSCALL_RET(r) UPT_EAX(r) + +#define UPT_FAULTINFO(r) (&(r)->faultinfo) + +extern void arch_init_registers(int pid); + +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_64.h b/arch/x86/um/shared/sysdep/ptrace_64.h new file mode 100644 index 000000000000..031edc53ac57 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_64.h @@ -0,0 +1,157 @@ +/* + * Copyright 2003 PathScale, Inc. + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * + * Licensed under the GPL + */ + +#ifndef __SYSDEP_X86_64_PTRACE_H +#define __SYSDEP_X86_64_PTRACE_H + +#include <generated/user_constants.h> +#include "sysdep/faultinfo.h" + +#define MAX_REG_OFFSET (UM_FRAME_SIZE) +#define MAX_REG_NR ((MAX_REG_OFFSET) / sizeof(unsigned long)) + +#define REGS_IP(r) ((r)[HOST_IP]) +#define REGS_SP(r) ((r)[HOST_SP]) + +#define REGS_RBX(r) ((r)[HOST_BX]) +#define REGS_RCX(r) ((r)[HOST_CX]) +#define REGS_RDX(r) ((r)[HOST_DX]) +#define REGS_RSI(r) ((r)[HOST_SI]) +#define REGS_RDI(r) ((r)[HOST_DI]) +#define REGS_RBP(r) ((r)[HOST_BP]) +#define REGS_RAX(r) ((r)[HOST_AX]) +#define REGS_R8(r) ((r)[HOST_R8]) +#define REGS_R9(r) ((r)[HOST_R9]) +#define REGS_R10(r) ((r)[HOST_R10]) +#define REGS_R11(r) ((r)[HOST_R11]) +#define REGS_R12(r) ((r)[HOST_R12]) +#define REGS_R13(r) ((r)[HOST_R13]) +#define REGS_R14(r) ((r)[HOST_R14]) +#define REGS_R15(r) ((r)[HOST_R15]) +#define REGS_CS(r) ((r)[HOST_CS]) +#define REGS_EFLAGS(r) ((r)[HOST_EFLAGS]) +#define REGS_SS(r) ((r)[HOST_SS]) + +#define HOST_FS_BASE 21 +#define HOST_GS_BASE 22 +#define HOST_DS 23 +#define HOST_ES 24 +#define HOST_FS 25 +#define HOST_GS 26 + +/* Also defined in asm/ptrace-x86_64.h, but not in libc headers. So, these + * are already defined for kernel code, but not for userspace code. + */ +#ifndef FS_BASE +/* These aren't defined in ptrace.h, but exist in struct user_regs_struct, + * which is what x86_64 ptrace actually uses. + */ +#define FS_BASE (HOST_FS_BASE * sizeof(long)) +#define GS_BASE (HOST_GS_BASE * sizeof(long)) +#define DS (HOST_DS * sizeof(long)) +#define ES (HOST_ES * sizeof(long)) +#define FS (HOST_FS * sizeof(long)) +#define GS (HOST_GS * sizeof(long)) +#endif + +#define REGS_FS_BASE(r) ((r)[HOST_FS_BASE]) +#define REGS_GS_BASE(r) ((r)[HOST_GS_BASE]) +#define REGS_DS(r) ((r)[HOST_DS]) +#define REGS_ES(r) ((r)[HOST_ES]) +#define REGS_FS(r) ((r)[HOST_FS]) +#define REGS_GS(r) ((r)[HOST_GS]) + +#define REGS_ORIG_RAX(r) ((r)[HOST_ORIG_AX]) + +#define REGS_SET_SYSCALL_RETURN(r, res) REGS_RAX(r) = (res) + +#define IP_RESTART_SYSCALL(ip) ((ip) -= 2) +#define REGS_RESTART_SYSCALL(r) IP_RESTART_SYSCALL(REGS_IP(r)) + +#define REGS_FAULT_ADDR(r) ((r)->fault_addr) + +#define REGS_FAULT_WRITE(r) FAULT_WRITE((r)->fault_type) + +#define REGS_TRAP(r) ((r)->trap_type) + +#define REGS_ERR(r) ((r)->fault_type) + +struct uml_pt_regs { + unsigned long gp[MAX_REG_NR]; + unsigned long fp[HOST_FP_SIZE]; + struct faultinfo faultinfo; + long syscall; + int is_user; +}; + +#define EMPTY_UML_PT_REGS { } + +#define UPT_RBX(r) REGS_RBX((r)->gp) +#define UPT_RCX(r) REGS_RCX((r)->gp) +#define UPT_RDX(r) REGS_RDX((r)->gp) +#define UPT_RSI(r) REGS_RSI((r)->gp) +#define UPT_RDI(r) REGS_RDI((r)->gp) +#define UPT_RBP(r) REGS_RBP((r)->gp) +#define UPT_RAX(r) REGS_RAX((r)->gp) +#define UPT_R8(r) REGS_R8((r)->gp) +#define UPT_R9(r) REGS_R9((r)->gp) +#define UPT_R10(r) REGS_R10((r)->gp) +#define UPT_R11(r) REGS_R11((r)->gp) +#define UPT_R12(r) REGS_R12((r)->gp) +#define UPT_R13(r) REGS_R13((r)->gp) +#define UPT_R14(r) REGS_R14((r)->gp) +#define UPT_R15(r) REGS_R15((r)->gp) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_FS_BASE(r) REGS_FS_BASE((r)->gp) +#define UPT_FS(r) REGS_FS((r)->gp) +#define UPT_GS_BASE(r) REGS_GS_BASE((r)->gp) +#define UPT_GS(r) REGS_GS((r)->gp) +#define UPT_DS(r) REGS_DS((r)->gp) +#define UPT_ES(r) REGS_ES((r)->gp) +#define UPT_CS(r) REGS_CS((r)->gp) +#define UPT_SS(r) REGS_SS((r)->gp) +#define UPT_ORIG_RAX(r) REGS_ORIG_RAX((r)->gp) + +#define UPT_IP(r) REGS_IP((r)->gp) +#define UPT_SP(r) REGS_SP((r)->gp) + +#define UPT_EFLAGS(r) REGS_EFLAGS((r)->gp) +#define UPT_SYSCALL_NR(r) ((r)->syscall) +#define UPT_SYSCALL_RET(r) UPT_RAX(r) + +extern int user_context(unsigned long sp); + +#define UPT_IS_USER(r) ((r)->is_user) + +#define UPT_SYSCALL_ARG1(r) UPT_RDI(r) +#define UPT_SYSCALL_ARG2(r) UPT_RSI(r) +#define UPT_SYSCALL_ARG3(r) UPT_RDX(r) +#define UPT_SYSCALL_ARG4(r) UPT_R10(r) +#define UPT_SYSCALL_ARG5(r) UPT_R8(r) +#define UPT_SYSCALL_ARG6(r) UPT_R9(r) + +struct syscall_args { + unsigned long args[6]; +}; + +#define SYSCALL_ARGS(r) ((struct syscall_args) \ + { .args = { UPT_SYSCALL_ARG1(r), \ + UPT_SYSCALL_ARG2(r), \ + UPT_SYSCALL_ARG3(r), \ + UPT_SYSCALL_ARG4(r), \ + UPT_SYSCALL_ARG5(r), \ + UPT_SYSCALL_ARG6(r) } } ) + +#define UPT_RESTART_SYSCALL(r) REGS_RESTART_SYSCALL((r)->gp) + +#define UPT_FAULTINFO(r) (&(r)->faultinfo) + +static inline void arch_init_registers(int pid) +{ +} + +#endif diff --git a/arch/x86/um/shared/sysdep/ptrace_user.h b/arch/x86/um/shared/sysdep/ptrace_user.h new file mode 100644 index 000000000000..16cd6b5e71f7 --- /dev/null +++ b/arch/x86/um/shared/sysdep/ptrace_user.h @@ -0,0 +1,27 @@ +#include <generated/user_constants.h> + +#define PT_OFFSET(r) ((r) * sizeof(long)) + +#define PT_SYSCALL_NR(regs) ((regs)[HOST_ORIG_AX]) +#define PT_SYSCALL_NR_OFFSET PT_OFFSET(HOST_ORIG_AX) + +#define PT_SYSCALL_RET_OFFSET PT_OFFSET(HOST_AX) + +#define REGS_IP_INDEX HOST_IP +#define REGS_SP_INDEX HOST_SP + +#ifdef __i386__ +#define FP_SIZE ((HOST_FPX_SIZE > HOST_FP_SIZE) ? HOST_FPX_SIZE : HOST_FP_SIZE) +#else +#define FP_SIZE HOST_FP_SIZE + +/* + * x86_64 FC3 doesn't define this in /usr/include/linux/ptrace.h even though + * it's defined in the kernel's include/linux/ptrace.h. Additionally, use the + * 2.4 name and value for 2.4 host compatibility. + */ +#ifndef PTRACE_OLDSETOPTIONS +#define PTRACE_OLDSETOPTIONS 21 +#endif + +#endif diff --git a/arch/x86/um/shared/sysdep/skas_ptrace.h b/arch/x86/um/shared/sysdep/skas_ptrace.h new file mode 100644 index 000000000000..453febe98993 --- /dev/null +++ b/arch/x86/um/shared/sysdep/skas_ptrace.h @@ -0,0 +1,22 @@ +/* + * Copyright (C) 2000, 2001, 2002 Jeff Dike (jdike@karaya.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_X86_SKAS_PTRACE_H +#define __SYSDEP_X86_SKAS_PTRACE_H + +struct ptrace_faultinfo { + int is_write; + unsigned long addr; +}; + +struct ptrace_ldt { + int func; + void *ptr; + unsigned long bytecount; +}; + +#define PTRACE_LDT 54 + +#endif diff --git a/arch/x86/um/shared/sysdep/stub.h b/arch/x86/um/shared/sysdep/stub.h new file mode 100644 index 000000000000..bd161e300102 --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub.h @@ -0,0 +1,14 @@ +#include <asm/unistd.h> +#include <sys/mman.h> +#include <signal.h> +#include "as-layout.h" +#include "stub-data.h" + +#ifdef __i386__ +#include "stub_32.h" +#else +#include "stub_64.h" +#endif + +extern void stub_segv_handler(int, siginfo_t *, void *); +extern void stub_clone_handler(void); diff --git a/arch/x86/um/shared/sysdep/stub_32.h b/arch/x86/um/shared/sysdep/stub_32.h new file mode 100644 index 000000000000..51fd256c75f0 --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub_32.h @@ -0,0 +1,93 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_STUB_H +#define __SYSDEP_STUB_H + +#include <asm/ptrace.h> + +#define STUB_SYSCALL_RET EAX +#define STUB_MMAP_NR __NR_mmap2 +#define MMAP_OFFSET(o) ((o) >> UM_KERN_PAGE_SHIFT) + +static inline long stub_syscall0(long syscall) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall)); + + return ret; +} + +static inline long stub_syscall1(long syscall, long arg1) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1)); + + return ret; +} + +static inline long stub_syscall2(long syscall, long arg1, long arg2) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2)); + + return ret; +} + +static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2), "d" (arg3)); + + return ret; +} + +static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3, + long arg4) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2), "d" (arg3), "S" (arg4)); + + return ret; +} + +static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3, + long arg4, long arg5) +{ + long ret; + + __asm__ volatile ("int $0x80" : "=a" (ret) : "0" (syscall), "b" (arg1), + "c" (arg2), "d" (arg3), "S" (arg4), "D" (arg5)); + + return ret; +} + +static inline void trap_myself(void) +{ + __asm("int3"); +} + +static inline void remap_stack(int fd, unsigned long offset) +{ + __asm__ volatile ("movl %%eax,%%ebp ; movl %0,%%eax ; int $0x80 ;" + "movl %7, %%ebx ; movl %%eax, (%%ebx)" + : : "g" (STUB_MMAP_NR), "b" (STUB_DATA), + "c" (UM_KERN_PAGE_SIZE), + "d" (PROT_READ | PROT_WRITE), + "S" (MAP_FIXED | MAP_SHARED), "D" (fd), + "a" (offset), + "i" (&((struct stub_data *) STUB_DATA)->err) + : "memory"); +} + +#endif diff --git a/arch/x86/um/shared/sysdep/stub_64.h b/arch/x86/um/shared/sysdep/stub_64.h new file mode 100644 index 000000000000..994df93c5ed3 --- /dev/null +++ b/arch/x86/um/shared/sysdep/stub_64.h @@ -0,0 +1,99 @@ +/* + * Copyright (C) 2004 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#ifndef __SYSDEP_STUB_H +#define __SYSDEP_STUB_H + +#include <sysdep/ptrace_user.h> + +#define STUB_SYSCALL_RET PT_INDEX(RAX) +#define STUB_MMAP_NR __NR_mmap +#define MMAP_OFFSET(o) (o) + +#define __syscall_clobber "r11","rcx","memory" +#define __syscall "syscall" + +static inline long stub_syscall0(long syscall) +{ + long ret; + + __asm__ volatile (__syscall + : "=a" (ret) + : "0" (syscall) : __syscall_clobber ); + + return ret; +} + +static inline long stub_syscall2(long syscall, long arg1, long arg2) +{ + long ret; + + __asm__ volatile (__syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2) : __syscall_clobber ); + + return ret; +} + +static inline long stub_syscall3(long syscall, long arg1, long arg2, long arg3) +{ + long ret; + + __asm__ volatile (__syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3) + : __syscall_clobber ); + + return ret; +} + +static inline long stub_syscall4(long syscall, long arg1, long arg2, long arg3, + long arg4) +{ + long ret; + + __asm__ volatile ("movq %5,%%r10 ; " __syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3), + "g" (arg4) + : __syscall_clobber, "r10" ); + + return ret; +} + +static inline long stub_syscall5(long syscall, long arg1, long arg2, long arg3, + long arg4, long arg5) +{ + long ret; + + __asm__ volatile ("movq %5,%%r10 ; movq %6,%%r8 ; " __syscall + : "=a" (ret) + : "0" (syscall), "D" (arg1), "S" (arg2), "d" (arg3), + "g" (arg4), "g" (arg5) + : __syscall_clobber, "r10", "r8" ); + + return ret; +} + +static inline void trap_myself(void) +{ + __asm("int3"); +} + +static inline void remap_stack(long fd, unsigned long offset) +{ + __asm__ volatile ("movq %4,%%r10 ; movq %5,%%r8 ; " + "movq %6, %%r9; " __syscall "; movq %7, %%rbx ; " + "movq %%rax, (%%rbx)": + : "a" (STUB_MMAP_NR), "D" (STUB_DATA), + "S" (UM_KERN_PAGE_SIZE), + "d" (PROT_READ | PROT_WRITE), + "g" (MAP_FIXED | MAP_SHARED), "g" (fd), + "g" (offset), + "i" (&((struct stub_data *) STUB_DATA)->err) + : __syscall_clobber, "r10", "r8", "r9" ); +} + +#endif diff --git a/arch/x86/um/shared/sysdep/syscalls.h b/arch/x86/um/shared/sysdep/syscalls.h new file mode 100644 index 000000000000..bd9a89b67e41 --- /dev/null +++ b/arch/x86/um/shared/sysdep/syscalls.h @@ -0,0 +1,5 @@ +#ifdef __i386__ +#include "syscalls_32.h" +#else +#include "syscalls_64.h" +#endif diff --git a/arch/x86/um/shared/sysdep/syscalls_32.h b/arch/x86/um/shared/sysdep/syscalls_32.h new file mode 100644 index 000000000000..05cb796aecb5 --- /dev/null +++ b/arch/x86/um/shared/sysdep/syscalls_32.h @@ -0,0 +1,20 @@ +/* + * Copyright (C) 2000 - 2008 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "asm/unistd.h" +#include "sysdep/ptrace.h" + +typedef long syscall_handler_t(struct pt_regs); + +/* Not declared on x86, incompatible declarations on x86_64, so these have + * to go here rather than in sys_call_table.c + */ +extern syscall_handler_t sys_rt_sigaction; + +extern syscall_handler_t *sys_call_table[]; + +#define EXECUTE_SYSCALL(syscall, regs) \ + ((long (*)(struct syscall_args)) \ + (*sys_call_table[syscall]))(SYSCALL_ARGS(®s->regs)) diff --git a/arch/x86/um/shared/sysdep/syscalls_64.h b/arch/x86/um/shared/sysdep/syscalls_64.h new file mode 100644 index 000000000000..8a7d5e1da98e --- /dev/null +++ b/arch/x86/um/shared/sysdep/syscalls_64.h @@ -0,0 +1,32 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#ifndef __SYSDEP_X86_64_SYSCALLS_H__ +#define __SYSDEP_X86_64_SYSCALLS_H__ + +#include <linux/msg.h> +#include <linux/shm.h> + +typedef long syscall_handler_t(void); + +extern syscall_handler_t *sys_call_table[]; + +#define EXECUTE_SYSCALL(syscall, regs) \ + (((long (*)(long, long, long, long, long, long)) \ + (*sys_call_table[syscall]))(UPT_SYSCALL_ARG1(®s->regs), \ + UPT_SYSCALL_ARG2(®s->regs), \ + UPT_SYSCALL_ARG3(®s->regs), \ + UPT_SYSCALL_ARG4(®s->regs), \ + UPT_SYSCALL_ARG5(®s->regs), \ + UPT_SYSCALL_ARG6(®s->regs))) + +extern long old_mmap(unsigned long addr, unsigned long len, + unsigned long prot, unsigned long flags, + unsigned long fd, unsigned long pgoff); +extern syscall_handler_t sys_modify_ldt; +extern syscall_handler_t sys_arch_prctl; + +#endif diff --git a/arch/x86/um/shared/sysdep/tls.h b/arch/x86/um/shared/sysdep/tls.h new file mode 100644 index 000000000000..27cce00c6b30 --- /dev/null +++ b/arch/x86/um/shared/sysdep/tls.h @@ -0,0 +1,39 @@ +#ifndef _SYSDEP_TLS_H +#define _SYSDEP_TLS_H + +# ifndef __KERNEL__ + +/* Change name to avoid conflicts with the original one from <asm/ldt.h>, which + * may be named user_desc (but in 2.4 and in header matching its API was named + * modify_ldt_ldt_s). */ + +typedef struct um_dup_user_desc { + unsigned int entry_number; + unsigned int base_addr; + unsigned int limit; + unsigned int seg_32bit:1; + unsigned int contents:2; + unsigned int read_exec_only:1; + unsigned int limit_in_pages:1; + unsigned int seg_not_present:1; + unsigned int useable:1; +#ifdef __x86_64__ + unsigned int lm:1; +#endif +} user_desc_t; + +# else /* __KERNEL__ */ + +typedef struct user_desc user_desc_t; + +# endif /* __KERNEL__ */ + +extern int os_set_thread_area(user_desc_t *info, int pid); +extern int os_get_thread_area(user_desc_t *info, int pid); + +#ifdef __i386__ +#define GDT_ENTRY_TLS_MIN_I386 6 +#define GDT_ENTRY_TLS_MIN_X86_64 12 +#endif + +#endif /* _SYSDEP_TLS_H */ diff --git a/arch/x86/um/signal.c b/arch/x86/um/signal.c new file mode 100644 index 000000000000..4883b9546016 --- /dev/null +++ b/arch/x86/um/signal.c @@ -0,0 +1,624 @@ +/* + * Copyright (C) 2003 PathScale, Inc. + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + + +#include <linux/personality.h> +#include <linux/ptrace.h> +#include <linux/kernel.h> +#include <asm/unistd.h> +#include <asm/uaccess.h> +#include <asm/ucontext.h> +#include "frame_kern.h" +#include "skas.h" + +#ifdef CONFIG_X86_32 + +/* + * FPU tag word conversions. + */ + +static inline unsigned short twd_i387_to_fxsr(unsigned short twd) +{ + unsigned int tmp; /* to avoid 16 bit prefixes in the code */ + + /* Transform each pair of bits into 01 (valid) or 00 (empty) */ + tmp = ~twd; + tmp = (tmp | (tmp>>1)) & 0x5555; /* 0V0V0V0V0V0V0V0V */ + /* and move the valid bits to the lower byte. */ + tmp = (tmp | (tmp >> 1)) & 0x3333; /* 00VV00VV00VV00VV */ + tmp = (tmp | (tmp >> 2)) & 0x0f0f; /* 0000VVVV0000VVVV */ + tmp = (tmp | (tmp >> 4)) & 0x00ff; /* 00000000VVVVVVVV */ + return tmp; +} + +static inline unsigned long twd_fxsr_to_i387(struct user_fxsr_struct *fxsave) +{ + struct _fpxreg *st = NULL; + unsigned long twd = (unsigned long) fxsave->twd; + unsigned long tag; + unsigned long ret = 0xffff0000; + int i; + +#define FPREG_ADDR(f, n) ((char *)&(f)->st_space + (n) * 16) + + for (i = 0; i < 8; i++) { + if (twd & 0x1) { + st = (struct _fpxreg *) FPREG_ADDR(fxsave, i); + + switch (st->exponent & 0x7fff) { + case 0x7fff: + tag = 2; /* Special */ + break; + case 0x0000: + if ( !st->significand[0] && + !st->significand[1] && + !st->significand[2] && + !st->significand[3] ) { + tag = 1; /* Zero */ + } else { + tag = 2; /* Special */ + } + break; + default: + if (st->significand[3] & 0x8000) { + tag = 0; /* Valid */ + } else { + tag = 2; /* Special */ + } + break; + } + } else { + tag = 3; /* Empty */ + } + ret |= (tag << (2 * i)); + twd = twd >> 1; + } + return ret; +} + +static int convert_fxsr_to_user(struct _fpstate __user *buf, + struct user_fxsr_struct *fxsave) +{ + unsigned long env[7]; + struct _fpreg __user *to; + struct _fpxreg *from; + int i; + + env[0] = (unsigned long)fxsave->cwd | 0xffff0000ul; + env[1] = (unsigned long)fxsave->swd | 0xffff0000ul; + env[2] = twd_fxsr_to_i387(fxsave); + env[3] = fxsave->fip; + env[4] = fxsave->fcs | ((unsigned long)fxsave->fop << 16); + env[5] = fxsave->foo; + env[6] = fxsave->fos; + + if (__copy_to_user(buf, env, 7 * sizeof(unsigned long))) + return 1; + + to = &buf->_st[0]; + from = (struct _fpxreg *) &fxsave->st_space[0]; + for (i = 0; i < 8; i++, to++, from++) { + unsigned long __user *t = (unsigned long __user *)to; + unsigned long *f = (unsigned long *)from; + + if (__put_user(*f, t) || + __put_user(*(f + 1), t + 1) || + __put_user(from->exponent, &to->exponent)) + return 1; + } + return 0; +} + +static int convert_fxsr_from_user(struct user_fxsr_struct *fxsave, + struct _fpstate __user *buf) +{ + unsigned long env[7]; + struct _fpxreg *to; + struct _fpreg __user *from; + int i; + + if (copy_from_user( env, buf, 7 * sizeof(long))) + return 1; + + fxsave->cwd = (unsigned short)(env[0] & 0xffff); + fxsave->swd = (unsigned short)(env[1] & 0xffff); + fxsave->twd = twd_i387_to_fxsr((unsigned short)(env[2] & 0xffff)); + fxsave->fip = env[3]; + fxsave->fop = (unsigned short)((env[4] & 0xffff0000ul) >> 16); + fxsave->fcs = (env[4] & 0xffff); + fxsave->foo = env[5]; + fxsave->fos = env[6]; + + to = (struct _fpxreg *) &fxsave->st_space[0]; + from = &buf->_st[0]; + for (i = 0; i < 8; i++, to++, from++) { + unsigned long *t = (unsigned long *)to; + unsigned long __user *f = (unsigned long __user *)from; + + if (__get_user(*t, f) || + __get_user(*(t + 1), f + 1) || + __get_user(to->exponent, &from->exponent)) + return 1; + } + return 0; +} + +extern int have_fpx_regs; + +#endif + +static int copy_sc_from_user(struct pt_regs *regs, + struct sigcontext __user *from) +{ + struct sigcontext sc; + int err, pid; + + err = copy_from_user(&sc, from, sizeof(sc)); + if (err) + return err; + +#define GETREG(regno, regname) regs->regs.gp[HOST_##regno] = sc.regname + +#ifdef CONFIG_X86_32 + GETREG(GS, gs); + GETREG(FS, fs); + GETREG(ES, es); + GETREG(DS, ds); +#endif + GETREG(DI, di); + GETREG(SI, si); + GETREG(BP, bp); + GETREG(SP, sp); + GETREG(BX, bx); + GETREG(DX, dx); + GETREG(CX, cx); + GETREG(AX, ax); + GETREG(IP, ip); + +#ifdef CONFIG_X86_64 + GETREG(R8, r8); + GETREG(R9, r9); + GETREG(R10, r10); + GETREG(R11, r11); + GETREG(R12, r12); + GETREG(R13, r13); + GETREG(R14, r14); + GETREG(R15, r15); +#endif + + GETREG(CS, cs); + GETREG(EFLAGS, flags); +#ifdef CONFIG_X86_32 + GETREG(SS, ss); +#endif + +#undef GETREG + + pid = userspace_pid[current_thread_info()->cpu]; +#ifdef CONFIG_X86_32 + if (have_fpx_regs) { + struct user_fxsr_struct fpx; + + err = copy_from_user(&fpx, + &((struct _fpstate __user *)sc.fpstate)->_fxsr_env[0], + sizeof(struct user_fxsr_struct)); + if (err) + return 1; + + err = convert_fxsr_from_user(&fpx, sc.fpstate); + if (err) + return 1; + + err = restore_fpx_registers(pid, (unsigned long *) &fpx); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - " + "restore_fpx_registers failed, errno = %d\n", + -err); + return 1; + } + } else +#endif + { + struct user_i387_struct fp; + + err = copy_from_user(&fp, sc.fpstate, + sizeof(struct user_i387_struct)); + if (err) + return 1; + + err = restore_fp_registers(pid, (unsigned long *) &fp); + if (err < 0) { + printk(KERN_ERR "copy_sc_from_user - " + "restore_fp_registers failed, errno = %d\n", + -err); + return 1; + } + } + return 0; +} + +static int copy_sc_to_user(struct sigcontext __user *to, + struct _fpstate __user *to_fp, struct pt_regs *regs, + unsigned long mask) +{ + struct sigcontext sc; + struct faultinfo * fi = ¤t->thread.arch.faultinfo; + int err, pid; + memset(&sc, 0, sizeof(struct sigcontext)); + +#define PUTREG(regno, regname) sc.regname = regs->regs.gp[HOST_##regno] + +#ifdef CONFIG_X86_32 + PUTREG(GS, gs); + PUTREG(FS, fs); + PUTREG(ES, es); + PUTREG(DS, ds); +#endif + PUTREG(DI, di); + PUTREG(SI, si); + PUTREG(BP, bp); + PUTREG(SP, sp); + PUTREG(BX, bx); + PUTREG(DX, dx); + PUTREG(CX, cx); + PUTREG(AX, ax); +#ifdef CONFIG_X86_64 + PUTREG(R8, r8); + PUTREG(R9, r9); + PUTREG(R10, r10); + PUTREG(R11, r11); + PUTREG(R12, r12); + PUTREG(R13, r13); + PUTREG(R14, r14); + PUTREG(R15, r15); +#endif + + sc.cr2 = fi->cr2; + sc.err = fi->error_code; + sc.trapno = fi->trap_no; + PUTREG(IP, ip); + PUTREG(CS, cs); + PUTREG(EFLAGS, flags); +#ifdef CONFIG_X86_32 + PUTREG(SP, sp_at_signal); + PUTREG(SS, ss); +#endif +#undef PUTREG + sc.oldmask = mask; + sc.fpstate = to_fp; + + err = copy_to_user(to, &sc, sizeof(struct sigcontext)); + if (err) + return 1; + + pid = userspace_pid[current_thread_info()->cpu]; + +#ifdef CONFIG_X86_32 + if (have_fpx_regs) { + struct user_fxsr_struct fpx; + + err = save_fpx_registers(pid, (unsigned long *) &fpx); + if (err < 0){ + printk(KERN_ERR "copy_sc_to_user - save_fpx_registers " + "failed, errno = %d\n", err); + return 1; + } + + err = convert_fxsr_to_user(to_fp, &fpx); + if (err) + return 1; + + err |= __put_user(fpx.swd, &to_fp->status); + err |= __put_user(X86_FXSR_MAGIC, &to_fp->magic); + if (err) + return 1; + + if (copy_to_user(&to_fp->_fxsr_env[0], &fpx, + sizeof(struct user_fxsr_struct))) + return 1; + } else +#endif + { + struct user_i387_struct fp; + + err = save_fp_registers(pid, (unsigned long *) &fp); + if (copy_to_user(to_fp, &fp, sizeof(struct user_i387_struct))) + return 1; + } + + return 0; +} + +#ifdef CONFIG_X86_32 +static int copy_ucontext_to_user(struct ucontext __user *uc, + struct _fpstate __user *fp, sigset_t *set, + unsigned long sp) +{ + int err = 0; + + err |= put_user(current->sas_ss_sp, &uc->uc_stack.ss_sp); + err |= put_user(sas_ss_flags(sp), &uc->uc_stack.ss_flags); + err |= put_user(current->sas_ss_size, &uc->uc_stack.ss_size); + err |= copy_sc_to_user(&uc->uc_mcontext, fp, ¤t->thread.regs, 0); + err |= copy_to_user(&uc->uc_sigmask, set, sizeof(*set)); + return err; +} + +struct sigframe +{ + char __user *pretcode; + int sig; + struct sigcontext sc; + struct _fpstate fpstate; + unsigned long extramask[_NSIG_WORDS-1]; + char retcode[8]; +}; + +struct rt_sigframe +{ + char __user *pretcode; + int sig; + struct siginfo __user *pinfo; + void __user *puc; + struct siginfo info; + struct ucontext uc; + struct _fpstate fpstate; + char retcode[8]; +}; + +int setup_signal_stack_sc(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs *regs, + sigset_t *mask) +{ + struct sigframe __user *frame; + void __user *restorer; + int err = 0; + + /* This is the same calculation as i386 - ((sp + 4) & 15) == 0 */ + stack_top = ((stack_top + 4) & -16UL) - 4; + frame = (struct sigframe __user *) stack_top - 1; + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return 1; + + restorer = frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(sig, &frame->sig); + err |= copy_sc_to_user(&frame->sc, &frame->fpstate, regs, mask->sig[0]); + if (_NSIG_WORDS > 1) + err |= __copy_to_user(&frame->extramask, &mask->sig[1], + sizeof(frame->extramask)); + + /* + * This is popl %eax ; movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb858, (short __user *)(frame->retcode+0)); + err |= __put_user(__NR_sigreturn, (int __user *)(frame->retcode+2)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+6)); + + if (err) + return err; + + PT_REGS_SP(regs) = (unsigned long) frame; + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_EAX(regs) = (unsigned long) sig; + PT_REGS_EDX(regs) = (unsigned long) 0; + PT_REGS_ECX(regs) = (unsigned long) 0; + + if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + ptrace_notify(SIGTRAP); + return 0; +} + +int setup_signal_stack_si(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs *regs, + siginfo_t *info, sigset_t *mask) +{ + struct rt_sigframe __user *frame; + void __user *restorer; + int err = 0; + + stack_top &= -8UL; + frame = (struct rt_sigframe __user *) stack_top - 1; + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + return 1; + + restorer = frame->retcode; + if (ka->sa.sa_flags & SA_RESTORER) + restorer = ka->sa.sa_restorer; + + err |= __put_user(restorer, &frame->pretcode); + err |= __put_user(sig, &frame->sig); + err |= __put_user(&frame->info, &frame->pinfo); + err |= __put_user(&frame->uc, &frame->puc); + err |= copy_siginfo_to_user(&frame->info, info); + err |= copy_ucontext_to_user(&frame->uc, &frame->fpstate, mask, + PT_REGS_SP(regs)); + + /* + * This is movl $,%eax ; int $0x80 + * + * WE DO NOT USE IT ANY MORE! It's only left here for historical + * reasons and because gdb uses it as a signature to notice + * signal handler stack frames. + */ + err |= __put_user(0xb8, (char __user *)(frame->retcode+0)); + err |= __put_user(__NR_rt_sigreturn, (int __user *)(frame->retcode+1)); + err |= __put_user(0x80cd, (short __user *)(frame->retcode+5)); + + if (err) + return err; + + PT_REGS_SP(regs) = (unsigned long) frame; + PT_REGS_IP(regs) = (unsigned long) ka->sa.sa_handler; + PT_REGS_EAX(regs) = (unsigned long) sig; + PT_REGS_EDX(regs) = (unsigned long) &frame->info; + PT_REGS_ECX(regs) = (unsigned long) &frame->uc; + + if ((current->ptrace & PT_DTRACE) && (current->ptrace & PT_PTRACED)) + ptrace_notify(SIGTRAP); + return 0; +} + +long sys_sigreturn(struct pt_regs *regs) +{ + unsigned long sp = PT_REGS_SP(¤t->thread.regs); + struct sigframe __user *frame = (struct sigframe __user *)(sp - 8); + sigset_t set; + struct sigcontext __user *sc = &frame->sc; + unsigned long __user *oldmask = &sc->oldmask; + unsigned long __user *extramask = frame->extramask; + int sig_size = (_NSIG_WORDS - 1) * sizeof(unsigned long); + + if (copy_from_user(&set.sig[0], oldmask, sizeof(set.sig[0])) || + copy_from_user(&set.sig[1], extramask, sig_size)) + goto segfault; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (copy_sc_from_user(¤t->thread.regs, sc)) + goto segfault; + + /* Avoid ERESTART handling */ + PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; + return PT_REGS_SYSCALL_RET(¤t->thread.regs); + + segfault: + force_sig(SIGSEGV, current); + return 0; +} + +#else + +struct rt_sigframe +{ + char __user *pretcode; + struct ucontext uc; + struct siginfo info; + struct _fpstate fpstate; +}; + +int setup_signal_stack_si(unsigned long stack_top, int sig, + struct k_sigaction *ka, struct pt_regs * regs, + siginfo_t *info, sigset_t *set) +{ + struct rt_sigframe __user *frame; + int err = 0; + struct task_struct *me = current; + + frame = (struct rt_sigframe __user *) + round_down(stack_top - sizeof(struct rt_sigframe), 16); + /* Subtract 128 for a red zone and 8 for proper alignment */ + frame = (struct rt_sigframe __user *) ((unsigned long) frame - 128 - 8); + + if (!access_ok(VERIFY_WRITE, frame, sizeof(*frame))) + goto out; + + if (ka->sa.sa_flags & SA_SIGINFO) { + err |= copy_siginfo_to_user(&frame->info, info); + if (err) + goto out; + } + + /* Create the ucontext. */ + err |= __put_user(0, &frame->uc.uc_flags); + err |= __put_user(0, &frame->uc.uc_link); + err |= __put_user(me->sas_ss_sp, &frame->uc.uc_stack.ss_sp); + err |= __put_user(sas_ss_flags(PT_REGS_SP(regs)), + &frame->uc.uc_stack.ss_flags); + err |= __put_user(me->sas_ss_size, &frame->uc.uc_stack.ss_size); + err |= copy_sc_to_user(&frame->uc.uc_mcontext, &frame->fpstate, regs, + set->sig[0]); + err |= __put_user(&frame->fpstate, &frame->uc.uc_mcontext.fpstate); + if (sizeof(*set) == 16) { + __put_user(set->sig[0], &frame->uc.uc_sigmask.sig[0]); + __put_user(set->sig[1], &frame->uc.uc_sigmask.sig[1]); + } + else + err |= __copy_to_user(&frame->uc.uc_sigmask, set, + sizeof(*set)); + + /* + * Set up to return from userspace. If provided, use a stub + * already in userspace. + */ + /* x86-64 should always use SA_RESTORER. */ + if (ka->sa.sa_flags & SA_RESTORER) + err |= __put_user(ka->sa.sa_restorer, &frame->pretcode); + else + /* could use a vstub here */ + return err; + + if (err) + return err; + + /* Set up registers for signal handler */ + { + struct exec_domain *ed = current_thread_info()->exec_domain; + if (unlikely(ed && ed->signal_invmap && sig < 32)) + sig = ed->signal_invmap[sig]; + } + + PT_REGS_SP(regs) = (unsigned long) frame; + PT_REGS_RDI(regs) = sig; + /* In case the signal handler was declared without prototypes */ + PT_REGS_RAX(regs) = 0; + + /* + * This also works for non SA_SIGINFO handlers because they expect the + * next argument after the signal number on the stack. + */ + PT_REGS_RSI(regs) = (unsigned long) &frame->info; + PT_REGS_RDX(regs) = (unsigned long) &frame->uc; + PT_REGS_RIP(regs) = (unsigned long) ka->sa.sa_handler; + out: + return err; +} +#endif + +long sys_rt_sigreturn(struct pt_regs *regs) +{ + unsigned long sp = PT_REGS_SP(¤t->thread.regs); + struct rt_sigframe __user *frame = + (struct rt_sigframe __user *)(sp - sizeof(long)); + struct ucontext __user *uc = &frame->uc; + sigset_t set; + + if (copy_from_user(&set, &uc->uc_sigmask, sizeof(set))) + goto segfault; + + sigdelsetmask(&set, ~_BLOCKABLE); + set_current_blocked(&set); + + if (copy_sc_from_user(¤t->thread.regs, &uc->uc_mcontext)) + goto segfault; + + /* Avoid ERESTART handling */ + PT_REGS_SYSCALL_NR(¤t->thread.regs) = -1; + return PT_REGS_SYSCALL_RET(¤t->thread.regs); + + segfault: + force_sig(SIGSEGV, current); + return 0; +} + +#ifdef CONFIG_X86_32 +long ptregs_sigreturn(void) +{ + return sys_sigreturn(NULL); +} +long ptregs_rt_sigreturn(void) +{ + return sys_rt_sigreturn(NULL); +} +#endif diff --git a/arch/x86/um/stub_32.S b/arch/x86/um/stub_32.S new file mode 100644 index 000000000000..54a36ec20cb7 --- /dev/null +++ b/arch/x86/um/stub_32.S @@ -0,0 +1,51 @@ +#include "as-layout.h" + + .globl syscall_stub +.section .__syscall_stub, "ax" + + .globl batch_syscall_stub +batch_syscall_stub: + /* load pointer to first operation */ + mov $(STUB_DATA+8), %esp + +again: + /* load length of additional data */ + mov 0x0(%esp), %eax + + /* if(length == 0) : end of list */ + /* write possible 0 to header */ + mov %eax, STUB_DATA+4 + cmpl $0, %eax + jz done + + /* save current pointer */ + mov %esp, STUB_DATA+4 + + /* skip additional data */ + add %eax, %esp + + /* load syscall-# */ + pop %eax + + /* load syscall params */ + pop %ebx + pop %ecx + pop %edx + pop %esi + pop %edi + pop %ebp + + /* execute syscall */ + int $0x80 + + /* check return value */ + pop %ebx + cmp %ebx, %eax + je again + +done: + /* save return value */ + mov %eax, STUB_DATA + + /* stop */ + int3 diff --git a/arch/x86/um/stub_64.S b/arch/x86/um/stub_64.S new file mode 100644 index 000000000000..20e4a96a6dcb --- /dev/null +++ b/arch/x86/um/stub_64.S @@ -0,0 +1,66 @@ +#include "as-layout.h" + + .globl syscall_stub +.section .__syscall_stub, "ax" +syscall_stub: + syscall + /* We don't have 64-bit constants, so this constructs the address + * we need. + */ + movq $(STUB_DATA >> 32), %rbx + salq $32, %rbx + movq $(STUB_DATA & 0xffffffff), %rcx + or %rcx, %rbx + movq %rax, (%rbx) + int3 + + .globl batch_syscall_stub +batch_syscall_stub: + mov $(STUB_DATA >> 32), %rbx + sal $32, %rbx + mov $(STUB_DATA & 0xffffffff), %rax + or %rax, %rbx + /* load pointer to first operation */ + mov %rbx, %rsp + add $0x10, %rsp +again: + /* load length of additional data */ + mov 0x0(%rsp), %rax + + /* if(length == 0) : end of list */ + /* write possible 0 to header */ + mov %rax, 8(%rbx) + cmp $0, %rax + jz done + + /* save current pointer */ + mov %rsp, 8(%rbx) + + /* skip additional data */ + add %rax, %rsp + + /* load syscall-# */ + pop %rax + + /* load syscall params */ + pop %rdi + pop %rsi + pop %rdx + pop %r10 + pop %r8 + pop %r9 + + /* execute syscall */ + syscall + + /* check return value */ + pop %rcx + cmp %rcx, %rax + je again + +done: + /* save return value */ + mov %rax, (%rbx) + + /* stop */ + int3 diff --git a/arch/x86/um/stub_segv.c b/arch/x86/um/stub_segv.c new file mode 100644 index 000000000000..b7450bd22e7d --- /dev/null +++ b/arch/x86/um/stub_segv.c @@ -0,0 +1,19 @@ +/* + * Copyright (C) 2004 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Licensed under the GPL + */ + +#include "sysdep/stub.h" +#include "sysdep/faultinfo.h" +#include "sysdep/mcontext.h" + +void __attribute__ ((__section__ (".__syscall_stub"))) +stub_segv_handler(int sig, siginfo_t *info, void *p) +{ + struct ucontext *uc = p; + + GET_FAULTINFO_FROM_MC(*((struct faultinfo *) STUB_DATA), + &uc->uc_mcontext); + trap_myself(); +} + diff --git a/arch/x86/um/sys_call_table_32.S b/arch/x86/um/sys_call_table_32.S new file mode 100644 index 000000000000..a7ca80d2dceb --- /dev/null +++ b/arch/x86/um/sys_call_table_32.S @@ -0,0 +1,26 @@ +#include <linux/linkage.h> +/* Steal i386 syscall table for our purposes, but with some slight changes.*/ + +#define sys_iopl sys_ni_syscall +#define sys_ioperm sys_ni_syscall + +#define sys_vm86old sys_ni_syscall +#define sys_vm86 sys_ni_syscall + +#define old_mmap sys_old_mmap + +#define ptregs_fork sys_fork +#define ptregs_execve sys_execve +#define ptregs_iopl sys_iopl +#define ptregs_vm86old sys_vm86old +#define ptregs_clone sys_clone +#define ptregs_vm86 sys_vm86 +#define ptregs_sigaltstack sys_sigaltstack +#define ptregs_vfork sys_vfork + +.section .rodata,"a" + +#include "../kernel/syscall_table_32.S" + +ENTRY(syscall_table_size) +.long .-sys_call_table diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c new file mode 100644 index 000000000000..99522f78b162 --- /dev/null +++ b/arch/x86/um/sys_call_table_64.c @@ -0,0 +1,64 @@ +/* + * System call table for UML/x86-64, copied from arch/x86_64/kernel/syscall.c + * with some changes for UML. + */ + +#include <linux/linkage.h> +#include <linux/sys.h> +#include <linux/cache.h> + +#define __NO_STUBS + +/* + * Below you can see, in terms of #define's, the differences between the x86-64 + * and the UML syscall table. + */ + +/* Not going to be implemented by UML, since we have no hardware. */ +#define stub_iopl sys_ni_syscall +#define sys_ioperm sys_ni_syscall + +/* + * The UML TLS problem. Note that x86_64 does not implement this, so the below + * is needed only for the ia32 compatibility. + */ + +/* On UML we call it this way ("old" means it's not mmap2) */ +#define sys_mmap old_mmap + +#define stub_clone sys_clone +#define stub_fork sys_fork +#define stub_vfork sys_vfork +#define stub_execve sys_execve +#define stub_rt_sigsuspend sys_rt_sigsuspend +#define stub_sigaltstack sys_sigaltstack +#define stub_rt_sigreturn sys_rt_sigreturn + +#define __SYSCALL(nr, sym) extern asmlinkage void sym(void) ; +#undef _ASM_X86_UNISTD_64_H +#include "../../x86/include/asm/unistd_64.h" + +#undef __SYSCALL +#define __SYSCALL(nr, sym) [ nr ] = sym, +#undef _ASM_X86_UNISTD_64_H + +typedef void (*sys_call_ptr_t)(void); + +extern void sys_ni_syscall(void); + +/* + * We used to have a trick here which made sure that holes in the + * x86_64 table were filled in with sys_ni_syscall, but a comment in + * unistd_64.h says that holes aren't allowed, so the trick was + * removed. + * The trick looked like this + * [0 ... UM_NR_syscall_max] = &sys_ni_syscall + * before including unistd_64.h - the later initializations overwrote + * the sys_ni_syscall filler. + */ + +sys_call_ptr_t sys_call_table[] __cacheline_aligned = { +#include <asm/unistd_64.h> +}; + +int syscall_table_size = sizeof(sys_call_table); diff --git a/arch/x86/um/syscalls_32.c b/arch/x86/um/syscalls_32.c new file mode 100644 index 000000000000..70ca357393b8 --- /dev/null +++ b/arch/x86/um/syscalls_32.c @@ -0,0 +1,66 @@ +/* + * Copyright (C) 2000 - 2003 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#include "linux/sched.h" +#include "linux/shm.h" +#include "linux/ipc.h" +#include "linux/syscalls.h" +#include "asm/mman.h" +#include "asm/uaccess.h" +#include "asm/unistd.h" + +/* + * The prototype on i386 is: + * + * int clone(int flags, void * child_stack, int * parent_tidptr, struct user_desc * newtls, int * child_tidptr) + * + * and the "newtls" arg. on i386 is read by copy_thread directly from the + * register saved on the stack. + */ +long sys_clone(unsigned long clone_flags, unsigned long newsp, + int __user *parent_tid, void *newtls, int __user *child_tid) +{ + long ret; + + if (!newsp) + newsp = UPT_SP(¤t->thread.regs.regs); + + current->thread.forking = 1; + ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, + child_tid); + current->thread.forking = 0; + return ret; +} + +long sys_sigaction(int sig, const struct old_sigaction __user *act, + struct old_sigaction __user *oact) +{ + struct k_sigaction new_ka, old_ka; + int ret; + + if (act) { + old_sigset_t mask; + if (!access_ok(VERIFY_READ, act, sizeof(*act)) || + __get_user(new_ka.sa.sa_handler, &act->sa_handler) || + __get_user(new_ka.sa.sa_restorer, &act->sa_restorer)) + return -EFAULT; + __get_user(new_ka.sa.sa_flags, &act->sa_flags); + __get_user(mask, &act->sa_mask); + siginitset(&new_ka.sa.sa_mask, mask); + } + + ret = do_sigaction(sig, act ? &new_ka : NULL, oact ? &old_ka : NULL); + + if (!ret && oact) { + if (!access_ok(VERIFY_WRITE, oact, sizeof(*oact)) || + __put_user(old_ka.sa.sa_handler, &oact->sa_handler) || + __put_user(old_ka.sa.sa_restorer, &oact->sa_restorer)) + return -EFAULT; + __put_user(old_ka.sa.sa_flags, &oact->sa_flags); + __put_user(old_ka.sa.sa_mask.sig[0], &oact->sa_mask); + } + + return ret; +} diff --git a/arch/x86/um/syscalls_64.c b/arch/x86/um/syscalls_64.c new file mode 100644 index 000000000000..f3d82bb6e15a --- /dev/null +++ b/arch/x86/um/syscalls_64.c @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2003 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com) + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#include "linux/linkage.h" +#include "linux/personality.h" +#include "linux/utsname.h" +#include "asm/prctl.h" /* XXX This should get the constants from libc */ +#include "asm/uaccess.h" +#include "os.h" + +long arch_prctl(struct task_struct *task, int code, unsigned long __user *addr) +{ + unsigned long *ptr = addr, tmp; + long ret; + int pid = task->mm->context.id.u.pid; + + /* + * With ARCH_SET_FS (and ARCH_SET_GS is treated similarly to + * be safe), we need to call arch_prctl on the host because + * setting %fs may result in something else happening (like a + * GDT or thread.fs being set instead). So, we let the host + * fiddle the registers and thread struct and restore the + * registers afterwards. + * + * So, the saved registers are stored to the process (this + * needed because a stub may have been the last thing to run), + * arch_prctl is run on the host, then the registers are read + * back. + */ + switch (code) { + case ARCH_SET_FS: + case ARCH_SET_GS: + ret = restore_registers(pid, ¤t->thread.regs.regs); + if (ret) + return ret; + break; + case ARCH_GET_FS: + case ARCH_GET_GS: + /* + * With these two, we read to a local pointer and + * put_user it to the userspace pointer that we were + * given. If addr isn't valid (because it hasn't been + * faulted in or is just bogus), we want put_user to + * fault it in (or return -EFAULT) instead of having + * the host return -EFAULT. + */ + ptr = &tmp; + } + + ret = os_arch_prctl(pid, code, ptr); + if (ret) + return ret; + + switch (code) { + case ARCH_SET_FS: + current->thread.arch.fs = (unsigned long) ptr; + ret = save_registers(pid, ¤t->thread.regs.regs); + break; + case ARCH_SET_GS: + ret = save_registers(pid, ¤t->thread.regs.regs); + break; + case ARCH_GET_FS: + ret = put_user(tmp, addr); + break; + case ARCH_GET_GS: + ret = put_user(tmp, addr); + break; + } + + return ret; +} + +long sys_arch_prctl(int code, unsigned long addr) +{ + return arch_prctl(current, code, (unsigned long __user *) addr); +} + +long sys_clone(unsigned long clone_flags, unsigned long newsp, + void __user *parent_tid, void __user *child_tid) +{ + long ret; + + if (!newsp) + newsp = UPT_SP(¤t->thread.regs.regs); + current->thread.forking = 1; + ret = do_fork(clone_flags, newsp, ¤t->thread.regs, 0, parent_tid, + child_tid); + current->thread.forking = 0; + return ret; +} + +void arch_switch_to(struct task_struct *to) +{ + if ((to->thread.arch.fs == 0) || (to->mm == NULL)) + return; + + arch_prctl(to, ARCH_SET_FS, (void __user *) to->thread.arch.fs); +} diff --git a/arch/x86/um/sysrq_32.c b/arch/x86/um/sysrq_32.c new file mode 100644 index 000000000000..171b3e9dc867 --- /dev/null +++ b/arch/x86/um/sysrq_32.c @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2001 - 2003 Jeff Dike (jdike@addtoit.com) + * Licensed under the GPL + */ + +#include "linux/kernel.h" +#include "linux/smp.h" +#include "linux/sched.h" +#include "linux/kallsyms.h" +#include "asm/ptrace.h" +#include "sysrq.h" + +/* This is declared by <linux/sched.h> */ +void show_regs(struct pt_regs *regs) +{ + printk("\n"); + printk("EIP: %04lx:[<%08lx>] CPU: %d %s", + 0xffff & PT_REGS_CS(regs), PT_REGS_IP(regs), + smp_processor_id(), print_tainted()); + if (PT_REGS_CS(regs) & 3) + printk(" ESP: %04lx:%08lx", 0xffff & PT_REGS_SS(regs), + PT_REGS_SP(regs)); + printk(" EFLAGS: %08lx\n %s\n", PT_REGS_EFLAGS(regs), + print_tainted()); + printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", + PT_REGS_EAX(regs), PT_REGS_EBX(regs), + PT_REGS_ECX(regs), + PT_REGS_EDX(regs)); + printk("ESI: %08lx EDI: %08lx EBP: %08lx", + PT_REGS_ESI(regs), PT_REGS_EDI(regs), + PT_REGS_EBP(regs)); + printk(" DS: %04lx ES: %04lx\n", + 0xffff & PT_REGS_DS(regs), + 0xffff & PT_REGS_ES(regs)); + + show_trace(NULL, (unsigned long *) ®s); +} + +/* Copied from i386. */ +static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) +{ + return p > (void *)tinfo && + p < (void *)tinfo + THREAD_SIZE - 3; +} + +/* Adapted from i386 (we also print the address we read from). */ +static inline unsigned long print_context_stack(struct thread_info *tinfo, + unsigned long *stack, unsigned long ebp) +{ + unsigned long addr; + +#ifdef CONFIG_FRAME_POINTER + while (valid_stack_ptr(tinfo, (void *)ebp)) { + addr = *(unsigned long *)(ebp + 4); + printk("%08lx: [<%08lx>]", ebp + 4, addr); + print_symbol(" %s", addr); + printk("\n"); + ebp = *(unsigned long *)ebp; + } +#else + while (valid_stack_ptr(tinfo, stack)) { + addr = *stack; + if (__kernel_text_address(addr)) { + printk("%08lx: [<%08lx>]", (unsigned long) stack, addr); + print_symbol(" %s", addr); + printk("\n"); + } + stack++; + } +#endif + return ebp; +} + +void show_trace(struct task_struct* task, unsigned long * stack) +{ + unsigned long ebp; + struct thread_info *context; + + /* Turn this into BUG_ON if possible. */ + if (!stack) { + stack = (unsigned long*) &stack; + printk("show_trace: got NULL stack, implicit assumption task == current"); + WARN_ON(1); + } + + if (!task) + task = current; + + if (task != current) { + ebp = (unsigned long) KSTK_EBP(task); + } else { + asm ("movl %%ebp, %0" : "=r" (ebp) : ); + } + + context = (struct thread_info *) + ((unsigned long)stack & (~(THREAD_SIZE - 1))); + print_context_stack(context, stack, ebp); + + printk("\n"); +} + diff --git a/arch/x86/um/sysrq_64.c b/arch/x86/um/sysrq_64.c new file mode 100644 index 000000000000..e8913436d7dc --- /dev/null +++ b/arch/x86/um/sysrq_64.c @@ -0,0 +1,41 @@ +/* + * Copyright 2003 PathScale, Inc. + * + * Licensed under the GPL + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/utsname.h> +#include <asm/current.h> +#include <asm/ptrace.h> +#include "sysrq.h" + +void __show_regs(struct pt_regs *regs) +{ + printk("\n"); + print_modules(); + printk(KERN_INFO "Pid: %d, comm: %.20s %s %s\n", task_pid_nr(current), + current->comm, print_tainted(), init_utsname()->release); + printk(KERN_INFO "RIP: %04lx:[<%016lx>]\n", PT_REGS_CS(regs) & 0xffff, + PT_REGS_RIP(regs)); + printk(KERN_INFO "RSP: %016lx EFLAGS: %08lx\n", PT_REGS_SP(regs), + PT_REGS_EFLAGS(regs)); + printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", + PT_REGS_RAX(regs), PT_REGS_RBX(regs), PT_REGS_RCX(regs)); + printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", + PT_REGS_RDX(regs), PT_REGS_RSI(regs), PT_REGS_RDI(regs)); + printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", + PT_REGS_RBP(regs), PT_REGS_R8(regs), PT_REGS_R9(regs)); + printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", + PT_REGS_R10(regs), PT_REGS_R11(regs), PT_REGS_R12(regs)); + printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", + PT_REGS_R13(regs), PT_REGS_R14(regs), PT_REGS_R15(regs)); +} + +void show_regs(struct pt_regs *regs) +{ + __show_regs(regs); + show_trace(current, (unsigned long *) ®s); +} diff --git a/arch/x86/um/tls_32.c b/arch/x86/um/tls_32.c new file mode 100644 index 000000000000..c6c7131e563b --- /dev/null +++ b/arch/x86/um/tls_32.c @@ -0,0 +1,396 @@ +/* + * Copyright (C) 2005 Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> + * Licensed under the GPL + */ + +#include "linux/percpu.h" +#include "linux/sched.h" +#include "asm/uaccess.h" +#include "os.h" +#include "skas.h" +#include "sysdep/tls.h" + +/* + * If needed we can detect when it's uninitialized. + * + * These are initialized in an initcall and unchanged thereafter. + */ +static int host_supports_tls = -1; +int host_gdt_entry_tls_min; + +int do_set_thread_area(struct user_desc *info) +{ + int ret; + u32 cpu; + + cpu = get_cpu(); + ret = os_set_thread_area(info, userspace_pid[cpu]); + put_cpu(); + + if (ret) + printk(KERN_ERR "PTRACE_SET_THREAD_AREA failed, err = %d, " + "index = %d\n", ret, info->entry_number); + + return ret; +} + +int do_get_thread_area(struct user_desc *info) +{ + int ret; + u32 cpu; + + cpu = get_cpu(); + ret = os_get_thread_area(info, userspace_pid[cpu]); + put_cpu(); + + if (ret) + printk(KERN_ERR "PTRACE_GET_THREAD_AREA failed, err = %d, " + "index = %d\n", ret, info->entry_number); + + return ret; +} + +/* + * sys_get_thread_area: get a yet unused TLS descriptor index. + * XXX: Consider leaving one free slot for glibc usage at first place. This must + * be done here (and by changing GDT_ENTRY_TLS_* macros) and nowhere else. + * + * Also, this must be tested when compiling in SKAS mode with dynamic linking + * and running against NPTL. + */ +static int get_free_idx(struct task_struct* task) +{ + struct thread_struct *t = &task->thread; + int idx; + + if (!t->arch.tls_array) + return GDT_ENTRY_TLS_MIN; + + for (idx = 0; idx < GDT_ENTRY_TLS_ENTRIES; idx++) + if (!t->arch.tls_array[idx].present) + return idx + GDT_ENTRY_TLS_MIN; + return -ESRCH; +} + +static inline void clear_user_desc(struct user_desc* info) +{ + /* Postcondition: LDT_empty(info) returns true. */ + memset(info, 0, sizeof(*info)); + + /* + * Check the LDT_empty or the i386 sys_get_thread_area code - we obtain + * indeed an empty user_desc. + */ + info->read_exec_only = 1; + info->seg_not_present = 1; +} + +#define O_FORCE 1 + +static int load_TLS(int flags, struct task_struct *to) +{ + int ret = 0; + int idx; + + for (idx = GDT_ENTRY_TLS_MIN; idx < GDT_ENTRY_TLS_MAX; idx++) { + struct uml_tls_struct* curr = + &to->thread.arch.tls_array[idx - GDT_ENTRY_TLS_MIN]; + + /* + * Actually, now if it wasn't flushed it gets cleared and + * flushed to the host, which will clear it. + */ + if (!curr->present) { + if (!curr->flushed) { + clear_user_desc(&curr->tls); + curr->tls.entry_number = idx; + } else { + WARN_ON(!LDT_empty(&curr->tls)); + continue; + } + } + + if (!(flags & O_FORCE) && curr->flushed) + continue; + + ret = do_set_thread_area(&curr->tls); + if (ret) + goto out; + + curr->flushed = 1; + } +out: + return ret; +} + +/* + * Verify if we need to do a flush for the new process, i.e. if there are any + * present desc's, only if they haven't been flushed. + */ +static inline int needs_TLS_update(struct task_struct *task) +{ + int i; + int ret = 0; + + for (i = GDT_ENTRY_TLS_MIN; i < GDT_ENTRY_TLS_MAX; i++) { + struct uml_tls_struct* curr = + &task->thread.arch.tls_array[i - GDT_ENTRY_TLS_MIN]; + + /* + * Can't test curr->present, we may need to clear a descriptor + * which had a value. + */ + if (curr->flushed) + continue; + ret = 1; + break; + } + return ret; +} + +/* + * On a newly forked process, the TLS descriptors haven't yet been flushed. So + * we mark them as such and the first switch_to will do the job. + */ +void clear_flushed_tls(struct task_struct *task) +{ + int i; + + for (i = GDT_ENTRY_TLS_MIN; i < GDT_ENTRY_TLS_MAX; i++) { + struct uml_tls_struct* curr = + &task->thread.arch.tls_array[i - GDT_ENTRY_TLS_MIN]; + + /* + * Still correct to do this, if it wasn't present on the host it + * will remain as flushed as it was. + */ + if (!curr->present) + continue; + + curr->flushed = 0; + } +} + +/* + * In SKAS0 mode, currently, multiple guest threads sharing the same ->mm have a + * common host process. So this is needed in SKAS0 too. + * + * However, if each thread had a different host process (and this was discussed + * for SMP support) this won't be needed. + * + * And this will not need be used when (and if) we'll add support to the host + * SKAS patch. + */ + +int arch_switch_tls(struct task_struct *to) +{ + if (!host_supports_tls) + return 0; + + /* + * We have no need whatsoever to switch TLS for kernel threads; beyond + * that, that would also result in us calling os_set_thread_area with + * userspace_pid[cpu] == 0, which gives an error. + */ + if (likely(to->mm)) + return load_TLS(O_FORCE, to); + + return 0; +} + +static int set_tls_entry(struct task_struct* task, struct user_desc *info, + int idx, int flushed) +{ + struct thread_struct *t = &task->thread; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].tls = *info; + t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].present = 1; + t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].flushed = flushed; + + return 0; +} + +int arch_copy_tls(struct task_struct *new) +{ + struct user_desc info; + int idx, ret = -EFAULT; + + if (copy_from_user(&info, + (void __user *) UPT_ESI(&new->thread.regs.regs), + sizeof(info))) + goto out; + + ret = -EINVAL; + if (LDT_empty(&info)) + goto out; + + idx = info.entry_number; + + ret = set_tls_entry(new, &info, idx, 0); +out: + return ret; +} + +/* XXX: use do_get_thread_area to read the host value? I'm not at all sure! */ +static int get_tls_entry(struct task_struct *task, struct user_desc *info, + int idx) +{ + struct thread_struct *t = &task->thread; + + if (!t->arch.tls_array) + goto clear; + + if (idx < GDT_ENTRY_TLS_MIN || idx > GDT_ENTRY_TLS_MAX) + return -EINVAL; + + if (!t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].present) + goto clear; + + *info = t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].tls; + +out: + /* + * Temporary debugging check, to make sure that things have been + * flushed. This could be triggered if load_TLS() failed. + */ + if (unlikely(task == current && + !t->arch.tls_array[idx - GDT_ENTRY_TLS_MIN].flushed)) { + printk(KERN_ERR "get_tls_entry: task with pid %d got here " + "without flushed TLS.", current->pid); + } + + return 0; +clear: + /* + * When the TLS entry has not been set, the values read to user in the + * tls_array are 0 (because it's cleared at boot, see + * arch/i386/kernel/head.S:cpu_gdt_table). Emulate that. + */ + clear_user_desc(info); + info->entry_number = idx; + goto out; +} + +int sys_set_thread_area(struct user_desc __user *user_desc) +{ + struct user_desc info; + int idx, ret; + + if (!host_supports_tls) + return -ENOSYS; + + if (copy_from_user(&info, user_desc, sizeof(info))) + return -EFAULT; + + idx = info.entry_number; + + if (idx == -1) { + idx = get_free_idx(current); + if (idx < 0) + return idx; + info.entry_number = idx; + /* Tell the user which slot we chose for him.*/ + if (put_user(idx, &user_desc->entry_number)) + return -EFAULT; + } + + ret = do_set_thread_area(&info); + if (ret) + return ret; + return set_tls_entry(current, &info, idx, 1); +} + +/* + * Perform set_thread_area on behalf of the traced child. + * Note: error handling is not done on the deferred load, and this differ from + * i386. However the only possible error are caused by bugs. + */ +int ptrace_set_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + struct user_desc info; + + if (!host_supports_tls) + return -EIO; + + if (copy_from_user(&info, user_desc, sizeof(info))) + return -EFAULT; + + return set_tls_entry(child, &info, idx, 0); +} + +int sys_get_thread_area(struct user_desc __user *user_desc) +{ + struct user_desc info; + int idx, ret; + + if (!host_supports_tls) + return -ENOSYS; + + if (get_user(idx, &user_desc->entry_number)) + return -EFAULT; + + ret = get_tls_entry(current, &info, idx); + if (ret < 0) + goto out; + + if (copy_to_user(user_desc, &info, sizeof(info))) + ret = -EFAULT; + +out: + return ret; +} + +/* + * Perform get_thread_area on behalf of the traced child. + */ +int ptrace_get_thread_area(struct task_struct *child, int idx, + struct user_desc __user *user_desc) +{ + struct user_desc info; + int ret; + + if (!host_supports_tls) + return -EIO; + + ret = get_tls_entry(child, &info, idx); + if (ret < 0) + goto out; + + if (copy_to_user(user_desc, &info, sizeof(info))) + ret = -EFAULT; +out: + return ret; +} + +/* + * This code is really i386-only, but it detects and logs x86_64 GDT indexes + * if a 32-bit UML is running on a 64-bit host. + */ +static int __init __setup_host_supports_tls(void) +{ + check_host_supports_tls(&host_supports_tls, &host_gdt_entry_tls_min); + if (host_supports_tls) { + printk(KERN_INFO "Host TLS support detected\n"); + printk(KERN_INFO "Detected host type: "); + switch (host_gdt_entry_tls_min) { + case GDT_ENTRY_TLS_MIN_I386: + printk(KERN_CONT "i386"); + break; + case GDT_ENTRY_TLS_MIN_X86_64: + printk(KERN_CONT "x86_64"); + break; + } + printk(KERN_CONT " (GDT indexes %d to %d)\n", + host_gdt_entry_tls_min, + host_gdt_entry_tls_min + GDT_ENTRY_TLS_ENTRIES); + } else + printk(KERN_ERR " Host TLS support NOT detected! " + "TLS support inside UML will not work\n"); + return 0; +} + +__initcall(__setup_host_supports_tls); diff --git a/arch/x86/um/tls_64.c b/arch/x86/um/tls_64.c new file mode 100644 index 000000000000..f7ba46200ecd --- /dev/null +++ b/arch/x86/um/tls_64.c @@ -0,0 +1,17 @@ +#include "linux/sched.h" + +void clear_flushed_tls(struct task_struct *task) +{ +} + +int arch_copy_tls(struct task_struct *t) +{ + /* + * If CLONE_SETTLS is set, we need to save the thread id + * (which is argument 5, child_tid, of clone) so it can be set + * during context switches. + */ + t->thread.arch.fs = t->thread.regs.regs.gp[R8 / sizeof(long)]; + + return 0; +} diff --git a/arch/x86/um/user-offsets.c b/arch/x86/um/user-offsets.c new file mode 100644 index 000000000000..ca49be8ddd0c --- /dev/null +++ b/arch/x86/um/user-offsets.c @@ -0,0 +1,80 @@ +#include <stdio.h> +#include <stddef.h> +#include <signal.h> +#include <sys/poll.h> +#include <sys/mman.h> +#include <sys/user.h> +#define __FRAME_OFFSETS +#include <asm/ptrace.h> +#include <asm/types.h> + +#define DEFINE(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val)) + +#define DEFINE_LONGS(sym, val) \ + asm volatile("\n->" #sym " %0 " #val : : "i" (val/sizeof(unsigned long))) + +void foo(void) +{ +#ifdef __i386__ + DEFINE_LONGS(HOST_FP_SIZE, sizeof(struct user_fpregs_struct)); + DEFINE_LONGS(HOST_FPX_SIZE, sizeof(struct user_fpxregs_struct)); + + DEFINE(HOST_IP, EIP); + DEFINE(HOST_SP, UESP); + DEFINE(HOST_EFLAGS, EFL); + DEFINE(HOST_AX, EAX); + DEFINE(HOST_BX, EBX); + DEFINE(HOST_CX, ECX); + DEFINE(HOST_DX, EDX); + DEFINE(HOST_SI, ESI); + DEFINE(HOST_DI, EDI); + DEFINE(HOST_BP, EBP); + DEFINE(HOST_CS, CS); + DEFINE(HOST_SS, SS); + DEFINE(HOST_DS, DS); + DEFINE(HOST_FS, FS); + DEFINE(HOST_ES, ES); + DEFINE(HOST_GS, GS); + DEFINE(HOST_ORIG_AX, ORIG_EAX); +#else + DEFINE(HOST_FP_SIZE, sizeof(struct _fpstate) / sizeof(unsigned long)); + DEFINE_LONGS(HOST_BX, RBX); + DEFINE_LONGS(HOST_CX, RCX); + DEFINE_LONGS(HOST_DI, RDI); + DEFINE_LONGS(HOST_SI, RSI); + DEFINE_LONGS(HOST_DX, RDX); + DEFINE_LONGS(HOST_BP, RBP); + DEFINE_LONGS(HOST_AX, RAX); + DEFINE_LONGS(HOST_R8, R8); + DEFINE_LONGS(HOST_R9, R9); + DEFINE_LONGS(HOST_R10, R10); + DEFINE_LONGS(HOST_R11, R11); + DEFINE_LONGS(HOST_R12, R12); + DEFINE_LONGS(HOST_R13, R13); + DEFINE_LONGS(HOST_R14, R14); + DEFINE_LONGS(HOST_R15, R15); + DEFINE_LONGS(HOST_ORIG_AX, ORIG_RAX); + DEFINE_LONGS(HOST_CS, CS); + DEFINE_LONGS(HOST_SS, SS); + DEFINE_LONGS(HOST_EFLAGS, EFLAGS); +#if 0 + DEFINE_LONGS(HOST_FS, FS); + DEFINE_LONGS(HOST_GS, GS); + DEFINE_LONGS(HOST_DS, DS); + DEFINE_LONGS(HOST_ES, ES); +#endif + + DEFINE_LONGS(HOST_IP, RIP); + DEFINE_LONGS(HOST_SP, RSP); +#endif + + DEFINE(UM_FRAME_SIZE, sizeof(struct user_regs_struct)); + DEFINE(UM_POLLIN, POLLIN); + DEFINE(UM_POLLPRI, POLLPRI); + DEFINE(UM_POLLOUT, POLLOUT); + + DEFINE(UM_PROT_READ, PROT_READ); + DEFINE(UM_PROT_WRITE, PROT_WRITE); + DEFINE(UM_PROT_EXEC, PROT_EXEC); +} diff --git a/arch/x86/um/vdso/Makefile b/arch/x86/um/vdso/Makefile new file mode 100644 index 000000000000..6c803ca49b5d --- /dev/null +++ b/arch/x86/um/vdso/Makefile @@ -0,0 +1,90 @@ +# +# Building vDSO images for x86. +# + +VDSO64-y := y + +vdso-install-$(VDSO64-y) += vdso.so + + +# files to link into the vdso +vobjs-y := vdso-note.o um_vdso.o + +# files to link into kernel +obj-$(VDSO64-y) += vdso.o vma.o + +vobjs := $(foreach F,$(vobjs-y),$(obj)/$F) + +$(obj)/vdso.o: $(obj)/vdso.so + +targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y) + +export CPPFLAGS_vdso.lds += -P -C + +VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \ + -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 + +$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so + +$(obj)/vdso.so.dbg: $(src)/vdso.lds $(vobjs) FORCE + $(call if_changed,vdso) + +$(obj)/%.so: OBJCOPYFLAGS := -S +$(obj)/%.so: $(obj)/%.so.dbg FORCE + $(call if_changed,objcopy) + +# +# Don't omit frame pointers for ease of userspace debugging, but do +# optimize sibling calls. +# +CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ + $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ + -fno-omit-frame-pointer -foptimize-sibling-calls + +$(vobjs): KBUILD_CFLAGS += $(CFL) + +# +# vDSO code runs in userspace and -pg doesn't help with profiling anyway. +# +CFLAGS_REMOVE_vdso-note.o = -pg -fprofile-arcs -ftest-coverage +CFLAGS_REMOVE_um_vdso.o = -pg -fprofile-arcs -ftest-coverage + +targets += vdso-syms.lds +obj-$(VDSO64-y) += vdso-syms.lds + +# +# Match symbols in the DSO that look like VDSO*; produce a file of constants. +# +sed-vdsosym := -e 's/^00*/0/' \ + -e 's/^\([0-9a-fA-F]*\) . \(VDSO[a-zA-Z0-9_]*\)$$/\2 = 0x\1;/p' +quiet_cmd_vdsosym = VDSOSYM $@ +define cmd_vdsosym + $(NM) $< | LC_ALL=C sed -n $(sed-vdsosym) | LC_ALL=C sort > $@ +endef + +$(obj)/%-syms.lds: $(obj)/%.so.dbg FORCE + $(call if_changed,vdsosym) + +# +# The DSO images are built using a special linker script. +# +quiet_cmd_vdso = VDSO $@ + cmd_vdso = $(CC) -nostdlib -o $@ \ + $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \ + -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) && \ + sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' + +VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) +GCOV_PROFILE := n + +# +# Install the unstripped copy of vdso*.so listed in $(vdso-install-y). +# +quiet_cmd_vdso_install = INSTALL $@ + cmd_vdso_install = cp $(obj)/$@.dbg $(MODLIB)/vdso/$@ +$(vdso-install-y): %.so: $(obj)/%.so.dbg FORCE + @mkdir -p $(MODLIB)/vdso + $(call cmd,vdso_install) + +PHONY += vdso_install $(vdso-install-y) +vdso_install: $(vdso-install-y) diff --git a/arch/x86/um/vdso/checkundef.sh b/arch/x86/um/vdso/checkundef.sh new file mode 100644 index 000000000000..7ee90a9b549d --- /dev/null +++ b/arch/x86/um/vdso/checkundef.sh @@ -0,0 +1,10 @@ +#!/bin/sh +nm="$1" +file="$2" +$nm "$file" | grep '^ *U' > /dev/null 2>&1 +if [ $? -eq 1 ]; then + exit 0 +else + echo "$file: undefined symbols found" >&2 + exit 1 +fi diff --git a/arch/x86/um/vdso/um_vdso.c b/arch/x86/um/vdso/um_vdso.c new file mode 100644 index 000000000000..7c441b59d375 --- /dev/null +++ b/arch/x86/um/vdso/um_vdso.c @@ -0,0 +1,71 @@ +/* + * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This vDSO turns all calls into a syscall so that UML can trap them. + */ + + +/* Disable profiling for userspace code */ +#define DISABLE_BRANCH_PROFILING + +#include <linux/time.h> +#include <linux/getcpu.h> +#include <asm/unistd.h> + +int __vdso_clock_gettime(clockid_t clock, struct timespec *ts) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_clock_gettime), "D" (clock), "S" (ts) : "memory"); + + return ret; +} +int clock_gettime(clockid_t, struct timespec *) + __attribute__((weak, alias("__vdso_clock_gettime"))); + +int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz) +{ + long ret; + + asm("syscall" : "=a" (ret) : + "0" (__NR_gettimeofday), "D" (tv), "S" (tz) : "memory"); + + return ret; +} +int gettimeofday(struct timeval *, struct timezone *) + __attribute__((weak, alias("__vdso_gettimeofday"))); + +time_t __vdso_time(time_t *t) +{ + long secs; + + asm volatile("syscall" + : "=a" (secs) + : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory"); + + return secs; +} +int time(time_t *t) __attribute__((weak, alias("__vdso_time"))); + +long +__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) +{ + /* + * UML does not support SMP, we can cheat here. :) + */ + + if (cpu) + *cpu = 0; + if (node) + *node = 0; + + return 0; +} + +long getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache) + __attribute__((weak, alias("__vdso_getcpu"))); diff --git a/arch/x86/um/vdso/vdso-layout.lds.S b/arch/x86/um/vdso/vdso-layout.lds.S new file mode 100644 index 000000000000..634a2cf62046 --- /dev/null +++ b/arch/x86/um/vdso/vdso-layout.lds.S @@ -0,0 +1,64 @@ +/* + * Linker script for vDSO. This is an ELF shared object prelinked to + * its virtual address, and with only one read-only segment. + * This script controls its layout. + */ + +SECTIONS +{ + . = VDSO_PRELINK + SIZEOF_HEADERS; + + .hash : { *(.hash) } :text + .gnu.hash : { *(.gnu.hash) } + .dynsym : { *(.dynsym) } + .dynstr : { *(.dynstr) } + .gnu.version : { *(.gnu.version) } + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + + .dynamic : { *(.dynamic) } :text :dynamic + + .rodata : { *(.rodata*) } :text + .data : { + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) + } + + .altinstructions : { *(.altinstructions) } + .altinstr_replacement : { *(.altinstr_replacement) } + + /* + * Align the actual code well away from the non-instruction data. + * This is the best thing for the I-cache. + */ + . = ALIGN(0x100); + + .text : { *(.text*) } :text =0x90909090 +} + +/* + * Very old versions of ld do not recognize this name token; use the constant. + */ +#define PT_GNU_EH_FRAME 0x6474e550 + +/* + * We must supply the ELF program headers explicitly to get just one + * PT_LOAD segment, and set the flags explicitly to make segments read-only. + */ +PHDRS +{ + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + eh_frame_hdr PT_GNU_EH_FRAME; +} diff --git a/arch/x86/um/vdso/vdso-note.S b/arch/x86/um/vdso/vdso-note.S new file mode 100644 index 000000000000..79a071e4357e --- /dev/null +++ b/arch/x86/um/vdso/vdso-note.S @@ -0,0 +1,12 @@ +/* + * This supplies .note.* sections to go into the PT_NOTE inside the vDSO text. + * Here we can supply some information useful to userland. + */ + +#include <linux/uts.h> +#include <linux/version.h> +#include <linux/elfnote.h> + +ELFNOTE_START(Linux, 0, "a") + .long LINUX_VERSION_CODE +ELFNOTE_END diff --git a/arch/x86/um/vdso/vdso.S b/arch/x86/um/vdso/vdso.S new file mode 100644 index 000000000000..1cb468adacbb --- /dev/null +++ b/arch/x86/um/vdso/vdso.S @@ -0,0 +1,10 @@ +#include <linux/init.h> + +__INITDATA + + .globl vdso_start, vdso_end +vdso_start: + .incbin "arch/x86/um/vdso/vdso.so" +vdso_end: + +__FINIT diff --git a/arch/x86/um/vdso/vdso.lds.S b/arch/x86/um/vdso/vdso.lds.S new file mode 100644 index 000000000000..b96b2677cad8 --- /dev/null +++ b/arch/x86/um/vdso/vdso.lds.S @@ -0,0 +1,32 @@ +/* + * Linker script for 64-bit vDSO. + * We #include the file to define the layout details. + * Here we only choose the prelinked virtual address. + * + * This file defines the version script giving the user-exported symbols in + * the DSO. We can define local symbols here called VDSO* to make their + * values visible using the asm-x86/vdso.h macros from the kernel proper. + */ + +#define VDSO_PRELINK 0xffffffffff700000 +#include "vdso-layout.lds.S" + +/* + * This controls what userland symbols we export from the vDSO. + */ +VERSION { + LINUX_2.6 { + global: + clock_gettime; + __vdso_clock_gettime; + gettimeofday; + __vdso_gettimeofday; + getcpu; + __vdso_getcpu; + time; + __vdso_time; + local: *; + }; +} + +VDSO64_PRELINK = VDSO_PRELINK; diff --git a/arch/x86/um/vdso/vma.c b/arch/x86/um/vdso/vma.c new file mode 100644 index 000000000000..91f4ec9a0a56 --- /dev/null +++ b/arch/x86/um/vdso/vma.c @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2011 Richard Weinberger <richrd@nod.at> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include <linux/slab.h> +#include <linux/sched.h> +#include <linux/mm.h> +#include <asm/page.h> +#include <linux/init.h> + +unsigned int __read_mostly vdso_enabled = 1; +unsigned long um_vdso_addr; + +extern unsigned long task_size; +extern char vdso_start[], vdso_end[]; + +static struct page **vdsop; + +static int __init init_vdso(void) +{ + struct page *um_vdso; + + BUG_ON(vdso_end - vdso_start > PAGE_SIZE); + + um_vdso_addr = task_size - PAGE_SIZE; + + vdsop = kmalloc(sizeof(struct page *), GFP_KERNEL); + if (!vdsop) + goto oom; + + um_vdso = alloc_page(GFP_KERNEL); + if (!um_vdso) { + kfree(vdsop); + + goto oom; + } + + copy_page(page_address(um_vdso), vdso_start); + *vdsop = um_vdso; + + return 0; + +oom: + printk(KERN_ERR "Cannot allocate vdso\n"); + vdso_enabled = 0; + + return -ENOMEM; +} +subsys_initcall(init_vdso); + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + int err; + struct mm_struct *mm = current->mm; + + if (!vdso_enabled) + return 0; + + down_write(&mm->mmap_sem); + + err = install_special_mapping(mm, um_vdso_addr, PAGE_SIZE, + VM_READ|VM_EXEC| + VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC| + VM_ALWAYSDUMP, + vdsop); + + up_write(&mm->mmap_sem); + + return err; +} diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 316fbca3490e..153407c35b75 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -89,6 +89,15 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) addr = start + (offset << PAGE_SHIFT); if (addr >= end) addr = end; + + /* + * page-align it here so that get_unmapped_area doesn't + * align it wrongfully again to the next page. addr can come in 4K + * unaligned here as a result of stack start randomization. + */ + addr = PAGE_ALIGN(addr); + addr = align_addr(addr, NULL, ALIGN_VDSO); + return addr; } diff --git a/arch/x86/video/fbdev.c b/arch/x86/video/fbdev.c index 69527688f794..c5ffb6ac8707 100644 --- a/arch/x86/video/fbdev.c +++ b/arch/x86/video/fbdev.c @@ -8,6 +8,7 @@ */ #include <linux/fb.h> #include <linux/pci.h> +#include <linux/module.h> int fb_is_primary_device(struct fb_info *info) { diff --git a/arch/x86/xen/grant-table.c b/arch/x86/xen/grant-table.c index 49ba9b5224d1..6bbfd7ac5e81 100644 --- a/arch/x86/xen/grant-table.c +++ b/arch/x86/xen/grant-table.c @@ -71,7 +71,7 @@ int arch_gnttab_map_shared(unsigned long *frames, unsigned long nr_gframes, if (shared == NULL) { struct vm_struct *area = - xen_alloc_vm_area(PAGE_SIZE * max_nr_gframes); + alloc_vm_area(PAGE_SIZE * max_nr_gframes); BUG_ON(area == NULL); shared = area->addr; *__shared = shared; diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c index 163b4679556e..0296a9522501 100644 --- a/arch/x86/xen/time.c +++ b/arch/x86/xen/time.c @@ -201,8 +201,22 @@ static unsigned long xen_get_wallclock(void) static int xen_set_wallclock(unsigned long now) { + struct xen_platform_op op; + int rc; + /* do nothing for domU */ - return -1; + if (!xen_initial_domain()) + return -1; + + op.cmd = XENPF_settime; + op.u.settime.secs = now; + op.u.settime.nsecs = 0; + op.u.settime.system_time = xen_clocksource_read(); + + rc = HYPERVISOR_dom0_op(&op); + WARN(rc != 0, "XENPF_settime failed: now=%ld\n", now); + + return rc; } static struct clocksource xen_clocksource __read_mostly = { |