--- a/Documentation/cachetlb.txt
+++ b/Documentation/cachetlb.txt
@@ -37,6 +37,7 @@ changes occur:
 	This is usually invoked when the kernel page tables are
 	changed, since such translations are "global" in nature.
 
+
 2) void flush_tlb_mm(struct mm_struct *mm)
 
 	This interface flushes an entire user address space from
@@ -317,10 +318,10 @@ maps this page at its virtual address.
 	about doing this.
 
 	The idea is, first at flush_dcache_page() time, if
-	page->mapping->i_mmap is an empty tree and ->i_mmap_nonlinear
-	an empty list, just mark the architecture private page flag bit.
-	Later, in update_mmu_cache(), a check is made of this flag bit,
-	and if set the flush is done and the flag bit is cleared.
+	page->mapping->i_mmap is an empty tree, just mark the architecture
+	private page flag bit.  Later, in update_mmu_cache(), a check is
+	made of this flag bit, and if set the flush is done and the flag
+	bit is cleared.
 
 	IMPORTANT NOTE: It is often important, if you defer the flush,
 			that the actual flush occurs on the same CPU
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -24,64 +24,27 @@ Please note that implementation details can be changed.
 
    a page/swp_entry may be charged (usage += PAGE_SIZE) at
 
-	mem_cgroup_newpage_charge()
-	  Called at new page fault and Copy-On-Write.
-
-	mem_cgroup_try_charge_swapin()
-	  Called at do_swap_page() (page fault on swap entry) and swapoff.
-	  Followed by charge-commit-cancel protocol. (With swap accounting)
-	  At commit, a charge recorded in swap_cgroup is removed.
-
-	mem_cgroup_cache_charge()
-	  Called at add_to_page_cache()
-
-	mem_cgroup_cache_charge_swapin()
-	  Called at shmem's swapin.
-
-	mem_cgroup_prepare_migration()
-	  Called before migration. "extra" charge is done and followed by
-	  charge-commit-cancel protocol.
-	  At commit, charge against oldpage or newpage will be committed.
+	mem_cgroup_try_charge()
 
 2. Uncharge
   a page/swp_entry may be uncharged (usage -= PAGE_SIZE) by
 
-	mem_cgroup_uncharge_page()
-	  Called when an anonymous page is fully unmapped. I.e., mapcount goes
-	  to 0. If the page is SwapCache, uncharge is delayed until
-	  mem_cgroup_uncharge_swapcache().
-
-	mem_cgroup_uncharge_cache_page()
-	  Called when a page-cache is deleted from radix-tree. If the page is
-	  SwapCache, uncharge is delayed until mem_cgroup_uncharge_swapcache().
-
-	mem_cgroup_uncharge_swapcache()
-	  Called when SwapCache is removed from radix-tree. The charge itself
-	  is moved to swap_cgroup. (If mem+swap controller is disabled, no
-	  charge to swap occurs.)
+	mem_cgroup_uncharge()
+	  Called when a page's refcount goes down to 0.
 
 	mem_cgroup_uncharge_swap()
 	  Called when swp_entry's refcnt goes down to 0. A charge against swap
 	  disappears.
 
-	mem_cgroup_end_migration(old, new)
-	At success of migration old is uncharged (if necessary), a charge
-	to new page is committed. At failure, charge to old page is committed.
-
 3. charge-commit-cancel
-	In some case, we can't know this "charge" is valid or not at charging
-	(because of races).
-	To handle such case, there are charge-commit-cancel functions.
-		mem_cgroup_try_charge_XXX
-		mem_cgroup_commit_charge_XXX
-		mem_cgroup_cancel_charge_XXX
-	these are used in swap-in and migration.
+	Memcg pages are charged in two steps:
+		mem_cgroup_try_charge()
+		mem_cgroup_commit_charge() or mem_cgroup_cancel_charge()
 
 	At try_charge(), there are no flags to say "this page is charged".
 	at this point, usage += PAGE_SIZE.
 
-	At commit(), the function checks the page should be charged or not
-	and set flags or avoid charging.(usage -= PAGE_SIZE)
+	At commit(), the page is associated with the memcg.
 
 	At cancel(), simply usage -= PAGE_SIZE.
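+
+	A caller-side sketch of this protocol (the install step is a
+	hypothetical helper; signatures are as of the kernels that
+	introduced these hooks):
+
+		struct mem_cgroup *memcg;
+
+		if (mem_cgroup_try_charge(page, mm, gfp_mask, &memcg))
+			return -ENOMEM;	/* usage += PAGE_SIZE on success */
+		if (install_page(page)) {	/* hypothetical */
+			mem_cgroup_cancel_charge(page, memcg);
+			return -ENOMEM;	/* usage -= PAGE_SIZE */
+		}
+		/* associate the page with memcg */
+		mem_cgroup_commit_charge(page, memcg, false);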
 
@@ -91,18 +54,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	Anonymous page is newly allocated at
 		  - page fault into MAP_ANONYMOUS mapping.
 		  - Copy-On-Write.
- 	It is charged right after it's allocated before doing any page table
-	related operations. Of course, it's uncharged when another page is used
-	for the fault address.
-
-	At freeing anonymous page (by exit() or munmap()), zap_pte() is called
-	and pages for ptes are freed one by one.(see mm/memory.c). Uncharges
-	are done at page_remove_rmap() when page_mapcount() goes down to 0.
-
-	Another page freeing is by page-reclaim (vmscan.c) and anonymous
-	pages are swapped out. In this case, the page is marked as
-	PageSwapCache(). uncharge() routine doesn't uncharge the page marked
-	as SwapCache(). It's delayed until __delete_from_swap_cache().
 
 	4.1 Swap-in.
 	At swap-in, the page is taken from swap-cache. There are 2 cases.
@@ -111,41 +62,6 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	(b) If the SwapCache has been mapped by processes, it has been
 	    charged already.
 
-	This swap-in is one of the most complicated work. In do_swap_page(),
-	following events occur when pte is unchanged.
-
-	(1) the page (SwapCache) is looked up.
-	(2) lock_page()
-	(3) try_charge_swapin()
-	(4) reuse_swap_page() (may call delete_swap_cache())
-	(5) commit_charge_swapin()
-	(6) swap_free().
-
-	Considering following situation for example.
-
-	(A) The page has not been charged before (2) and reuse_swap_page()
-	    doesn't call delete_from_swap_cache().
-	(B) The page has not been charged before (2) and reuse_swap_page()
-	    calls delete_from_swap_cache().
-	(C) The page has been charged before (2) and reuse_swap_page() doesn't
-	    call delete_from_swap_cache().
-	(D) The page has been charged before (2) and reuse_swap_page() calls
-	    delete_from_swap_cache().
-
-	    memory.usage/memsw.usage changes to this page/swp_entry will be
-	 Case          (A)      (B)       (C)     (D)
-         Event
-       Before (2)     0/ 1     0/ 1      1/ 1    1/ 1
-          ===========================================
-          (3)        +1/+1    +1/+1     +1/+1   +1/+1
-          (4)          -       0/ 0       -     -1/ 0
-          (5)         0/-1     0/ 0     -1/-1    0/ 0
-          (6)          -       0/-1       -      0/-1
-          ===========================================
-       Result         1/ 1     1/ 1      1/ 1    1/ 1
-
-       In any cases, charges to this page should be 1/ 1.
-
 	4.2 Swap-out.
 	At swap-out, typical state transition is below.
 
@@ -158,28 +74,20 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	    swp_entry's refcnt -= 1.
 
 
-	At (b), the page is marked as SwapCache and not uncharged.
-	At (d), the page is removed from SwapCache and a charge in page_cgroup
-	is moved to swap_cgroup.
-
 	Finally, at task exit,
 	(e) zap_pte() is called and swp_entry's refcnt -=1 -> 0.
-	Here, a charge in swap_cgroup disappears.
 
 5. Page Cache
    	Page Cache is charged at
 	- add_to_page_cache_locked().
 
-	uncharged at
-	- __remove_from_page_cache().
-
 	The logic is very clear. (About migration, see below)
 	Note: __remove_from_page_cache() is called by remove_from_page_cache()
 	and __remove_mapping().
 
 6. Shmem(tmpfs) Page Cache
-	Memcg's charge/uncharge have special handlers of shmem. The best way
-	to understand shmem's page state transition is to read mm/shmem.c.
+	The best way to understand shmem's page state transition is to read
+	mm/shmem.c.
 	But brief explanation of the behavior of memcg around shmem will be
 	helpful to understand the logic.
 
@@ -192,56 +100,10 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	It's charged when...
 	- A new page is added to shmem's radix-tree.
 	- A swp page is read. (move a charge from swap_cgroup to page_cgroup)
-	It's uncharged when
-	- A page is removed from radix-tree and not SwapCache.
-	- When SwapCache is removed, a charge is moved to swap_cgroup.
-	- When swp_entry's refcnt goes down to 0, a charge in swap_cgroup
-	  disappears.
 
 7. Page Migration
-   	One of the most complicated functions is page-migration-handler.
-	Memcg has 2 routines. Assume that we are migrating a page's contents
-	from OLDPAGE to NEWPAGE.
-
-	Usual migration logic is..
-	(a) remove the page from LRU.
-	(b) allocate NEWPAGE (migration target)
-	(c) lock by lock_page().
-	(d) unmap all mappings.
-	(e-1) If necessary, replace entry in radix-tree.
-	(e-2) move contents of a page.
-	(f) map all mappings again.
-	(g) pushback the page to LRU.
-	(-) OLDPAGE will be freed.
-
-	Before (g), memcg should complete all necessary charge/uncharge to
-	NEWPAGE/OLDPAGE.
-
-	The point is....
-	- If OLDPAGE is anonymous, all charges will be dropped at (d) because
-          try_to_unmap() drops all mapcount and the page will not be
-	  SwapCache.
-
-	- If OLDPAGE is SwapCache, charges will be kept at (g) because
-	  __delete_from_swap_cache() isn't called at (e-1)
-
-	- If OLDPAGE is page-cache, charges will be kept at (g) because
-	  remove_from_swap_cache() isn't called at (e-1)
-
-	memcg provides following hooks.
-
-	- mem_cgroup_prepare_migration(OLDPAGE)
-	  Called after (b) to account a charge (usage += PAGE_SIZE) against
-	  memcg which OLDPAGE belongs to.
-
-        - mem_cgroup_end_migration(OLDPAGE, NEWPAGE)
-	  Called after (f) before (g).
-	  If OLDPAGE is used, commit OLDPAGE again. If OLDPAGE is already
-	  charged, a charge by prepare_migration() is automatically canceled.
-	  If NEWPAGE is used, commit NEWPAGE and uncharge OLDPAGE.
-
-	  But zap_pte() (by exit or munmap) can be called while migration,
-	  we have to check if OLDPAGE/NEWPAGE is a valid page after commit().
+
+	mem_cgroup_migrate()
 
 8. LRU
         Each memcg has its own private LRU. Now, its handling is under global
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -1825,6 +1825,7 @@ Configuring procfs
 The following mount options are supported:
 
 	hidepid=	Set /proc/<pid>/ access mode.
+	hidepidns=	Hide tasks from nested pid-namespaces.
 	gid=		Set the group authorized to learn processes information.
 
 hidepid=0 means classic mode - everybody may access all /proc/<pid>/ directories
@@ -1847,6 +1848,9 @@ information about running processes, whether some daemon runs with elevated
 privileges, whether other user runs some sensitive program, whether other users
 run any program at all, etc.
 
+hidepidns=1 makes all tasks from nested pid-namespaces invisible. They are still
+accessible via /proc/<pid>/, but readdir will not show them.
+
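+For example:
+
+	mount -o remount,hidepidns=1 /proc
+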
 gid= defines a group authorized to learn processes information otherwise
 prohibited by hidepid=.  If you use some daemon like identd which needs to learn
 information about processes information, just add identd to this group.
--- /dev/null
+++ b/Documentation/kasan.txt
@@ -0,0 +1,169 @@
+Kernel address sanitizer
+========================
+
+0. Overview
+===========
+
+Kernel Address sanitizer (KASan) is a dynamic memory error detector. It provides
+a fast and comprehensive solution for finding use-after-free and out-of-bounds
+bugs.
+
+KASan uses compile-time instrumentation for checking every memory access,
+therefore you will need GCC version 4.9.2 or later.
+
+Currently KASan is supported only on the x86_64 architecture and requires
+that the kernel be built with the SLUB allocator.
+
+1. Usage
+=========
+
+To enable KASAN, configure the kernel with:
+
+	  CONFIG_KASAN=y
+
+and choose between CONFIG_KASAN_OUTLINE and CONFIG_KASAN_INLINE. Outline and
+inline are compiler instrumentation types. The former produces a smaller
+binary while the latter is 1.1 - 2 times faster. Inline instrumentation
+requires GCC 5.0 or later.
+
+Currently KASAN works only with the SLUB memory allocator.
+For better bug detection and a nicer report, enable CONFIG_STACKTRACE.
+
+To disable instrumentation for specific files or directories, add a line
+similar to the following to the respective kernel Makefile:
+
+        For a single file (e.g. main.o):
+                KASAN_SANITIZE_main.o := n
+
+        For all files in one directory:
+                KASAN_SANITIZE := n
+
+1.1 Error reports
+=================
+
+A typical out of bounds access report looks like this:
+
+==================================================================
+BUG: AddressSanitizer: out of bounds access in kmalloc_oob_right+0x65/0x75 [test_kasan] at addr ffff8800693bc5d3
+Write of size 1 by task modprobe/1689
+=============================================================================
+BUG kmalloc-128 (Not tainted): kasan error
+-----------------------------------------------------------------------------
+
+Disabling lock debugging due to kernel taint
+INFO: Allocated in kmalloc_oob_right+0x3d/0x75 [test_kasan] age=0 cpu=0 pid=1689
+ __slab_alloc+0x4b4/0x4f0
+ kmem_cache_alloc_trace+0x10b/0x190
+ kmalloc_oob_right+0x3d/0x75 [test_kasan]
+ init_module+0x9/0x47 [test_kasan]
+ do_one_initcall+0x99/0x200
+ load_module+0x2cb3/0x3b20
+ SyS_finit_module+0x76/0x80
+ system_call_fastpath+0x12/0x17
+INFO: Slab 0xffffea0001a4ef00 objects=17 used=7 fp=0xffff8800693bd728 flags=0x100000000004080
+INFO: Object 0xffff8800693bc558 @offset=1368 fp=0xffff8800693bc720
+
+Bytes b4 ffff8800693bc548: 00 00 00 00 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a  ........ZZZZZZZZ
+Object ffff8800693bc558: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc568: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc578: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc588: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc598: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5a8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5b8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b  kkkkkkkkkkkkkkkk
+Object ffff8800693bc5c8: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b a5  kkkkkkkkkkkkkkk.
+Redzone ffff8800693bc5d8: cc cc cc cc cc cc cc cc                          ........
+Padding ffff8800693bc718: 5a 5a 5a 5a 5a 5a 5a 5a                          ZZZZZZZZ
+CPU: 0 PID: 1689 Comm: modprobe Tainted: G    B          3.18.0-rc1-mm1+ #98
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.7.5-0-ge51488c-20140602_164612-nilsson.home.kraxel.org 04/01/2014
+ ffff8800693bc000 0000000000000000 ffff8800693bc558 ffff88006923bb78
+ ffffffff81cc68ae 00000000000000f3 ffff88006d407600 ffff88006923bba8
+ ffffffff811fd848 ffff88006d407600 ffffea0001a4ef00 ffff8800693bc558
+Call Trace:
+ [<ffffffff81cc68ae>] dump_stack+0x46/0x58
+ [<ffffffff811fd848>] print_trailer+0xf8/0x160
+ [<ffffffffa00026a7>] ? kmem_cache_oob+0xc3/0xc3 [test_kasan]
+ [<ffffffff811ff0f5>] object_err+0x35/0x40
+ [<ffffffffa0002065>] ? kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffff8120b9fa>] kasan_report_error+0x38a/0x3f0
+ [<ffffffff8120a79f>] ? kasan_poison_shadow+0x2f/0x40
+ [<ffffffff8120b344>] ? kasan_unpoison_shadow+0x14/0x40
+ [<ffffffff8120a79f>] ? kasan_poison_shadow+0x2f/0x40
+ [<ffffffffa00026a7>] ? kmem_cache_oob+0xc3/0xc3 [test_kasan]
+ [<ffffffff8120a995>] __asan_store1+0x75/0xb0
+ [<ffffffffa0002601>] ? kmem_cache_oob+0x1d/0xc3 [test_kasan]
+ [<ffffffffa0002065>] ? kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffffa0002065>] kmalloc_oob_right+0x65/0x75 [test_kasan]
+ [<ffffffffa00026b0>] init_module+0x9/0x47 [test_kasan]
+ [<ffffffff810002d9>] do_one_initcall+0x99/0x200
+ [<ffffffff811e4e5c>] ? __vunmap+0xec/0x160
+ [<ffffffff81114f63>] load_module+0x2cb3/0x3b20
+ [<ffffffff8110fd70>] ? m_show+0x240/0x240
+ [<ffffffff81115f06>] SyS_finit_module+0x76/0x80
+ [<ffffffff81cd3129>] system_call_fastpath+0x12/0x17
+Memory state around the buggy address:
+ ffff8800693bc300: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc380: fc fc 00 00 00 00 00 00 00 00 00 00 00 00 00 fc
+ ffff8800693bc400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc500: fc fc fc fc fc fc fc fc fc fc fc 00 00 00 00 00
+>ffff8800693bc580: 00 00 00 00 00 00 00 00 00 00 03 fc fc fc fc fc
+                                                 ^
+ ffff8800693bc600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
+ ffff8800693bc700: fc fc fc fc fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff8800693bc780: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff8800693bc800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+==================================================================
+
+The first sections describe the SLUB object where the bad access happened.
+See 'SLUB Debug output' section in Documentation/vm/slub.txt for details.
+
+In the last section the report shows memory state around the accessed address.
+Reading this part requires some more understanding of how KASAN works.
+
+Each 8 bytes of memory are encoded in one shadow byte as accessible,
+partially accessible, freed or part of a redzone.
+We use the following encoding for each shadow byte: 0 means that all 8 bytes
+of the corresponding memory region are accessible; number N (1 <= N <= 7) means
+that the first N bytes are accessible, and other (8 - N) bytes are not;
+any negative value indicates that the entire 8-byte word is inaccessible.
+We use different negative values to distinguish between different kinds of
+inaccessible memory like redzones or freed memory (see mm/kasan/kasan.h).
+
+In the report above the arrows point to the shadow byte 03, which means that
+the accessed address is partially accessible.
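+
+A minimal sketch of how a 1-byte access could be validated against this
+encoding (an illustration, not the kernel's actual check -- see mm/kasan/):
+
+	static bool byte_is_poisoned(unsigned long addr, s8 shadow)
+	{
+		if (shadow == 0)
+			return false;	/* all 8 bytes are accessible */
+		if (shadow < 0)
+			return true;	/* redzone or freed memory */
+		/* 1..7: only the first 'shadow' bytes are accessible */
+		return (addr & 7) >= shadow;
+	}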
+
+
+2. Implementation details
+=========================
+
+From a high level, our approach to memory error detection is similar to that
+of kmemcheck: use shadow memory to record whether each byte of memory is safe
+to access, and use compile-time instrumentation to check shadow memory on each
+memory access.
+
+AddressSanitizer dedicates 1/8 of kernel memory to its shadow memory
+(e.g. 16TB to cover 128TB on x86_64) and uses direct mapping with a scale and
+offset to translate a memory address to its corresponding shadow address.
+
+Here is the function which translates an address to its corresponding shadow
+address:
+
+static inline void *kasan_mem_to_shadow(const void *addr)
+{
+	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
+		+ KASAN_SHADOW_OFFSET;
+}
+
+where KASAN_SHADOW_SCALE_SHIFT = 3.
+
+Compile-time instrumentation is used for checking memory accesses. The
+compiler inserts function calls (__asan_load*(addr), __asan_store*(addr))
+before each memory access of size 1, 2, 4, 8 or 16. These functions check
+whether the memory access is valid or not by checking the corresponding
+shadow memory.
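+
+For example, a 4-byte store like '*p = 1' is conceptually turned into (an
+illustration, not literal compiler output):
+
+	__asan_store4((unsigned long)p);	/* check the shadow first */
+	*p = 1;					/* the actual access */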
+
+GCC 5.0 can perform inline instrumentation. Instead of making function
+calls, GCC directly inserts the code to check the shadow memory. This option
+significantly enlarges the kernel, but it gives an x1.1-x2 performance boost
+over an outline-instrumented kernel.
--- /dev/null
+++ b/Documentation/kcov.txt
@@ -0,0 +1,111 @@
+kcov: code coverage for fuzzing
+===============================
+
+kcov exposes kernel code coverage information in a form suitable for coverage-
+guided fuzzing (randomized testing). Coverage data of a running kernel is
+exported via the "kcov" debugfs file. Coverage collection is enabled on a task
+basis, and thus it can capture precise coverage of a single system call.
+
+Note that kcov does not aim to collect as much coverage as possible. It aims
+to collect more or less stable coverage that is a function of syscall inputs.
+To achieve this goal it does not collect coverage in soft/hard interrupts,
+and instrumentation of some inherently non-deterministic parts of the kernel
+is disabled (e.g. scheduler, locking).
+
+Usage:
+======
+
+Configure kernel with:
+
+        CONFIG_KCOV=y
+
+CONFIG_KCOV requires gcc built on revision 231296 or later.
+Profiling data will only become accessible once debugfs has been mounted:
+
+        mount -t debugfs none /sys/kernel/debug
+
+The following program demonstrates kcov usage from within a test program:
+
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+#define COVER_SIZE			(64<<10)
+
+int main(int argc, char **argv)
+{
+	int fd;
+	unsigned long *cover, n, i;
+
+	/* A single file descriptor allows coverage collection on a single
+	 * thread.
+	 */
+	fd = open("/sys/kernel/debug/kcov", O_RDWR);
+	if (fd == -1)
+		perror("open"), exit(1);
+	/* Setup trace mode and trace size. */
+	if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
+		perror("ioctl"), exit(1);
+	/* Mmap buffer shared between kernel- and user-space. */
+	cover = (unsigned long*)mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+				     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+	if ((void*)cover == MAP_FAILED)
+		perror("mmap"), exit(1);
+	/* Enable coverage collection on the current thread. */
+	if (ioctl(fd, KCOV_ENABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Reset coverage from the tail of the ioctl() call. */
+	__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+	/* That's the target syscall. */
+	read(-1, NULL, 0);
+	/* Read number of PCs collected. */
+	n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
+	for (i = 0; i < n; i++)
+		printf("0x%lx\n", cover[i + 1]);
+	/* Disable coverage collection for the current thread. After this call
+	 * coverage can be enabled for a different thread.
+	 */
+	if (ioctl(fd, KCOV_DISABLE, 0))
+		perror("ioctl"), exit(1);
+	/* Free resources. */
+	if (munmap(cover, COVER_SIZE * sizeof(unsigned long)))
+		perror("munmap"), exit(1);
+	if (close(fd))
+		perror("close"), exit(1);
+	return 0;
+}
+
+After piping through addr2line, the output of the program looks as follows:
+
+SyS_read
+fs/read_write.c:562
+__fdget_pos
+fs/file.c:774
+__fget_light
+fs/file.c:746
+__fget_light
+fs/file.c:750
+__fget_light
+fs/file.c:760
+__fdget_pos
+fs/file.c:784
+SyS_read
+fs/read_write.c:562
+
+If a program needs to collect coverage from several threads (independently),
+it needs to open /sys/kernel/debug/kcov in each thread separately.
+
+The interface is fine-grained to allow efficient forking of test processes.
+That is, a parent process opens /sys/kernel/debug/kcov, enables trace mode,
+mmaps the coverage buffer and then forks child processes in a loop. Child
+processes only need to enable coverage (disable happens automatically on
+thread end), as sketched below.
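+
+A minimal sketch of that pattern, reusing fd and cover from the program
+above (the test body is hypothetical):
+
+	pid_t pid = fork();
+	if (pid == 0) {
+		/* Child: enable collection into the inherited buffer. */
+		if (ioctl(fd, KCOV_ENABLE, 0))
+			perror("ioctl"), exit(1);
+		__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);
+		/* ... run the test; coverage is disabled automatically
+		 * when the child exits. */
+		exit(0);
+	}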
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -3070,6 +3070,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	spia_pedr=
 	spia_peddr=
 
+	stack_guard_gap=	[MM]
+			override the default stack gap protection. The value
+			is in page units and it defines how many pages prior
+			to (for stacks growing down) resp. after (for stacks
+			growing up) the main stack are reserved for no other
+			mapping. Default value is 256 pages.
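+			For example, booting with "stack_guard_gap=512"
+			reserves a 2MB gap with 4KB pages.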
+
 	stacktrace	[FTRACE]
 			Enabled the stack tracer on boot up.
 
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -1412,6 +1412,7 @@ struct kvm_irq_routing_entry {
 	union {
 		struct kvm_irq_routing_irqchip irqchip;
 		struct kvm_irq_routing_msi msi;
+		struct kvm_irq_routing_hv_sint hv_sint;
 		__u32 pad[8];
 	} u;
 };
@@ -1419,6 +1420,7 @@ struct kvm_irq_routing_entry {
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_HV_SINT 4
 
 No flags are specified so far, the corresponding field must be set to zero.
 
@@ -1434,12 +1436,16 @@ struct kvm_irq_routing_msi {
 	__u32 pad;
 };
 
+struct kvm_irq_routing_hv_sint {
+	__u32 vcpu;
+	__u32 sint;
+};
+
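+A minimal sketch of a routing entry using these fields (the gsi, vcpu and
+sint values are arbitrary examples):
+
+	struct kvm_irq_routing_entry e = {
+		.gsi  = 5,
+		.type = KVM_IRQ_ROUTING_HV_SINT,
+		.u.hv_sint = { .vcpu = 0, .sint = 2 },
+	};
+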
 On x86, address_hi is ignored unless the KVM_X2APIC_API_USE_32BIT_IDS
 feature of KVM_CAP_X2APIC_API capability is enabled.  If it is enabled,
 address_hi bits 31-8 provide bits 31-8 of the destination id.  Bits 7-0 of
 address_hi must be zero.
 
-
 4.53 KVM_ASSIGN_SET_MSIX_NR
 
 Capability: none
@@ -2913,6 +2919,50 @@ the userspace IOAPIC should process the EOI and retrigger the interrupt if
 it is still asserted.  Vector is the LAPIC interrupt vector for which the
 EOI was received.
 
+		/* KVM_EXIT_SYSTEM_EVENT */
+		struct {
+#define KVM_SYSTEM_EVENT_SHUTDOWN       1
+#define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
+			__u32 type;
+			__u64 flags;
+		} system_event;
+
+If exit_reason is KVM_EXIT_SYSTEM_EVENT then the vcpu has triggered
+a system-level event using some architecture specific mechanism (hypercall
+or some special instruction). In case of ARM/ARM64, this is triggered using
+HVC instruction based PSCI call from the vcpu. The 'type' field describes
+the system-level event type. The 'flags' field describes architecture
+specific flags for the system-level event.
+
+		struct kvm_hyperv_exit {
+#define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
+			__u32 type;
+			union {
+				struct {
+					__u32 msr;
+					__u64 control;
+					__u64 evt_page;
+					__u64 msg_page;
+				} synic;
+				struct {
+					__u64 input;
+					__u64 result;
+					__u64 params[2];
+				} hcall;
+			} u;
+		};
+		/* KVM_EXIT_HYPERV */
+		struct kvm_hyperv_exit hyperv;
+Indicates that the VCPU exits into userspace to process some tasks
+related to Hyper-V emulation.
+Valid values for 'type' are:
+	KVM_EXIT_HYPERV_SYNIC -- synchronously notify user-space about
+Hyper-V SynIC state change. Notification is used to remap SynIC
+event/message pages and to enable/disable SynIC messages/events processing
+in userspace.
+
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -3186,3 +3236,16 @@ available, means that that the kernel has an implementation of the
 H_RANDOM hypercall backed by a hardware random-number generator.
 If present, the kernel H_RANDOM handler can be enabled for guest use
 with the KVM_CAP_PPC_ENABLE_HCALL capability.
+
+8.2 KVM_CAP_HYPERV_SYNIC
+
+Architectures: x86
+This capability, if KVM_CHECK_EXTENSION indicates that it is
+available, means that the kernel has an implementation of the
+Hyper-V Synthetic Interrupt Controller (SynIC). Hyper-V SynIC is
+used to support Windows Hyper-V based guest paravirt drivers (VMBus).
+
+In order to use SynIC, it has to be activated by setting this
+capability via KVM_ENABLE_CAP ioctl on the vcpu fd. Note that this
+will disable the use of APIC hardware virtualization even if supported
+by the CPU, as it's incompatible with SynIC auto-EOI behavior.
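+
+A minimal sketch of enabling it (vcpu_fd is an open vcpu file descriptor):
+
+	struct kvm_enable_cap cap = { .cap = KVM_CAP_HYPERV_SYNIC };
+
+	if (ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap) < 0)
+		perror("KVM_ENABLE_CAP");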
--- a/Documentation/vm/00-INDEX
+++ b/Documentation/vm/00-INDEX
@@ -12,6 +12,8 @@ hugetlbpage.txt
 	- a brief summary of hugetlbpage support in the Linux kernel.
 hwpoison.txt
 	- explains what hwpoison is
+idle_page_tracking.txt
+	- description of the idle page tracking feature.
 ksm.txt
 	- how to use the Kernel Samepage Merging feature.
 locking
--- a/Documentation/vm/cleancache.txt
+++ b/Documentation/vm/cleancache.txt
@@ -28,9 +28,7 @@ IMPLEMENTATION OVERVIEW
 A cleancache "backend" that provides transcendent memory registers itself
 to the kernel's cleancache "frontend" by calling cleancache_register_ops,
 passing a pointer to a cleancache_ops structure with funcs set appropriately.
-Note that cleancache_register_ops returns the previous settings so that
-chaining can be performed if desired. The functions provided must conform to
-certain semantics as follows:
+The functions provided must conform to certain semantics as follows:
 
 Most important, cleancache is "ephemeral".  Pages which are copied into
 cleancache have an indefinite lifetime which is completely unknowable
--- /dev/null
+++ b/Documentation/vm/idle_page_tracking.txt
@@ -0,0 +1,98 @@
+MOTIVATION
+
+The idle page tracking feature makes it possible to track which memory pages a
+workload accesses and which are idle. This information can be useful for
+estimating the workload's working set size, which, in turn, can be taken into
+account when configuring the workload parameters, setting memory cgroup limits,
+or deciding where to place the workload within a compute cluster.
+
+It is enabled by CONFIG_IDLE_PAGE_TRACKING=y.
+
+USER API
+
+The idle page tracking API is located at /sys/kernel/mm/page_idle. Currently,
+it consists of a single read-write file, /sys/kernel/mm/page_idle/bitmap.
+
+The file implements a bitmap where each bit corresponds to a memory page. The
+bitmap is represented by an array of 8-byte integers, and the page at PFN #i is
+mapped to bit #i%64 of array element #i/64; byte order is native. When a bit is
+set, the corresponding page is idle.
+
+A page is considered idle if it has not been accessed since it was marked idle
+(for more details on what "accessed" actually means see the IMPLEMENTATION
+DETAILS section). To mark a page idle one has to set the bit corresponding to
+the page by writing to the file. A value written to the file is OR-ed with the
+current bitmap value.
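+
+For example, a sketch of marking a single page idle (error handling omitted;
+pfn is the page frame number of interest):
+
+	uint64_t val = 1ull << (pfn % 64);
+	int fd = open("/sys/kernel/mm/page_idle/bitmap", O_WRONLY);
+
+	/* write the 8-byte word covering this PFN; zero bits are
+	 * harmless since the value is OR-ed into the bitmap */
+	pwrite(fd, &val, sizeof(val), (pfn / 64) * 8);
+	close(fd);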
+
+Only accesses to user memory pages are tracked. These are pages mapped to a
+process address space, page cache and buffer pages, and swap cache pages. For
+other page types (e.g. SLAB pages) an attempt to mark a page idle is silently
+ignored, and hence such pages are never reported idle.
+
+For huge pages the idle flag is set only on the head page, so one has to read
+/proc/kpageflags in order to correctly count idle huge pages.
+
+Reading from or writing to /sys/kernel/mm/page_idle/bitmap will return
+-EINVAL if you are not starting the read/write on an 8-byte boundary, or
+if the size of the read/write is not a multiple of 8 bytes. Writing to
+this file beyond max PFN will return -ENXIO.
+
+That said, in order to estimate the number of pages that are not used by a
+workload one should:
+
+ 1. Mark all the workload's pages as idle by setting corresponding bits in
+    /sys/kernel/mm/page_idle/bitmap. The pages can be found by reading
+    /proc/pid/pagemap if the workload is represented by a process, or by
+    filtering out alien pages using /proc/kpagecgroup in case the workload is
+    placed in a memory cgroup.
+
+ 2. Wait until the workload accesses its working set.
+
+ 3. Read /sys/kernel/mm/page_idle/bitmap and count the number of bits set. If
+    one wants to ignore certain types of pages, e.g. mlocked pages since they
+    are not reclaimable, he or she can filter them out using /proc/kpageflags.
+
+See Documentation/vm/pagemap.txt for more information about /proc/pid/pagemap,
+/proc/kpageflags, and /proc/kpagecgroup.
+
+IMPLEMENTATION DETAILS
+
+The kernel internally keeps track of accesses to user memory pages in order to
+reclaim unreferenced pages first on memory shortage conditions. A page is
+considered referenced if it has been recently accessed via a process address
+space, in which case one or more PTEs it is mapped to will have the Accessed bit
+set, or marked accessed explicitly by the kernel (see mark_page_accessed()). The
+latter happens when:
+
+ - a userspace process reads or writes a page using a system call (e.g. read(2)
+   or write(2))
+
+ - a page that is used for storing filesystem buffers is read or written,
+   because a process needs filesystem metadata stored in it (e.g. lists a
+   directory tree)
+
+ - a page is accessed by a device driver using get_user_pages()
+
+When a dirty page is written to swap or disk as a result of memory reclaim or
+exceeding the dirty memory limit, it is not marked referenced.
+
+The idle memory tracking feature adds a new page flag, the Idle flag. This flag
+is set manually, by writing to /sys/kernel/mm/page_idle/bitmap (see the USER API
+section), and cleared automatically whenever a page is referenced as defined
+above.
+
+When a page is marked idle, the Accessed bit must be cleared in all PTEs it is
+mapped to, otherwise we will not be able to detect accesses to the page coming
+from a process address space. To avoid interference with the reclaimer, which,
+as noted above, uses the Accessed bit to promote actively referenced pages, one
+more page flag is introduced, the Young flag. When the PTE Accessed bit is
+cleared as a result of setting or updating a page's Idle flag, the Young flag
+is set on the page. The reclaimer treats the Young flag as an extra PTE
+Accessed bit and therefore will consider such a page as referenced.
+
+Since the idle memory tracking feature is based on the memory reclaimer logic,
+it only works with pages that are on an LRU list; other pages are silently
+ignored. That means it will ignore a user memory page if it is isolated, but
+since there are usually not many of them, it should not affect the overall
+result noticeably. In order not to stall scanning of the idle page bitmap,
+locked pages may be skipped too.
--- a/Documentation/vm/pagemap.txt
+++ b/Documentation/vm/pagemap.txt
@@ -5,7 +5,7 @@ pagemap is a new (as of 2.6.25) set of interfaces in the kernel that allow
 userspace programs to examine the page tables and related information by
 reading files in /proc.
 
-There are three components to pagemap:
+There are four components to pagemap:
 
  * /proc/pid/pagemap.  This file lets a userspace process find out which
    physical frame each virtual page is mapped to.  It contains one 64-bit
@@ -63,6 +63,10 @@ There are three components to pagemap:
     21. KSM
     22. THP
 
+ * /proc/kpagecgroup.  This file contains a 64-bit inode number of the
+   memory cgroup each page is charged to, indexed by PFN. Only available when
+   CONFIG_MEMCG is set.
+
 Short descriptions to the page flags:
 
  0. LOCKED
--- /dev/null
+++ b/Documentation/vm/remap_file_pages.txt
@@ -0,0 +1,27 @@
+The remap_file_pages() system call is used to create a nonlinear mapping,
+that is, a mapping in which the pages of the file are mapped into a
+nonsequential order in memory. The advantage of using remap_file_pages()
+over using repeated calls to mmap(2) is that the former approach does not
+require the kernel to create additional VMA (Virtual Memory Area) data
+structures.
+
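+A brief usage sketch of the deprecated call (fd is an open file descriptor;
+prot must be 0 and pgoff is in units of pages):
+
+	char *p = mmap(NULL, 4 * 4096, PROT_READ, MAP_SHARED, fd, 0);
+
+	/* make the first page of the mapping show file page 2 */
+	remap_file_pages(p, 4096, 0, 2, 0);
+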
+Supporting nonlinear mappings requires a significant amount of non-trivial
+code in the kernel virtual memory subsystem, including hot paths. Also, to
+make nonlinear mappings work, the kernel needs a way to distinguish normal
+page table entries from entries with a file offset (pte_file). The kernel
+reserves a flag in the PTE for this purpose. PTE flags are a scarce resource,
+especially on some CPU architectures. It would be nice to free up the flag
+for other usage.
+
+Fortunately, there are not many users of remap_file_pages() in the wild.
+It's only known that one enterprise RDBMS implementation uses the syscall
+on 32-bit systems to map files bigger than can linearly fit into 32-bit
+virtual address space. This use-case is not critical anymore since 64-bit
+systems are widely available.
+
+The syscall is deprecated and has been replaced with an emulation now. The
+emulation creates new VMAs instead of nonlinear mappings. It's going to
+work slower for the rare users of remap_file_pages(), but the ABI is preserved.
+
+One side effect of the emulation (apart from performance) is that users can hit
+the vm.max_map_count limit more easily due to additional VMAs. See the comment
+at DEFAULT_MAX_MAP_COUNT for more details on the limit.
--- a/Documentation/x86/x86_64/mm.txt
+++ b/Documentation/x86/x86_64/mm.txt
@@ -12,6 +12,9 @@ ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
 ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
 ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
+ffffec0000000000 - fffffc0000000000 (=44 bits) kasan shadow memory (16TB)
+... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
 ffffffffa0000000 - ffffffffff5fffff (=1525 MB) module mapping space
 ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,22 @@ RHEL_RELEASE = 514.16.1
 RHEL_DRM_VERSION = 4
 RHEL_DRM_PATCHLEVEL = 6
 RHEL_DRM_SUBLEVEL = 5
+# VZVERSION = ovz.30.15
+VZVERSION = ovz.custom
+
+ifeq ($(VZVERSION), ovz.custom)
+  GIT_DIR := .git
+  ifneq ("$(wildcard $(GIT_DIR) )", "")
+    VZVERSION := $(shell git describe --abbrev=0 2>/dev/null | \
+		   sed -r 's/^.*\.vz7\.//')
+  else
+    VZVERSION := custom
+  endif
+
+  ifeq ($(EXTRAVERSION),)
+    EXTRAVERSION := -$(RHEL_RELEASE).ovz.$(VZVERSION)
+  endif
+endif
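+
+# Example: a hypothetical tag "rh7-3.10.0-514.16.1.vz7.30.15" would make the
+# sed expression above yield VZVERSION = 30.15.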
 
 # *DOCUMENTATION*
 # To see a list of typical targets execute "make help"
@@ -367,6 +383,7 @@ LDFLAGS_MODULE  =
 CFLAGS_KERNEL	=
 AFLAGS_KERNEL	=
 CFLAGS_GCOV	= -fprofile-arcs -ftest-coverage
+CFLAGS_KCOV	= -fsanitize-coverage=trace-pc
 
 
 # Use USERINCLUDE when you must reference the UAPI directories only.
@@ -419,14 +436,14 @@ KBUILD_LDFLAGS_MODULE := -T $(srctree)/scripts/module-common.lds
 KERNELRELEASE = $(shell cat include/config/kernel.release 2> /dev/null)
 KERNELVERSION = $(VERSION)$(if $(PATCHLEVEL),.$(PATCHLEVEL)$(if $(SUBLEVEL),.$(SUBLEVEL)))$(EXTRAVERSION)
 
-export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION
+export VERSION PATCHLEVEL SUBLEVEL KERNELRELEASE KERNELVERSION VZVERSION
 export ARCH SRCARCH CONFIG_SHELL HOSTCC HOSTCFLAGS CROSS_COMPILE AS LD CC
 export CPP AR NM STRIP OBJCOPY OBJDUMP
 export MAKE AWK GENKSYMS INSTALLKERNEL PERL UTS_MACHINE
 export HOSTCXX HOSTCXXFLAGS LDFLAGS_MODULE CHECK CHECKFLAGS
 
 export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS
-export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV
+export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KASAN CFLAGS_KCOV
 export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE
 export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE
 export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL
@@ -647,6 +664,14 @@ else
 endif
 KBUILD_CFLAGS += $(stackp-flag)
 
+ifdef CONFIG_KCOV
+  ifeq ($(call cc-option, $(CFLAGS_KCOV)),)
+    $(warning Cannot use CONFIG_KCOV: \
+             -fsanitize-coverage=trace-pc is not supported by compiler)
+    CFLAGS_KCOV =
+  endif
+endif
+
 # This warning generated too much noise in a regular build.
 # Use make W=1 to enable this warning (see scripts/Makefile.build)
 KBUILD_CFLAGS += $(call cc-disable-warning, unused-but-set-variable)
@@ -717,6 +742,8 @@ ifeq ($(shell $(CONFIG_SHELL) $(srctree)/scripts/gcc-goto.sh $(CC)), y)
 	KBUILD_CFLAGS += -DCC_HAVE_ASM_GOTO
 endif
 
+include $(srctree)/scripts/Makefile.kasan
+
 # Add user supplied CPPFLAGS, AFLAGS and CFLAGS as the last assignments
 KBUILD_CPPFLAGS += $(KCPPFLAGS)
 KBUILD_AFLAGS += $(KAFLAGS)
@@ -919,7 +946,8 @@ define filechk_utsrelease.h
 	  echo '"$(KERNELRELEASE)" exceeds $(uts_len) characters' >&2;    \
 	  exit 1;                                                         \
 	fi;                                                               \
-	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\";)
+	(echo \#define UTS_RELEASE \"$(KERNELRELEASE)\"; 		  \
+		echo \#define VZVERSION \"$(VZVERSION)\";)
 endef
 
 define filechk_version.h
--- a/arch/arc/mm/mmap.c
+++ b/arch/arc/mm/mmap.c
@@ -64,7 +64,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/arm/include/asm/kvm_host.h
+++ b/arch/arm/include/asm/kvm_host.h
@@ -32,6 +32,7 @@
 #define KVM_PRIVATE_MEM_SLOTS 4
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #define KVM_HAVE_ONE_REG
+#define KVM_HALT_POLL_NS_DEFAULT 500000
 
 #define KVM_VCPU_MAX_FEATURES 1
 
--- a/arch/arm/kernel/module.c
+++ b/arch/arm/kernel/module.c
@@ -40,7 +40,7 @@
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
+				GFP_KERNEL, PAGE_KERNEL_EXEC, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #endif
--- a/arch/arm/mm/mmap.c
+++ b/arch/arm/mm/mmap.c
@@ -89,7 +89,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -140,7 +140,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-				(!vma || addr + len <= vma->vm_start))
+				(!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/arm64/kernel/module.c
+++ b/arch/arm64/kernel/module.c
@@ -29,8 +29,8 @@
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL_EXEC, -1,
-				    __builtin_return_address(0));
+				    GFP_KERNEL, PAGE_KERNEL_EXEC, 0,
+				    NUMA_NO_NODE, __builtin_return_address(0));
 }
 
 enum aarch64_reloc_op {
--- a/arch/frv/mm/elf-fdpic.c
+++ b/arch/frv/mm/elf-fdpic.c
@@ -74,7 +74,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(current->mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			goto success;
 	}
 
--- a/arch/mips/include/asm/kvm_host.h
+++ b/arch/mips/include/asm/kvm_host.h
@@ -26,6 +26,7 @@
 #define KVM_PRIVATE_MEM_SLOTS 	0
 
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
+#define KVM_HALT_POLL_NS_DEFAULT 500000
 
 /* Don't support huge pages */
 #define KVM_HPAGE_GFN_SHIFT(x)	0
--- a/arch/mips/kernel/module.c
+++ b/arch/mips/kernel/module.c
@@ -23,6 +23,7 @@
 #include <linux/moduleloader.h>
 #include <linux/elf.h>
 #include <linux/mm.h>
+#include <linux/numa.h>
 #include <linux/vmalloc.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
@@ -46,7 +47,7 @@ static DEFINE_SPINLOCK(dbe_lock);
 void *module_alloc(unsigned long size)
 {
 	return __vmalloc_node_range(size, 1, MODULE_START, MODULE_END,
-				GFP_KERNEL, PAGE_KERNEL, -1,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #endif
--- a/arch/mips/mm/mmap.c
+++ b/arch/mips/mm/mmap.c
@@ -92,7 +92,7 @@ static unsigned long arch_get_unmapped_area_common(struct file *filp,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/parisc/kernel/module.c
+++ b/arch/parisc/kernel/module.c
@@ -219,7 +219,7 @@ void *module_alloc(unsigned long size)
 	 * init_data correctly */
 	return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
 				    GFP_KERNEL | __GFP_HIGHMEM,
-				    PAGE_KERNEL_RWX, -1,
+				    PAGE_KERNEL_RWX, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
 
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -1062,6 +1062,8 @@ endif
 config	ARCH_RANDOM
 	def_bool n
 
+source "kernel/Kconfig.openvz"
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
@@ -1072,6 +1074,8 @@ source "arch/powerpc/sysdev/qe_lib/Kconfig"
 
 source "lib/Kconfig"
 
+source "kernel/bc/Kconfig"
+
 source "arch/powerpc/Kconfig.debug"
 
 source "security/Kconfig"
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -45,6 +45,7 @@
 #ifdef CONFIG_KVM_MMIO
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 1
 #endif
+#define KVM_HALT_POLL_NS_DEFAULT 500000
 
 /* These values are internal and can be increased later */
 #define KVM_NR_IRQCHIPS          1
--- a/arch/powerpc/include/asm/systbl.h
+++ b/arch/powerpc/include/asm/systbl.h
@@ -369,3 +369,21 @@ SYSCALL(ni_syscall) /* sys_bpf */
 SYSCALL(ni_syscall) /* sys_execveat */
 PPC64ONLY(switch_endian)
 SYSCALL_SPU(userfaultfd)
+SYSCALL(ni_syscall) /* 365-370 reserved */
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(ni_syscall)
+SYSCALL(getluid)
+SYSCALL(setluid)
+SYSCALL(setublimit)
+SYSCALL(ubstat)
--- a/arch/powerpc/include/asm/unistd.h
+++ b/arch/powerpc/include/asm/unistd.h
@@ -11,8 +11,7 @@
 
 #include <uapi/asm/unistd.h>
 
-
-#define __NR_syscalls		365
+#define __NR_syscalls		382
 
 #define __NR__exit __NR_exit
 #define NR_syscalls	__NR_syscalls
--- a/arch/powerpc/include/uapi/asm/ioctls.h
+++ b/arch/powerpc/include/uapi/asm/ioctls.h
@@ -116,4 +116,6 @@
 #define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
 #define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
 
+#define TIOSAK		_IO('T', 0x66)	/* "Secure Attention Key" */
+
 #endif	/* _ASM_POWERPC_IOCTLS_H */
--- a/arch/powerpc/include/uapi/asm/unistd.h
+++ b/arch/powerpc/include/uapi/asm/unistd.h
@@ -388,5 +388,9 @@
 #define __NR_switch_endian	363
 #define __NR_userfaultfd	364
 
+#define __NR_getluid		379
+#define __NR_setluid		380
+#define __NR_setublimit		381
+#define __NR_ubstat		382
 
 #endif /* _UAPI_ASM_POWERPC_UNISTD_H_ */
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -37,6 +37,7 @@
 #include <linux/personality.h>
 #include <linux/random.h>
 #include <linux/hw_breakpoint.h>
+#include <linux/ve.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
--- a/arch/powerpc/mm/slice.c
+++ b/arch/powerpc/mm/slice.c
@@ -103,7 +103,7 @@ static int slice_area_is_free(struct mm_struct *mm, unsigned long addr,
 	if ((mm->task_size - len) < addr)
 		return 0;
 	vma = find_vma(mm, addr);
-	return (!vma || (addr + len) <= vma->vm_start);
+	return (!vma || (addr + len) <= vm_start_gap(vma));
 }
 
 static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice)
--- a/arch/powerpc/platforms/cell/spufs/inode.c
+++ b/arch/powerpc/platforms/cell/spufs/inode.c
@@ -763,7 +763,7 @@ static int __init spufs_init(void)
 	ret = -ENOMEM;
 	spufs_inode_cache = kmem_cache_create("spufs_inode_cache",
 			sizeof(struct spufs_inode_info), 0,
-			SLAB_HWCACHE_ALIGN, spufs_init_once);
+			SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, spufs_init_once);
 
 	if (!spufs_inode_cache)
 		goto out;
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -25,6 +25,8 @@
 #define KVM_MAX_VCPUS 64
 #define KVM_USER_MEM_SLOTS 32
 
+#define KVM_HALT_POLL_NS_DEFAULT 0
+
 struct sca_entry {
 	atomic_t scn;
 	__u32	reserved;
--- a/arch/s390/kernel/module.c
+++ b/arch/s390/kernel/module.c
@@ -50,7 +50,7 @@ void *module_alloc(unsigned long size)
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				    GFP_KERNEL, PAGE_KERNEL, -1,
+				    GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				    __builtin_return_address(0));
 }
 #endif
--- a/arch/sh/mm/mmap.c
+++ b/arch/sh/mm/mmap.c
@@ -63,7 +63,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -113,7 +113,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/sparc/kernel/module.c
+++ b/arch/sparc/kernel/module.c
@@ -29,7 +29,7 @@ static void *module_map(unsigned long size)
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
 	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
-				GFP_KERNEL, PAGE_KERNEL, -1,
+				GFP_KERNEL, PAGE_KERNEL, 0, NUMA_NO_NODE,
 				__builtin_return_address(0));
 }
 #else
--- a/arch/sparc/kernel/sys_sparc_64.c
+++ b/arch/sparc/kernel/sys_sparc_64.c
@@ -119,7 +119,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi
 
 		vma = find_vma(mm, addr);
 		if (task_size - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -182,7 +182,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 
 		vma = find_vma(mm, addr);
 		if (task_size - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/sparc/mm/hugetlbpage.c
+++ b/arch/sparc/mm/hugetlbpage.c
@@ -118,7 +118,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		addr = ALIGN(addr, HPAGE_SIZE);
 		vma = find_vma(mm, addr);
 		if (task_size - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 	if (mm->get_unmapped_area == arch_get_unmapped_area)
--- a/arch/tile/mm/hugetlbpage.c
+++ b/arch/tile/mm/hugetlbpage.c
@@ -269,7 +269,7 @@ unsigned long hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 	if (current->mm->get_unmapped_area == arch_get_unmapped_area)
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -82,6 +82,8 @@ config X86
 	select HAVE_CMPXCHG_LOCAL
 	select HAVE_CMPXCHG_DOUBLE
 	select HAVE_ARCH_KMEMCHECK
+	select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP
+	select ARCH_HAS_KCOV			if X86_64
 	select HAVE_USER_RETURN_NOTIFIER
 	select ARCH_BINFMT_ELF_RANDOMIZE_PIE
 	select HAVE_ARCH_JUMP_LABEL
@@ -1229,16 +1231,25 @@ config DIRECT_GBPAGES
 	  support it. This can improve the kernel's performance a tiny bit by
 	  reducing TLB pressure. If in doubt, say "Y".
 
-config TRACK_DIRTY_PAGES
-	bool "Enable dirty page tracking"
-	default n
-	depends on !KMEMCHECK
-	---help---
-	  Turning this on enables tracking of re-dirtied and
-	  changed pages.  This is needed by the Live Kernel
-	  Self Migration project (lksm.sourceforge.net) to perform
-	  live copying of memory and system state to another system.
-	  Most users will say n here.
+#
+# This tracker is breaking MEM_SOFT_DIRTY option because
+# it conflicts with the bits used there.
+#
+# So turn it off permanently because vanilla kernel already
+# has a tracker, no need to invent new one!
+#
+# 	-- cyrillos
+#
+#config TRACK_DIRTY_PAGES
+#	bool "Enable dirty page tracking"
+#	default n
+#	depends on !KMEMCHECK
+#	---help---
+#	  Turning this on enables tracking of re-dirtied and
+#	  changed pages.  This is needed by the Live Kernel
+#	  Self Migration project (lksm.sourceforge.net) to perform
+#	  live copying of memory and system state to another system.
+#	  Most users will say n here.
 
 # Common NUMA Features
 config NUMA
@@ -2449,6 +2460,8 @@ config VMD
 	  single domain. If you know your system provides one of these and
 	  has devices attached to it, say Y; if you are not sure, say N.
 
+source "kernel/Kconfig.openvz"
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
@@ -2466,3 +2479,5 @@ source "crypto/Kconfig"
 source "arch/x86/kvm/Kconfig"
 
 source "lib/Kconfig"
+
+source "kernel/bc/Kconfig"
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -11,11 +11,20 @@
 
 OBJECT_FILES_NON_STANDARD	:= y
 
+# Kernel does not boot with kcov instrumentation here.
+# One of the problems observed was insertion of __sanitizer_cov_trace_pc()
+# callback into middle of per-cpu data enabling code. Thus the callback observed
+# inconsistent state and crashed. We are interested mostly in syscall coverage,
+# so boot code is not interesting anyway.
+KCOV_INSTRUMENT		:= n
+
 # If you want to preset the SVGA mode, uncomment the next line and
 # set SVGA_MODE to whatever number you want.
 # Set it to -DSVGA_MODE=NORMAL_VGA if you just want the EGA/VGA mode.
 # The number is the same as you would ordinarily press at bootup.
 
+KASAN_SANITIZE := n
+
 SVGA_MODE	:= -DSVGA_MODE=NORMAL_VGA
 
 targets		:= vmlinux.bin setup.bin setup.elf bzImage
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -19,6 +19,10 @@ KBUILD_CFLAGS += $(call cc-option,-fno-stack-protector)
 
 KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 GCOV_PROFILE := n
+KASAN_SANITIZE := n
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
 
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
--- a/arch/x86/boot/compressed/eboot.c
+++ b/arch/x86/boot/compressed/eboot.c
@@ -7,6 +7,9 @@
  *
  * ----------------------------------------------------------------------- */
 
+#include "misc.h"
+#include <linux/types.h>
+#include "../string.h"
 #include <linux/efi.h>
 #include <linux/pci.h>
 #include <asm/efi.h>
@@ -14,8 +17,6 @@
 #include <asm/desc.h>
 #include <asm/bootparam_utils.h>
 
-#undef memcpy			/* Use memcpy from misc.c */
-
 #include "eboot.h"
 
 static efi_system_table_t *sys_table;
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -7,6 +7,7 @@
  * we just keep it from happening
  */
 #undef CONFIG_PARAVIRT
+#undef CONFIG_KASAN
 #ifdef CONFIG_X86_32
 #define _ASM_X86_DESC_H 1
 #endif
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -1996,8 +1996,13 @@ again:
 	intel_pmu_lbr_read();
 	intel_pmu_ack_status(status);
 	if (++loops > 100) {
-		WARN_ONCE(1, "perfevents: irq loop stuck!\n");
-		perf_event_print_debug();
+		static bool warned = false;
+		if (!warned) {
+			pr_warn("perfevents: irq loop stuck!\n");
+			dump_stack();
+			perf_event_print_debug();
+			warned = true;
+		}
 		intel_pmu_reset();
 		goto done;
 	}
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -19,6 +19,7 @@
 #include <linux/personality.h>
 #include <linux/compat.h>
 #include <linux/binfmts.h>
+#include <linux/ptrace.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
@@ -34,10 +35,28 @@
 #include <asm/sys_ia32.h>
 #include <asm/smap.h>
 
-int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact)
+{
+	/* Don't leak in-kernel non-uapi flags to user-space */
+	if (oact)
+		oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+	if (!act)
+		return;
+
+	/* Don't let flags be set from userspace */
+	act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI);
+
+	if (is_ia32_task())
+		act->sa.sa_flags |= SA_IA32_ABI;
+	if (is_x32_task())
+		act->sa.sa_flags |= SA_X32_ABI;
+}
+
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to, const siginfo_t *from,
+		bool x32_ABI)
 {
 	int err = 0;
-	bool ia32 = test_thread_flag(TIF_IA32);
 
 	if (!access_ok(VERIFY_WRITE, to, sizeof(compat_siginfo_t)))
 		return -EFAULT;
@@ -71,7 +90,7 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 				put_user_ex(from->si_arch, &to->si_arch);
 				break;
 			case __SI_CHLD >> 16:
-				if (ia32) {
+				if (!x32_ABI) {
 					put_user_ex(from->si_utime, &to->si_utime);
 					put_user_ex(from->si_stime, &to->si_stime);
 				} else {
@@ -105,6 +124,12 @@ int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
 	return err;
 }
 
+/* from syscall's path, where we know the ABI */
+int copy_siginfo_to_user32(compat_siginfo_t __user *to, siginfo_t *from)
+{
+	return __copy_siginfo_to_user32(to, from, is_x32_task());
+}
+
 int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from)
 {
 	int err = 0;
@@ -473,7 +498,7 @@ int ia32_setup_rt_frame(int sig, struct ksignal *ksig,
 		put_user_ex(*((u64 *)&code), (u64 __user *)frame->retcode);
 	} put_user_catch(err);
 
-	err |= copy_siginfo_to_user32(&frame->info, &ksig->info);
+	err |= __copy_siginfo_to_user32(&frame->info, &ksig->info, false);
 	err |= ia32_setup_sigcontext(&frame->uc.uc_mcontext, fpstate,
 				     regs, set->sig[0]);
 	err |= __copy_to_user(&frame->uc.uc_sigmask, set, sizeof(*set));
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
 				advice);
 }
 
-long sys32_vm86_warning(void)
-{
-	struct task_struct *me = current;
-	static char lastcomm[sizeof(me->comm)];
-
-	if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
-		compat_printk(KERN_INFO
-			      "%s: vm86 mode not supported on 64 bit kernel\n",
-			      me->comm);
-		strncpy(lastcomm, me->comm, sizeof(lastcomm));
-	}
-	return -ENOSYS;
-}
-
 asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
 				   size_t count)
 {
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -261,19 +261,17 @@ struct compat_shmid64_ds {
 /*
  * The type of struct elf_prstatus.pr_reg in compatible core dumps.
  */
-#ifdef CONFIG_X86_X32_ABI
 typedef struct user_regs_struct compat_elf_gregset_t;
 
-#define PR_REG_SIZE(S) (test_thread_flag(TIF_IA32) ? 68 : 216)
-#define PRSTATUS_SIZE(S) (test_thread_flag(TIF_IA32) ? 144 : 296)
-#define SET_PR_FPVALID(S,V) \
-  do { *(int *) (((void *) &((S)->pr_reg)) + PR_REG_SIZE(0)) = (V); } \
+/* Full regset -- prstatus on x32, otherwise on ia32 */
+#define PRSTATUS_SIZE(S, R) ((R) != sizeof((S).pr_reg) ? 144 : 296)
+#define SET_PR_FPVALID(S, V, R) \
+  do { *(int *) (((void *) &((S)->pr_reg)) + (R)) = (V); } \
   while (0)
 
+#ifdef CONFIG_X86_X32_ABI
 #define COMPAT_USE_64BIT_TIME \
 	(!!(task_pt_regs(current)->orig_ax & __X32_SYSCALL_BIT))
-#else
-typedef struct user_regs_struct32 compat_elf_gregset_t;
 #endif
 
 /*
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -107,6 +107,7 @@
 #define X86_FEATURE_APERFMPERF	(3*32+28) /* APERFMPERF */
 #define X86_FEATURE_EAGER_FPU	(3*32+29) /* "eagerfpu" Non lazy FPU restore */
 #define X86_FEATURE_NONSTOP_TSC_S3 (3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_CPUID_FAULTING (3*32+31) /* cpuid faulting */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3	(4*32+ 0) /* "pni" SSE-3 */
@@ -389,6 +390,7 @@ extern const char * const x86_power_flags[32];
 #define cpu_has_eager_fpu	boot_cpu_has(X86_FEATURE_EAGER_FPU)
 #define cpu_has_topoext		boot_cpu_has(X86_FEATURE_TOPOEXT)
 #define cpu_has_bpext		boot_cpu_has(X86_FEATURE_BPEXT)
+#define cpu_has_cpuid_faulting	boot_cpu_has(X86_FEATURE_CPUID_FAULTING)
 
 #if __GNUC__ >= 4
 /*
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -336,6 +336,10 @@ extern int x32_setup_additional_pages(struct linux_binprm *bprm,
 extern int syscall32_setup_pages(struct linux_binprm *, int exstack);
 #define compat_arch_setup_additional_pages	syscall32_setup_pages
 
+#ifdef CONFIG_X86_64
+extern int do_map_compat_vdso(unsigned long addr);
+#endif
+
 extern unsigned long arch_randomize_brk(struct mm_struct *mm);
 #define arch_randomize_brk arch_randomize_brk
 
--- a/arch/x86/include/asm/fpu-internal.h
+++ b/arch/x86/include/asm/fpu-internal.h
@@ -22,6 +22,7 @@
 #include <asm/uaccess.h>
 #include <asm/xsave.h>
 #include <asm/smap.h>
+#include <asm/signal.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/sigcontext32.h>
@@ -38,6 +39,12 @@ int ia32_setup_frame(int sig, struct ksignal *ksig,
 # define ia32_setup_rt_frame	__setup_rt_frame
 #endif
 
+#ifdef CONFIG_COMPAT
+int __copy_siginfo_to_user32(compat_siginfo_t __user *to,
+		const siginfo_t *from, bool x32_ABI);
+#endif
+
 extern unsigned int mxcsr_feature_mask;
 extern void fpu_init(void);
 extern void eager_fpu_init(void);
@@ -69,20 +76,21 @@ extern void finit_soft_fpu(struct i387_soft_struct *soft);
 static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
 #endif
 
-static inline int is_ia32_compat_frame(void)
+static inline int is_ia32_compat_frame(struct ksignal *ksig)
 {
 	return config_enabled(CONFIG_IA32_EMULATION) &&
-	       test_thread_flag(TIF_IA32);
+		ksig->ka.sa.sa_flags & SA_IA32_ABI;
 }
 
-static inline int is_ia32_frame(void)
+static inline int is_ia32_frame(struct ksignal *ksig)
 {
-	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame();
+	return config_enabled(CONFIG_X86_32) || is_ia32_compat_frame(ksig);
 }
 
-static inline int is_x32_frame(void)
+static inline int is_x32_frame(struct ksignal *ksig)
 {
-	return config_enabled(CONFIG_X86_X32_ABI) && test_thread_flag(TIF_X32);
+	return config_enabled(CONFIG_X86_X32_ABI) &&
+		ksig->ka.sa.sa_flags & SA_X32_ABI;
 }
 
 #define X87_FSW_ES (1 << 7)	/* Exception Summary */
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -110,7 +110,7 @@ static inline bool setup_remapped_irq(int irq,
 	return false;
 }
 
-int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
+static inline int irq_set_vcpu_affinity(unsigned int irq, void *vcpu_info)
 {
 	return -ENOSYS;
 }
--- /dev/null
+++ b/arch/x86/include/asm/kasan.h
@@ -0,0 +1,27 @@
+#ifndef _ASM_X86_KASAN_H
+#define _ASM_X86_KASAN_H
+
+/*
+ * The compiler computes shadow addresses assuming that addresses start
+ * from 0.  Kernel addresses don't start from 0, so the shadow for the
+ * kernel really starts from the compiler's shadow offset plus
+ * 'kernel address space start' >> KASAN_SHADOW_SCALE_SHIFT.
+ */
+#define KASAN_SHADOW_START      (KASAN_SHADOW_OFFSET + \
+					(0xffff800000000000ULL >> 3))
+/* 47 bits for kernel address -> (47 - 3) bits for shadow */
+#define KASAN_SHADOW_END        (KASAN_SHADOW_START + (1ULL << (47 - 3)))
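+
+/*
+ * Example: the shadow byte for a kernel address A lives at
+ * KASAN_SHADOW_OFFSET + (A >> KASAN_SHADOW_SCALE_SHIFT), so the lowest
+ * kernel address, 0xffff800000000000, maps to KASAN_SHADOW_START.
+ */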
+
+#ifndef __ASSEMBLY__
+
+#ifdef CONFIG_KASAN
+void __init kasan_early_init(void);
+void __init kasan_init(void);
+#else
+static inline void kasan_early_init(void) { }
+static inline void kasan_init(void) { }
+#endif
+
+#endif
+
+#endif
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,6 +25,7 @@
 #include <linux/pvclock_gtod.h>
 #include <linux/clocksource.h>
 #include <linux/irqbypass.h>
+#include <linux/hyperv.h>
 
 #include <asm/pvclock-abi.h>
 #include <asm/desc.h>
@@ -43,6 +44,7 @@
 
 #define KVM_PIO_PAGE_OFFSET 1
 #define KVM_COALESCED_MMIO_PAGE_OFFSET 2
+#define KVM_HALT_POLL_NS_DEFAULT 500000
 
 #define KVM_IRQCHIP_NUM_PINS  KVM_IOAPIC_NUM_PINS
 
@@ -357,9 +359,38 @@ enum {
 	KVM_DEBUGREG_RELOAD = 4,
 };
 
+/* Hyper-V SynIC timer */
+struct kvm_vcpu_hv_stimer {
+	struct hrtimer timer;
+	int index;
+	u64 config;
+	u64 count;
+	u64 exp_time;
+	struct hv_message msg;
+	bool msg_pending;
+};
+
+/* Hyper-V synthetic interrupt controller (SynIC) */
+struct kvm_vcpu_hv_synic {
+	u64 version;
+	u64 control;
+	u64 msg_page;
+	u64 evt_page;
+	atomic64_t sint[HV_SYNIC_SINT_COUNT];
+	atomic_t sint_to_gsi[HV_SYNIC_SINT_COUNT];
+	DECLARE_BITMAP(auto_eoi_bitmap, 256);
+	DECLARE_BITMAP(vec_bitmap, 256);
+	bool active;
+};
+
 /* Hyper-V per vcpu emulation context */
 struct kvm_vcpu_hv {
 	u64 hv_vapic;
+	s64 runtime_offset;
+	struct kvm_vcpu_hv_synic synic;
+	struct kvm_hyperv_exit exit;
+	struct kvm_vcpu_hv_stimer stimer[HV_SYNIC_STIMER_COUNT];
+	DECLARE_BITMAP(stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
 };
 
 struct kvm_vcpu_arch {
@@ -383,6 +414,7 @@ struct kvm_vcpu_arch {
 	u64 apic_base;
 	struct kvm_lapic *apic;    /* kernel irqchip context */
 	u64 eoi_exit_bitmap[4];
+	bool apicv_active;
 	unsigned long apic_attention;
 	int32_t apic_arb_prio;
 	int mp_state;
@@ -472,6 +504,7 @@ struct kvm_vcpu_arch {
 		struct kvm_steal_time steal;
 	} st;
 
+	u64 tsc_offset;
 	u64 last_guest_tsc;
 	u64 last_host_tsc;
 	u64 tsc_offset_adjustment;
@@ -599,6 +632,12 @@ struct kvm_hv {
 	u64 hv_guest_os_id;
 	u64 hv_hypercall;
 	u64 hv_tsc_page;
+
+	/* Hyper-V based guest crash (NT kernel bugcheck) parameters */
+	u64 hv_crash_param[HV_X64_MSR_CRASH_PARAMS];
+	u64 hv_crash_ctl;
+
+	HV_REFERENCE_TSC_PAGE tsc_ref;
 };
 
 struct kvm_arch {
@@ -814,7 +853,8 @@ struct kvm_x86_ops {
 	void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
 	void (*enable_irq_window)(struct kvm_vcpu *vcpu);
 	void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
-	int (*cpu_uses_apicv)(struct kvm_vcpu *vcpu);
+	bool (*get_enable_apicv)(void);
+	void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu);
 	void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr);
 	void (*hwapic_isr_update)(struct kvm *kvm, int isr);
 	void (*load_eoi_exitmap)(struct kvm_vcpu *vcpu);
@@ -828,7 +868,6 @@ struct kvm_x86_ops {
 	int (*get_lpage_level)(void);
 	bool (*rdtscp_supported)(void);
 	bool (*invpcid_supported)(void);
-	void (*adjust_tsc_offset_guest)(struct kvm_vcpu *vcpu, s64 adjustment);
 
 	void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
 
@@ -836,11 +875,8 @@ struct kvm_x86_ops {
 
 	bool (*has_wbinvd_exit)(void);
 
-	u64 (*read_tsc_offset)(struct kvm_vcpu *vcpu);
 	void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
 
-	u64 (*read_l1_tsc)(struct kvm_vcpu *vcpu, u64 host_tsc);
-
 	void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
 
 	int (*check_intercept)(struct kvm_vcpu *vcpu,
@@ -1071,6 +1107,8 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
 				struct x86_exception *exception);
 
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu);
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
 
 int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
@@ -1201,7 +1239,7 @@ void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
 int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
 int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
 int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v, bool make_req);
 void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
 void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu);
 void kvm_arch_mmu_notifier_invalidate_page(struct kvm *kvm,
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -48,6 +48,7 @@
 #define MSR_MTRRcap			0x000000fe
 #define MSR_IA32_BBL_CR_CTL		0x00000119
 #define MSR_IA32_BBL_CR_CTL3		0x0000011e
+#define MSR_MISC_FEATURES_ENABLES	0x00000140
 
 #define MSR_IA32_SYSENTER_CS		0x00000174
 #define MSR_IA32_SYSENTER_ESP		0x00000175
--- a/arch/x86/include/asm/page_64_types.h
+++ b/arch/x86/include/asm/page_64_types.h
@@ -1,17 +1,23 @@
 #ifndef _ASM_X86_PAGE_64_DEFS_H
 #define _ASM_X86_PAGE_64_DEFS_H
 
-#define THREAD_SIZE_ORDER	2
+#ifdef CONFIG_KASAN
+#define KASAN_STACK_ORDER 1
+#else
+#define KASAN_STACK_ORDER 0
+#endif
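+
+/*
+ * KASAN-instrumented kernels need extra stack: KASAN_STACK_ORDER doubles
+ * each of the stack sizes below, e.g. THREAD_SIZE grows from 16K to 32K
+ * with 4K pages.
+ */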
+
+#define THREAD_SIZE_ORDER	(2 + KASAN_STACK_ORDER)
 #define THREAD_SIZE  (PAGE_SIZE << THREAD_SIZE_ORDER)
 #define CURRENT_MASK (~(THREAD_SIZE - 1))
 
-#define EXCEPTION_STACK_ORDER 0
+#define EXCEPTION_STACK_ORDER (0 + KASAN_STACK_ORDER)
 #define EXCEPTION_STKSZ (PAGE_SIZE << EXCEPTION_STACK_ORDER)
 
 #define DEBUG_STACK_ORDER (EXCEPTION_STACK_ORDER + 1)
 #define DEBUG_STKSZ (PAGE_SIZE << DEBUG_STACK_ORDER)
 
-#define IRQ_STACK_ORDER 2
+#define IRQ_STACK_ORDER (2 + KASAN_STACK_ORDER)
 #define IRQ_STACK_SIZE (PAGE_SIZE << IRQ_STACK_ORDER)
 
 #define DOUBLEFAULT_STACK 1
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -128,7 +128,8 @@ do {							\
 do {									\
 	typedef typeof(var) pao_T__;					\
 	const int pao_ID__ = (__builtin_constant_p(val) &&		\
-			      ((val) == 1 || (val) == -1)) ? (val) : 0;	\
+			      ((val) == 1 || (val) == -1)) ?		\
+				(int)(val) : 0;				\
 	if (0) {							\
 		pao_T__ pao_tmp__;					\
 		pao_tmp__ = (val);					\
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -4,6 +4,7 @@
 #include <linux/threads.h>
 #include <linux/mm.h>		/* for struct page */
 #include <linux/pagemap.h>
+#include <linux/sched.h>	/* for init_mm */
 
 static inline int  __paravirt_pgd_alloc(struct mm_struct *mm) { return 0; }
 
@@ -81,11 +82,15 @@ static inline void pmd_populate(struct mm_struct *mm, pmd_t *pmd,
 static inline pmd_t *pmd_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
 	struct page *page;
-	page = alloc_pages(GFP_KERNEL | __GFP_REPEAT | __GFP_ZERO, 0);
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_REPEAT | __GFP_ZERO;
+
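+	/* Kernel (init_mm) page tables are not charged to a memory cgroup */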
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
+	page = alloc_pages(gfp, 0);
 	if (!page)
 		return NULL;
 	if (!pgtable_pmd_page_ctor(page)) {
-		__free_pages(page, 0);
+		__free_page(page);
 		return NULL;
 	}
 	return (pmd_t *)page_address(page);
@@ -125,7 +130,11 @@ static inline void pgd_populate(struct mm_struct *mm, pgd_t *pgd, pud_t *pud)
 
 static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
 {
-	return (pud_t *)get_zeroed_page(GFP_KERNEL|__GFP_REPEAT);
+	gfp_t gfp = GFP_KERNEL_ACCOUNT | __GFP_REPEAT;
+
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
+	return (pud_t *)get_zeroed_page(gfp);
 }
 
 static inline void pud_free(struct mm_struct *mm, pud_t *pud)
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -60,93 +60,9 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
-#ifdef CONFIG_MEM_SOFT_DIRTY
-
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE, _PAGE_BIT_SOFT_DIRTY and
- * _PAGE_BIT_PROTNONE are taken, split up the 28 bits of offset
- * into this range.
- */
-#define PTE_FILE_MAX_BITS	28
-#define PTE_FILE_SHIFT1		(_PAGE_BIT_PRESENT + 1)
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_SHIFT4		(_PAGE_BIT_SOFT_DIRTY + 1)
-#define PTE_FILE_BITS1		(PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-#define PTE_FILE_BITS3		(PTE_FILE_SHIFT4 - PTE_FILE_SHIFT3 - 1)
-
-#define pte_to_pgoff(pte)						\
-	((((pte).pte_low >> (PTE_FILE_SHIFT1))				\
-	  & ((1U << PTE_FILE_BITS1) - 1)))				\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT2))			\
-	    & ((1U << PTE_FILE_BITS2) - 1))				\
-	   << (PTE_FILE_BITS1))						\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT3))			\
-	    & ((1U << PTE_FILE_BITS3) - 1))				\
-	   << (PTE_FILE_BITS1 + PTE_FILE_BITS2))			\
-	+ ((((pte).pte_low >> (PTE_FILE_SHIFT4)))			\
-	    << (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3))
-
-#define pgoff_to_pte(off)						\
-	((pte_t) { .pte_low =						\
-	 ((((off)) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1)	\
-	 + ((((off) >> PTE_FILE_BITS1)					\
-	     & ((1U << PTE_FILE_BITS2) - 1))				\
-	    << PTE_FILE_SHIFT2)						\
-	 + ((((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2))		\
-	     & ((1U << PTE_FILE_BITS3) - 1))				\
-	    << PTE_FILE_SHIFT3)						\
-	 + ((((off) >>							\
-	      (PTE_FILE_BITS1 + PTE_FILE_BITS2 + PTE_FILE_BITS3)))	\
-	    << PTE_FILE_SHIFT4)						\
-	 + _PAGE_FILE })
-
-#else /* CONFIG_MEM_SOFT_DIRTY */
-
-/*
- * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
- * split up the 29 bits of offset into this range.
- */
-#define PTE_FILE_MAX_BITS	29
-#define PTE_FILE_SHIFT1		(_PAGE_BIT_PRESENT + 1)
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_FILE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_PROTNONE + 1)
-#else
-#define PTE_FILE_SHIFT2		(_PAGE_BIT_PROTNONE + 1)
-#define PTE_FILE_SHIFT3		(_PAGE_BIT_FILE + 1)
-#endif
-#define PTE_FILE_BITS1		(PTE_FILE_SHIFT2 - PTE_FILE_SHIFT1 - 1)
-#define PTE_FILE_BITS2		(PTE_FILE_SHIFT3 - PTE_FILE_SHIFT2 - 1)
-
-#define pte_to_pgoff(pte)						\
-	((((pte).pte_low >> PTE_FILE_SHIFT1)				\
-	  & ((1U << PTE_FILE_BITS1) - 1))				\
-	 + ((((pte).pte_low >> PTE_FILE_SHIFT2)				\
-	     & ((1U << PTE_FILE_BITS2) - 1)) << PTE_FILE_BITS1)		\
-	 + (((pte).pte_low >> PTE_FILE_SHIFT3)				\
-	    << (PTE_FILE_BITS1 + PTE_FILE_BITS2)))
-
-#define pgoff_to_pte(off)						\
-	((pte_t) { .pte_low =						\
-	 (((off) & ((1U << PTE_FILE_BITS1) - 1)) << PTE_FILE_SHIFT1)	\
-	 + ((((off) >> PTE_FILE_BITS1) & ((1U << PTE_FILE_BITS2) - 1))	\
-	    << PTE_FILE_SHIFT2)						\
-	 + (((off) >> (PTE_FILE_BITS1 + PTE_FILE_BITS2))		\
-	    << PTE_FILE_SHIFT3)						\
-	 + _PAGE_FILE })
-
-#endif /* CONFIG_MEM_SOFT_DIRTY */
-
 /* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
-#define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
+#define SWP_TYPE_BITS 5
 #define SWP_OFFSET_SHIFT (_PAGE_BIT_PROTNONE + 1)
-#else
-#define SWP_TYPE_BITS (_PAGE_BIT_PROTNONE - _PAGE_BIT_PRESENT - 1)
-#define SWP_OFFSET_SHIFT (_PAGE_BIT_FILE + 1)
-#endif
 
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > SWP_TYPE_BITS)
 
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -189,18 +189,6 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
 #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
 #endif
 
-/*
- * Bits 0, 6 and 7 are taken in the low part of the pte,
- * put the 32 bits of offset into the high part.
- *
- * For soft-dirty tracking 11 bit is taken from
- * the low part of pte as well.
- */
-#define pte_to_pgoff(pte) ((pte).pte_high)
-#define pgoff_to_pte(off)						\
-	((pte_t) { { .pte_low = _PAGE_FILE, .pte_high = (off) } })
-#define PTE_FILE_MAX_BITS       32
-
 /* Encode and de-code a swap entry */
 #define MAX_SWAPFILES_CHECK() BUILD_BUG_ON(MAX_SWAPFILES_SHIFT > 5)
 #define __swp_type(x)			(((x).val) & 0x1f)
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -108,11 +108,6 @@ static inline int pte_write(pte_t pte)
 	return pte_flags(pte) & _PAGE_RW;
 }
 
-static inline int pte_file(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_FILE;
-}
-
 static inline int pte_huge(pte_t pte)
 {
 	return pte_flags(pte) & _PAGE_PSE;
@@ -340,21 +335,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 	return pte_clear_flags(pte, _PAGE_SWP_SOFT_DIRTY);
 }
 
-static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
-{
-	return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline pte_t pte_file_mksoft_dirty(pte_t pte)
-{
-	return pte_set_flags(pte, _PAGE_SOFT_DIRTY);
-}
-
-static inline int pte_file_soft_dirty(pte_t pte)
-{
-	return pte_flags(pte) & _PAGE_SOFT_DIRTY;
-}
-
 /*
  * Mask out unsupported bits in a present pgprot.  Non-present pgprots
  * can use those bits for other purposes, so leave them be.
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -140,10 +140,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 /* PUD - Level3 access */
 
 /* PMD  - Level 2 access */
-#define pte_to_pgoff(pte) ((pte_val((pte)) & PHYSICAL_PAGE_MASK) >> PAGE_SHIFT)
-#define pgoff_to_pte(off) ((pte_t) { .pte = ((off) << PAGE_SHIFT) |	\
-					    _PAGE_FILE })
-#define PTE_FILE_MAX_BITS __PHYSICAL_MASK_SHIFT
 
 /* PTE - Level 1 access. */
 
@@ -151,11 +147,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 #define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
 #define pte_unmap(pte) ((void)(pte))/* NOP */
 
-/* Encode and de-code a swap entry */
-#if _PAGE_BIT_FILE > _PAGE_BIT_PROTNONE
-#error unsupported PTE bit arrangement
-#endif
-
 /*
  * Encode and de-code a swap entry
  *
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -30,13 +30,6 @@
 /* If _PAGE_BIT_PRESENT is clear, we use these: */
 /* - if the user mapped it with PROT_NONE; pte_present gives true */
 #define _PAGE_BIT_PROTNONE	_PAGE_BIT_GLOBAL
-/* - set: nonlinear file mapping, saved PTE; unset:swap */
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-/* Pick a bit unaffected by the "KNL4 erratum": */
-#define _PAGE_BIT_FILE		_PAGE_BIT_PCD
-#else
-#define _PAGE_BIT_FILE		_PAGE_BIT_DIRTY
-#endif
 
 #define _PAGE_PRESENT	(_AT(pteval_t, 1) << _PAGE_BIT_PRESENT)
 #define _PAGE_RW	(_AT(pteval_t, 1) << _PAGE_BIT_RW)
@@ -105,29 +98,6 @@
 #endif
 
 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
-/*
- * Do compile-time checks for all the bits that may be set on
- * non-present PTEs
- */
-#if _PAGE_BIT_FILE == _PAGE_BIT_SWP_SOFT_DIRTY
-#error conflicting _PAGE_BIT_FILE
-#endif
-#if _PAGE_BIT_FILE == _PAGE_BIT_PROTNONE
-#error conflicting _PAGE_BIT_FILE
-#endif
-/*
- * Do compile-time checks for all the bits affected by the "KNL4"
- * erratum:
- */
-#if _PAGE_BIT_FILE == _PAGE_BIT_DIRTY
-#error conflicting _PAGE_BIT_FILE
-#endif
-#if _PAGE_BIT_FILE == _PAGE_BIT_ACCESSED
-#error conflicting _PAGE_BIT_FILE
-#endif
-#endif
-
-#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
 #define _PAGE_NX	(_AT(pteval_t, 1) << _PAGE_BIT_NX)
 #define _PAGE_DEVMAP	(_AT(u64, 1) << _PAGE_BIT_DEVMAP)
 #define __HAVE_ARCH_PTE_DEVMAP
@@ -136,7 +106,6 @@
 #define _PAGE_DEVMAP	(_AT(pteval_t, 0))
 #endif
 
-#define _PAGE_FILE	(_AT(pteval_t, 1) << _PAGE_BIT_FILE)
 #define _PAGE_PROTNONE	(_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE)
 
 /*
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -891,7 +891,8 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 #define task_pt_regs(task)                                             \
 ({                                                                     \
        struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
+       __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task)) - \
+				     TOP_OF_KERNEL_STACK_PADDING);     \
        __regs__ - 1;                                                   \
 })
 
@@ -1006,4 +1007,7 @@ bool xen_set_default_idle(void);
 
 void stop_this_cpu(void *dummy);
 
+extern void (*set_cpuid_faulting_cb)(bool enable);
+extern void set_cpuid_faulting(bool enable);
+
 #endif /* _ASM_X86_PROCESSOR_H */
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -16,6 +16,24 @@ void pvclock_resume(void);
 
 void pvclock_touch_watchdogs(void);
 
+static __always_inline
+unsigned pvclock_read_begin(const struct pvclock_vcpu_time_info *src)
+{
+	unsigned version = src->version & ~1;
+	/* Make sure that the version is read before the data. */
+	smp_rmb();
+	return version;
+}
+
+static __always_inline
+bool pvclock_read_retry(const struct pvclock_vcpu_time_info *src,
+			unsigned version)
+{
+	/* Make sure that the version is re-read after the data. */
+	smp_rmb();
+	return unlikely(version != src->version);
+}
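+
+/*
+ * A typical reader loop (a sketch; see pvclock_clocksource_read()):
+ *
+ *	do {
+ *		version = pvclock_read_begin(src);
+ *		... read the fields of *src ...
+ *	} while (pvclock_read_retry(src, version));
+ */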
+
 /*
  * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
  * yielding a 64-bit result.
@@ -60,30 +78,12 @@ static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
 }
 
 static __always_inline
-u64 pvclock_get_nsec_offset(const struct pvclock_vcpu_time_info *src)
+cycle_t __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src, u64 tsc)
 {
-	u64 delta = rdtsc_ordered() - src->tsc_timestamp;
-	return pvclock_scale_delta(delta, src->tsc_to_system_mul,
-				   src->tsc_shift);
-}
-
-static __always_inline
-unsigned __pvclock_read_cycles(const struct pvclock_vcpu_time_info *src,
-			       cycle_t *cycles, u8 *flags)
-{
-	unsigned version;
-	cycle_t ret, offset;
-	u8 ret_flags;
-
-	version = src->version;
-
-	offset = pvclock_get_nsec_offset(src);
-	ret = src->system_time + offset;
-	ret_flags = src->flags;
-
-	*cycles = ret;
-	*flags = ret_flags;
-	return version;
+	u64 delta = tsc - src->tsc_timestamp;
+	cycle_t offset = pvclock_scale_delta(delta, src->tsc_to_system_mul,
+					     src->tsc_shift);
+	return src->system_time + offset;
 }
 
 struct pvclock_vsyscall_time_info {
--- a/arch/x86/include/asm/signal.h
+++ b/arch/x86/include/asm/signal.h
@@ -23,6 +23,10 @@ typedef struct {
 	unsigned long sig[_NSIG_WORDS];
 } sigset_t;
 
+/* non-uapi, in-kernel SA_FLAGS that indicate the ABI of a signal frame */
+#define SA_IA32_ABI	0x02000000u
+#define SA_X32_ABI	0x01000000u
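+/*
+ * Both flags are set/cleared by sigaction_compat_abi() and are filtered
+ * out of anything copied back to userspace.
+ */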
+
 #ifndef CONFIG_COMPAT
 typedef sigset_t compat_sigset_t;
 #endif
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -27,11 +27,12 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t
    function. */
 
 #define __HAVE_ARCH_MEMCPY 1
+extern void *__memcpy(void *to, const void *from, size_t len);
+
 #ifndef CONFIG_KMEMCHECK
 #if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
 extern void *memcpy(void *to, const void *from, size_t len);
 #else
-extern void *__memcpy(void *to, const void *from, size_t len);
 #define memcpy(dst, src, len)					\
 ({								\
 	size_t __len = (len);					\
@@ -53,9 +54,11 @@ extern void *__memcpy(void *to, const void *from, size_t len);
 
 #define __HAVE_ARCH_MEMSET
 void *memset(void *s, int c, size_t n);
+void *__memset(void *s, int c, size_t n);
 
 #define __HAVE_ARCH_MEMMOVE
 void *memmove(void *dest, const void *src, size_t count);
+void *__memmove(void *dest, const void *src, size_t count);
 
 int memcmp(const void *cs, const void *ct, size_t count);
 size_t strlen(const char *s);
@@ -63,6 +66,19 @@ char *strcpy(char *dest, const char *src);
 char *strcat(char *dest, const char *src);
 int strcmp(const char *cs, const char *ct);
 
+#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)
+
+/*
+ * For files that are not instrumented (e.g. mm/slub.c) we
+ * should use the non-instrumented versions of the mem* functions.
+ */
+
+#undef memcpy
+#define memcpy(dst, src, len) __memcpy(dst, src, len)
+#define memmove(dst, src, len) __memmove(dst, src, len)
+#define memset(s, c, n) __memset(s, c, n)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_X86_STRING_64_H */
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -37,7 +37,6 @@ asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
 asmlinkage long sys32_pwrite(unsigned int, const char __user *, u32, u32, u32);
 
 long sys32_fadvise64_64(int, __u32, __u32, __u32, __u32, int);
-long sys32_vm86_warning(void);
 
 asmlinkage ssize_t sys32_readahead(int, unsigned, unsigned, size_t);
 asmlinkage long sys32_sync_file_range(int, unsigned, unsigned,
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -12,6 +12,33 @@
 #include <asm/types.h>
 
 /*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack.  We do it because of a nasty
+ * 32-bit corner case.  On x86_32, the hardware stack frame is
+ * variable-length.  Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame.  If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist.  Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0.  On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault.  (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
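+
+/*
+ * See task_pt_regs(): the saved pt_regs are laid out so that they end
+ * TOP_OF_KERNEL_STACK_PADDING bytes below the top of the stack page.
+ */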
+
+/*
  * low level task data that entry.S needs immediate access to
  * - this struct should fit entirely inside of one cache line
  * - this struct shares the supervisor stack pages
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -116,6 +116,8 @@ asmlinkage void smp_threshold_interrupt(void);
 asmlinkage void smp_deferred_error_interrupt(void);
 #endif
 
+void do_cpuid_fault(struct pt_regs *);
+
 /* Interrupts/Exceptions */
 enum {
 	X86_TRAP_DE = 0,	/*  0, Divide-by-zero */
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -5,6 +5,7 @@
  */
 #include <linux/errno.h>
 #include <linux/compiler.h>
+#include <linux/kasan-checks.h>
 #include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm.h>
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -7,6 +7,7 @@
 #include <linux/compiler.h>
 #include <linux/errno.h>
 #include <linux/lockdep.h>
+#include <linux/kasan-checks.h>
 #include <asm/alternative.h>
 #include <asm/cpufeature.h>
 #include <asm/page.h>
@@ -59,6 +60,7 @@ static inline unsigned long __must_check copy_from_user(void *to,
 	int sz = __compiletime_object_size(to);
 
 	might_fault();
+	kasan_check_write(to, n);
 	if (likely(sz == -1 || sz >= n))
 		n = _copy_from_user(to, from, n);
 #ifdef CONFIG_DEBUG_VM
@@ -72,7 +74,7 @@ static __always_inline __must_check
 int copy_to_user(void __user *dst, const void *src, unsigned size)
 {
 	might_fault();
-
+	kasan_check_read(src, size);
 	return _copy_to_user(dst, src, size);
 }
 
@@ -81,6 +83,7 @@ int __copy_from_user(void *dst, const void __user *src, unsigned size)
 {
 	int ret = 0;
 
+	kasan_check_write(dst, size);
 	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic(dst, (__force void *)src, size);
@@ -125,6 +128,7 @@ int __copy_to_user(void __user *dst, const void *src, unsigned size)
 {
 	int ret = 0;
 
+	kasan_check_read(src, size);
 	might_fault();
 	if (!__builtin_constant_p(size))
 		return copy_user_generic((__force void *)dst, src, size);
@@ -220,12 +224,14 @@ int __copy_in_user(void __user *dst, const void __user *src, unsigned size)
 static __must_check __always_inline int
 __copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
 {
+	kasan_check_write(dst, size);
 	return copy_user_generic(dst, (__force const void *)src, size);
 }
 
 static __must_check __always_inline int
 __copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
 {
+	kasan_check_read(src, size);
 	return copy_user_generic((__force void *)dst, src, size);
 }
 
@@ -236,6 +242,7 @@ static inline int
 __copy_from_user_nocache(void *dst, const void __user *src, unsigned size)
 {
 	might_fault();
+	kasan_check_write(dst, size);
 	return __copy_user_nocache(dst, src, size, 1);
 }
 
@@ -243,6 +250,7 @@ static inline int
 __copy_from_user_inatomic_nocache(void *dst, const void __user *src,
 				  unsigned size)
 {
+	kasan_check_write(dst, size);
 	return __copy_user_nocache(dst, src, size, 0);
 }
 
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,6 +1,20 @@
 #ifndef _ASM_X86_VDSO_H
 #define _ASM_X86_VDSO_H
 
+#ifdef CONFIG_X86_64
+extern const char VDSO64_PRELINK[];
+
+/*
+ * Given a pointer to the vDSO image, find the pointer to VDSO64_name
+ * as that symbol is defined in the vDSO sources or linker script.
+ */
+#define VDSO64_SYMBOL(base, name)					\
+({									\
+	extern const char VDSO64_##name[];				\
+	(void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
+})
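+
+/*
+ * E.g. VDSO64_SYMBOL(vdso_base, foo) resolves the address of a symbol
+ * exported from the vDSO linker script as VDSO64_foo ("foo" is only an
+ * illustration, not a real symbol).
+ */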
+#endif
+
 #if defined CONFIG_X86_32 || defined CONFIG_COMPAT
 extern const char VDSO32_PRELINK[];
 
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -21,6 +21,7 @@ struct vsyscall_gtod_data {
 	u64		monotonic_time_snsec;
 	time_t		monotonic_time_sec;
 
+	int		gettime_monotonic_enabled;
 	struct timezone sys_tz;
 	struct timespec wall_time_coarse;
 	struct timespec monotonic_time_coarse;
--- a/arch/x86/include/asm/vvar.h
+++ b/arch/x86/include/asm/vvar.h
@@ -45,6 +45,7 @@
 
 DECLARE_VVAR(0, volatile unsigned long, jiffies)
 DECLARE_VVAR(16, int, vgetcpu_mode)
+DECLARE_VVAR(64, volatile unsigned long, fence_wdog_jiffies64)
 DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
 
 #undef DECLARE_VVAR
--- a/arch/x86/include/uapi/asm/hyperv.h
+++ b/arch/x86/include/uapi/asm/hyperv.h
@@ -151,6 +151,12 @@
 /* MSR used to provide vcpu index */
 #define HV_X64_MSR_VP_INDEX			0x40000002
 
+/* MSR used to reset the guest OS. */
+#define HV_X64_MSR_RESET			0x40000003
+
+/* MSR used to provide vcpu runtime in 100ns units */
+#define HV_X64_MSR_VP_RUNTIME			0x40000010
+
 /* MSR used to read the per-partition time reference counter */
 #define HV_X64_MSR_TIME_REF_COUNT		0x40000020
 
@@ -201,13 +207,38 @@
 #define HV_X64_MSR_STIMER3_CONFIG		0x400000B6
 #define HV_X64_MSR_STIMER3_COUNT		0x400000B7
 
+/* Hyper-V guest crash notification MSRs */
+#define HV_X64_MSR_CRASH_P0			0x40000100
+#define HV_X64_MSR_CRASH_P1			0x40000101
+#define HV_X64_MSR_CRASH_P2			0x40000102
+#define HV_X64_MSR_CRASH_P3			0x40000103
+#define HV_X64_MSR_CRASH_P4			0x40000104
+#define HV_X64_MSR_CRASH_CTL			0x40000105
+#define HV_X64_MSR_CRASH_CTL_NOTIFY		(1ULL << 63)
+#define HV_X64_MSR_CRASH_PARAMS		\
+		(1 + (HV_X64_MSR_CRASH_P4 - HV_X64_MSR_CRASH_P0))
+
 #define HV_X64_MSR_HYPERCALL_ENABLE		0x00000001
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT	12
 #define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK	\
 		(~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
 
 /* Declare the various hypercall operations. */
-#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT		0x0008
+#define HVCALL_NOTIFY_LONG_SPIN_WAIT		0x0008
+#define HVCALL_POST_MESSAGE			0x005c
+#define HVCALL_SIGNAL_EVENT			0x005d
 
 #define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE		0x00000001
 #define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT	12
@@ -250,4 +281,96 @@ typedef struct _HV_REFERENCE_TSC_PAGE {
 #define HV_SYNIC_SINT_AUTO_EOI		(1ULL << 17)
 #define HV_SYNIC_SINT_VECTOR_MASK	(0xFF)
 
+#define HV_SYNIC_STIMER_COUNT		(4)
+
+/* Define synthetic interrupt controller message constants. */
+#define HV_MESSAGE_SIZE			(256)
+#define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
+#define HV_MESSAGE_PAYLOAD_QWORD_COUNT	(30)
+
+/* Define hypervisor message types. */
+enum hv_message_type {
+	HVMSG_NONE			= 0x00000000,
+
+	/* Memory access messages. */
+	HVMSG_UNMAPPED_GPA		= 0x80000000,
+	HVMSG_GPA_INTERCEPT		= 0x80000001,
+
+	/* Timer notification messages. */
+	HVMSG_TIMER_EXPIRED			= 0x80000010,
+
+	/* Error messages. */
+	HVMSG_INVALID_VP_REGISTER_VALUE	= 0x80000020,
+	HVMSG_UNRECOVERABLE_EXCEPTION	= 0x80000021,
+	HVMSG_UNSUPPORTED_FEATURE		= 0x80000022,
+
+	/* Trace buffer complete messages. */
+	HVMSG_EVENTLOG_BUFFERCOMPLETE	= 0x80000040,
+
+	/* Platform-specific processor intercept messages. */
+	HVMSG_X64_IOPORT_INTERCEPT		= 0x80010000,
+	HVMSG_X64_MSR_INTERCEPT		= 0x80010001,
+	HVMSG_X64_CPUID_INTERCEPT		= 0x80010002,
+	HVMSG_X64_EXCEPTION_INTERCEPT	= 0x80010003,
+	HVMSG_X64_APIC_EOI			= 0x80010004,
+	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005
+};
+
+/* Define synthetic interrupt controller message flags. */
+union hv_message_flags {
+	__u8 asu8;
+	struct {
+		__u8 msg_pending:1;
+		__u8 reserved:7;
+	};
+};
+
+/* Define port identifier type. */
+union hv_port_id {
+	__u32 asu32;
+	struct {
+		__u32 id:24;
+		__u32 reserved:8;
+	} u;
+};
+
+/* Define synthetic interrupt controller message header. */
+struct hv_message_header {
+	__u32 message_type;
+	__u8 payload_size;
+	union hv_message_flags message_flags;
+	__u8 reserved[2];
+	union {
+		__u64 sender;
+		union hv_port_id port;
+	};
+};
+
+/* Define synthetic interrupt controller message format. */
+struct hv_message {
+	struct hv_message_header header;
+	union {
+		__u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
+	} u;
+};
+
+/* Define the synthetic interrupt message page layout. */
+struct hv_message_page {
+	struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
+};
+
+/* Define timer message payload structure. */
+struct hv_timer_message_payload {
+	__u32 timer_index;
+	__u32 reserved;
+	__u64 expiration_time;	/* When the timer expired */
+	__u64 delivery_time;	/* When the message was delivered */
+};
+
+#define HV_STIMER_ENABLE		(1ULL << 0)
+#define HV_STIMER_PERIODIC		(1ULL << 1)
+#define HV_STIMER_LAZY			(1ULL << 2)
+#define HV_STIMER_AUTOENABLE		(1ULL << 3)
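+/* The timer's target SINTx is kept in config bits 19:16: */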
+#define HV_STIMER_SINT(config)		(__u8)(((config) >> 16) & 0x0F)
+
 #endif
--- a/arch/x86/include/uapi/asm/prctl.h
+++ b/arch/x86/include/uapi/asm/prctl.h
@@ -6,4 +6,10 @@
 #define ARCH_GET_FS 0x1003
 #define ARCH_GET_GS 0x1004
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+# define ARCH_MAP_VDSO_X32     0x2001
+# define ARCH_MAP_VDSO_32      0x2002
+# define ARCH_MAP_VDSO_64      0x2003
+#endif
+
 #endif /* _ASM_X86_PRCTL_H */
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -22,6 +22,16 @@ OBJECT_FILES_NON_STANDARD_mcount_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
 OBJECT_FILES_NON_STANDARD_entry_$(BITS).o		:= y
 
+KASAN_SANITIZE_head$(BITS).o := n
+KASAN_SANITIZE_dumpstack.o := n
+KASAN_SANITIZE_dumpstack_$(BITS).o := n
+
+# If instrumentation of this dir is enabled, boot hangs during the first second.
+# Probably could be more selective here, but note that files related to irqs,
+# boot, dumpstack/stacktrace, etc. are either non-interesting or can lead to
+# non-deterministic coverage.
+KCOV_INSTRUMENT		:= n
+
 CFLAGS_irq.o := -I$(src)/../include/asm/trace
 
 obj-y			:= process_$(BITS).o signal.o entry_$(BITS).o
@@ -43,6 +53,7 @@ obj-y			+= alternative.o i8253.o pci-nommu.o hw_breakpoint.o
 obj-y			+= tsc.o tsc_msr.o io_delay.o rtc.o
 obj-y			+= pci-iommu_table.o
 obj-y			+= resource.o
+obj-y			+= cpuid_fault.o
 
 obj-y				+= process.o
 obj-y				+= i387.o xsave.o
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,6 +2,10 @@
 # Makefile for local APIC drivers and for the IO-APIC code
 #
 
+# Leads to non-deterministic coverage that is not a function of syscall inputs.
+# In particular, smp_apic_timer_interrupt() is called in random places.
+KCOV_INSTRUMENT		:= n
+
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o apic_noop.o ipi.o
 obj-y				+= hw_nmi.o
 
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -8,6 +8,10 @@ CFLAGS_REMOVE_common.o = -pg
 CFLAGS_REMOVE_perf_event.o = -pg
 endif
 
+# If these files are instrumented, boot hangs during the first second.
+KCOV_INSTRUMENT_common.o := n
+KCOV_INSTRUMENT_perf_event.o := n
+
 # Make sure load_percpu_segment has no stackprotector
 nostackp := $(call cc-option, -fno-stack-protector)
 CFLAGS_common.o		:= $(nostackp)
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -402,6 +402,31 @@ static void detect_vmx_virtcap(struct cpuinfo_x86 *c)
 	}
 }
 
+static void intel_set_cpuid_faulting(bool enable)
+{
+	unsigned int l1, l2;
+
+	rdmsr(MSR_MISC_FEATURES_ENABLES, l1, l2);
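+	/* Bit 0 of MSR_MISC_FEATURES_ENABLES toggles cpuid faulting */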
+	l1 &= ~1;
+	if (enable)
+		l1 |= 1;
+	wrmsr(MSR_MISC_FEATURES_ENABLES, l1, l2);
+}
+
+static void intel_cpuid_faulting_init(struct cpuinfo_x86 *c)
+{
+	unsigned int l1, l2;
+
+	if (rdmsr_safe(MSR_PLATFORM_INFO, &l1, &l2) != 0 ||
+	    !(l1 & (1 << 31)))
+		return;
+
+	set_cpu_cap(c, X86_FEATURE_CPUID_FAULTING);
+	set_cpuid_faulting_cb = intel_set_cpuid_faulting;
+
+	intel_set_cpuid_faulting(false);
+}
+
 static void init_intel(struct cpuinfo_x86 *c)
 {
 	unsigned int l2 = 0;
@@ -517,6 +542,8 @@ static void init_intel(struct cpuinfo_x86 *c)
 			wrmsrl(MSR_IA32_ENERGY_PERF_BIAS, epb);
 		}
 	}
+
+	intel_cpuid_faulting_init(c);
 }
 
 #ifdef CONFIG_X86_32
--- a/arch/x86/kernel/cpu/proc.c
+++ b/arch/x86/kernel/cpu/proc.c
@@ -3,6 +3,7 @@
 #include <linux/string.h>
 #include <linux/seq_file.h>
 #include <linux/cpufreq.h>
+#include <linux/sched.h>
 
 /*
  *	Get CPU information for use by the procfs.
@@ -51,10 +52,58 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c)
 }
 #endif
 
+extern void __do_cpuid_fault(unsigned int op, unsigned int count,
+			     unsigned int *eax, unsigned int *ebx,
+			     unsigned int *ecx, unsigned int *edx);
+
+struct cpu_flags {
+	u32 val[NCAPINTS];
+};
+
+static DEFINE_PER_CPU(struct cpu_flags, cpu_flags);
+
+static void init_cpu_flags(void *dummy)
+{
+	int cpu = smp_processor_id();
+	struct cpu_flags *flags = &per_cpu(cpu_flags, cpu);
+	struct cpuinfo_x86 *c = &cpu_data(cpu);
+	unsigned int eax, ebx, ecx, edx;
+
+	memcpy(flags->val, c->x86_capability, NCAPINTS * sizeof(u32));
+
+	/*
+	 * Clear feature bits masked using cpuid masking/faulting.
+	 */
+
+	if (c->cpuid_level >= 0x00000001) {
+		__do_cpuid_fault(0x00000001, 0, &eax, &ebx, &ecx, &edx);
+		flags->val[4] &= ecx;
+		flags->val[0] &= edx;
+	}
+
+	if (c->cpuid_level >= 0x00000007) {
+		__do_cpuid_fault(0x00000007, 0, &eax, &ebx, &ecx, &edx);
+		flags->val[9] &= ebx;
+	}
+
+	if ((c->extended_cpuid_level & 0xffff0000) == 0x80000000 &&
+	    c->extended_cpuid_level >= 0x80000001) {
+		__do_cpuid_fault(0x80000001, 0, &eax, &ebx, &ecx, &edx);
+		flags->val[6] &= ecx;
+		flags->val[1] &= edx;
+	}
+
+	if (c->cpuid_level >= 0x0000000d) {
+		__do_cpuid_fault(0x0000000d, 1, &eax, &ebx, &ecx, &edx);
+		flags->val[10] &= eax;
+	}
+}
+
 static int show_cpuinfo(struct seq_file *m, void *v)
 {
 	struct cpuinfo_x86 *c = v;
 	unsigned int cpu;
+	int is_super = ve_is_super(get_exec_env());
 	int i;
 
 	cpu = c->cpu_index;
@@ -81,6 +130,7 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 		if (!freq)
 			freq = cpu_khz;
+		freq = sched_cpulimit_scale_cpufreq(freq);
 		seq_printf(m, "cpu MHz\t\t: %u.%03u\n",
 			   freq / 1000, (freq % 1000));
 	}
@@ -94,7 +144,10 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 
 	seq_printf(m, "flags\t\t:");
 	for (i = 0; i < 32*NCAPINTS; i++)
-		if (cpu_has(c, i) && x86_cap_flags[i] != NULL)
+		if (x86_cap_flags[i] != NULL &&
+				((is_super && cpu_has(c, i)) ||
+				 (!is_super && test_bit(i, (unsigned long *)
+							&per_cpu(cpu_flags, cpu)))))
 			seq_printf(m, " %s", x86_cap_flags[i]);
 
 	seq_printf(m, "\nbogomips\t: %lu.%02lu\n",
@@ -128,18 +181,24 @@ static int show_cpuinfo(struct seq_file *m, void *v)
 	return 0;
 }
 
-static void *c_start(struct seq_file *m, loff_t *pos)
+static void *__c_start(struct seq_file *m, loff_t *pos)
 {
 	*pos = cpumask_next(*pos - 1, cpu_online_mask);
-	if ((*pos) < nr_cpu_ids)
+	if (__cpus_weight(cpu_online_mask, *pos) < num_online_vcpus())
 		return &cpu_data(*pos);
 	return NULL;
 }
 
+static void *c_start(struct seq_file *m, loff_t *pos)
+{
+	on_each_cpu(init_cpu_flags, NULL, 1);
+	return __c_start(m, pos);
+}
+
 static void *c_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	(*pos)++;
-	return c_start(m, pos);
+	return __c_start(m, pos);
 }
 
 static void c_stop(struct seq_file *m, void *v)
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -1,5 +1,6 @@
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/ve.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
 #include "cpu.h"
--- /dev/null
+++ b/arch/x86/kernel/cpuid_fault.c
@@ -0,0 +1,315 @@
+/*
+ *  arch/x86/kernel/cpuid_fault.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/notifier.h>
+#include <linux/reboot.h>
+#include <linux/ve.h>
+#include <asm/uaccess.h>
+
+struct cpuid_override_entry {
+	unsigned int op;
+	unsigned int count;
+	bool has_count;
+	unsigned int eax;
+	unsigned int ebx;
+	unsigned int ecx;
+	unsigned int edx;
+};
+
+#define MAX_CPUID_OVERRIDE_ENTRIES	16
+
+struct cpuid_override_table {
+	struct rcu_head rcu_head;
+	int size;
+	struct cpuid_override_entry entries[MAX_CPUID_OVERRIDE_ENTRIES];
+};
+
+static struct cpuid_override_table __rcu *cpuid_override __read_mostly;
+static DEFINE_SPINLOCK(cpuid_override_lock);
+
+#define cpuid_override_active		(!!rcu_access_pointer(cpuid_override))
+
+void (*set_cpuid_faulting_cb)(bool enable);
+static DEFINE_PER_CPU(bool, cpuid_faulting_enabled);
+
+void set_cpuid_faulting(bool enable)
+{
+	bool *enabled;
+
+	if (!cpu_has_cpuid_faulting)
+		return;
+	if (!cpuid_override_active)
+		enable = false;
+
+	enabled = &get_cpu_var(cpuid_faulting_enabled);
+	if (*enabled != enable) {
+		set_cpuid_faulting_cb(enable);
+		*enabled = enable;
+	}
+	put_cpu_var(cpuid_faulting_enabled);
+}
+EXPORT_SYMBOL(set_cpuid_faulting);
+
+static void cpuid_override_update(struct cpuid_override_table *new_table)
+{
+	struct cpuid_override_table *old_table;
+
+	spin_lock(&cpuid_override_lock);
+	old_table = rcu_access_pointer(cpuid_override);
+	rcu_assign_pointer(cpuid_override, new_table);
+	spin_unlock(&cpuid_override_lock);
+
+	if (old_table)
+		kfree_rcu(old_table, rcu_head);
+}
+
+static bool cpuid_override_match(unsigned int op, unsigned int count,
+				 unsigned int *eax, unsigned int *ebx,
+				 unsigned int *ecx, unsigned int *edx)
+{
+	bool ret = false;
+	struct cpuid_override_table *t;
+	struct cpuid_override_entry *e;
+	int i;
+
+	rcu_read_lock();
+	t = rcu_dereference(cpuid_override);
+	if (!t)
+		goto out;
+
+	for (i = 0; i < t->size; i++) {
+		e = &t->entries[i];
+		if (e->op != op)
+			continue;
+		if (e->has_count && e->count != count)
+			continue;
+		*eax = e->eax;
+		*ebx = e->ebx;
+		*ecx = e->ecx;
+		*edx = e->edx;
+		ret = true;
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return ret;
+}
+
+void __do_cpuid_fault(unsigned int op, unsigned int count,
+		      unsigned int *eax, unsigned int *ebx,
+		      unsigned int *ecx, unsigned int *edx)
+{
+	/* check if op is overridden */
+	if (cpuid_override_match(op, count, eax, ebx, ecx, edx))
+		return;
+
+	/* fallback to real cpuid */
+	cpuid_count(op, count, eax, ebx, ecx, edx);
+}
+
+void do_cpuid_fault(struct pt_regs *regs)
+{
+	unsigned int eax, ebx, ecx, edx;
+
+	__do_cpuid_fault(regs->ax, regs->cx, &eax, &ebx, &ecx, &edx);
+
+	regs->ax = eax;
+	regs->bx = ebx;
+	regs->cx = ecx;
+	regs->dx = edx;
+}
+
+/*
+ * CPUID override entry format:
+ *
+ * op[ count]: eax ebx ecx edx
+ *
+ * All values are in HEX.
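+ *
+ * Example (made-up values):
+ *
+ *	4 1: 1c004121 1c0003f 3f 0
+ *
+ * overrides CPUID leaf 0x4, subleaf 0x1.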
+ */
+static int cpuid_override_entry_parse(const char *s, char **endp,
+				      struct cpuid_override_entry *e)
+{
+	int taken;
+	char *end;
+
+	if (sscanf(s, "%x %x: %x %x %x %x%n",
+		   &e->op, &e->count, &e->eax, &e->ebx, &e->ecx, &e->edx,
+		   &taken) == 6)
+		e->has_count = true;
+	else if (sscanf(s, "%x: %x %x %x %x%n",
+			&e->op, &e->eax, &e->ebx, &e->ecx, &e->edx,
+			&taken) == 5)
+		e->has_count = false;
+	else
+		return -EINVAL;
+
+	end = (char *)s + taken;
+	if (*end) {
+		if (*end != '\n')
+			return -EINVAL;
+		++end;
+	}
+	*endp = end;
+	return 0;
+}
+
+static ssize_t cpuid_override_write(struct file *file, const char __user *buf,
+				    size_t count, loff_t *ppos)
+{
+	struct cpuid_override_table *t = NULL;
+	void *page = NULL;
+	char *s;
+	int err;
+
+	err = -E2BIG;
+	if (count >= PAGE_SIZE)
+		goto out;
+
+	err = -ENOMEM;
+	t = kmalloc(sizeof(*t), GFP_KERNEL);
+	if (!t)
+		goto out;
+
+	page = (void *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+
+	if (copy_from_user(page, buf, count)) {
+		err = -EFAULT;
+		goto out;
+	}
+
+	s = page;
+	s[count] = '\0';
+	t->size = 0;
+	while (*(s = skip_spaces(s))) {
+		err = -E2BIG;
+		if (t->size == MAX_CPUID_OVERRIDE_ENTRIES)
+			goto out;
+		err = -EINVAL;
+		if (cpuid_override_entry_parse(s, &s, &t->entries[t->size++]))
+			goto out;
+	}
+	if (!t->size) {
+		kfree(t);
+		t = NULL;
+	}
+	err = 0;
+out:
+	free_page((unsigned long)page);
+
+	if (!err)
+		cpuid_override_update(t);
+	else
+		kfree(t);
+
+	return err ?: count;
+}
+
+static void *__cpuid_override_seq_start(loff_t pos)
+{
+	struct cpuid_override_table *t = rcu_dereference(cpuid_override);
+	return t && pos < t->size ? &t->entries[pos] : NULL;
+}
+
+static void *cpuid_override_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	rcu_read_lock();
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void *cpuid_override_seq_next(struct seq_file *seq,
+				     void *v, loff_t *ppos)
+{
+	++*ppos;
+	return __cpuid_override_seq_start(*ppos);
+}
+
+static void cpuid_override_seq_stop(struct seq_file *s, void *v)
+{
+	rcu_read_unlock();
+}
+
+static int cpuid_override_seq_show(struct seq_file *s, void *v)
+{
+	struct cpuid_override_entry *e = v;
+
+	seq_printf(s, "0x%08x", e->op);
+	if (e->has_count)
+		seq_printf(s, " 0x%08x", e->count);
+	seq_printf(s, ": 0x%08x 0x%08x 0x%08x 0x%08x\n",
+		   e->eax, e->ebx, e->ecx, e->edx);
+	return 0;
+}
+
+static struct seq_operations cpuid_override_seq_ops = {
+	.start = cpuid_override_seq_start,
+	.next  = cpuid_override_seq_next,
+	.stop  = cpuid_override_seq_stop,
+	.show  = cpuid_override_seq_show,
+};
+
+static int cpuid_override_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &cpuid_override_seq_ops);
+}
+
+static struct file_operations proc_cpuid_override_ops = {
+	.owner   = THIS_MODULE,
+	.open    = cpuid_override_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = cpuid_override_write,
+};
+
+static void disable_cpuid_faulting_fn(void *unused)
+{
+	set_cpuid_faulting(false);
+}
+
+static int cpuid_faulting_reboot_notify(struct notifier_block *nb,
+					unsigned long code, void *unused)
+{
+	if (code == SYS_RESTART) {
+		/*
+		 * Disable cpuid faulting before loading a new kernel by kexec
+		 * in case the new kernel does not support this feature.
+		 */
+		cpuid_override_update(NULL);
+		on_each_cpu(disable_cpuid_faulting_fn, NULL, 1);
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block cpuid_faulting_reboot_nb = {
+	.notifier_call = cpuid_faulting_reboot_notify,
+};
+
+static int __init cpuid_fault_init(void)
+{
+	struct proc_dir_entry *proc;
+
+	if (!cpu_has_cpuid_faulting)
+		return 0;
+
+	register_reboot_notifier(&cpuid_faulting_reboot_nb);
+
+	proc = proc_create("cpuid_override", 0644, proc_vz_dir,
+			   &proc_cpuid_override_ops);
+	if (!proc)
+		return -ENOMEM;
+
+	return 0;
+}
+module_init(cpuid_fault_init);
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -258,7 +258,10 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
 	printk("SMP ");
 #endif
 #ifdef CONFIG_DEBUG_PAGEALLOC
-	printk("DEBUG_PAGEALLOC");
+	printk("DEBUG_PAGEALLOC ");
+#endif
+#ifdef CONFIG_KASAN
+	printk("KASAN");
 #endif
 	printk("\n");
 	if (notify_die(DIE_OOPS, str, regs, err,
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -409,7 +409,7 @@ sysenter_past_esp:
 	 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
-	 * pushed above; +8 corresponds to copy_thread's esp0 setting.
+	 * pushed above; +TOP_OF_KERNEL_STACK_PADDING corresponds to
+	 * copy_thread's esp0 setting.
 	 */
-	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+	pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp)
 	CFI_REL_OFFSET eip, 0
 
 	pushl_cfi %eax
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -27,6 +27,7 @@
 #include <asm/bios_ebda.h>
 #include <asm/bootparam_utils.h>
 #include <asm/microcode.h>
+#include <asm/kasan.h>
 
 /*
  * Manage page tables very early on.
@@ -46,7 +47,7 @@ static void __init reset_early_page_tables(void)
 
 	next_early_pgt = 0;
 
-	write_cr3(__pa(early_level4_pgt));
+	write_cr3(__pa_nodebug(early_level4_pgt));
 }
 
 /* Create a new PMD entry */
@@ -59,7 +60,7 @@ int __init early_make_pgtable(unsigned long address)
 	pmdval_t pmd, *pmd_p;
 
 	/* Invalid address or early pgt is done ?  */
-	if (physaddr >= MAXMEM || read_cr3() != __pa(early_level4_pgt))
+	if (physaddr >= MAXMEM || read_cr3() != __pa_nodebug(early_level4_pgt))
 		return -1;
 
 again:
@@ -158,9 +159,12 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	/* Kill off the identity-map trampoline */
 	reset_early_page_tables();
 
-	/* clear bss before set_intr_gate with early_idt_handler */
 	clear_bss();
 
+	clear_page(init_level4_pgt);
+
+	kasan_early_init();
+
 	for (i = 0; i < NUM_EXCEPTION_VECTORS; i++)
 		set_intr_gate(i, early_idt_handlers[i]);
 	load_idt((const struct desc_ptr *)&idt_descr);
@@ -175,7 +179,6 @@ void __init x86_64_start_kernel(char * real_mode_data)
 	if (console_loglevel == 10)
 		early_printk("Kernel alive\n");
 
-	clear_page(init_level4_pgt);
 	/* set init_level4_pgt kernel high mapping*/
 	init_level4_pgt[511] = early_level4_pgt[511];
 
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -14,6 +14,9 @@
 #include <linux/smp.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ratelimit.h>
+#include <linux/slab.h>
 
 #include <asm/ldt.h>
 #include <asm/desc.h>
@@ -39,9 +42,9 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 	mincount = (mincount + (PAGE_SIZE / LDT_ENTRY_SIZE - 1)) &
 			(~(PAGE_SIZE / LDT_ENTRY_SIZE - 1));
 	if (mincount * LDT_ENTRY_SIZE > PAGE_SIZE)
-		newldt = vmalloc(mincount * LDT_ENTRY_SIZE);
+		newldt = vmalloc_account(mincount * LDT_ENTRY_SIZE);
 	else
-		newldt = (void *)__get_free_page(GFP_KERNEL);
+		newldt = (void *)__get_free_page(GFP_KERNEL_ACCOUNT);
 
 	if (!newldt)
 		return -ENOMEM;
@@ -80,7 +83,7 @@ static int alloc_ldt(mm_context_t *pc, int mincount, int reload)
 		if (oldsize * LDT_ENTRY_SIZE > PAGE_SIZE)
 			vfree(oldldt);
 		else
-			put_page(virt_to_page(oldldt));
+			__free_page(virt_to_page(oldldt));
 	}
 	return 0;
 }
@@ -135,7 +138,7 @@ void destroy_context(struct mm_struct *mm)
 		if (mm->context.size * LDT_ENTRY_SIZE > PAGE_SIZE)
 			vfree(mm->context.ldt);
 		else
-			put_page(virt_to_page(mm->context.ldt));
+			__free_page(virt_to_page(mm->context.ldt));
 		mm->context.size = 0;
 	}
 }
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
 #include <linux/fs.h>
 #include <linux/string.h>
 #include <linux/kernel.h>
+#include <linux/kasan.h>
 #include <linux/bug.h>
 #include <linux/mm.h>
 #include <linux/gfp.h>
@@ -45,11 +46,18 @@ do {							\
 
 void *module_alloc(unsigned long size)
 {
+	void *p;
+
 	if (PAGE_ALIGN(size) > MODULES_LEN)
 		return NULL;
-	return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
+	p = __vmalloc_node_range(size, MODULE_ALIGN, MODULES_VADDR, MODULES_END,
 				GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
-				-1, __builtin_return_address(0));
+				0, NUMA_NO_NODE, __builtin_return_address(0));
+	if (p && (kasan_module_alloc(p, size) < 0)) {
+		vfree(p);
+		return NULL;
+	}
+	return p;
 }
 
 #ifdef CONFIG_X86_32
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -6,7 +6,6 @@
 #include <linux/smp.h>
 #include <linux/prctl.h>
 #include <linux/slab.h>
-#include <linux/sched.h>
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
@@ -14,6 +13,8 @@
 #include <linux/user-return-notifier.h>
 #include <linux/dmi.h>
 #include <linux/utsname.h>
+#include <linux/ve.h>
+#include <generated/utsrelease.h>
 #include <linux/stackprotector.h>
 #include <linux/tick.h>
 #include <linux/cpuidle.h>
@@ -472,3 +473,58 @@ unsigned long arch_randomize_brk(struct mm_struct *mm)
 	return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
 }
 
+/*
+ * Called from fs/proc with a reference on @p to find the function
+ * which called into schedule(). This needs to be done carefully
+ * because the task might wake up and we might look at a stack
+ * changing under us.
+ */
+unsigned long get_wchan(struct task_struct *p)
+{
+	unsigned long start, bottom, top, sp, fp, ip;
+	int count = 0;
+
+	if (!p || p == current || p->state == TASK_RUNNING)
+		return 0;
+
+	start = (unsigned long)task_stack_page(p);
+	if (!start)
+		return 0;
+
+	/*
+	 * Layout of the stack page:
+	 *
+	 * ----------- topmax = start + THREAD_SIZE - sizeof(unsigned long)
+	 * PADDING
+	 * ----------- top = topmax - TOP_OF_KERNEL_STACK_PADDING
+	 * stack
+	 * ----------- bottom = start + sizeof(thread_info)
+	 * thread_info
+	 * ----------- start
+	 *
+	 * The task's stack pointer points at the location where the
+	 * frame pointer is stored. The data on the stack is:
+	 * ... IP FP ... IP FP
+	 *
+	 * We need to read FP and IP, so we need to adjust the upper
+	 * bound by another unsigned long.
+	 */
+	top = start + THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
+	top -= 2 * sizeof(unsigned long);
+	bottom = start + sizeof(struct thread_info);
+
+	sp = READ_ONCE(p->thread.sp);
+	if (sp < bottom || sp > top)
+		return 0;
+
+	fp = READ_ONCE_NOCHECK(*(unsigned long *)sp);
+	do {
+		if (fp < bottom || fp > top)
+			return 0;
+		ip = READ_ONCE_NOCHECK(*(unsigned long *)(fp + sizeof(unsigned long)));
+		if (!in_sched_functions(ip))
+			return ip;
+		fp = READ_ONCE_NOCHECK(*(unsigned long *)fp);
+	} while (count++ < 16 && p->state != TASK_RUNNING);
+	return 0;
+}
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -313,31 +313,3 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	return prev_p;
 }
-
-#define top_esp                (THREAD_SIZE - sizeof(unsigned long))
-#define top_ebp                (THREAD_SIZE - 2*sizeof(unsigned long))
-
-unsigned long get_wchan(struct task_struct *p)
-{
-	unsigned long bp, sp, ip;
-	unsigned long stack_page;
-	int count = 0;
-	if (!p || p == current || p->state == TASK_RUNNING)
-		return 0;
-	stack_page = (unsigned long)task_stack_page(p);
-	sp = p->thread.sp;
-	if (!stack_page || sp < stack_page || sp > top_esp+stack_page)
-		return 0;
-	/* include/asm-i386/system.h:switch_to() pushes bp last. */
-	bp = *(unsigned long *) sp;
-	do {
-		if (bp < stack_page || bp > top_ebp+stack_page)
-			return 0;
-		ip = *(unsigned long *) (bp+4);
-		if (!in_sched_functions(ip))
-			return ip;
-		bp = *(unsigned long *) bp;
-	} while (count++ < 16);
-	return 0;
-}
-
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -467,30 +467,6 @@ void set_personality_ia32(bool x32)
 }
 EXPORT_SYMBOL_GPL(set_personality_ia32);
 
-unsigned long get_wchan(struct task_struct *p)
-{
-	unsigned long stack;
-	u64 fp, ip;
-	int count = 0;
-
-	if (!p || p == current || p->state == TASK_RUNNING)
-		return 0;
-	stack = (unsigned long)task_stack_page(p);
-	if (p->thread.sp < stack || p->thread.sp >= stack+THREAD_SIZE)
-		return 0;
-	fp = *(u64 *)(p->thread.sp);
-	do {
-		if (fp < (unsigned long)stack ||
-		    fp >= (unsigned long)stack+THREAD_SIZE)
-			return 0;
-		ip = *(u64 *)(fp+8);
-		if (!in_sched_functions(ip))
-			return ip;
-		fp = *(u64 *)fp;
-	} while (count++ < 16);
-	return 0;
-}
-
 long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 {
 	int ret = 0;
@@ -578,6 +554,19 @@ long do_arch_prctl(struct task_struct *task, int code, unsigned long addr)
 		break;
 	}
 
+#ifdef CONFIG_CHECKPOINT_RESTORE
+	case ARCH_MAP_VDSO_32:
+		return do_map_compat_vdso(addr);
+
+	/*
+	 * The x32 and 64-bit vDSO remap API is omitted for simplicity:
+	 * we do need the 32-bit vDSO blob mapping to restore compat
+	 * applications, but not the x32/64 ones (at least, for now).
+	 */
+	case ARCH_MAP_VDSO_X32:
+	case ARCH_MAP_VDSO_64:
+#endif
+
 	default:
 		ret = -EINVAL;
 		break;
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1434,7 +1434,7 @@ void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
 const struct user_regset_view *task_user_regset_view(struct task_struct *task)
 {
 #ifdef CONFIG_IA32_EMULATION
-	if (test_tsk_thread_flag(task, TIF_IA32))
+	if (!user_64bit_mode(task_pt_regs(task)))
 #endif
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
 		return &user_x86_32_view;
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -61,12 +61,12 @@ void pvclock_resume(void)
 u8 pvclock_read_flags(struct pvclock_vcpu_time_info *src)
 {
 	unsigned version;
-	cycle_t ret;
 	u8 flags;
 
 	do {
-		version = __pvclock_read_cycles(src, &ret, &flags);
-	} while ((src->version & 1) || version != src->version);
+		version = pvclock_read_begin(src);
+		flags = src->flags;
+	} while (pvclock_read_retry(src, version));
 
 	return flags & valid_flags;
 }
@@ -79,8 +79,10 @@ cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
 	u8 flags;
 
 	do {
-		version = __pvclock_read_cycles(src, &ret, &flags);
-	} while ((src->version & 1) || version != src->version);
+		version = pvclock_read_begin(src);
+		ret = __pvclock_read_cycles(src, rdtsc_ordered());
+		flags = src->flags;
+	} while (pvclock_read_retry(src, version));
 
 	if (unlikely((flags & PVCLOCK_GUEST_STOPPED) != 0)) {
 		src->flags &= ~PVCLOCK_GUEST_STOPPED;
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -91,6 +91,7 @@
 #include <asm/cacheflush.h>
 #include <asm/processor.h>
 #include <asm/bugs.h>
+#include <asm/kasan.h>
 
 #include <asm/vsyscall.h>
 #include <asm/cpu.h>
@@ -1268,6 +1269,8 @@ void __init setup_arch(char **cmdline_p)
 
 	x86_init.paging.pagetable_init();
 
+	kasan_init();
+
 	if (boot_cpu_data.cpuid_level >= 0) {
 		/* A CPU has %cr4 if and only if it has CPUID */
 		mmu_cr4_features = read_cr4();
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -479,7 +479,7 @@ static int x32_setup_rt_frame(struct ksignal *ksig,
 		return -EFAULT;
 
 	if (ksig->ka.sa.sa_flags & SA_SIGINFO) {
-		if (copy_siginfo_to_user32(&frame->info, &ksig->info))
+		if (__copy_siginfo_to_user32(&frame->info, &ksig->info, true))
 			return -EFAULT;
 	}
 
@@ -612,12 +612,12 @@ setup_rt_frame(struct ksignal *ksig, struct pt_regs *regs)
 	compat_sigset_t *cset = (compat_sigset_t *) set;
 
 	/* Set up the stack frame */
-	if (is_ia32_frame()) {
+	if (is_ia32_frame(ksig)) {
 		if (ksig->ka.sa.sa_flags & SA_SIGINFO)
 			return ia32_setup_rt_frame(usig, ksig, cset, regs);
 		else
 			return ia32_setup_frame(usig, ksig, cset, regs);
-	} else if (is_x32_frame()) {
+	} else if (is_x32_frame(ksig)) {
 		return x32_setup_rt_frame(ksig, cset, regs);
 	} else {
 		return __setup_rt_frame(ksig->sig, ksig, set, regs);
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -143,7 +143,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
 		if (end - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -186,7 +186,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 		addr = PAGE_ALIGN(addr);
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-				(!vma || addr + len <= vma->vm_start))
+				(!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -331,6 +331,27 @@ exit_trap:
 	exception_exit(prev_state);
 }
 
+static int check_cpuid_fault(struct pt_regs *regs, long error_code)
+{
+	unsigned long addr;
+	unsigned short opcode;
+
+	if (error_code != 0)
+		return 0;
+
+	addr = convert_ip_to_linear(current, regs);
+	if (get_user(opcode, (unsigned short __user *)addr))
+		return 0;
+
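+	/* CPUID is 0x0f 0xa2; read as a little-endian u16 that is 0xa20f. */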
+	if (opcode != 0xa20f)
+		return 0;
+
+	do_cpuid_fault(regs);
+
+	regs->ip += 2;
+	return 1;
+}
+
 dotraplinkage void __kprobes
 do_general_protection(struct pt_regs *regs, long error_code)
 {
@@ -361,6 +382,9 @@ do_general_protection(struct pt_regs *regs, long error_code)
 		goto exit;
 	}
 
+	if (check_cpuid_fault(regs, error_code))
+		return;
+
 	tsk->thread.error_code = error_code;
 	tsk->thread.trap_nr = X86_TRAP_GP;
 
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -54,7 +54,7 @@
 #include "vsyscall_trace.h"
 
 DEFINE_VVAR(int, vgetcpu_mode);
-DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data);
+DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = { .gettime_monotonic_enabled = 0, };
 
 static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE;
 
@@ -327,6 +327,24 @@ sigsegv:
 	force_sig(SIGSEGV, current);
 	return true;
 }
+#ifdef CONFIG_SYSCTL
+static ctl_table kernel_table2[] = {
+	{ .procname = "vsyscall64_gettime_monotonic",
+	  .data = &vsyscall_gtod_data.gettime_monotonic_enabled, .maxlen = sizeof(int),
+	  .mode = 0644,
+	  .proc_handler = proc_dointvec
+	},
+	{
+	},
+};
+static ctl_table kernel_root_table2[] = {
+	{ .procname = "kernel", .mode = 0555,
+	  .child = kernel_table2
+	},
+	{
+	}
+};
+#endif
 
 /*
  * Assume __initcall executes before all user space. Hopefully kmod
@@ -394,6 +412,9 @@ static int __init vsyscall_init(void)
 {
 	BUG_ON(VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE));
 
+#ifdef CONFIG_SYSCTL
+	register_sysctl_table(kernel_root_table2);
+#endif
 	cpu_notifier_register_begin();
 
 	on_each_cpu(cpu_vsyscall_init, NULL, 1);
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -50,13 +50,19 @@ EXPORT_SYMBOL(csum_partial);
 #undef memset
 #undef memmove
 
+extern void *__memset(void *, int, __kernel_size_t);
+extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *__memmove(void *, const void *, __kernel_size_t);
 extern void *memset(void *, int, __kernel_size_t);
 extern void *memcpy(void *, const void *, __kernel_size_t);
-extern void *__memcpy(void *, const void *, __kernel_size_t);
+extern void *memmove(void *, const void *, __kernel_size_t);
+
+EXPORT_SYMBOL(__memset);
+EXPORT_SYMBOL(__memcpy);
+EXPORT_SYMBOL(__memmove);
 
 EXPORT_SYMBOL(memset);
 EXPORT_SYMBOL(memcpy);
-EXPORT_SYMBOL(__memcpy);
 EXPORT_SYMBOL(memmove);
 
 #ifndef CONFIG_DEBUG_VIRTUAL
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -811,6 +811,20 @@ static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
 	return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
 }
 
+static int segmented_write_std(struct x86_emulate_ctxt *ctxt,
+			       struct segmented_address addr,
+			       void *data,
+			       unsigned int size)
+{
+	int rc;
+	ulong linear;
+
+	rc = linearize(ctxt, addr, size, true, &linear);
+	if (rc != X86EMUL_CONTINUE)
+		return rc;
+	return ctxt->ops->write_std(ctxt, linear, data, size, &ctxt->exception);
+}
+
 /*
  * Prefetch the remaining bytes of the instruction without crossing page
  * boundary if they are not in fetch_cache yet.
@@ -1564,7 +1578,6 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				    &ctxt->exception);
 }
 
-/* Does not support long mode */
 static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				     u16 selector, int seg, u8 cpl,
 				     enum x86_transfer_type transfer,
@@ -1601,20 +1614,34 @@ static int __load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 
 	rpl = selector & 3;
 
-	/* NULL selector is not valid for TR, CS and SS (except for long mode) */
-	if ((seg == VCPU_SREG_CS
-	     || (seg == VCPU_SREG_SS
-		 && (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl))
-	     || seg == VCPU_SREG_TR)
-	    && null_selector)
-		goto exception;
-
 	/* TR should be in GDT only */
 	if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
 		goto exception;
 
-	if (null_selector) /* for NULL selector skip all following checks */
+	/* NULL selector is not valid for TR, CS and (except for long mode) SS */
+	if (null_selector) {
+		if (seg == VCPU_SREG_CS || seg == VCPU_SREG_TR)
+			goto exception;
+
+		if (seg == VCPU_SREG_SS) {
+			if (ctxt->mode != X86EMUL_MODE_PROT64 || rpl != cpl)
+				goto exception;
+
+			/*
+			 * ctxt->ops->set_segment expects the CPL to be in
+			 * SS.DPL, so fake an expand-up 32-bit data segment.
+			 */
+			seg_desc.type = 3;
+			seg_desc.p = 1;
+			seg_desc.s = 1;
+			seg_desc.dpl = cpl;
+			seg_desc.d = 1;
+			seg_desc.g = 1;
+		}
+
+		/* Skip all following checks */
 		goto load;
+	}
 
 	ret = read_segment_descriptor(ctxt, selector, &seg_desc, &desc_addr);
 	if (ret != X86EMUL_CONTINUE)
@@ -1730,6 +1757,21 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
 				   u16 selector, int seg)
 {
 	u8 cpl = ctxt->ops->cpl(ctxt);
+
+	/*
+	 * None of MOV, POP and LSS can load a NULL selector in CPL=3, but
+	 * they can load it at CPL<3 (Intel's manual says only LSS can,
+	 * but it's wrong).
+	 *
+	 * However, the Intel manual says that putting IST=1/DPL=3 in
+	 * an interrupt gate will result in SS=3 (the AMD manual instead
+	 * says it doesn't), so allow SS=3 in __load_segment_descriptor
+	 * and only forbid it here.
+	 */
+	if (seg == VCPU_SREG_SS && selector == 3 &&
+	    ctxt->mode == X86EMUL_MODE_PROT64)
+		return emulate_exception(ctxt, GP_VECTOR, 0, true);
+
 	return __load_segment_descriptor(ctxt, selector, seg, cpl,
 					 X86_TRANSFER_NONE, NULL);
 }
@@ -2125,16 +2167,10 @@ static int em_iret(struct x86_emulate_ctxt *ctxt)
 static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
-	unsigned short sel, old_sel;
-	struct desc_struct old_desc, new_desc;
-	const struct x86_emulate_ops *ops = ctxt->ops;
+	unsigned short sel;
+	struct desc_struct new_desc;
 	u8 cpl = ctxt->ops->cpl(ctxt);
 
-	/* Assignment of RIP may only fail in 64-bit mode */
-	if (ctxt->mode == X86EMUL_MODE_PROT64)
-		ops->get_segment(ctxt, &old_sel, &old_desc, NULL,
-				 VCPU_SREG_CS);
-
 	memcpy(&sel, ctxt->src.valptr + ctxt->op_bytes, 2);
 
 	rc = __load_segment_descriptor(ctxt, sel, VCPU_SREG_CS, cpl,
@@ -2144,12 +2180,10 @@ static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
 		return rc;
 
 	rc = assign_eip_far(ctxt, ctxt->src.val, &new_desc);
-	if (rc != X86EMUL_CONTINUE) {
-		WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
-		/* assigning eip failed; restore the old cs */
-		ops->set_segment(ctxt, old_sel, &old_desc, 0, VCPU_SREG_CS);
-		return rc;
-	}
+	/* Error handling is not implemented. */
+	if (rc != X86EMUL_CONTINUE)
+		return X86EMUL_UNHANDLEABLE;
+
 	return rc;
 }
 
@@ -2209,14 +2243,8 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 {
 	int rc;
 	unsigned long eip, cs;
-	u16 old_cs;
 	int cpl = ctxt->ops->cpl(ctxt);
-	struct desc_struct old_desc, new_desc;
-	const struct x86_emulate_ops *ops = ctxt->ops;
-
-	if (ctxt->mode == X86EMUL_MODE_PROT64)
-		ops->get_segment(ctxt, &old_cs, &old_desc, NULL,
-				 VCPU_SREG_CS);
+	struct desc_struct new_desc;
 
 	rc = emulate_pop(ctxt, &eip, ctxt->op_bytes);
 	if (rc != X86EMUL_CONTINUE)
@@ -2233,10 +2261,10 @@ static int em_ret_far(struct x86_emulate_ctxt *ctxt)
 	if (rc != X86EMUL_CONTINUE)
 		return rc;
 	rc = assign_eip_far(ctxt, eip, &new_desc);
-	if (rc != X86EMUL_CONTINUE) {
-		WARN_ON(ctxt->mode != X86EMUL_MODE_PROT64);
-		ops->set_segment(ctxt, old_cs, &old_desc, 0, VCPU_SREG_CS);
-	}
+	/* Error handling is not implemented. */
+	if (rc != X86EMUL_CONTINUE)
+		return X86EMUL_UNHANDLEABLE;
+
 	return rc;
 }
 
@@ -3692,8 +3720,8 @@ static int emulate_store_desc_ptr(struct x86_emulate_ctxt *ctxt,
 	}
 	/* Disable writeback. */
 	ctxt->dst.type = OP_NONE;
-	return segmented_write(ctxt, ctxt->dst.addr.mem,
-			       &desc_ptr, 2 + ctxt->op_bytes);
+	return segmented_write_std(ctxt, ctxt->dst.addr.mem,
+				   &desc_ptr, 2 + ctxt->op_bytes);
 }
 
 static int em_sgdt(struct x86_emulate_ctxt *ctxt)
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -23,13 +23,679 @@
 
 #include "x86.h"
 #include "lapic.h"
+#include "ioapic.h"
 #include "hyperv.h"
 
 #include <linux/kvm_host.h>
+#include <linux/highmem.h>
+#include <asm/apicdef.h>
 #include <trace/events/kvm.h>
 
 #include "trace.h"
 
+static inline u64 synic_read_sint(struct kvm_vcpu_hv_synic *synic, int sint)
+{
+	return atomic64_read(&synic->sint[sint]);
+}
+
+static inline int synic_get_sint_vector(u64 sint_value)
+{
+	if (sint_value & HV_SYNIC_SINT_MASKED)
+		return -1;
+	return sint_value & HV_SYNIC_SINT_VECTOR_MASK;
+}
+
+static bool synic_has_vector_connected(struct kvm_vcpu_hv_synic *synic,
+				      int vector)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+		if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+			return true;
+	}
+	return false;
+}
+
+static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic,
+				     int vector)
+{
+	int i;
+	u64 sint_value;
+
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+		sint_value = synic_read_sint(synic, i);
+		if (synic_get_sint_vector(sint_value) == vector &&
+		    sint_value & HV_SYNIC_SINT_AUTO_EOI)
+			return true;
+	}
+	return false;
+}
+
+static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint,
+			  u64 data, bool host)
+{
+	int vector;
+
+	vector = data & HV_SYNIC_SINT_VECTOR_MASK;
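+	/* Valid SINT vectors are 16-255 per the TLFS; reject lower values
+	 * unless they are written by the host. */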
+	if (vector < 16 && !host)
+		return 1;
+	/*
+	 * Guest may configure multiple SINTs to use the same vector, so
+	 * we maintain a bitmap of vectors handled by synic, and a
+	 * bitmap of vectors with auto-eoi behavior.  The bitmaps are
+	 * updated here, and atomically queried on fast paths.
+	 */
+
+	atomic64_set(&synic->sint[sint], data);
+
+	if (synic_has_vector_connected(synic, vector))
+		__set_bit(vector, synic->vec_bitmap);
+	else
+		__clear_bit(vector, synic->vec_bitmap);
+
+	if (synic_has_vector_auto_eoi(synic, vector))
+		__set_bit(vector, synic->auto_eoi_bitmap);
+	else
+		__clear_bit(vector, synic->auto_eoi_bitmap);
+
+	/* Load SynIC vectors into EOI exit bitmap */
+	kvm_make_request(KVM_REQ_SCAN_IOAPIC, synic_to_vcpu(synic));
+	return 0;
+}
+
+static struct kvm_vcpu_hv_synic *synic_get(struct kvm *kvm, u32 vcpu_id)
+{
+	struct kvm_vcpu *vcpu;
+	struct kvm_vcpu_hv_synic *synic;
+
+	if (vcpu_id >= atomic_read(&kvm->online_vcpus))
+		return NULL;
+	vcpu = kvm_get_vcpu(kvm, vcpu_id);
+	if (!vcpu)
+		return NULL;
+	synic = vcpu_to_synic(vcpu);
+	return (synic->active) ? synic : NULL;
+}
+
+static void synic_clear_sint_msg_pending(struct kvm_vcpu_hv_synic *synic,
+					u32 sint)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct page *page;
+	gpa_t gpa;
+	struct hv_message *msg;
+	struct hv_message_page *msg_page;
+
+	gpa = synic->msg_page & PAGE_MASK;
+	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+	if (is_error_page(page)) {
+		vcpu_err(vcpu, "Hyper-V SynIC can't get msg page, gpa 0x%llx\n",
+			 gpa);
+		return;
+	}
+	msg_page = kmap_atomic(page);
+
+	msg = &msg_page->sint_message[sint];
+	msg->header.message_flags.msg_pending = 0;
+
+	kunmap_atomic(msg_page);
+	kvm_release_page_dirty(page);
+	kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+}
+
+static void kvm_hv_notify_acked_sint(struct kvm_vcpu *vcpu, u32 sint)
+{
+	struct kvm *kvm = vcpu->kvm;
+	struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	struct kvm_vcpu_hv_stimer *stimer;
+	int gsi, idx, stimers_pending;
+
+	trace_kvm_hv_notify_acked_sint(vcpu->vcpu_id, sint);
+
+	if (synic->msg_page & HV_SYNIC_SIMP_ENABLE)
+		synic_clear_sint_msg_pending(synic, sint);
+
+	/* Try to deliver pending Hyper-V SynIC timer messages */
+	stimers_pending = 0;
+	for (idx = 0; idx < ARRAY_SIZE(hv_vcpu->stimer); idx++) {
+		stimer = &hv_vcpu->stimer[idx];
+		if (stimer->msg_pending &&
+		    (stimer->config & HV_STIMER_ENABLE) &&
+		    HV_STIMER_SINT(stimer->config) == sint) {
+			set_bit(stimer->index,
+				hv_vcpu->stimer_pending_bitmap);
+			stimers_pending++;
+		}
+	}
+	if (stimers_pending)
+		kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+
+	idx = srcu_read_lock(&kvm->irq_srcu);
+	gsi = atomic_read(&synic->sint_to_gsi[sint]);
+	if (gsi != -1)
+		kvm_notify_acked_gsi(kvm, gsi);
+	srcu_read_unlock(&kvm->irq_srcu, idx);
+}
+
+static void synic_exit(struct kvm_vcpu_hv_synic *synic, u32 msr)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct kvm_vcpu_hv *hv_vcpu = &vcpu->arch.hyperv;
+
+	hv_vcpu->exit.type = KVM_EXIT_HYPERV_SYNIC;
+	hv_vcpu->exit.u.synic.msr = msr;
+	hv_vcpu->exit.u.synic.control = synic->control;
+	hv_vcpu->exit.u.synic.evt_page = synic->evt_page;
+	hv_vcpu->exit.u.synic.msg_page = synic->msg_page;
+
+	kvm_make_request(KVM_REQ_HV_EXIT, vcpu);
+}
+
+static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
+			 u32 msr, u64 data, bool host)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	int ret;
+
+	if (!synic->active)
+		return 1;
+
+	trace_kvm_hv_synic_set_msr(vcpu->vcpu_id, msr, data, host);
+
+	ret = 0;
+	switch (msr) {
+	case HV_X64_MSR_SCONTROL:
+		synic->control = data;
+		if (!host)
+			synic_exit(synic, msr);
+		break;
+	case HV_X64_MSR_SVERSION:
+		if (!host) {
+			ret = 1;
+			break;
+		}
+		synic->version = data;
+		break;
+	case HV_X64_MSR_SIEFP:
+		if (data & HV_SYNIC_SIEFP_ENABLE && !host)
+			if (kvm_clear_guest(vcpu->kvm,
+					    data & PAGE_MASK, PAGE_SIZE)) {
+				ret = 1;
+				break;
+			}
+		synic->evt_page = data;
+		if (!host)
+			synic_exit(synic, msr);
+		break;
+	case HV_X64_MSR_SIMP:
+		if (data & HV_SYNIC_SIMP_ENABLE && !host)
+			if (kvm_clear_guest(vcpu->kvm,
+					    data & PAGE_MASK, PAGE_SIZE)) {
+				ret = 1;
+				break;
+			}
+		synic->msg_page = data;
+		if (!host)
+			synic_exit(synic, msr);
+		break;
+	case HV_X64_MSR_EOM: {
+		int i;
+
+		for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+			kvm_hv_notify_acked_sint(vcpu, i);
+		break;
+	}
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		ret = synic_set_sint(synic, msr - HV_X64_MSR_SINT0, data, host);
+		break;
+	default:
+		ret = 1;
+		break;
+	}
+	return ret;
+}
+
+static int synic_get_msr(struct kvm_vcpu_hv_synic *synic, u32 msr, u64 *pdata)
+{
+	int ret;
+
+	if (!synic->active)
+		return 1;
+
+	ret = 0;
+	switch (msr) {
+	case HV_X64_MSR_SCONTROL:
+		*pdata = synic->control;
+		break;
+	case HV_X64_MSR_SVERSION:
+		*pdata = synic->version;
+		break;
+	case HV_X64_MSR_SIEFP:
+		*pdata = synic->evt_page;
+		break;
+	case HV_X64_MSR_SIMP:
+		*pdata = synic->msg_page;
+		break;
+	case HV_X64_MSR_EOM:
+		*pdata = 0;
+		break;
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		*pdata = atomic64_read(&synic->sint[msr - HV_X64_MSR_SINT0]);
+		break;
+	default:
+		ret = 1;
+		break;
+	}
+	return ret;
+}
+
+int synic_set_irq(struct kvm_vcpu_hv_synic *synic, u32 sint)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct kvm_lapic_irq irq;
+	int ret, vector;
+
+	if (sint >= ARRAY_SIZE(synic->sint))
+		return -EINVAL;
+
+	vector = synic_get_sint_vector(synic_read_sint(synic, sint));
+	if (vector < 0)
+		return -ENOENT;
+
+	memset(&irq, 0, sizeof(irq));
+	irq.dest_id = kvm_apic_id(vcpu->arch.apic);
+	irq.dest_mode = APIC_DEST_PHYSICAL;
+	irq.delivery_mode = APIC_DM_FIXED;
+	irq.vector = vector;
+	irq.level = 1;
+
+	ret = kvm_irq_delivery_to_apic(vcpu->kvm, NULL, &irq, NULL);
+	trace_kvm_hv_synic_set_irq(vcpu->vcpu_id, sint, irq.vector, ret);
+	return ret;
+}
+
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint)
+{
+	struct kvm_vcpu_hv_synic *synic;
+
+	synic = synic_get(kvm, vcpu_id);
+	if (!synic)
+		return -EINVAL;
+
+	return synic_set_irq(synic, sint);
+}
+
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector)
+{
+	struct kvm_vcpu_hv_synic *synic = vcpu_to_synic(vcpu);
+	int i;
+
+	trace_kvm_hv_synic_send_eoi(vcpu->vcpu_id, vector);
+
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++)
+		if (synic_get_sint_vector(synic_read_sint(synic, i)) == vector)
+			kvm_hv_notify_acked_sint(vcpu, i);
+}
+
+static int kvm_hv_set_sint_gsi(struct kvm *kvm, u32 vcpu_id, u32 sint, int gsi)
+{
+	struct kvm_vcpu_hv_synic *synic;
+
+	synic = synic_get(kvm, vcpu_id);
+	if (!synic)
+		return -EINVAL;
+
+	if (sint >= ARRAY_SIZE(synic->sint_to_gsi))
+		return -EINVAL;
+
+	atomic_set(&synic->sint_to_gsi[sint], gsi);
+	return 0;
+}
+
+void kvm_hv_irq_routing_update(struct kvm *kvm)
+{
+	struct kvm_irq_routing_table *irq_rt;
+	struct kvm_kernel_irq_routing_entry *e;
+	u32 gsi;
+
+	irq_rt = srcu_dereference_check(kvm->irq_routing, &kvm->irq_srcu,
+					lockdep_is_held(&kvm->irq_lock));
+
+	for (gsi = 0; gsi < irq_rt->nr_rt_entries; gsi++) {
+		hlist_for_each_entry(e, &irq_rt->map[gsi], link) {
+			if (e->type == KVM_IRQ_ROUTING_HV_SINT)
+				kvm_hv_set_sint_gsi(kvm, e->hv_sint.vcpu,
+						    e->hv_sint.sint, gsi);
+		}
+	}
+}
+
+static void synic_init(struct kvm_vcpu_hv_synic *synic)
+{
+	int i;
+
+	memset(synic, 0, sizeof(*synic));
+	synic->version = HV_SYNIC_VERSION_1;
+	for (i = 0; i < ARRAY_SIZE(synic->sint); i++) {
+		atomic64_set(&synic->sint[i], HV_SYNIC_SINT_MASKED);
+		atomic_set(&synic->sint_to_gsi[i], -1);
+	}
+}
+
+static u64 get_time_ref_counter(struct kvm *kvm)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	struct kvm_vcpu *vcpu;
+	u64 tsc;
+
+	/*
+	 * If the guest has not set up the TSC page or the clock isn't
+	 * stable, fall back to get_kvmclock_ns().
+	 */
+	if (!hv->tsc_ref.tsc_sequence)
+		return div_u64(get_kvmclock_ns(kvm), 100);
+
+	vcpu = kvm_get_vcpu(kvm, 0);
+	tsc = kvm_read_l1_tsc(vcpu, native_read_tsc());
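+	/* Reference time (100ns units) = tsc * scale / 2^64 + offset. */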
+	return mul_u64_u64_shr(tsc, hv->tsc_ref.tsc_scale, 64)
+		+ hv->tsc_ref.tsc_offset;
+}
+
+static void stimer_mark_pending(struct kvm_vcpu_hv_stimer *stimer,
+				bool vcpu_kick)
+{
+	struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+	set_bit(stimer->index,
+		vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+	kvm_make_request(KVM_REQ_HV_STIMER, vcpu);
+	if (vcpu_kick)
+		kvm_vcpu_kick(vcpu);
+}
+
+static void stimer_cleanup(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+
+	trace_kvm_hv_stimer_cleanup(stimer_to_vcpu(stimer)->vcpu_id,
+				    stimer->index);
+
+	hrtimer_cancel(&stimer->timer);
+	clear_bit(stimer->index,
+		  vcpu_to_hv_vcpu(vcpu)->stimer_pending_bitmap);
+	stimer->msg_pending = false;
+	stimer->exp_time = 0;
+}
+
+static enum hrtimer_restart stimer_timer_callback(struct hrtimer *timer)
+{
+	struct kvm_vcpu_hv_stimer *stimer;
+
+	stimer = container_of(timer, struct kvm_vcpu_hv_stimer, timer);
+	trace_kvm_hv_stimer_callback(stimer_to_vcpu(stimer)->vcpu_id,
+				     stimer->index);
+	stimer_mark_pending(stimer, true);
+
+	return HRTIMER_NORESTART;
+}
+
+/*
+ * stimer_start() assumptions:
+ * a) stimer->count is not equal to 0
+ * b) stimer->config has the HV_STIMER_ENABLE flag set
+ */
+static int stimer_start(struct kvm_vcpu_hv_stimer *stimer)
+{
+	u64 time_now;
+	ktime_t ktime_now;
+
+	time_now = get_time_ref_counter(stimer_to_vcpu(stimer)->kvm);
+	ktime_now = ktime_get();
+
+	if (stimer->config & HV_STIMER_PERIODIC) {
+		if (stimer->exp_time) {
+			if (time_now >= stimer->exp_time) {
+				u64 remainder;
+
+				div64_u64_rem(time_now - stimer->exp_time,
+					      stimer->count, &remainder);
+				stimer->exp_time =
+					time_now + (stimer->count - remainder);
+			}
+		} else
+			stimer->exp_time = time_now + stimer->count;
+
+		trace_kvm_hv_stimer_start_periodic(
+					stimer_to_vcpu(stimer)->vcpu_id,
+					stimer->index,
+					time_now, stimer->exp_time);
+
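+		/* exp_time is in 100ns units; scale back to ns for the hrtimer. */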
+		hrtimer_start(&stimer->timer,
+			      ktime_add_ns(ktime_now,
+					   100 * (stimer->exp_time - time_now)),
+			      HRTIMER_MODE_ABS);
+		return 0;
+	}
+	stimer->exp_time = stimer->count;
+	if (time_now >= stimer->count) {
+		/*
+		 * Expire timer according to Hypervisor Top-Level Functional
+		 * Specification v4 (15.3.1):
+		 * "If a one shot is enabled and the specified count is in
+		 * the past, it will expire immediately."
+		 */
+		stimer_mark_pending(stimer, false);
+		return 0;
+	}
+
+	trace_kvm_hv_stimer_start_one_shot(stimer_to_vcpu(stimer)->vcpu_id,
+					   stimer->index,
+					   time_now, stimer->count);
+
+	hrtimer_start(&stimer->timer,
+		      ktime_add_ns(ktime_now, 100 * (stimer->count - time_now)),
+		      HRTIMER_MODE_ABS);
+	return 0;
+}
+
+static int stimer_set_config(struct kvm_vcpu_hv_stimer *stimer, u64 config,
+			     bool host)
+{
+	trace_kvm_hv_stimer_set_config(stimer_to_vcpu(stimer)->vcpu_id,
+				       stimer->index, config, host);
+
+	stimer_cleanup(stimer);
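+	/* A new config with SINT == 0 turns off a previously enabled timer. */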
+	if ((stimer->config & HV_STIMER_ENABLE) && HV_STIMER_SINT(config) == 0)
+		config &= ~HV_STIMER_ENABLE;
+	stimer->config = config;
+	stimer_mark_pending(stimer, false);
+	return 0;
+}
+
+static int stimer_set_count(struct kvm_vcpu_hv_stimer *stimer, u64 count,
+			    bool host)
+{
+	trace_kvm_hv_stimer_set_count(stimer_to_vcpu(stimer)->vcpu_id,
+				      stimer->index, count, host);
+
+	stimer_cleanup(stimer);
+	stimer->count = count;
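+	/* A zero count disables the timer; with AUTOENABLE set, a write arms it. */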
+	if (stimer->count == 0)
+		stimer->config &= ~HV_STIMER_ENABLE;
+	else if (stimer->config & HV_STIMER_AUTOENABLE)
+		stimer->config |= HV_STIMER_ENABLE;
+	stimer_mark_pending(stimer, false);
+	return 0;
+}
+
+static int stimer_get_config(struct kvm_vcpu_hv_stimer *stimer, u64 *pconfig)
+{
+	*pconfig = stimer->config;
+	return 0;
+}
+
+static int stimer_get_count(struct kvm_vcpu_hv_stimer *stimer, u64 *pcount)
+{
+	*pcount = stimer->count;
+	return 0;
+}
+
+static int synic_deliver_msg(struct kvm_vcpu_hv_synic *synic, u32 sint,
+			     struct hv_message *src_msg)
+{
+	struct kvm_vcpu *vcpu = synic_to_vcpu(synic);
+	struct page *page;
+	gpa_t gpa;
+	struct hv_message *dst_msg;
+	int r;
+	struct hv_message_page *msg_page;
+
+	if (!(synic->msg_page & HV_SYNIC_SIMP_ENABLE))
+		return -ENOENT;
+
+	gpa = synic->msg_page & PAGE_MASK;
+	page = kvm_vcpu_gfn_to_page(vcpu, gpa >> PAGE_SHIFT);
+	if (is_error_page(page))
+		return -EFAULT;
+
+	msg_page = kmap_atomic(page);
+	dst_msg = &msg_page->sint_message[sint];
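+	/*
+	 * Claim the slot: deliver only if it is empty (HVMSG_NONE);
+	 * otherwise mark the message pending so the guest signals EOM
+	 * once the slot is free again.
+	 */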
+	if (sync_cmpxchg(&dst_msg->header.message_type, HVMSG_NONE,
+			 src_msg->header.message_type) != HVMSG_NONE) {
+		dst_msg->header.message_flags.msg_pending = 1;
+		r = -EAGAIN;
+	} else {
+		memcpy(&dst_msg->u.payload, &src_msg->u.payload,
+		       src_msg->header.payload_size);
+		dst_msg->header.message_type = src_msg->header.message_type;
+		dst_msg->header.payload_size = src_msg->header.payload_size;
+		r = synic_set_irq(synic, sint);
+		if (r >= 1)
+			r = 0;
+		else if (r == 0)
+			r = -EFAULT;
+	}
+	kunmap_atomic(msg_page);
+	kvm_release_page_dirty(page);
+	kvm_vcpu_mark_page_dirty(vcpu, gpa >> PAGE_SHIFT);
+	return r;
+}
+
+static int stimer_send_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct kvm_vcpu *vcpu = stimer_to_vcpu(stimer);
+	struct hv_message *msg = &stimer->msg;
+	struct hv_timer_message_payload *payload =
+			(struct hv_timer_message_payload *)&msg->u.payload;
+
+	payload->expiration_time = stimer->exp_time;
+	payload->delivery_time = get_time_ref_counter(vcpu->kvm);
+	return synic_deliver_msg(vcpu_to_synic(vcpu),
+				 HV_STIMER_SINT(stimer->config), msg);
+}
+
+static void stimer_expiration(struct kvm_vcpu_hv_stimer *stimer)
+{
+	int r;
+
+	stimer->msg_pending = true;
+	r = stimer_send_msg(stimer);
+	trace_kvm_hv_stimer_expiration(stimer_to_vcpu(stimer)->vcpu_id,
+				       stimer->index, r);
+	if (!r) {
+		stimer->msg_pending = false;
+		if (!(stimer->config & HV_STIMER_PERIODIC))
+			stimer->config &= ~HV_STIMER_ENABLE;
+	}
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	struct kvm_vcpu_hv_stimer *stimer;
+	u64 time_now, exp_time;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+		if (test_and_clear_bit(i, hv_vcpu->stimer_pending_bitmap)) {
+			stimer = &hv_vcpu->stimer[i];
+			if (stimer->config & HV_STIMER_ENABLE) {
+				exp_time = stimer->exp_time;
+
+				if (exp_time) {
+					time_now =
+						get_time_ref_counter(vcpu->kvm);
+					if (time_now >= exp_time)
+						stimer_expiration(stimer);
+				}
+
+				if ((stimer->config & HV_STIMER_ENABLE) &&
+				    stimer->count)
+					stimer_start(stimer);
+				else
+					stimer_cleanup(stimer);
+			}
+		}
+}
+
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+		stimer_cleanup(&hv_vcpu->stimer[i]);
+}
+
+static void stimer_prepare_msg(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct hv_message *msg = &stimer->msg;
+	struct hv_timer_message_payload *payload =
+			(struct hv_timer_message_payload *)&msg->u.payload;
+
+	memset(&msg->header, 0, sizeof(msg->header));
+	msg->header.message_type = HVMSG_TIMER_EXPIRED;
+	msg->header.payload_size = sizeof(*payload);
+
+	payload->timer_index = stimer->index;
+	payload->expiration_time = 0;
+	payload->delivery_time = 0;
+}
+
+static void stimer_init(struct kvm_vcpu_hv_stimer *stimer, int timer_index)
+{
+	memset(stimer, 0, sizeof(*stimer));
+	stimer->index = timer_index;
+	hrtimer_init(&stimer->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+	stimer->timer.function = stimer_timer_callback;
+	stimer_prepare_msg(stimer);
+}
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu)
+{
+	struct kvm_vcpu_hv *hv_vcpu = vcpu_to_hv_vcpu(vcpu);
+	int i;
+
+	synic_init(&hv_vcpu->synic);
+
+	bitmap_zero(hv_vcpu->stimer_pending_bitmap, HV_SYNIC_STIMER_COUNT);
+	for (i = 0; i < ARRAY_SIZE(hv_vcpu->stimer); i++)
+		stimer_init(&hv_vcpu->stimer[i], i);
+}
+
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu)
+{
+	/*
+	 * Hyper-V SynIC auto-EOI SINTs are not
+	 * compatible with APICv, so deactivate APICv.
+	 */
+	kvm_vcpu_deactivate_apicv(vcpu);
+	vcpu_to_synic(vcpu)->active = true;
+	return 0;
+}
+
 static bool kvm_hv_msr_partition_wide(u32 msr)
 {
 	bool r = false;
@@ -39,6 +705,9 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 	case HV_X64_MSR_HYPERCALL:
 	case HV_X64_MSR_REFERENCE_TSC:
 	case HV_X64_MSR_TIME_REF_COUNT:
+	case HV_X64_MSR_CRASH_CTL:
+	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+	case HV_X64_MSR_RESET:
 		r = true;
 		break;
 	}
@@ -46,7 +715,186 @@ static bool kvm_hv_msr_partition_wide(u32 msr)
 	return r;
 }
 
-static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+static int kvm_hv_msr_get_crash_data(struct kvm_vcpu *vcpu,
+				     u32 index, u64 *pdata)
+{
+	struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+	if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+		return -EINVAL;
+
+	*pdata = hv->hv_crash_param[index];
+	return 0;
+}
+
+static int kvm_hv_msr_get_crash_ctl(struct kvm_vcpu *vcpu, u64 *pdata)
+{
+	struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+	*pdata = hv->hv_crash_ctl;
+	return 0;
+}
+
+static int kvm_hv_msr_set_crash_ctl(struct kvm_vcpu *vcpu, u64 data, bool host)
+{
+	struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+	if (host)
+		hv->hv_crash_ctl = data & HV_X64_MSR_CRASH_CTL_NOTIFY;
+
+	if (!host && (data & HV_X64_MSR_CRASH_CTL_NOTIFY)) {
+
+		vcpu_debug(vcpu, "hv crash (0x%llx 0x%llx 0x%llx 0x%llx 0x%llx)\n",
+			  hv->hv_crash_param[0],
+			  hv->hv_crash_param[1],
+			  hv->hv_crash_param[2],
+			  hv->hv_crash_param[3],
+			  hv->hv_crash_param[4]);
+
+		/* Send notification about crash to user space */
+		kvm_make_request(KVM_REQ_HV_CRASH, vcpu);
+	}
+
+	return 0;
+}
+
+static int kvm_hv_msr_set_crash_data(struct kvm_vcpu *vcpu,
+				     u32 index, u64 data)
+{
+	struct kvm_hv *hv = &vcpu->kvm->arch.hyperv;
+
+	if (WARN_ON_ONCE(index >= ARRAY_SIZE(hv->hv_crash_param)))
+		return -EINVAL;
+
+	hv->hv_crash_param[index] = data;
+	return 0;
+}
+
+/*
+ * The kvmclock and Hyper-V TSC page use similar formulas, and converting
+ * between them is possible:
+ *
+ * kvmclock formula:
+ *    nsec = (ticks - tsc_timestamp) * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *
+ * Hyper-V formula:
+ *    nsec/100 = ticks * scale / 2^64 + offset
+ *
+ * When tsc_timestamp = system_time = 0, offset is zero in the Hyper-V formula.
+ * By dividing the kvmclock formula by 100 and equating what's left we get:
+ *    ticks * scale / 2^64 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale / 2^64 =         tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *            scale        =         tsc_to_system_mul * 2^(32+tsc_shift) / 100
+ *
+ * Now expand the kvmclock formula and divide by 100:
+ *    nsec = ticks * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32)
+ *           + system_time
+ *    nsec/100 = ticks * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               - tsc_timestamp * tsc_to_system_mul * 2^(tsc_shift-32) / 100
+ *               + system_time / 100
+ *
+ * Replace tsc_to_system_mul * 2^(tsc_shift-32) / 100 by scale / 2^64:
+ *    nsec/100 = ticks * scale / 2^64
+ *               - tsc_timestamp * scale / 2^64
+ *               + system_time / 100
+ *
+ * Equate with the Hyper-V formula so that ticks * scale / 2^64 cancels out:
+ *    offset = system_time / 100 - tsc_timestamp * scale / 2^64
+ *
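+ * Illustrative sanity check (values assumed, not from the spec): a
+ * 1 GHz TSC encoded as tsc_shift = 1, tsc_to_system_mul = 2^31 gives
+ * nsec = ticks; then scale = 2^31 * 2^33 / 100 = 2^64 / 100, so
+ * ticks * scale / 2^64 = ticks / 100 = nsec / 100, as required.
+ *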
+ * These two equivalencies are implemented in this function.
+ */
+static bool compute_tsc_page_parameters(struct pvclock_vcpu_time_info *hv_clock,
+					HV_REFERENCE_TSC_PAGE *tsc_ref)
+{
+	u64 max_mul;
+
+	if (!(hv_clock->flags & PVCLOCK_TSC_STABLE_BIT))
+		return false;
+
+	/*
+	 * Check if the scale would overflow; if so, we use the time ref counter:
+	 *    tsc_to_system_mul * 2^(tsc_shift+32) / 100 >= 2^64
+	 *    tsc_to_system_mul / 100 >= 2^(32-tsc_shift)
+	 *    tsc_to_system_mul >= 100 * 2^(32-tsc_shift)
+	 */
+	max_mul = 100ull << (32 - hv_clock->tsc_shift);
+	if (hv_clock->tsc_to_system_mul >= max_mul)
+		return false;
+
+	/*
+	 * Otherwise compute the scale and offset according to the formulas
+	 * derived above.
+	 */
+	tsc_ref->tsc_scale =
+		mul_u64_u32_div(1ULL << (32 + hv_clock->tsc_shift),
+				hv_clock->tsc_to_system_mul,
+				100);
+
+	tsc_ref->tsc_offset = hv_clock->system_time;
+	do_div(tsc_ref->tsc_offset, 100);
+	tsc_ref->tsc_offset -=
+		mul_u64_u64_shr(hv_clock->tsc_timestamp, tsc_ref->tsc_scale, 64);
+	return true;
+}
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock)
+{
+	struct kvm_hv *hv = &kvm->arch.hyperv;
+	u32 tsc_seq;
+	u64 gfn;
+
+	BUILD_BUG_ON(sizeof(tsc_seq) != sizeof(hv->tsc_ref.tsc_sequence));
+	BUILD_BUG_ON(offsetof(HV_REFERENCE_TSC_PAGE, tsc_sequence) != 0);
+
+	if (!(hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE))
+		return;
+
+	gfn = hv->hv_tsc_page >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
+	/*
+	 * Because the TSC parameters only vary when there is a
+	 * change in the master clock, do not bother with caching.
+	 */
+	if (unlikely(kvm_read_guest(kvm, gfn_to_gpa(gfn),
+				    &tsc_seq, sizeof(tsc_seq))))
+		return;
+
+	/*
+	 * While we're computing and writing the parameters, force the
+	 * guest to use the time reference count MSR.
+	 */
+	hv->tsc_ref.tsc_sequence = 0;
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			    &hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence)))
+		return;
+
+	if (!compute_tsc_page_parameters(hv_clock, &hv->tsc_ref))
+		return;
+
+	/* Ensure sequence is zero before writing the rest of the struct.  */
+	smp_wmb();
+	if (kvm_write_guest(kvm, gfn_to_gpa(gfn), &hv->tsc_ref, sizeof(hv->tsc_ref)))
+		return;
+
+	/*
+	 * Now switch to the TSC page mechanism by writing the sequence.
+	 */
+	tsc_seq++;
+	if (tsc_seq == 0xFFFFFFFF || tsc_seq == 0)
+		tsc_seq = 1;
+
+	/* Write the struct entirely before the non-zero sequence.  */
+	smp_wmb();
+
+	hv->tsc_ref.tsc_sequence = tsc_seq;
+	kvm_write_guest(kvm, gfn_to_gpa(gfn),
+			&hv->tsc_ref, sizeof(hv->tsc_ref.tsc_sequence));
+}
+
+static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data,
+			     bool host)
 {
 	struct kvm *kvm = vcpu->kvm;
 	struct kvm_hv *hv = &kvm->arch.hyperv;
@@ -82,23 +930,23 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		mark_page_dirty(kvm, gfn);
 		break;
 	}
-	case HV_X64_MSR_REFERENCE_TSC: {
-		u64 gfn;
-		HV_REFERENCE_TSC_PAGE tsc_ref;
-
-		memset(&tsc_ref, 0, sizeof(tsc_ref));
+	case HV_X64_MSR_REFERENCE_TSC:
 		hv->hv_tsc_page = data;
-		if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE))
-			break;
-		gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT;
-		if (kvm_write_guest(
-				kvm,
-				gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT,
-				&tsc_ref, sizeof(tsc_ref)))
-			return 1;
-		mark_page_dirty(kvm, gfn);
+		if (hv->hv_tsc_page & HV_X64_MSR_TSC_REFERENCE_ENABLE)
+			kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu);
+		break;
+	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+		return kvm_hv_msr_set_crash_data(vcpu,
+						 msr - HV_X64_MSR_CRASH_P0,
+						 data);
+	case HV_X64_MSR_CRASH_CTL:
+		return kvm_hv_msr_set_crash_ctl(vcpu, data, host);
+	case HV_X64_MSR_RESET:
+		if (data == 1) {
+			vcpu_debug(vcpu, "hyper-v reset requested\n");
+			kvm_make_request(KVM_REQ_HV_RESET, vcpu);
+		}
 		break;
-	}
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -107,7 +955,16 @@ static int kvm_hv_set_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 	return 0;
 }
 
-static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+/* Calculate cpu time spent by current task in 100ns units */
+static u64 current_task_runtime_100ns(void)
+{
+	cputime_t utime, stime;
+
+	task_cputime_adjusted(current, &utime, &stime);
+	return div_u64(cputime_to_nsecs(utime + stime), 100);
+}
+
+static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
 	struct kvm_vcpu_hv *hv = &vcpu->arch.hyperv;
 
@@ -141,6 +998,36 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data)
 		return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
 	case HV_X64_MSR_TPR:
 		return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
+	case HV_X64_MSR_VP_RUNTIME:
+		if (!host)
+			return 1;
+		hv->runtime_offset = data - current_task_runtime_100ns();
+		break;
+	case HV_X64_MSR_SCONTROL:
+	case HV_X64_MSR_SVERSION:
+	case HV_X64_MSR_SIEFP:
+	case HV_X64_MSR_SIMP:
+	case HV_X64_MSR_EOM:
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		return synic_set_msr(vcpu_to_synic(vcpu), msr, data, host);
+	case HV_X64_MSR_STIMER0_CONFIG:
+	case HV_X64_MSR_STIMER1_CONFIG:
+	case HV_X64_MSR_STIMER2_CONFIG:
+	case HV_X64_MSR_STIMER3_CONFIG: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+		return stimer_set_config(vcpu_to_stimer(vcpu, timer_index),
+					 data, host);
+	}
+	case HV_X64_MSR_STIMER0_COUNT:
+	case HV_X64_MSR_STIMER1_COUNT:
+	case HV_X64_MSR_STIMER2_COUNT:
+	case HV_X64_MSR_STIMER3_COUNT: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+		return stimer_set_count(vcpu_to_stimer(vcpu, timer_index),
+					data, host);
+	}
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled wrmsr: 0x%x data 0x%llx\n",
 			    msr, data);
@@ -163,14 +1050,21 @@ static int kvm_hv_get_msr_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_HYPERCALL:
 		data = hv->hv_hypercall;
 		break;
-	case HV_X64_MSR_TIME_REF_COUNT: {
-		data =
-		     div_u64(get_kernel_ns() + kvm->arch.kvmclock_offset, 100);
+	case HV_X64_MSR_TIME_REF_COUNT:
+		data = get_time_ref_counter(kvm);
 		break;
-	}
 	case HV_X64_MSR_REFERENCE_TSC:
 		data = hv->hv_tsc_page;
 		break;
+	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+		return kvm_hv_msr_get_crash_data(vcpu,
+						 msr - HV_X64_MSR_CRASH_P0,
+						 pdata);
+	case HV_X64_MSR_CRASH_CTL:
+		return kvm_hv_msr_get_crash_ctl(vcpu, pdata);
+	case HV_X64_MSR_RESET:
+		data = 0;
+		break;
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -207,6 +1101,34 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	case HV_X64_MSR_APIC_ASSIST_PAGE:
 		data = hv->hv_vapic;
 		break;
+	case HV_X64_MSR_VP_RUNTIME:
+		data = current_task_runtime_100ns() + hv->runtime_offset;
+		break;
+	case HV_X64_MSR_SCONTROL:
+	case HV_X64_MSR_SVERSION:
+	case HV_X64_MSR_SIEFP:
+	case HV_X64_MSR_SIMP:
+	case HV_X64_MSR_EOM:
+	case HV_X64_MSR_SINT0 ... HV_X64_MSR_SINT15:
+		return synic_get_msr(vcpu_to_synic(vcpu), msr, pdata);
+	case HV_X64_MSR_STIMER0_CONFIG:
+	case HV_X64_MSR_STIMER1_CONFIG:
+	case HV_X64_MSR_STIMER2_CONFIG:
+	case HV_X64_MSR_STIMER3_CONFIG: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_CONFIG)/2;
+
+		return stimer_get_config(vcpu_to_stimer(vcpu, timer_index),
+					 pdata);
+	}
+	case HV_X64_MSR_STIMER0_COUNT:
+	case HV_X64_MSR_STIMER1_COUNT:
+	case HV_X64_MSR_STIMER2_COUNT:
+	case HV_X64_MSR_STIMER3_COUNT: {
+		int timer_index = (msr - HV_X64_MSR_STIMER0_COUNT)/2;
+
+		return stimer_get_count(vcpu_to_stimer(vcpu, timer_index),
+					pdata);
+	}
 	default:
 		vcpu_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
 		return 1;
@@ -215,17 +1137,17 @@ static int kvm_hv_get_msr(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
 	return 0;
 }
 
-int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host)
 {
 	if (kvm_hv_msr_partition_wide(msr)) {
 		int r;
 
 		mutex_lock(&vcpu->kvm->lock);
-		r = kvm_hv_set_msr_pw(vcpu, msr, data);
+		r = kvm_hv_set_msr_pw(vcpu, msr, data, host);
 		mutex_unlock(&vcpu->kvm->lock);
 		return r;
 	} else
-		return kvm_hv_set_msr(vcpu, msr, data);
+		return kvm_hv_set_msr(vcpu, msr, data, host);
 }
 
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
@@ -246,6 +1168,27 @@ bool kvm_hv_hypercall_enabled(struct kvm *kvm)
 	return kvm->arch.hyperv.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
 }
 
+static void kvm_hv_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
+{
+	bool longmode;
+
+	longmode = is_64_bit_mode(vcpu);
+	if (longmode)
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result);
+	else {
+		kvm_register_write(vcpu, VCPU_REGS_RDX, result >> 32);
+		kvm_register_write(vcpu, VCPU_REGS_RAX, result & 0xffffffff);
+	}
+}
+
+static int kvm_hv_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
+{
+	struct kvm_run *run = vcpu->run;
+
+	kvm_hv_hypercall_set_result(vcpu, run->hyperv.u.hcall.result);
+	return 1;
+}
+
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 {
 	u64 param, ingpa, outgpa, ret;
@@ -258,7 +1201,7 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 	 */
 	if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
 		kvm_queue_exception(vcpu, UD_VECTOR);
-		return 0;
+		return 1;
 	}
 
 	longmode = is_64_bit_mode(vcpu);
@@ -286,22 +1229,33 @@ int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
 
 	trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
 
+	/* Hypercall continuation is not supported yet */
+	if (rep_cnt || rep_idx) {
+		res = HV_STATUS_INVALID_HYPERCALL_CODE;
+		goto set_result;
+	}
+
 	switch (code) {
-	case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
+	case HVCALL_NOTIFY_LONG_SPIN_WAIT:
 		kvm_vcpu_on_spin(vcpu);
 		break;
+	case HVCALL_POST_MESSAGE:
+	case HVCALL_SIGNAL_EVENT:
+		vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+		vcpu->run->hyperv.type = KVM_EXIT_HYPERV_HCALL;
+		vcpu->run->hyperv.u.hcall.input = param;
+		vcpu->run->hyperv.u.hcall.params[0] = ingpa;
+		vcpu->run->hyperv.u.hcall.params[1] = outgpa;
+		vcpu->arch.complete_userspace_io =
+				kvm_hv_hypercall_complete_userspace;
+		return 0;
 	default:
 		res = HV_STATUS_INVALID_HYPERCALL_CODE;
 		break;
 	}
 
+set_result:
 	ret = res | (((u64)rep_done & 0xfff) << 32);
-	if (longmode) {
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
-	} else {
-		kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
-		kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
-	}
-
+	kvm_hv_hypercall_set_result(vcpu, ret);
 	return 1;
 }
--- a/arch/x86/kvm/hyperv.h
+++ b/arch/x86/kvm/hyperv.h
@@ -24,9 +24,67 @@
 #ifndef __ARCH_X86_KVM_HYPERV_H__
 #define __ARCH_X86_KVM_HYPERV_H__
 
-int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
+static inline struct kvm_vcpu_hv *vcpu_to_hv_vcpu(struct kvm_vcpu *vcpu)
+{
+	return &vcpu->arch.hyperv;
+}
+
+static inline struct kvm_vcpu *hv_vcpu_to_vcpu(struct kvm_vcpu_hv *hv_vcpu)
+{
+	struct kvm_vcpu_arch *arch;
+
+	arch = container_of(hv_vcpu, struct kvm_vcpu_arch, hyperv);
+	return container_of(arch, struct kvm_vcpu, arch);
+}
+
+static inline struct kvm_vcpu_hv_synic *vcpu_to_synic(struct kvm_vcpu *vcpu)
+{
+	return &vcpu->arch.hyperv.synic;
+}
+
+static inline struct kvm_vcpu *synic_to_vcpu(struct kvm_vcpu_hv_synic *synic)
+{
+	return hv_vcpu_to_vcpu(container_of(synic, struct kvm_vcpu_hv, synic));
+}
+
+int kvm_hv_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host);
 int kvm_hv_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
+
 bool kvm_hv_hypercall_enabled(struct kvm *kvm);
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu);
 
+void kvm_hv_irq_routing_update(struct kvm *kvm);
+int kvm_hv_synic_set_irq(struct kvm *kvm, u32 vcpu_id, u32 sint);
+void kvm_hv_synic_send_eoi(struct kvm_vcpu *vcpu, int vector);
+int kvm_hv_activate_synic(struct kvm_vcpu *vcpu);
+
+void kvm_hv_vcpu_init(struct kvm_vcpu *vcpu);
+void kvm_hv_vcpu_uninit(struct kvm_vcpu *vcpu);
+
+static inline struct kvm_vcpu_hv_stimer *vcpu_to_stimer(struct kvm_vcpu *vcpu,
+							int timer_index)
+{
+	return &vcpu_to_hv_vcpu(vcpu)->stimer[timer_index];
+}
+
+static inline struct kvm_vcpu *stimer_to_vcpu(struct kvm_vcpu_hv_stimer *stimer)
+{
+	struct kvm_vcpu_hv *hv_vcpu;
+
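+	/* stimer - stimer->index points at stimer[0] of the embedding struct. */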
+	hv_vcpu = container_of(stimer - stimer->index, struct kvm_vcpu_hv,
+			       stimer[0]);
+	return hv_vcpu_to_vcpu(hv_vcpu);
+}
+
+static inline bool kvm_hv_has_stimer_pending(struct kvm_vcpu *vcpu)
+{
+	return !bitmap_empty(vcpu->arch.hyperv.stimer_pending_bitmap,
+			     HV_SYNIC_STIMER_COUNT);
+}
+
+void kvm_hv_process_stimers(struct kvm_vcpu *vcpu);
+
+void kvm_hv_setup_tsc_page(struct kvm *kvm,
+			   struct pvclock_vcpu_time_info *hv_clock);
+
 #endif
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -236,22 +236,14 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
 {
 	struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
 						 irq_ack_notifier);
-	int value;
-
-	spin_lock(&ps->inject_lock);
-	value = atomic_dec_return(&ps->pending);
-	if (value < 0)
-		/* spurious acks can be generated if, for example, the
-		 * PIC is being reset.  Handle it gracefully here
-		 */
-		atomic_inc(&ps->pending);
-	else if (value > 0 && ps->reinject)
-		/* in this case, we had multiple outstanding pit interrupts
-		 * that we needed to inject.  Reinject
-		 */
+
+	atomic_set(&ps->irq_ack, 1);
+	/* irq_ack should be set before pending is read.  Order accesses with
+	 * inc(pending) in pit_timer_fn and xchg(irq_ack, 0) in pit_do_work.
+	 */
+	smp_mb();
+	if (atomic_dec_if_positive(&ps->pending) > 0 && ps->reinject)
 		queue_kthread_work(&ps->pit->worker, &ps->pit->expired);
-	ps->irq_ack = 1;
-	spin_unlock(&ps->inject_lock);
 }
 
 void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -282,36 +274,25 @@ static void pit_do_work(struct kthread_work *work)
 	struct kvm_vcpu *vcpu;
 	int i;
 	struct kvm_kpit_state *ps = &pit->pit_state;
-	int inject = 0;
 
-	/* Try to inject pending interrupts when
-	 * last one has been acked.
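+	/*
+	 * In reinject mode, inject only after the previous interrupt
+	 * was acked; consume the ack flag atomically.
+	 */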
+	if (ps->reinject && !atomic_xchg(&ps->irq_ack, 0))
+		return;
+
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
+	kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
+
+	/*
+	 * Provides NMI watchdog support via Virtual Wire mode.
+	 * The route is: PIT -> LVT0 in NMI mode.
+	 *
+	 * Note: Our Virtual Wire implementation does not follow
+	 * the MP specification.  We propagate a PIT interrupt to all
+	 * VCPUs, and only when LVT0 is in NMI mode.  The interrupt can
+	 * also be simultaneously delivered through PIC and IOAPIC.
 	 */
-	spin_lock(&ps->inject_lock);
-	if (!ps->reinject)
-		inject = 1;
-	else if (ps->irq_ack) {
-		ps->irq_ack = 0;
-		inject = 1;
-	}
-	spin_unlock(&ps->inject_lock);
-	if (inject) {
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1, false);
-		kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0, false);
-
-		/*
-		 * Provides NMI watchdog support via Virtual Wire mode.
-		 * The route is: PIT -> PIC -> LVT0 in NMI mode.
-		 *
-		 * Note: Our Virtual Wire implementation is simplified, only
-		 * propagating PIT interrupts to all VCPUs when they have set
-		 * LVT0 to NMI delivery. Other PIC interrupts are just sent to
-		 * VCPU0, and only if its LVT0 is in EXTINT mode.
-		 */
-		if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
-			kvm_for_each_vcpu(i, vcpu, kvm)
-				kvm_apic_nmi_wd_deliver(vcpu);
-	}
+	if (atomic_read(&kvm->arch.vapics_in_nmi_mode) > 0)
+		kvm_for_each_vcpu(i, vcpu, kvm)
+			kvm_apic_nmi_wd_deliver(vcpu);
 }
 
 static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
@@ -331,6 +312,12 @@ static enum hrtimer_restart pit_timer_fn(struct hrtimer *data)
 		return HRTIMER_NORESTART;
 }
 
+static inline void kvm_pit_reset_reinject(struct kvm_pit *pit)
+{
+	atomic_set(&pit->pit_state.pending, 0);
+	atomic_set(&pit->pit_state.irq_ack, 1);
+}
+
 static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 {
 	struct kvm_kpit_state *ps = &kvm->arch.vpit->pit_state;
@@ -353,8 +340,7 @@ static void create_pit_timer(struct kvm *kvm, u32 val, int is_period)
 	ps->timer.function = pit_timer_fn;
 	ps->kvm = ps->pit->kvm;
 
-	atomic_set(&ps->pending, 0);
-	ps->irq_ack = 1;
+	kvm_pit_reset_reinject(ps->pit);
 
 	/*
 	 * Do not allow the guest to program periodic timers with small
@@ -649,18 +635,15 @@ void kvm_pit_reset(struct kvm_pit *pit)
 	}
 	mutex_unlock(&pit->pit_state.lock);
 
-	atomic_set(&pit->pit_state.pending, 0);
-	pit->pit_state.irq_ack = 1;
+	kvm_pit_reset_reinject(pit);
 }
 
 static void pit_mask_notifer(struct kvm_irq_mask_notifier *kimn, bool mask)
 {
 	struct kvm_pit *pit = container_of(kimn, struct kvm_pit, mask_notifier);
 
-	if (!mask) {
-		atomic_set(&pit->pit_state.pending, 0);
-		pit->pit_state.irq_ack = 1;
-	}
+	if (!mask)
+		kvm_pit_reset_reinject(pit);
 }
 
 static const struct kvm_io_device_ops pit_dev_ops = {
@@ -694,7 +677,6 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
 
 	mutex_init(&pit->pit_state.lock);
 	mutex_lock(&pit->pit_state.lock);
-	spin_lock_init(&pit->pit_state.inject_lock);
 
 	pid = get_pid(task_tgid(current));
 	pid_nr = pid_vnr(pid);
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,8 +33,7 @@ struct kvm_kpit_state {
 	u32    speaker_data_on;
 	struct mutex lock;
 	struct kvm_pit *pit;
-	spinlock_t inject_lock;
-	unsigned long irq_ack;
+	atomic_t irq_ack;
 	struct kvm_irq_ack_notifier irq_ack_notifier;
 };
 
--- a/arch/x86/kvm/ioapic.c
+++ b/arch/x86/kvm/ioapic.c
@@ -235,7 +235,6 @@ static void kvm_ioapic_inject_all(struct kvm_ioapic *ioapic, unsigned long irr)
 	kvm_rtc_eoi_tracking_restore_all(ioapic);
 }
 
-
 void kvm_ioapic_scan_entry(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 {
 	struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic;
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -79,10 +79,10 @@ int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v)
 	if (kvm_cpu_has_extint(v))
 		return 1;
 
-	if (kvm_vcpu_apic_vid_enabled(v))
+	if (kvm_vcpu_apicv_active(v))
 		return 0;
 
-	return kvm_apic_has_interrupt(v) != -1; /* LAPIC */
+	return kvm_apic_has_interrupt(v, false) != -1; /* LAPIC */
 }
 
 /*
@@ -97,7 +97,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *v)
 	if (kvm_cpu_has_extint(v))
 		return 1;
 
-	return kvm_apic_has_interrupt(v) != -1;	/* LAPIC */
+	return kvm_apic_has_interrupt(v, true) != -1;	/* LAPIC */
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_has_interrupt);
 
@@ -122,7 +122,7 @@ static int kvm_cpu_get_extint(struct kvm_vcpu *v)
 /*
  * Read pending interrupt vector and intack.
  */
-int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
+int kvm_cpu_get_interrupt(struct kvm_vcpu *v, bool make_req)
 {
 	int vector;
 
@@ -134,7 +134,7 @@ int kvm_cpu_get_interrupt(struct kvm_vcpu *v)
 	if (vector != -1)
 		return vector;			/* PIC */
 
-	return kvm_get_apic_interrupt(v);	/* APIC */
+	return kvm_get_apic_interrupt(v, make_req);	/* APIC */
 }
 EXPORT_SYMBOL_GPL(kvm_cpu_get_interrupt);
 
--- a/arch/x86/kvm/irq_comm.c
+++ b/arch/x86/kvm/irq_comm.c
@@ -35,6 +35,8 @@
 
 #include "x86.h"
 
+#include "hyperv.h"
+
 static int kvm_set_pic_irq(struct kvm_kernel_irq_routing_entry *e,
 			   struct kvm *kvm, int irq_source_id, int level,
 			   bool line_status)
@@ -253,6 +255,16 @@ void kvm_fire_mask_notifiers(struct kvm *kvm, unsigned irqchip, unsigned pin,
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
 
+static int kvm_hv_set_sint(struct kvm_kernel_irq_routing_entry *e,
+		    struct kvm *kvm, int irq_source_id, int level,
+		    bool line_status)
+{
+	if (!level)
+		return -1;
+
+	return kvm_hv_synic_set_irq(kvm, e->hv_sint.vcpu, e->hv_sint.sint);
+}
+
 int kvm_set_routing_entry(struct kvm *kvm,
 			  struct kvm_kernel_irq_routing_entry *e,
 			  const struct kvm_irq_routing_entry *ue)
@@ -295,6 +307,11 @@ int kvm_set_routing_entry(struct kvm *kvm,
 		if (kvm_msi_route_invalid(kvm, e))
 			goto out;
 		break;
+	case KVM_IRQ_ROUTING_HV_SINT:
+		e->set = kvm_hv_set_sint;
+		e->hv_sint.vcpu = ue->u.hv_sint.vcpu;
+		e->hv_sint.sint = ue->u.hv_sint.sint;
+		break;
 	default:
 		goto out;
 	}
@@ -370,9 +387,21 @@ int kvm_setup_empty_irq_routing(struct kvm *kvm)
 	return kvm_set_irq_routing(kvm, empty_routing, 0, 0);
 }
 
-void kvm_arch_irq_routing_update(struct kvm *kvm)
+int kvm_arch_set_irq(struct kvm_kernel_irq_routing_entry *irq, struct kvm *kvm,
+		     int irq_source_id, int level, bool line_status)
 {
-	if (ioapic_in_kernel(kvm) || !irqchip_in_kernel(kvm))
+	switch (irq->type) {
+	case KVM_IRQ_ROUTING_HV_SINT:
+		return kvm_hv_set_sint(irq, kvm, irq_source_id, level,
+				       line_status);
+	default:
+		return -EWOULDBLOCK;
+	}
+}
+
+void kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+	if (!irqchip_split(kvm))
 		return;
 	kvm_make_scan_ioapic_request(kvm);
 }
@@ -406,3 +435,8 @@ void kvm_scan_ioapic_routes(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap)
 	}
 	srcu_read_unlock(&kvm->irq_srcu, idx);
 }
+
+void kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+	kvm_hv_irq_routing_update(kvm);
+}
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -41,6 +41,7 @@
 #include "trace.h"
 #include "x86.h"
 #include "cpuid.h"
+#include "hyperv.h"
 
 #ifndef CONFIG_X86_64
 #define mod_64(x, y) ((x) - (y) * div64_u64(x, y))
@@ -128,17 +129,6 @@ static inline int apic_enabled(struct kvm_lapic *apic)
 	(LVT_MASK | APIC_MODE_MASK | APIC_INPUT_POLARITY | \
 	 APIC_LVT_REMOTE_IRR | APIC_LVT_LEVEL_TRIGGER)
 
-static inline u32 kvm_apic_id(struct kvm_lapic *apic)
-{
-	/* To avoid a race between apic_base and following APIC_ID update when
-	 * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
-	 */
-	if (apic_x2apic_mode(apic))
-		return apic->vcpu->vcpu_id;
-
-	return kvm_apic_get_reg(apic, APIC_ID) >> 24;
-}
-
 static inline bool kvm_apic_map_get_logical_dest(struct kvm_apic_map *map,
 		u32 dest_id, struct kvm_lapic ***cluster, u16 *mask) {
 	switch (map->mode) {
@@ -402,7 +392,8 @@ static inline int apic_find_highest_irr(struct kvm_lapic *apic)
 	if (!apic->irr_pending)
 		return -1;
 
-	kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
+	if (apic->vcpu->arch.apicv_active)
+		kvm_x86_ops->sync_pir_to_irr(apic->vcpu);
 	result = apic_search_irr(apic);
 	ASSERT(result == -1 || result >= 16);
 
@@ -415,7 +406,7 @@ static inline void apic_clear_irr(int vec, struct kvm_lapic *apic)
 
 	vcpu = apic->vcpu;
 
-	if (unlikely(kvm_vcpu_apic_vid_enabled(vcpu))) {
+	if (unlikely(vcpu->arch.apicv_active)) {
 		/* try to update RVI */
 		apic_clear_vector(vec, apic->regs + APIC_IRR);
 		kvm_make_request(KVM_REQ_EVENT, vcpu);
@@ -441,7 +432,7 @@ static inline void apic_set_isr(int vec, struct kvm_lapic *apic)
 	 * because the processor can modify ISR under the hood.  Instead
 	 * just set SVI.
 	 */
-	if (unlikely(kvm_x86_ops->hwapic_isr_update))
+	if (unlikely(vcpu->arch.apicv_active))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm, vec);
 	else {
 		++apic->isr_count;
@@ -489,7 +480,7 @@ static inline void apic_clear_isr(int vec, struct kvm_lapic *apic)
 	 * on the other hand isr_count and highest_isr_cache are unused
 	 * and must be left alone.
 	 */
-	if (unlikely(kvm_x86_ops->hwapic_isr_update))
+	if (unlikely(vcpu->arch.apicv_active))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 					       apic_find_highest_isr(apic));
 	else {
@@ -570,7 +561,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
 	__clear_bit(KVM_APIC_PV_EOI_PENDING, &vcpu->arch.apic_attention);
 }
 
-static void apic_update_ppr(struct kvm_lapic *apic)
+static void apic_update_ppr(struct kvm_lapic *apic, bool make_req)
 {
 	u32 tpr, isrv, ppr, old_ppr;
 	int isr;
@@ -590,7 +581,7 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 
 	if (old_ppr != ppr) {
 		apic_set_reg(apic, APIC_PROCPRI, ppr);
-		if (ppr < old_ppr)
+		if (make_req && ppr < old_ppr)
 			kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
 	}
 }
@@ -598,7 +589,7 @@ static void apic_update_ppr(struct kvm_lapic *apic)
 static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
 {
 	apic_set_reg(apic, APIC_TASKPRI, tpr);
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 }
 
 static bool kvm_apic_broadcast(struct kvm_lapic *apic, u32 mda)
@@ -937,7 +928,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
 				apic_clear_vector(vector, apic->regs + APIC_TMR);
 		}
 
-		if (kvm_x86_ops->deliver_posted_interrupt)
+		if (vcpu->arch.apicv_active)
 			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
 		else {
 			apic_set_irr(vector, apic);
@@ -1057,7 +1055,10 @@ static int apic_set_eoi(struct kvm_lapic *apic)
 		return vector;
 
 	apic_clear_isr(vector, apic);
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
+
+	if (test_bit(vector, vcpu_to_synic(apic->vcpu)->vec_bitmap))
+		kvm_hv_synic_send_eoi(apic->vcpu, vector);
 
 	kvm_ioapic_send_eoi(apic, vector);
 	kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
@@ -1169,7 +1170,7 @@ static u32 __apic_read(struct kvm_lapic *apic, unsigned int offset)
 		val = apic_get_tmcct(apic);
 		break;
 	case APIC_PROCPRI:
-		apic_update_ppr(apic);
+		apic_update_ppr(apic, true);
 		val = kvm_apic_get_reg(apic, offset);
 		break;
 	case APIC_TASKPRI:
@@ -1274,7 +1275,7 @@ static bool lapic_timer_int_injected(struct kvm_vcpu *vcpu)
 		int vec = reg & APIC_VECTOR_MASK;
 		void *bitmap = apic->regs + APIC_ISR;
 
-		if (kvm_x86_ops->deliver_posted_interrupt)
+		if (vcpu->arch.apicv_active)
 			bitmap = apic->regs + APIC_IRR;
 
 		if (apic_test_vector(vec, bitmap))
@@ -1767,8 +1768,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 		apic_set_reg(apic, APIC_ISR + 0x10 * i, 0);
 		apic_set_reg(apic, APIC_TMR + 0x10 * i, 0);
 	}
-	apic->irr_pending = kvm_vcpu_apic_vid_enabled(vcpu);
-	apic->isr_count = kvm_x86_ops->hwapic_isr_update ? 1 : 0;
+	apic->irr_pending = vcpu->arch.apicv_active;
+	apic->isr_count = vcpu->arch.apicv_active ? 1 : 0;
 	apic->highest_isr_cache = -1;
 	update_divide_count(apic);
 	atomic_set(&apic->lapic_timer.pending, 0);
@@ -1776,7 +1777,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event)
 		kvm_lapic_set_base(vcpu,
 				vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP);
 	vcpu->arch.pv_eoi.msr_val = 0;
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 
 	vcpu->arch.apic_arb_prio = 0;
 	vcpu->arch.apic_attention = 0;
@@ -1897,7 +1898,7 @@ nomem:
 	return -ENOMEM;
 }
 
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu, bool make_req)
 {
 	struct kvm_lapic *apic = vcpu->arch.apic;
 	int highest_irr;
@@ -1905,7 +1906,7 @@ int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu)
 	if (!apic_enabled(apic))
 		return -1;
 
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, make_req);
 	highest_irr = apic_find_highest_irr(apic);
 	if ((highest_irr == -1) ||
 	    ((highest_irr & 0xF0) <= kvm_apic_get_reg(apic, APIC_PROCPRI)))
@@ -1938,9 +1939,9 @@ void kvm_inject_apic_timer_irqs(struct kvm_vcpu *vcpu)
 	}
 }
 
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu, bool make_req)
 {
-	int vector = kvm_apic_has_interrupt(vcpu);
+	int vector = kvm_apic_has_interrupt(vcpu, make_req);
 	struct kvm_lapic *apic = vcpu->arch.apic;
 
 	if (vector == -1)
@@ -1954,8 +1955,14 @@ int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu)
 	 */
 
 	apic_set_isr(vector, apic);
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 	apic_clear_irr(vector, apic);
+
+	if (test_bit(vector, vcpu_to_synic(vcpu)->auto_eoi_bitmap)) {
+		apic_clear_isr(vector, apic);
+		apic_update_ppr(apic, true);
+	}
+
 	return vector;
 }
 
@@ -2003,22 +2010,22 @@ int kvm_apic_set_state(struct kvm_vcpu *vcpu, struct kvm_lapic_state *s)
 	recalculate_apic_map(vcpu->kvm);
 	kvm_apic_set_version(vcpu);
 
-	apic_update_ppr(apic);
+	apic_update_ppr(apic, true);
 	hrtimer_cancel(&apic->lapic_timer.timer);
 	apic_update_lvtt(apic);
 	apic_manage_nmi_watchdog(apic, kvm_apic_get_reg(apic, APIC_LVT0));
 	update_divide_count(apic);
 	start_apic_timer(apic);
 	apic->irr_pending = true;
-	apic->isr_count = kvm_x86_ops->hwapic_isr_update ?
+	apic->isr_count = vcpu->arch.apicv_active ?
 				1 : count_vectors(apic->regs + APIC_ISR);
 	apic->highest_isr_cache = -1;
-	if (kvm_x86_ops->hwapic_irr_update)
+	if (vcpu->arch.apicv_active) {
 		kvm_x86_ops->hwapic_irr_update(vcpu,
 				apic_find_highest_irr(apic));
-	if (unlikely(kvm_x86_ops->hwapic_isr_update))
 		kvm_x86_ops->hwapic_isr_update(vcpu->kvm,
 				apic_find_highest_isr(apic));
+	}
 	kvm_make_request(KVM_REQ_EVENT, vcpu);
 	if (ioapic_in_kernel(vcpu->kvm))
 		kvm_rtc_eoi_tracking_restore_one(vcpu);
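
Note: the new make_req argument keeps apic_update_ppr() from queueing yet
another KVM_REQ_EVENT when the caller already is the injection path (see
the kvm_cpu_get_interrupt(vcpu, false) call in the x86.c hunks below).
The priority arithmetic itself is unchanged; condensed for reference:

	/* PPR is the higher of TPR and the in-service vector's class. */
	static u32 lapic_compute_ppr(u32 tpr, u32 isrv)
	{
		if ((tpr & 0xf0) >= (isrv & 0xf0))
			return tpr & 0xff;	/* TPR wins, keep its sub-class */
		return isrv & 0xf0;		/* in-service class wins */
	}

For example, with TPR 0x30 and vector 0x51 in service, PPR becomes 0x50,
so the (highest_irr & 0xF0) <= APIC_PROCPRI test in kvm_apic_has_interrupt()
holds back pending vectors of class 0x50 and below while letting 0x60 and
above through.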
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,9 +48,9 @@ struct dest_map;
 int kvm_create_lapic(struct kvm_vcpu *vcpu);
 void kvm_free_lapic(struct kvm_vcpu *vcpu);
 
-int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu);
+int kvm_apic_has_interrupt(struct kvm_vcpu *vcpu, bool make_req);
 int kvm_apic_accept_pic_intr(struct kvm_vcpu *vcpu);
-int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu);
+int kvm_get_apic_interrupt(struct kvm_vcpu *vcpu, bool make_req);
 void kvm_apic_accept_events(struct kvm_vcpu *vcpu);
 void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event);
 u64 kvm_lapic_get_cr8(struct kvm_vcpu *vcpu);
@@ -146,9 +146,9 @@ static inline int apic_x2apic_mode(struct kvm_lapic *apic)
 	return apic->vcpu->arch.apic_base & X2APIC_ENABLE;
 }
 
-static inline bool kvm_vcpu_apic_vid_enabled(struct kvm_vcpu *vcpu)
+static inline bool kvm_vcpu_apicv_active(struct kvm_vcpu *vcpu)
 {
-	return kvm_x86_ops->cpu_uses_apicv(vcpu);
+	return vcpu->arch.apic && vcpu->arch.apicv_active;
 }
 
 static inline bool kvm_apic_has_events(struct kvm_vcpu *vcpu)
@@ -167,6 +167,17 @@ static inline bool kvm_lowest_prio_delivery(struct kvm_lapic_irq *irq)
 			irq->msi_redir_hint);
 }
 
+static inline u32 kvm_apic_id(struct kvm_lapic *apic)
+{
+	/* To avoid a race between apic_base and following APIC_ID update when
+	 * switching to x2apic_mode, the x2apic mode returns initial x2apic id.
+	 */
+	if (apic_x2apic_mode(apic))
+		return apic->vcpu->vcpu_id;
+
+	return kvm_apic_get_reg(apic, APIC_ID) >> 24;
+}
+
 bool kvm_apic_pending_eoi(struct kvm_vcpu *vcpu, int vector);
 
 void wait_lapic_expire(struct kvm_vcpu *vcpu);
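
Note: kvm_vcpu_apicv_active() replaces the old convention of testing the
kvm_x86_ops callbacks for NULL. Making it a per-vcpu flag matters because
kvm_vcpu_deactivate_apicv(), added in the x86.c hunks below, can switch
APICv off for one vcpu at runtime (Hyper-V SynIC relies on that). The
dispatch pattern used throughout lapic.c then reads, schematically:

	/* Sketch only; apic_set_irr() is the existing lapic.c helper. */
	static void accept_irq_sketch(struct kvm_vcpu *vcpu, int vector)
	{
		if (vcpu->arch.apicv_active)
			kvm_x86_ops->deliver_posted_interrupt(vcpu, vector);
		else
			apic_set_irr(vector, vcpu->arch.apic);
	}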
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3528,7 +3528,7 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
 	if (!async)
 		return false; /* *pfn has correct page already */
 
-	if (!prefault && can_do_async_pf(vcpu)) {
+	if (!prefault && !is_guest_mode(vcpu) && can_do_async_pf(vcpu)) {
 		trace_kvm_try_async_get_page(gva, gfn);
 		if (kvm_find_async_pf_gfn(vcpu, gfn)) {
 			trace_kvm_async_pf_doublefault(gva, gfn);
@@ -4889,13 +4889,12 @@ void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, struct kvm_memslots *slots)
 	}
 }
 
-static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+mmu_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct kvm *kvm;
 	int nr_to_scan = sc->nr_to_scan;
-
-	if (nr_to_scan == 0)
-		goto out;
+	unsigned long freed = 0;
 
 	spin_lock(&kvm_lock);
 
@@ -4930,25 +4929,37 @@ static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
 			goto unlock;
 		}
 
-		prepare_zap_oldest_mmu_page(kvm, &invalid_list);
+		if (prepare_zap_oldest_mmu_page(kvm, &invalid_list))
+			freed++;
 		kvm_mmu_commit_zap_page(kvm, &invalid_list);
 
 unlock:
 		spin_unlock(&kvm->mmu_lock);
 		srcu_read_unlock(&kvm->srcu, idx);
 
+		/*
+		 * unfair on small ones
+		 * per-vm shrinkers cry out
+		 * sadness comes quickly
+		 */
 		list_move_tail(&kvm->vm_list, &vm_list);
 		break;
 	}
 
 	spin_unlock(&kvm_lock);
+	return freed;
 
-out:
+}
+
+static unsigned long
+mmu_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
 	return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
 }
 
 static struct shrinker mmu_shrinker = {
-	.shrink = mmu_shrink,
+	.count_objects = mmu_shrink_count,
+	.scan_objects = mmu_shrink_scan,
 	.seeks = DEFAULT_SEEKS * 10,
 };
 
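
Note: this converts the MMU shrinker to the split count/scan API:
->count_objects must be a cheap, lock-free estimate (the percpu counter
read above), while ->scan_objects does the actual work and reports how
many objects it freed. A minimal sketch of the contract, with
demo_nr_cached and demo_evict_one() as illustrative stand-ins:

	#include <linux/atomic.h>
	#include <linux/shrinker.h>

	static atomic_long_t demo_nr_cached;	/* objects we could free */
	static bool demo_evict_one(void);	/* frees one, false if none */

	static unsigned long demo_count(struct shrinker *s,
					struct shrink_control *sc)
	{
		/* No locking here; 0 simply means "skip the scan". */
		return atomic_long_read(&demo_nr_cached);
	}

	static unsigned long demo_scan(struct shrinker *s,
				       struct shrink_control *sc)
	{
		unsigned long freed = 0;

		while (freed < sc->nr_to_scan && demo_evict_one())
			freed++;

		/* SHRINK_STOP: no forward progress is possible right now. */
		return freed ? freed : SHRINK_STOP;
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
	};

register_shrinker(&demo_shrinker) then hooks it into vmscan as before.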
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -957,13 +957,6 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
 	seg->base = 0;
 }
 
-static u64 svm_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	return svm->vmcb->control.tsc_offset;
-}
-
 static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -983,21 +976,6 @@ static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
 }
 
-static void svm_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
-{
-	struct vcpu_svm *svm = to_svm(vcpu);
-
-	svm->vmcb->control.tsc_offset += adjustment;
-	if (is_guest_mode(vcpu))
-		svm->nested.hsave->control.tsc_offset += adjustment;
-	else
-		trace_kvm_write_tsc_offset(vcpu->vcpu_id,
-				     svm->vmcb->control.tsc_offset - adjustment,
-				     svm->vmcb->control.tsc_offset);
-
-	mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
-}
-
 static void init_vmcb(struct vcpu_svm *svm)
 {
 	struct vmcb_control_area *control = &svm->vmcb->control;
@@ -1853,8 +1831,7 @@ static int halt_interception(struct vcpu_svm *svm)
 static int vmmcall_interception(struct vcpu_svm *svm)
 {
 	svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-	kvm_emulate_hypercall(&svm->vcpu);
-	return 1;
+	return kvm_emulate_hypercall(&svm->vcpu);
 }
 
 static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
@@ -2972,12 +2949,6 @@ static int cr8_write_interception(struct vcpu_svm *svm)
 	return 0;
 }
 
-static u64 svm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
-{
-	struct vmcb *vmcb = get_host_vmcb(to_svm(vcpu));
-	return vmcb->control.tsc_offset + host_tsc;
-}
-
 static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 {
 	struct vcpu_svm *svm = to_svm(vcpu);
@@ -3552,9 +3523,14 @@ static void svm_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	return;
 }
 
-static int svm_cpu_uses_apicv(struct kvm_vcpu *vcpu)
+static bool svm_get_enable_apicv(void)
 {
-	return 0;
+	return false;
+}
+
+static void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+	return;
 }
 
 static void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu)
@@ -4314,7 +4290,8 @@ static struct kvm_x86_ops svm_x86_ops = {
 	.enable_irq_window = enable_irq_window,
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_x2apic_mode = svm_set_virtual_x2apic_mode,
-	.cpu_uses_apicv = svm_cpu_uses_apicv,
+	.get_enable_apicv = svm_get_enable_apicv,
+	.refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl,
 	.load_eoi_exitmap = svm_load_eoi_exitmap,
 	.sync_pir_to_irr = svm_sync_pir_to_irr,
 
@@ -4336,10 +4313,7 @@ static struct kvm_x86_ops svm_x86_ops = {
 
 	.has_wbinvd_exit = svm_has_wbinvd_exit,
 
-	.read_tsc_offset = svm_read_tsc_offset,
 	.write_tsc_offset = svm_write_tsc_offset,
-	.adjust_tsc_offset_guest = svm_adjust_tsc_offset_guest,
-	.read_l1_tsc = svm_read_l1_tsc,
 
 	.set_tdp_cr3 = set_tdp_cr3,
 
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -1029,6 +1029,267 @@ TRACE_EVENT(kvm_pi_irte_update,
 		  __entry->pi_desc_addr)
 );
 
+/*
+ * Tracepoint for kvm_hv_notify_acked_sint.
+ */
+TRACE_EVENT(kvm_hv_notify_acked_sint,
+	TP_PROTO(int vcpu_id, u32 sint),
+	TP_ARGS(vcpu_id, sint),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, sint)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->sint = sint;
+	),
+
+	TP_printk("vcpu_id %d sint %u", __entry->vcpu_id, __entry->sint)
+);
+
+/*
+ * Tracepoint for synic_set_irq.
+ */
+TRACE_EVENT(kvm_hv_synic_set_irq,
+	TP_PROTO(int vcpu_id, u32 sint, int vector, int ret),
+	TP_ARGS(vcpu_id, sint, vector, ret),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, sint)
+		__field(int, vector)
+		__field(int, ret)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->sint = sint;
+		__entry->vector = vector;
+		__entry->ret = ret;
+	),
+
+	TP_printk("vcpu_id %d sint %u vector %d ret %d",
+		  __entry->vcpu_id, __entry->sint, __entry->vector,
+		  __entry->ret)
+);
+
+/*
+ * Tracepoint for kvm_hv_synic_send_eoi.
+ */
+TRACE_EVENT(kvm_hv_synic_send_eoi,
+	TP_PROTO(int vcpu_id, int vector),
+	TP_ARGS(vcpu_id, vector),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, vector)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->vector	= vector;
+	),
+
+	TP_printk("vcpu_id %d vector %d", __entry->vcpu_id, __entry->vector)
+);
+
+/*
+ * Tracepoint for synic_set_msr.
+ */
+TRACE_EVENT(kvm_hv_synic_set_msr,
+	TP_PROTO(int vcpu_id, u32 msr, u64 data, bool host),
+	TP_ARGS(vcpu_id, msr, data, host),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(u32, msr)
+		__field(u64, data)
+		__field(bool, host)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->msr = msr;
+		__entry->data = data;
+		__entry->host = host;
+	),
+
+	TP_printk("vcpu_id %d msr 0x%x data 0x%llx host %d",
+		  __entry->vcpu_id, __entry->msr, __entry->data, __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_config.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_config,
+	TP_PROTO(int vcpu_id, int timer_index, u64 config, bool host),
+	TP_ARGS(vcpu_id, timer_index, config, host),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, config)
+		__field(bool, host)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->config = config;
+		__entry->host = host;
+	),
+
+	TP_printk("vcpu_id %d timer %d config 0x%llx host %d",
+		  __entry->vcpu_id, __entry->timer_index, __entry->config,
+		  __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_set_count.
+ */
+TRACE_EVENT(kvm_hv_stimer_set_count,
+	TP_PROTO(int vcpu_id, int timer_index, u64 count, bool host),
+	TP_ARGS(vcpu_id, timer_index, count, host),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, count)
+		__field(bool, host)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->count = count;
+		__entry->host = host;
+	),
+
+	TP_printk("vcpu_id %d timer %d count %llu host %d",
+		  __entry->vcpu_id, __entry->timer_index, __entry->count,
+		  __entry->host)
+);
+
+/*
+ * Tracepoint for stimer_start(periodic timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_periodic,
+	TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 exp_time),
+	TP_ARGS(vcpu_id, timer_index, time_now, exp_time),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, time_now)
+		__field(u64, exp_time)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->time_now = time_now;
+		__entry->exp_time = exp_time;
+	),
+
+	TP_printk("vcpu_id %d timer %d time_now %llu exp_time %llu",
+		  __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+		  __entry->exp_time)
+);
+
+/*
+ * Tracepoint for stimer_start(one-shot timer case).
+ */
+TRACE_EVENT(kvm_hv_stimer_start_one_shot,
+	TP_PROTO(int vcpu_id, int timer_index, u64 time_now, u64 count),
+	TP_ARGS(vcpu_id, timer_index, time_now, count),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(u64, time_now)
+		__field(u64, count)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->time_now = time_now;
+		__entry->count = count;
+	),
+
+	TP_printk("vcpu_id %d timer %d time_now %llu count %llu",
+		  __entry->vcpu_id, __entry->timer_index, __entry->time_now,
+		  __entry->count)
+);
+
+/*
+ * Tracepoint for stimer_timer_callback.
+ */
+TRACE_EVENT(kvm_hv_stimer_callback,
+	TP_PROTO(int vcpu_id, int timer_index),
+	TP_ARGS(vcpu_id, timer_index),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+	),
+
+	TP_printk("vcpu_id %d timer %d",
+		  __entry->vcpu_id, __entry->timer_index)
+);
+
+/*
+ * Tracepoint for stimer_expiration.
+ */
+TRACE_EVENT(kvm_hv_stimer_expiration,
+	TP_PROTO(int vcpu_id, int timer_index, int msg_send_result),
+	TP_ARGS(vcpu_id, timer_index, msg_send_result),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+		__field(int, msg_send_result)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+		__entry->msg_send_result = msg_send_result;
+	),
+
+	TP_printk("vcpu_id %d timer %d msg send result %d",
+		  __entry->vcpu_id, __entry->timer_index,
+		  __entry->msg_send_result)
+);
+
+/*
+ * Tracepoint for stimer_cleanup.
+ */
+TRACE_EVENT(kvm_hv_stimer_cleanup,
+	TP_PROTO(int vcpu_id, int timer_index),
+	TP_ARGS(vcpu_id, timer_index),
+
+	TP_STRUCT__entry(
+		__field(int, vcpu_id)
+		__field(int, timer_index)
+	),
+
+	TP_fast_assign(
+		__entry->vcpu_id = vcpu_id;
+		__entry->timer_index = timer_index;
+	),
+
+	TP_printk("vcpu_id %d timer %d",
+		  __entry->vcpu_id, __entry->timer_index)
+);
+
 #endif /* _TRACE_KVM_H */
 
 #undef TRACE_INCLUDE_PATH
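
Note: each TRACE_EVENT(name, ...) above generates a trace_name() inline
taking the TP_PROTO arguments. A sketch of how the Hyper-V timer code is
expected to fire one of them; stimer_send_msg() and the struct fields are
assumptions, only the tracepoint call itself is defined by this hunk:

	static void stimer_expire(struct kvm_vcpu *vcpu,
				  struct kvm_vcpu_hv_stimer *stimer)
	{
		int r = stimer_send_msg(stimer);	/* hypothetical helper */

		trace_kvm_hv_stimer_expiration(vcpu->vcpu_id, stimer->index, r);
	}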
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -19,6 +19,7 @@
 #include "irq.h"
 #include "mmu.h"
 #include "cpuid.h"
+#include "lapic.h"
 
 #include <linux/kvm_host.h>
 #include <linux/module.h>
@@ -404,7 +405,6 @@ struct nested_vmx {
 	/* vmcs02_list cache of VMCSs recently used to run L2 guests */
 	struct list_head vmcs02_pool;
 	int vmcs02_num;
-	u64 vmcs01_tsc_offset;
 	/* L2 must run next, and mustn't decide to exit to L1. */
 	bool nested_run_pending;
 	/*
@@ -858,7 +858,6 @@ static unsigned long nested_ept_get_cr3(struct kvm_vcpu *vcpu);
 static u64 construct_eptp(unsigned long root_hpa);
 static void kvm_cpu_vmxon(u64 addr);
 static void kvm_cpu_vmxoff(void);
-static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu);
 static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
 static void vmx_set_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
@@ -866,7 +865,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
 			    struct kvm_segment *var, int seg);
 static bool guest_state_valid(struct kvm_vcpu *vcpu);
 static u32 vmx_segment_access_rights(struct kvm_segment *var);
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu);
 static void copy_vmcs12_to_shadow(struct vcpu_vmx *vmx);
 static void copy_shadow_to_vmcs12(struct vcpu_vmx *vmx);
 static int alloc_identity_pagetable(struct kvm *kvm);
@@ -1245,10 +1243,10 @@ static inline bool nested_cpu_has_posted_intr(struct vmcs12 *vmcs12)
 	return vmcs12->pin_based_vm_exec_control & PIN_BASED_POSTED_INTR;
 }
 
-static inline bool is_exception(u32 intr_info)
+static inline bool is_nmi(u32 intr_info)
 {
 	return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK))
-		== (INTR_TYPE_HARD_EXCEPTION | INTR_INFO_VALID_MASK);
+		== (INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK);
 }
 
 static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
@@ -2331,7 +2329,9 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
 
 	if (is_guest_mode(vcpu))
 		msr_bitmap = vmx_msr_bitmap_nested;
-	else if (vcpu->arch.apic_base & X2APIC_ENABLE) {
+	else if (cpu_has_secondary_exec_ctrls() &&
+		 (vmcs_read32(SECONDARY_VM_EXEC_CONTROL) &
+		  SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
 		if (is_long_mode(vcpu))
 			msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
 		else
@@ -2404,25 +2404,6 @@ static u64 guest_read_tsc(struct kvm_vcpu *vcpu)
 }
 
 /*
- * Like guest_read_tsc, but always returns L1's notion of the timestamp
- * counter, even if a nested guest (L2) is currently running.
- */
-static u64 vmx_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
-{
-	u64 tsc_offset;
-
-	tsc_offset = is_guest_mode(vcpu) ?
-		to_vmx(vcpu)->nested.vmcs01_tsc_offset :
-		vmcs_read64(TSC_OFFSET);
-	return host_tsc + tsc_offset;
-}
-
-static u64 vmx_read_tsc_offset(struct kvm_vcpu *vcpu)
-{
-	return vmcs_read64(TSC_OFFSET);
-}
-
-/*
  * writes 'offset' into guest's timestamp counter offset register
  */
 static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -2435,7 +2416,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 		 * to the newly set TSC to get L2's TSC.
 		 */
 		struct vmcs12 *vmcs12;
-		to_vmx(vcpu)->nested.vmcs01_tsc_offset = offset;
 		/* recalculate vmcs02.TSC_OFFSET: */
 		vmcs12 = get_vmcs12(vcpu);
 		vmcs_write64(TSC_OFFSET, offset +
@@ -2448,19 +2428,6 @@ static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
 	}
 }
 
-static void vmx_adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, s64 adjustment)
-{
-	u64 offset = vmcs_read64(TSC_OFFSET);
-
-	vmcs_write64(TSC_OFFSET, offset + adjustment);
-	if (is_guest_mode(vcpu)) {
-		/* Even when running L2, the adjustment needs to apply to L1 */
-		to_vmx(vcpu)->nested.vmcs01_tsc_offset += adjustment;
-	} else
-		trace_kvm_write_tsc_offset(vcpu->vcpu_id, offset,
-					   offset + adjustment);
-}
-
 static bool guest_cpuid_has_vmx(struct kvm_vcpu *vcpu)
 {
 	struct kvm_cpuid_entry2 *best = kvm_find_cpuid_entry(vcpu, 1, 0);
@@ -2518,7 +2485,7 @@ static void nested_vmx_setup_ctls_msrs(struct vcpu_vmx *vmx)
 	vmx->nested.nested_vmx_pinbased_ctls_high |=
 		PIN_BASED_ALWAYSON_WITHOUT_TRUE_MSR |
 		PIN_BASED_VMX_PREEMPTION_TIMER;
-	if (vmx_cpu_uses_apicv(&vmx->vcpu))
+	if (kvm_vcpu_apicv_active(&vmx->vcpu))
 		vmx->nested.nested_vmx_pinbased_ctls_high |=
 			PIN_BASED_POSTED_INTR;
 
@@ -4466,11 +4433,6 @@ static void vmx_disable_intercept_msr_write_x2apic(u32 msr)
 			msr, MSR_TYPE_W);
 }
 
-static int vmx_cpu_uses_apicv(struct kvm_vcpu *vcpu)
-{
-	return enable_apicv && lapic_in_kernel(vcpu);
-}
-
 static int vmx_complete_nested_posted_interrupt(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -4555,6 +4517,7 @@ static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
 	}
 	return -1;
 }
+
 /*
  * Send interrupt to vcpu via posted interrupt way.
  * 1. If target vcpu is running(non-root mode), send posted interrupt
@@ -4590,11 +4553,6 @@ static void vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
 	kvm_apic_update_irr(vcpu, vmx->pi_desc.pir);
 }
 
-static void vmx_sync_pir_to_irr_dummy(struct kvm_vcpu *vcpu)
-{
-	return;
-}
-
 /*
  * Set up the vmcs's constant host-state fields, i.e., host-state fields that
  * will not change in the lifetime of the guest.
@@ -4664,11 +4622,36 @@ static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
 {
 	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
 
-	if (!vmx_cpu_uses_apicv(&vmx->vcpu))
+	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
 		pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
 	return pin_based_exec_ctrl;
 }
 
+static bool vmx_get_enable_apicv(void)
+{
+	return enable_apicv;
+}
+
+static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
+{
+	struct vcpu_vmx *vmx = to_vmx(vcpu);
+
+	vmcs_write32(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_ctrl(vmx));
+	if (cpu_has_secondary_exec_ctrls()) {
+		if (kvm_vcpu_apicv_active(vcpu))
+			vmcs_set_bits(SECONDARY_VM_EXEC_CONTROL,
+				      SECONDARY_EXEC_APIC_REGISTER_VIRT |
+				      SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+		else
+			vmcs_clear_bits(SECONDARY_VM_EXEC_CONTROL,
+					SECONDARY_EXEC_APIC_REGISTER_VIRT |
+					SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
+	}
+
+	if (cpu_has_vmx_msr_bitmap())
+		vmx_set_msr_bitmap(vcpu);
+}
+
 static u32 vmx_exec_control(struct vcpu_vmx *vmx)
 {
 	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
@@ -4707,7 +4690,7 @@ static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
 		exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
 	if (!ple_gap)
 		exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
-	if (!vmx_cpu_uses_apicv(&vmx->vcpu))
+	if (!kvm_vcpu_apicv_active(&vmx->vcpu))
 		exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT |
 				  SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
 	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
@@ -4767,7 +4750,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
 		vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
 				vmx_secondary_exec_control(vmx));
 
-	if (vmx_cpu_uses_apicv(&vmx->vcpu)) {
+	if (kvm_vcpu_apicv_active(&vmx->vcpu)) {
 		vmcs_write64(EOI_EXIT_BITMAP0, 0);
 		vmcs_write64(EOI_EXIT_BITMAP1, 0);
 		vmcs_write64(EOI_EXIT_BITMAP2, 0);
@@ -4916,7 +4899,7 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
 
 	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
 
-	if (vmx_cpu_uses_apicv(vcpu))
+	if (kvm_vcpu_apicv_active(vcpu))
 		memset(&vmx->pi_desc, 0, sizeof(struct pi_desc));
 
 	if (vmx->vpid != 0)
@@ -5014,29 +4997,30 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
 {
 	struct vcpu_vmx *vmx = to_vmx(vcpu);
 
-	if (is_guest_mode(vcpu))
-		return;
+	if (!is_guest_mode(vcpu)) {
+		if (!cpu_has_virtual_nmis()) {
+			/*
+			 * Tracking the NMI-blocked state in software is built upon
+			 * finding the next open IRQ window. This, in turn, depends on
+			 * well-behaving guests: They have to keep IRQs disabled at
+			 * least as long as the NMI handler runs. Otherwise we may
+			 * cause NMI nesting, maybe breaking the guest. But as this is
+			 * highly unlikely, we can live with the residual risk.
+			 */
+			vmx->soft_vnmi_blocked = 1;
+			vmx->vnmi_blocked_time = 0;
+		}
 
-	if (!cpu_has_virtual_nmis()) {
-		/*
-		 * Tracking the NMI-blocked state in software is built upon
-		 * finding the next open IRQ window. This, in turn, depends on
-		 * well-behaving guests: They have to keep IRQs disabled at
-		 * least as long as the NMI handler runs. Otherwise we may
-		 * cause NMI nesting, maybe breaking the guest. But as this is
-		 * highly unlikely, we can live with the residual risk.
-		 */
-		vmx->soft_vnmi_blocked = 1;
-		vmx->vnmi_blocked_time = 0;
+		++vcpu->stat.nmi_injections;
+		vmx->nmi_known_unmasked = false;
 	}
 
-	++vcpu->stat.nmi_injections;
-	vmx->nmi_known_unmasked = false;
 	if (vmx->rmode.vm86_active) {
 		if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
 			kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
 		return;
 	}
+
 	vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
 			INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
 }
@@ -5202,7 +5186,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
 	if (is_machine_check(intr_info))
 		return handle_machine_check(vcpu);
 
-	if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
+	if (is_nmi(intr_info))
 		return 1;  /* already handled by vmx_vcpu_run() */
 
 	if (is_no_device(intr_info)) {
@@ -5669,8 +5653,7 @@ static int handle_halt(struct kvm_vcpu *vcpu)
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-	kvm_emulate_hypercall(vcpu);
-	return 1;
+	return kvm_emulate_hypercall(vcpu);
 }
 
 static int handle_invd(struct kvm_vcpu *vcpu)
@@ -6178,15 +6161,6 @@ static __init int hardware_setup(void)
 		kvm_tsc_scaling_ratio_frac_bits = 48;
 	}
 
-	if (enable_apicv)
-		kvm_x86_ops->update_cr8_intercept = NULL;
-	else {
-		kvm_x86_ops->hwapic_irr_update = NULL;
-		kvm_x86_ops->hwapic_isr_update = NULL;
-		kvm_x86_ops->deliver_posted_interrupt = NULL;
-		kvm_x86_ops->sync_pir_to_irr = vmx_sync_pir_to_irr_dummy;
-	}
-
 	vmx_disable_intercept_for_msr(MSR_FS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_GS_BASE, false);
 	vmx_disable_intercept_for_msr(MSR_KERNEL_GS_BASE, true);
@@ -6202,23 +6176,20 @@ static __init int hardware_setup(void)
 
 	set_bit(0, vmx_vpid_bitmap); /* 0 is reserved for host */
 
-	if (enable_apicv) {
-		for (msr = 0x800; msr <= 0x8ff; msr++)
-			vmx_disable_intercept_msr_read_x2apic(msr);
-
-		/* According SDM, in x2apic mode, the whole id reg is used.
-		 * But in KVM, it only use the highest eight bits. Need to
-		 * intercept it */
-		vmx_enable_intercept_msr_read_x2apic(0x802);
-		/* TMCCT */
-		vmx_enable_intercept_msr_read_x2apic(0x839);
-		/* TPR */
-		vmx_disable_intercept_msr_write_x2apic(0x808);
-		/* EOI */
-		vmx_disable_intercept_msr_write_x2apic(0x80b);
-		/* SELF-IPI */
-		vmx_disable_intercept_msr_write_x2apic(0x83f);
-	}
+	for (msr = 0x800; msr <= 0x8ff; msr++)
+		vmx_disable_intercept_msr_read_x2apic(msr);
+
+	/* According to the SDM, in x2apic mode the whole id reg is used.
+	 * But KVM only uses the highest eight bits, so intercept it. */
+	vmx_enable_intercept_msr_read_x2apic(0x802);
+	/* TMCCT */
+	vmx_enable_intercept_msr_read_x2apic(0x839);
+	/* TPR */
+	vmx_disable_intercept_msr_write_x2apic(0x808);
+	/* EOI */
+	vmx_disable_intercept_msr_write_x2apic(0x80b);
+	/* SELF-IPI */
+	vmx_disable_intercept_msr_write_x2apic(0x83f);
 
 	if (enable_ept) {
 		kvm_mmu_set_mask_ptes(0ull,
@@ -7659,7 +7630,7 @@ static bool nested_vmx_exit_handled(struct kvm_vcpu *vcpu)
 
 	switch (exit_reason) {
 	case EXIT_REASON_EXCEPTION_NMI:
-		if (!is_exception(intr_info))
+		if (is_nmi(intr_info))
 			return false;
 		else if (is_page_fault(intr_info))
 			return enable_ept;
@@ -8123,7 +8094,7 @@ static void vmx_set_virtual_x2apic_mode(struct kvm_vcpu *vcpu, bool set)
 	 * apicv
 	 */
 	if (!cpu_has_vmx_virtualize_x2apic_mode() ||
-				!vmx_cpu_uses_apicv(vcpu))
+				!kvm_vcpu_apicv_active(vcpu))
 		return;
 
 	if (!cpu_need_tpr_shadow(vcpu))
@@ -8231,7 +8202,7 @@ static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
 static void vmx_load_eoi_exitmap(struct kvm_vcpu *vcpu)
 {
 	u64 *eoi_exit_bitmap = vcpu->arch.eoi_exit_bitmap;
-	if (!vmx_cpu_uses_apicv(vcpu))
+	if (!kvm_vcpu_apicv_active(vcpu))
 		return;
 
 	vmcs_write64(EOI_EXIT_BITMAP0, eoi_exit_bitmap[0]);
@@ -8256,8 +8227,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
 		kvm_machine_check();
 
 	/* We need to handle NMIs before interrupts are enabled */
-	if ((exit_intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR &&
-	    (exit_intr_info & INTR_INFO_VALID_MASK)) {
+	if (is_nmi(exit_intr_info)) {
 		kvm_before_handle_nmi(&vmx->vcpu);
 		asm("int $2");
 		kvm_after_handle_nmi(&vmx->vcpu);
@@ -9632,9 +9602,9 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
 
 	if (vmcs12->cpu_based_vm_exec_control & CPU_BASED_USE_TSC_OFFSETING)
 		vmcs_write64(TSC_OFFSET,
-			vmx->nested.vmcs01_tsc_offset + vmcs12->tsc_offset);
+			vcpu->arch.tsc_offset + vmcs12->tsc_offset);
 	else
-		vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+		vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
 
 	if (enable_vpid) {
 		/*
@@ -9862,8 +9832,6 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
 
 	enter_guest_mode(vcpu);
 
-	vmx->nested.vmcs01_tsc_offset = vmcs_read64(TSC_OFFSET);
-
 	if (!(vmcs12->vm_entry_controls & VM_ENTRY_LOAD_DEBUG_CONTROLS))
 		vmx->nested.vmcs01_debugctl = vmcs_read64(GUEST_IA32_DEBUGCTL);
 
@@ -10358,7 +10326,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 
 	if ((exit_reason == EXIT_REASON_EXTERNAL_INTERRUPT)
 	    && nested_exit_intr_ack_set(vcpu)) {
-		int irq = kvm_cpu_get_interrupt(vcpu);
+		int irq = kvm_cpu_get_interrupt(vcpu, true);
 		WARN_ON(irq < 0);
 		vmcs12->vm_exit_intr_info = irq |
 			INTR_INFO_VALID_MASK | INTR_TYPE_EXT_INTR;
@@ -10382,7 +10350,7 @@ static void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 exit_reason,
 	load_vmcs12_host_state(vcpu, vmcs12);
 
 	/* Update TSC_OFFSET if TSC was changed while L2 ran */
-	vmcs_write64(TSC_OFFSET, vmx->nested.vmcs01_tsc_offset);
+	vmcs_write64(TSC_OFFSET, vcpu->arch.tsc_offset);
 
 	/* This is needed for same reason as it was needed in prepare_vmcs02 */
 	vmx->host_rsp = 0;
@@ -10770,7 +10738,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
 	.update_cr8_intercept = update_cr8_intercept,
 	.set_virtual_x2apic_mode = vmx_set_virtual_x2apic_mode,
 	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
-	.cpu_uses_apicv = vmx_cpu_uses_apicv,
+	.get_enable_apicv = vmx_get_enable_apicv,
+	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
 	.load_eoi_exitmap = vmx_load_eoi_exitmap,
 	.hwapic_irr_update = vmx_hwapic_irr_update,
 	.hwapic_isr_update = vmx_hwapic_isr_update,
@@ -10794,10 +10763,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
 
 	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
 
-	.read_tsc_offset = vmx_read_tsc_offset,
 	.write_tsc_offset = vmx_write_tsc_offset,
-	.adjust_tsc_offset_guest = vmx_adjust_tsc_offset_guest,
-	.read_l1_tsc = vmx_read_l1_tsc,
 
 	.set_tdp_cr3 = vmx_set_cr3,
 
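
Note: is_nmi() folds the two tests that vmx_complete_atomic_exit() used to
make (interruption type and valid bit) into one mask-and-compare. The
VM-exit interruption information field keeps the vector in bits 7:0, the
type in bits 10:8 and the valid bit in bit 31, so, worked through:

	/*
	 * NMI exit:  intr_info = 0x80000202 (valid | type 2 NMI | vector 2)
	 *   0x80000202 & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VALID_MASK)
	 *   = 0x80000202 & 0x80000700 = 0x80000200
	 *   = INTR_INFO_VALID_MASK | INTR_TYPE_NMI_INTR  -> is_nmi() true
	 *
	 * #PF exit:  intr_info = 0x80000b0e (valid | errcode | type 3 | 14)
	 *   0x80000b0e & 0x80000700 = 0x80000300         -> is_nmi() false
	 */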
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -208,7 +208,18 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
 	struct kvm_shared_msrs *locals
 		= container_of(urn, struct kvm_shared_msrs, urn);
 	struct kvm_shared_msr_values *values;
+	unsigned long flags;
 
+	/*
+	 * Disabling irqs at this point since the following code could be
+	 * interrupted and executed through kvm_arch_hardware_disable()
+	 */
+	local_irq_save(flags);
+	if (locals->registered) {
+		locals->registered = false;
+		user_return_notifier_unregister(urn);
+	}
+	local_irq_restore(flags);
 	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
 		values = &locals->values[slot];
 		if (values->host != values->curr) {
@@ -216,8 +227,6 @@ static void kvm_on_user_return(struct user_return_notifier *urn)
 			values->curr = values->host;
 		}
 	}
-	locals->registered = false;
-	user_return_notifier_unregister(urn);
 }
 
 static void shared_msr_update(unsigned slot, u32 msr)
@@ -968,6 +977,13 @@ static u32 emulated_msrs[] = {
 	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
 	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
 	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
+	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
+	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
+	HV_X64_MSR_RESET,
+	HV_X64_MSR_VP_INDEX,
+	HV_X64_MSR_VP_RUNTIME,
+	HV_X64_MSR_SCONTROL,
+	HV_X64_MSR_STIMER0_CONFIG,
 	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
 	MSR_KVM_PV_EOI_EN,
 
@@ -1340,7 +1356,7 @@ static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu)
 
 static void update_ia32_tsc_adjust_msr(struct kvm_vcpu *vcpu, s64 offset)
 {
-	u64 curr_offset = kvm_x86_ops->read_tsc_offset(vcpu);
+	u64 curr_offset = vcpu->arch.tsc_offset;
 	vcpu->arch.ia32_tsc_adjust_msr += offset - curr_offset;
 }
 
@@ -1382,19 +1398,25 @@ static u64 kvm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
 
 u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc)
 {
-	return kvm_x86_ops->read_l1_tsc(vcpu, kvm_scale_tsc(vcpu, host_tsc));
+	return vcpu->arch.tsc_offset + kvm_scale_tsc(vcpu, host_tsc);
 }
 EXPORT_SYMBOL_GPL(kvm_read_l1_tsc);
 
+static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
+{
+	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	vcpu->arch.tsc_offset = offset;
+}
+
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 {
 	struct kvm *kvm = vcpu->kvm;
 	u64 offset, ns, elapsed;
 	unsigned long flags;
-	s64 usdiff;
 	bool matched;
 	bool already_matched;
 	u64 data = msr->data;
+	bool synchronizing = false;
 
 	raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
 	offset = kvm_compute_tsc_offset(vcpu, data);
@@ -1402,51 +1424,32 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 	elapsed = ns - kvm->arch.last_tsc_nsec;
 
 	if (vcpu->arch.virtual_tsc_khz) {
-		int faulted = 0;
-
-		/* n.b - signed multiplication and division required */
-		usdiff = data - kvm->arch.last_tsc_write;
-#ifdef CONFIG_X86_64
-		usdiff = (usdiff * 1000) / vcpu->arch.virtual_tsc_khz;
-#else
-		/* do_div() only does unsigned */
-		asm("1: idivl %[divisor]\n"
-		    "2: xor %%edx, %%edx\n"
-		    "   movl $0, %[faulted]\n"
-		    "3:\n"
-		    ".section .fixup,\"ax\"\n"
-		    "4: movl $1, %[faulted]\n"
-		    "   jmp  3b\n"
-		    ".previous\n"
-
-		_ASM_EXTABLE(1b, 4b)
-
-		: "=A"(usdiff), [faulted] "=r" (faulted)
-		: "A"(usdiff * 1000), [divisor] "rm"(vcpu->arch.virtual_tsc_khz));
-
-#endif
-		do_div(elapsed, 1000);
-		usdiff -= elapsed;
-		if (usdiff < 0)
-			usdiff = -usdiff;
-
-		/* idivl overflow => difference is larger than USEC_PER_SEC */
-		if (faulted)
-			usdiff = USEC_PER_SEC;
-	} else
-		usdiff = USEC_PER_SEC; /* disable TSC match window below */
+		if ((data == 0) && msr->host_initiated) {
+			/*
+			 * Detection of vcpu initialization -- need to sync with other
+			 * vCPUs; this helps keep kvm_clock stable after CPU hotplug.
+			 */
+			synchronizing = true;
+		} else {
+			u64 tsc_exp = kvm->arch.last_tsc_write +
+						nsec_to_cycles(vcpu, elapsed);
+			u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL;
+			/*
+			 * Special case: TSC write with a small delta (1 second) of virtual
+			 * cycle time against real time is interpreted as an attempt to
+			 * synchronize the CPU.
+			 */
+			synchronizing = data < tsc_exp + tsc_hz && data > tsc_exp - tsc_hz;
+		}
+	}
 
 	/*
-	 * Special case: TSC write with a small delta (1 second) of virtual
-	 * cycle time against real time is interpreted as an attempt to
-	 * synchronize the CPU.
-         *
 	 * For a reliable TSC, we can match TSC offsets, and for an unstable
 	 * TSC, we add elapsed time in this computation.  We could let the
 	 * compensation code attempt to catch up if we fall behind, but
 	 * it's better to try to match offsets from the beginning.
          */
-	if (usdiff < USEC_PER_SEC &&
+	if (synchronizing &&
 	    vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) {
 		if (!check_tsc_unstable()) {
 			offset = kvm->arch.cur_tsc_offset;
@@ -1495,7 +1498,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr)
 
 	if (guest_cpuid_has_tsc_adjust(vcpu) && !msr->host_initiated)
 		update_ia32_tsc_adjust_msr(vcpu, offset);
-	kvm_x86_ops->write_tsc_offset(vcpu, offset);
+	kvm_vcpu_write_tsc_offset(vcpu, offset);
 	raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
 
 	spin_lock(&kvm->arch.pvclock_gtod_sync_lock);
@@ -1514,7 +1517,7 @@ EXPORT_SYMBOL_GPL(kvm_write_tsc);
 static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu,
 					   s64 adjustment)
 {
-	kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+	kvm_vcpu_write_tsc_offset(vcpu, vcpu->arch.tsc_offset + adjustment);
 }
 
 static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
@@ -1522,7 +1525,7 @@ static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment)
 	if (vcpu->arch.tsc_scaling_ratio != kvm_default_tsc_scaling_ratio)
 		WARN_ON(adjustment < 0);
 	adjustment = kvm_scale_tsc(vcpu, (u64) adjustment);
-	kvm_x86_ops->adjust_tsc_offset_guest(vcpu, adjustment);
+	adjust_tsc_offset_guest(vcpu, adjustment);
 }
 
 #ifdef CONFIG_X86_64
@@ -1684,6 +1687,88 @@ static void kvm_gen_update_masterclock(struct kvm *kvm)
 #endif
 }
 
+static u64 __get_kvmclock_ns(struct kvm *kvm)
+{
+	struct kvm_vcpu *vcpu = kvm_get_vcpu(kvm, 0);
+	struct kvm_arch *ka = &kvm->arch;
+	u64 ns;
+
+	if (vcpu->arch.hv_clock.flags & PVCLOCK_TSC_STABLE_BIT) {
+		u64 tsc = kvm_read_l1_tsc(vcpu, native_read_tsc());
+		ns = __pvclock_read_cycles(&vcpu->arch.hv_clock, tsc);
+	} else {
+		ns = ktime_to_ns(ktime_get_boottime()) + ka->kvmclock_offset;
+	}
+
+	return ns;
+}
+
+u64 get_kvmclock_ns(struct kvm *kvm)
+{
+	unsigned long flags;
+	s64 ns;
+
+	local_irq_save(flags);
+	ns = __get_kvmclock_ns(kvm);
+	local_irq_restore(flags);
+
+	return ns;
+}
+
+static void kvm_setup_pvclock_page(struct kvm_vcpu *v)
+{
+	struct kvm_vcpu_arch *vcpu = &v->arch;
+	struct pvclock_vcpu_time_info guest_hv_clock;
+
+	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
+		&guest_hv_clock, sizeof(guest_hv_clock))))
+		return;
+
+	/* This VCPU is paused, but it's legal for a guest to read another
+	 * VCPU's kvmclock, so we really have to follow the specification where
+	 * it says that version is odd if data is being modified, and even after
+	 * it is consistent.
+	 *
+	 * Version field updates must be kept separate.  This is because
+	 * kvm_write_guest_cached might use a "rep movs" instruction, and
+	 * writes within a string instruction are weakly ordered.  So there
+	 * are three writes overall.
+	 *
+	 * As a small optimization, only write the version field in the first
+	 * and third write.  The vcpu->pv_time cache is still valid, because the
+	 * version field is the first in the struct.
+	 */
+	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
+
+	vcpu->hv_clock.version = guest_hv_clock.version + 1;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+
+	smp_wmb();
+
+	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
+	vcpu->hv_clock.flags |= (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
+
+	if (vcpu->pvclock_set_guest_stopped_request) {
+		vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED;
+		vcpu->pvclock_set_guest_stopped_request = false;
+	}
+
+	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
+
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock));
+
+	smp_wmb();
+
+	vcpu->hv_clock.version++;
+	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
+				&vcpu->hv_clock,
+				sizeof(vcpu->hv_clock.version));
+}
+
 static int kvm_guest_time_update(struct kvm_vcpu *v)
 {
 	unsigned long flags, this_tsc_khz, tgt_tsc_khz;
@@ -1691,7 +1776,6 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 	struct kvm_arch *ka = &v->kvm->arch;
 	s64 kernel_ns;
 	u64 tsc_timestamp, host_tsc;
-	struct pvclock_vcpu_time_info guest_hv_clock;
 	u8 pvclock_flags;
 	bool use_master_clock;
 
@@ -1745,8 +1829,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 
 	local_irq_restore(flags);
 
-	if (!vcpu->pv_time_enabled)
-		return 0;
+	/* With all the info we got, fill in the values */
 
 	if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
 		tgt_tsc_khz = kvm_has_tsc_control ?
@@ -1757,64 +1840,21 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
 		vcpu->hw_tsc_khz = this_tsc_khz;
 	}
 
-	/* With all the info we got, fill in the values */
 	vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
 	vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
 	vcpu->last_guest_tsc = tsc_timestamp;
 
-	if (unlikely(kvm_read_guest_cached(v->kvm, &vcpu->pv_time,
-		&guest_hv_clock, sizeof(guest_hv_clock))))
-		return 0;
-
-	/* This VCPU is paused, but it's legal for a guest to read another
-	 * VCPU's kvmclock, so we really have to follow the specification where
-	 * it says that version is odd if data is being modified, and even after
-	 * it is consistent.
-	 *
-	 * Version field updates must be kept separate.  This is because
-	 * kvm_write_guest_cached might use a "rep movs" instruction, and
-	 * writes within a string instruction are weakly ordered.  So there
-	 * are three writes overall.
-	 *
-	 * As a small optimization, only write the version field in the first
-	 * and third write.  The vcpu->pv_time cache is still valid, because the
-	 * version field is the first in the struct.
-	 */
-	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);
-
-	vcpu->hv_clock.version = guest_hv_clock.version + 1;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
-
-	smp_wmb();
-
-	/* retain PVCLOCK_GUEST_STOPPED if set in guest copy */
-	pvclock_flags = (guest_hv_clock.flags & PVCLOCK_GUEST_STOPPED);
-
-	if (vcpu->pvclock_set_guest_stopped_request) {
-		pvclock_flags |= PVCLOCK_GUEST_STOPPED;
-		vcpu->pvclock_set_guest_stopped_request = false;
-	}
-
 	/* If the host uses TSC clocksource, then it is stable */
+	pvclock_flags = 0;
 	if (use_master_clock)
 		pvclock_flags |= PVCLOCK_TSC_STABLE_BIT;
 
 	vcpu->hv_clock.flags = pvclock_flags;
 
-	trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
-
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock));
-
-	smp_wmb();
-
-	vcpu->hv_clock.version++;
-	kvm_write_guest_cached(v->kvm, &vcpu->pv_time,
-				&vcpu->hv_clock,
-				sizeof(vcpu->hv_clock.version));
+	if (vcpu->pv_time_enabled)
+		kvm_setup_pvclock_page(v);
+	if (v == kvm_get_vcpu(v->kvm, 0))
+		kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
 	return 0;
 }
 
@@ -2350,7 +2390,11 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		 */
 		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
-		return kvm_hv_set_msr_common(vcpu, msr, data);
+	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+	case HV_X64_MSR_CRASH_CTL:
+	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
+		return kvm_hv_set_msr_common(vcpu, msr, data,
+					     msr_info->host_initiated);
 	case MSR_IA32_BBL_CR_CTL3:
 		/* Drop writes to this legacy MSR -- see rdmsr
 		 * counterpart for further detail.
@@ -2590,6 +2634,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
 		msr_info->data = 0x20000000;
 		break;
 	case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
+	case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4:
+	case HV_X64_MSR_CRASH_CTL:
+	case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT:
 		return kvm_hv_get_msr_common(vcpu,
 					     msr_info->index, &msr_info->data);
 		break;
@@ -2729,6 +2776,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
 	case KVM_CAP_HYPERV:
 	case KVM_CAP_HYPERV_VAPIC:
 	case KVM_CAP_HYPERV_SPIN:
+	case KVM_CAP_HYPERV_SYNIC:
 	case KVM_CAP_PCI_SEGMENT:
 	case KVM_CAP_DEBUGREGS:
 	case KVM_CAP_X86_ROBUST_SINGLESTEP:
@@ -2912,7 +2960,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 		if (check_tsc_unstable()) {
 			u64 offset = kvm_compute_tsc_offset(vcpu,
 						vcpu->arch.last_guest_tsc);
-			kvm_x86_ops->write_tsc_offset(vcpu, offset);
+			kvm_vcpu_write_tsc_offset(vcpu, offset);
 			vcpu->arch.tsc_catchup = 1;
 		}
 		/*
@@ -2939,7 +2987,8 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
 				    struct kvm_lapic_state *s)
 {
-	kvm_x86_ops->sync_pir_to_irr(vcpu);
+	if (vcpu->arch.apicv_active)
+		kvm_x86_ops->sync_pir_to_irr(vcpu);
 
 	return kvm_apic_get_state(vcpu, s);
 }
@@ -3394,6 +3443,20 @@ static int kvm_set_guest_paused(struct kvm_vcpu *vcpu)
 	return 0;
 }
 
+static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu,
+				     struct kvm_enable_cap *cap)
+{
+	if (cap->flags)
+		return -EINVAL;
+
+	switch (cap->cap) {
+	case KVM_CAP_HYPERV_SYNIC:
+		return kvm_hv_activate_synic(vcpu);
+	default:
+		return -EINVAL;
+	}
+}
+
 long kvm_arch_vcpu_ioctl(struct file *filp,
 			 unsigned int ioctl, unsigned long arg)
 {
@@ -3658,6 +3721,15 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
 		r = kvm_set_guest_paused(vcpu);
 		goto out;
 	}
+	case KVM_ENABLE_CAP: {
+		struct kvm_enable_cap cap;
+
+		r = -EFAULT;
+		if (copy_from_user(&cap, argp, sizeof(cap)))
+			goto out;
+		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
+		break;
+	}
 	default:
 		r = -EINVAL;
 	}
@@ -4157,7 +4229,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
 	case KVM_SET_CLOCK: {
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
-		s64 delta;
 
 		r = -EFAULT;
 		if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
@@ -4169,10 +4240,9 @@ long kvm_arch_vm_ioctl(struct file *filp,
 
 		r = 0;
 		local_irq_disable();
-		now_ns = get_kernel_ns();
-		delta = user_ns.clock - now_ns;
+		now_ns = __get_kvmclock_ns(kvm);
+		kvm->arch.kvmclock_offset += user_ns.clock - now_ns;
 		local_irq_enable();
-		kvm->arch.kvmclock_offset = delta;
 		kvm_gen_update_masterclock(kvm);
 		break;
 	}
@@ -4180,10 +4250,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
 		struct kvm_clock_data user_ns;
 		u64 now_ns;
 
-		local_irq_disable();
-		now_ns = get_kernel_ns();
-		user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
-		local_irq_enable();
+		now_ns = get_kvmclock_ns(kvm);
+		user_ns.clock = now_ns;
 		user_ns.flags = 0;
 		memset(&user_ns.pad, 0, sizeof(user_ns.pad));
 
@@ -6081,6 +6149,12 @@ static void kvm_pv_kick_cpu_op(struct kvm *kvm, unsigned long flags, int apicid)
 	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
 }
 
+void kvm_vcpu_deactivate_apicv(struct kvm_vcpu *vcpu)
+{
+	vcpu->arch.apicv_active = false;
+	kvm_x86_ops->refresh_apicv_exec_ctrl(vcpu);
+}
+
 int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
 {
 	unsigned long nr, a0, a1, a2, a3, ret;
@@ -6174,6 +6248,9 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
 	if (!lapic_in_kernel(vcpu))
 		return;
 
+	if (vcpu->arch.apicv_active)
+		return;
+
 	if (!vcpu->arch.apic->vapic_addr)
 		max_irr = kvm_lapic_find_highest_irr(vcpu);
 	else
@@ -6253,7 +6330,7 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool req_int_win)
 				return r;
 		}
 		if (kvm_x86_ops->interrupt_allowed(vcpu)) {
-			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu),
+			kvm_queue_interrupt(vcpu, kvm_cpu_get_interrupt(vcpu, false),
 					    false);
 			kvm_x86_ops->set_irq(vcpu);
 		}
@@ -6521,11 +6598,16 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
 
 	memset(vcpu->arch.eoi_exit_bitmap, 0, 256 / 8);
 
-	kvm_x86_ops->sync_pir_to_irr(vcpu);
 	if (irqchip_split(vcpu->kvm))
 		kvm_scan_ioapic_routes(vcpu, vcpu->arch.eoi_exit_bitmap);
-	else
+	else {
+		if (vcpu->arch.apicv_active)
+			kvm_x86_ops->sync_pir_to_irr(vcpu);
 		kvm_ioapic_scan_entry(vcpu, vcpu->arch.eoi_exit_bitmap);
+		bitmap_or((ulong *)vcpu->arch.eoi_exit_bitmap,
+			  (ulong *)vcpu->arch.eoi_exit_bitmap,
+			  vcpu_to_synic(vcpu)->vec_bitmap, 256);
+	}
 	kvm_x86_ops->load_eoi_exitmap(vcpu);
 }
 
@@ -6646,6 +6728,32 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 			vcpu_scan_ioapic(vcpu);
 		if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu))
 			kvm_vcpu_reload_apic_access_page(vcpu);
+		if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH;
+			r = 0;
+			goto out;
+		}
+		if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT;
+			vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET;
+			r = 0;
+			goto out;
+		}
+		if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) {
+			vcpu->run->exit_reason = KVM_EXIT_HYPERV;
+			vcpu->run->hyperv = vcpu->arch.hyperv.exit;
+			r = 0;
+			goto out;
+		}
+
+		/*
+		 * KVM_REQ_HV_STIMER has to be processed after
+		 * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers
+		 * depend on the guest clock being up-to-date
+		 */
+		if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu))
+			kvm_hv_process_stimers(vcpu);
 	}
 
 	/*
@@ -6657,7 +6765,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
 		 * Update architecture specific hints for APIC
 		 * virtual interrupt delivery.
 		 */
-		if (kvm_x86_ops->hwapic_irr_update)
+		if (vcpu->arch.apicv_active)
 			kvm_x86_ops->hwapic_irr_update(vcpu,
 				kvm_lapic_find_highest_irr(vcpu));
 	}
@@ -6893,7 +7001,8 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 			break;
 		}
 
-		kvm_check_async_pf_completion(vcpu);
+		if (!is_guest_mode(vcpu))
+			kvm_check_async_pf_completion(vcpu);
 
 		if (signal_pending(current)) {
 			r = -EINTR;
@@ -6903,7 +7012,7 @@ static int vcpu_run(struct kvm_vcpu *vcpu)
 		}
 		if (need_resched()) {
 			srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
-			cond_resched();
+			cond_resched_may_throttle();
 			vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
 		}
 	}
@@ -7637,7 +7746,7 @@ int kvm_arch_hardware_enable(void)
 	 * before any KVM threads can be running.  Unfortunately, we can't
 	 * bring the TSCs fully up to date with real time, as we aren't yet far
 	 * enough into CPU bringup that we know how much real time has actually
-	 * elapsed; our helper function, get_kernel_ns() will be using boot
+	 * elapsed; our helper function, ktime_get_boot_ns() will be using boot
 	 * variables that haven't been updated yet.
 	 *
 	 * So we simply find the maximum observed TSC above, then record the
@@ -7761,6 +7870,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 	BUG_ON(vcpu->kvm == NULL);
 	kvm = vcpu->kvm;
 
+	vcpu->arch.apicv_active = kvm_x86_ops->get_enable_apicv();
 	vcpu->arch.pv.pv_unhalted = false;
 	vcpu->arch.emulate_ctxt.ops = &emulate_ops;
 	if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_reset_bsp(vcpu))
@@ -7820,6 +7930,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 
 	vcpu->arch.pending_external_vector = -1;
 
+	kvm_hv_vcpu_init(vcpu);
+
 	return 0;
 fail_free_wbinvd_dirty_mask:
 	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
@@ -7839,6 +7951,7 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
 {
 	int idx;
 
+	kvm_hv_vcpu_uninit(vcpu);
 	kvm_pmu_destroy(vcpu);
 	kfree(vcpu->arch.mce_banks);
 	kvm_free_lapic(vcpu);
@@ -7876,6 +7989,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type)
 	mutex_init(&kvm->arch.apic_map_lock);
 	spin_lock_init(&kvm->arch.pvclock_gtod_sync_lock);
 
+	kvm->arch.kvmclock_offset = -ktime_to_ns(ktime_get_boottime());
 	pvclock_update_vm_gtod_copy(kvm);
 
 	INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn);
@@ -8233,6 +8347,9 @@ static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
 	    kvm_cpu_has_interrupt(vcpu))
 		return true;
 
+	if (kvm_hv_has_stimer_pending(vcpu))
+		return true;
+
 	return false;
 }
 
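
Note: KVM_CAP_HYPERV_SYNIC is now reported by KVM_CHECK_EXTENSION and
enabled per vcpu through the KVM_ENABLE_CAP path added above. The
userspace side, as a minimal sketch (vcpu_fd is illustrative; flags must
stay zero or kvm_vcpu_ioctl_enable_cap() returns -EINVAL):

	#include <linux/kvm.h>
	#include <string.h>
	#include <sys/ioctl.h>

	int enable_synic(int vcpu_fd)
	{
		struct kvm_enable_cap cap;

		memset(&cap, 0, sizeof(cap));
		cap.cap = KVM_CAP_HYPERV_SYNIC;

		return ioctl(vcpu_fd, KVM_ENABLE_CAP, &cap);
	}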
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -163,6 +163,7 @@ void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
 int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
 
 void kvm_write_tsc(struct kvm_vcpu *vcpu, struct msr_data *msr);
+u64 get_kvmclock_ns(struct kvm *kvm);
 
 int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
 	gva_t addr, void *val, unsigned int bytes,
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -10,6 +10,9 @@ OBJECT_FILES_NON_STANDARD_memmove_64.o		+= y
 OBJECT_FILES_NON_STANDARD_memset_64.o		+= y
 OBJECT_FILES_NON_STANDARD_rwlock.o		+= y
 
+# Produces uninteresting flaky coverage.
+KCOV_INSTRUMENT_delay.o	:= n
+
 inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
 inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
 quiet_cmd_inat_tables = GEN     $@
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -53,6 +53,8 @@
 .Lmemcpy_e_e:
 	.previous
 
+.weak memcpy
+
 ENTRY(__memcpy)
 ENTRY(memcpy)
 	CFI_STARTPROC
@@ -199,8 +201,8 @@ ENDPROC(__memcpy)
 	 * only outcome...
 	 */
 	.section .altinstructions, "a"
-	altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
+	altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
 			     .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
-	altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
+	altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
 			     .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
 	.previous
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -24,7 +24,10 @@
  * Output:
  * rax: dest
  */
+.weak memmove
+
 ENTRY(memmove)
+ENTRY(__memmove)
 	CFI_STARTPROC
 
 	/* Handle more 32 bytes in loop */
@@ -220,4 +223,5 @@ ENTRY(memmove)
 		.Lmemmove_end_forward-.Lmemmove_begin_forward,	\
 		.Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
 	.previous
+ENDPROC(__memmove)
 ENDPROC(memmove)
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -56,6 +56,8 @@
 .Lmemset_e_e:
 	.previous
 
+.weak memset
+
 ENTRY(memset)
 ENTRY(__memset)
 	CFI_STARTPROC
@@ -147,8 +149,8 @@ ENDPROC(__memset)
          * feature to implement the right patch order.
 	 */
 	.section .altinstructions,"a"
-	altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
-			     .Lfinal-memset,.Lmemset_e-.Lmemset_c
-	altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
-			     .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
+	altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
+			     .Lfinal-__memset,.Lmemset_e-.Lmemset_c
+	altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
+			     .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
 	.previous
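
The .weak markings added in the three string routines above follow the usual
sanitizer pattern: KASAN provides instrumented C implementations of
memcpy()/memset()/memmove(), and making the assembly versions weak lets those
strong definitions win at link time, while the __-prefixed aliases stay
available for code that must not be instrumented. A minimal user-space sketch
of the same linker rule (hypothetical demo symbol; GCC/Clang attribute syntax):

	#include <stdio.h>

	/* Weak default: a strong definition of the same symbol in any
	 * other object file silently overrides this one at link time. */
	__attribute__((weak)) void do_copy(void)
	{
		puts("weak default");
	}

	int main(void)
	{
		/* prints "weak default" unless a strong do_copy() is linked in */
		do_copy();
		return 0;
	}
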
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -1,3 +1,6 @@
+# Kernel does not boot with instrumentation of tlb.c.
+KCOV_INSTRUMENT_tlb.o	:= n
+
 obj-y	:=  init.o init_$(BITS).o fault.o ioremap.o extable.o pageattr.o mmap.o \
 	    pat.o pgtable.o physaddr.o gup.o setup_nx.o
 
@@ -20,6 +23,9 @@ obj-$(CONFIG_HIGHMEM)		+= highmem_32.o
 
 obj-$(CONFIG_KMEMCHECK)		+= kmemcheck/
 
+KASAN_SANITIZE_kasan_init_$(BITS).o := n
+obj-$(CONFIG_KASAN)		+= kasan_init_$(BITS).o
+
 obj-$(CONFIG_MMIOTRACE)		+= mmiotrace.o
 mmiotrace-y			:= kmmio.o pf_in.o mmio-mod.o
 obj-$(CONFIG_MMIOTRACE_TEST)	+= testmmiotrace.o
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -716,14 +716,14 @@ show_signal_msg(struct pt_regs *regs, unsigned long error_code,
 	if (!printk_ratelimit())
 		return;
 
-	printk("%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
+	ve_printk(VE_LOG, "%s%s[%d]: segfault at %lx ip %p sp %p error %lx",
 		task_pid_nr(tsk) > 1 ? KERN_INFO : KERN_EMERG,
 		tsk->comm, task_pid_nr(tsk), address,
 		(void *)regs->ip, (void *)regs->sp, error_code);
 
-	print_vma_addr(KERN_CONT " in ", regs->ip);
+	ve_print_vma_addr(VE_LOG, KERN_CONT " in ", regs->ip);
 
-	printk(KERN_CONT "\n");
+	ve_printk(VE_LOG, KERN_CONT "\n");
 }
 
 static void
@@ -953,7 +953,7 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	return ret;
 }
 
-int show_unhandled_signals = 1;
+int show_unhandled_signals = 0;
 
 static inline int
 access_error(unsigned long error_code, struct vm_area_struct *vma)
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -7,6 +7,7 @@
 #include <linux/init.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
+#include <linux/ratelimit.h>
 #include <linux/hugetlb.h>
 #include <linux/pagemap.h>
 #include <linux/err.h>
@@ -146,7 +147,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 	if (mm->get_unmapped_area == arch_get_unmapped_area)
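
The vm_start_gap() checks introduced above fold the stack guard gap into a
VMA's start for VM_GROWSDOWN mappings, so address probes reject ranges that
would butt up against a growing stack. A simplified sketch of the helper's
semantics (an assumption mirroring the stock stack-guard-gap fix;
stack_guard_gap is the global gap size in bytes):

	/* sketch, not necessarily the exact in-tree helper */
	static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
	{
		unsigned long vm_start = vma->vm_start;

		if (vma->vm_flags & VM_GROWSDOWN) {
			vm_start -= stack_guard_gap;
			if (vm_start > vma->vm_start)	/* underflow check */
				vm_start = 0;
		}
		return vm_start;
	}
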
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -13,6 +13,7 @@
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/mmiotrace.h>
+#include <linux/ratelimit.h>
 
 #include <asm/cacheflush.h>
 #include <asm/e820.h>
--- /dev/null
+++ b/arch/x86/mm/kasan_init_64.c
@@ -0,0 +1,243 @@
+#define pr_fmt(fmt) "kasan: " fmt
+#include <linux/bootmem.h>
+#include <linux/kasan.h>
+#include <linux/kdebug.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+
+#include <asm/tlbflush.h>
+#include <asm/sections.h>
+
+extern pgd_t early_level4_pgt[PTRS_PER_PGD];
+extern struct range pfn_mapped[E820_X_MAX];
+
+static pud_t kasan_zero_pud[PTRS_PER_PUD] __page_aligned_bss;
+static pmd_t kasan_zero_pmd[PTRS_PER_PMD] __page_aligned_bss;
+static pte_t kasan_zero_pte[PTRS_PER_PTE] __page_aligned_bss;
+
+/*
+ * This page is used as the early shadow. We don't use empty_zero_page
+ * at early stages, since stack instrumentation could write some garbage
+ * to this page.
+ * Later we reuse it as the zero shadow for large ranges of memory
+ * that are allowed to be accessed but are not instrumented by kasan
+ * (vmalloc/vmemmap ...).
+ */
+static unsigned char kasan_zero_page[PAGE_SIZE] __page_aligned_bss;
+
+static int __init map_range(struct range *range)
+{
+	unsigned long start;
+	unsigned long end;
+
+	start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start));
+	end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end));
+
+	/*
+	 * end + 1 here is intentional. We check several shadow bytes in advance
+	 * to slightly speed up the fast path. In some rare cases we could cross
+	 * the boundary of the mapped shadow, so we just map some more here.
+	 */
+	return vmemmap_populate(start, end + 1, pfn_to_nid(range->start));
+}
+
+static void __init clear_pgds(unsigned long start,
+			unsigned long end)
+{
+	for (; start < end; start += PGDIR_SIZE)
+		pgd_clear(pgd_offset_k(start));
+}
+
+static void __init kasan_map_early_shadow(pgd_t *pgd)
+{
+	int i;
+	unsigned long start = KASAN_SHADOW_START;
+	unsigned long end = KASAN_SHADOW_END;
+
+	for (i = pgd_index(start); start < end; i++) {
+		pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud)
+				| _KERNPG_TABLE);
+		start += PGDIR_SIZE;
+	}
+}
+
+static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr,
+				unsigned long end)
+{
+	pte_t *pte = pte_offset_kernel(pmd, addr);
+
+	while (addr + PAGE_SIZE <= end) {
+		WARN_ON(!pte_none(*pte));
+		set_pte(pte, __pte(__pa_nodebug(kasan_zero_page)
+					| __PAGE_KERNEL_RO));
+		addr += PAGE_SIZE;
+		pte = pte_offset_kernel(pmd, addr);
+	}
+	return 0;
+}
+
+static int __init zero_pmd_populate(pud_t *pud, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pmd_t *pmd = pmd_offset(pud, addr);
+
+	while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) {
+		WARN_ON(!pmd_none(*pmd));
+		set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte)
+					| _KERNPG_TABLE));
+		addr += PMD_SIZE;
+		pmd = pmd_offset(pud, addr);
+	}
+	if (addr < end) {
+		if (pmd_none(*pmd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, 0);
+			if (!p)
+				return -ENOMEM;
+			set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pte_populate(pmd, addr, end);
+	}
+	return ret;
+}
+
+
+static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr,
+				unsigned long end)
+{
+	int ret = 0;
+	pud_t *pud = pud_offset(pgd, addr);
+
+	while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) {
+		WARN_ON(!pud_none(*pud));
+		set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd)
+					| _KERNPG_TABLE));
+		addr += PUD_SIZE;
+		pud = pud_offset(pgd, addr);
+	}
+
+	if (addr < end) {
+		if (pud_none(*pud)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, 0);
+			if (!p)
+				return -ENOMEM;
+			set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pmd_populate(pud, addr, end);
+	}
+	return ret;
+}
+
+static int __init zero_pgd_populate(unsigned long addr, unsigned long end)
+{
+	int ret = 0;
+	pgd_t *pgd = pgd_offset_k(addr);
+
+	while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) {
+		WARN_ON(!pgd_none(*pgd));
+		set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud)
+					| _KERNPG_TABLE));
+		addr += PGDIR_SIZE;
+		pgd = pgd_offset_k(addr);
+	}
+
+	if (addr < end) {
+		if (pgd_none(*pgd)) {
+			void *p = vmemmap_alloc_block(PAGE_SIZE, 0);
+			if (!p)
+				return -ENOMEM;
+			set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE));
+		}
+		ret = zero_pud_populate(pgd, addr, end);
+	}
+	return ret;
+}
+
+
+static void __init populate_zero_shadow(const void *start, const void *end)
+{
+	if (zero_pgd_populate((unsigned long)start, (unsigned long)end))
+		panic("kasan: unable to map zero shadow!");
+}
+
+
+#ifdef CONFIG_KASAN_INLINE
+static int kasan_die_handler(struct notifier_block *self,
+			     unsigned long val,
+			     void *data)
+{
+	if (val == DIE_GPF) {
+		pr_emerg("CONFIG_KASAN_INLINE enabled\n");
+		pr_emerg("GPF could be caused by NULL-ptr deref or user memory access\n");
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block kasan_die_notifier = {
+	.notifier_call = kasan_die_handler,
+};
+#endif
+
+void __init kasan_early_init(void)
+{
+	int i;
+	pteval_t pte_val = __pa_nodebug(kasan_zero_page) | __PAGE_KERNEL;
+	pmdval_t pmd_val = __pa_nodebug(kasan_zero_pte) | _KERNPG_TABLE;
+	pudval_t pud_val = __pa_nodebug(kasan_zero_pmd) | _KERNPG_TABLE;
+
+	for (i = 0; i < PTRS_PER_PTE; i++)
+		kasan_zero_pte[i] = __pte(pte_val);
+
+	for (i = 0; i < PTRS_PER_PMD; i++)
+		kasan_zero_pmd[i] = __pmd(pmd_val);
+
+	for (i = 0; i < PTRS_PER_PUD; i++)
+		kasan_zero_pud[i] = __pud(pud_val);
+
+	kasan_map_early_shadow(early_level4_pgt);
+	kasan_map_early_shadow(init_level4_pgt);
+}
+
+void __init kasan_init(void)
+{
+	int i;
+
+#ifdef CONFIG_KASAN_INLINE
+	register_die_notifier(&kasan_die_notifier);
+#endif
+
+	memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt));
+	load_cr3(early_level4_pgt);
+	__flush_tlb_all();
+
+	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+
+	populate_zero_shadow((void *)KASAN_SHADOW_START,
+			kasan_mem_to_shadow((void *)PAGE_OFFSET));
+
+	for (i = 0; i < E820_X_MAX; i++) {
+		if (pfn_mapped[i].end == 0)
+			break;
+
+		if (map_range(&pfn_mapped[i]))
+			panic("kasan: unable to allocate shadow!");
+	}
+	populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM),
+			kasan_mem_to_shadow((void *)__START_KERNEL_map));
+
+	vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext),
+			(unsigned long)kasan_mem_to_shadow(_end),
+			0);
+
+	populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END),
+			(void *)KASAN_SHADOW_END);
+
+	memset(kasan_zero_page, 0, PAGE_SIZE);
+
+	load_cr3(init_level4_pgt);
+	__flush_tlb_all();
+	init_task.kasan_depth = 0;
+
+	pr_info("Kernel address sanitizer initialized\n");
+}
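
The shadow region that kasan_init() wires up uses the standard 1:8 scaling:
one shadow byte describes eight bytes of kernel address space, so the address
translation is a shift plus a fixed offset. A hedged sketch of that mapping
(the scale shift of 3 is the usual x86-64 value; KASAN_SHADOW_OFFSET is
configuration-dependent):

	/* sketch: kernel address -> address of its shadow byte */
	static inline void *mem_to_shadow(const void *addr)
	{
		/* one shadow byte covers 8 bytes of address space */
		return (void *)(((unsigned long)addr >> 3) + KASAN_SHADOW_OFFSET);
	}
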
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -6,7 +6,7 @@
 #include <asm/fixmap.h>
 #include <asm/mtrr.h>
 
-#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
+#define PGALLOC_GFP (GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO)
 
 #ifdef CONFIG_HIGHPTE
 #define PGALLOC_USER_GFP __GFP_HIGHMEM
@@ -18,7 +18,7 @@ gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
 
 pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
 {
-	return (pte_t *)__get_free_page(PGALLOC_GFP);
+	return (pte_t *)__get_free_page(PGALLOC_GFP & ~__GFP_ACCOUNT);
 }
 
 pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
@@ -78,8 +78,10 @@ void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd)
 #if PAGETABLE_LEVELS > 3
 void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud)
 {
+	struct page *page = virt_to_page(pud);
+
 	paravirt_release_pud(__pa(pud) >> PAGE_SHIFT);
-	tlb_remove_page(tlb, virt_to_page(pud));
+	tlb_remove_page(tlb, page);
 }
 #endif	/* PAGETABLE_LEVELS > 3 */
 #endif	/* PAGETABLE_LEVELS > 2 */
@@ -202,13 +204,17 @@ static void free_pmds(pmd_t *pmds[])
 		}
 }
 
-static int preallocate_pmds(pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 	bool failed = false;
+	gfp_t gfp = PGALLOC_GFP;
+
+	if (mm == &init_mm)
+		gfp &= ~__GFP_ACCOUNT;
 
 	for(i = 0; i < PREALLOCATED_PMDS; i++) {
-		pmd_t *pmd = (pmd_t *)__get_free_page(PGALLOC_GFP);
+		pmd_t *pmd = (pmd_t *)__get_free_page(gfp);
 		if (!pmd)
 			failed = true;
 		if (pmd && !pgtable_pmd_page_ctor(virt_to_page(pmd))) {
@@ -286,7 +292,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	mm->pgd = pgd;
 
-	if (preallocate_pmds(pmds) != 0)
+	if (preallocate_pmds(mm, pmds) != 0)
 		goto out_free_pgd;
 
 	if (paravirt_pgd_alloc(mm) != 0)
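
The pgtable.c hunks above charge user page tables to the caller's memory
cgroup via GFP_KERNEL_ACCOUNT, while kernel-owned tables
(pte_alloc_one_kernel() and anything allocated for init_mm) strip the
accounting flag back out. A small sketch of that decision, assuming the masks
defined above:

	/* sketch: pick the gfp mask for a page-table allocation */
	static gfp_t pgtable_gfp(struct mm_struct *mm)
	{
		gfp_t gfp = PGALLOC_GFP;	/* includes __GFP_ACCOUNT */

		if (mm == &init_mm)		/* kernel tables: no memcg charge */
			gfp &= ~__GFP_ACCOUNT;
		return gfp;
	}
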
--- a/arch/x86/realmode/Makefile
+++ b/arch/x86/realmode/Makefile
@@ -7,7 +7,7 @@
 #
 #
 OBJECT_FILES_NON_STANDARD	:= y
-
+KASAN_SANITIZE := n
 subdir- := rm
 
 obj-y += init.o
--- a/arch/x86/realmode/rm/Makefile
+++ b/arch/x86/realmode/rm/Makefile
@@ -7,6 +7,10 @@
 #
 #
 OBJECT_FILES_NON_STANDARD	:= y
+KASAN_SANITIZE := n
+
+# Prevents link failures: __sanitizer_cov_trace_pc() is not linked in.
+KCOV_INSTRUMENT		:= n
 
 always := realmode.bin realmode.relocs
 
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
 110	i386	iopl			sys_iopl
 111	i386	vhangup			sys_vhangup
 112	i386	idle
-113	i386	vm86old			sys_vm86old			sys32_vm86_warning
+113	i386	vm86old			sys_vm86old			sys_ni_syscall
 114	i386	wait4			sys_wait4			compat_sys_wait4
 115	i386	swapoff			sys_swapoff
 116	i386	sysinfo			sys_sysinfo			compat_sys_sysinfo
@@ -172,7 +172,7 @@
 163	i386	mremap			sys_mremap
 164	i386	setresuid		sys_setresuid16
 165	i386	getresuid		sys_getresuid16
-166	i386	vm86			sys_vm86			sys32_vm86_warning
+166	i386	vm86			sys_vm86			sys_ni_syscall
 167	i386	query_module
 168	i386	poll			sys_poll
 169	i386	nfsservctl
@@ -361,3 +361,8 @@
 352	i386	sched_getattr		sys_sched_getattr
 356	i386	memfd_create		sys_memfd_create
 374	i386	userfaultfd		sys_userfaultfd
+
+510	i386	getluid			sys_getluid
+511	i386	setluid			sys_setluid
+512	i386	setublimit		sys_setublimit			compat_sys_setublimit
+513	i386	ubstat			sys_ubstat			compat_sys_ubstat
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -327,6 +327,11 @@
 320	common	kexec_file_load		sys_kexec_file_load
 323	common	userfaultfd		sys_userfaultfd
 
+500	64	getluid			sys_getluid
+501	64	setluid			sys_setluid
+502	64	setublimit		sys_setublimit
+503	64	ubstat			sys_ubstat
+
 #
 # x32-specific system call numbers start at 512 to avoid cache impact
 # for native 64-bit operation.
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -184,7 +184,7 @@ $(obj)/vdso32-syms.lds: $(vdso32.so-y:%=$(obj)/vdso32-%-syms.lds) FORCE
 # The DSO images are built using a special linker script.
 #
 quiet_cmd_vdso = VDSO    $@
-      cmd_vdso = $(CC) -nostdlib -o $@ \
+      cmd_vdso = $(CC) $(call cc-option, -fno-use-linker-plugin) -nostdlib -o $@ \
 		       $(VDSO_LDFLAGS) $(VDSO_LDFLAGS_$(filter %.lds,$(^F))) \
 		       -Wl,-T,$(filter %.lds,$^) $(filter %.o,$^) \
 		$(if $(AFTER_LINK),; $(AFTER_LINK)) && \
@@ -192,6 +192,8 @@ quiet_cmd_vdso = VDSO    $@
 
 VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv)
 GCOV_PROFILE := n
+KASAN_SANITIZE := n
+KCOV_INSTRUMENT := n
 
 #
 # Install the unstripped copy of vdso*.so listed in $(vdso-install-y).
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -68,51 +68,47 @@ static notrace const struct pvclock_vsyscall_time_info *get_pvti(int cpu)
 	return &pvti_base[offset];
 }
 
-static notrace cycle_t vread_pvclock(int *mode)
+static notrace u64 vread_pvclock(int *mode)
 {
-	const struct pvclock_vsyscall_time_info *pvti;
-	cycle_t ret;
+	const struct pvclock_vcpu_time_info *pvti = &get_pvti(0)->pvti;
+	u64 ret;
 	u64 last;
 	u32 version;
-	u8 flags;
-	unsigned cpu, cpu1;
-
 
 	/*
-	 * Note: hypervisor must guarantee that:
-	 * 1. cpu ID number maps 1:1 to per-CPU pvclock time info.
-	 * 2. that per-CPU pvclock time info is updated if the
-	 *    underlying CPU changes.
-	 * 3. that version is increased whenever underlying CPU
-	 *    changes.
+	 * Note: The kernel and hypervisor must guarantee that cpu ID
+	 * number maps 1:1 to per-CPU pvclock time info.
+	 *
+	 * Because the hypervisor is entirely unaware of guest userspace
+	 * preemption, it cannot guarantee that per-CPU pvclock time
+	 * info is updated if the underlying CPU changes or that that
+	 * version is increased whenever underlying CPU changes.
+	 *
+	 * On KVM, we are guaranteed that pvti updates for any vCPU are
+	 * atomic as seen by *all* vCPUs.  This is an even stronger
+	 * guarantee than we get with a normal seqlock.
+	 *
+	 * On Xen, we don't appear to have that guarantee, but Xen still
+	 * supplies a valid seqlock using the version field.
 	 *
+	 * We only do pvclock vdso timing at all if
+	 * PVCLOCK_TSC_STABLE_BIT is set, and we interpret that bit to
+	 * mean that all vCPUs have matching pvti and that the TSC is
+	 * synced, so we can just look at vCPU 0's pvti.
 	 */
+
 	do {
-		cpu = __getcpu() & VGETCPU_CPU_MASK;
-		/* TODO: We can put vcpu id into higher bits of pvti.version.
-		 * This will save a couple of cycles by getting rid of
-		 * __getcpu() calls (Gleb).
-		 */
-
-		pvti = get_pvti(cpu);
-
-		version = __pvclock_read_cycles(&pvti->pvti, &ret, &flags);
-
-		/*
-		 * Test we're still on the cpu as well as the version.
-		 * We could have been migrated just after the first
-		 * vgetcpu but before fetching the version, so we
-		 * wouldn't notice a version change.
-		 */
-		cpu1 = __getcpu() & VGETCPU_CPU_MASK;
-	} while (unlikely(cpu != cpu1 ||
-			  (pvti->pvti.version & 1) ||
-			  pvti->pvti.version != version));
-
-	if (unlikely(!(flags & PVCLOCK_TSC_STABLE_BIT)))
-		*mode = VCLOCK_NONE;
-
-	/* refer to tsc.c read_tsc() comment for rationale */
+		version = pvclock_read_begin(pvti);
+
+		if (unlikely(!(pvti->flags & PVCLOCK_TSC_STABLE_BIT))) {
+			*mode = VCLOCK_NONE;
+			return 0;
+		}
+
+		ret = __pvclock_read_cycles(pvti, rdtsc_ordered());
+	} while (pvclock_read_retry(pvti, version));
+
+	/* refer to vread_tsc() comment for rationale */
 	last = VVAR(vsyscall_gtod_data).clock.cycle_last;
 
 	if (likely(ret >= last))
@@ -231,12 +227,14 @@ notrace int __vdso_clock_gettime(clockid_t clock, struct timespec *ts)
 		ret = do_realtime(ts);
 		break;
 	case CLOCK_MONOTONIC:
-		ret = do_monotonic(ts);
+		if (gtod->gettime_monotonic_enabled)
+			ret = do_monotonic(ts);
 		break;
 	case CLOCK_REALTIME_COARSE:
 		return do_realtime_coarse(ts);
 	case CLOCK_MONOTONIC_COARSE:
-		return do_monotonic_coarse(ts);
+		if (gtod->gettime_monotonic_enabled)
+			return do_monotonic_coarse(ts);
 	}
 
 	if (ret == VCLOCK_NONE)
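
The pvclock_read_begin()/pvclock_read_retry() pair used above is in effect a
lock-free seqcount read against the hypervisor-updated pvti page: sample an
even version, read the payload, and retry if the version moved. A simplified
sketch of the protocol, assuming the usual pvclock ABI in which the hypervisor
keeps `version` odd while an update is in flight:

	static u32 read_begin(const struct pvclock_vcpu_time_info *src)
	{
		u32 version = READ_ONCE(src->version) & ~1;	/* odd == update in flight */

		smp_rmb();	/* read version before the payload */
		return version;
	}

	static bool read_retry(const struct pvclock_vcpu_time_info *src, u32 version)
	{
		smp_rmb();	/* read payload before the re-check */
		return unlikely(READ_ONCE(src->version) != version);
	}
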
--- a/arch/x86/vdso/vdso-note.S
+++ b/arch/x86/vdso/vdso-note.S
@@ -7,6 +7,8 @@
 #include <linux/version.h>
 #include <linux/elfnote.h>
 
+	.globl VDSO64_linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+VDSO64_linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -26,6 +26,10 @@
 #include <asm/vdso.h>
 #include <asm/proto.h>
 
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
 enum {
 	VDSO_DISABLED = 0,
 	VDSO_ENABLED = 1,
@@ -303,6 +307,146 @@ int __init sysenter_setup(void)
 	return 0;
 }
 
+static DEFINE_MUTEX(vdso32_mutex);
+
+static struct page **uts_prep_vdso_pages_locked(int map)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct mm_struct *mm = current->mm;
+	struct ve_struct *ve = get_exec_env();
+	struct page **pages = vdso32_pages;
+	int n1, n2, n3, new_version;
+	struct page **new_pages, **p;
+	void *addr;
+
+	/*
+	 * Simply reuse vDSO pages if we can.
+	 */
+	if (uts_ns == &init_uts_ns)
+		return vdso32_pages;
+
+	/*
+	 * Dirty lockless hack. Strictly speaking
+	 * we need to return @p here if it's non-NULL,
+	 * but since there is only one transition possible
+	 * { =0 ; !=0 } we simply return @uts_ns->vdso32.pages
+	 */
+	p = ACCESS_ONCE(uts_ns->vdso32.pages);
+	smp_read_barrier_depends();
+	if (p)
+		return uts_ns->vdso32.pages;
+
+	up_write(&mm->mmap_sem);
+
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+		/*
+		 * If the version has not changed, simply reuse
+		 * the preallocated one.
+		 */
+		new_version = KERNEL_VERSION(n1, n2, n3);
+		if (new_version == LINUX_VERSION_CODE)
+			goto out;
+#ifdef CONFIG_X86_32
+		else {
+			/*
+			 * Native x86-32 mode requires vDSO runtime
+			 * relocations to be applied, which old vanilla
+			 * kernels do not support; moreover, even if
+			 * ported, we would break compatibility with
+			 * the rhel5 vDSO, which has hardcoded addresses.
+			 * Thus simply warn about this problem and
+			 * continue execution without virtualization.
+			 * After all, i686 is pretty outdated nowadays.
+			 */
+			pr_warn_once("x86-32 vDSO virtualization is not supported.");
+			goto out;
+		}
+#endif
+	} else {
+		/*
+		 * If the admin passed a malformed release string
+		 * here, warn once but continue working without
+		 * vDSO virtualization at all. That is better than
+		 * bailing out with an error.
+		 */
+		pr_warn_once("Wrong release uts name format detected."
+			     " Ignoring vDSO virtualization.\n");
+		goto out;
+	}
+
+	mutex_lock(&vdso32_mutex);
+	if (uts_ns->vdso32.pages) {
+		pages = uts_ns->vdso32.pages;
+		goto out_unlock;
+	}
+
+	uts_ns->vdso32.nr_pages		= 1;
+	uts_ns->vdso32.size		= PAGE_SIZE;
+	uts_ns->vdso32.version_off	= (unsigned long)VDSO32_SYMBOL(0, linux_version_code);
+	new_pages			= kmalloc(sizeof(struct page *), GFP_KERNEL);
+	if (!new_pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		pages = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	new_pages[0] = alloc_page(GFP_KERNEL);
+	if (!new_pages[0]) {
+		pr_err("Can't allocate page for VE %d\n", ve->veid);
+		kfree(new_pages);
+		pages = ERR_PTR(-ENOMEM);
+		goto out_unlock;
+	}
+
+	copy_page(page_address(new_pages[0]), page_address(vdso32_pages[0]));
+
+	addr = page_address(new_pages[0]);
+	*((int *)(addr + uts_ns->vdso32.version_off)) = new_version;
+	smp_wmb();
+
+	pages = uts_ns->vdso32.pages = new_pages;
+
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+
+out_unlock:
+	mutex_unlock(&vdso32_mutex);
+out:
+	down_write(&mm->mmap_sem);
+	return pages;
+}
+
+/* Call under mm->mmap_sem */
+static int __arch_setup_additional_pages(unsigned long addr, bool compat)
+{
+	struct mm_struct *mm = current->mm;
+	int ret;
+
+	current->mm->context.vdso = (void *)addr;
+
+	if (compat_uses_vma || !compat) {
+		struct page **pages = uts_prep_vdso_pages_locked(compat);
+		if (IS_ERR(pages))
+			return PTR_ERR(pages);
+
+		/*
+		 * MAYWRITE to allow gdb to COW and set breakpoints
+		 */
+		ret = install_special_mapping(mm, addr, PAGE_SIZE,
+					      VM_READ|VM_EXEC|
+					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
+					      pages);
+
+		if (ret)
+			return ret;
+	}
+
+	current_thread_info()->sysenter_return =
+		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
+
+	return 0;
+}
+
 /* Setup a VMA at program startup for the vsyscall page */
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
@@ -337,35 +481,73 @@ int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 		}
 	}
 
-	current->mm->context.vdso = (void *)addr;
+	ret = __arch_setup_additional_pages(addr, compat);
+	if (ret)
+		current->mm->context.vdso = NULL;
 
-	if (compat_uses_vma || !compat) {
-		/*
-		 * MAYWRITE to allow gdb to COW and set breakpoints
-		 */
-		ret = install_special_mapping(mm, addr, PAGE_SIZE,
-					      VM_READ|VM_EXEC|
-					      VM_MAYREAD|VM_MAYWRITE|VM_MAYEXEC,
-					      vdso32_pages);
+up_fail:
 
-		if (ret)
-			goto up_fail;
+	up_write(&mm->mmap_sem);
+
+	return ret;
+}
+
+#ifdef CONFIG_X86_64
+
+int do_map_compat_vdso(unsigned long req_addr)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long vdso_addr;
+	struct vm_area_struct *vdso_vma;
+	int ret;
+	bool compat;
+
+	if (vdso_enabled == VDSO_DISABLED)
+		return -ENOENT;
+
+	down_write(&mm->mmap_sem);
+
+	compat = (vdso_enabled == VDSO_COMPAT);
+	/* Maybe we could omit this check, but let it stay for safety */
+	if (compat && req_addr != VDSO_HIGH_BASE) {
+		ret = -EFAULT;
+		goto up_fail;
 	}
 
-	current_thread_info()->sysenter_return =
-		VDSO32_SYMBOL(addr, SYSENTER_RETURN);
+	/* We don't want to duplicate security checks like security_mmap_addr() */
+	vdso_addr = get_unmapped_area(NULL, req_addr, PAGE_SIZE, 0, 0);
+	if (IS_ERR_VALUE(vdso_addr)) {
+		ret = vdso_addr;
+		goto up_fail;
+	}
 
-  up_fail:
+	if (req_addr != vdso_addr) {
+		ret = -EFAULT;
+		goto up_fail;
+	}
+
+	/*
+	 * First, unmap the old vDSO: install_special_mapping() may not
+	 * get rlimit/cgroup accounting right, so get rid of the old one
+	 * via remove_vma().
+	 */
+	vdso_vma = find_vma_intersection(mm, (unsigned long)mm->context.vdso,
+			(unsigned long)mm->context.vdso +
+			PAGE_SIZE*init_uts_ns.vdso.nr_pages);
+	if (vdso_vma)
+		do_munmap(mm, vdso_vma->vm_start,
+			vdso_vma->vm_end - vdso_vma->vm_start);
+
+	ret = __arch_setup_additional_pages(req_addr, compat);
 	if (ret)
 		current->mm->context.vdso = NULL;
 
+up_fail:
 	up_write(&mm->mmap_sem);
 
 	return ret;
 }
 
-#ifdef CONFIG_X86_64
-
 subsys_initcall(sysenter_setup);
 
 #ifdef CONFIG_SYSCTL
--- a/arch/x86/vdso/vdso32/note.S
+++ b/arch/x86/vdso/vdso32/note.S
@@ -9,7 +9,9 @@
 /* Ideally this would use UTS_NAME, but using a quoted string here
    doesn't work. Remember to change this when changing the
    kernel's name. */
+	.globl linux_version_code
 ELFNOTE_START(Linux, 0, "a")
+linux_version_code:
 	.long LINUX_VERSION_CODE
 ELFNOTE_END
 
--- a/arch/x86/vdso/vdso32/vdso32.lds.S
+++ b/arch/x86/vdso/vdso32/vdso32.lds.S
@@ -35,3 +35,4 @@ VDSO32_PRELINK		= VDSO_PRELINK;
 VDSO32_vsyscall		= __kernel_vsyscall;
 VDSO32_sigreturn	= __kernel_sigreturn;
 VDSO32_rt_sigreturn	= __kernel_rt_sigreturn;
+VDSO32_linux_version_code = linux_version_code;
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -16,6 +16,10 @@
 #include <asm/vdso.h>
 #include <asm/page.h>
 
+#include <linux/utsname.h>
+#include <linux/version.h>
+#include <linux/ve.h>
+
 unsigned int __read_mostly vdso_enabled = 1;
 
 extern char vdso_start[], vdso_end[];
@@ -111,6 +115,12 @@ static int __init init_vdso(void)
 		vdsox32_pages[i] = virt_to_page(vdsox32_start + i*PAGE_SIZE);
 #endif
 
+	init_uts_ns.vdso.addr		= vdso_start;
+	init_uts_ns.vdso.pages		= vdso_pages;
+	init_uts_ns.vdso.nr_pages	= npages;
+	init_uts_ns.vdso.size		= vdso_size;
+	init_uts_ns.vdso.version_off	= (unsigned long)VDSO64_SYMBOL(0, linux_version_code);
+
 	return 0;
 }
 subsys_initcall(init_vdso);
@@ -199,10 +209,112 @@ up_fail:
 	return ret;
 }
 
+static DEFINE_MUTEX(vdso_mutex);
+
+static int uts_arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
+{
+	struct uts_namespace *uts_ns = current->nsproxy->uts_ns;
+	struct ve_struct *ve = get_exec_env();
+	int i, n1, n2, n3, new_version;
+	struct page **new_pages, **p;
+
+	/*
+	 * For the node, or in case the UTS release has not changed,
+	 * simply map the preallocated original vDSO.
+	 *
+	 * In turn, if we have already allocated one for this UTS
+	 * namespace, simply reuse it. This improves speed significantly.
+	 */
+	if (uts_ns == &init_uts_ns)
+		goto map_init_uts;
+	/*
+	 * Dirty lockless hack. Strictly speaking
+	 * we need to return @p here if it's non-NULL,
+	 * but since there is only one transition possible
+	 * { =0 ; !=0 } we simply return @uts_ns->vdso.pages
+	 */
+	p = ACCESS_ONCE(uts_ns->vdso.pages);
+	smp_read_barrier_depends();
+	if (p)
+		goto map_uts;
+
+	if (sscanf(uts_ns->name.release, "%d.%d.%d", &n1, &n2, &n3) == 3) {
+		/*
+		 * If the version has not changed, simply reuse
+		 * the preallocated one.
+		 */
+		new_version = KERNEL_VERSION(n1, n2, n3);
+		if (new_version == LINUX_VERSION_CODE)
+			goto map_init_uts;
+	} else {
+		/*
+		 * If the admin passed a malformed release string
+		 * here, warn once but continue working without
+		 * vDSO virtualization at all. That is better than
+		 * bailing out with an error.
+		 */
+		pr_warn_once("Wrong release uts name format detected."
+			     " Ignoring vDSO virtualization.\n");
+		goto map_init_uts;
+	}
+
+	mutex_lock(&vdso_mutex);
+	if (uts_ns->vdso.pages) {
+		mutex_unlock(&vdso_mutex);
+		goto map_uts;
+	}
+
+	uts_ns->vdso.nr_pages	= init_uts_ns.vdso.nr_pages;
+	uts_ns->vdso.size	= init_uts_ns.vdso.size;
+	uts_ns->vdso.version_off = init_uts_ns.vdso.version_off;
+	new_pages		= kmalloc(sizeof(struct page *) * init_uts_ns.vdso.nr_pages, GFP_KERNEL);
+	if (!new_pages) {
+		pr_err("Can't allocate vDSO pages array for VE %d\n", ve->veid);
+		goto out_unlock;
+	}
+
+	for (i = 0; i < uts_ns->vdso.nr_pages; i++) {
+		struct page *p = alloc_page(GFP_KERNEL);
+		if (!p) {
+			pr_err("Can't allocate page for VE %d\n", ve->veid);
+			for (; i > 0; i--)
+				put_page(new_pages[i - 1]);
+			kfree(new_pages);
+			goto out_unlock;
+		}
+		new_pages[i] = p;
+		copy_page(page_address(p), page_address(init_uts_ns.vdso.pages[i]));
+	}
+
+	uts_ns->vdso.addr = vmap(new_pages, uts_ns->vdso.nr_pages, 0, PAGE_KERNEL);
+	if (!uts_ns->vdso.addr) {
+		pr_err("Can't map vDSO pages for VE %d\n", ve->veid);
+		for (i = 0; i < uts_ns->vdso.nr_pages; i++)
+			put_page(new_pages[i]);
+		kfree(new_pages);
+		goto out_unlock;
+	}
+
+	*((int *)(uts_ns->vdso.addr + uts_ns->vdso.version_off)) = new_version;
+	smp_wmb();
+	uts_ns->vdso.pages = new_pages;
+	mutex_unlock(&vdso_mutex);
+
+	pr_debug("vDSO version transition %d -> %d for VE %d\n",
+		 LINUX_VERSION_CODE, new_version, ve->veid);
+
+map_uts:
+	return setup_additional_pages(bprm, uses_interp, uts_ns->vdso.pages, uts_ns->vdso.size);
+map_init_uts:
+	return setup_additional_pages(bprm, uses_interp, init_uts_ns.vdso.pages, init_uts_ns.vdso.size);
+out_unlock:
+	mutex_unlock(&vdso_mutex);
+	return -ENOMEM;
+}
+
 int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp)
 {
-	return setup_additional_pages(bprm, uses_interp, vdso_pages,
-				      vdso_size);
+	return uts_arch_setup_additional_pages(bprm, uses_interp);
 }
 
 #ifdef CONFIG_X86_X32_ABI
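
Both the 32-bit and the 64-bit paths above patch a single word,
linux_version_code, inside the per-namespace copy of the vDSO. The value
written is the stock KERNEL_VERSION() packing of the namespace's release
string; a standalone demo of that transformation (hypothetical example
program, release string chosen for illustration):

	#include <stdio.h>

	#define KERNEL_VERSION(a, b, c)	(((a) << 16) + ((b) << 8) + (c))

	int main(void)
	{
		const char *release = "2.6.32";	/* e.g. uts_ns->name.release */
		int n1, n2, n3;

		if (sscanf(release, "%d.%d.%d", &n1, &n2, &n3) == 3)
			printf("linux_version_code = %#x\n",
			       KERNEL_VERSION(n1, n2, n3));
		return 0;
	}
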
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -48,6 +48,7 @@
 #include <linux/memblock.h>
 #include <linux/seq_file.h>
 #include <linux/crash_dump.h>
+#include <linux/ratelimit.h>
 
 #include <trace/events/xen.h>
 
--- a/arch/xtensa/kernel/syscall.c
+++ b/arch/xtensa/kernel/syscall.c
@@ -86,7 +86,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		/* At this point:  (!vmm || addr < vmm->vm_end). */
 		if (TASK_SIZE - len < addr)
 			return -ENOMEM;
-		if (!vmm || addr + len <= vmm->vm_start)
+		if (!vmm || addr + len <= vm_start_gap(vmm))
 			return addr;
 		addr = vmm->vm_end;
 		if (flags & MAP_SHARED)
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -118,6 +118,14 @@ source "block/partitions/Kconfig"
 
 endmenu
 
+config BLK_DEV_CBT
+	bool "Block layer changed block tracking support"
+	---help---
+	Block layer changed block tracking support. It can be used to
+	optimize device backup and copy.
+
+	If unsure, say N.
+
 endif # BLOCK
 
 config BLOCK_COMPAT
--- a/block/Makefile
+++ b/block/Makefile
@@ -20,3 +20,4 @@ obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
+obj-$(CONFIG_BLK_DEV_CBT)	+= blk-cbt.o
--- /dev/null
+++ b/block/blk-cbt.c
@@ -0,0 +1,833 @@
+/*
+ *  block/blk-cbt.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/string.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/cpu.h>
+#include <linux/cpuset.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/pagemap.h>
+#include <linux/vmalloc.h>
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+
+#define CBT_MAX_EXTENTS	(UINT_MAX / sizeof(struct blk_user_cbt_extent))
+#define NR_PAGES(bits) (((bits) + PAGE_SIZE*8 - 1) / (PAGE_SIZE*8))
+#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
+
+#define CBT_PAGE_MISSED (struct page *)(0x1)
+#define CBT_PAGE(cbt, idx) ((cbt)->map[idx] == CBT_PAGE_MISSED ? \
+			    NULL : (cbt)->map[idx])
+
+static __cacheline_aligned_in_smp DEFINE_MUTEX(cbt_mutex);
+
+struct cbt_extent {
+	blkcnt_t start;
+	blkcnt_t len;
+};
+
+struct cbt_info {
+	__u8 	 uuid[16];
+	struct request_queue *queue;
+	blkcnt_t block_max;
+	blkcnt_t block_bits;
+	unsigned long flags;
+
+	struct rcu_head rcu;
+	unsigned int count;
+	struct cbt_extent __percpu *cache;
+	struct page **map;
+	spinlock_t lock;
+};
+
+enum CBT_FLAGS {
+	CBT_ERROR = 0,
+	CBT_DEAD  = 1,
+	CBT_NOCACHE  = 2,
+	CBT_NOCACHE = 2,
+static void cbt_release_callback(struct rcu_head *head);
+static void cbt_flush_cache(struct cbt_info *cbt);
+
+static inline void spin_lock_page(struct page *page)
+{
+	while (!trylock_page(page))
+		cpu_relax();
+}
+
+static void set_bits(void *bm, int cur, int len, bool is_set)
+{
+	__u32 *addr;
+	__u32 pattern = is_set ? 0xffffffff : 0;
+
+	len = cur + len;
+	while (cur < len) {
+		if ((cur & 31) == 0 && (len - cur) >= 32) {
+			/* fast path: set whole word at once */
+			addr = bm + (cur >> 3);
+
+			*addr = pattern;
+			cur += 32;
+			continue;
+		}
+		if (is_set)
+			set_bit(cur, bm);
+		else
+			clear_bit(cur, bm);
+		cur++;
+	}
+}
+
+/*
+ * Return values:
+ * 0 if OK,
+ * -EAGAIN if cbt was updated,
+ * -EBADF if cbt is dead,
+ * -ENOMEM if alloc_page failed.
+ */
+static int cbt_page_alloc(struct cbt_info **cbt_pp, unsigned long idx,
+			  int in_rcu)
+{
+	struct cbt_info	 *cbt = *cbt_pp;
+	struct page *page;
+
+	/* Page not allocated yet. Synchronization required */
+	spin_lock_irq(&cbt->lock);
+	if (likely(!test_bit(CBT_DEAD, &cbt->flags))) {
+		cbt->count++;
+	} else {
+		struct cbt_info *new = rcu_dereference(cbt->queue->cbt);
+
+		spin_unlock_irq(&cbt->lock);
+		/* was cbt updated ? */
+		if (new != cbt) {
+			*cbt_pp = new;
+			return -EAGAIN;
+		} else {
+			return -EBADF;
+		}
+	}
+	spin_unlock_irq(&cbt->lock);
+	if (in_rcu)
+		rcu_read_unlock();
+	page = alloc_page(GFP_NOIO|__GFP_ZERO);
+	if (in_rcu)
+		rcu_read_lock();
+	spin_lock_irq(&cbt->lock);
+	if (unlikely(!--cbt->count && test_bit(CBT_DEAD, &cbt->flags))) {
+		spin_unlock_irq(&cbt->lock);
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+		if (page)
+			__free_page(page);
+		return -EBADF;
+	}
+	if (unlikely(!page)) {
+		set_bit(CBT_ERROR, &cbt->flags);
+		spin_unlock_irq(&cbt->lock);
+		return -ENOMEM;
+	}
+
+	if (likely(CBT_PAGE(cbt, idx) == NULL))
+		cbt->map[idx] = page;
+	else
+		__free_page(page);
+
+	page = NULL;
+	spin_unlock_irq(&cbt->lock);
+
+	return 0;
+}
+
+static int __blk_cbt_set(struct cbt_info *cbt, blkcnt_t block,
+			 blkcnt_t count, bool in_rcu, bool set,
+			 unsigned long *pages_missed,
+			 unsigned long *idx_first)
+{
+	struct page *page;
+
+	if (unlikely(block + count > cbt->block_max)) {
+		printk("WARN: %s eof access block:%lld, len: %lld, max:%lld\n",
+		       __func__, (unsigned long long) block,
+		       (unsigned long long)count,
+		       (unsigned long long)cbt->block_max);
+		set_bit(CBT_ERROR, &cbt->flags);
+		return -EINVAL;
+	}
+
+	while (count) {
+		unsigned long idx = block >> (PAGE_SHIFT + 3);
+		unsigned long off = block & (BITS_PER_PAGE -1);
+		unsigned long len = min_t(unsigned long, BITS_PER_PAGE - off,
+					  count);
+		int ret;
+
+		page = CBT_PAGE(cbt, idx);
+		if (page) {
+			spin_lock_page(page);
+			set_bits(page_address(page), off, len, set);
+			unlock_page(page);
+			count -= len;
+			block += len;
+			continue;
+		} else if (pages_missed) {
+			(*pages_missed)++;
+			if (!*idx_first)
+				*idx_first = idx;
+			cbt->map[idx] = CBT_PAGE_MISSED;
+			count -= len;
+			block += len;
+			continue;
+		} else {
+			if (!set) {
+				/* Nothing to do */
+				count -= len;
+				block += len;
+				continue;
+			}
+		}
+
+		ret = cbt_page_alloc(&cbt, idx, in_rcu);
+		if (ret == -EAGAIN) /* new cbt */
+			continue;
+		else if (ret == -EBADF) /* dead cbt */
+			break;
+		else if (ret)
+			return ret;
+	}
+	return (pages_missed && *pages_missed) ? -EAGAIN : 0;
+}
+
+static void blk_cbt_add(struct request_queue *q, blkcnt_t start, blkcnt_t len)
+{
+	struct cbt_info *cbt;
+	struct cbt_extent *ex;
+	struct cbt_extent old;
+	blkcnt_t end;
+	/* Check per-cpu cache */
+
+	rcu_read_lock();
+	cbt = rcu_dereference(q->cbt);
+	if (unlikely(!cbt))
+		goto out_rcu;
+
+	if (unlikely(test_bit(CBT_ERROR, &cbt->flags)))
+		goto out_rcu;
+	end = (start + len + (1 << cbt->block_bits) -1) >> cbt->block_bits;
+	start >>= cbt->block_bits;
+	len = end - start;
+	if (unlikely(test_bit(CBT_NOCACHE, &cbt->flags))) {
+		__blk_cbt_set(cbt, start, len, 1, 1, NULL, NULL);
+		goto out_rcu;
+	}
+	ex = this_cpu_ptr(cbt->cache);
+	if (ex->start + ex->len == start) {
+		ex->len += len;
+		goto out_rcu;
+	}
+	old = *ex;
+	ex->start = start;
+	ex->len = len;
+
+	if (likely(old.len))
+		__blk_cbt_set(cbt, old.start, old.len, 1, 1, NULL, NULL);
+out_rcu:
+	rcu_read_unlock();
+}
+
+inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	if (!q->cbt || bio_data_dir(bio) == READ || !bio->bi_size)
+		return;
+
+	blk_cbt_add(q, bio->bi_sector << 9, bio->bi_size);
+}
+
+static struct cbt_info *do_cbt_alloc(struct request_queue *q, __u8 *uuid,
+				     loff_t size, loff_t blocksize)
+{
+	struct cbt_info *cbt;
+	struct cbt_extent *ex;
+	int i;
+
+	cbt = kzalloc(sizeof(*cbt), GFP_KERNEL);
+	if (!cbt)
+		return ERR_PTR(-ENOMEM);
+
+	cbt->block_bits = ilog2(blocksize);
+	cbt->block_max  = (size + blocksize - 1) >> cbt->block_bits;
+	spin_lock_init(&cbt->lock);
+	memcpy(cbt->uuid, uuid, sizeof(cbt->uuid));
+	cbt->cache = alloc_percpu(struct cbt_extent);
+	if (!cbt->cache)
+		goto err_cbt;
+
+	for_each_possible_cpu(i) {
+		ex = per_cpu_ptr(cbt->cache, i);
+		memset(ex, 0, sizeof (*ex));
+	}
+
+	cbt->map = vmalloc(NR_PAGES(cbt->block_max) * sizeof(void*));
+	if (!cbt->map)
+		goto err_pcpu;
+
+	memset(cbt->map, 0, NR_PAGES(cbt->block_max) * sizeof(void*));
+	cbt->queue = q;
+	return cbt;
+err_pcpu:
+	free_percpu(cbt->cache);
+err_cbt:
+	kfree(cbt);
+	return ERR_PTR(-ENOMEM);
+}
+
+int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
+			  struct page ***map_ptr, blkcnt_t *block_max,
+			  blkcnt_t *block_bits)
+{
+	struct cbt_info *cbt;
+	struct page **map;
+	unsigned long npages;
+	unsigned long i;
+
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -ENOENT;
+	}
+
+	BUG_ON(!cbt->map);
+	BUG_ON(!cbt->block_max);
+
+	if (!uuid || memcmp(uuid, cbt->uuid, sizeof(cbt->uuid))) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+
+	cbt_flush_cache(cbt);
+
+	npages = NR_PAGES(cbt->block_max);
+	map = vmalloc(npages * sizeof(void*));
+	if (!map)
+		goto fail;
+
+	memset(map, 0, npages * sizeof(void*));
+
+	for (i = 0; i < npages; i++) {
+		struct page *page = cbt->map[i];
+
+		BUG_ON(page == CBT_PAGE_MISSED);
+
+		if (page) {
+			map[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+			if (!map[i])
+				goto fail_pages;
+
+			spin_lock_page(page);
+			memcpy(page_address(map[i]), page_address(page),
+			       PAGE_SIZE);
+			memset(page_address(page), 0, PAGE_SIZE);
+			unlock_page(page);
+		}
+	}
+	mutex_unlock(&cbt_mutex);
+
+	*map_ptr = map;
+	*block_max = cbt->block_max;
+	*block_bits = cbt->block_bits;
+	return 0;
+
+fail_pages:
+	while (i-- > 0) {
+		if (map[i])
+			__free_page(map[i]);
+	}
+fail:
+	vfree(map);
+	mutex_unlock(&cbt_mutex);
+	return -ENOMEM;
+}
+EXPORT_SYMBOL(blk_cbt_map_copy_once);
+
+static void blk_cbt_page_merge(struct page *pg_from, struct page *pg_to)
+{
+	u32 *from = page_address(pg_from);
+	u32 *to = page_address(pg_to);
+	u32 *fin = to + PAGE_SIZE/sizeof(*to);
+
+	while (to < fin) {
+		*to |= *from;
+		to++;
+		from++;
+	}
+}
+
+int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
+		      struct page **map, blkcnt_t block_max,
+		      blkcnt_t block_bits)
+{
+	struct cbt_info *cbt;
+	unsigned long i;
+
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -ENOENT;
+	}
+
+	BUG_ON(!cbt->map);
+	BUG_ON(!cbt->block_max);
+
+	if (!map || !uuid || memcmp(uuid, cbt->uuid, sizeof(cbt->uuid)) ||
+	    block_max != cbt->block_max || block_bits != cbt->block_bits) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < NR_PAGES(cbt->block_max); i++) {
+		struct page *page_main = cbt->map[i];
+		struct page *page_addon = map[i];
+
+		BUG_ON(page_main == CBT_PAGE_MISSED);
+		BUG_ON(page_addon == CBT_PAGE_MISSED);
+
+		if (!page_addon)
+			continue;
+
+		if (!page_main) {
+			int ret = cbt_page_alloc(&cbt, i, 0);
+			if (ret) {
+				mutex_unlock(&cbt_mutex);
+				return ret;
+			}
+			page_main = cbt->map[i];
+			BUG_ON(page_main == NULL);
+			BUG_ON(page_main == CBT_PAGE_MISSED);
+		}
+
+		spin_lock_page(page_main);
+		blk_cbt_page_merge(page_addon, page_main);
+		unlock_page(page_main);
+	}
+	mutex_unlock(&cbt_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(blk_cbt_map_merge);
+
+void blk_cbt_update_size(struct block_device *bdev)
+{
+	struct request_queue *q;
+	struct cbt_info *new, *cbt;
+	unsigned long to_cpy, idx;
+	unsigned bsz;
+	loff_t new_sz = i_size_read(bdev->bd_inode);
+	int in_use = 0;
+
+	if (!bdev->bd_disk || !bdev_get_queue(bdev))
+		return;
+
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return;
+	}
+	bsz = 1 << cbt->block_bits;
+	if ((new_sz + bsz - 1) >> cbt->block_bits <= cbt->block_max)
+		goto err_mtx;
+
+	new = do_cbt_alloc(q, cbt->uuid, new_sz, bsz);
+	if (IS_ERR(new)) {
+		set_bit(CBT_ERROR, &cbt->flags);
+		goto err_mtx;
+	}
+	to_cpy = NR_PAGES(cbt->block_max);
+	set_bit(CBT_NOCACHE, &cbt->flags);
+	cbt_flush_cache(cbt);
+	spin_lock_irq(&cbt->lock);
+	set_bit(CBT_DEAD, &cbt->flags);
+	for (idx = 0; idx < to_cpy; idx++){
+		new->map[idx] = cbt->map[idx];
+		if (CBT_PAGE(new, idx))
+			get_page(CBT_PAGE(new, idx));
+	}
+	rcu_assign_pointer(q->cbt, new);
+	in_use = cbt->count;
+	spin_unlock_irq(&cbt->lock);
+	if (!in_use)
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+err_mtx:
+	mutex_unlock(&cbt_mutex);
+}
+
+static int cbt_ioc_init(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+	struct request_queue *q;
+	struct blk_user_cbt_info ci;
+	struct cbt_info *cbt;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+
+	if (((ci.ci_blksize -1) & ci.ci_blksize))
+		return -EINVAL;
+
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	if (q->cbt) {
+		ret = -EBUSY;
+		goto err_mtx;
+	}
+	cbt = do_cbt_alloc(q, ci.ci_uuid, i_size_read(bdev->bd_inode), ci.ci_blksize);
+	if (IS_ERR(cbt))
+		ret = PTR_ERR(cbt);
+	else
+		rcu_assign_pointer(q->cbt, cbt);
+err_mtx:
+	mutex_unlock(&cbt_mutex);
+	return ret;
+}
+
+static void cbt_release_callback(struct rcu_head *head)
+{
+	struct cbt_info *cbt;
+	int nr_pages, i;
+
+	cbt = container_of(head, struct cbt_info, rcu);
+	nr_pages = NR_PAGES(cbt->block_max);
+	for (i = 0; i < nr_pages; i++)
+		if (CBT_PAGE(cbt, i))
+			__free_page(CBT_PAGE(cbt, i));
+
+	vfree(cbt->map);
+	free_percpu(cbt->cache);
+	kfree(cbt);
+}
+
+void blk_cbt_release(struct request_queue *q)
+{
+	struct cbt_info *cbt;
+	int in_use = 0;
+
+	cbt = q->cbt;
+	if (!cbt)
+		return;
+	spin_lock(&cbt->lock);
+	set_bit(CBT_DEAD, &cbt->flags);
+	rcu_assign_pointer(q->cbt, NULL);
+	in_use = cbt->count;
+	spin_unlock(&cbt->lock);
+	if (!in_use)
+		call_rcu(&cbt->rcu, &cbt_release_callback);
+}
+
+static int cbt_ioc_stop(struct block_device *bdev)
+{
+	struct request_queue *q;
+
+	mutex_lock(&cbt_mutex);
+	q = bdev_get_queue(bdev);
+	if (!q->cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	blk_cbt_release(q);
+	mutex_unlock(&cbt_mutex);
+	return 0;
+}
+
+struct flush_ctx {
+	struct cbt_info *cbt;
+	unsigned long pages_missed;
+	unsigned long idx_first;
+};
+
+static inline void __cbt_flush_cpu_cache(void *ptr)
+{
+	struct flush_ctx *ctx = (struct flush_ctx *)ptr;
+	struct cbt_info *cbt = ctx->cbt;
+	struct cbt_extent *ex = this_cpu_ptr(cbt->cache);
+
+	if (ex->len) {
+		int ret = __blk_cbt_set(cbt, ex->start, ex->len, 0, 1,
+					&ctx->pages_missed,
+					&ctx->idx_first);
+		if (!ret) {
+			ex->start += ex->len;
+			ex->len = 0;
+		}
+	}
+}
+
+static void cbt_flush_cache(struct cbt_info *cbt)
+{
+	for (;;) {
+		struct flush_ctx ctx;
+		unsigned long i;
+try_again:
+		ctx.cbt = cbt;
+		ctx.pages_missed = 0;
+		ctx.idx_first = 0;
+
+		on_each_cpu(__cbt_flush_cpu_cache, &ctx, 1);
+
+		if (likely(!ctx.pages_missed))
+			return;
+
+		for (i = ctx.idx_first; i < NR_PAGES(cbt->block_max); i++) {
+			int ret;
+
+			if (cbt->map[i] != CBT_PAGE_MISSED)
+				continue;
+
+			ret = cbt_page_alloc(&cbt, i, 0);
+			if (ret == -EAGAIN) /* new cbt */
+				goto try_again;
+			else if (ret) /* dead cbt or alloc_page failed */
+				return;
+
+			/* cbt_page_alloc succeeded ... */
+			if (!--ctx.pages_missed)
+				break;
+		}
+	}
+}
+
+static void cbt_find_next_extent(struct cbt_info *cbt, blkcnt_t block, struct cbt_extent *ex)
+{
+	unsigned long off, off2, idx;
+	struct page *page;
+	bool found = 0;
+
+	ex->start = cbt->block_max;
+	ex->len = 0;
+
+	idx = block >> (PAGE_SHIFT + 3);
+	while (block < cbt->block_max) {
+		off = block & (BITS_PER_PAGE -1);
+		page = CBT_PAGE(cbt, idx);
+		if (!page) {
+			if (found)
+				break;
+			goto next;
+		}
+		spin_lock_page(page);
+		/* Find extent start */
+		if (!found) {
+			ex->start = find_next_bit(page_address(page), BITS_PER_PAGE, off);
+			if (ex->start != BITS_PER_PAGE) {
+				off = ex->start;
+				ex->start += idx << (PAGE_SHIFT + 3);
+				found = 1;
+			} else {
+				unlock_page(page);
+				goto next;
+			}
+		}
+		if (found) {
+			off2 = find_next_zero_bit(page_address(page), BITS_PER_PAGE, off);
+			ex->len += off2 - off;
+			if (off2 != BITS_PER_PAGE) {
+				unlock_page(page);
+				break;
+			}
+		}
+		unlock_page(page);
+	next:
+		idx++;
+		block = idx << (PAGE_SHIFT + 3);
+		continue;
+	}
+}
+
+static int cbt_ioc_get(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc)
+{
+	struct request_queue *q;
+	struct blk_user_cbt_info ci;
+	struct blk_user_cbt_extent __user *cur_u_ex;
+	struct blk_user_cbt_extent u_ex;
+	struct cbt_info *cbt;
+	struct cbt_extent ex;
+	blkcnt_t block, end;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+	if (ci.ci_flags & ~CI_FLAG_ONCE)
+		return -EINVAL;
+	if (ci.ci_extent_count > CBT_MAX_EXTENTS)
+		return -EINVAL;
+
+	cur_u_ex = (struct blk_user_cbt_extent __user*)
+		((char *)ucbt_ioc + sizeof(struct blk_user_cbt_info));
+
+	if (ci.ci_extent_count != 0 &&
+	    !access_ok(VERIFY_WRITE, cur_u_ex,
+		       ci.ci_extent_count * sizeof(struct blk_user_cbt_extent))){
+		return -EFAULT;
+	}
+	q = bdev_get_queue(bdev);
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	if ((ci.ci_start >> cbt->block_bits) > cbt->block_max) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	if (test_bit(CBT_ERROR, &cbt->flags)) {
+		mutex_unlock(&cbt_mutex);
+		return -EIO;
+	}
+	cbt_flush_cache(cbt);
+
+	memcpy(&ci.ci_uuid, cbt->uuid, sizeof(cbt->uuid));
+	ci.ci_blksize = 1UL << cbt->block_bits;
+	block = ci.ci_start >> cbt->block_bits;
+	end = (ci.ci_start + ci.ci_length) >> cbt->block_bits;
+	if (end > cbt->block_max)
+		end = cbt->block_max;
+
+	memset(&u_ex, 0, sizeof(u_ex));
+	while (ci.ci_mapped_extents < ci.ci_extent_count) {
+		cbt_find_next_extent(cbt, block, &ex);
+		if (!ex.len || ex.start > end) {
+			ret = 0;
+			break;
+		}
+		u_ex.ce_physical = ex.start << cbt->block_bits;
+		u_ex.ce_length = ex.len << cbt->block_bits;
+		if (copy_to_user(cur_u_ex, &u_ex, sizeof(u_ex))) {
+			ret = -EFAULT;
+			break;
+		}
+		if (ci.ci_flags & CI_FLAG_ONCE)
+			__blk_cbt_set(cbt, ex.start, ex.len, 0, 0, NULL, NULL);
+		cur_u_ex++;
+		ci.ci_mapped_extents++;
+		block = ex.start + ex.len;
+	}
+	mutex_unlock(&cbt_mutex);
+	if (!ret && copy_to_user(ucbt_ioc, &ci, sizeof(ci)))
+		ret = -EFAULT;
+
+	return ret;
+}
+
+static int cbt_ioc_set(struct block_device *bdev, struct blk_user_cbt_info __user *ucbt_ioc, bool set)
+{
+	struct request_queue *q = bdev_get_queue(bdev);
+	struct cbt_info *cbt;
+	struct blk_user_cbt_info ci;
+	struct blk_user_cbt_extent __user u_ex, *cur_u_ex, *end;
+	int ret = 0;
+
+	if (copy_from_user(&ci, ucbt_ioc, sizeof(ci)))
+		return -EFAULT;
+	if (ci.ci_extent_count > CBT_MAX_EXTENTS)
+		return -EINVAL;
+	if (ci.ci_extent_count < ci.ci_mapped_extents)
+		return -EINVAL;
+
+	cur_u_ex = (struct blk_user_cbt_extent __user*)
+		((char *)ucbt_ioc + sizeof(struct blk_user_cbt_info));
+	end = cur_u_ex + ci.ci_mapped_extents;
+	if (!access_ok(VERIFY_READ, cur_u_ex,
+		       ci.ci_mapped_extents * sizeof(struct blk_user_cbt_extent)))
+		return -EFAULT;
+
+	mutex_lock(&cbt_mutex);
+	cbt = q->cbt;
+	if (!cbt) {
+		mutex_unlock(&cbt_mutex);
+		return -EINVAL;
+	}
+	if (ci.ci_flags & CI_FLAG_NEW_UUID)
+		memcpy(cbt->uuid, &ci.ci_uuid, sizeof(ci.ci_uuid));
+	else if (memcmp(cbt->uuid, &ci.ci_uuid, sizeof(ci.ci_uuid))) {
+			mutex_unlock(&cbt_mutex);
+			return -EINVAL;
+	}
+	if (test_bit(CBT_ERROR, &cbt->flags)) {
+		mutex_unlock(&cbt_mutex);
+		return -EIO;
+	}
+
+	/* We don't care about per-cpu caches on set, only on clear */
+	if (!set)
+		cbt_flush_cache(cbt);
+
+	while (cur_u_ex < end) {
+		struct cbt_extent ex;
+
+		if (copy_from_user(&u_ex, cur_u_ex, sizeof(u_ex))) {
+			ret = -EFAULT;
+			break;
+		}
+		ex.start  = u_ex.ce_physical >> cbt->block_bits;
+		ex.len  = (u_ex.ce_length + (1 << cbt->block_bits) -1) >> cbt->block_bits;
+		if (ex.start > q->cbt->block_max ||
+		    ex.start + ex.len > q->cbt->block_max ||
+		    ex.len == 0) {
+			ret = -EINVAL;
+			break;
+		}
+		ret = __blk_cbt_set(cbt, ex.start, ex.len, 0, set, NULL, NULL);
+		if (ret)
+			break;
+		cur_u_ex++;
+	}
+	mutex_unlock(&cbt_mutex);
+	return ret;
+}
+
+int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	struct blk_user_cbt_info __user *ucbt_ioc = (struct blk_user_cbt_info __user *) arg;
+
+	switch (cmd) {
+	case BLKCBTSTART:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+		return cbt_ioc_init(bdev, ucbt_ioc);
+	case BLKCBTSTOP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_stop(bdev);
+	case BLKCBTGET:
+		return cbt_ioc_get(bdev, ucbt_ioc);
+	case BLKCBTSET:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_set(bdev, ucbt_ioc, 1);
+	case BLKCBTCLR:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return cbt_ioc_set(bdev, ucbt_ioc, 0);
+	default:
+		BUG();
+	}
+	return -ENOTTY;
+}
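
From user space the tracker is driven entirely through the ioctls dispatched
above. A hedged sketch of how a backup tool might use them (it assumes a uapi
header exporting BLKCBTSTART/BLKCBTGET, CI_FLAG_ONCE and the
blk_user_cbt_info/blk_user_cbt_extent layout seen in this file; my_uuid,
dev_size and copy_range() are placeholders, and error handling is trimmed):

	#include <fcntl.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/ioctl.h>
	#include <unistd.h>

	#define NR_EXTENTS	128

	extern void copy_range(unsigned long long off, unsigned long long len);

	void backup_changed_blocks(const char *dev, const unsigned char *my_uuid,
				   unsigned long long dev_size)
	{
		struct blk_user_cbt_info *ci;
		struct blk_user_cbt_extent *ex;	/* array sits right after *ci */
		int fd = open(dev, O_RDONLY);
		unsigned int i;

		ci = calloc(1, sizeof(*ci) + NR_EXTENTS * sizeof(*ex));
		memcpy(ci->ci_uuid, my_uuid, sizeof(ci->ci_uuid));
		ci->ci_blksize = 64 * 1024;	/* must be a power of two */
		ioctl(fd, BLKCBTSTART, ci);	/* needs CAP_SYS_ADMIN */

		/* ... writes to the device happen here ... */

		ci->ci_start = 0;
		ci->ci_length = dev_size;
		ci->ci_extent_count = NR_EXTENTS;
		ci->ci_flags = CI_FLAG_ONCE;	/* clear bits as they are reported */
		ioctl(fd, BLKCBTGET, ci);

		ex = (struct blk_user_cbt_extent *)(ci + 1);
		for (i = 0; i < ci->ci_mapped_extents; i++)
			copy_range(ex[i].ce_physical, ex[i].ce_length);

		free(ci);
		close(fd);
	}
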
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1937,6 +1937,7 @@ generic_make_request_checks(struct bio *bio)
 		return false;	/* throttled, will be resubmitted later */
 
 	trace_block_bio_queue(q, bio);
+	blk_cbt_bio_queue(q, bio);
 	return true;
 
 end_io:
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -8,6 +8,7 @@
 #include <linux/blkdev.h>
 #include <linux/bootmem.h>	/* for max_pfn/max_low_pfn */
 #include <linux/slab.h>
+#include <bc/beancounter.h>
 
 #include "blk.h"
 
@@ -117,6 +118,9 @@ static void ioc_release_fn(struct work_struct *work)
 
 	spin_unlock_irqrestore(&ioc->lock, flags);
 
+#ifdef CONFIG_BEANCOUNTERS
+	put_beancounter(ioc->ioc_ub);
+#endif
 	kmem_cache_free(iocontext_cachep, ioc);
 }
 
@@ -150,8 +154,12 @@ void put_io_context(struct io_context *ioc)
 		spin_unlock_irqrestore(&ioc->lock, flags);
 	}
 
-	if (free_ioc)
+	if (free_ioc) {
+#ifdef CONFIG_BEANCOUNTERS
+		put_beancounter(ioc->ioc_ub);
+#endif
 		kmem_cache_free(iocontext_cachep, ioc);
+	}
 }
 EXPORT_SYMBOL(put_io_context);
 
@@ -195,6 +203,7 @@ retry:
 
 	put_io_context(ioc);
 }
+EXPORT_SYMBOL(put_io_context_active);
 
 /* Called by the exiting task */
 void exit_io_context(struct task_struct *task)
@@ -209,6 +218,7 @@ void exit_io_context(struct task_struct *task)
 	atomic_dec(&ioc->nr_tasks);
 	put_io_context_active(ioc);
 }
+EXPORT_SYMBOL(exit_io_context);
 
 /**
  * ioc_clear_queue - break any ioc association with the specified queue
@@ -249,6 +259,9 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 	INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC | __GFP_HIGH);
 	INIT_HLIST_HEAD(&ioc->icq_list);
 	INIT_WORK(&ioc->release_work, ioc_release_fn);
+#ifdef CONFIG_BEANCOUNTERS
+	ioc->ioc_ub = get_beancounter(get_exec_ub());
+#endif
 
 	/*
 	 * Try to install.  ioc shouldn't be installed if someone else
@@ -261,8 +274,12 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
 	if (!task->io_context &&
 	    (task == current || !(task->flags & PF_EXITING)))
 		task->io_context = ioc;
-	else
+	else {
+#ifdef CONFIG_BEANCOUNTERS
+		put_beancounter(ioc->ioc_ub);
+#endif
 		kmem_cache_free(iocontext_cachep, ioc);
+	}
 
 	ret = task->io_context ? 0 : -EBUSY;
 
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -560,6 +560,7 @@ static void blk_release_queue(struct kobject *kobj)
 		blk_mq_release(q);
 
 	blk_trace_shutdown(q);
+	blk_cbt_release(q);
 
 	bdi_destroy(&q->backing_dev_info);
 
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -14,6 +14,9 @@
 #include <linux/rbtree.h>
 #include <linux/ioprio.h>
 #include <linux/blktrace_api.h>
+#include <linux/nmi.h>
+#include <bc/io_acct.h>
+
 #include "blk.h"
 #include "blk-cgroup.h"
 
@@ -858,8 +861,7 @@ static inline int cfqg_busy_async_queues(struct cfq_data *cfqd,
 
 static void cfq_dispatch_insert(struct request_queue *, struct request *);
 static struct cfq_queue *cfq_get_queue(struct cfq_data *cfqd, bool is_sync,
-				       struct cfq_io_cq *cic, struct bio *bio,
-				       gfp_t gfp_mask);
+				       struct cfq_io_cq *cic, struct bio *bio);
 
 static inline struct cfq_io_cq *icq_to_cic(struct io_cq *icq)
 {
@@ -1750,6 +1752,18 @@ static int cfq_set_leaf_weight(struct cgroup *cgrp, struct cftype *cft, u64 val)
 	return __cfq_set_weight(cgrp, cft, val, true);
 }
 
+#ifdef CONFIG_BC_IO_PRIORITY
+unsigned int blkcg_get_weight(struct cgroup *cgrp)
+{
+	return cgroup_to_blkcg(cgrp)->cfq_weight;
+}
+
+int blkcg_set_weight(struct cgroup *cgrp, unsigned int weight)
+{
+	return cfq_set_weight(cgrp, NULL, weight);
+}
+#endif
+
 static int cfqg_print_stat(struct cgroup *cgrp, struct cftype *cft,
 			   struct seq_file *sf)
 {
@@ -1806,6 +1820,56 @@ static int cfqg_print_rwstat_recursive(struct cgroup *cgrp, struct cftype *cft,
 	return 0;
 }
 
+#ifdef CONFIG_BC_IO_PRIORITY
+static u64 cfqg_prfill_ub_iostat(struct seq_file *sf,
+				 struct blkg_policy_data *pd, int unused)
+{
+	struct user_beancounter *ub = sf->private;
+	struct blkg_rwstat queued, serviced, wait_time;
+	u64 sectors, time;
+	const char *dev_name;
+
+	if (pd->blkg->q->kobj.parent)
+		dev_name = kobject_name(pd->blkg->q->kobj.parent);
+	else
+		dev_name = "none";
+
+	queued = cfqg_rwstat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.queued));
+	serviced = cfqg_rwstat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.serviced));
+	wait_time = cfqg_rwstat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.wait_time));
+	sectors = cfqg_stat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.sectors));
+	time = cfqg_stat_pd_recursive_sum(pd,
+			offsetof(struct cfq_group, stats.time));
+
+	seq_printf(sf, "%s %s . %llu 0 0 %llu %llu %llu %llu %llu %llu\n",
+		   dev_name, ub->ub_name,
+		   (unsigned long long)(queued.cnt[BLKG_RWSTAT_READ] +
+					queued.cnt[BLKG_RWSTAT_WRITE]),
+		   (unsigned long long)div_u64(wait_time.cnt[BLKG_RWSTAT_READ] +
+					       wait_time.cnt[BLKG_RWSTAT_WRITE],
+					       NSEC_PER_MSEC),
+		   (unsigned long long)time,
+		   (unsigned long long)(serviced.cnt[BLKG_RWSTAT_READ] +
+					serviced.cnt[BLKG_RWSTAT_WRITE]),
+		   (unsigned long long)sectors,
+		   (unsigned long long)serviced.cnt[BLKG_RWSTAT_READ],
+		   (unsigned long long)serviced.cnt[BLKG_RWSTAT_WRITE]);
+	return 0;
+}
+
+void blkcg_show_ub_iostat(struct cgroup *cgrp, struct seq_file *sf)
+{
+	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
+
+	blkcg_print_blkgs(sf, blkcg, cfqg_prfill_ub_iostat,
+			  &blkcg_policy_cfq, 0, false);
+}
+#endif
+
 #ifdef CONFIG_DEBUG_BLK_CGROUP
 static u64 cfqg_prfill_avg_queue_size(struct seq_file *sf,
 				      struct blkg_policy_data *pd, int off)
@@ -2357,6 +2421,11 @@ static void cfq_bio_merged(struct request_queue *q, struct request *req,
 				struct bio *bio)
 {
 	cfqg_stats_update_io_merged(RQ_CFQG(req), bio->bi_rw);
+#ifdef CONFIG_BC_IO_PRIORITY
+	if (get_exec_ub()->ub_bound_css[UB_BLKIO_CGROUP] !=
+	    &(RQ_CFQG(req))->pd.blkg->blkcg->css)
+		ub_writeback_io(0, bio_sectors(bio));
+#endif
 }
 
 static void
@@ -3169,6 +3238,7 @@ static int cfq_forced_dispatch(struct cfq_data *cfqd)
 	while ((cfqq = cfq_get_next_queue_forced(cfqd)) != NULL) {
 		__cfq_set_active_queue(cfqd, cfqq);
 		dispatched += __cfq_forced_dispatch_cfqq(cfqq);
+		touch_nmi_watchdog();
 	}
 
 	BUG_ON(cfqd->busy_queues);
@@ -3500,8 +3570,7 @@ static void check_ioprio_changed(struct cfq_io_cq *cic, struct bio *bio)
 	cfqq = cic->cfqq[BLK_RW_ASYNC];
 	if (cfqq) {
 		struct cfq_queue *new_cfqq;
-		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio,
-					 GFP_ATOMIC);
+		new_cfqq = cfq_get_queue(cfqd, BLK_RW_ASYNC, cic, bio);
 		if (new_cfqq) {
 			cic->cfqq[BLK_RW_ASYNC] = new_cfqq;
 			cfq_put_queue(cfqq);
@@ -3572,13 +3641,12 @@ static inline void check_blkcg_changed(struct cfq_io_cq *cic, struct bio *bio) {
 
 static struct cfq_queue *
 cfq_find_alloc_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-		     struct bio *bio, gfp_t gfp_mask)
+		     struct bio *bio)
 {
 	struct blkcg *blkcg;
-	struct cfq_queue *cfqq, *new_cfqq = NULL;
+	struct cfq_queue *cfqq;
 	struct cfq_group *cfqg;
 
-retry:
 	rcu_read_lock();
 
 	blkcg = bio_blkcg(bio);
@@ -3595,27 +3663,9 @@ retry:
 	 * originally, since it should just be a temporary situation.
 	 */
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = NULL;
-		if (new_cfqq) {
-			cfqq = new_cfqq;
-			new_cfqq = NULL;
-		} else if (gfp_mask & __GFP_WAIT) {
-			rcu_read_unlock();
-			spin_unlock_irq(cfqd->queue->queue_lock);
-			new_cfqq = kmem_cache_alloc_node(cfq_pool,
-					gfp_mask | __GFP_ZERO,
-					cfqd->queue->node);
-			spin_lock_irq(cfqd->queue->queue_lock);
-			if (new_cfqq)
-				goto retry;
-			else
-				return &cfqd->oom_cfqq;
-		} else {
-			cfqq = kmem_cache_alloc_node(cfq_pool,
-					gfp_mask | __GFP_ZERO,
-					cfqd->queue->node);
-		}
-
+		cfqq = kmem_cache_alloc_node(cfq_pool,
+					     GFP_ATOMIC | __GFP_ZERO,
+					     cfqd->queue->node);
 		if (cfqq) {
 			cfq_init_cfqq(cfqd, cfqq, current->pid, is_sync);
 			cfq_init_prio_data(cfqq, cic);
@@ -3625,9 +3675,6 @@ retry:
 			cfqq = &cfqd->oom_cfqq;
 	}
 out:
-	if (new_cfqq)
-		kmem_cache_free(cfq_pool, new_cfqq);
-
 	rcu_read_unlock();
 	return cfqq;
 }
@@ -3652,29 +3699,30 @@ cfq_async_queue_prio(struct cfq_data *cfqd, int ioprio_class, int ioprio)
 
 static struct cfq_queue *
 cfq_get_queue(struct cfq_data *cfqd, bool is_sync, struct cfq_io_cq *cic,
-	      struct bio *bio, gfp_t gfp_mask)
+	      struct bio *bio)
 {
 	const int ioprio_class = IOPRIO_PRIO_CLASS(cic->ioprio);
 	const int ioprio = IOPRIO_PRIO_DATA(cic->ioprio);
-	struct cfq_queue **async_cfqq = NULL;
-	struct cfq_queue *cfqq = NULL;
+	struct cfq_queue **async_cfqq;
+	struct cfq_queue *cfqq;
 
 	if (!is_sync) {
 		async_cfqq = cfq_async_queue_prio(cfqd, ioprio_class, ioprio);
 		cfqq = *async_cfqq;
+		if (cfqq)
+			goto out;
 	}
 
-	if (!cfqq)
-		cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio, gfp_mask);
+	cfqq = cfq_find_alloc_queue(cfqd, is_sync, cic, bio);
 
 	/*
 	 * pin the queue now that it's allocated, scheduler exit will prune it
 	 */
-	if (!is_sync && !(*async_cfqq)) {
+	if (!is_sync && cfqq != &cfqd->oom_cfqq) {
 		cfqq->ref++;
 		*async_cfqq = cfqq;
 	}
-
+out:
 	cfqq->ref++;
 	return cfqq;
 }
@@ -3940,6 +3988,13 @@ static void cfq_insert_request(struct request_queue *q, struct request *rq)
 	cfq_add_rq_rb(rq);
 	cfqg_stats_update_io_add(RQ_CFQG(rq), cfqd->serving_group,
 				 rq->cmd_flags);
+
+#ifdef CONFIG_BC_IO_PRIORITY
+	if (get_exec_ub()->ub_bound_css[UB_BLKIO_CGROUP] !=
+	    &(RQ_CFQG(rq))->pd.blkg->blkcg->css)
+		ub_writeback_io(1, blk_rq_sectors(rq));
+#endif
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, q);
 	cfq_rq_enqueued(cfqd, cfqq, rq);
 }
 
@@ -4209,8 +4264,6 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 	const bool is_sync = rq_is_sync(rq);
 	struct cfq_queue *cfqq;
 
-	might_sleep_if(gfp_mask & __GFP_WAIT);
-
 	spin_lock_irq(q->queue_lock);
 
 	check_ioprio_changed(cic, bio);
@@ -4218,7 +4271,9 @@ cfq_set_request(struct request_queue *q, struct request *rq, struct bio *bio,
 new_queue:
 	cfqq = cic_to_cfqq(cic, is_sync);
 	if (!cfqq || cfqq == &cfqd->oom_cfqq) {
-		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio, gfp_mask);
+		if (cfqq)
+			cfq_put_queue(cfqq);
+		cfqq = cfq_get_queue(cfqd, is_sync, cic, bio);
 		cic_set_cfqq(cic, cfqq, is_sync);
 	} else {
 		/*
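
A note on the cfq changes above: cfq_find_alloc_queue() no longer drops the
queue lock to retry allocation with __GFP_WAIT; it allocates with GFP_ATOMIC
and falls back to the embedded oom_cfqq on failure, while cfq_get_queue()
short-circuits when a cached async queue already exists and pins newly
created async queues. A minimal user-space sketch of that
lookup-or-allocate-and-pin shape (all names here are hypothetical, not
kernel API):

	#include <stdlib.h>

	struct queue { int refcnt; };

	static struct queue *async_cache[8];		 /* one slot per (class, prio) */
	static struct queue oom_queue = { .refcnt = 1 }; /* always-available fallback */

	static struct queue *get_queue(int slot, int is_sync)
	{
		struct queue *q;

		if (!is_sync && (q = async_cache[slot]) != NULL)
			goto out;		/* cache hit: only take a ref */

		q = calloc(1, sizeof(*q));	/* GFP_ATOMIC analogue: may fail */
		if (!q)
			q = &oom_queue;		/* like falling back to oom_cfqq */
		else if (!is_sync) {
			q->refcnt++;		/* pin: the cache owns a reference */
			async_cache[slot] = q;
		}
	out:
		q->refcnt++;			/* the caller's reference */
		return q;
	}
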
--- a/block/deadline-iosched.c
+++ b/block/deadline-iosched.c
@@ -13,6 +13,7 @@
 #include <linux/init.h>
 #include <linux/compiler.h>
 #include <linux/rbtree.h>
+#include <bc/io_acct.h>
 
 /*
  * See Documentation/block/deadline-iosched.txt
@@ -108,6 +109,8 @@ deadline_add_request(struct request_queue *q, struct request *rq)
 	 */
 	rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
 	list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+	ub_writeback_io(1, blk_rq_sectors(rq));
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_OP_ACCOUNT, q);
 }
 
 /*
@@ -186,6 +189,12 @@ deadline_merged_requests(struct request_queue *q, struct request *req,
 	deadline_remove_request(q, next);
 }
 
+static void deadline_bio_merged(struct request_queue *q, struct request *req,
+				struct bio *bio)
+{
+	ub_writeback_io(0, bio_sectors(bio));
+}
+
 /*
  * move request from sort list to dispatch queue.
  */
@@ -445,6 +454,7 @@ static struct elevator_type iosched_deadline = {
 		.elevator_merge_fn = 		deadline_merge,
 		.elevator_merged_fn =		deadline_merged_request,
 		.elevator_merge_req_fn =	deadline_merged_requests,
+		.elevator_bio_merged_fn =	deadline_bio_merged,
 		.elevator_dispatch_fn =		deadline_dispatch_requests,
 		.elevator_add_req_fn =		deadline_add_request,
 		.elevator_former_req_fn =	elv_rb_former_request,
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -20,6 +20,7 @@
 #include <linux/log2.h>
 #include <linux/pm_runtime.h>
 #include <linux/badblocks.h>
+#include <linux/device_cgroup.h>
 
 #include "blk.h"
 
@@ -35,7 +36,7 @@ struct kobject *block_depr;
 static DEFINE_SPINLOCK(ext_devt_lock);
 static DEFINE_IDR(ext_devt_idr);
 
-static struct device_type disk_type;
+struct device_type disk_type;
 
 static void disk_check_events(struct disk_events *ev,
 			      unsigned int *clearing_ptr);
@@ -260,8 +261,12 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
 
 	if (offset < BLKDEV_MAJOR_HASH_SIZE) {
 		mutex_lock(&block_class_lock);
-		for (dp = major_names[offset]; dp; dp = dp->next)
+		for (dp = major_names[offset]; dp; dp = dp->next) {
+			if (!devcgroup_device_visible(S_IFBLK, dp->major,
+						0, INT_MAX))
+				continue;
 			seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
+		}
 		mutex_unlock(&block_class_lock);
 	}
 }
@@ -854,6 +859,7 @@ static void disk_seqf_stop(struct seq_file *seqf, void *v)
 	if (iter) {
 		class_dev_iter_exit(iter);
 		kfree(iter);
+		seqf->private = NULL;
 	}
 }
 
@@ -883,11 +889,15 @@ static int show_partition(struct seq_file *seqf, void *v)
 
 	/* show the full disk and all non-0 size partitions of it */
 	disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
-	while ((part = disk_part_iter_next(&piter)))
-		seq_printf(seqf, "%4d  %7d %10llu %s\n",
-			   MAJOR(part_devt(part)), MINOR(part_devt(part)),
+	while ((part = disk_part_iter_next(&piter))) {
+		unsigned int major = MAJOR(part_devt(part));
+		unsigned int minor = MINOR(part_devt(part));
+
+		if (devcgroup_device_visible(S_IFBLK, major, minor, 1))
+			seq_printf(seqf, "%4d  %7d %10llu %s\n", major, minor,
 			   (unsigned long long)part_nr_sects_read(part) >> 1,
 			   disk_name(sgp, part->partno, buf));
+	}
 	disk_part_iter_exit(&piter);
 
 	return 0;
@@ -1140,6 +1150,7 @@ static void disk_release(struct device *dev)
 struct class block_class = {
 	.name		= "block",
 };
+EXPORT_SYMBOL(block_class);
 
 static char *block_devnode(struct device *dev, umode_t *mode,
 			   kuid_t *uid, kgid_t *gid)
@@ -1151,12 +1162,13 @@ static char *block_devnode(struct device *dev, umode_t *mode,
 	return NULL;
 }
 
-static struct device_type disk_type = {
+struct device_type disk_type = {
 	.name		= "disk",
 	.groups		= disk_attr_groups,
 	.release	= disk_release,
 	.devnode	= block_devnode,
 };
+EXPORT_SYMBOL(disk_type);
 
 #ifdef CONFIG_PROC_FS
 /*
@@ -1231,7 +1243,7 @@ static const struct file_operations proc_diskstats_operations = {
 static int __init proc_genhd_init(void)
 {
 	proc_create("diskstats", 0, NULL, &proc_diskstats_operations);
-	proc_create("partitions", 0, NULL, &proc_partitions_operations);
+	proc_create("partitions", S_ISVTX, NULL, &proc_partitions_operations);
 	return 0;
 }
 module_init(proc_genhd_init);
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -140,12 +140,27 @@ static int blkpg_ioctl(struct block_device *bdev, struct blkpg_ioctl_arg __user
 			}
 			disk_part_iter_exit(&piter);
 			part_nr_sects_write(part, (sector_t)length);
-			i_size_write(bdevp->bd_inode, p.length);
+			bd_write_size(bdevp, p.length);
 			mutex_unlock(&bdevp->bd_mutex);
 			mutex_unlock(&bdev->bd_mutex);
 			bdput(bdevp);
 			disk_put_part(part);
 			return 0;
+		case BLKPG_GET_PARTITION:
+			mutex_lock(&bdev->bd_mutex);
+			part = disk_get_part(disk, partno);
+			if (!part)
+			if (!part) {
+				return -ENXIO;
+			}
+			p.start = part->start_sect << 9;
+			p.length = part->nr_sects << 9;
+			disk_put_part(part);
+			mutex_unlock(&bdev->bd_mutex);
+			if (copy_to_user(a.data, &p, sizeof(struct blkpg_partition)))
+				return -EFAULT;
+			return 0;
 		default:
 			return -EINVAL;
 	}
@@ -580,6 +595,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
 		return blkdev_pr_preempt(bdev, argp, true);
 	case IOC_PR_CLEAR:
 		return blkdev_pr_clear(bdev, argp);
+	case BLKCBTSTART:
+	case BLKCBTSTOP:
+	case BLKCBTGET:
+	case BLKCBTSET:
+	case BLKCBTCLR:
+		return blk_cbt_ioctl(bdev, cmd, (char __user *)arg);
 	default:
 		return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
 	}
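
The new BLKPG_GET_PARTITION op is the read-side counterpart of
BLKPG_RESIZE_PARTITION: it returns the start offset and length of a
partition, in bytes, through the usual struct blkpg_partition. A hedged
user-space sketch; the numeric value of BLKPG_GET_PARTITION comes from a
uapi header change not shown in this hunk:

	#include <stdio.h>
	#include <string.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/blkpg.h>

	int main(void)
	{
		struct blkpg_partition p;
		struct blkpg_ioctl_arg a;
		int fd = open("/dev/sda", O_RDONLY);

		if (fd < 0)
			return 1;

		memset(&p, 0, sizeof(p));
		p.pno = 1;			/* query partition 1 */

		a.op = BLKPG_GET_PARTITION;	/* patch-defined op code */
		a.flags = 0;
		a.datalen = sizeof(p);
		a.data = &p;

		if (ioctl(fd, BLKPG, &a) == 0)
			printf("start=%lld length=%lld\n", p.start, p.length);

		close(fd);
		return 0;
	}
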
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -43,6 +43,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf)
 
 	return buf;
 }
+EXPORT_SYMBOL(disk_name);
 
 const char *bdevname(struct block_device *bdev, char *buf)
 {
--- a/crypto/Makefile
+++ b/crypto/Makefile
@@ -2,8 +2,13 @@
 # Cryptographic API
 #
 
+# memneq MUST be built with -Os or -O0 to prevent early-return optimizations
+# that would defeat memneq's purpose of preventing timing attacks.
+CFLAGS_REMOVE_memneq.o := -O1 -O2 -O3
+CFLAGS_memneq.o := -Os
+
 obj-$(CONFIG_CRYPTO) += crypto.o
-crypto-y := api.o cipher.o compress.o
+crypto-y := api.o cipher.o compress.o memneq.o
 
 obj-$(CONFIG_CRYPTO_WORKQUEUE) += crypto_wq.o
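
The Makefile comment above is worth illustrating. With normal optimization
a byte-comparison loop may be compiled into an early-return form, so its
run time reveals the length of the matching prefix, which is exactly what
an attacker forging a MAC needs. A plain C illustration of the two shapes
(a sketch; even the second form needs the flags above to stay
constant-time):

	#include <stddef.h>

	/* Leaky: returns at the first mismatch, so timing depends on
	 * how long the common prefix is. */
	static int cmp_leaky(const unsigned char *a, const unsigned char *b,
			     size_t n)
	{
		size_t i;

		for (i = 0; i < n; i++)
			if (a[i] != b[i])
				return 1;
		return 0;
	}

	/* Constant-time: always touches all n bytes and only accumulates
	 * differences; this is the shape memneq.c implements, plus the
	 * word-sized fast paths. */
	static int cmp_ct(const unsigned char *a, const unsigned char *b,
			  size_t n)
	{
		unsigned char neq = 0;
		size_t i;

		for (i = 0; i < n; i++)
			neq |= a[i] ^ b[i];
		return neq != 0;
	}
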
 
--- a/crypto/ablkcipher.c
+++ b/crypto/ablkcipher.c
@@ -379,6 +379,7 @@ static int crypto_init_ablkcipher_ops(struct crypto_tfm *tfm, u32 type,
 	}
 	crt->base = __crypto_ablkcipher_cast(tfm);
 	crt->ivsize = alg->ivsize;
+	crt->has_setkey = alg->max_keysize;
 
 	return 0;
 }
@@ -460,6 +461,7 @@ static int crypto_init_givcipher_ops(struct crypto_tfm *tfm, u32 type,
 	crt->givdecrypt = alg->givdecrypt ?: no_givdecrypt;
 	crt->base = __crypto_ablkcipher_cast(tfm);
 	crt->ivsize = alg->ivsize;
+	crt->has_setkey = alg->max_keysize;
 
 	return 0;
 }
--- a/crypto/af_alg.c
+++ b/crypto/af_alg.c
@@ -76,6 +76,8 @@ int af_alg_register_type(const struct af_alg_type *type)
 		goto unlock;
 
 	type->ops->owner = THIS_MODULE;
+	if (type->ops_nokey)
+		type->ops_nokey->owner = THIS_MODULE;
 	node->type = type;
 	list_add(&node->list, &alg_types);
 	err = 0;
@@ -125,6 +127,26 @@ int af_alg_release(struct socket *sock)
 }
 EXPORT_SYMBOL_GPL(af_alg_release);
 
+void af_alg_release_parent(struct sock *sk)
+{
+	struct alg_sock *ask = alg_sk(sk);
+	unsigned int nokey = ask->nokey_refcnt;
+	bool last = nokey && !ask->refcnt;
+
+	sk = ask->parent;
+	ask = alg_sk(sk);
+
+	lock_sock(sk);
+	ask->nokey_refcnt -= nokey;
+	if (!last)
+		last = !--ask->refcnt;
+	release_sock(sk);
+
+	if (last)
+		sock_put(sk);
+}
+EXPORT_SYMBOL_GPL(af_alg_release_parent);
+
 static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 {
 	struct sock *sk = sock->sk;
@@ -132,6 +154,7 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 	struct sockaddr_alg *sa = (void *)uaddr;
 	const struct af_alg_type *type;
 	void *private;
+	int err;
 
 	if (sock->state == SS_CONNECTED)
 		return -EINVAL;
@@ -157,16 +180,22 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
 		return PTR_ERR(private);
 	}
 
+	err = -EBUSY;
 	lock_sock(sk);
+	if (ask->refcnt | ask->nokey_refcnt)
+		goto unlock;
 
 	swap(ask->type, type);
 	swap(ask->private, private);
 
+	err = 0;
+
+unlock:
 	release_sock(sk);
 
 	alg_do_release(type, private);
 
-	return 0;
+	return err;
 }
 
 static int alg_setkey(struct sock *sk, char __user *ukey,
@@ -199,11 +228,15 @@ static int alg_setsockopt(struct socket *sock, int level, int optname,
 	struct sock *sk = sock->sk;
 	struct alg_sock *ask = alg_sk(sk);
 	const struct af_alg_type *type;
-	int err = -ENOPROTOOPT;
+	int err = -EBUSY;
 
 	lock_sock(sk);
+	if (ask->refcnt)
+		goto unlock;
+
 	type = ask->type;
 
+	err = -ENOPROTOOPT;
 	if (level != SOL_ALG || !type)
 		goto unlock;
 
@@ -228,6 +261,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
 	struct alg_sock *ask = alg_sk(sk);
 	const struct af_alg_type *type;
 	struct sock *sk2;
+	unsigned int nokey;
 	int err;
 
 	lock_sock(sk);
@@ -247,20 +281,29 @@ int af_alg_accept(struct sock *sk, struct socket *newsock)
 	security_sk_clone(sk, sk2);
 
 	err = type->accept(ask->private, sk2);
-	if (err) {
-		sk_free(sk2);
+
+	nokey = err == -ENOKEY;
+	if (nokey && type->accept_nokey)
+		err = type->accept_nokey(ask->private, sk2);
+
+	if (err)
 		goto unlock;
-	}
 
 	sk2->sk_family = PF_ALG;
 
-	sock_hold(sk);
+	if (nokey || !ask->refcnt++)
+		sock_hold(sk);
+	ask->nokey_refcnt += nokey;
 	alg_sk(sk2)->parent = sk;
 	alg_sk(sk2)->type = type;
+	alg_sk(sk2)->nokey_refcnt = nokey;
 
 	newsock->ops = type->ops;
 	newsock->state = SS_CONNECTED;
 
+	if (nokey)
+		newsock->ops = type->ops_nokey;
+
 	err = 0;
 
 unlock:
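
The accept path above implements a two-counter scheme: refcnt counts keyed
children and pins the parent socket once for the whole keyed family, while
nokey_refcnt counts children created before a key was set, each holding the
parent individually. A compressed restatement of the pairing, ignoring
locking and the nokey-to-keyed upgrade:

	/*
	 * accept, parent keyed:    if (parent->refcnt++ == 0) hold(parent);
	 * accept, parent unkeyed:  hold(parent); parent->nokey_refcnt++;
	 *
	 * release, nokey child:    parent->nokey_refcnt--; put(parent);
	 * release, keyed child:    if (--parent->refcnt == 0) put(parent);
	 */
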
--- a/crypto/ahash.c
+++ b/crypto/ahash.c
@@ -68,8 +68,9 @@ static int hash_walk_new_entry(struct crypto_hash_walk *walk)
 	struct scatterlist *sg;
 
 	sg = walk->sg;
-	walk->pg = sg_page(sg);
 	walk->offset = sg->offset;
+	walk->pg = sg_page(walk->sg) + (walk->offset >> PAGE_SHIFT);
+	walk->offset = offset_in_page(walk->offset);
 	walk->entrylen = sg->length;
 
 	if (walk->entrylen > walk->total)
@@ -406,6 +407,7 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm)
 	struct ahash_alg *alg = crypto_ahash_alg(hash);
 
 	hash->setkey = ahash_nosetkey;
+	hash->has_setkey = false;
 	hash->export = ahash_no_export;
 	hash->import = ahash_no_import;
 
@@ -418,8 +420,10 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm)
 	hash->finup = alg->finup ?: ahash_def_finup;
 	hash->digest = alg->digest;
 
-	if (alg->setkey)
+	if (alg->setkey) {
 		hash->setkey = alg->setkey;
+		hash->has_setkey = true;
+	}
 	if (alg->export)
 		hash->export = alg->export;
 	if (alg->import)
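
The hash_walk_new_entry() fix above handles scatterlist entries whose
offset legitimately exceeds PAGE_SIZE: the page pointer must first be
advanced by whole pages, leaving only the in-page remainder as the offset.
Worked numerically (assuming a 4096-byte page):

	/* Example with PAGE_SHIFT == 12 and sg->offset == 9000:
	 *
	 *	pg     = sg_page(sg) + (9000 >> 12);	advances by 2 pages
	 *	offset = offset_in_page(9000);		9000 - 2*4096 = 808
	 */
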
--- a/crypto/algif_hash.c
+++ b/crypto/algif_hash.c
@@ -34,6 +34,11 @@ struct hash_ctx {
 	struct ahash_request req;
 };
 
+struct algif_hash_tfm {
+	struct crypto_ahash *hash;
+	bool has_key;
+};
+
 static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
 			struct msghdr *msg, size_t ignored)
 {
@@ -51,7 +56,8 @@ static int hash_sendmsg(struct kiocb *unused, struct socket *sock,
 
 	lock_sock(sk);
 	if (!ctx->more) {
-		err = crypto_ahash_init(&ctx->req);
+		err = af_alg_wait_for_completion(crypto_ahash_init(&ctx->req),
+						&ctx->completion);
 		if (err)
 			goto unlock;
 	}
@@ -131,6 +137,7 @@ static ssize_t hash_sendpage(struct socket *sock, struct page *page,
 	} else {
 		if (!ctx->more) {
 			err = crypto_ahash_init(&ctx->req);
+			err = af_alg_wait_for_completion(err, &ctx->completion);
 			if (err)
 				goto unlock;
 		}
@@ -192,9 +199,14 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
 	struct sock *sk2;
 	struct alg_sock *ask2;
 	struct hash_ctx *ctx2;
+	bool more;
 	int err;
 
-	err = crypto_ahash_export(req, state);
+	lock_sock(sk);
+	more = ctx->more;
+	err = more ? crypto_ahash_export(req, state) : 0;
+	release_sock(sk);
+
 	if (err)
 		return err;
 
@@ -205,7 +217,10 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags)
 	sk2 = newsock->sk;
 	ask2 = alg_sk(sk2);
 	ctx2 = ask2->private;
-	ctx2->more = 1;
+	ctx2->more = more;
+
+	if (!more)
+		return err;
 
 	err = crypto_ahash_import(&ctx2->req, state);
 	if (err) {
@@ -238,19 +253,151 @@ static struct proto_ops algif_hash_ops = {
 	.accept		=	hash_accept,
 };
 
+static int hash_check_key(struct socket *sock)
+{
+	int err = 0;
+	struct sock *psk;
+	struct alg_sock *pask;
+	struct algif_hash_tfm *tfm;
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+
+	lock_sock(sk);
+	if (ask->refcnt)
+		goto unlock_child;
+
+	psk = ask->parent;
+	pask = alg_sk(ask->parent);
+	tfm = pask->private;
+
+	err = -ENOKEY;
+	lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
+	if (!tfm->has_key)
+		goto unlock;
+
+	if (!pask->refcnt++)
+		sock_hold(psk);
+
+	ask->refcnt = 1;
+	sock_put(psk);
+
+	err = 0;
+
+unlock:
+	release_sock(psk);
+unlock_child:
+	release_sock(sk);
+
+	return err;
+}
+
+static int hash_sendmsg_nokey(struct kiocb *unused, struct socket *sock,
+			      struct msghdr *msg, size_t size)
+{
+	int err;
+
+	err = hash_check_key(sock);
+	if (err)
+		return err;
+
+	return hash_sendmsg(unused, sock, msg, size);
+}
+
+static ssize_t hash_sendpage_nokey(struct socket *sock, struct page *page,
+				   int offset, size_t size, int flags)
+{
+	int err;
+
+	err = hash_check_key(sock);
+	if (err)
+		return err;
+
+	return hash_sendpage(sock, page, offset, size, flags);
+}
+
+static int hash_recvmsg_nokey(struct kiocb *unused, struct socket *sock,
+			      struct msghdr *msg, size_t ignored, int flags)
+{
+	int err;
+
+	err = hash_check_key(sock);
+	if (err)
+		return err;
+
+	return hash_recvmsg(unused, sock, msg, ignored, flags);
+}
+
+static int hash_accept_nokey(struct socket *sock, struct socket *newsock,
+			     int flags)
+{
+	int err;
+
+	err = hash_check_key(sock);
+	if (err)
+		return err;
+
+	return hash_accept(sock, newsock, flags);
+}
+
+static struct proto_ops algif_hash_ops_nokey = {
+	.family		=	PF_ALG,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.bind		=	sock_no_bind,
+	.setsockopt	=	sock_no_setsockopt,
+	.poll		=	sock_no_poll,
+
+	.release	=	af_alg_release,
+	.sendmsg	=	hash_sendmsg_nokey,
+	.sendpage	=	hash_sendpage_nokey,
+	.recvmsg	=	hash_recvmsg_nokey,
+	.accept		=	hash_accept_nokey,
+};
+
 static void *hash_bind(const char *name, u32 type, u32 mask)
 {
-	return crypto_alloc_ahash(name, type, mask);
+	struct algif_hash_tfm *tfm;
+	struct crypto_ahash *hash;
+
+	tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
+	if (!tfm)
+		return ERR_PTR(-ENOMEM);
+
+	hash = crypto_alloc_ahash(name, type, mask);
+	if (IS_ERR(hash)) {
+		kfree(tfm);
+		return ERR_CAST(hash);
+	}
+
+	tfm->hash = hash;
+
+	return tfm;
 }
 
 static void hash_release(void *private)
 {
-	crypto_free_ahash(private);
+	struct algif_hash_tfm *tfm = private;
+
+	crypto_free_ahash(tfm->hash);
+	kfree(tfm);
 }
 
 static int hash_setkey(void *private, const u8 *key, unsigned int keylen)
 {
-	return crypto_ahash_setkey(private, key, keylen);
+	struct algif_hash_tfm *tfm = private;
+	int err;
+
+	err = crypto_ahash_setkey(tfm->hash, key, keylen);
+	tfm->has_key = !err;
+
+	return err;
 }
 
 static void hash_sock_destruct(struct sock *sk)
@@ -264,12 +411,14 @@ static void hash_sock_destruct(struct sock *sk)
 	af_alg_release_parent(sk);
 }
 
-static int hash_accept_parent(void *private, struct sock *sk)
+static int hash_accept_parent_nokey(void *private, struct sock *sk)
 {
 	struct hash_ctx *ctx;
 	struct alg_sock *ask = alg_sk(sk);
-	unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(private);
-	unsigned ds = crypto_ahash_digestsize(private);
+	struct algif_hash_tfm *tfm = private;
+	struct crypto_ahash *hash = tfm->hash;
+	unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(hash);
+	unsigned ds = crypto_ahash_digestsize(hash);
 
 	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
 	if (!ctx)
@@ -289,7 +438,7 @@ static int hash_accept_parent(void *private, struct sock *sk)
 
 	ask->private = ctx;
 
-	ahash_request_set_tfm(&ctx->req, private);
+	ahash_request_set_tfm(&ctx->req, hash);
 	ahash_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
 				   af_alg_complete, &ctx->completion);
 
@@ -298,12 +447,24 @@ static int hash_accept_parent(void *private, struct sock *sk)
 	return 0;
 }
 
+static int hash_accept_parent(void *private, struct sock *sk)
+{
+	struct algif_hash_tfm *tfm = private;
+
+	if (!tfm->has_key && crypto_ahash_has_setkey(tfm->hash))
+		return -ENOKEY;
+
+	return hash_accept_parent_nokey(private, sk);
+}
+
 static const struct af_alg_type algif_type_hash = {
 	.bind		=	hash_bind,
 	.release	=	hash_release,
 	.setkey		=	hash_setkey,
 	.accept		=	hash_accept_parent,
+	.accept_nokey	=	hash_accept_parent_nokey,
 	.ops		=	&algif_hash_ops,
+	.ops_nokey	=	&algif_hash_ops_nokey,
 	.name		=	"hash",
 	.owner		=	THIS_MODULE
 };
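
Taken together, the algif_hash changes give user space the following
behaviour for keyed algorithms: accept() on an unkeyed tfm socket now
succeeds but hands back a socket wired to the nokey ops, and
hash_check_key() upgrades it transparently once the parent has a key. A
sketch (error handling mostly elided):

	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>
	#include <errno.h>
	#include <sys/socket.h>
	#include <linux/if_alg.h>

	int main(void)
	{
		struct sockaddr_alg sa = {
			.salg_family = AF_ALG,
			.salg_type   = "hash",
			.salg_name   = "hmac(sha256)",
		};
		unsigned char digest[32];
		int tfd, ofd;

		tfd = socket(AF_ALG, SOCK_SEQPACKET, 0);
		bind(tfd, (struct sockaddr *)&sa, sizeof(sa));

		/* accept() succeeds even without a key, but the child gets
		 * the nokey ops: any operation on it fails with ENOKEY... */
		ofd = accept(tfd, NULL, 0);
		if (write(ofd, "abc", 3) < 0 && errno == ENOKEY)
			fprintf(stderr, "no key set yet\n");

		/* ...until a key is set on the parent; hash_check_key() then
		 * upgrades the child on its next use. */
		setsockopt(tfd, SOL_ALG, ALG_SET_KEY, "secret-key", 10);
		write(ofd, "abc", 3);
		read(ofd, digest, sizeof(digest));

		close(ofd);
		close(tfd);
		return 0;
	}
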
--- a/crypto/algif_skcipher.c
+++ b/crypto/algif_skcipher.c
@@ -31,6 +31,11 @@ struct skcipher_sg_list {
 	struct scatterlist sg[0];
 };
 
+struct skcipher_tfm {
+	struct crypto_ablkcipher *skcipher;
+	bool has_key;
+};
+
 struct skcipher_ctx {
 	struct list_head tsgl;
 	struct af_alg_sgl rsgl;
@@ -441,13 +446,6 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 		char __user *from = iov->iov_base;
 
 		while (seglen) {
-			sgl = list_first_entry(&ctx->tsgl,
-					       struct skcipher_sg_list, list);
-			sg = sgl->sg;
-
-			while (!sg->length)
-				sg++;
-
 			used = ctx->used;
 			if (!used) {
 				err = skcipher_wait_for_data(sk, flags);
@@ -469,6 +467,13 @@ static int skcipher_recvmsg(struct kiocb *unused, struct socket *sock,
 			if (!used)
 				goto free;
 
+			sgl = list_first_entry(&ctx->tsgl,
+					       struct skcipher_sg_list, list);
+			sg = sgl->sg;
+
+			while (!sg->length)
+				sg++;
+
 			ablkcipher_request_set_crypt(&ctx->req, sg,
 						     ctx->rsgl.sg, used,
 						     ctx->iv);
@@ -544,19 +549,139 @@ static struct proto_ops algif_skcipher_ops = {
 	.poll		=	skcipher_poll,
 };
 
+static int skcipher_check_key(struct socket *sock)
+{
+	int err = 0;
+	struct sock *psk;
+	struct alg_sock *pask;
+	struct skcipher_tfm *tfm;
+	struct sock *sk = sock->sk;
+	struct alg_sock *ask = alg_sk(sk);
+
+	lock_sock(sk);
+	if (ask->refcnt)
+		goto unlock_child;
+
+	psk = ask->parent;
+	pask = alg_sk(ask->parent);
+	tfm = pask->private;
+
+	err = -ENOKEY;
+	lock_sock_nested(psk, SINGLE_DEPTH_NESTING);
+	if (!tfm->has_key)
+		goto unlock;
+
+	if (!pask->refcnt++)
+		sock_hold(psk);
+
+	ask->refcnt = 1;
+	sock_put(psk);
+
+	err = 0;
+
+unlock:
+	release_sock(psk);
+unlock_child:
+	release_sock(sk);
+
+	return err;
+}
+
+static int skcipher_sendmsg_nokey(struct kiocb *unused, struct socket *sock,
+				  struct msghdr *msg, size_t size)
+{
+	int err;
+
+	err = skcipher_check_key(sock);
+	if (err)
+		return err;
+
+	return skcipher_sendmsg(unused, sock, msg, size);
+}
+
+static ssize_t skcipher_sendpage_nokey(struct socket *sock, struct page *page,
+				       int offset, size_t size, int flags)
+{
+	int err;
+
+	err = skcipher_check_key(sock);
+	if (err)
+		return err;
+
+	return skcipher_sendpage(sock, page, offset, size, flags);
+}
+
+static int skcipher_recvmsg_nokey(struct kiocb *unused, struct socket *sock,
+				  struct msghdr *msg, size_t ignored, int flags)
+{
+	int err;
+
+	err = skcipher_check_key(sock);
+	if (err)
+		return err;
+
+	return skcipher_recvmsg(unused, sock, msg, ignored, flags);
+}
+
+static struct proto_ops algif_skcipher_ops_nokey = {
+	.family		=	PF_ALG,
+
+	.connect	=	sock_no_connect,
+	.socketpair	=	sock_no_socketpair,
+	.getname	=	sock_no_getname,
+	.ioctl		=	sock_no_ioctl,
+	.listen		=	sock_no_listen,
+	.shutdown	=	sock_no_shutdown,
+	.getsockopt	=	sock_no_getsockopt,
+	.mmap		=	sock_no_mmap,
+	.bind		=	sock_no_bind,
+	.accept		=	sock_no_accept,
+	.setsockopt	=	sock_no_setsockopt,
+
+	.release	=	af_alg_release,
+	.sendmsg	=	skcipher_sendmsg_nokey,
+	.sendpage	=	skcipher_sendpage_nokey,
+	.recvmsg	=	skcipher_recvmsg_nokey,
+	.poll		=	skcipher_poll,
+};
+
 static void *skcipher_bind(const char *name, u32 type, u32 mask)
 {
-	return crypto_alloc_ablkcipher(name, type, mask);
+	struct skcipher_tfm *tfm;
+	struct crypto_ablkcipher *skcipher;
+
+	tfm = kzalloc(sizeof(*tfm), GFP_KERNEL);
+	if (!tfm)
+		return ERR_PTR(-ENOMEM);
+
+	skcipher = crypto_alloc_ablkcipher(name, type, mask);
+	if (IS_ERR(skcipher)) {
+		kfree(tfm);
+		return ERR_CAST(skcipher);
+	}
+
+	tfm->skcipher = skcipher;
+
+	return tfm;
 }
 
 static void skcipher_release(void *private)
 {
-	crypto_free_ablkcipher(private);
+	struct skcipher_tfm *tfm = private;
+
+	crypto_free_ablkcipher(tfm->skcipher);
+	kfree(tfm);
 }
 
 static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen)
 {
-	return crypto_ablkcipher_setkey(private, key, keylen);
+	struct skcipher_tfm *tfm = private;
+	int err;
+
+	err = crypto_ablkcipher_setkey(tfm->skcipher, key, keylen);
+	tfm->has_key = !err;
+
+	return err;
 }
 
 static void skcipher_sock_destruct(struct sock *sk)
@@ -571,24 +696,25 @@ static void skcipher_sock_destruct(struct sock *sk)
 	af_alg_release_parent(sk);
 }
 
-static int skcipher_accept_parent(void *private, struct sock *sk)
+static int skcipher_accept_parent_nokey(void *private, struct sock *sk)
 {
 	struct skcipher_ctx *ctx;
 	struct alg_sock *ask = alg_sk(sk);
-	unsigned int len = sizeof(*ctx) + crypto_ablkcipher_reqsize(private);
+	struct skcipher_tfm *tfm = private;
+	struct crypto_ablkcipher *skcipher = tfm->skcipher;
+	unsigned int len = sizeof(*ctx) + crypto_ablkcipher_reqsize(skcipher);
 
 	ctx = sock_kmalloc(sk, len, GFP_KERNEL);
 	if (!ctx)
 		return -ENOMEM;
-
-	ctx->iv = sock_kmalloc(sk, crypto_ablkcipher_ivsize(private),
+	ctx->iv = sock_kmalloc(sk, crypto_ablkcipher_ivsize(skcipher),
 			       GFP_KERNEL);
 	if (!ctx->iv) {
 		sock_kfree_s(sk, ctx, len);
 		return -ENOMEM;
 	}
 
-	memset(ctx->iv, 0, crypto_ablkcipher_ivsize(private));
+	memset(ctx->iv, 0, crypto_ablkcipher_ivsize(skcipher));
 
 	INIT_LIST_HEAD(&ctx->tsgl);
 	ctx->len = len;
@@ -600,21 +726,33 @@ static int skcipher_accept_parent(void *private, struct sock *sk)
 
 	ask->private = ctx;
 
-	ablkcipher_request_set_tfm(&ctx->req, private);
+	ablkcipher_request_set_tfm(&ctx->req, skcipher);
 	ablkcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG,
-					af_alg_complete, &ctx->completion);
+				      af_alg_complete, &ctx->completion);
 
 	sk->sk_destruct = skcipher_sock_destruct;
 
 	return 0;
 }
 
+static int skcipher_accept_parent(void *private, struct sock *sk)
+{
+	struct skcipher_tfm *tfm = private;
+
+	if (!tfm->has_key && crypto_ablkcipher_has_setkey(tfm->skcipher))
+		return -ENOKEY;
+
+	return skcipher_accept_parent_nokey(private, sk);
+}
+
 static const struct af_alg_type algif_type_skcipher = {
 	.bind		=	skcipher_bind,
 	.release	=	skcipher_release,
 	.setkey		=	skcipher_setkey,
 	.accept		=	skcipher_accept_parent,
+	.accept_nokey	=	skcipher_accept_parent_nokey,
 	.ops		=	&algif_skcipher_ops,
+	.ops_nokey	=	&algif_skcipher_ops_nokey,
 	.name		=	"skcipher",
 	.owner		=	THIS_MODULE
 };
--- a/crypto/asymmetric_keys/rsa.c
+++ b/crypto/asymmetric_keys/rsa.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <crypto/algapi.h>
 #include "public_key.h"
 
 MODULE_LICENSE("GPL");
@@ -189,12 +190,12 @@ static int RSA_verify(const u8 *H, const u8 *EM, size_t k, size_t hash_size,
 		}
 	}
 
-	if (memcmp(asn1_template, EM + T_offset, asn1_size) != 0) {
+	if (crypto_memneq(asn1_template, EM + T_offset, asn1_size) != 0) {
 		kleave(" = -EBADMSG [EM[T] ASN.1 mismatch]");
 		return -EBADMSG;
 	}
 
-	if (memcmp(H, EM + T_offset + asn1_size, hash_size) != 0) {
+	if (crypto_memneq(H, EM + T_offset + asn1_size, hash_size) != 0) {
 		kleave(" = -EKEYREJECTED [EM[T] hash mismatch]");
 		return -EKEYREJECTED;
 	}
--- a/crypto/authenc.c
+++ b/crypto/authenc.c
@@ -200,7 +200,7 @@ static void authenc_verify_ahash_update_done(struct crypto_async_request *areq,
 	scatterwalk_map_and_copy(ihash, areq_ctx->sg, areq_ctx->cryptlen,
 				 authsize, 0);
 
-	err = memcmp(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
+	err = crypto_memneq(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
 	if (err)
 		goto out;
 
@@ -239,7 +239,7 @@ static void authenc_verify_ahash_done(struct crypto_async_request *areq,
 	scatterwalk_map_and_copy(ihash, areq_ctx->sg, areq_ctx->cryptlen,
 				 authsize, 0);
 
-	err = memcmp(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
+	err = crypto_memneq(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
 	if (err)
 		goto out;
 
@@ -475,7 +475,7 @@ static int crypto_authenc_verify(struct aead_request *req,
 	ihash = ohash + authsize;
 	scatterwalk_map_and_copy(ihash, areq_ctx->sg, areq_ctx->cryptlen,
 				 authsize, 0);
-	return memcmp(ihash, ohash, authsize) ? -EBADMSG : 0;
+	return crypto_memneq(ihash, ohash, authsize) ? -EBADMSG : 0;
 }
 
 static int crypto_authenc_iverify(struct aead_request *req, u8 *iv,
--- a/crypto/authencesn.c
+++ b/crypto/authencesn.c
@@ -247,7 +247,7 @@ static void authenc_esn_verify_ahash_update_done(struct crypto_async_request *ar
 	scatterwalk_map_and_copy(ihash, areq_ctx->sg, areq_ctx->cryptlen,
 				 authsize, 0);
 
-	err = memcmp(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
+	err = crypto_memneq(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
 	if (err)
 		goto out;
 
@@ -296,7 +296,7 @@ static void authenc_esn_verify_ahash_update_done2(struct crypto_async_request *a
 	scatterwalk_map_and_copy(ihash, areq_ctx->sg, areq_ctx->cryptlen,
 				 authsize, 0);
 
-	err = memcmp(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
+	err = crypto_memneq(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
 	if (err)
 		goto out;
 
@@ -336,7 +336,7 @@ static void authenc_esn_verify_ahash_done(struct crypto_async_request *areq,
 	scatterwalk_map_and_copy(ihash, areq_ctx->sg, areq_ctx->cryptlen,
 				 authsize, 0);
 
-	err = memcmp(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
+	err = crypto_memneq(ihash, ahreq->result, authsize) ? -EBADMSG : 0;
 	if (err)
 		goto out;
 
@@ -568,7 +568,7 @@ static int crypto_authenc_esn_verify(struct aead_request *req)
 	ihash = ohash + authsize;
 	scatterwalk_map_and_copy(ihash, areq_ctx->sg, areq_ctx->cryptlen,
 				 authsize, 0);
-	return memcmp(ihash, ohash, authsize) ? -EBADMSG : 0;
+	return crypto_memneq(ihash, ohash, authsize) ? -EBADMSG : 0;
 }
 
 static int crypto_authenc_esn_iverify(struct aead_request *req, u8 *iv,
--- a/crypto/blkcipher.c
+++ b/crypto/blkcipher.c
@@ -238,6 +238,8 @@ static int blkcipher_walk_next(struct blkcipher_desc *desc,
 		return blkcipher_walk_done(desc, walk, -EINVAL);
 	}
 
+	bsize = min(walk->blocksize, n);
+
 	walk->flags &= ~(BLKCIPHER_WALK_SLOW | BLKCIPHER_WALK_COPY |
 			 BLKCIPHER_WALK_DIFF);
 	if (!scatterwalk_aligned(&walk->in, alignmask) ||
@@ -250,7 +252,6 @@ static int blkcipher_walk_next(struct blkcipher_desc *desc,
 		}
 	}
 
-	bsize = min(walk->blocksize, n);
 	n = scatterwalk_clamp(&walk->in, n);
 	n = scatterwalk_clamp(&walk->out, n);
 
@@ -458,6 +459,7 @@ static int crypto_init_blkcipher_ops_async(struct crypto_tfm *tfm)
 	}
 	crt->base = __crypto_ablkcipher_cast(tfm);
 	crt->ivsize = alg->ivsize;
+	crt->has_setkey = alg->max_keysize;
 
 	return 0;
 }
--- a/crypto/ccm.c
+++ b/crypto/ccm.c
@@ -364,7 +364,7 @@ static void crypto_ccm_decrypt_done(struct crypto_async_request *areq,
 
 	if (!err) {
 		err = crypto_ccm_auth(req, req->dst, cryptlen);
-		if (!err && memcmp(pctx->auth_tag, pctx->odata, authsize))
+		if (!err && crypto_memneq(pctx->auth_tag, pctx->odata, authsize))
 			err = -EBADMSG;
 	}
 	aead_request_complete(req, err);
@@ -423,7 +423,7 @@ static int crypto_ccm_decrypt(struct aead_request *req)
 		return err;
 
 	/* verify */
-	if (memcmp(authtag, odata, authsize))
+	if (crypto_memneq(authtag, odata, authsize))
 		return -EBADMSG;
 
 	return err;
--- a/crypto/cryptd.c
+++ b/crypto/cryptd.c
@@ -565,9 +565,14 @@ static int cryptd_hash_export(struct ahash_request *req, void *out)
 
 static int cryptd_hash_import(struct ahash_request *req, const void *in)
 {
-	struct cryptd_hash_request_ctx *rctx = ahash_request_ctx(req);
+	struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
+	struct cryptd_hash_ctx *ctx = crypto_ahash_ctx(tfm);
+	struct shash_desc *desc = cryptd_shash_desc(req);
+
+	desc->tfm = ctx->child;
+	desc->flags = req->base.flags;
 
-	return crypto_shash_import(&rctx->desc, in);
+	return crypto_shash_import(desc, in);
 }
 
 static int cryptd_create_hash(struct crypto_template *tmpl, struct rtattr **tb,
--- a/crypto/crypto_user.c
+++ b/crypto/crypto_user.c
@@ -499,6 +499,7 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		if (link->dump == NULL)
 			return -EINVAL;
 
+		down_read(&crypto_alg_sem);
 		list_for_each_entry(alg, &crypto_alg_list, cra_list)
 			dump_alloc += CRYPTO_REPORT_MAXSIZE;
 
@@ -508,8 +509,11 @@ static int crypto_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 				.done = link->done,
 				.min_dump_alloc = dump_alloc,
 			};
-			return netlink_dump_start(crypto_nlsk, skb, nlh, &c);
+			err = netlink_dump_start(crypto_nlsk, skb, nlh, &c);
 		}
+		up_read(&crypto_alg_sem);
+
+		return err;
 	}
 
 	err = nlmsg_parse(nlh, crypto_msg_min[type], attrs, CRYPTOCFGA_MAX,
--- a/crypto/gcm.c
+++ b/crypto/gcm.c
@@ -582,7 +582,7 @@ static int crypto_gcm_verify(struct aead_request *req,
 
 	crypto_xor(auth_tag, iauth_tag, 16);
 	scatterwalk_map_and_copy(iauth_tag, req->src, cryptlen, authsize, 0);
-	return memcmp(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0;
+	return crypto_memneq(iauth_tag, auth_tag, authsize) ? -EBADMSG : 0;
 }
 
 static void gcm_decrypt_done(struct crypto_async_request *areq, int err)
@@ -716,7 +716,9 @@ static struct crypto_instance *crypto_gcm_alloc_common(struct rtattr **tb,
 
 	ghash_alg = crypto_find_alg(ghash_name, &crypto_ahash_type,
 				    CRYPTO_ALG_TYPE_HASH,
-				    CRYPTO_ALG_TYPE_AHASH_MASK);
+				    CRYPTO_ALG_TYPE_AHASH_MASK |
+				    crypto_requires_sync(algt->type,
+							 algt->mask));
 	if (IS_ERR(ghash_alg))
 		return ERR_CAST(ghash_alg);
 
--- /dev/null
+++ b/crypto/memneq.c
@@ -0,0 +1,138 @@
+/*
+ * Constant-time equality testing of memory regions.
+ *
+ * Authors:
+ *
+ *   James Yonan <james@openvpn.net>
+ *   Daniel Borkmann <dborkman@redhat.com>
+ *
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ * The full GNU General Public License is included in this distribution
+ * in the file called LICENSE.GPL.
+ *
+ * BSD LICENSE
+ *
+ * Copyright(c) 2013 OpenVPN Technologies, Inc. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of OpenVPN Technologies nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <crypto/algapi.h>
+
+#ifndef __HAVE_ARCH_CRYPTO_MEMNEQ
+
+/* Generic path for arbitrary size */
+static inline unsigned long
+__crypto_memneq_generic(const void *a, const void *b, size_t size)
+{
+	unsigned long neq = 0;
+
+#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS)
+	while (size >= sizeof(unsigned long)) {
+		neq |= *(unsigned long *)a ^ *(unsigned long *)b;
+		a += sizeof(unsigned long);
+		b += sizeof(unsigned long);
+		size -= sizeof(unsigned long);
+	}
+#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
+	while (size > 0) {
+		neq |= *(unsigned char *)a ^ *(unsigned char *)b;
+		a += 1;
+		b += 1;
+		size -= 1;
+	}
+	return neq;
+}
+
+/* Loop-free fast-path for frequently used 16-byte size */
+static inline unsigned long __crypto_memneq_16(const void *a, const void *b)
+{
+#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
+	if (sizeof(unsigned long) == 8)
+		return ((*(unsigned long *)(a)   ^ *(unsigned long *)(b))
+		      | (*(unsigned long *)(a+8) ^ *(unsigned long *)(b+8)));
+	else if (sizeof(unsigned int) == 4)
+		return ((*(unsigned int *)(a)    ^ *(unsigned int *)(b))
+		      | (*(unsigned int *)(a+4)  ^ *(unsigned int *)(b+4))
+		      | (*(unsigned int *)(a+8)  ^ *(unsigned int *)(b+8))
+		      | (*(unsigned int *)(a+12) ^ *(unsigned int *)(b+12)));
+	else
+#endif /* CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS */
+		return ((*(unsigned char *)(a)    ^ *(unsigned char *)(b))
+		      | (*(unsigned char *)(a+1)  ^ *(unsigned char *)(b+1))
+		      | (*(unsigned char *)(a+2)  ^ *(unsigned char *)(b+2))
+		      | (*(unsigned char *)(a+3)  ^ *(unsigned char *)(b+3))
+		      | (*(unsigned char *)(a+4)  ^ *(unsigned char *)(b+4))
+		      | (*(unsigned char *)(a+5)  ^ *(unsigned char *)(b+5))
+		      | (*(unsigned char *)(a+6)  ^ *(unsigned char *)(b+6))
+		      | (*(unsigned char *)(a+7)  ^ *(unsigned char *)(b+7))
+		      | (*(unsigned char *)(a+8)  ^ *(unsigned char *)(b+8))
+		      | (*(unsigned char *)(a+9)  ^ *(unsigned char *)(b+9))
+		      | (*(unsigned char *)(a+10) ^ *(unsigned char *)(b+10))
+		      | (*(unsigned char *)(a+11) ^ *(unsigned char *)(b+11))
+		      | (*(unsigned char *)(a+12) ^ *(unsigned char *)(b+12))
+		      | (*(unsigned char *)(a+13) ^ *(unsigned char *)(b+13))
+		      | (*(unsigned char *)(a+14) ^ *(unsigned char *)(b+14))
+		      | (*(unsigned char *)(a+15) ^ *(unsigned char *)(b+15)));
+}
+
+/* Compare two areas of memory without leaking timing information,
+ * and with special optimizations for common sizes.  Users should
+ * not call this function directly, but should instead use
+ * crypto_memneq defined in crypto/algapi.h.
+ */
+noinline unsigned long __crypto_memneq(const void *a, const void *b,
+				       size_t size)
+{
+	switch (size) {
+	case 16:
+		return __crypto_memneq_16(a, b);
+	default:
+		return __crypto_memneq_generic(a, b, size);
+	}
+}
+EXPORT_SYMBOL(__crypto_memneq);
+
+#endif /* __HAVE_ARCH_CRYPTO_MEMNEQ */
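
All the memcmp() conversions in this series (rsa.c, authenc.c,
authencesn.c, ccm.c, gcm.c) follow the same recipe. crypto_memneq()
returns zero when the regions are equal and non-zero otherwise, with no
data-dependent timing but also no <0/>0 ordering result, so it is a
drop-in replacement only for equality tests:

	/* before: leaks the length of the matching prefix */
	if (memcmp(ihash, ohash, authsize))
		return -EBADMSG;

	/* after: constant-time; note there is no ordering result */
	if (crypto_memneq(ihash, ohash, authsize))
		return -EBADMSG;
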
--- a/crypto/scatterwalk.c
+++ b/crypto/scatterwalk.c
@@ -68,7 +68,8 @@ static void scatterwalk_pagedone(struct scatter_walk *walk, int out,
 
 void scatterwalk_done(struct scatter_walk *walk, int out, int more)
 {
-	if (!(scatterwalk_pagelen(walk) & (PAGE_SIZE - 1)) || !more)
+	if (!more || walk->offset >= walk->sg->offset + walk->sg->length ||
+	    !(walk->offset & (PAGE_SIZE - 1)))
 		scatterwalk_pagedone(walk, out, more);
 }
 EXPORT_SYMBOL_GPL(scatterwalk_done);
--- a/crypto/shash.c
+++ b/crypto/shash.c
@@ -353,9 +353,10 @@ int crypto_init_shash_ops_async(struct crypto_tfm *tfm)
 	crt->final = shash_async_final;
 	crt->finup = shash_async_finup;
 	crt->digest = shash_async_digest;
+	crt->setkey = shash_async_setkey;
+
+	crt->has_setkey = alg->setkey != shash_no_setkey;
 
-	if (alg->setkey)
-		crt->setkey = shash_async_setkey;
 	if (alg->export)
 		crt->export = shash_async_export;
 	if (alg->import)
--- a/drivers/Makefile
+++ b/drivers/Makefile
@@ -82,6 +82,7 @@ obj-$(CONFIG_ZORRO)		+= zorro/
 obj-$(CONFIG_MAC)		+= macintosh/
 obj-$(CONFIG_ATA_OVER_ETH)	+= block/aoe/
 obj-$(CONFIG_PARIDE) 		+= block/paride/
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= block/ploop/
 obj-$(CONFIG_TC)		+= tc/
 obj-$(CONFIG_UWB)		+= uwb/
 obj-$(CONFIG_USB_PHY)		+= usb/
--- a/drivers/ata/libata-eh.c
+++ b/drivers/ata/libata-eh.c
@@ -43,6 +43,7 @@
 #include <scsi/scsi_cmnd.h>
 #include <scsi/scsi_dbg.h>
 #include "../scsi/scsi_transport_api.h"
+#include "../scsi/scsi_dbg.h"
 
 #include <linux/libata.h>
 
@@ -686,6 +687,7 @@ void ata_scsi_cmd_error_handler(struct Scsi_Host *host, struct ata_port *ap,
 				 * Successfully complete it.
 				 */
 				scmd->retries = scmd->allowed;
+				scsi_debug_log_cmnd(ATA_SCSI_CMD_ERROR_HANDLER_CALLS_EH_FINISH, scmd);
 				scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
 			}
 		}
@@ -1002,6 +1004,7 @@ void ata_std_sched_eh(struct ata_port *ap)
 		return;
 
 	ata_eh_set_pending(ap, 1);
+	scsi_debug_log_shost(ATA_STD_SCHED_EH_CALLS_SCHEDULE_EH, ap->scsi_host);
 	scsi_schedule_eh(ap->scsi_host);
 
 	DPRINTK("port EH scheduled\n");
@@ -1024,6 +1027,7 @@ void ata_std_end_eh(struct ata_port *ap)
 {
 	struct Scsi_Host *host = ap->scsi_host;
 
+	scsi_debug_log_shost(ATA_STD_END_EH_ZERO_EH_SCHEDULED, host);
 	host->host_eh_scheduled = 0;
 }
 EXPORT_SYMBOL(ata_std_end_eh);
@@ -1299,6 +1303,7 @@ static void __ata_eh_qc_complete(struct ata_queued_cmd *qc)
 	WARN_ON(ata_tag_valid(qc->tag));
 	spin_unlock_irqrestore(ap->lock, flags);
 
+	scsi_debug_log_cmnd(ATA_EH_QC_COMPLETE_CALLS_EH_FINISH, scmd);
 	scsi_eh_finish_cmd(scmd, &ap->eh_done_q);
 }
 
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -27,6 +27,7 @@
 #include <linux/async.h>
 #include <linux/pm_runtime.h>
 #include <linux/netdevice.h>
+#include <linux/ve.h>
 
 #include "base.h"
 #include "power/power.h"
@@ -1024,7 +1025,7 @@ static void device_remove_sys_dev_entry(struct device *dev)
 
 	if (kobj) {
 		format_dev_t(devt_str, dev->devt);
-		sysfs_remove_link(kobj, devt_str);
+		sysfs_delete_link(kobj, &dev->kobj, devt_str);
 	}
 }
 
@@ -1536,7 +1537,7 @@ int __init devices_init(void)
 	sysfs_dev_block_kobj = kobject_create_and_add("block", dev_kobj);
 	if (!sysfs_dev_block_kobj)
 		goto block_kobj_err;
-	sysfs_dev_char_kobj = kobject_create_and_add("char", dev_kobj);
+	sysfs_dev_char_kobj = kobject_create_and_add_ve("char", dev_kobj);
 	if (!sysfs_dev_char_kobj)
 		goto char_kobj_err;
 
--- a/drivers/base/devtmpfs.c
+++ b/drivers/base/devtmpfs.c
@@ -21,9 +21,9 @@
 #include <linux/fs.h>
 #include <linux/shmem_fs.h>
 #include <linux/ramfs.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/kthread.h>
+#include <linux/ve.h>
 #include "base.h"
 
 static struct task_struct *thread;
@@ -54,9 +54,64 @@ static int __init mount_param(char *str)
 }
 __setup("devtmpfs.mount=", mount_param);
 
+#ifdef CONFIG_VE
+static int ve_test_dev_sb(struct super_block *s, void *p)
+{
+	return get_exec_env()->dev_sb == s;
+}
+
+static int ve_set_dev_sb(struct super_block *s, void *p)
+{
+	struct ve_struct *ve = get_exec_env();
+	int error;
+
+	error = set_anon_super(s, p);
+	if (!error) {
+		BUG_ON(ve->dev_sb);
+		ve->dev_sb = s;
+		atomic_inc(&s->s_active);
+	}
+	return error;
+}
+
+static struct dentry *ve_dev_mount(struct file_system_type *fs_type, int flags,
+		      const char *dev_name, void *data)
+{
+	int (*fill_super)(struct super_block *, void *, int);
+	struct super_block *s;
+	int error;
+
+#ifdef CONFIG_TMPFS
+	fill_super = shmem_fill_super;
+#else
+	fill_super = ramfs_fill_super;
+#endif
+	s = sget(fs_type, ve_test_dev_sb, ve_set_dev_sb, flags, NULL);
+	if (IS_ERR(s))
+		return ERR_CAST(s);
+
+	if (!s->s_root) {
+		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+		if (error) {
+			deactivate_locked_super(s);
+			return ERR_PTR(error);
+		}
+		s->s_flags |= MS_ACTIVE;
+	}
+	return dget(s->s_root);
+}
+#endif /* CONFIG_VE */
+
 static struct dentry *dev_mount(struct file_system_type *fs_type, int flags,
 		      const char *dev_name, void *data)
 {
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
+
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()))
+		return ve_dev_mount(fs_type, flags, dev_name, data);
+#endif
 #ifdef CONFIG_TMPFS
 	return mount_single(fs_type, flags, data, shmem_fill_super);
 #else
@@ -68,6 +123,7 @@ static struct file_system_type dev_fs_type = {
 	.name = "devtmpfs",
 	.mount = dev_mount,
 	.kill_sb = kill_litter_super,
+	.fs_flags = FS_VIRTUALIZED | FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
 };
 
 #ifdef CONFIG_BLOCK
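
ve_dev_mount() above keys the devtmpfs super block to the calling
container: sget() walks the existing supers with the test callback
(ve_test_dev_sb) and, when none matches, initializes a new one through the
set callback (ve_set_dev_sb), so each VE gets a private devtmpfs instance.
The general test/set pattern, sketched with a hypothetical owner pointer
stashed in s_fs_info:

	/* one super_block per owner object (sketch, not the VE code) */
	static int my_test_sb(struct super_block *s, void *owner)
	{
		return s->s_fs_info == owner;	/* reuse the owner's sb */
	}

	static int my_set_sb(struct super_block *s, void *owner)
	{
		int err = set_anon_super(s, NULL); /* anonymous dev_t */

		if (!err)
			s->s_fs_info = owner;	/* bind new sb to owner */
		return err;
	}
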
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -281,6 +281,22 @@ config BLK_DEV_CRYPTOLOOP
 	  instead, which can be configured to be on-disk compatible with the
 	  cryptoloop device.
 
+config BLK_DEV_PLOOP
+	tristate "Virtuozzo loopback device support"
+	depends on BLK_DEV_CBT
+	---help---
+	  Saying Y here will allow you to use a regular file as a block
+	  device; you can then create a file system on that block device and
+	  mount it just as you would mount other block devices such as hard
+	  drive partitions, CD-ROM drives or floppy drives. The ploop devices
+	  are block special device files with major number 182, typically
+	  called /dev/ploop0, /dev/ploop1, etc.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called ploop.
+
+	  Most users will answer N here.
+
 source "drivers/block/drbd/Kconfig"
 
 config BLK_DEV_NBD
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -616,7 +616,7 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
 		fsync_bdev(bdev);
 		mutex_lock(&nbd->tx_lock);
 		blk_rq_init(NULL, &sreq);
-		sreq.cmd_type = REQ_TYPE_SPECIAL;
+		sreq.cmd_type = REQ_TYPE_DRV_PRIV;
 		nbd_cmd(&sreq) = NBD_CMD_DISC;
 
 		/* Check again after getting mutex back.  */
--- /dev/null
+++ b/drivers/block/ploop/Makefile
@@ -0,0 +1,23 @@
+#
+# Makefile for Virtuozzo loop device
+#
+# Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+#
+
+CFLAGS_io_direct.o = -I$(src)
+CFLAGS_ploop_events.o = -I$(src)
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= ploop.o
+ploop-objs := dev.o map.o io.o sysfs.o tracker.o freeblks.o ploop_events.o discard.o push_backup.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pfmt_ploop1.o
+pfmt_ploop1-objs := fmt_ploop1.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pfmt_raw.o
+pfmt_raw-objs := fmt_raw.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pio_direct.o
+pio_direct-objs := io_direct.o io_direct_map.o
+
+obj-$(CONFIG_BLK_DEV_PLOOP)	+= pio_kaio.o
+pio_kaio-objs := io_kaio.o io_kaio_map.o
--- /dev/null
+++ b/drivers/block/ploop/dev.c
@@ -0,0 +1,5528 @@
+/*
+ *  drivers/block/ploop/dev.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <linux/statfs.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/random.h>
+#include <linux/ve.h>
+#include <asm/uaccess.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "ploop_events.h"
+#include "freeblks.h"
+#include "discard.h"
+#include "push_backup.h"
+
+/* Structures and terms:
+ *
+ * ploop_device is the root of everything.
+ *	We normally use the local variable "plo" to refer to it.
+ *
+ * ploop_device -> list of ploop_delta's.
+ *	The head of the list is the "top delta", the tail is the "root delta".
+ *	The "top delta" is the delta where all modifications are written;
+ *	the "root delta" is the base image. "Level" is the distance from the root.
+ *
+ * ploop_delta  -> { ops, priv } refers to a particular delta format.
+ *		-> ploop_io refers to the image on disk.
+ *
+ * ploop_io	-> list of ploop_file, each file maps an area in the image.
+ *	*** What follows is the "ideal" design; right now we support only
+ *	*** one ploop_file and do not support creation of new ploop_file's.
+ *		-> { ops, priv } generic image ops, mostly creation
+ *		   of new chunks.
+ *
+ * ploop_file	-> { file, ops, priv } how we do real IO on this file.
+ */
+
+static int ploop_max __read_mostly = PLOOP_DEVICE_RANGE;
+static int ploop_major __read_mostly = PLOOP_DEVICE_MAJOR;
+int max_map_pages __read_mostly;
+
+static long root_threshold __read_mostly = 2L * 1024 * 1024; /* 2GB in KB */
+static long user_threshold __read_mostly = 4L * 1024 * 1024; /* 4GB in KB */
+
+static int large_disk_support __read_mostly = 1; /* true */
+
+static struct rb_root ploop_devices_tree = RB_ROOT;
+static DEFINE_MUTEX(ploop_devices_mutex);
+
+static LIST_HEAD(ploop_formats);
+static DEFINE_MUTEX(ploop_formats_mutex);
+
+int ploop_register_format(struct ploop_delta_ops * ops)
+{
+	mutex_lock(&ploop_formats_mutex);
+	list_add(&ops->list, &ploop_formats);
+	mutex_unlock(&ploop_formats_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ploop_register_format);
+
+void ploop_unregister_format(struct ploop_delta_ops * ops)
+{
+	mutex_lock(&ploop_formats_mutex);
+	list_del(&ops->list);
+	mutex_unlock(&ploop_formats_mutex);
+}
+EXPORT_SYMBOL(ploop_unregister_format);
+
+struct ploop_delta_ops * ploop_format_get(unsigned int id)
+{
+	struct ploop_delta_ops * ops;
+
+	mutex_lock(&ploop_formats_mutex);
+	list_for_each_entry(ops, &ploop_formats, list) {
+		if (ops->id == id && try_module_get(ops->owner)) {
+			mutex_unlock(&ploop_formats_mutex);
+			return ops;
+		}
+	}
+	mutex_unlock(&ploop_formats_mutex);
+	return NULL;
+}
+
+void ploop_format_put(struct ploop_delta_ops * ops)
+{
+	module_put(ops->owner);
+}
+
+void ploop_msg_once(struct ploop_device *plo, const char *fmt, ...)
+{
+	va_list args;
+
+	if (test_and_set_bit(PLOOP_S_ONCE, &plo->state))
+		return;
+
+	va_start(args, fmt);
+	printk("ploop(%d): ", plo->index);
+	vprintk(fmt, args);
+	printk("\n");
+	va_end(args);
+}
+EXPORT_SYMBOL(ploop_msg_once);
+
+static void mitigation_timeout(unsigned long data)
+{
+	struct ploop_device * plo = (void*)data;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	spin_lock_irq(&plo->lock);
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    (!list_empty(&plo->entry_queue) ||
+	     ((plo->bio_head || !bio_list_empty(&plo->bio_discard_list)) &&
+	      !list_empty(&plo->free_list))) &&
+	      waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void freeze_timeout(unsigned long data)
+{
+	struct ploop_device * plo = (void*)data;
+
+	spin_lock_irq(&plo->lock);
+	if (waitqueue_active(&plo->freeze_waitq))
+		wake_up_interruptible(&plo->freeze_waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void ploop_congest(struct ploop_device *plo)
+{
+	if (!test_bit(PLOOP_S_CONGESTED, &plo->state) &&
+	    PLOOP_CONGESTED(plo) > plo->tune.congestion_high_watermark)
+		set_bit(PLOOP_S_CONGESTED, &plo->state);
+}
+
+static void ploop_uncongest(struct ploop_device *plo)
+{
+	if (PLOOP_CONGESTED(plo) <= plo->tune.congestion_low_watermark &&
+	    test_and_clear_bit(PLOOP_S_CONGESTED, &plo->state)) {
+		struct backing_dev_info *bdi = &plo->queue->backing_dev_info;
+
+		if (waitqueue_active(&bdi->cong_waitq))
+			wake_up_all(&bdi->cong_waitq);
+	}
+}
+
+static struct ploop_request *
+ploop_alloc_request(struct ploop_device * plo)
+{
+	struct ploop_request * preq;
+
+	/* We allow only a finite number of requests in flight.
+	 * If the caller does not stop congesting us, we force it to wait.
+	 *
+	 * _XXX_ I am afraid this logic is flawed. The justification is
+	 * that conventional devices, using request queues, do a similar
+	 * thing by blocking in add_request(), but I am still not sure
+	 * that logic applies here.
+	 */
+	if (list_empty(&plo->free_list)) {
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&plo->req_waitq, &_wait, TASK_UNINTERRUPTIBLE);
+			if (!list_empty(&plo->free_list))
+				break;
+			plo->st.bio_full++;
+			spin_unlock_irq(&plo->lock);
+			io_schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&plo->req_waitq, &_wait);
+	}
+
+	preq = list_entry(plo->free_list.next, struct ploop_request, list);
+	list_del_init(&preq->list);
+	plo->free_qlen--;
+	ploop_congest(plo);
+	return preq;
+}
+
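+/*
+ * Stash the submitter's io_context in the bio by reusing the bi_bdev
+ * pointer field (ploop no longer needs bi_bdev once it owns the bio).
+ * BIO_BDEV_REUSED marks the reuse so later code does not interpret
+ * the field as a block_device.
+ */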
+static void ploop_grab_iocontext(struct bio *bio)
+{
+	struct io_context **ioc_pp = (struct io_context **)(&bio->bi_bdev);
+	if (current->io_context) {
+		ioc_task_link(current->io_context);
+		*ioc_pp = current->io_context;
+		set_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+	}
+}
+
+/* always called with plo->lock held */
+static inline void preq_unlink(struct ploop_request * preq,
+			       struct list_head *drop_list)
+{
+	list_del(&preq->list);
+	ploop_entry_qlen_dec(preq);
+	list_add(&preq->list, drop_list);
+}
+
+static void ploop_set_blockable(struct ploop_device *plo,
+				struct ploop_request *preq)
+{
+	if (!test_and_set_bit(PLOOP_REQ_BLOCKABLE, &preq->state))
+		plo->blockable_reqs++;
+}
+
+static void ploop_test_and_clear_blockable(struct ploop_device *plo,
+					   struct ploop_request *preq)
+{
+	if (test_and_clear_bit(PLOOP_REQ_BLOCKABLE, &preq->state))
+		plo->blockable_reqs--;
+}
+
+/* always called with plo->lock released */
+void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
+		      int keep_locked)
+{
+	struct ploop_request * preq;
+	int drop_qlen = 0;
+
+	list_for_each_entry(preq, drop_list, list) {
+		if (preq->ioc) {
+			atomic_dec(&preq->ioc->nr_tasks);
+			put_io_context_active(preq->ioc);
+			preq->ioc = NULL;
+		}
+
+		BUG_ON (test_bit(PLOOP_REQ_ZERO, &preq->state));
+		ploop_test_and_clear_blockable(plo, preq);
+		drop_qlen++;
+	}
+
+	spin_lock_irq(&plo->lock);
+
+	list_splice_init(drop_list, plo->free_list.prev);
+	plo->free_qlen += drop_qlen;
+	if (waitqueue_active(&plo->req_waitq))
+		wake_up(&plo->req_waitq);
+	else if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+		waitqueue_active(&plo->waitq) &&
+		(plo->bio_head || !bio_list_empty(&plo->bio_discard_list)))
+		wake_up_interruptible(&plo->waitq);
+
+	ploop_uncongest(plo);
+
+	if (!keep_locked)
+		spin_unlock_irq(&plo->lock);
+}
+
+static void merge_rw_flags_to_req(unsigned long rw,
+				  struct ploop_request * preq)
+{
+		if (rw & REQ_FLUSH)
+			preq->req_rw |= REQ_FLUSH;
+		if (rw & REQ_FUA)
+			preq->req_rw |= REQ_FUA;
+}
+
+static void preq_set_sync_bit(struct ploop_request * preq)
+{
+	if (!test_bit(PLOOP_REQ_SYNC, &preq->state)) {
+		if (!(preq->req_rw & WRITE) || (preq->req_rw & (REQ_FLUSH|REQ_FUA))) {
+			preq->plo->read_sync_reqs++;
+			__set_bit(PLOOP_REQ_RSYNC, &preq->state);
+		}
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+	}
+}
+
+static void overlap_forward(struct ploop_device * plo,
+			    struct ploop_request * preq,
+			    struct ploop_request * preq1,
+			    struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	if (preq->req_sector + preq->req_size == preq1->req_sector) {
+		preq->bl.tail->bi_next = preq1->bl.head;
+		preq->bl.tail = preq1->bl.tail;
+		preq1->bl.head = preq1->bl.tail = NULL;
+		preq->req_size += preq1->req_size;
+		if (test_bit(PLOOP_REQ_SYNC, &preq1->state))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(preq1->req_rw, preq);
+		rb_erase(&preq1->lockout_link, &plo->entry_tree[preq1->req_rw & WRITE]);
+		preq_unlink(preq1, drop_list);
+		plo->st.coal_mforw++;
+	}
+
+	while ((n = rb_next(&preq->lockout_link)) != NULL) {
+		preq1 = rb_entry(n, struct ploop_request, lockout_link);
+		if (preq->req_sector + preq->req_size <= preq1->req_sector)
+			break;
+		rb_erase(n, &plo->entry_tree[preq->req_rw & WRITE]);
+		__clear_bit(PLOOP_REQ_SORTED, &preq1->state);
+		plo->st.coal_oforw++;
+	}
+}
+
+static void overlap_backward(struct ploop_device * plo,
+			     struct ploop_request * preq,
+			     struct ploop_request * preq1,
+			     struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	if (preq1->req_sector + preq1->req_size == preq->req_sector) {
+		preq1->bl.tail->bi_next = preq->bl.head;
+		preq->bl.head = preq1->bl.head;
+		preq1->bl.head = preq1->bl.tail = NULL;
+		preq->req_size += preq1->req_size;
+		preq->req_sector = preq1->req_sector;
+		if (test_bit(PLOOP_REQ_SYNC, &preq1->state))
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(preq1->req_rw, preq);
+		rb_erase(&preq1->lockout_link, &plo->entry_tree[preq->req_rw & WRITE]);
+		preq_unlink(preq1, drop_list);
+		plo->st.coal_mback++;
+	}
+
+	while ((n = rb_prev(&preq->lockout_link)) != NULL) {
+		preq1 = rb_entry(n, struct ploop_request, lockout_link);
+		if (preq1->req_sector + preq1->req_size <= preq->req_sector)
+			break;
+		rb_erase(n, &plo->entry_tree[preq->req_rw & WRITE]);
+		__clear_bit(PLOOP_REQ_SORTED, &preq1->state);
+		plo->st.coal_oback++;
+	}
+}
+
+static int try_merge(struct ploop_device *plo, struct ploop_request * preq,
+		     struct bio * bio, struct list_head *drop_list)
+{
+	struct rb_node * n;
+
+	/* Merge to tail */
+	if (bio->bi_sector == preq->req_sector + preq->req_size) {
+		preq->bl.tail->bi_next = bio;
+		preq->bl.tail = bio;
+		preq->req_size += (bio->bi_size >> 9);
+		preq->tstamp = jiffies;
+		if (bio->bi_rw & REQ_SYNC)
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(bio->bi_rw, preq);
+		plo->st.coal_forw++;
+		n = rb_next(&preq->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == preq->req_cluster &&
+			    preq->req_sector + preq->req_size >= preq1->req_sector)
+				overlap_forward(plo, preq, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	if (bio->bi_sector + (bio->bi_size >> 9) == preq->req_sector) {
+		bio->bi_next = preq->bl.head;
+		preq->bl.head = bio;
+		preq->req_size += (bio->bi_size >> 9);
+		preq->req_sector = bio->bi_sector;
+		preq->tstamp = jiffies;
+		plo->st.coal_back++;
+		if (bio->bi_rw & REQ_SYNC)
+			preq_set_sync_bit(preq);
+		merge_rw_flags_to_req(bio->bi_rw, preq);
+		n = rb_prev(&preq->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == preq->req_cluster &&
+			    preq->req_sector <= preq1->req_sector + preq1->req_size)
+				overlap_backward(plo, preq, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	return 0;
+}
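+
+/* Illustrative sketch (comment only, assuming cluster_log = 11, i.e.
+ * 2048-sector clusters): try_merge() extends preq in place when a bio
+ * is exactly adjacent to it.  With preq covering [2056, 2064):
+ *
+ *	tail merge: bio at 2064 -> preq becomes [2056, 2072)
+ *	head merge: bio [2048, 2056) -> preq becomes [2048, 2064)
+ *
+ * After either merge, the rb-tree neighbour is probed and
+ * overlap_forward()/overlap_backward() coalesce further if the grown
+ * request now touches it.
+ */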
+
+static struct ploop_request *
+tree_insert(struct rb_root *root, struct ploop_request * preq0)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct ploop_request * preq;
+
+	while (*p) {
+		parent = *p;
+		preq = rb_entry(parent, struct ploop_request, lockout_link);
+
+		if (preq0->req_cluster < preq->req_cluster)
+			p = &(*p)->rb_left;
+		else if (preq0->req_cluster > preq->req_cluster)
+			p = &(*p)->rb_right;
+		else if (preq0->req_sector + preq0->req_size < preq->req_sector)
+			p = &(*p)->rb_left;
+		else if (preq0->req_sector > preq->req_sector + preq->req_size)
+			p = &(*p)->rb_right;
+		else
+			return preq;
+	}
+
+	rb_link_node(&preq0->lockout_link, parent, p);
+	rb_insert_color(&preq0->lockout_link, root);
+	__set_bit(PLOOP_REQ_SORTED, &preq0->state);
+	return NULL;
+}
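+
+/* Illustrative (comment only): the entry tree is keyed first by
+ * req_cluster and then by sector range, so two requests "clash" iff
+ * they live in the same cluster and their ranges touch or overlap:
+ *
+ *	if (a->req_cluster != b->req_cluster)
+ *		order by req_cluster;
+ *	else if (a->req_sector + a->req_size < b->req_sector)
+ *		a sorts left;			// strictly below b
+ *	else if (a->req_sector > b->req_sector + b->req_size)
+ *		a sorts right;			// strictly above b
+ *	else
+ *		clash;				// adjacent or overlapping
+ *
+ * insert_entry_tree() then tries to merge preq0 into the clashing
+ * request instead of inserting a second node for the same range.
+ */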
+
+static int
+insert_entry_tree(struct ploop_device * plo, struct ploop_request * preq0,
+		  struct list_head *drop_list)
+{
+	struct ploop_request * clash;
+	struct rb_node * n;
+
+	clash = tree_insert(&plo->entry_tree[preq0->req_rw & WRITE], preq0);
+	if (!clash)
+		return 0;
+
+	if (preq0->req_sector == clash->req_sector + clash->req_size) {
+		clash->bl.tail->bi_next = preq0->bl.head;
+		clash->bl.tail = preq0->bl.tail;
+		clash->req_size += preq0->req_size;
+		clash->tstamp = jiffies;
+		if (test_bit(PLOOP_REQ_SYNC, &preq0->state))
+			preq_set_sync_bit(clash);
+		merge_rw_flags_to_req(preq0->req_rw, clash);
+		preq_unlink(preq0, drop_list);
+		plo->st.coal_forw2++;
+
+		n = rb_next(&clash->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == clash->req_cluster &&
+			    clash->req_sector + clash->req_size >= preq1->req_sector)
+				overlap_forward(plo, clash, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	if (clash->req_sector == preq0->req_sector + preq0->req_size) {
+		preq0->bl.tail->bi_next = clash->bl.head;
+		clash->bl.head = preq0->bl.head;
+		clash->req_size += preq0->req_size;
+		clash->req_sector = preq0->req_sector;
+		clash->tstamp = jiffies;
+		plo->st.coal_back2++;
+		if (test_bit(PLOOP_REQ_SYNC, &preq0->state))
+			preq_set_sync_bit(clash);
+		merge_rw_flags_to_req(preq0->req_rw, clash);
+		preq_unlink(preq0, drop_list);
+
+		n = rb_prev(&clash->lockout_link);
+		if (n) {
+			struct ploop_request * preq1;
+
+			preq1 = rb_entry(n, struct ploop_request, lockout_link);
+			if (preq1->req_cluster == clash->req_cluster &&
+			    clash->req_sector <= preq1->req_sector + preq1->req_size)
+				overlap_backward(plo, clash, preq1, drop_list);
+		}
+		return 1;
+	}
+
+	plo->st.coal_overlap++;
+
+	return 0;
+}
+
+static void
+ploop_bio_queue(struct ploop_device * plo, struct bio * bio,
+		struct list_head *drop_list, int account_blockable)
+{
+	struct ploop_request * preq;
+
+	BUG_ON(list_empty(&plo->free_list));
+	BUG_ON(plo->free_qlen <= 0);
+	preq = list_entry(plo->free_list.next, struct ploop_request, list);
+	list_del_init(&preq->list);
+	plo->free_qlen--;
+
+	preq->req_cluster = bio->bi_sector >> plo->cluster_log;
+	bio->bi_next = NULL;
+	preq->req_sector = bio->bi_sector;
+	preq->req_size = bio->bi_size >> 9;
+	preq->req_rw = bio->bi_rw;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = 0;
+	preq->ppb_state = 0;
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+	preq->prealloc_size = 0;
+
+	if (account_blockable && (bio->bi_rw & REQ_WRITE) && bio->bi_size &&
+	    ploop_pb_check_and_clear_bit(plo->pbd, preq->req_cluster))
+		ploop_set_blockable(plo, preq);
+
+	if (unlikely(bio->bi_rw & REQ_DISCARD)) {
+		int clu_size = 1 << plo->cluster_log;
+		int i = (clu_size - 1) & bio->bi_sector;
+		int err = 0;
+
+		if (i) {
+			preq->req_cluster++;
+			if (preq->req_size >= clu_size)
+				preq->req_size -= clu_size - i;
+		}
+
+		if (preq->req_size < clu_size ||
+		    (err = ploop_discard_add_bio(plo->fbd, bio))) {
+			if (test_bit(BIO_BDEV_REUSED, &bio->bi_flags)) {
+				struct io_context *ioc;
+				ioc = (struct io_context *)(bio->bi_bdev);
+				atomic_dec(&ioc->nr_tasks);
+				put_io_context_active(ioc);
+
+				bio->bi_bdev = plo->bdev;
+				clear_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+			}
+			BIO_ENDIO(plo->queue, bio, err);
+			list_add(&preq->list, &plo->free_list);
+			plo->free_qlen++;
+			plo->bio_discard_qlen--;
+			plo->bio_total--;
+			return;
+		}
+
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_DISCARD);
+		preq->dst_iblock = 0;
+		preq->bl.head = preq->bl.tail = NULL;
+	} else
+		preq->bl.head = preq->bl.tail = bio;
+
+	if (test_bit(BIO_BDEV_REUSED, &bio->bi_flags)) {
+		preq->ioc = (struct io_context *)(bio->bi_bdev);
+		bio->bi_bdev = plo->bdev;
+		clear_bit(BIO_BDEV_REUSED, &bio->bi_flags);
+	} else {
+		preq->ioc = NULL;
+	}
+
+	if (unlikely(bio->bi_rw & REQ_SYNC))
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+	if (unlikely(bio == plo->bio_sync)) {
+		__set_bit(PLOOP_REQ_SYNC, &preq->state);
+		plo->bio_sync = NULL;
+	}
+
+	__TRACE("A %p %u\n", preq, preq->req_cluster);
+
+	if (unlikely(bio->bi_rw & REQ_DISCARD))
+		plo->bio_discard_qlen--;
+	else
+		plo->bio_qlen--;
+	ploop_entry_add(plo, preq);
+
+	if (bio->bi_size && !(bio->bi_rw & REQ_DISCARD))
+		insert_entry_tree(plo, preq, drop_list);
+
+	trace_bio_queue(preq);
+}
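+
+/* Illustrative (comment only): the discard path above trims the range
+ * down to whole clusters.  Assuming cluster_log = 11 (2048-sector
+ * clusters) and a discard bio covering sectors [3072, 8192):
+ *
+ *	i = 3072 & 2047 = 1024;			// offset in cluster 1
+ *	req_cluster: 1 -> 2;			// skip partial head
+ *	req_size = 5120 - (2048 - 1024) = 4096;	// two whole clusters
+ *
+ * If fewer than clu_size sectors remain, there is no whole cluster to
+ * free and the bio is completed right away.
+ */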
+
+static inline struct ploop_request *
+ploop_get_request(struct ploop_device * plo, struct list_head * list)
+{
+	struct ploop_request * preq;
+
+	if (unlikely(list_empty(list)))
+		return NULL;
+
+	preq = list_first_entry(list, struct ploop_request, list);
+	list_del_init(&preq->list);
+	return preq;
+}
+
+static struct ploop_delta * find_delta(struct ploop_device * plo, int level)
+{
+	struct ploop_delta * delta;
+
+	list_for_each_entry(delta, &plo->map.delta_list, list) {
+		if (delta->level == level)
+			return delta;
+	}
+
+	return NULL;
+}
+
+DEFINE_BIO_CB(ploop_fast_end_io)
+{
+	unsigned long flags;
+	struct ploop_device * plo;
+	struct bio * orig = bio->bi_private;
+
+	plo = orig->bi_bdev->bd_disk->private_data;
+
+	BIO_ENDIO(plo->queue, orig, err);
+
+	/* Completion of a fast bio wakes up the main process only when
+	 * it could mean leaving the ATTENTION state.
+	 */
+	spin_lock_irqsave(&plo->lock, flags);
+	plo->active_reqs--;
+	plo->fastpath_reqs--;
+	plo->bio_total--;
+
+	if (plo->active_reqs == 0 &&
+	    test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq) &&
+	    (test_bit(PLOOP_S_EXITING, &plo->state) ||
+	     !list_empty(&plo->entry_queue)))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irqrestore(&plo->lock, flags);
+
+	bio_put(bio);
+}
+END_BIO_CB(ploop_fast_end_io)
+
+static struct ploop_delta *
+ploop_fast_lookup(struct ploop_device * plo, sector_t sec,
+		  unsigned long rw, sector_t * isec)
+{
+	struct ploop_delta * top_delta, * delta;
+	int level;
+	cluster_t bio_cluster = sec >> plo->cluster_log;
+	iblock_t iblk;
+
+	level = ploop_fastmap(&plo->map, bio_cluster, &iblk);
+	if (level < 0)
+		return NULL;
+
+	top_delta = ploop_top_delta(plo);
+	delta = top_delta;
+
+	if (level != top_delta->level) {
+		/* _XXX_ here is a problem. At merge_bvec() time we do
+		 * not know whether this bio is a read or a write. If it
+		 * is a read, we should check the backing map. This is a
+		 * tradeoff: either we direct reads to the slow path, or
+		 * we do not aggregate writes, which makes COW much
+		 * slower. For now we choose to optimize COW.
+		 */
+		if (rw & REQ_WRITE)
+			return NULL;
+
+		delta = find_delta(plo, level);
+	}
+	if (delta) {
+		*isec = ((sector_t)iblk << plo->cluster_log) +
+			(sec & ((1 << plo->cluster_log) - 1));
+	}
+	return delta;
+}
+
+/* Got a bio which is mapped 1:1 to the block device.
+ * But there is a problem: this bio could bypass the device merge
+ * functions, because we skipped the checks in our own merge_fn.
+ *
+ * We cannot split a bio in the fast path, but we can revalidate it.
+ *
+ * q->max_phys_segments and q->max_hw_segments must be set to the
+ * minimum over all participating backing devices.
+ */
+
+static int
+bio_fast_map(struct ploop_device * plo, struct bio * orig_bio, struct bio * bio)
+{
+	struct ploop_delta * delta;
+	sector_t isector;
+
+	if (orig_bio->bi_size == 0)
+		delta = ploop_top_delta(plo);
+	else
+		delta = ploop_fast_lookup(plo, orig_bio->bi_sector,
+					  orig_bio->bi_rw, &isector);
+	if (delta == NULL) {
+		plo->st.fast_neg_nomap++;
+		return 1;
+	}
+
+	if (delta->io.ops->fastmap == NULL)
+		return 1;
+
+	return delta->io.ops->fastmap(&delta->io, orig_bio, bio, isector);
+}
+
+static inline unsigned int block_vecs(struct ploop_device * plo)
+{
+	return 1 << (plo->cluster_log + 9 - PAGE_SHIFT);
+}
+
+static int whole_block(struct ploop_device * plo, struct ploop_request *preq)
+{
+	if (preq->req_size != (1<<plo->cluster_log))
+		return 0;
+	return !(preq->req_sector & ((1<<plo->cluster_log) - 1));
+}
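+
+/* Illustrative arithmetic (comment only): a cluster is 1 << cluster_log
+ * sectors, i.e. 1 << (cluster_log + 9) bytes.  Assuming cluster_log = 11
+ * and PAGE_SHIFT = 12:
+ *
+ *	block_vecs() = 1 << (11 + 9 - 12) = 256 pages per cluster;
+ *
+ * and whole_block() is true only for a request of exactly 2048 sectors
+ * whose req_sector is a multiple of 2048.
+ */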
+
+static struct bio *
+preallocate_bio(struct bio * orig_bio, struct ploop_device * plo)
+{
+	struct bio * nbio = NULL;
+
+	if (plo->cached_bio) {
+		spin_lock_irq(&plo->lock);
+		nbio = plo->cached_bio;
+		if (nbio) {
+			if (orig_bio->bi_vcnt <= nbio->bi_max_vecs)
+				plo->cached_bio = NULL;
+			else
+				nbio = NULL;
+		}
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (nbio == NULL)
+		nbio = bio_alloc(GFP_NOIO, max(orig_bio->bi_max_vecs, block_vecs(plo)));
+	return nbio;
+}
+
+static void process_bio_queue_one(struct ploop_device * plo,
+				  struct list_head *drop_list,
+				  int check_push_backup)
+{
+	struct bio *bio = plo->bio_head;
+
+	BUG_ON (!plo->bio_tail);
+	plo->bio_head = plo->bio_head->bi_next;
+	if (!plo->bio_head)
+		plo->bio_tail = NULL;
+
+	if (check_push_backup &&
+	    (bio->bi_rw & REQ_WRITE) && bio->bi_size &&
+	    plo->free_qlen <= plo->free_qmax / 2 &&
+	    plo->blockable_reqs > plo->free_qmax / 4 &&
+	    ploop_pb_bio_detained(plo->pbd, bio))
+		plo->blocked_bios++;
+	else
+		ploop_bio_queue(plo, bio, drop_list, check_push_backup);
+}
+
+static void process_bio_queue_optional(struct ploop_device * plo,
+				       struct list_head *drop_list)
+{
+	while (plo->bio_head && !list_empty(&plo->free_list) &&
+	       (!test_bit(PLOOP_S_PUSH_BACKUP, &plo->state) ||
+		plo->free_qlen > plo->free_qmax / 2))
+		process_bio_queue_one(plo, drop_list, 0);
+}
+
+static void process_bio_queue_main(struct ploop_device * plo,
+				   struct list_head *drop_list)
+{
+	int check = test_bit(PLOOP_S_PUSH_BACKUP, &plo->state);
+
+	while (plo->bio_head && !list_empty(&plo->free_list))
+		process_bio_queue_one(plo, drop_list, check);
+}
+
+static void ploop_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct ploop_device *plo = cb->data;
+
+	clear_bit(PLOOP_S_SYNC, &plo->state);
+
+	/* And kick our "soft" queue too, in case the mitigation timer
+	 * is in effect */
+	spin_lock_irq(&plo->lock);
+	if (plo->bio_head) {
+		BUG_ON (!plo->bio_tail);
+		/* another way would be: bio_tail->bi_rw |= BIO_RW_SYNCIO; */
+		plo->bio_sync = plo->bio_tail;
+	} else if (!list_empty(&plo->entry_queue)) {
+		struct ploop_request * preq = list_entry(plo->entry_queue.prev,
+							 struct ploop_request,
+							 list);
+		preq_set_sync_bit(preq);
+	}
+
+	if ((!list_empty(&plo->entry_queue) ||
+	     (plo->bio_head && !list_empty(&plo->free_list))) &&
+	    test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+
+	kfree(cb);
+}
+
+static void
+process_discard_bio_queue(struct ploop_device * plo, struct list_head *drop_list)
+{
+	bool discard = test_bit(PLOOP_S_DISCARD, &plo->state);
+
+	while (!list_empty(&plo->free_list)) {
+		struct bio *tmp;
+
+		/* Only one discard bio can be handled concurrently */
+		if (discard && ploop_discard_is_inprogress(plo->fbd))
+			return;
+
+		tmp = bio_list_pop(&plo->bio_discard_list);
+		if (tmp == NULL)
+			break;
+
+		/* If PLOOP_S_DISCARD isn't set, ploop_bio_queue
+		 * will complete it with a proper error.
+		 */
+		ploop_bio_queue(plo, tmp, drop_list, 0);
+	}
+}
+
+static void ploop_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct bio * nbio;
+	struct ploop_device * plo = q->queuedata;
+	unsigned long rw = bio_data_dir(bio);
+	struct hd_struct *part;
+	int cpu;
+	LIST_HEAD(drop_list);
+
+	trace_make_request(bio);
+
+	plo->st.bio_in++;
+
+	BUG_ON(bio->bi_idx);
+	BUG_ON(bio->bi_size & 511);
+
+	cpu = part_stat_lock();
+	part = disk_map_sector_rcu(plo->disk, bio->bi_sector);
+	part_stat_inc(cpu, part, ios[rw]);
+	part_stat_add(cpu, part, sectors[rw], bio_sectors(bio));
+	part_stat_unlock();
+
+	if (unlikely(bio->bi_size == 0)) {
+		/* Is this possible? It makes sense only if the request is
+		 * marked as FLUSH; otherwise just warn and complete. */
+		if (!(bio->bi_rw & REQ_FLUSH)) {
+			WARN_ON(1);
+			BIO_ENDIO(q, bio, 0);
+			return;
+		}
+		/* useless to pass this bio further */
+		if (!plo->tune.pass_flushes) {
+			ploop_acc_ff_in(plo, bio->bi_rw);
+			BIO_ENDIO(q, bio, 0);
+			return;
+		}
+	}
+
+	/* This is crazy. The pattern is borrowed from raid0.c.
+	 * The bio layer assumes that it can prepare a single-page bio
+	 * regardless of any alignment constraints. So be it.
+	 */
+	if (!(bio->bi_rw & REQ_DISCARD) && bio->bi_size &&
+	    (bio->bi_sector >> plo->cluster_log) !=
+	    ((bio->bi_sector + (bio->bi_size >> 9) - 1) >> plo->cluster_log)) {
+		struct bio_pair *bp;
+		unsigned int first_sectors = (1<<plo->cluster_log)
+			- (bio->bi_sector & ((1<<plo->cluster_log) - 1));
+
+		plo->st.bio_splits++;
+
+		BUG_ON(bio->bi_vcnt != 1 || bio->bi_idx != 0);
+
+		bp = bio_split(bio, first_sectors);
+		ploop_make_request(q, &bp->bio1);
+		ploop_make_request(q, &bp->bio2);
+		bio_pair_release(bp);
+		return;
+	}
+
+	rw = bio->bi_rw;
+	if (unlikely((bio->bi_rw & REQ_FLUSH) &&
+		     !plo->tune.pass_flushes))
+		bio->bi_rw &= ~REQ_FLUSH;
+	if (unlikely((bio->bi_rw & REQ_FUA) &&
+		     !plo->tune.pass_fuas))
+		bio->bi_rw &= ~REQ_FUA;
+
+	/* Allocate new bio now. */
+	nbio = preallocate_bio(bio, plo);
+
+	if (!current->io_context) {
+		struct io_context *ioc;
+		ioc = get_task_io_context(current, GFP_NOIO, NUMA_NO_NODE);
+		if (ioc)
+			put_io_context(ioc);
+	}
+
+	spin_lock_irq(&plo->lock);
+	ploop_acc_ff_in_locked(plo, rw);
+	plo->bio_total++;
+
+	/* Device is aborted, everything is in error. This should not happen. */
+	if (unlikely(!test_bit(PLOOP_S_RUNNING, &plo->state) ||
+		     ((bio->bi_rw & REQ_WRITE) &&
+		      test_bit(PLOOP_S_ABORT, &plo->state)))) {
+		plo->bio_total--;
+		spin_unlock_irq(&plo->lock);
+
+		BIO_ENDIO(q, bio, -EIO);
+		if (nbio)
+			bio_put(nbio);
+		return;
+	}
+
+	if (bio->bi_rw & REQ_DISCARD) {
+		bio_list_add(&plo->bio_discard_list, bio);
+		plo->bio_discard_qlen++;
+		goto queued;
+	}
+
+	/* Write tracking in fast path does not work at the moment. */
+	if (unlikely(test_bit(PLOOP_S_TRACK, &plo->state) &&
+		     (bio->bi_rw & WRITE)))
+		goto queue;
+
+	/* No fast path while maintenance is in progress.
+	 * (PLOOP_S_TRACK was checked immediately above) */
+	if (FAST_PATH_DISABLED(plo->maintenance_type))
+		goto queue;
+
+	/* Attention state, always queue */
+	if (unlikely(test_bit(PLOOP_S_ATTENTION, &plo->state)))
+		goto queue;
+
+	/* Some barriers have been already enqueued, always queue */
+	if (unlikely(plo->barrier_reqs))
+		goto queue;
+
+	if (unlikely(nbio == NULL))
+		goto queue;
+
+	/* Try to merge before checking for the fast path. Maybe this
+	 * is not wise.
+	 */
+	if (!RB_EMPTY_ROOT(&plo->entry_tree[bio->bi_rw & WRITE]) &&
+	    bio->bi_size) {
+		struct ploop_request * preq;
+		struct rb_node * n = plo->entry_tree[bio->bi_rw & WRITE].rb_node;
+		u32 bio_cluster = bio->bi_sector >> plo->cluster_log;
+
+		while (n) {
+			preq = rb_entry(n, struct ploop_request, lockout_link);
+
+			if (bio_cluster < preq->req_cluster)
+				n = n->rb_left;
+			else if (bio_cluster > preq->req_cluster)
+				n = n->rb_right;
+			else if (bio->bi_sector + (bio->bi_size >> 9) < preq->req_sector)
+				n = n->rb_left;
+			else if (bio->bi_sector > preq->req_sector + preq->req_size)
+				n = n->rb_right;
+			else
+				break;
+		}
+
+		if (n && try_merge(plo, preq, bio, &drop_list))
+			goto out;
+	}
+
+	/* Try fast path. If all the mappings are available
+	 * and bio can be remapped without split, just do it.
+	 */
+	if (!bio_fast_map(plo, bio, nbio)) {
+		/* Here is a little problem. It would be really good
+		 * to remap the original bio and return 1, which is how
+		 * the make_request() engine is supposed to work.
+		 * Nevertheless, this logic is flawed.
+		 *
+		 * We cannot return the remapped bio, because we would
+		 * lose track of it and have no way to wait for the end
+		 * of its IO, e.g. to start a snapshot or to replace the
+		 * image file.
+		 */
+		trace_bio_fast_map(bio);
+		nbio->bi_private = bio;
+		nbio->bi_end_io = ploop_fast_end_io;
+		plo->active_reqs++;
+		plo->fastpath_reqs++;
+		plo->st.bio_fast++;
+		ploop_acc_ff_out_locked(plo, nbio->bi_rw);
+
+		spin_unlock_irq(&plo->lock);
+
+		generic_make_request(nbio);
+		return;
+	}
+
+	/* Otherwise: queue */
+
+queue:
+	BUG_ON (bio->bi_bdev != plo->bdev && bio_sectors(bio));
+	if (bio->bi_bdev == plo->bdev) {
+		BUG_ON (test_bit(BIO_BDEV_REUSED, &bio->bi_flags));
+		ploop_grab_iocontext(bio);
+	}
+
+	BUG_ON (bio->bi_next);
+	if (plo->bio_tail) {
+		BUG_ON (!plo->bio_head);
+		BUG_ON (plo->bio_tail->bi_next);
+		plo->bio_tail->bi_next = bio;
+		plo->bio_tail = bio;
+	} else {
+		BUG_ON (plo->bio_head);
+		plo->bio_head = plo->bio_tail = bio;
+	}
+	plo->bio_qlen++;
+	ploop_congest(plo);
+
+	/* second chance to merge requests */
+	process_bio_queue_optional(plo, &drop_list);
+
+queued:
+	/* If the main thread is waiting for requests, wake it up.
+	 * But try to mitigate wakeups by delaying the wakeup for a
+	 * short time.
+	 */
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state)) {
+		/* Synchronous requests are not batched. */
+		if (plo->entry_qlen > plo->tune.batch_entry_qlen ||
+			(bio->bi_rw & (REQ_FLUSH|REQ_FUA)) ||
+			(!bio_list_empty(&plo->bio_discard_list) &&
+			 !list_empty(&plo->free_list)) ||
+			!current->plug) {
+			wake_up_interruptible(&plo->waitq);
+		} else if (!timer_pending(&plo->mitigation_timer)) {
+			mod_timer(&plo->mitigation_timer,
+				  jiffies + plo->tune.batch_entry_delay);
+		}
+	}
+out:
+	if (nbio) {
+		if (!plo->cached_bio)
+			plo->cached_bio = nbio;
+		else
+			bio_put(nbio);
+	}
+	spin_unlock_irq(&plo->lock);
+
+	blk_check_plugged(ploop_unplug, plo, sizeof(struct blk_plug_cb));
+
+	if (!list_empty(&drop_list))
+		ploop_preq_drop(plo, &drop_list, 0);
+
+	return;
+}
+
+/* q->merge_bvec_fn
+ *
+ * According to the API, this function returns the length which we are
+ * able to merge, but nobody actually uses that value, so we return
+ * either 0 or bvec->bv_len.
+ */
+
+static int
+ploop_merge_bvec(struct request_queue *q, struct bvec_merge_data *bm_data,
+		 struct bio_vec *bvec)
+{
+	struct ploop_device *plo = q->queuedata;
+	struct ploop_delta * delta;
+	sector_t sec;
+	sector_t isector;
+	unsigned int len, ret;
+	unsigned long flags;
+
+	sec = bm_data->bi_sector + get_start_sect(bm_data->bi_bdev);
+	len = bm_data->bi_size + bvec->bv_len;
+	ret = bvec->bv_len;
+
+	/* Always allow adding the first bvec. */
+	if (!bm_data->bi_size)
+		return ret;
+
+	/* Is this possible? It would not contradict anything. */
+	BUG_ON(len & 511);
+
+	len >>= 9;
+
+	if ((sec >> plo->cluster_log) !=
+	    ((sec + len - 1) >> plo->cluster_log)) {
+		plo->st.merge_neg_cluster++;
+		return 0;
+	}
+
+	/* We could return ret right now; what follows is an optimization
+	 * to avoid splitting overhead and to enable the fast path.
+	 */
+	spin_lock_irqsave(&plo->lock, flags);
+	delta = ploop_fast_lookup(plo, sec, 0, &isector);
+	if (delta &&
+	    delta->io.ops->disable_merge &&
+	    delta->io.ops->disable_merge(&delta->io, isector, len)) {
+		plo->st.merge_neg_disable++;
+		ret = 0;
+	}
+	spin_unlock_irqrestore(&plo->lock, flags);
+
+	/* If no mapping is available, merge up to cluster boundary */
+	return ret;
+}
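+
+/* Illustrative (comment only): the cluster-boundary test above,
+ * assuming cluster_log = 11.  For sec = 2040 and len = 16 sectors:
+ *
+ *	2040 >> 11 = 0,  (2040 + 16 - 1) >> 11 = 1
+ *
+ * so the merged request would straddle the boundary at sector 2048 and
+ * the bvec is rejected (merge_neg_cluster).  With len = 8 both shifts
+ * give 0 and the bvec is accepted.
+ */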
+
+static int ploop_congested2(void *data, int bits)
+{
+	struct ploop_device * plo = data;
+
+	if (test_bit(PLOOP_S_CONGESTED, &plo->state))
+		return bits;
+
+	return 0;
+}
+
+static int ploop_congested(void *data, int bits)
+{
+	struct ploop_device * plo = data;
+	struct ploop_delta * top_delta;
+	int ret = 0;
+
+	top_delta = ploop_top_delta(plo);
+	if (top_delta->io.ops->congested)
+		ret |= top_delta->io.ops->congested(&top_delta->io, bits);
+
+	return ret;
+}
+
+static int __check_lockout(struct ploop_request *preq, bool pb)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node * n = pb ? plo->lockout_pb_tree.rb_node :
+				  plo->lockout_tree.rb_node;
+	struct ploop_request * p;
+	int lockout_bit = pb ? PLOOP_REQ_PB_LOCKOUT : PLOOP_REQ_LOCKOUT;
+
+	if (n == NULL)
+		return 0;
+
+	if (test_bit(lockout_bit, &preq->state))
+		return 0;
+
+	while (n) {
+		if (pb)
+			p = rb_entry(n, struct ploop_request, lockout_pb_link);
+		else
+			p = rb_entry(n, struct ploop_request, lockout_link);
+
+		if (preq->req_cluster < p->req_cluster)
+			n = n->rb_left;
+		else if (preq->req_cluster > p->req_cluster)
+			n = n->rb_right;
+		else {
+			list_add_tail(&preq->list, &p->delay_list);
+			plo->st.bio_lockouts++;
+			trace_preq_lockout(preq, p);
+			return 1;
+		}
+	}
+	return 0;
+}
+
+static int check_lockout(struct ploop_request *preq)
+{
+	if (__check_lockout(preq, false))
+		return 1;
+
+	/* push_backup passes READs intact */
+	if (!(preq->req_rw & REQ_WRITE))
+		return 0;
+
+	if (__check_lockout(preq, true))
+		return 1;
+
+	return 0;
+}
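+
+/* Illustrative usage sketch (comment only): the lockout trees serialize
+ * requests against the same cluster.  The typical pattern in the entry
+ * engine is:
+ *
+ *	spin_lock_irq(&plo->lock);
+ *	if (check_lockout(preq)) {	// cluster owned by another preq;
+ *		spin_unlock_irq(&plo->lock);	// we are now parked on
+ *		return;				// the owner's delay_list
+ *	}
+ *	ploop_add_lockout(preq, 0);	// become the owner
+ *	spin_unlock_irq(&plo->lock);
+ *
+ * The owner releases waiters in ploop_complete_request(), which splices
+ * its delay_list back onto the ready queue.
+ */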
+
+static int __ploop_add_lockout(struct ploop_request *preq, int try, bool pb)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+	struct rb_node *link;
+	struct rb_root *tree;
+	int lockout_bit;
+
+	if (pb) {
+		link = &preq->lockout_pb_link;
+		tree = &plo->lockout_pb_tree;
+		lockout_bit = PLOOP_REQ_PB_LOCKOUT;
+	} else {
+		link = &preq->lockout_link;
+		tree = &plo->lockout_tree;
+		lockout_bit = PLOOP_REQ_LOCKOUT;
+	}
+
+	if (test_bit(lockout_bit, &preq->state))
+		return 0;
+
+	p = &tree->rb_node;
+	while (*p) {
+		parent = *p;
+		if (pb)
+			pr = rb_entry(parent, struct ploop_request, lockout_pb_link);
+		else
+			pr = rb_entry(parent, struct ploop_request, lockout_link);
+
+		if (preq->req_cluster == pr->req_cluster) {
+			if (try)
+				return 1;
+			BUG();
+		}
+
+		if (preq->req_cluster < pr->req_cluster)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	trace_add_lockout(preq);
+
+	rb_link_node(link, parent, p);
+	rb_insert_color(link, tree);
+	__set_bit(lockout_bit, &preq->state);
+	return 0;
+}
+
+int ploop_add_lockout(struct ploop_request *preq, int try)
+{
+	return __ploop_add_lockout(preq, try, false);
+}
+EXPORT_SYMBOL(ploop_add_lockout);
+
+static void ploop_add_pb_lockout(struct ploop_request *preq)
+{
+	__ploop_add_lockout(preq, 0, true);
+}
+
+static void __del_lockout(struct ploop_request *preq, bool pb)
+{
+	struct ploop_device * plo = preq->plo;
+	struct rb_node *link;
+	struct rb_root *tree;
+	int lockout_bit;
+
+	if (pb) {
+		link = &preq->lockout_pb_link;
+		tree = &plo->lockout_pb_tree;
+		lockout_bit = PLOOP_REQ_PB_LOCKOUT;
+	} else {
+		link = &preq->lockout_link;
+		tree = &plo->lockout_tree;
+		lockout_bit = PLOOP_REQ_LOCKOUT;
+	}
+
+	if (!test_and_clear_bit(lockout_bit, &preq->state))
+		return;
+
+	trace_del_lockout(preq);
+
+	rb_erase(link, tree);
+}
+
+void del_lockout(struct ploop_request *preq)
+{
+	__del_lockout(preq, false);
+}
+
+static void del_pb_lockout(struct ploop_request *preq)
+{
+	__del_lockout(preq, true);
+}
+
+static void ploop_discard_wakeup(struct ploop_request *preq, int err)
+{
+	struct ploop_device *plo = preq->plo;
+
+	if (err || !ploop_fb_get_n_free(plo->fbd)) {
+		/* Only one discard request is processed */
+		ploop_fb_reinit(plo->fbd, err);
+	} else
+		set_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		if (test_bit(PLOOP_S_DISCARD_LOADED, &plo->state) ||
+		    !test_bit(PLOOP_S_DISCARD, &plo->state))
+			complete(&plo->maintenance_comp);
+}
+
+static void ploop_complete_request(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	int nr_completed = 0;
+	struct io_context *ioc;
+
+	trace_complete_request(preq);
+
+	__TRACE("Z %p %u\n", preq, preq->req_cluster);
+
+	while (preq->bl.head) {
+		struct bio * bio = preq->bl.head;
+		preq->bl.head = bio->bi_next;
+		bio->bi_next = NULL;
+		BIO_ENDIO(plo->queue, bio, preq->error);
+		nr_completed++;
+	}
+	preq->bl.tail = NULL;
+
+	WARN_ON(!preq->error && test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state));
+
+	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+	    test_bit(PLOOP_REQ_RELOC_S, &preq->state) ||
+	    test_bit(PLOOP_REQ_RELOC_N, &preq->state)) {
+		if (preq->error)
+			set_bit(PLOOP_S_ABORT, &plo->state);
+
+		if (atomic_dec_and_test(&plo->maintenance_cnt))
+			complete(&plo->maintenance_comp);
+	} else if (test_bit(PLOOP_REQ_MERGE, &preq->state)) {
+		if (!preq->error) {
+			if (plo->merge_ptr < plo->trans_map->max_index) {
+				spin_lock_irq(&plo->lock);
+				if (preq->map) {
+					map_release(preq->map);
+					preq->map = NULL;
+				}
+				if (preq->trans_map) {
+					map_release(preq->trans_map);
+					preq->trans_map = NULL;
+				}
+
+				del_lockout(preq);
+
+				preq->req_cluster = ~0U;
+
+				if (!list_empty(&preq->delay_list))
+					list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+				plo->active_reqs--;
+
+				preq->eng_state = PLOOP_E_ENTRY;
+				ploop_entry_add(plo, preq);
+				spin_unlock_irq(&plo->lock);
+				return;
+			}
+		} else
+			set_bit(PLOOP_S_ABORT, &plo->state);
+
+		if (atomic_dec_and_test(&plo->maintenance_cnt))
+			complete(&plo->maintenance_comp);
+	} else if (test_bit(PLOOP_REQ_DISCARD, &preq->state))
+		ploop_discard_wakeup(preq, preq->error);
+
+	if (preq->aux_bio) {
+		int i;
+		struct bio * bio = preq->aux_bio;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			struct page *page = bio->bi_io_vec[i].bv_page;
+			if (page != ZERO_PAGE(0))
+				put_page(page);
+		}
+
+		bio_put(bio);
+
+		preq->aux_bio = NULL;
+	}
+
+	spin_lock_irq(&plo->lock);
+
+	del_lockout(preq);
+	del_pb_lockout(preq); /* preq may die via ploop_fail_immediate() */
+	ploop_test_and_clear_blockable(plo, preq);
+
+	if (!list_empty(&preq->delay_list))
+		list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	if (preq->trans_map) {
+		map_release(preq->trans_map);
+		preq->trans_map = NULL;
+	}
+
+	ioc = preq->ioc;
+	preq->ioc = NULL;
+
+	plo->active_reqs--;
+
+	if (unlikely(test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+		ploop_fb_put_zero_request(plo->fbd, preq);
+	} else {
+		ploop_uncongest(plo);
+		list_add(&preq->list, &plo->free_list);
+		plo->free_qlen++;
+		if (waitqueue_active(&plo->req_waitq))
+			wake_up(&plo->req_waitq);
+		else if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+			 waitqueue_active(&plo->waitq) &&
+			 (plo->bio_head ||
+			  !bio_list_empty(&plo->bio_discard_list)))
+			wake_up_interruptible(&plo->waitq);
+	}
+	plo->bio_total -= nr_completed;
+
+	if (plo->tune.congestion_detection &&
+	    plo->entry_qlen + plo->active_reqs - plo->fastpath_reqs
+	    <= plo->tune.max_requests/2) {
+		if (test_and_clear_bit(PLOOP_S_WRITE_CONG, &plo->state))
+			clear_bdi_congested(&plo->queue->backing_dev_info, WRITE);
+		if (test_and_clear_bit(PLOOP_S_READ_CONG, &plo->state))
+			clear_bdi_congested(&plo->queue->backing_dev_info, READ);
+	}
+
+	spin_unlock_irq(&plo->lock);
+
+	if (ioc) {
+		atomic_dec(&ioc->nr_tasks);
+		put_io_context_active(ioc);
+	}
+}
+
+void ploop_fail_request(struct ploop_request * preq, int err)
+{
+	struct ploop_device * plo = preq->plo;
+
+	ploop_req_set_error(preq, err);
+
+	spin_lock_irq(&plo->lock);
+	if (err == -ENOSPC) {
+		set_bit(PLOOP_S_ENOSPC_EVENT, &plo->state);
+		list_add(&preq->list, &plo->ready_queue);
+		if (waitqueue_active(&plo->event_waitq))
+			wake_up_interruptible(&plo->event_waitq);
+	} else {
+		set_bit(PLOOP_S_ABORT, &plo->state);
+		list_add_tail(&preq->list, &plo->ready_queue);
+	}
+	spin_unlock_irq(&plo->lock);
+}
+EXPORT_SYMBOL(ploop_fail_request);
+
+void ploop_fail_immediate(struct ploop_request * preq, int err)
+{
+	struct ploop_device * plo = preq->plo;
+
+	ploop_req_set_error(preq, err);
+
+	set_bit(PLOOP_S_ABORT, &plo->state);
+	preq->eng_state = PLOOP_E_COMPLETE;
+	ploop_complete_request(preq);
+}
+
+#define PLOOP_REQ_FAIL_IMMEDIATE(preq, err)		\
+	do {						\
+		PLOOP_REQ_TRACE_ERROR(preq, err);	\
+		ploop_fail_immediate(preq, err);	\
+	} while (0)
+
+void ploop_complete_io_state(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	unsigned long flags;
+
+	spin_lock_irqsave(&plo->lock, flags);
+	__TRACE("C %p %u\n", preq, preq->req_cluster);
+	if (preq->error)
+		set_bit(PLOOP_S_ABORT, &plo->state);
+
+	list_add_tail(&preq->list, &plo->ready_queue);
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irqrestore(&plo->lock, flags);
+}
+EXPORT_SYMBOL(ploop_complete_io_state);
+
+static int fill_bio(struct ploop_device *plo, struct bio * bio, cluster_t blk)
+{
+	int pages = block_vecs(plo);
+
+	for (; bio->bi_vcnt < pages; bio->bi_vcnt++) {
+		bio->bi_io_vec[bio->bi_vcnt].bv_page = alloc_page(GFP_NOFS);
+		if (bio->bi_io_vec[bio->bi_vcnt].bv_page == NULL)
+			return -ENOMEM;
+		bio->bi_io_vec[bio->bi_vcnt].bv_offset = 0;
+		bio->bi_io_vec[bio->bi_vcnt].bv_len = PAGE_SIZE;
+	}
+	bio->bi_sector = blk << plo->cluster_log;
+	bio->bi_size = (1 << (plo->cluster_log + 9));
+	return 0;
+}
+
+/* Not generic. We assume that dst is aligned properly, i.e. it is an
+ * array of whole pages starting at a cluster boundary.
+ */
+static void bio_bcopy(struct bio *dst, struct bio *src, struct ploop_device *plo)
+{
+	int i;
+	unsigned int doff, soff, bv_off;
+
+	doff = (src->bi_sector & ((1<<plo->cluster_log) - 1)) << 9;
+	soff = 0;
+	bv_off = 0;
+	i = 0;
+
+	while (soff < src->bi_size) {
+		struct bio_vec * bv = src->bi_io_vec + i;
+		unsigned int copy;
+		int didx;
+		int poff;
+		void * ksrc;
+
+		if (bv_off >= bv->bv_len) {
+			i++;
+			bv++;
+			bv_off = 0;
+		}
+
+		didx = doff / PAGE_SIZE;
+		poff = doff & (PAGE_SIZE-1);
+		copy = bv->bv_len - bv_off;
+		if (copy > PAGE_SIZE - poff)
+			copy = PAGE_SIZE - poff;
+
+		ksrc = kmap_atomic(bv->bv_page);
+		memcpy(page_address(dst->bi_io_vec[didx].bv_page) + poff,
+		       ksrc + bv->bv_offset + bv_off,
+		       copy);
+		kunmap_atomic(ksrc);
+
+		bv_off += copy;
+		doff += copy;
+		soff += copy;
+	}
+}
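+
+/* Illustrative (comment only): bio_bcopy() places src at its byte
+ * offset inside the cluster-aligned dst.  Assuming cluster_log = 11,
+ * PAGE_SIZE = 4096 and a src bio at sector 2060 (cluster 1, sector
+ * offset 12):
+ *
+ *	doff = 12 << 9 = 6144;		// byte offset in dst
+ *	didx = 6144 / 4096 = 1;		// second dst page
+ *	poff = 6144 & 4095 = 2048;	// halfway into that page
+ *
+ * Each iteration copies up to the end of the current src bvec or the
+ * current dst page, whichever comes first.
+ */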
+
+int check_zeros(struct bio_list * bl)
+{
+	struct bio * bio;
+
+	bio_list_for_each(bio, bl) {
+		int i;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			struct bio_vec * bv = bio->bi_io_vec + i;
+			unsigned long * ptr;
+			void * kaddr;
+			int k;
+
+			if (bv->bv_page == ZERO_PAGE(0))
+				continue;
+
+			kaddr = kmap_atomic(bv->bv_page);
+			ptr = kaddr + bv->bv_offset;
+			k = bv->bv_len/sizeof(unsigned long);
+			while (k) {
+				if (*ptr)
+					break;
+				ptr++;
+				k--;
+			}
+			kunmap_atomic(kaddr);
+			if (k)
+				return 0;
+		}
+	}
+	return 1;
+}
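+
+/* Usage sketch (comment only): with plo->tune.check_zeros enabled, the
+ * entry engine turns an all-zero write to an unallocated cluster into
+ * a no-op instead of allocating a block:
+ *
+ *	if (plo->tune.check_zeros && check_zeros(&preq->bl)) {
+ *		preq->eng_state = PLOOP_E_COMPLETE;
+ *		ploop_complete_io_state(preq);	// nothing to allocate
+ *	}
+ *
+ * The word-wise scan relies on bv_len being a multiple of
+ * sizeof(unsigned long), which holds because all bio sizes here are
+ * 512-byte aligned.
+ */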
+
+static int prepare_merge_req(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	u32 iblk;
+	int res;
+
+	BUG_ON (preq->trans_map == NULL);
+
+	if (trans_map_get_index(preq, preq->req_cluster, &iblk)) {
+		u32 cluster = preq->req_cluster;
+
+		preq->req_cluster = ~0U;
+
+		if (cluster + 1 != plo->merge_ptr)
+			goto drop_map;
+
+		do {
+			cluster++;
+
+			if (cluster >= plo->trans_map->max_index)
+				goto drop_map;
+
+			if (cluster > map_get_mn_end(preq->trans_map)) {
+				plo->merge_ptr = cluster;
+				goto drop_map;
+			}
+		} while (trans_map_get_index(preq, cluster, &iblk));
+
+		preq->req_cluster = cluster;
+		plo->merge_ptr = cluster + 1;
+	}
+
+	spin_lock_irq(&plo->lock);
+	res = ploop_add_lockout(preq, 1);
+	spin_unlock_irq(&plo->lock);
+	return res;
+
+drop_map:
+	spin_lock_irq(&plo->lock);
+	map_release(preq->trans_map);
+	preq->trans_map = NULL;
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	spin_unlock_irq(&plo->lock);
+	return 1;
+}
+
+void ploop_queue_zero_request(struct ploop_device *plo,
+			      struct ploop_request *orig_preq, cluster_t clu)
+{
+	struct ploop_request * preq;
+
+	spin_lock_irq(&plo->lock);
+
+	preq = ploop_fb_get_zero_request(plo->fbd);
+	preq->bl.tail = preq->bl.head = NULL;
+	preq->req_cluster = clu;
+	preq->req_size = 0;
+	preq->req_rw = WRITE_SYNC;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_ZERO);
+	if (test_bit(PLOOP_REQ_SYNC, &orig_preq->state))
+		preq->state |= (1 << PLOOP_REQ_SYNC);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+
+	if (test_bit(PLOOP_REQ_RELOC_S, &orig_preq->state)) {
+		if (orig_preq->dst_iblock == ~0U)
+			orig_preq->eng_state = PLOOP_E_RELOC_COMPLETE;
+	} else {
+		orig_preq->eng_state = orig_preq->iblock ?
+			PLOOP_E_DELTA_ZERO_INDEX : PLOOP_E_ZERO_INDEX;
+	}
+	orig_preq->iblock = 0;
+	INIT_LIST_HEAD(&preq->delay_list);
+	list_add_tail(&orig_preq->list, &preq->delay_list);
+
+	list_add(&preq->list, &plo->ready_queue);
+	plo->active_reqs++;
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static void
+ploop_reloc_sched_read(struct ploop_request *preq, iblock_t iblk)
+{
+	struct ploop_device *plo   = preq->plo;
+	struct ploop_delta  *delta = ploop_top_delta(plo);
+	struct bio_list sbl;
+
+	spin_lock_irq(&plo->lock);
+	if (check_lockout(preq)) {
+		__TRACE("l2 %p %u\n", preq, preq->req_cluster);
+		spin_unlock_irq(&plo->lock);
+		return;
+	}
+	ploop_add_lockout(preq, 0);
+	spin_unlock_irq(&plo->lock);
+
+	if (!preq->aux_bio) {
+		preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+		if (!preq->aux_bio ||
+		    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+			return;
+		}
+	}
+
+	preq->iblock = iblk;
+	preq->eng_state = PLOOP_E_RELOC_DATA_READ;
+	sbl.head = sbl.tail = preq->aux_bio;
+	delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+			      &sbl, iblk, 1<<plo->cluster_log);
+}
+
+/*
+ * Returns 0 if and only if a free block was successfully reused
+ */
+static int
+ploop_reuse_free_block(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	iblock_t  iblk;
+	cluster_t clu;
+	int	  rc;
+	unsigned long pin_state;
+
+	if (plo->maintenance_type != PLOOP_MNTN_FBLOADED &&
+	    plo->maintenance_type != PLOOP_MNTN_RELOC)
+		return -1;
+
+	rc = ploop_fb_get_free_block(plo->fbd, &clu, &iblk);
+
+	/* simple case - no free blocks left */
+	if (rc < 0)
+		return rc;
+
+	/* a free block to reuse requires zeroing index */
+	if (rc > 0) {
+		ploop_queue_zero_request(plo, preq, clu);
+		return 0;
+	}
+
+	/* 'rc == 0' - use iblk as a lost block */
+	pin_state = preq->iblock ? PLOOP_E_DELTA_ZERO_INDEX :
+				   PLOOP_E_ZERO_INDEX;
+	preq->iblock = iblk;
+
+	/* Pin preq to a reloc request that is currently processing iblk? */
+	if (ploop_fb_check_reloc_req(plo->fbd, preq, pin_state))
+		return 0;
+
+	/* iblk is a lost block and nobody is relocating it now */
+	preq->eng_state = PLOOP_E_DATA_WBI;
+	__TRACE("T2 %p %u\n", preq, preq->req_cluster);
+	plo->st.bio_out++;
+
+	if (pin_state == PLOOP_E_ZERO_INDEX) {
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+					  &preq->bl, preq->iblock,
+					  preq->req_size);
+	} else { /* pin_state == PLOOP_E_DELTA_ZERO_INDEX */
+		struct bio_list sbl;
+
+		BUG_ON (preq->aux_bio == NULL);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+				      &sbl, preq->iblock, 1<<plo->cluster_log);
+	}
+
+	return 0;
+}
+
+/*
+ * Returns 0 if and only if zero preq was successfully processed
+ */
+static int
+ploop_entry_zero_req(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	int	 level;
+	iblock_t iblk = 0;
+	int	 err;
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, preq->req_cluster);
+			return 0;
+		}
+		return err;
+	}
+
+	level = map_get_index(preq, preq->req_cluster, &iblk);
+	if (level != top_delta->level) {
+		printk("Can't zero index on wrong level=%d "
+		       "(top_level=%d req_cluster=%u iblk=%u/%u)\n",
+		       level, top_delta->level, preq->req_cluster,
+		       iblk, preq->iblock);
+		return -EIO;
+	}
+
+	ploop_index_update(preq);
+	return 0;
+}
+
+#define MAP_MAX_IND(preq) min(map_get_mn_end(preq->map),	\
+			      preq->plo->map.max_index - 1)
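+
+/* Illustrative (comment only): MAP_MAX_IND() clamps a scan to both the
+ * last cluster covered by the currently held map node and the last
+ * valid cluster of the device.  E.g. if the map node ends at cluster
+ * 4095 but map.max_index is 3000, the scan stops at cluster 2999.
+ */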
+
+/*
+ * Returns 0 if and only if RELOC_A preq was successfully processed.
+ *
+ * Advance preq->req_cluster till it points to *iblk in grow range.
+ * Returning 0, always set *iblk to a meaningful value: either zero
+ * (if preq->req_cluster went out of allowed range or map is being read)
+ * or iblock in grow range that preq->req_cluster points to.
+ */
+static int
+ploop_entry_reloc_a_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	cluster_t           *clu       = &preq->req_cluster;
+	int level;
+	int err;
+	BUG_ON (*clu == ~0U);
+
+	while(*clu < plo->map.max_index) {
+		err = ploop_find_map(&plo->map, preq);
+		if (err) {
+			if (err == 1) {
+				__TRACE("m %p %u\n", preq, *clu);
+				*iblk = 0;
+				return 0;
+			}
+			return err;
+		}
+		BUG_ON (preq->map == NULL);
+
+		for (; *clu <= MAP_MAX_IND(preq); (*clu)++) {
+			level = map_get_index(preq, *clu, iblk);
+			if (level == top_delta->level &&
+			    *iblk >= plo->grow_start &&
+			    *iblk <= plo->grow_end)
+				break;
+		}
+
+		if (*clu <= MAP_MAX_IND(preq))
+			break;
+
+		spin_lock_irq(&plo->lock);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (*clu >= plo->map.max_index) {
+		preq->eng_state = PLOOP_E_COMPLETE;
+		ploop_complete_request(preq);
+		*iblk = 0;
+		return 0;
+	}
+
+	return 0;
+}
+
+/*
+ * Returns 0 if and only if RELOC_S preq was successfully processed.
+ *
+ * Sets preq->req_cluster to the block we're going to relocate.
+ * Returning 0, always set *iblk to a meaningful value: either
+ * zero (if no more blocks to relocate or block to relocate is free
+ *	 (and zero-index op is scheduled) or map is being read)
+ * or iblock that preq->req_cluster points to.
+ */
+static int
+ploop_entry_reloc_s_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+
+	cluster_t from_clu, to_clu;
+	iblock_t from_iblk, to_iblk;
+	u32 free;
+	int level;
+	int err;
+
+	*iblk = 0;
+
+	if (preq->req_cluster == ~0U) {
+		cluster_t zero_cluster;
+
+		BUG_ON (preq->error);
+		err = ploop_fb_get_reloc_block(plo->fbd, &from_clu, &from_iblk,
+					       &to_clu, &to_iblk, &free);
+		if (err < 0) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return 0;
+		}
+
+		preq->req_cluster = from_clu;
+		preq->src_iblock  = from_iblk;
+		ploop_fb_add_reloc_req(plo->fbd, preq);
+
+		if (free) {
+			preq->dst_iblock  = ~0U;
+			preq->dst_cluster = ~0U;
+			zero_cluster = preq->req_cluster;
+		} else {
+			preq->dst_iblock  = to_iblk;
+			preq->dst_cluster = to_clu;
+			zero_cluster = preq->dst_cluster;
+		}
+
+		ploop_queue_zero_request(plo, preq, zero_cluster);
+		return 0;
+	}
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, preq->req_cluster);
+			return 0;
+		}
+		return err;
+	}
+	BUG_ON (preq->map == NULL);
+
+	level = map_get_index(preq, preq->req_cluster, iblk);
+	if (level != top_delta->level) {
+		printk("Can't relocate block on wrong level=%d "
+		       "(top_level=%d req_cluster=%u iblk=%u/%u)\n",
+		       level, top_delta->level, preq->req_cluster,
+		       *iblk, preq->iblock);
+		return -EIO;
+	}
+	if (preq->src_iblock != *iblk) {
+		printk("Can't relocate block due to wrong mapping: "
+		       "req_cluster=%u should point to iblk=%u while "
+		       "map_get_index() calculated iblk=%u\n",
+		       preq->req_cluster, preq->src_iblock, *iblk);
+		return -EIO;
+	}
+
+	return 0;
+}
+
+/* thin dispatch wrapper around ploop_entry_reloc_[a|s]_req() */
+static int
+ploop_entry_reloc_req(struct ploop_request *preq, iblock_t *iblk)
+{
+	if (test_bit(PLOOP_REQ_RELOC_A, &preq->state))
+		return ploop_entry_reloc_a_req(preq, iblk);
+	else if (test_bit(PLOOP_REQ_RELOC_S, &preq->state))
+		return ploop_entry_reloc_s_req(preq, iblk);
+	else
+		BUG();
+}
+
+static void fill_zero_bio(struct ploop_device *plo, struct bio * bio)
+{
+	int pages = block_vecs(plo);
+
+	for (; bio->bi_vcnt < pages; bio->bi_vcnt++) {
+		bio->bi_io_vec[bio->bi_vcnt].bv_page = ZERO_PAGE(0);
+		bio->bi_io_vec[bio->bi_vcnt].bv_offset = 0;
+		bio->bi_io_vec[bio->bi_vcnt].bv_len = PAGE_SIZE;
+	}
+	bio->bi_sector = 0;
+	bio->bi_size = (1 << (plo->cluster_log + 9));
+}
+
+/*
+ * Returns 0 if and only if RELOC_N preq was successfully processed,
+ * i.e. a write of zero-filled data was submitted to nullify
+ * preq->iblock in the top delta.
+ */
+static int
+ploop_entry_nullify_req(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	struct bio_list sbl;
+
+	if (!preq->aux_bio) {
+		preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+		if (!preq->aux_bio)
+			return -ENOMEM;
+		fill_zero_bio(plo, preq->aux_bio);
+	}
+
+	sbl.head = sbl.tail = preq->aux_bio;
+	preq->eng_state = PLOOP_E_RELOC_NULLIFY;
+	list_del_init(&preq->list);
+
+	/*
+	 * One might expect that the format driver syncs nullified
+	 * blocks by fsync-ing the image before the header update.
+	 * But we write this data directly into the underlying device,
+	 * bypassing EXT4 by using the extent map tree (see
+	 * dio_submit()), so fsync of the EXT4 image does not help us.
+	 * We have to force the sync of nullified blocks ourselves.
+	 */
+	if (top_delta->io.ops->issue_flush) {
+		preq->eng_io = &top_delta->io;
+		set_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state);
+	}
+
+	top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+				  &sbl, preq->iblock, 1<<plo->cluster_log);
+	return 0;
+}
+
+static int discard_get_index(struct ploop_request *preq)
+{
+	struct ploop_device *plo       = preq->plo;
+	struct ploop_delta  *top_delta = ploop_top_delta(plo);
+	int	 level;
+	int	 err;
+
+	preq->iblock = 0;
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err)
+		return err;
+
+	level = map_get_index(preq, preq->req_cluster, &preq->iblock);
+	if (level != top_delta->level)
+		preq->iblock = 0;
+
+	if (preq->map) {
+		spin_lock_irq(&plo->lock);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+	}
+
+	return 0;
+}
+
+static int ploop_entry_discard_req(struct ploop_request *preq)
+{
+	int err = 0;
+	struct ploop_device * plo = preq->plo;
+	unsigned int len = 0;
+	cluster_t last_clu;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state)) {
+		err = -EOPNOTSUPP;
+		goto err;
+	}
+
+	BUG_ON(plo->maintenance_type != PLOOP_MNTN_DISCARD);
+
+	last_clu = (preq->req_sector + preq->req_size) >> plo->cluster_log;
+
+	for (; preq->req_cluster < last_clu; preq->req_cluster++) {
+		len = preq->req_cluster - preq->dst_cluster;
+
+		err = discard_get_index(preq);
+		if (err) {
+			if (err == 1)
+				return 0;
+			goto err;
+		}
+
+		if (preq->dst_iblock &&
+		    (!preq->iblock || preq->dst_iblock + len != preq->iblock)) {
+			err = ploop_fb_add_free_extent(plo->fbd,
+							preq->dst_cluster,
+							preq->dst_iblock, len);
+			preq->dst_iblock = 0;
+			if (err)
+				goto err;
+		}
+
+		if (!preq->dst_iblock && preq->iblock) {
+			preq->dst_cluster = preq->req_cluster;
+			preq->dst_iblock = preq->iblock;
+		}
+	}
+
+	if (preq->dst_iblock) {
+		len = preq->req_cluster - preq->dst_cluster;
+		err = ploop_fb_add_free_extent(plo->fbd, preq->dst_cluster,
+						preq->dst_iblock, len);
+	}
+
+err:
+	preq->error = err;
+	preq->eng_state = PLOOP_E_COMPLETE;
+	ploop_complete_request(preq);
+
+	return 0;
+}
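+
+/* Illustrative (comment only): the loop above coalesces physically
+ * contiguous clusters into a single free extent.  Suppose clusters
+ * 10..13 map to iblocks 100, 101, 102, 205:
+ *
+ *	clu 10: start run (dst_cluster = 10, dst_iblock = 100)
+ *	clu 11: 100 + 1 == 101, run continues
+ *	clu 12: 100 + 2 == 102, run continues
+ *	clu 13: 100 + 3 != 205 -> add extent (10, 100, len = 3),
+ *		start new run (dst_cluster = 13, dst_iblock = 205)
+ *
+ * The trailing run is flushed after the loop.
+ */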
+
+/* Main preq state machine */
+
+static inline bool preq_is_special(struct ploop_request * preq)
+{
+	unsigned long state = READ_ONCE(preq->state);
+
+	return state & (PLOOP_REQ_MERGE_FL |
+			PLOOP_REQ_RELOC_A_FL |
+			PLOOP_REQ_RELOC_S_FL |
+			PLOOP_REQ_RELOC_N_FL |
+			PLOOP_REQ_DISCARD_FL |
+			PLOOP_REQ_ZERO_FL);
+}
+
+void ploop_add_req_to_fsync_queue(struct ploop_request * preq)
+{
+	struct ploop_device * plo       = preq->plo;
+	struct ploop_delta  * top_delta = ploop_top_delta(plo);
+	struct ploop_io     * top_io    = &top_delta->io;
+
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &top_io->fsync_queue);
+	top_io->fsync_qlen++;
+	if (waitqueue_active(&top_io->fsync_waitq))
+		wake_up_interruptible(&top_io->fsync_waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void
+ploop_entry_request(struct ploop_request * preq)
+{
+	struct ploop_device * plo       = preq->plo;
+	struct ploop_delta  * top_delta = ploop_top_delta(plo);
+	struct ploop_io     * top_io    = &top_delta->io;
+	struct ploop_delta  * delta;
+	int level;
+	int err;
+	iblock_t iblk;
+
+	if (!preq_is_special(preq)) {
+		/* Control request (no bios attached): quiesce/relax handshake */
+		if (unlikely(preq->bl.head == NULL)) {
+			complete(plo->quiesce_comp);
+			wait_for_completion(&plo->relax_comp);
+			ploop_complete_request(preq);
+			complete(&plo->relaxed_comp);
+			return;
+		}
+
+		/* Need to fsync before we start handling FLUSH */
+		if ((preq->req_rw & REQ_FLUSH) &&
+		    test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state) &&
+		    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
+			ploop_add_req_to_fsync_queue(preq);
+			return;
+		}
+
+		/* Empty flush or unknown zero-size request */
+		if (preq->req_size == 0) {
+			if (preq->req_rw & REQ_FLUSH &&
+			    !test_bit(PLOOP_REQ_FSYNC_DONE, &preq->state)) {
+				preq->eng_state = PLOOP_E_COMPLETE;
+				if (top_io->ops->issue_flush) {
+					top_io->ops->issue_flush(top_io, preq);
+					return;
+				}
+			}
+
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return;
+		}
+	}
+
+	if (unlikely(test_bit(PLOOP_REQ_SYNC, &preq->state) &&
+		     !(preq->req_rw & REQ_SYNC)))
+		preq->req_rw |= REQ_SYNC;
+
+restart:
+	if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+		err = ploop_entry_discard_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (test_bit(PLOOP_REQ_ZERO, &preq->state)) {
+		err = ploop_entry_zero_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+		   test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+		err = ploop_entry_reloc_req(preq, &iblk);
+		if (err)
+			goto error;
+		if (iblk)
+			ploop_reloc_sched_read(preq, iblk);
+		return;
+	} else if (test_bit(PLOOP_REQ_RELOC_N, &preq->state)) {
+		err = ploop_entry_nullify_req(preq);
+		if (err)
+			goto error;
+		return;
+	} else if (preq->req_cluster == ~0U) {
+		BUG_ON(!test_bit(PLOOP_REQ_MERGE, &preq->state));
+		BUG_ON(preq->trans_map);
+		BUG_ON(preq->map);
+
+		preq->req_cluster = plo->merge_ptr;
+		plo->merge_ptr++;
+		if (preq->req_cluster >= plo->trans_map->max_index) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			return;
+		}
+	}
+
+	if (check_lockout(preq)) {
+		__TRACE("l %p %u\n", preq, preq->req_cluster);
+		return;
+	}
+
+	/* push_backup special processing */
+	if (!test_bit(PLOOP_REQ_PB_LOCKOUT, &preq->state) &&
+	    (preq->req_rw & REQ_WRITE) && preq->req_size &&
+	    ploop_pb_check_bit(plo->pbd, preq->req_cluster)) {
+		if (ploop_pb_preq_add_pending(plo->pbd, preq)) {
+			/* already reported by userspace push_backup */
+			ploop_pb_clear_bit(plo->pbd, preq->req_cluster);
+		} else {
+			/* no locking needed: only the ploop thread accesses this */
+			ploop_add_pb_lockout(preq);
+			ploop_set_blockable(plo, preq);
+			/*
+			 * preq IN: preq is in ppb_pending tree waiting for
+			 * out-of-band push_backup processing by userspace ...
+			 */
+			return;
+		}
+	} else if (test_bit(PLOOP_REQ_PB_LOCKOUT, &preq->state) &&
+		   test_and_clear_bit(PLOOP_REQ_PUSH_BACKUP, &preq->ppb_state)) {
+		/*
+		 * preq OUT: out-of-band push_backup processing by
+		 * userspace done; preq was re-scheduled
+		 */
+		ploop_pb_clear_bit(plo->pbd, preq->req_cluster);
+		ploop_test_and_clear_blockable(plo, preq);
+
+		del_pb_lockout(preq);
+		spin_lock_irq(&plo->lock);
+		if (!list_empty(&preq->delay_list))
+			list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+		spin_unlock_irq(&plo->lock);
+	}
+
+	if (plo->trans_map) {
+		err = ploop_find_trans_map(plo->trans_map, preq);
+		if (err) {
+			if (err == 1) {
+				__TRACE("tm %p %u\n", preq, preq->req_cluster);
+				return;
+			}
+			goto error;
+		}
+
+		if (preq->trans_map &&
+		    !(preq->req_rw & REQ_WRITE) &&
+		    trans_map_get_index(preq, preq->req_cluster, &iblk) == 0) {
+			delta = map_top_delta(plo->trans_map);
+			preq->iblock = iblk;
+			preq->eng_state = PLOOP_E_COMPLETE;
+			plo->st.bio_out++;
+			__TRACE("tS %p %u\n", preq, preq->req_cluster);
+			delta->io.ops->submit(&delta->io, preq, preq->req_rw, &preq->bl,
+					      iblk, preq->req_size);
+			return;
+		}
+
+		if (test_bit(PLOOP_REQ_MERGE, &preq->state)) {
+			if (prepare_merge_req(preq))
+				goto restart;
+		}
+	}
+
+	err = ploop_find_map(&plo->map, preq);
+	if (err) {
+		if (err == 1) {
+			__TRACE("m %p %u\n", preq, preq->req_cluster);
+			return;
+		}
+		goto error;
+	}
+
+	if (preq->trans_map &&
+	    trans_map_get_index(preq, preq->req_cluster, &iblk) == 0) {
+		struct bio_list sbl;
+
+		/* Read requests were served earlier. */
+		BUG_ON(!(preq->req_rw & REQ_WRITE));
+
+		spin_lock_irq(&plo->lock);
+		ploop_add_lockout(preq, 0);
+		spin_unlock_irq(&plo->lock);
+
+		if (whole_block(plo, preq)) {
+			set_bit(PLOOP_REQ_TRANS, &preq->state);
+			plo->st.bio_trans_whole++;
+			goto delta_io;
+		}
+
+		plo->st.bio_cows++;
+
+		if (!preq->aux_bio)
+			preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+		if (!preq->aux_bio ||
+		    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+			return;
+		}
+
+		delta = map_top_delta(plo->trans_map);
+
+		__TRACE("tDR %p %u\n", preq, preq->req_cluster);
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_TRANS_DELTA_READ;
+		sbl.head = sbl.tail = preq->aux_bio;
+		delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+				      &sbl, iblk, 1<<plo->cluster_log);
+		plo->st.bio_trans_copy++;
+		return;
+	}
+
+delta_io:
+	BUG_ON(test_bit(PLOOP_REQ_MERGE, &preq->state));
+
+	delta = top_delta;
+
+	level = map_get_index(preq, preq->req_cluster, &iblk);
+	if (level < 0) {
+		delta = NULL;
+	} else if (level != top_delta->level) {
+		delta = find_delta(plo, level);
+		if (!delta) {
+			err = -EIO;
+			goto error;
+		}
+	}
+
+	if (!(preq->req_rw & REQ_WRITE)) {
+		/* Read direction. If we found an existing block in some
+		 * delta, we direct the bio there. If we did not, this
+		 * location was never written before. We return zero fill
+		 * and, probably, should log an alert.
+		 */
+		if (!delta) {
+			struct bio * bio;
+
+			if (map_index_fault(preq) == 0) {
+				__TRACE("i %p %u\n", preq, preq->req_cluster);
+				return;
+			}
+
+			__TRACE("X %p %u\n", preq, preq->req_cluster);
+			bio_list_for_each(bio, &preq->bl) {
+				zero_fill_bio(bio);
+			}
+			ploop_complete_request(preq);
+			plo->st.bio_rzero++;
+			return;
+		}
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_COMPLETE;
+		plo->st.bio_out++;
+		__TRACE("S %p %u\n", preq, preq->req_cluster);
+		delta->io.ops->submit(&delta->io, preq, preq->req_rw, &preq->bl,
+				      iblk, preq->req_size);
+	} else {
+		if (delta) {
+			if (delta == top_delta) {
+				/* Block exists in top delta. Good. */
+				if (plo->maintenance_type == PLOOP_MNTN_GROW ||
+				    plo->maintenance_type == PLOOP_MNTN_RELOC) {
+					spin_lock_irq(&plo->lock);
+					ploop_add_lockout(preq, 0);
+					spin_unlock_irq(&plo->lock);
+				}
+				preq->iblock = iblk;
+				preq->eng_state = PLOOP_E_COMPLETE;
+				__TRACE("T %p %u\n", preq, preq->req_cluster);
+				plo->st.bio_out++;
+				delta->io.ops->submit(&delta->io, preq, preq->req_rw,
+						      &preq->bl, iblk, preq->req_size);
+			} else if (whole_block(plo, preq)) {
+				__TRACE("O1 %p %u\n", preq, preq->req_cluster);
+				/* Block does not exist in the top delta,
+				 * but it exists in some other delta.
+				 * BUT! Plain luck: we have a full block
+				 * and can skip the read stage.
+				 */
+				plo->st.bio_whole_cows++;
+
+				/* About lockout. Reads could proceed
+				 * without lockout.
+				 */
+				spin_lock_irq(&plo->lock);
+				ploop_add_lockout(preq, 0);
+				spin_unlock_irq(&plo->lock);
+
+				if (likely(ploop_reuse_free_block(preq)))
+					top_delta->ops->allocate(top_delta,
+								 preq, &preq->bl,
+								 preq->req_size);
+			} else {
+				struct bio_list sbl;
+
+				plo->st.bio_cows++;
+
+				if (!preq->aux_bio)
+					preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+				if (!preq->aux_bio ||
+				    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+					PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+					return;
+				}
+				spin_lock_irq(&plo->lock);
+				ploop_add_lockout(preq, 0);
+				spin_unlock_irq(&plo->lock);
+
+				__TRACE("DR %p %u\n", preq, preq->req_cluster);
+				preq->iblock = iblk;
+				preq->eng_state = PLOOP_E_DELTA_READ;
+				sbl.head = sbl.tail = preq->aux_bio;
+				delta->io.ops->submit(&delta->io, preq, READ_SYNC,
+						      &sbl, iblk, 1<<plo->cluster_log);
+			}
+		} else {
+			if (!whole_block(plo, preq) && map_index_fault(preq) == 0) {
+					__TRACE("f %p %u\n", preq, preq->req_cluster);
+					return;
+			}
+
+			if (plo->tune.check_zeros && check_zeros(&preq->bl)) {
+				if (map_index_fault(preq) == 0) {
+					__TRACE("f %p %u\n", preq, preq->req_cluster);
+					return;
+				}
+				preq->eng_state = PLOOP_E_COMPLETE;
+				/* Not ploop_complete_request():
+				 * this can be a TRANS request.
+				 */
+				ploop_complete_io_state(preq);
+				if (whole_block(plo, preq))
+					plo->st.bio_alloc_whole++;
+				plo->st.bio_wzero++;
+				return;
+			}
+			if (whole_block(plo, preq))
+				plo->st.bio_alloc_whole++;
+
+			spin_lock_irq(&plo->lock);
+			ploop_add_lockout(preq, 0);
+			spin_unlock_irq(&plo->lock);
+
+			/* Block does not exist. */
+			if (likely(ploop_reuse_free_block(preq))) {
+				__TRACE("K %p %u\n", preq, preq->req_cluster);
+				plo->st.bio_alloc++;
+				top_delta->ops->allocate(top_delta, preq,
+							 &preq->bl,
+							 preq->req_size);
+			}
+		}
+	}
+	return;
+
+error:
+	PLOOP_REQ_FAIL_IMMEDIATE(preq, err);
+}
+
+static void ploop_req_state_process(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct ploop_delta * top_delta;
+	struct io_context * saved_ioc = NULL;
+	int release_ioc = 0;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter * uninitialized_var(saved_ub);
+#endif
+
+	trace_req_state_process(preq);
+
+	if (preq->ioc) {
+		saved_ioc = current->io_context;
+		current->io_context = preq->ioc;
+#ifdef CONFIG_BEANCOUNTERS
+		saved_ub = set_exec_ub(preq->ioc->ioc_ub);
+#endif
+		atomic_long_inc(&preq->ioc->refcount);
+		release_ioc = 1;
+	}
+
+	if (preq->eng_state != PLOOP_E_COMPLETE &&
+	    test_bit(PLOOP_REQ_SYNC, &preq->state))
+		set_bit(PLOOP_S_SYNC, &plo->state);
+
+	if (test_bit(PLOOP_REQ_TRACK, &preq->state)) {
+		sector_t sec;
+		clear_bit(PLOOP_REQ_TRACK, &preq->state);
+
+		sec = (sector_t)preq->track_cluster << plo->cluster_log;
+		if (sec < plo->track_end)
+			ploop_tracker_notify(plo, sec);
+	}
+
+	/* trick: preq->prealloc_size is actually the new position of EOF */
+	if (unlikely(preq->prealloc_size && !preq->error)) {
+		struct ploop_io *io = &ploop_top_delta(plo)->io;
+		int log = preq->plo->cluster_log + 9;
+
+		BUG_ON(preq != io->prealloc_preq);
+		io->prealloc_preq = NULL;
+
+		io->prealloced_size = preq->prealloc_size -
+				      ((loff_t)io->alloc_head << log);
+		preq->prealloc_size = 0; /* only for sanity */
+	}
+
+	if (test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state)) {
+		preq->eng_io->ops->post_submit(preq->eng_io, preq);
+		clear_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+		preq->eng_io = NULL;
+	}
+
+	if (test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state)) {
+		preq->eng_io->ops->issue_flush(preq->eng_io, preq);
+		clear_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state);
+		preq->eng_io = NULL;
+		goto out;
+	}
+
+restart:
+	BUG_ON(test_bit(PLOOP_REQ_POST_SUBMIT, &preq->state));
+	__TRACE("ST %p %u %lu\n", preq, preq->req_cluster, preq->eng_state);
+	switch (preq->eng_state) {
+	case PLOOP_E_ENTRY:
+		/* First entry */
+		if (preq->error ||
+		    ((preq->req_rw & REQ_WRITE) &&
+		     test_bit(PLOOP_S_ABORT, &plo->state))) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		ploop_entry_request(preq);
+		break;
+
+	case PLOOP_E_RELOC_COMPLETE:
+		BUG_ON (!test_bit(PLOOP_REQ_RELOC_S, &preq->state));
+		if (!preq->error) {
+			ploop_fb_relocate_req_completed(plo->fbd);
+			ploop_fb_del_reloc_req(plo->fbd, preq);
+			spin_lock_irq(&plo->lock);
+			if (!list_empty(&preq->delay_list)) {
+				struct ploop_request *pr;
+				pr = list_entry(preq->delay_list.next,
+						struct ploop_request, list);
+				list_splice_init(&preq->delay_list,
+						 plo->ready_queue.prev);
+			}
+			spin_unlock_irq(&plo->lock);
+			preq->req_cluster = ~0U;
+			preq->src_iblock  = ~0U; /* redundant */
+			preq->dst_cluster = ~0U; /* redundant */
+			preq->dst_iblock  = ~0U; /* redundant */
+			preq->eng_state = PLOOP_E_ENTRY;
+			goto restart;
+		}
+		/* fall through to the PLOOP_E_COMPLETE case ... */
+	case PLOOP_E_COMPLETE:
+		if (unlikely(test_bit(PLOOP_REQ_RELOC_S, &preq->state) &&
+			     preq->error)) {
+			printk("RELOC_S completed with err %d"
+			       " (%u %u %u %u %u)\n",
+			       preq->error, preq->req_cluster, preq->iblock,
+			       preq->src_iblock, preq->dst_cluster,
+			       preq->dst_iblock);
+			ploop_fb_del_reloc_req(plo->fbd, preq);
+		}
+
+		if (!preq->error &&
+		    test_bit(PLOOP_REQ_TRANS, &preq->state)) {
+			u32 iblk;
+
+			__clear_bit(PLOOP_REQ_TRANS, &preq->state);
+			BUG_ON(!preq->trans_map);
+			if (!trans_map_get_index(preq, preq->req_cluster, &iblk)) {
+				spin_lock_irq(&plo->lock);
+				if (preq->map)
+					map_release(preq->map);
+				preq->map = preq->trans_map;
+				preq->trans_map = NULL;
+				spin_unlock_irq(&plo->lock);
+				preq->iblock = 0;
+				top_delta = map_top_delta(plo->trans_map);
+				top_delta->ops->allocate_complete(top_delta, preq);
+				plo->st.bio_trans_index++;
+				break;
+			}
+		}
+
+		ploop_complete_request(preq);
+		/* All done. */
+		break;
+
+	case PLOOP_E_DELTA_READ:
+	{
+		struct bio * b;
+
+		/* preq was scheduled for a read from a delta; aux_bio now
+		 * holds a full block of data. Merge the incoming write
+		 * data into it and proceed with the write.
+		 */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		bio_list_for_each(b, &preq->bl) {
+			bio_bcopy(preq->aux_bio, b, plo);
+		}
+
+		/* Fall through ... */
+	}
+	case PLOOP_E_DELTA_COPIED:
+	{
+		if (likely(ploop_reuse_free_block(preq))) {
+			struct bio_list sbl;
+			sbl.head = sbl.tail = preq->aux_bio;
+			top_delta = ploop_top_delta(plo);
+			top_delta->ops->allocate(top_delta, preq,
+						 &sbl, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_ZERO_INDEX:
+	{
+		preq->eng_state = PLOOP_E_DATA_WBI;
+		top_delta = ploop_top_delta(plo);
+		plo->st.bio_out++;
+		if (whole_block(plo, preq)) {
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &preq->bl, preq->iblock,
+						  preq->req_size);
+		} else {
+			struct bio_list sbl;
+			struct bio * b;
+			int i;
+
+			if (!preq->aux_bio)
+				preq->aux_bio = bio_alloc(GFP_NOFS, block_vecs(plo));
+
+			if (!preq->aux_bio ||
+			    fill_bio(plo, preq->aux_bio, preq->req_cluster)) {
+				PLOOP_REQ_FAIL_IMMEDIATE(preq, -ENOMEM);
+				break;
+			}
+
+			for (i = 0; i < preq->aux_bio->bi_vcnt; i++)
+				memset(page_address(preq->aux_bio->bi_io_vec[i].bv_page),
+				       0, PAGE_SIZE);
+
+			bio_list_for_each(b, &preq->bl) {
+				bio_bcopy(preq->aux_bio, b, plo);
+			}
+
+			sbl.head = sbl.tail = preq->aux_bio;
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &sbl, preq->iblock, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_DELTA_ZERO_INDEX:
+	{
+		struct bio_list sbl;
+
+		BUG_ON (preq->aux_bio == NULL);
+
+		preq->eng_state = PLOOP_E_DATA_WBI;
+		sbl.head = sbl.tail = preq->aux_bio;
+		top_delta = ploop_top_delta(plo);
+		plo->st.bio_out++;
+		top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+					  &sbl, preq->iblock,
+					  1<<plo->cluster_log);
+		break;
+	}
+	case PLOOP_E_RELOC_DATA_READ:
+	{
+		struct bio_list sbl;
+
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		BUG_ON (!preq->aux_bio);
+
+		top_delta = ploop_top_delta(plo);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		/* Relocated data writes require a sync before the BAT
+		 * update; this will happen inside index_update. */
+
+		if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+			preq->eng_state = PLOOP_E_DATA_WBI;
+			plo->st.bio_out++;
+			preq->iblock = preq->dst_iblock;
+			top_delta->io.ops->submit(&top_delta->io, preq,
+						  preq->req_rw, &sbl,
+						  preq->iblock,
+						  1<<plo->cluster_log);
+		} else {
+			top_delta->ops->allocate(top_delta, preq, &sbl,
+						 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_RELOC_NULLIFY:
+	{
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		BUG_ON (!preq->aux_bio);
+
+		if (++plo->grow_relocated > plo->grow_end - plo->grow_start) {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			ploop_complete_request(preq);
+			break;
+		}
+
+		del_lockout(preq);
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->iblock++;
+		goto restart;
+	}
+	case PLOOP_E_TRANS_DELTA_READ:
+	{
+		struct bio * b;
+		struct bio_list sbl;
+		u32 iblk;
+
+		/* preq was scheduled for a read from a delta; aux_bio now
+		 * holds a full block of data. Merge the incoming write
+		 * data into it and proceed with the write.
+		 */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		bio_list_for_each(b, &preq->bl) {
+			bio_bcopy(preq->aux_bio, b, plo);
+		}
+
+		top_delta = ploop_top_delta(plo);
+		sbl.head = sbl.tail = preq->aux_bio;
+
+		__set_bit(PLOOP_REQ_TRANS, &preq->state);
+		if (map_get_index(preq, preq->req_cluster, &iblk) != top_delta->level) {
+			/*
+			 * we can be here only if merge is in progress and
+			 * merge can't happen concurrently with ballooning
+			 */
+			top_delta->ops->allocate(top_delta, preq, &sbl, 1<<plo->cluster_log);
+			plo->st.bio_trans_alloc++;
+		} else {
+			preq->eng_state = PLOOP_E_COMPLETE;
+			preq->iblock = iblk;
+			top_delta->io.ops->submit(&top_delta->io, preq, preq->req_rw,
+						  &sbl, iblk, 1<<plo->cluster_log);
+		}
+		break;
+	}
+	case PLOOP_E_INDEX_READ:
+	case PLOOP_E_TRANS_INDEX_READ:
+		/* It was an index read. */
+		map_read_complete(preq);
+		preq->eng_state = PLOOP_E_ENTRY;
+		goto restart;
+
+	case PLOOP_E_DATA_WBI:
+		/* Data written. Index must be updated. */
+		if (preq->error ||
+		    test_bit(PLOOP_S_ABORT, &plo->state)) {
+			PLOOP_REQ_FAIL_IMMEDIATE(preq, preq->error ? : -EIO);
+			break;
+		}
+
+		top_delta = ploop_top_delta(plo);
+		top_delta->ops->allocate_complete(top_delta, preq);
+		break;
+
+	case PLOOP_E_INDEX_WB:
+		/* Index write completed. */
+		ploop_index_wb_complete(preq);
+		break;
+
+	case PLOOP_E_FSYNC_PENDED:
+		/* fsync done */
+		ploop_index_wb_proceed(preq);
+		break;
+
+	default:
+		BUG();
+	}
+out:
+	if (release_ioc) {
+		struct io_context * ioc = current->io_context;
+		current->io_context = saved_ioc;
+#ifdef CONFIG_BEANCOUNTERS
+		set_exec_ub(saved_ub);
+#endif
+		put_io_context(ioc);
+	}
+}
+
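+/* Sleep until there is work to do. The block plug is finished before
+ * schedule() and restarted afterwards, so bios queued by this thread
+ * reach the lower layers while we sleep.
+ */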
+static void ploop_wait(struct ploop_device * plo, int once, struct blk_plug *plug)
+{
+	DEFINE_WAIT(_wait);
+	for (;;) {
+		prepare_to_wait(&plo->waitq, &_wait, TASK_INTERRUPTIBLE);
+
+		/* This is obvious. */
+		if (!list_empty(&plo->ready_queue))
+			break;
+
+		/* This is not. If we have something in entry queue... */
+		if (!list_empty(&plo->entry_queue)) {
+			/* And the entry queue is not suspended due to a
+			 * barrier, or all active requests have completed,
+			 * so that we can start/finish barrier processing.
+			 */
+			if (!once &&
+			    (!test_bit(PLOOP_S_ATTENTION, &plo->state) ||
+			     !plo->active_reqs))
+				break;
+		} else if (plo->bio_head ||
+			   (!bio_list_empty(&plo->bio_discard_list) &&
+			    !ploop_discard_is_inprogress(plo->fbd))) {
+			/* ready_queue and entry_queue are empty, but the
+			 * bio list is not. Obviously, we'd like to process
+			 * the bio list instead of sleeping */
+			if (!list_empty(&plo->free_list) &&
+			    (!test_bit(PLOOP_S_ATTENTION, &plo->state) ||
+			     !plo->active_reqs))
+				break;
+		}
+
+		if (kthread_should_stop() && !plo->active_reqs)
+			break;
+
+		set_bit(PLOOP_S_WAIT_PROCESS, &plo->state);
+		if (kthread_should_stop())
+			set_bit(PLOOP_S_EXITING, &plo->state);
+		once = 0;
+		spin_unlock_irq(&plo->lock);
+		blk_finish_plug(plug);
+		schedule();
+		blk_start_plug(plug);
+		spin_lock_irq(&plo->lock);
+		clear_bit(PLOOP_S_WAIT_PROCESS, &plo->state);
+	}
+	finish_wait(&plo->waitq, &_wait);
+}
+
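+/* Handle a request that failed with -ENOSPC: park it on freeze_waitq
+ * until the queue is woken (a freeze timer is armed as a safety net),
+ * release everything the request pinned (aux_bio pages, lockout, map
+ * references) and reset it to PLOOP_E_ENTRY for a full retry.
+ */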
+static void ploop_handle_enospc_req(struct ploop_request *preq)
+{
+	struct ploop_device * plo = preq->plo;
+	DEFINE_WAIT(_wait);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state))
+		return;
+
+	mod_timer(&plo->freeze_timer, jiffies + HZ * 10);
+
+	prepare_to_wait(&plo->freeze_waitq, &_wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&plo->lock);
+	schedule();
+	spin_lock_irq(&plo->lock);
+
+	finish_wait(&plo->freeze_waitq, &_wait);
+
+	spin_unlock_irq(&plo->lock);
+	if (preq->aux_bio) {
+		int i;
+		struct bio * bio = preq->aux_bio;
+
+		for (i = 0; i < bio->bi_vcnt; i++) {
+			struct page *page = bio->bi_io_vec[i].bv_page;
+			if (page != ZERO_PAGE(0))
+				put_page(page);
+		}
+
+		bio_put(bio);
+
+		preq->aux_bio = NULL;
+	}
+	spin_lock_irq(&plo->lock);
+
+	del_lockout(preq);
+
+	if (!list_empty(&preq->delay_list))
+		list_splice_init(&preq->delay_list, plo->ready_queue.prev);
+
+	if (preq->map) {
+		map_release(preq->map);
+		preq->map = NULL;
+	}
+	if (preq->trans_map) {
+		map_release(preq->trans_map);
+		preq->trans_map = NULL;
+	}
+
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = 0;
+}
+
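+/* Convert bios blocked for push-backup into preqs, but only while the
+ * free request list keeps enough slack (more than half of free_qmax
+ * free, or few enough blockable requests in flight).
+ */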
+static void
+process_pending_bios(struct ploop_device * plo, struct list_head *drop_list)
+{
+	while (!ploop_pb_bio_list_empty(plo->pbd) &&
+	       !list_empty(&plo->free_list) &&
+	       (plo->free_qlen > plo->free_qmax / 2 ||
+		plo->blockable_reqs <= plo->free_qmax / 4)) {
+		struct bio *bio = ploop_pb_bio_get(plo->pbd);
+
+		ploop_bio_queue(plo, bio, drop_list, 1);
+		plo->blocked_bios--;
+	}
+}
+
+/* Main process. Processes the queues in proper order, handles
+ * pre-barrier flushes, and suspends the entry queue while a barrier
+ * is being processed.
+ */
+static int ploop_thread(void * data)
+{
+	int once = 0;
+	struct ploop_device * plo = data;
+	struct blk_plug plug;
+	LIST_HEAD(drop_list);
+
+	set_user_nice(current, -20);
+
+	blk_start_plug(&plug);
+	spin_lock_irq(&plo->lock);
+	for (;;) {
+		/* Convert bios to preqs early (at least before processing
+		 * entry queue) to increase chances of bio merge
+		 */
+	again:
+		BUG_ON (!list_empty(&drop_list));
+
+		process_pending_bios(plo, &drop_list);
+		process_bio_queue_main(plo, &drop_list);
+		process_discard_bio_queue(plo, &drop_list);
+
+		if (!list_empty(&drop_list)) {
+			spin_unlock_irq(&plo->lock);
+			ploop_preq_drop(plo, &drop_list, 1);
+			goto again;
+		}
+
+		if (!list_empty(&plo->ready_queue)) {
+			struct ploop_request * preq;
+			preq = ploop_get_request(plo, &plo->ready_queue);
+			if (preq->error == -ENOSPC)
+				ploop_handle_enospc_req(preq);
+			spin_unlock_irq(&plo->lock);
+
+			ploop_req_state_process(preq);
+
+			spin_lock_irq(&plo->lock);
+			continue;
+		}
+
+		/* Now ready_queue is empty */
+
+		if (plo->active_reqs == 0)
+			clear_bit(PLOOP_S_ATTENTION, &plo->state);
+
+		if (!list_empty(&plo->entry_queue) &&
+		    !test_bit(PLOOP_S_ATTENTION, &plo->state)) {
+			struct ploop_request * preq;
+
+			preq = ploop_get_request(plo, &plo->entry_queue);
+
+			if (test_bit(PLOOP_REQ_BARRIER, &preq->state)) {
+				set_bit(PLOOP_S_ATTENTION, &plo->state);
+				if (plo->active_reqs) {
+					list_add(&preq->list, &plo->entry_queue);
+					continue;
+				}
+				plo->barrier_reqs--;
+			} else {
+				if (!plo->read_sync_reqs &&
+				    plo->active_reqs > plo->tune.max_active_requests &&
+				    plo->active_reqs > plo->entry_qlen &&
+				    time_before(jiffies, preq->tstamp + plo->tune.batch_entry_delay) &&
+				    !kthread_should_stop()) {
+					list_add(&preq->list, &plo->entry_queue);
+					once = 1;
+					mod_timer(&plo->mitigation_timer, preq->tstamp + plo->tune.batch_entry_delay);
+					goto wait_more;
+				}
+			}
+
+			plo->active_reqs++;
+			ploop_entry_qlen_dec(preq);
+
+			if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+				BUG_ON(plo->maintenance_type != PLOOP_MNTN_DISCARD);
+				atomic_inc(&plo->maintenance_cnt);
+			}
+
+			if (test_bit(PLOOP_REQ_SORTED, &preq->state)) {
+				rb_erase(&preq->lockout_link, &plo->entry_tree[preq->req_rw & WRITE]);
+				__clear_bit(PLOOP_REQ_SORTED, &preq->state);
+			}
+			preq->eng_state = PLOOP_E_ENTRY;
+			spin_unlock_irq(&plo->lock);
+
+			ploop_req_state_process(preq);
+
+			spin_lock_irq(&plo->lock);
+			continue;
+		}
+
+		/* Termination condition: stop requested,
+		 * no requests are in process or in entry queue
+		 */
+		if (kthread_should_stop() && !plo->active_reqs &&
+		    list_empty(&plo->entry_queue) && !plo->bio_head &&
+		    bio_list_empty(&plo->bio_discard_list) &&
+		    ploop_pb_bio_list_empty(plo->pbd))
+			break;
+
+wait_more:
+		ploop_wait(plo, once, &plug);
+		once = 0;
+	}
+	spin_unlock_irq(&plo->lock);
+	blk_finish_plug(&plug);
+
+	if (current->io_context)
+		exit_io_context(current);
+
+	return 0;
+}
+
+
+/* block device operations */
+static int ploop_open(struct block_device *bdev, fmode_t fmode)
+{
+	struct ploop_device * plo = bdev->bd_disk->private_data;
+
+	mutex_lock(&plo->ctl_mutex);
+
+	BUG_ON (plo->bdev && plo->bdev != bdev);
+	if (!plo->bdev)
+		plo->bdev = bdev;
+
+	atomic_inc(&plo->open_count);
+	mutex_unlock(&plo->ctl_mutex);
+
+	check_disk_change(bdev);
+
+	return 0;
+}
+
+static void ploop_release(struct gendisk *disk, fmode_t fmode)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	mutex_lock(&plo->ctl_mutex);
+	if (atomic_dec_and_test(&plo->open_count)) {
+		ploop_pb_destroy(plo, NULL);
+		ploop_tracker_stop(plo, 1);
+		plo->bdev = NULL;
+	}
+	mutex_unlock(&plo->ctl_mutex);
+}
+
+static struct ploop_delta *
+init_delta(struct ploop_device * plo, struct ploop_ctl * ctl, int level)
+{
+	struct ploop_delta * delta;
+	struct ploop_delta_ops * ops;
+	int err;
+
+	ops = ploop_format_get(ctl->pctl_format);
+	if (ops == NULL)
+		return ERR_PTR(-EINVAL);
+
+	if (level < 0 && !list_empty(&plo->map.delta_list)) {
+		struct ploop_delta * top_delta = ploop_top_delta(plo);
+		err = -EINVAL;
+		if (top_delta->level >= 127)
+			goto out_err;
+		level = top_delta->level + 1;
+		if (ctl->pctl_cluster_log != plo->cluster_log)
+			goto out_err;
+		if (!(ops->capability & PLOOP_FMT_CAP_DELTA))
+			goto out_err;
+	} else if (level >= 0) {
+		struct ploop_delta * delta = find_delta(plo, level);
+		err = -EINVAL;
+		if (delta == NULL)
+			goto out_err;
+		if (ctl->pctl_cluster_log != plo->cluster_log)
+			goto out_err;
+		if (level && !(ops->capability & PLOOP_FMT_CAP_DELTA))
+			goto out_err;
+	}
+
+	if (level < 0)
+		level = 0;
+
+	err = -ENOMEM;
+	delta = kzalloc(sizeof(struct ploop_delta), GFP_KERNEL);
+	if (delta == NULL)
+		goto out_err;
+
+	__module_get(THIS_MODULE);
+
+	delta->level = level;
+	delta->cluster_log = ctl->pctl_cluster_log;
+	delta->plo = plo;
+	delta->ops = ops;
+	delta->flags = ctl->pctl_flags & PLOOP_FMT_FLAGS;
+	delta->max_delta_size = ULLONG_MAX;
+
+	KOBJECT_INIT(&delta->kobj, &ploop_delta_ktype);
+	return delta;
+
+out_err:
+	ploop_format_put(ops);
+	return ERR_PTR(err);
+}
+
+
+static int ploop_set_max_delta_size(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta * top_delta = ploop_top_delta(plo);
+	u64 max_delta_size;
+
+	if (copy_from_user(&max_delta_size, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (top_delta == NULL)
+		return -EINVAL;
+
+	top_delta->max_delta_size = max_delta_size;
+
+	return 0;
+}
+
+static int ploop_add_delta(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	if ((ctl.pctl_flags & PLOOP_FLAG_COOKIE) && !plo->cookie[0] &&
+	    copy_from_user(plo->cookie, (void*)arg + sizeof(struct ploop_ctl) +
+			   sizeof(struct ploop_ctl_chunk),
+			   PLOOP_COOKIE_SIZE - 1))
+		return -EFAULT;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EBUSY;
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	delta = init_delta(plo, &ctl, -1);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	if (list_empty(&plo->map.delta_list))
+		plo->fmt_version = PLOOP_FMT_UNDEFINED;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	if (list_empty(&plo->map.delta_list)) {
+		plo->cluster_log = delta->cluster_log;
+	} else {
+		struct ploop_delta * top_delta = ploop_top_delta(plo);
+
+		err = -EINVAL;
+		if (!(top_delta->flags & PLOOP_FMT_RDONLY))
+			goto out_close;
+	}
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	if (err < 0) {
+		kobject_put(&plo->kobj);
+		goto out_close;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	list_add(&delta->list, &plo->map.delta_list);
+	mutex_unlock(&plo->sysfs_mutex);
+	set_bit(PLOOP_S_CHANGED, &plo->state);
+
+	return 0;
+
+out_close:
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+static int ploop_replace_delta(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta, * old_delta;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	old_delta = find_delta(plo, ctl.pctl_level);
+	if (old_delta == NULL)
+		return -ENOENT;
+
+	if ((old_delta->flags ^ ctl.pctl_flags) & PLOOP_FMT_RDONLY)
+		return -EINVAL;
+
+	delta = init_delta(plo, &ctl, ctl.pctl_level);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	kobject_del(&old_delta->kobj);
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	kobject_put(&plo->kobj);
+
+	if (err < 0) {
+		kobject_put(&plo->kobj);
+		goto out_close;
+	}
+
+	ploop_quiesce(plo);
+	ploop_map_destroy(&plo->map);
+	list_replace_init(&old_delta->list, &delta->list);
+	ploop_relax(plo);
+
+	old_delta->ops->stop(old_delta);
+	old_delta->ops->destroy(old_delta);
+	kobject_put(&old_delta->kobj);
+	return 0;
+
+out_close:
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
+
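+/* Quiesce the device: queue a sync barrier preq and wait until the
+ * engine thread has drained everything in flight. Must be paired with
+ * ploop_relax(), which lets the thread resume.
+ */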
+void ploop_quiesce(struct ploop_device * plo)
+{
+	struct completion qcomp;
+	struct ploop_request * preq;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	spin_lock_irq(&plo->lock);
+	preq = ploop_alloc_request(plo);
+	preq->bl.head = preq->bl.tail = NULL;
+	preq->req_size = 0;
+	preq->req_rw = 0;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_BARRIER);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+
+	init_completion(&qcomp);
+	init_completion(&plo->relax_comp);
+	init_completion(&plo->relaxed_comp);
+	plo->quiesce_comp = &qcomp;
+
+	ploop_entry_add(plo, preq);
+	plo->barrier_reqs++;
+
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+
+	wait_for_completion(&qcomp);
+	plo->quiesce_comp = NULL;
+}
+
+void ploop_relax(struct ploop_device * plo)
+{
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return;
+
+	complete(&plo->relax_comp);
+	wait_for_completion(&plo->relaxed_comp);
+}
+
+/* Search the disk for the first partition bdev with a mounted fs and freeze it */
+static struct super_block *find_and_freeze_bdev(struct ploop_device *plo,
+						struct block_device ** bdev_pp)
+{
+	struct super_block  * sb   = NULL;
+	struct block_device * bdev = NULL;
+	struct gendisk *disk = plo->disk;
+	int i;
+
+	bdev = ploop_get_dm_crypt_bdev(plo);
+	if (bdev) {
+		sb = freeze_bdev(bdev);
+		goto out;
+	}
+
+	for (i = 0; i <= (*bdev_pp)->bd_part_count; i++) {
+		bdev = bdget_disk(disk, i);
+		if (!bdev)
+			break;
+
+		sb = freeze_bdev(bdev);
+		if (sb)
+			break;
+
+		thaw_bdev(bdev, sb);
+		bdput(bdev);
+		bdev = NULL;
+	}
+
+out:
+	if (IS_ERR(sb))
+		bdput(bdev);
+	else
+		*bdev_pp = bdev;
+	return sb;
+}
+
+static int ploop_snapshot(struct ploop_device * plo, unsigned long arg,
+			  struct block_device * bdev)
+{
+	int err;
+	struct ploop_ctl ctl;
+	struct ploop_ctl_chunk chunk;
+	struct ploop_delta * delta, * top_delta;
+	struct ploop_snapdata snapdata;
+	struct super_block * sb;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return ploop_add_delta(plo, arg);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+	if (ctl.pctl_chunks != 1)
+		return -EINVAL;
+	if (copy_from_user(&chunk, (void*)arg + sizeof(struct ploop_ctl),
+			   sizeof(struct ploop_ctl_chunk)))
+		return -EFAULT;
+
+	delta = init_delta(plo, &ctl, -1);
+	if (IS_ERR(delta))
+		return PTR_ERR(delta);
+
+	err = delta->ops->compose(delta, 1, &chunk);
+	if (err)
+		goto out_destroy;
+
+	err = delta->ops->open(delta);
+	if (err)
+		goto out_destroy;
+
+	err = KOBJECT_ADD(&delta->kobj, kobject_get(&plo->kobj),
+			  "%d", delta->level);
+	if (err)
+		goto out_close;
+
+	top_delta = ploop_top_delta(plo);
+
+	err = top_delta->ops->prepare_snapshot(top_delta, &snapdata);
+	if (err)
+		goto out_close2;
+
+	/* _XXX_ only one mounted fs per ploop-device is supported */
+	sb = NULL;
+	if (ctl.pctl_flags & PLOOP_FLAG_FS_SYNC) {
+		/* freeze_bdev() may trigger ploop_bd_full() */
+		plo->maintenance_type = PLOOP_MNTN_SNAPSHOT;
+		mutex_unlock(&plo->ctl_mutex);
+		sb = find_and_freeze_bdev(plo, &bdev);
+		mutex_lock(&plo->ctl_mutex);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		if (IS_ERR(sb)) {
+			err = PTR_ERR(sb);
+			fput(snapdata.file);
+			goto out_close2;
+		}
+	}
+
+	ploop_quiesce(plo);
+	err = top_delta->ops->complete_snapshot(top_delta, &snapdata);
+	if (!err) {
+		mutex_lock(&plo->sysfs_mutex);
+		list_add(&delta->list, &plo->map.delta_list);
+		clear_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+		mutex_unlock(&plo->sysfs_mutex);
+	}
+	ploop_relax(plo);
+
+	if ((ctl.pctl_flags & PLOOP_FLAG_FS_SYNC) && bdev) {
+		/* Drop ctl_mutex in order to avoid reverse-order locking:
+		 * thaw_bdev() -> kill_sb() -> blkdev_put() -> bd_mutex */
+		plo->maintenance_type = PLOOP_MNTN_SNAPSHOT;
+		mutex_unlock(&plo->ctl_mutex);
+		thaw_bdev(bdev, sb);
+		mutex_lock(&plo->ctl_mutex);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		bdput(bdev);
+	}
+
+	if (err)
+		goto out_close2;
+
+	return 0;
+
+out_close2:
+	kobject_del(&delta->kobj);
+out_close:
+	kobject_put(&plo->kobj);
+	delta->ops->stop(delta);
+out_destroy:
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	return err;
+}
+
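+/* Reassign delta levels 0..n-1 from bottom to top after a removal; if
+ * a single identity-capable delta remains, the map becomes identical.
+ */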
+static void renumber_deltas(struct ploop_device * plo)
+{
+	struct ploop_delta * delta;
+	int level = 0;
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		delta->level = level++;
+	}
+
+	if (level == 1) {
+		delta = ploop_top_delta(plo);
+		if (delta->level == 0 &&
+		    (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL))
+			set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+	}
+}
+
+static void rename_deltas(struct ploop_device * plo, int level)
+{
+	struct ploop_delta * delta;
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		int err;
+
+		if (delta->level < level)
+			continue;
+#if 0
+		/* Oops, kobject_rename() is not exported! */
+		sprintf(nname, "%d", delta->level);
+		err = kobject_rename(&delta->kobj, nname);
+#else
+		kobject_del(&delta->kobj);
+		err = KOBJECT_ADD(&delta->kobj, &plo->kobj,
+				  "%d", delta->level);
+#endif
+		if (err)
+			printk("rename_deltas: %d %d %d\n", err, level, delta->level);
+	}
+}
+
+/* Delete delta. Obviously, removing an arbitrary delta will destroy
+ * all the data unless this delta is empty, its data are completely
+ * covered by a higher delta, or a lower delta contains a whole copy
+ * of the delta being deleted. The driver does not check this.
+ *
+ * Some cases, e.g. removing a writable top delta, are never valid,
+ * because the caller has no way to ensure that new data do not emerge.
+ * Nevertheless, we do _NOT_ prohibit this operation, assuming that
+ * the caller has some knowledge which we cannot comprehend, e.g. the
+ * virtual machine using the device was stopped, the device was synced
+ * and the data were copied to a lower delta. Still, this is a bad
+ * idea; it should be a different ioctl.
+ */
+
+static int ploop_del_delta(struct ploop_device * plo, unsigned long arg)
+{
+	__u32 level;
+	struct ploop_delta * delta, * next;
+
+	if (copy_from_user(&level, (void*)arg, 4))
+		return -EFAULT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (level == 0 && test_bit(PLOOP_S_RUNNING, &plo->state)) {
+		printk(KERN_INFO "Can't del base delta on running ploop%d\n",
+		       plo->index);
+		return -EBUSY;
+	}
+
+	delta = find_delta(plo, level);
+
+	if (delta == NULL)
+		return -ENOENT;
+
+	kobject_del(&delta->kobj);
+	kobject_put(&plo->kobj);
+
+	ploop_quiesce(plo);
+	next = list_entry(delta->list.next, struct ploop_delta, list);
+	list_del(&delta->list);
+	if (list_empty(&plo->map.delta_list))
+		plo->cookie[0] = 0;
+	if (level != 0)
+		next->ops->refresh(next);
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		ploop_map_remove_delta(&plo->map, level);
+	renumber_deltas(plo);
+	ploop_relax(plo);
+	rename_deltas(plo, level);
+
+	delta->ops->stop(delta);
+	delta->ops->destroy(delta);
+	kobject_put(&delta->kobj);
+	BUG_ON(test_bit(PLOOP_S_RUNNING, &plo->state) &&
+	       list_empty(&plo->map.delta_list));
+	return 0;
+}
+
+static void ploop_merge_process(struct ploop_device * plo)
+{
+	int num_reqs;
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+	plo->merge_ptr = 0;
+
+	init_completion(&plo->maintenance_comp);
+
+	num_reqs = plo->tune.fsync_max;
+	if (num_reqs > plo->tune.max_requests/2)
+		num_reqs = plo->tune.max_requests/2;
+	if (num_reqs < 1)
+		num_reqs = 1;
+
+	for (; num_reqs; num_reqs--) {
+		struct ploop_request * preq;
+
+		preq = ploop_alloc_request(plo);
+
+		preq->bl.tail = preq->bl.head = NULL;
+		preq->req_cluster = ~0U;
+		preq->req_size = 0;
+		preq->req_rw = WRITE_SYNC;
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_MERGE);
+		preq->error = 0;
+		preq->tstamp = jiffies;
+		preq->iblock = 0;
+		preq->prealloc_size = 0;
+
+		atomic_inc(&plo->maintenance_cnt);
+
+		ploop_entry_add(plo, preq);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+int ploop_maintenance_wait(struct ploop_device * plo)
+{
+	int err;
+
+	mutex_unlock(&plo->ctl_mutex);
+
+	err = wait_for_completion_interruptible(&plo->maintenance_comp);
+
+	mutex_lock(&plo->ctl_mutex);
+
+	return atomic_read(&plo->maintenance_cnt) ? err : 0;
+}
+
+static void ploop_update_fmt_version(struct ploop_device * plo)
+{
+	struct ploop_delta * delta = ploop_top_delta(plo);
+
+	if (delta->level == 0 &&
+	    (delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL)) {
+		ploop_map_destroy(&plo->map);
+		set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+		plo->fmt_version = PLOOP_FMT_UNDEFINED;
+	}
+}
+
+static void ploop_merge_cleanup(struct ploop_device * plo,
+				struct ploop_map * map,
+				struct ploop_delta * delta, int err)
+{
+	ploop_quiesce(plo);
+	mutex_lock(&plo->sysfs_mutex);
+	list_del(&delta->list);
+
+	if (err)
+		list_add(&delta->list, &plo->map.delta_list);
+	else
+		ploop_update_fmt_version(plo);
+
+	plo->trans_map = NULL;
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	mutex_unlock(&plo->sysfs_mutex);
+	ploop_map_destroy(map);
+	ploop_relax(plo);
+}
+
+static int ploop_merge(struct ploop_device * plo)
+{
+	int err;
+	struct ploop_map * map;
+	struct ploop_delta * delta, * next;
+	struct ploop_snapdata sd;
+
+	if (plo->maintenance_type == PLOOP_MNTN_MERGE)
+		goto already;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	BUG_ON (plo->trans_map);
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = ploop_top_delta(plo);
+	if (delta->level == 0)
+		return -ENOENT;
+
+	map = kzalloc(sizeof(struct ploop_map), GFP_KERNEL);
+	if (map == NULL)
+		return -ENOMEM;
+
+	map_init(plo, map);
+	ploop_map_start(map, plo->bd_size);
+
+	next = list_entry(delta->list.next, struct ploop_delta, list);
+
+	err = next->ops->prepare_merge(next, &sd);
+	if (err) {
+		printk(KERN_WARNING "prepare_merge for ploop%d failed (%d)\n",
+		       plo->index, err);
+		goto out;
+	}
+
+	ploop_quiesce(plo);
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		ploop_map_destroy(&plo->map);
+
+	err = next->ops->start_merge(next, &sd);
+
+	if (!err) {
+		mutex_lock(&plo->sysfs_mutex);
+		list_del(&delta->list);
+		list_add(&delta->list, &map->delta_list);
+		delta->level = 0;
+		plo->trans_map = map;
+		plo->maintenance_type = PLOOP_MNTN_MERGE;
+		mutex_unlock(&plo->sysfs_mutex);
+	} else {
+		/* Yes. All transient obstacles must be resolved
+		 * in prepare_merge. Failed start_merge means
+		 * abort of the device.
+		 */
+		printk(KERN_WARNING "start_merge for ploop%d failed (%d)\n",
+		       plo->index, err);
+		set_bit(PLOOP_S_ABORT, &plo->state);
+	}
+
+	ploop_relax(plo);
+
+	if (err)
+		goto out;
+
+	ploop_merge_process(plo);
+
+already:
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		return err;
+
+	BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+	if (plo->maintenance_type != PLOOP_MNTN_MERGE)
+		return -EALREADY;
+
+	map = plo->trans_map;
+	BUG_ON (!map);
+
+	delta = map_top_delta(plo->trans_map);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+		printk(KERN_WARNING "merge for ploop%d failed (state ABORT)\n",
+		       plo->index);
+		err = -EIO;
+	}
+
+	ploop_merge_cleanup(plo, map, delta, err);
+
+	if (!err) {
+		kobject_del(&delta->kobj);
+		kobject_put(&plo->kobj);
+
+		delta->ops->stop(delta);
+		delta->ops->destroy(delta);
+		kobject_put(&delta->kobj);
+	}
+out:
+	kfree(map);
+	return err;
+}
+
+static int ploop_truncate(struct ploop_device * plo, unsigned long arg)
+{
+	int err;
+	struct ploop_truncate_ctl ctl;
+	struct ploop_delta * delta;
+	struct file * file;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_truncate_ctl)))
+		return -EFAULT;
+
+	if (ctl.fd < 0)
+		return -EBADF;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = find_delta(plo, ctl.level);
+	if (delta == NULL)
+		return -ENOENT;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY))
+		return -EBUSY;
+
+	if (delta->ops->truncate == NULL)
+		return -EOPNOTSUPP;
+
+	file = fget(ctl.fd);
+	if (file == NULL)
+		return -EBADF;
+
+	ploop_quiesce(plo);
+
+	ploop_map_destroy(&plo->map);
+
+	err = delta->ops->truncate(delta, file, ctl.alloc_head);
+	if (!err)
+		delta->io.prealloced_size = 0;
+
+	ploop_relax(plo);
+
+	fput(file);
+
+	return err;
+}
+
+#define FUSE_SUPER_MAGIC 0x65735546
+#define IS_PSTORAGE(sb) (sb->s_magic == FUSE_SUPER_MAGIC && \
+			 (!strcmp(sb->s_subtype, "pstorage") || \
+			  !strcmp(sb->s_subtype, "vstorage")))
+
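+/* backing_dev_info callback: report "disk full" when free space on the
+ * host filesystem backing the top delta drops below the root or user
+ * threshold. The check is skipped for pstorage/vstorage, filesystems
+ * with the trivial statfs, and preallocated images.
+ */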
+static int ploop_bd_full(struct backing_dev_info *bdi, long long nr, int root)
+{
+	struct ploop_device *plo      = bdi->congested_data;
+	u64		     reserved = 0;
+	int		     rc	      = 0;
+
+	if (root) {
+		if (!plo->tune.disable_root_threshold)
+			reserved = (u64)root_threshold * 1024;
+	} else {
+		if (!plo->tune.disable_user_threshold)
+			reserved = (u64)user_threshold * 1024;
+	}
+
+	if (reserved) {
+		struct kstatfs buf;
+		int	       ret;
+
+		struct ploop_delta *top_delta;
+		struct file	   *file;
+		struct super_block *sb;
+		void		   *jctx = current->journal_info;
+
+		mutex_lock(&plo->sysfs_mutex);
+		top_delta = ploop_top_delta(plo);
+		file	  = top_delta->io.files.file;
+		sb	  = F_DENTRY(file)->d_inode->i_sb;
+
+		/* bd_full can be unsupported or not needed */
+		if (IS_PSTORAGE(sb) || sb->s_op->statfs == simple_statfs ||
+		    top_delta->flags & PLOOP_FMT_PREALLOCATED) {
+			mutex_unlock(&plo->sysfs_mutex);
+			return 0;
+		}
+
+		get_file(file);
+		mutex_unlock(&plo->sysfs_mutex);
+
+		current->journal_info = NULL;
+		ret = sb->s_op->statfs(F_DENTRY(file), &buf);
+		if (ret || buf.f_bfree * buf.f_bsize < reserved + nr) {
+			static unsigned long full_warn_time;
+
+			if (printk_timed_ratelimit(&full_warn_time, 60*60*HZ))
+				printk(KERN_WARNING
+				       "ploop%d: host disk is almost full "
+				       "(%llu < %llu); CT sees -ENOSPC !\n",
+				       plo->index, buf.f_bfree * buf.f_bsize,
+				       reserved + nr);
+
+			rc = 1;
+		}
+
+		fput(file);
+		current->journal_info = jctx;
+	}
+
+	return rc;
+}
+
+static int ploop_start(struct ploop_device * plo, struct block_device *bdev)
+{
+	int err;
+	struct ploop_delta * top_delta, * delta;
+	int i;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EBUSY;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	for (i = 0; i < plo->tune.max_requests; i++) {
+		struct ploop_request * preq;
+		preq = kzalloc(sizeof(struct ploop_request), GFP_KERNEL);
+		if (preq == NULL)
+			break;
+
+		preq->plo = plo;
+		INIT_LIST_HEAD(&preq->delay_list);
+		list_add(&preq->list, &plo->free_list);
+		plo->free_qlen++;
+		plo->free_qmax++;
+	}
+
+	list_for_each_entry_reverse(delta, &plo->map.delta_list, list) {
+		err = delta->ops->start(delta);
+		if (err)
+			return err;
+	}
+
+	ploop_map_start(&plo->map, plo->bd_size);
+
+	top_delta = ploop_top_delta(plo);
+
+	if (top_delta->level == 0 &&
+	    (top_delta->ops->capability & PLOOP_FMT_CAP_IDENTICAL))
+		set_bit(PLOOP_MAP_IDENTICAL, &plo->map.flags);
+
+	/* Deltas are ready. Enable block device. */
+	set_device_ro(bdev, (top_delta->flags & PLOOP_FMT_RDONLY) != 0);
+
+	blk_queue_make_request(plo->queue, ploop_make_request);
+	plo->queue->queuedata = plo;
+	plo->queue->backing_dev_info.congested_fn = ploop_congested;
+	plo->queue->backing_dev_info.congested_fn2 = ploop_congested2;
+	plo->queue->backing_dev_info.bd_full_fn = ploop_bd_full;
+	plo->queue->backing_dev_info.congested_data = plo;
+
+	blk_queue_merge_bvec(plo->queue, ploop_merge_bvec);
+	blk_queue_flush(plo->queue, REQ_FLUSH);
+
+	if (top_delta->io.ops->queue_settings)
+		top_delta->io.ops->queue_settings(&top_delta->io, plo->queue);
+
+	blk_queue_max_discard_sectors(plo->queue, INT_MAX);
+	queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, plo->queue);
+
+	set_capacity(plo->disk, plo->bd_size);
+	bd_set_size(bdev, (loff_t)plo->bd_size << 9);
+	set_blocksize(bdev, PAGE_SIZE);
+
+	plo->thread = kthread_create(ploop_thread, plo, "ploop%d",
+				     plo->index);
+	if (IS_ERR(plo->thread)) {
+		err = PTR_ERR(plo->thread);
+		goto out_err;
+	}
+
+	wake_up_process(plo->thread);
+	set_bit(PLOOP_S_RUNNING, &plo->state);
+	BUG_ON(list_empty(&plo->map.delta_list));
+	return 0;
+
+out_err:
+	plo->thread = NULL;
+	set_capacity(plo->disk, 0);
+	bd_set_size(bdev, 0);
+	return err;
+}
+
+static int ploop_stop(struct ploop_device * plo, struct block_device *bdev)
+{
+	int p;
+	struct ploop_delta * delta;
+	int cnt;
+
+	if (bdev != bdev->bd_contains) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (wrong bdev)\n",
+			       plo->index);
+		return -ENODEV;
+	}
+
+	if (bdev->bd_contains->bd_holders) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (holders=%d)\n",
+			       plo->index, bdev->bd_contains->bd_holders);
+		return -EBUSY;
+	}
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EINVAL;
+
+	if (list_empty(&plo->map.delta_list)) {
+		printk(KERN_INFO "stop ploop%d failed (no deltas)\n",
+		       plo->index);
+		return -ENOENT;
+	}
+
+	cnt = atomic_read(&plo->open_count);
+	if (cnt > 1) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (cnt=%d)\n",
+			       plo->index, cnt);
+		return -EBUSY;
+	}
+
+	cnt = atomic_read(&plo->maintenance_cnt);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF && cnt) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed "
+			       "(type=%d cnt=%d)\n",
+			       plo->index, plo->maintenance_type, cnt);
+		return -EBUSY;
+	}
+
+	if (plo->freeze_state != PLOOP_F_NORMAL) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "stop ploop%d failed (freeze_state=%d)\n",
+			       plo->index, plo->freeze_state);
+		return -EBUSY;
+	}
+
+	clear_bit(PLOOP_S_PUSH_BACKUP, &plo->state);
+	ploop_pb_stop(plo->pbd, true);
+
+	for (p = plo->disk->minors - 1; p > 0; p--)
+		invalidate_partition(plo->disk, p);
+	invalidate_partition(plo->disk, 0);
+
+	clear_bit(PLOOP_S_RUNNING, &plo->state);
+
+	del_timer_sync(&plo->mitigation_timer);
+	del_timer_sync(&plo->freeze_timer);
+
+	/* This will wait for queue drain */
+	kthread_stop(plo->thread);
+	plo->thread = NULL;
+
+	/* queue drained, no more ENOSPC */
+	spin_lock_irq(&plo->lock);
+	if (waitqueue_active(&plo->event_waitq))
+		wake_up_interruptible(&plo->event_waitq);
+	spin_unlock_irq(&plo->lock);
+
+	BUG_ON(plo->entry_qlen);
+	BUG_ON(plo->active_reqs);
+	BUG_ON(plo->barrier_reqs);
+	BUG_ON(plo->fastpath_reqs);
+	BUG_ON(plo->read_sync_reqs);
+
+	list_for_each_entry(delta, &plo->map.delta_list, list) {
+		delta->ops->stop(delta);
+	}
+
+	set_capacity(plo->disk, 0);
+	bd_set_size(bdev, 0);
+
+	if (plo->cached_bio) {
+		bio_put(plo->cached_bio);
+		plo->cached_bio = NULL;
+	}
+
+	while (!list_empty(&plo->free_list)) {
+		struct ploop_request * preq;
+
+		preq = list_first_entry(&plo->free_list, struct ploop_request, list);
+		list_del(&preq->list);
+		plo->free_qlen--;
+		plo->free_qmax--;
+		kfree(preq);
+	}
+	BUG_ON(plo->free_qlen);
+
+	ploop_map_destroy(&plo->map);
+	if (plo->trans_map)
+		ploop_map_destroy(plo->trans_map);
+
+	return 0;
+}
+
+static int ploop_sync(struct ploop_device * plo, struct block_device *bdev)
+{
+	struct ploop_delta * delta;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	delta = ploop_top_delta(plo);
+
+	if (delta->ops->sync == NULL)
+		return 0;
+
+	return delta->ops->sync(delta);
+}
+
+static void destroy_deltas(struct ploop_device * plo, struct ploop_map * map)
+{
+	while (!list_empty(&map->delta_list)) {
+		struct ploop_delta * delta;
+		delta = list_entry(map->delta_list.next, struct ploop_delta, list);
+
+		mutex_lock(&plo->sysfs_mutex);
+		list_del(&delta->list);
+		mutex_unlock(&plo->sysfs_mutex);
+
+		kobject_del(&delta->kobj);
+		kobject_put(&plo->kobj);
+
+		delta->ops->destroy(delta);
+		kobject_put(&delta->kobj);
+	}
+
+	plo->cookie[0] = 0;
+}
+
+static int ploop_clear(struct ploop_device * plo, struct block_device * bdev)
+{
+	int cnt;
+
+	if (test_bit(PLOOP_S_RUNNING, &plo->state)) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed (RUNNING)\n",
+			       plo->index);
+		return -EBUSY;
+	}
+	if (plo->maintenance_type == PLOOP_MNTN_TRACK) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed (TRACK)\n",
+			       plo->index);
+		return -EBUSY;
+	}
+	cnt = atomic_read(&plo->maintenance_cnt);
+	if (plo->maintenance_type != PLOOP_MNTN_OFF && cnt) {
+		if (printk_ratelimit())
+			printk(KERN_INFO "clear ploop%d failed "
+			       "(type=%d cnt=%d)\n",
+			       plo->index, plo->maintenance_type, cnt);
+		return -EBUSY;
+	}
+
+	clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+	clear_bit(PLOOP_S_DISCARD, &plo->state);
+	clear_bit(PLOOP_S_NULLIFY, &plo->state);
+
+	destroy_deltas(plo, &plo->map);
+
+	if (plo->trans_map) {
+		struct ploop_map * map;
+		destroy_deltas(plo, plo->trans_map);
+		map = plo->trans_map;
+		plo->trans_map = NULL;
+		kfree(map);
+	}
+
+	ploop_fb_fini(plo->fbd, 0);
+	ploop_pb_fini(plo->pbd);
+
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	plo->bd_size = 0;
+	plo->state = (1 << PLOOP_S_CHANGED);
+	BUG_ON(test_bit(PLOOP_S_RUNNING, &plo->state));
+	return 0;
+}
+
+static int ploop_index_update_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_index_update_ctl ctl;
+	struct reloc_map *map;
+	int i;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (copy_from_user(&ctl, (void*)arg,
+			   sizeof(struct ploop_index_update_ctl)))
+		return -EFAULT;
+
+	if (!ctl.n_maps)
+		return 0;
+
+	map = kzalloc(sizeof(*map) * ctl.n_maps, GFP_KERNEL);
+	if (!map)
+		return -ENOMEM;
+
+	if (copy_from_user(map, (u8*)arg + sizeof(ctl),
+			   sizeof(*map) * ctl.n_maps)) {
+		kfree(map);
+		return -EFAULT;
+	}
+
+	ploop_quiesce(plo);
+
+	for (i = 0; i < ctl.n_maps; i++)
+		ploop_update_map(&plo->map, ctl.level,
+				 map[i].req_cluster, map[i].iblk);
+
+	ploop_relax(plo);
+
+	kfree(map);
+	return 0;
+}
+
+enum {
+	PLOOP_GROW_RELOC = 0,
+	PLOOP_GROW_NULLIFY,
+	PLOOP_GROW_MAX,
+};
+
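+/* Seed the engine with one maintenance request for the given grow
+ * stage: PLOOP_GROW_RELOC issues a RELOC_A request starting at iblock
+ * 0, PLOOP_GROW_NULLIFY a RELOC_N request walking iblocks from
+ * grow_start onwards.
+ */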
+static void ploop_relocate(struct ploop_device * plo, int grow_stage)
+{
+	struct ploop_request * preq;
+	int reloc_type = (grow_stage == PLOOP_GROW_RELOC) ?
+		PLOOP_REQ_RELOC_A : PLOOP_REQ_RELOC_N;
+
+	BUG_ON(grow_stage != PLOOP_GROW_RELOC &&
+	       grow_stage != PLOOP_GROW_NULLIFY);
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+	plo->grow_relocated = 0;
+
+	if (grow_stage == PLOOP_GROW_NULLIFY)
+		set_bit(PLOOP_S_NULLIFY, &plo->state);
+
+	init_completion(&plo->maintenance_comp);
+
+	preq = ploop_alloc_request(plo);
+
+	preq->bl.tail = preq->bl.head = NULL;
+	preq->req_cluster = 0;
+	preq->req_size = 0;
+	preq->req_rw = WRITE_SYNC;
+	preq->eng_state = PLOOP_E_ENTRY;
+	preq->state = (1 << PLOOP_REQ_SYNC) | (1 << reloc_type);
+	preq->error = 0;
+	preq->tstamp = jiffies;
+	preq->iblock = (reloc_type == PLOOP_REQ_RELOC_A) ? 0 : plo->grow_start;
+	preq->prealloc_size = 0;
+
+	atomic_inc(&plo->maintenance_cnt);
+
+	ploop_entry_add(plo, preq);
+
+	if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+		wake_up_interruptible(&plo->waitq);
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static int ploop_grow(struct ploop_device *plo, struct block_device *bdev,
+		      unsigned long arg)
+{
+	u64 new_size;
+	struct ploop_ctl ctl;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+	int reloc = 0; /* 'relocation needed' flag */
+	int err;
+	int grow_stage = PLOOP_GROW_RELOC;
+
+	if (!delta)
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_GROW) {
+		if (test_bit(PLOOP_S_NULLIFY, &plo->state))
+			grow_stage = PLOOP_GROW_NULLIFY;
+		goto already;
+	}
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(struct ploop_ctl)))
+		return -EFAULT;
+
+	if (ctl.pctl_cluster_log != plo->cluster_log)
+		return -EINVAL;
+
+	if (ctl.pctl_flags & PLOOP_FLAG_CLUBLKS)
+		new_size = (u64)ctl.pctl_size << plo->cluster_log;
+	else
+		new_size = ctl.pctl_size;
+
+	if (plo->bd_size > new_size) /* online shrink not supported */
+		return -EINVAL;
+
+	if (plo->bd_size == new_size) /* nothing to do */
+		return 0;
+
+	if (!delta->ops->prepare_grow)
+		return -EINVAL;
+
+	ploop_quiesce(plo);
+	err = delta->ops->prepare_grow(delta, &new_size, &reloc);
+	if (err)
+		goto grow_failed;
+
+	plo->grow_new_size = new_size;
+
+	/* prepare_grow() succeeded, but more actions needed */
+	if (reloc) {
+		plo->maintenance_type = PLOOP_MNTN_GROW;
+		ploop_relax(plo);
+		for (; grow_stage < PLOOP_GROW_MAX; grow_stage++) {
+			ploop_relocate(plo, grow_stage);
+already:
+			err = ploop_maintenance_wait(plo);
+			if (err)
+				return err;
+
+			BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+			if (plo->maintenance_type != PLOOP_MNTN_GROW)
+				return -EALREADY;
+
+			if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+				clear_bit(PLOOP_S_NULLIFY, &plo->state);
+				plo->maintenance_type = PLOOP_MNTN_OFF;
+				return -EIO;
+			}
+		}
+
+		ploop_quiesce(plo);
+		new_size = plo->grow_new_size;
+		clear_bit(PLOOP_S_NULLIFY, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+
+	/* Update bdev size and friends */
+	if (delta->ops->complete_grow) {
+		err = delta->ops->complete_grow(delta, new_size);
+		if (err)
+			goto grow_failed;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	plo->bd_size = new_size;
+	plo->map.max_index = (plo->bd_size + (1 << plo->cluster_log) - 1 )
+			     >> plo->cluster_log;
+
+	set_capacity(plo->disk, plo->bd_size);
+	bd_set_size(bdev, (loff_t)plo->bd_size << 9);
+
+	mutex_unlock(&plo->sysfs_mutex);
+grow_failed:
+	ploop_relax(plo);
+	return err;
+}
+
+static int ploop_balloon_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_balloon_ctl ctl;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (ctl.inflate && ctl.keep_intact)
+		return -EINVAL;
+
+	switch (plo->maintenance_type) {
+	case PLOOP_MNTN_DISCARD:
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			break;
+
+		ploop_quiesce(plo);
+		clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_FBLOADED;
+		ploop_fb_lost_range_init(plo->fbd, delta->io.alloc_head);
+		ploop_relax(plo);
+		/* fall through */
+	case PLOOP_MNTN_FBLOADED:
+	case PLOOP_MNTN_RELOC:
+		BUG_ON (!plo->fbd);
+		ctl.alloc_head = ploop_fb_get_alloc_head(plo->fbd);
+		ctl.level      = ploop_fb_get_freezed_level(plo->fbd);
+		break;
+	case PLOOP_MNTN_OFF:
+		if (ctl.inflate) {
+			if (delta->ops->id != PLOOP_FMT_PLOOP1)
+				return -EOPNOTSUPP;
+
+			ploop_quiesce(plo);
+			plo->maintenance_type = PLOOP_MNTN_BALLOON;
+			ploop_relax(plo);
+		}
+		break;
+	case PLOOP_MNTN_BALLOON :
+		if (!ctl.inflate && !ctl.keep_intact) {
+			ploop_quiesce(plo);
+			plo->maintenance_type = PLOOP_MNTN_OFF;
+			ploop_relax(plo);
+		}
+	}
+	ctl.mntn_type = plo->maintenance_type;
+
+	if (copy_to_user((void*)arg, &ctl, sizeof(ctl)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ploop_freeblks_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta *delta;
+	struct ploop_freeblks_ctl ctl;
+	struct ploop_freeblks_ctl_extent __user *extents;
+	struct ploop_freeblks_desc *fbd;
+	int i;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_OFF)
+		return -EINVAL;
+	if (plo->maintenance_type != PLOOP_MNTN_BALLOON)
+		return -EBUSY;
+	BUG_ON (plo->fbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	delta = ploop_top_delta(plo);
+	if (delta->level != ctl.level) {
+		rc = -EINVAL;
+		goto exit;
+	}
+
+	fbd = ploop_fb_init(plo);
+	if (!fbd) {
+		rc = -ENOMEM;
+		goto exit;
+	}
+
+	extents = (void __user *)(arg + sizeof(ctl));
+
+	for (i = 0; i < ctl.n_extents; i++) {
+		struct ploop_freeblks_ctl_extent extent;
+
+		if (copy_from_user(&extent, &extents[i],
+					sizeof(extent))) {
+			rc = -EFAULT;
+			ploop_fb_fini(fbd, rc);
+			goto exit;
+		}
+
+		rc = ploop_fb_add_free_extent(fbd, extent.clu,
+					extent.iblk, extent.len);
+		if (rc) {
+			ploop_fb_fini(fbd, rc);
+			goto exit;
+		}
+	}
+
+	ploop_quiesce(plo);
+
+	ctl.alloc_head = delta->io.alloc_head;
+	if (copy_to_user((void*)arg, &ctl, sizeof(ctl))) {
+		rc = -EFAULT;
+		ploop_fb_fini(fbd, rc);
+	} else {
+		iblock_t a_h = delta->io.alloc_head;
+		/* make fbd visible to ploop engine */
+		plo->fbd = fbd;
+		plo->maintenance_type = PLOOP_MNTN_FBLOADED;
+		BUG_ON (a_h != ctl.alloc_head); /* quiesce sanity */
+		ploop_fb_lost_range_init(fbd, a_h);
+		ploop_fb_set_freezed_level(fbd, delta->level);
+	}
+
+	ploop_relax(plo);
+exit:
+	return rc;
+}
+
+static int ploop_fbget_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_freeblks_ctl ctl;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_DISCARD) {
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			return -EINVAL;
+	} else if (plo->maintenance_type != PLOOP_MNTN_FBLOADED)
+		return -EINVAL;
+	BUG_ON (!plo->fbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	ploop_quiesce(plo);
+	rc = ploop_fb_copy_freeblks_to_user(plo->fbd, (void*)arg, &ctl);
+	ploop_relax(plo);
+
+	return rc;
+}
+
+static int ploop_fbfilter_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	int rc = 0;
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD ||
+	    !test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+		return -EINVAL;
+
+	BUG_ON (!plo->fbd);
+
+	ploop_quiesce(plo);
+	rc = ploop_fb_filter_freeblks(plo->fbd, arg);
+	ploop_relax(plo);
+
+	return rc;
+}
+
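+/* Like ploop_merge_process(): seed the engine with a batch of sync
+ * RELOC_S maintenance requests; maintenance_cnt tracks their
+ * completion.
+ */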
+static void ploop_relocblks_process(struct ploop_device *plo)
+{
+	int num_reqs;
+	struct ploop_request *preq;
+
+	num_reqs = plo->tune.fsync_max;
+	if (num_reqs > plo->tune.max_requests/2)
+		num_reqs = plo->tune.max_requests/2;
+	if (num_reqs < 1)
+		num_reqs = 1;
+
+	spin_lock_irq(&plo->lock);
+
+	atomic_set(&plo->maintenance_cnt, 1);
+
+	init_completion(&plo->maintenance_comp);
+
+	for (; num_reqs; num_reqs--) {
+		preq = ploop_alloc_request(plo);
+
+		preq->bl.tail = preq->bl.head = NULL;
+		preq->req_cluster = ~0U; /* uninitialized */
+		preq->req_size = 0;
+		preq->req_rw = WRITE_SYNC;
+		preq->eng_state = PLOOP_E_ENTRY;
+		preq->state = (1 << PLOOP_REQ_SYNC) | (1 << PLOOP_REQ_RELOC_S);
+		preq->error = 0;
+		preq->tstamp = jiffies;
+		preq->iblock = 0;
+		preq->prealloc_size = 0;
+
+		atomic_inc(&plo->maintenance_cnt);
+
+		ploop_entry_add(plo, preq);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+
+	if (atomic_dec_and_test(&plo->maintenance_cnt))
+		complete(&plo->maintenance_comp);
+
+	spin_unlock_irq(&plo->lock);
+}
+
+static int release_fbd(struct ploop_device *plo, int err)
+{
+	clear_bit(PLOOP_S_DISCARD, &plo->state);
+
+	ploop_quiesce(plo);
+	ploop_fb_fini(plo->fbd, err);
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	ploop_relax(plo);
+
+	return err;
+}
+
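+/* After a discard/relocation round: either rearm the freeblks engine
+ * for the next round or tear it down, depending on err and the
+ * PLOOP_S_DISCARD bit.
+ */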
+static void ploop_discard_restart(struct ploop_device *plo, int err)
+{
+	if (!err && test_bit(PLOOP_S_DISCARD, &plo->state)) {
+		ploop_fb_reinit(plo->fbd, 0);
+		atomic_set(&plo->maintenance_cnt, 0);
+		init_completion(&plo->maintenance_comp);
+		plo->maintenance_type = PLOOP_MNTN_DISCARD;
+	} else {
+		clear_bit(PLOOP_S_DISCARD, &plo->state);
+		ploop_fb_fini(plo->fbd, err);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+}
+
+static int ploop_fbdrop_ioc(struct ploop_device *plo)
+{
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type == PLOOP_MNTN_DISCARD) {
+		if (!test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+			return -EINVAL;
+	} else if (plo->maintenance_type != PLOOP_MNTN_FBLOADED)
+		return -EINVAL;
+	BUG_ON (!plo->fbd);
+
+	ploop_quiesce(plo);
+	ploop_discard_restart(plo, 0);
+	ploop_relax(plo);
+
+	return 0;
+}
+
+static int ploop_relocblks_ioc(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_delta *delta = ploop_top_delta(plo);
+	struct ploop_relocblks_ctl ctl;
+	struct ploop_freeblks_desc *fbd = plo->fbd;
+	int i;
+	int err = 0;
+	int n_free;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (!fbd || (plo->maintenance_type != PLOOP_MNTN_FBLOADED &&
+		     plo->maintenance_type != PLOOP_MNTN_RELOC))
+		return -EINVAL;
+
+	BUG_ON(test_bit(PLOOP_S_DISCARD_LOADED, &plo->state));
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (delta->level != ctl.level ||
+	    ploop_fb_get_freezed_level(plo->fbd) != ctl.level ||
+	    ploop_fb_get_alloc_head(plo->fbd) != ctl.alloc_head) {
+		return -EINVAL;
+	}
+
+	if (plo->maintenance_type == PLOOP_MNTN_RELOC)
+		goto already;
+
+	if (ctl.n_extents) {
+		struct ploop_relocblks_ctl_extent __user *extents;
+
+		extents = (void __user *)(arg + sizeof(ctl));
+
+		for (i = 0; i < ctl.n_extents; i++) {
+			struct ploop_relocblks_ctl_extent extent;
+
+			if (copy_from_user(&extent, &extents[i],
+						sizeof(extent)))
+				return release_fbd(plo, -EFAULT);
+
+			/* this extent is also present in freemap */
+			err = ploop_fb_add_reloc_extent(fbd, extent.clu,
+					extent.iblk, extent.len, extent.free);
+			if (err)
+				return release_fbd(plo, err);
+		}
+	}
+
+	ploop_quiesce(plo);
+
+	/* alloc_head must never decrease */
+	BUG_ON (delta->io.alloc_head < ploop_fb_get_alloc_head(plo->fbd));
+	n_free = ploop_fb_get_n_free(plo->fbd);
+
+	/*
+	 * before relocation starts, the freeblks engine may provide
+	 * only free blocks
+	 */
+	BUG_ON (delta->io.alloc_head > ploop_fb_get_alloc_head(plo->fbd) &&
+		n_free);
+	ploop_fb_relocation_start(plo->fbd, ctl.n_scanned);
+
+	if (!n_free || !ctl.n_extents)
+		goto truncate;
+
+	plo->maintenance_type = PLOOP_MNTN_RELOC;
+
+	ploop_relax(plo);
+
+	ploop_relocblks_process(plo);
+already:
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		return err;
+
+	BUG_ON(atomic_read(&plo->maintenance_cnt));
+
+	if (plo->maintenance_type != PLOOP_MNTN_RELOC)
+		return -EALREADY;
+
+	fbd = plo->fbd;
+	BUG_ON (!fbd);
+
+	if (test_bit(PLOOP_S_ABORT, &plo->state)) {
+		clear_bit(PLOOP_S_DISCARD,&plo->state);
+
+		ploop_fb_fini(plo->fbd, -EIO);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+		return -EIO;
+	}
+
+	if (ploop_fb_get_n_relocated(fbd) != ploop_fb_get_n_relocating(fbd))
+		return release_fbd(plo, -EIO);
+
+	/* time to truncate */
+	ploop_quiesce(plo);
+truncate:
+	if (ploop_fb_get_lost_range_len(plo->fbd) != 0) {
+		BUG_ON (delta->io.alloc_head >
+			ploop_fb_get_alloc_head(plo->fbd));
+		err = delta->ops->truncate(delta, NULL,
+					   ploop_fb_get_first_lost_iblk(plo->fbd));
+		if (!err) {
+			delta->io.prealloced_size = 0;
+			ctl.alloc_head = ploop_fb_get_lost_range_len(plo->fbd);
+			if (copy_to_user((void*)arg, &ctl, sizeof(ctl)))
+				err = -EFAULT;
+		}
+	} else {
+		ctl.alloc_head = 0;
+		if (copy_to_user((void*)arg, &ctl, sizeof(ctl)))
+			err = -EFAULT;
+	}
+
+	ploop_discard_restart(plo, err);
+
+	ploop_relax(plo);
+	return err;
+}
+
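+/* Report the minor of the first free slot: either a gap in the device
+ * tree or an existing device with no deltas configured.
+ */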
+static int ploop_getdevice_ioc(unsigned long arg)
+{
+	int err;
+	int index = 0;
+	struct rb_node *n;
+	struct ploop_getdevice_ctl ctl = {};
+
+	mutex_lock(&ploop_devices_mutex);
+	for (n = rb_first(&ploop_devices_tree); n; n = rb_next(n), index++) {
+		struct ploop_device *plo;
+		plo = rb_entry(n, struct ploop_device, link);
+		if (plo->index != index || list_empty(&plo->map.delta_list))
+			break;
+	}
+	mutex_unlock(&ploop_devices_mutex);
+
+	ctl.minor = index << PLOOP_PART_SHIFT;
+	if (ctl.minor & ~MINORMASK)
+		return -ERANGE;
+	err = copy_to_user((void*)arg, &ctl, sizeof(ctl)) ? -EFAULT : 0;
+	return err;
+}
+
+static int ploop_push_backup_init(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_push_backup_init_ctl ctl;
+	struct ploop_pushbackup_desc *pbd = NULL;
+	int rc = 0;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EINVAL;
+
+	BUG_ON(plo->pbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	pbd = ploop_pb_alloc(plo);
+	if (!pbd) {
+		rc = -ENOMEM;
+		goto pb_init_done;
+	}
+
+	ploop_quiesce(plo);
+
+	rc = ploop_pb_init(pbd, ctl.cbt_uuid, !ctl.cbt_mask_addr);
+	if (rc) {
+		ploop_relax(plo);
+		goto pb_init_done;
+	}
+
+	mutex_lock(&plo->sysfs_mutex);
+	plo->pbd = pbd;
+	mutex_unlock(&plo->sysfs_mutex);
+
+	atomic_set(&plo->maintenance_cnt, 0);
+	plo->maintenance_type = PLOOP_MNTN_PUSH_BACKUP;
+	set_bit(PLOOP_S_PUSH_BACKUP, &plo->state);
+
+	ploop_relax(plo);
+
+	if (ctl.cbt_mask_addr)
+		rc = ploop_pb_copy_cbt_to_user(pbd, (char *)ctl.cbt_mask_addr);
+pb_init_done:
+	if (rc)
+		ploop_pb_fini(pbd);
+	return rc;
+}
+
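+/*
+ * Common helper for the PLOOP_READ/PLOOP_PEEK directions: fetch up to
+ * ctl->n_extents extents one by one via the 'get' callback
+ * (ploop_pb_get_pending or ploop_pb_peek) and copy them to the user
+ * buffer that follows the ctl structure at 'arg'.
+ */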
+static int ploop_push_backup_io_get(struct ploop_device *plo,
+		unsigned long arg, struct ploop_push_backup_io_ctl *ctl,
+		int (*get)(struct ploop_pushbackup_desc *, cluster_t *,
+			   cluster_t *, unsigned))
+{
+	struct ploop_push_backup_ctl_extent *e;
+	unsigned n_extents = 0;
+	int rc = 0;
+	cluster_t clu = 0;
+	cluster_t len = 0;
+
+	e = kmalloc(sizeof(*e) * ctl->n_extents, GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	while (n_extents < ctl->n_extents) {
+		rc = get(plo->pbd, &clu, &len, n_extents);
+		if (rc == -ENOENT && n_extents)
+			break;
+		else if (rc)
+			goto io_get_done;
+
+		e[n_extents].clu = clu;
+		e[n_extents].len = len;
+		n_extents++;
+	}
+
+	rc = -EFAULT;
+	ctl->n_extents = n_extents;
+	if (copy_to_user((void*)arg, ctl, sizeof(*ctl)))
+		goto io_get_done;
+	if (n_extents &&
+	    copy_to_user((void*)(arg + sizeof(*ctl)), e,
+			 n_extents * sizeof(*e)))
+		goto io_get_done;
+	rc = 0;
+
+io_get_done:
+	kfree(e);
+	return rc;
+}
+
+static int ploop_push_backup_io_read(struct ploop_device *plo,
+		unsigned long arg, struct ploop_push_backup_io_ctl *ctl)
+{
+	return ploop_push_backup_io_get(plo, arg, ctl, ploop_pb_get_pending);
+}
+
+static int ploop_push_backup_io_peek(struct ploop_device *plo,
+		unsigned long arg, struct ploop_push_backup_io_ctl *ctl)
+{
+	int rc;
+
+	rc = ploop_push_backup_io_get(plo, arg, ctl, ploop_pb_peek);
+
+	if (rc == -ENOENT) {
+		ctl->n_extents = 0;
+		if (copy_to_user((void*)arg, ctl, sizeof(*ctl)))
+			rc = -EFAULT;
+		else
+			rc = 0;
+	}
+
+	return rc;
+}
+
+static int ploop_push_backup_io_write(struct ploop_device *plo, unsigned long arg,
+				      struct ploop_push_backup_io_ctl *ctl)
+{
+	struct ploop_push_backup_ctl_extent *e;
+	unsigned i;
+	int rc = 0;
+
+	e = kmalloc(sizeof(*e) * ctl->n_extents, GFP_KERNEL);
+	if (!e)
+		return -ENOMEM;
+
+	rc = -EFAULT;
+	if (copy_from_user(e, (void*)(arg + sizeof(*ctl)),
+			   ctl->n_extents * sizeof(*e)))
+		goto io_write_done;
+
+	rc = 0;
+	for (i = 0; i < ctl->n_extents; i++)
+		ploop_pb_put_reported(plo->pbd, e[i].clu, e[i].len);
+
+io_write_done:
+	kfree(e);
+	return rc;
+}
+
+static int ploop_push_backup_io(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_push_backup_io_ctl ctl;
+	struct ploop_pushbackup_desc *pbd = plo->pbd;
+
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	if (plo->maintenance_type != PLOOP_MNTN_PUSH_BACKUP)
+		return -EINVAL;
+
+	BUG_ON (!pbd);
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (!ctl.n_extents)
+		return -EINVAL;
+
+	if (ploop_pb_check_uuid(pbd, ctl.cbt_uuid)) {
+		printk("ploop(%d): PUSH_BACKUP_IO uuid mismatch\n",
+		       plo->index);
+		return -EINVAL;
+	}
+
+	switch (ctl.direction) {
+	case PLOOP_READ:
+		return ploop_push_backup_io_read(plo, arg, &ctl);
+	case PLOOP_WRITE:
+		return ploop_push_backup_io_write(plo, arg, &ctl);
+	case PLOOP_PEEK:
+		return ploop_push_backup_io_peek(plo, arg, &ctl);
+	}
+
+	return -EINVAL;
+}
+
+static int ploop_push_backup_stop(struct ploop_device *plo, unsigned long arg)
+{
+	struct ploop_pushbackup_desc *pbd = plo->pbd;
+	struct ploop_push_backup_stop_ctl ctl;
+	int ret;
+
+	if (plo->maintenance_type != PLOOP_MNTN_PUSH_BACKUP)
+		return -EINVAL;
+
+	if (copy_from_user(&ctl, (void*)arg, sizeof(ctl)))
+		return -EFAULT;
+
+	if (pbd && ploop_pb_check_uuid(pbd, ctl.cbt_uuid)) {
+		printk("ploop(%d): PUSH_BACKUP_STOP uuid mismatch\n",
+		       plo->index);
+		return -EINVAL;
+	}
+
+	ret = ploop_pb_destroy(plo, &ctl.status);
+	if (ret)
+		return ret;
+
+	return copy_to_user((void*)arg, &ctl, sizeof(ctl)) ? -EFAULT : 0;
+}
+
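+/*
+ * PLOOP_IOC_FREEZE: freeze the filesystem mounted over the ploop
+ * device (or over the dm-crypt device on top of it, if any) with
+ * freeze_bdev(). The bdev reference is held in plo->frozen_bdev
+ * until PLOOP_IOC_THAW.
+ */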
+static int ploop_freeze(struct ploop_device *plo, struct block_device *bdev)
+{
+	struct super_block *sb;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EINVAL;
+
+	if (plo->freeze_state == PLOOP_F_FROZEN)
+		return 0;
+
+	if (plo->freeze_state == PLOOP_F_THAWING)
+		return -EBUSY;
+
+	if (plo->dm_crypt_bdev)
+		bdev = plo->dm_crypt_bdev;
+
+	bdgrab(bdev);
+	sb = freeze_bdev(bdev);
+	if (sb && IS_ERR(sb)) {
+		bdput(bdev);
+		return PTR_ERR(sb);
+	}
+
+	plo->frozen_bdev = bdev;
+	plo->freeze_state = PLOOP_F_FROZEN;
+	return 0;
+}
+
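+/*
+ * PLOOP_IOC_THAW: undo PLOOP_IOC_FREEZE. ctl_mutex is dropped around
+ * thaw_bdev(), apparently so that thawing (which may block on
+ * in-flight I/O) does not stall other ioctls; the PLOOP_F_THAWING
+ * state guards against re-entry meanwhile.
+ */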
+static int ploop_thaw(struct ploop_device *plo)
+{
+	struct block_device *bdev = plo->frozen_bdev;
+	struct super_block *sb = bdev ? bdev->bd_super : NULL;
+	int err;
+
+	if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return -EINVAL;
+
+	if (plo->freeze_state == PLOOP_F_NORMAL)
+		return 0;
+
+	if (plo->freeze_state == PLOOP_F_THAWING)
+		return -EBUSY;
+
+	plo->frozen_bdev = NULL;
+	plo->freeze_state = PLOOP_F_THAWING;
+
+	mutex_unlock(&plo->ctl_mutex);
+	err = thaw_bdev(bdev, sb);
+	bdput(bdev);
+	mutex_lock(&plo->ctl_mutex);
+
+	BUG_ON(plo->freeze_state != PLOOP_F_THAWING);
+
+	if (!err)
+		plo->freeze_state = PLOOP_F_NORMAL;
+	else
+		plo->freeze_state = PLOOP_F_FROZEN;
+
+	return err;
+}
+
+static int ploop_ioctl(struct block_device *bdev, fmode_t fmode, unsigned int cmd,
+		       unsigned long arg)
+{
+	struct ploop_device *plo = bdev->bd_disk->private_data;
+	int err = -EINVAL;
+
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	mutex_lock(&plo->ctl_mutex);
+
+	if (plo->maintenance_type == PLOOP_MNTN_SNAPSHOT) {
+		mutex_unlock(&plo->ctl_mutex);
+		return -EBUSY;
+	}
+
+	switch (cmd) {
+	case PLOOP_IOC_ADD_DELTA:
+		err = ploop_add_delta(plo, arg);
+		break;
+	case PLOOP_IOC_DEL_DELTA:
+		err = ploop_del_delta(plo, arg);
+		break;
+	case PLOOP_IOC_REPLACE_DELTA:
+		err = ploop_replace_delta(plo, arg);
+		break;
+	case PLOOP_IOC_SNAPSHOT:
+		err = ploop_snapshot(plo, arg, bdev);
+		break;
+	case PLOOP_IOC_CLEAR:
+		err = ploop_clear(plo, bdev);
+		break;
+	case PLOOP_IOC_STOP:
+		err = ploop_stop(plo, bdev);
+		break;
+	case PLOOP_IOC_START:
+		err = ploop_start(plo, bdev);
+		break;
+	case PLOOP_IOC_SYNC:
+		err = ploop_sync(plo, bdev);
+		break;
+
+	case PLOOP_IOC_TRACK_INIT:
+		err = ploop_tracker_init(plo, arg);
+		break;
+	case PLOOP_IOC_TRACK_SETPOS:
+		err = ploop_tracker_setpos(plo, arg);
+		break;
+	case PLOOP_IOC_TRACK_STOP:
+		err = ploop_tracker_stop(plo, 0);
+		break;
+	case PLOOP_IOC_TRACK_ABORT:
+		err = ploop_tracker_stop(plo, 1);
+		break;
+	case PLOOP_IOC_TRACK_READ:
+		err = ploop_tracker_read(plo, arg);
+		break;
+
+	case PLOOP_IOC_MERGE:
+		err = ploop_merge(plo);
+		break;
+	case PLOOP_IOC_TRUNCATE:
+		err = ploop_truncate(plo, arg);
+		break;
+	case PLOOP_IOC_UPDATE_INDEX:
+		err = ploop_index_update_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_GROW:
+		err = ploop_grow(plo, bdev, arg);
+		break;
+	case PLOOP_IOC_BALLOON:
+		err = ploop_balloon_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FREEBLKS:
+		err = ploop_freeblks_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBGET:
+		err = ploop_fbget_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBFILTER:
+		err = ploop_fbfilter_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_FBDROP:
+		err = ploop_fbdrop_ioc(plo);
+		break;
+	case PLOOP_IOC_RELOCBLKS:
+		err = ploop_relocblks_ioc(plo, arg);
+		break;
+	case PLOOP_IOC_GETDEVICE:
+		err = ploop_getdevice_ioc(arg);
+		break;
+
+	case PLOOP_IOC_DISCARD_INIT:
+		err = ploop_discard_init_ioc(plo);
+		break;
+	case PLOOP_IOC_DISCARD_FINI:
+		err = ploop_discard_fini_ioc(plo);
+		break;
+	case PLOOP_IOC_DISCARD_WAIT:
+		err = ploop_discard_wait_ioc(plo);
+		break;
+	case PLOOP_IOC_MAX_DELTA_SIZE:
+		err = ploop_set_max_delta_size(plo, arg);
+		break;
+	case PLOOP_IOC_PUSH_BACKUP_INIT:
+		err = ploop_push_backup_init(plo, arg);
+		break;
+	case PLOOP_IOC_PUSH_BACKUP_IO:
+		err = ploop_push_backup_io(plo, arg);
+		break;
+	case PLOOP_IOC_PUSH_BACKUP_STOP:
+		err = ploop_push_backup_stop(plo, arg);
+		break;
+	case PLOOP_IOC_FREEZE:
+		err = ploop_freeze(plo, bdev);
+		break;
+	case PLOOP_IOC_THAW:
+		err = ploop_thaw(plo);
+		break;
+	default:
+		err = -EINVAL;
+	}
+	mutex_unlock(&plo->ctl_mutex);
+	return err;
+}
+
+static int ploop_media_changed(struct gendisk *disk)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	return test_bit(PLOOP_S_CHANGED, &plo->state);
+}
+
+static int ploop_revalidate(struct gendisk *disk)
+{
+	struct ploop_device *plo = disk->private_data;
+
+	clear_bit(PLOOP_S_CHANGED, &plo->state);
+	return 0;
+}
+
+static const struct block_device_operations ploop_dev_fops = {
+	.owner =		THIS_MODULE,
+	.open =			ploop_open,
+	.release =		ploop_release,
+	.ioctl =		ploop_ioctl,
+	.media_changed =	ploop_media_changed,
+	.revalidate_disk =	ploop_revalidate,
+};
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo loopback device driver");
+MODULE_ALIAS_BLOCKDEV_MAJOR(PLOOP_DEVICE_MAJOR);
+
+atomic_t plo_count = ATOMIC_INIT(0);
+
+static struct sysfs_ops ploop_sysfs_ops = { };
+
+static void ploop_obj_release(struct kobject *kobj)
+{
+	struct ploop_device *plo = container_of(kobj, struct ploop_device, kobj);
+	kfree(plo);
+	atomic_dec(&plo_count);
+}
+
+static struct kobj_type ploop_ktype = {
+	.sysfs_ops	= &ploop_sysfs_ops,
+	.release	= ploop_obj_release,
+};
+
+static struct ploop_device *__ploop_dev_alloc(int index)
+{
+	struct ploop_device *plo;
+	struct gendisk *dk;
+
+	plo = kzalloc(sizeof(*plo), GFP_KERNEL);
+	if (!plo)
+		goto out;
+
+	plo->queue = blk_alloc_queue(GFP_KERNEL);
+	if (!plo->queue)
+		goto out_mem;
+
+	dk = plo->disk = alloc_disk(PLOOP_PART_MAX);
+	if (!plo->disk)
+		goto out_queue;
+
+	spin_lock_init(&plo->lock);
+	spin_lock_init(&plo->dummy_lock);
+	plo->queue->queue_lock = &plo->dummy_lock;
+	mutex_init(&plo->ctl_mutex);
+	mutex_init(&plo->sysfs_mutex);
+	plo->index = index;
+	plo->state = 0;
+	atomic_set(&plo->open_count, 0);
+	init_timer(&plo->mitigation_timer);
+	plo->mitigation_timer.function = mitigation_timeout;
+	plo->mitigation_timer.data = (unsigned long)plo;
+	init_timer(&plo->freeze_timer);
+	plo->freeze_timer.function = freeze_timeout;
+	plo->freeze_timer.data = (unsigned long)plo;
+	INIT_LIST_HEAD(&plo->entry_queue);
+	plo->entry_tree[0] = plo->entry_tree[1] = RB_ROOT;
+	plo->lockout_tree = RB_ROOT;
+	plo->lockout_pb_tree = RB_ROOT;
+	INIT_LIST_HEAD(&plo->ready_queue);
+	INIT_LIST_HEAD(&plo->free_list);
+	init_waitqueue_head(&plo->waitq);
+	init_waitqueue_head(&plo->req_waitq);
+	init_waitqueue_head(&plo->freeze_waitq);
+	init_waitqueue_head(&plo->event_waitq);
+	plo->tune = DEFAULT_PLOOP_TUNE;
+	map_init(plo, &plo->map);
+	track_init(plo);
+	KOBJECT_INIT(&plo->kobj, &ploop_ktype);
+	atomic_inc(&plo_count);
+	bio_list_init(&plo->bio_discard_list);
+
+	dk->major		= ploop_major;
+	dk->first_minor		= index << PLOOP_PART_SHIFT;
+	dk->minors		= PLOOP_PART_MAX;
+	dk->fops		= &ploop_dev_fops;
+	dk->private_data	= plo;
+	dk->queue		= plo->queue;
+	snprintf(dk->disk_name, sizeof(dk->disk_name), "ploop%d", index);
+	return plo;
+
+out_queue:
+	blk_cleanup_queue(plo->queue);
+out_mem:
+	kfree(plo);
+out:
+	return NULL;
+}
+
+static void ploop_dev_del(struct ploop_device *plo)
+{
+	ploop_tracker_destroy(plo, 1);
+	ploop_sysfs_uninit(plo);
+	del_gendisk(plo->disk);
+	blk_cleanup_queue(plo->queue);
+	put_disk(plo->disk);
+	rb_erase(&plo->link, &ploop_devices_tree);
+	ploop_fb_fini(plo->fbd, 0);
+	kobject_put(&plo->kobj);
+}
+
+static void ploop_dev_insert(struct ploop_device *plo)
+{
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_device * pl;
+
+	p = &ploop_devices_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		pl = rb_entry(parent, struct ploop_device, link);
+		BUG_ON (plo->index == pl->index);
+
+		if (plo->index < pl->index)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&plo->link, parent, p);
+	rb_insert_color(&plo->link, &ploop_devices_tree);
+}
+
+static struct ploop_device *ploop_dev_search(int index)
+{
+	struct rb_node *n = ploop_devices_tree.rb_node;
+
+	while (n) {
+		struct ploop_device *plo;
+		plo = rb_entry(n, struct ploop_device, link);
+
+		if (index < plo->index)
+			n = n->rb_left;
+		else if (index > plo->index)
+			n = n->rb_right;
+		else
+			return plo;
+	}
+
+	return NULL;
+}
+
+static struct ploop_device *ploop_dev_init(int index)
+{
+	struct ploop_device *plo = ploop_dev_search(index);
+
+	if (plo) {
+		BUG_ON(list_empty(&plo->map.delta_list) &&
+		       test_bit(PLOOP_S_NULLIFY, &plo->state));
+		return plo;
+	}
+
+	plo = __ploop_dev_alloc(index);
+	if (plo) {
+		add_disk(plo->disk);
+		ploop_sysfs_init(plo);
+		ploop_dev_insert(plo);
+	}
+	return plo;
+}
+
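+/*
+ * Probe callback for blk_register_region(): invoked on first access
+ * to a ploop minor, it creates the device on demand and returns a
+ * reference to its gendisk kobject.
+ */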
+static struct kobject *ploop_dev_probe(dev_t dev, int *part, void *data)
+{
+	struct kobject *kobj;
+	struct ploop_device *plo;
+
+	*part = dev & (PLOOP_PART_MAX - 1);
+	mutex_lock(&ploop_devices_mutex);
+	plo = ploop_dev_init((dev & MINORMASK) >> PLOOP_PART_SHIFT);
+	if (!plo)
+		kobj = ERR_PTR(-ENOMEM);
+	else
+		kobj = get_disk(plo->disk);
+	mutex_unlock(&ploop_devices_mutex);
+
+	return kobj;
+}
+
+/* Functions to service /proc/vz/ploop_minor */
+
+static int ploop_minor_show(struct seq_file *m, void *v)
+{
+	struct ploop_device *plo = m->private;
+	seq_printf(m, "%d\n", plo->index << PLOOP_PART_SHIFT);
+	return 0;
+}
+
+/* Returns random index from 10000 - 65535 range */
+static unsigned ploop_random_index(void)
+{
+	unsigned int n;
+
+	get_random_bytes(&n, sizeof(n));
+
+	return 10000 + n % (65536 - 10000);
+}
+
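+/*
+ * Opening /proc/vz/ploop_minor picks a free device (allocating one at
+ * a random index if necessary), locks it with PLOOP_S_LOCKED so the
+ * reported minor stays reserved, and publishes the minor via
+ * ploop_minor_show(). The lock is dropped on release.
+ */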
+static int ploop_minor_open(struct inode *inode, struct file *file)
+{
+	int index = 0;
+	struct rb_node *n;
+	struct ploop_device *plo = NULL;
+	int found = 0;
+	int ret;
+
+	mutex_lock(&ploop_devices_mutex);
+	for (n = rb_first(&ploop_devices_tree); n; n = rb_next(n)) {
+		plo = rb_entry(n, struct ploop_device, link);
+		if (list_empty(&plo->map.delta_list) &&
+		    !test_bit(PLOOP_S_LOCKED, &plo->locking_state)) {
+			found = 1;
+			break;
+		}
+	}
+
+	if (!found) {
+		int i = 0;
+
+		index = ploop_random_index();
+		plo = ploop_dev_search(index);
+
+		while (plo) {
+			for (n = &plo->link; n; n = rb_next(n), index++) {
+				plo = rb_entry(n, struct ploop_device, link);
+				if (plo->index != index ||
+				    (list_empty(&plo->map.delta_list) &&
+				     !test_bit(PLOOP_S_LOCKED, &plo->locking_state)))
+					break;
+			}
+
+			BUG_ON (plo->index == index);
+
+			/* not more than two iterations */
+			if (i++ == 2)
+				break;
+
+			if ((index << PLOOP_PART_SHIFT) & ~MINORMASK) {
+				index = 0;
+				plo = ploop_dev_search(index);
+			} else
+				plo = NULL;
+		}
+
+		if ((index << PLOOP_PART_SHIFT) & ~MINORMASK) {
+			mutex_unlock(&ploop_devices_mutex);
+			return -ERANGE;
+		}
+
+		plo = __ploop_dev_alloc(index);
+		if (!plo) {
+			mutex_unlock(&ploop_devices_mutex);
+			return -ENOMEM;
+		}
+
+		add_disk(plo->disk);
+		ploop_sysfs_init(plo);
+		ploop_dev_insert(plo);
+	}
+	BUG_ON(test_bit(PLOOP_S_NULLIFY, &plo->state));
+	set_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	mutex_unlock(&ploop_devices_mutex);
+
+	ret = single_open(file, ploop_minor_show, plo);
+	if (ret)
+		clear_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	return ret;
+}
+
+static int ploop_minor_release(struct inode *inode, struct file *filp)
+{
+	struct ploop_device *plo = ((struct seq_file *)filp->private_data)->private;
+	clear_bit(PLOOP_S_LOCKED, &plo->locking_state);
+	return single_release(inode, filp);
+}
+
+static const struct file_operations proc_ploop_minor = {
+	.owner          = THIS_MODULE,
+	.open		= ploop_minor_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= ploop_minor_release,
+};
+
+module_param(ploop_max, int, 0);
+MODULE_PARM_DESC(ploop_max, "Maximum number of ploop devices");
+module_param(ploop_major, int, 0);
+MODULE_PARM_DESC(ploop_major, "Major number of ploop device");
+module_param(max_map_pages, int, 0644);
+MODULE_PARM_DESC(max_map_pages, "Maximum number of pages taken by map cache");
+module_param(root_threshold, long, 0644);
+MODULE_PARM_DESC(root_threshold, "Disk space reserved for root (in kilobytes)");
+module_param(user_threshold, long, 0644);
+MODULE_PARM_DESC(user_threshold, "Disk space reserved for user (in kilobytes)");
+module_param(large_disk_support, int, 0444);
+MODULE_PARM_DESC(large_disk_support, "Support of large disks (>2TB)");
+
+static int __init ploop_mod_init(void)
+{
+	int err;
+
+	/* _XXX_ should be estimated from available ram */
+	if (max_map_pages == 0)
+		max_map_pages = 1024;
+
+	err = ploop_map_init();
+	if (err)
+		goto out_err;
+
+	err = -EBUSY;
+	if (register_blkdev(ploop_major, "ploop"))
+		goto out_err;
+
+	blk_register_region(MKDEV(ploop_major, 0), ploop_max,
+			THIS_MODULE, ploop_dev_probe, NULL, NULL);
+
+	if (!proc_create("ploop_minor", 0440,
+			 proc_vz_dir, &proc_ploop_minor))
+		goto out_err2;
+
+	printk(KERN_INFO "ploop_dev: module loaded\n");
+	return 0;
+
+out_err2:
+	err = -ENOMEM;
+	blk_unregister_region(MKDEV(ploop_major, 0), ploop_max);
+	unregister_blkdev(ploop_major, "ploop");
+out_err:
+	ploop_map_exit();
+	return err;
+}
+
+static void __exit ploop_mod_exit(void)
+{
+	struct rb_node * n;
+
+	remove_proc_entry("ploop_minor", proc_vz_dir);
+	while ((n = rb_first(&ploop_devices_tree)) != NULL)
+		ploop_dev_del(rb_entry(n, struct ploop_device, link));
+	blk_unregister_region(MKDEV(ploop_major, 0), ploop_max);
+	unregister_blkdev(ploop_major, "ploop");
+	ploop_map_exit();
+	WARN_ON(atomic_read(&plo_count));
+}
+module_init(ploop_mod_init);
+module_exit(ploop_mod_exit);
--- /dev/null
+++ b/drivers/block/ploop/discard.c
@@ -0,0 +1,115 @@
+/*
+ *  drivers/block/ploop/discard.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/bio.h>
+
+#include <linux/ploop/ploop.h>
+#include "discard.h"
+#include "freeblks.h"
+
+int ploop_discard_init_ioc(struct ploop_device *plo)
+{
+	struct ploop_freeblks_desc *fbd;
+	struct ploop_delta *delta = ploop_top_delta(plo);
+
+	if (delta == NULL)
+		return -EINVAL;
+
+	if (delta->ops->id != PLOOP_FMT_PLOOP1)
+		return -EOPNOTSUPP;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+
+	fbd = ploop_fb_init(plo);
+	if (!fbd)
+		return -ENOMEM;
+
+	ploop_quiesce(plo);
+
+	ploop_fb_set_freezed_level(fbd, delta->level);
+
+	plo->fbd = fbd;
+
+	atomic_set(&plo->maintenance_cnt, 0);
+	init_completion(&plo->maintenance_comp);
+	plo->maintenance_type = PLOOP_MNTN_DISCARD;
+	set_bit(PLOOP_S_DISCARD, &plo->state);
+
+	ploop_relax(plo);
+
+	return 0;
+}
+
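+/*
+ * PLOOP_IOC_DISCARD_FINI: stop discard maintenance. Pending discard
+ * requests are dropped from the entry queue and the freeblks engine
+ * is torn down with -EOPNOTSUPP.
+ */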
+int ploop_discard_fini_ioc(struct ploop_device *plo)
+{
+	int ret = 0;
+	struct ploop_request *preq, *tmp;
+	LIST_HEAD(drop_list);
+
+	if (!test_and_clear_bit(PLOOP_S_DISCARD, &plo->state))
+		return 0;
+
+	ploop_quiesce(plo);
+
+	spin_lock_irq(&plo->lock);
+	list_for_each_entry_safe(preq, tmp, &plo->entry_queue, list)
+		if (test_bit(PLOOP_REQ_DISCARD, &preq->state)) {
+			list_move(&preq->list, &drop_list);
+			ploop_entry_qlen_dec(preq);
+		}
+	spin_unlock_irq(&plo->lock);
+
+	if (!list_empty(&drop_list))
+		ploop_preq_drop(plo, &drop_list, 0);
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	ploop_fb_fini(plo->fbd, -EOPNOTSUPP);
+
+	clear_bit(PLOOP_S_DISCARD_LOADED, &plo->state);
+
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	complete(&plo->maintenance_comp);
+
+out:
+	ploop_relax(plo);
+
+	return ret;
+}
+
+int ploop_discard_wait_ioc(struct ploop_device *plo)
+{
+	int err;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state))
+		return 0;
+
+	if (plo->maintenance_type == PLOOP_MNTN_FBLOADED)
+		return 1;
+
+	if (plo->maintenance_type != PLOOP_MNTN_DISCARD)
+		return -EINVAL;
+
+	err = ploop_maintenance_wait(plo);
+	if (err)
+		goto out;
+
+	/* maintenance_cnt is zero without discard requests;
+	 * in this case ploop_maintenance_wait() returns 0
+	 * instead of -ERESTARTSYS */
+	if (test_bit(PLOOP_S_DISCARD_LOADED, &plo->state))
+		err = 1;
+	else if (signal_pending(current))
+		err = -ERESTARTSYS;
+out:
+	return err;
+}
--- /dev/null
+++ b/drivers/block/ploop/discard.h
@@ -0,0 +1,15 @@
+/*
+ *  drivers/block/ploop/discard.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_PLOOP_DISCARD_H_
+#define _LINUX_PLOOP_DISCARD_H_
+
+extern int ploop_discard_init_ioc(struct ploop_device *plo);
+extern int ploop_discard_fini_ioc(struct ploop_device *plo);
+extern int ploop_discard_wait_ioc(struct ploop_device *plo);
+
+#endif // _LINUX_PLOOP_DISCARD_H_
--- /dev/null
+++ b/drivers/block/ploop/events.h
@@ -0,0 +1,115 @@
+/*
+ *  drivers/block/ploop/events.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#if !defined(_TRACE_EVENTS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENTS_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+
+#define PRINT_BI_RW(rw)	__print_flags(rw, "|",		\
+			{ REQ_WRITE,				"W"},	\
+			{ REQ_FAILFAST_DEV,		"FD"},	\
+			{ REQ_FAILFAST_TRANSPORT,	"FT"},	\
+			{ REQ_FAILFAST_DRIVER,		"FDRV"},\
+			{ REQ_RAHEAD,			"A"},	\
+			{ REQ_SYNC,			"S"},	\
+			{ REQ_META,			"M"},	\
+			{ REQ_DISCARD,			"D"},	\
+			{ REQ_NOIDLE,			"N"},	\
+			{ REQ_FLUSH,			"F"},	\
+			{ REQ_FUA,			"FUA"},	\
+			{ REQ_THROTTLED,		"T"})
+
+#define PRINT_PREQ_STATE(state)					\
+			__print_flags(state, "|",		\
+			{ 1 << PLOOP_REQ_LOCKOUT,	"L"},	\
+			{ 1 << PLOOP_REQ_PB_LOCKOUT,	"BL"},	\
+			{ 1 << PLOOP_REQ_SYNC,		"S"},	\
+			{ 1 << PLOOP_REQ_BARRIER,	"B"},	\
+			{ 1 << PLOOP_REQ_UNSTABLE,	"U"},	\
+			{ 1 << PLOOP_REQ_TRACK,		"TRACK"},\
+			{ 1 << PLOOP_REQ_SORTED,	"SORT"},\
+			{ 1 << PLOOP_REQ_TRANS,		"T"},	\
+			{ 1 << PLOOP_REQ_MERGE,		"M"},	\
+			{ 1 << PLOOP_REQ_RELOC_A,	"RA"},	\
+			{ 1 << PLOOP_REQ_RELOC_S,	"RS"},	\
+			{ 1 << PLOOP_REQ_RELOC_N,	"RN"},	\
+			{ 1 << PLOOP_REQ_ZERO,		"Z"},	\
+			{ 1 << PLOOP_REQ_DISCARD,	"D"})
+
+#define PREQ_FORMAT "preq=0x%p cluster=0x%x iblock=0x%x size=0x%x eng_state=0x%lx state=%s rw=%s"
+
+#define PREQ_ARGS	__entry->preq,				\
+			__entry->clu,				\
+			__entry->iblk,				\
+			__entry->size,				\
+			__entry->eng_state,			\
+			PRINT_PREQ_STATE(__entry->state),	\
+			PRINT_BI_RW(__entry->rw)
+
+DECLARE_EVENT_CLASS(preq_template,
+	TP_PROTO(struct ploop_request *preq),
+
+	TP_ARGS(preq),
+
+	TP_STRUCT__entry(
+		__field(void *,		preq)
+		__field(cluster_t,	clu)
+		__field(iblock_t,	iblk)
+		__field(unsigned int,	size)
+		__field(unsigned long,	eng_state)
+		__field(unsigned long,	state)
+		__field(unsigned int,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->preq		= preq;
+		__entry->clu		= preq->req_cluster;
+		__entry->iblk		= preq->iblock;
+		__entry->size		= preq->req_size;
+		__entry->eng_state	= preq->eng_state;
+		__entry->state		= preq->state;
+		__entry->rw		= preq->req_rw;
+	),
+
+	TP_printk(PREQ_FORMAT, PREQ_ARGS)
+);
+
+DECLARE_EVENT_CLASS(bio_template,
+	TP_PROTO(struct bio *bio),
+
+	TP_ARGS(bio),
+
+	TP_STRUCT__entry(
+		__field(void *,		bio)
+		__field(sector_t,	sector)
+		__field(unsigned int,	size)
+		__field(unsigned long,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->bio		= bio;
+		__entry->sector		= bio->bi_sector;
+		__entry->size		= bio->bi_size;
+		__entry->rw		= bio->bi_rw;
+	),
+
+	TP_printk("bio=0x%p sector=0x%lx size=0x%x rw=%s",
+			__entry->bio,
+			__entry->sector,
+			__entry->size,
+			PRINT_BI_RW(__entry->rw)
+			)
+);
+
+#endif /* _TRACE_EVENTS_H */
--- /dev/null
+++ b/drivers/block/ploop/fmt_ploop1.c
@@ -0,0 +1,603 @@
+/*
+ *  drivers/block/ploop/fmt_ploop1.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+#include "ploop1_image.h"
+
+/* The implementation of the ploop1 (PVD) delta format, defined in ploop1_fmt.h
+ */
+
+#define INDEX_PER_PAGE	     (PAGE_SIZE  / 4)
+#define INDEX_PER_PAGE_SHIFT (PAGE_SHIFT - 2)
+
+struct ploop1_private
+{
+	struct page	*dyn_page;
+	u64		bd_size;
+	u32		alloc_head;
+	sector_t	l1_off;
+};
+
+int ploop1_map_index(struct ploop_delta * delta, unsigned long block, sector_t *sec)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	if ((u64)block << delta->plo->cluster_log >= ph->bd_size)
+		return 0;
+
+	/*
+	 * ondisk_pageno == (block + off) >> INDEX_PER_PAGE_SHIFT
+	 * sec == ondisk_pageno << (PAGE_SHIFT - 9)
+	 * (8 sectors per page, and log(8) == PAGE_SHIFT - 9)
+	 */
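+	/*
+	 * E.g. with 4K pages (INDEX_PER_PAGE == 1024), and assuming
+	 * PLOOP_MAP_OFFSET reserves room for the PVD header in the
+	 * first index page: blocks [0 .. 1023 - PLOOP_MAP_OFFSET] map
+	 * to sector 0, the next 1024 blocks to sector 8, and so on.
+	 */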
+	*sec = ((block + PLOOP_MAP_OFFSET) >> INDEX_PER_PAGE_SHIFT) <<
+	       (PAGE_SHIFT - 9);
+	return 1;
+}
+
+static void
+ploop1_read_index(struct ploop_delta * delta, struct ploop_request * preq,
+		  struct page * page, sector_t sec)
+{
+	return delta->io.ops->read_page(&delta->io, preq, page, sec);
+}
+
+static void
+ploop1_destroy_priv(struct ploop_delta * delta)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	if (ph == NULL)
+		return;
+
+	delta->priv = NULL;
+
+	if (ph->dyn_page)
+		put_page(ph->dyn_page);
+
+	kfree(ph);
+}
+
+static int ploop1_stop(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if ((delta->flags & PLOOP_FMT_RDONLY) ||
+	    test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		return 0;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
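+	/* once alloc_head points past the index area (l1_off), the
+	 * image holds real data, so clear the CIF_Empty flag */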
+	if (ph->alloc_head > (ph->l1_off >> delta->plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	pvd_header_set_disk_closed(vh);
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+ploop1_compose(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	return ploop_io_init(delta, nchunks, pc);
+}
+
+static int
+ploop1_open(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop1_private * ph;
+	struct ploop_pvd_header *vh;
+	u64 i_size;
+	int version;
+
+	err = -ENOMEM;
+	ph = kzalloc(sizeof(struct ploop1_private), GFP_KERNEL);
+	if (ph == NULL)
+		return -ENOMEM;
+
+	delta->priv = ph;
+
+	ph->dyn_page = alloc_page(GFP_KERNEL);
+	if (ph->dyn_page == NULL)
+		goto out_err;
+
+	err = ploop_io_open(&delta->io);
+	if (err)
+		goto out_err;
+
+	/* IO engine is ready. */
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out_err;
+
+	err = -EINVAL;
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	version = ploop1_version(vh);
+	if (version == -1 || 
+	    vh->m_Type	  != cpu_to_le32(PRL_IMAGE_COMPRESSED) ||
+	    vh->m_Sectors != cpu_to_le32(1 << delta->cluster_log))
+		goto out_err;
+
+	/* We don't support mixed configuration of V1 and V2 images */
+	if (delta->plo->fmt_version && delta->plo->fmt_version != version)
+		goto out_err;
+
+	ph->l1_off = le32_to_cpu(vh->m_FirstBlockOffset);
+
+	err = -EBUSY;
+	if (pvd_header_is_disk_in_use(vh))
+		goto out_err;
+
+	err = -EINVAL;
+	i_size = delta->io.ops->i_size_read(&delta->io);
+	ph->alloc_head = i_size >> (delta->cluster_log + 9);
+	if (!(le32_to_cpu(vh->m_Sectors) << 9) ||
+	    do_div(i_size, le32_to_cpu(vh->m_Sectors) << 9))
+		goto out_err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, version);
+
+	if (delta->plo->bd_size > ph->bd_size)
+		goto out_err;
+	if (ph->bd_size & (le32_to_cpu(vh->m_Sectors) - 1))
+		goto out_err;
+	if (delta->plo->bd_size & (le32_to_cpu(vh->m_Sectors) - 1))
+		goto out_err;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY)) {
+		pvd_header_set_disk_in_use(vh);
+		err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+		if (err)
+			goto out_err;
+	}
+
+	delta->io.alloc_head = ph->alloc_head;
+	delta->plo->bd_size = ph->bd_size;
+	delta->plo->fmt_version = version;
+
+	/* If i_size >= max_size, no more allocations needed */
+	if ((u64)ph->alloc_head << (delta->cluster_log + 9) >=
+	    ((u64)ph->bd_size + ph->l1_off) << 9)
+		delta->flags |= PLOOP_FMT_PREALLOCATED;
+
+	return 0;
+
+out_err:
+	ploop1_destroy_priv(delta);
+	return err;
+}
+
+static int
+ploop1_refresh(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, delta->plo->fmt_version);
+
+	return 0;
+}
+
+/*
+ * The function gets a preq with a bio. The caller has checked that
+ * this bio is a write to a block which is not allocated in this delta.
+ * If the block is totally new, the bio can cover only a part of the
+ * block; if the bio is a COW from a previous delta, the function gets
+ * a bio covering the whole cluster, read from the original delta.
+ *
+ * The task of this function is to allocate a new block in the image,
+ * to copy the data there and to update the index afterwards. A lot, huh?
+ */
+
+static void
+ploop1_allocate(struct ploop_delta * delta, struct ploop_request * preq,
+		struct bio_list * sbl, unsigned int size)
+{
+	if (delta->io.alloc_head >=
+			(delta->max_delta_size >> delta->cluster_log)) {
+		PLOOP_FAIL_REQUEST(preq, -E2BIG);
+		return;
+	}
+	delta->io.ops->submit_alloc(&delta->io, preq, sbl, size);
+}
+
+/* Call this when data write is complete */
+
+static void
+ploop1_allocate_complete(struct ploop_delta * delta, struct ploop_request * preq)
+{
+	ploop_index_update(preq);
+}
+
+static void
+ploop1_destroy(struct ploop_delta * delta)
+{
+	ploop_io_destroy(&delta->io);
+	ploop1_destroy_priv(delta);
+}
+
+static int
+ploop1_start(struct ploop_delta * delta)
+{
+	return 0;
+//	return delta->io.ops->start(&delta->io);
+}
+
+static int
+ploop1_sync(struct ploop_delta * delta)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		return 0;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		return -EIO;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	pvd_header_set_disk_in_use(vh);
+
+	if (ph->alloc_head > (ph->l1_off >> delta->plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+ploop1_prepare_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	return delta->io.ops->prepare_snapshot(&delta->io, sd);
+}
+
+static int
+ploop1_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err = 0;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		goto out;
+
+	err = -EIO;
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		goto out;
+
+	ph->alloc_head = delta->io.alloc_head;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	if (ph->alloc_head > (ph->l1_off >> delta->io.plo->cluster_log)) {
+		vh->m_Flags = le32_to_cpu(vh->m_Flags);
+		vh->m_Flags &= ~CIF_Empty;
+		vh->m_Flags = cpu_to_le32(vh->m_Flags);
+	}
+
+	pvd_header_set_disk_closed(vh);
+
+	/*
+	 * NB: we don't call ploop_update_map_hdr() here because top
+	 * delta after snapshot completion should bear m_DiskInUse != 0.
+	 * Also, we rely on the fact that new top delta (created while
+	 * snapshotting) has exactly the same PVD-header as former top
+	 * delta. So, first 64 bytes of correspondent map_node page
+	 * remain valid.
+	 */
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->complete_snapshot(&delta->io, sd);
+	if (err)
+		goto out;
+
+	delta->flags |= PLOOP_FMT_RDONLY;
+	return 0;
+
+out:
+	if (sd->file) {
+		fput(sd->file);
+		sd->file = NULL;
+	}
+	return err;
+}
+
+static int
+ploop1_prepare_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	if (pvd_header_is_disk_in_use(vh))
+		return -EBUSY;
+
+	ph->alloc_head = delta->io.ops->i_size_read(&delta->io) >>
+			 (delta->io.plo->cluster_log + 9);
+	delta->io.alloc_head = ph->alloc_head;
+
+	err = delta->io.ops->prepare_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	delta->flags &= ~PLOOP_FMT_RDONLY;
+	return 0;
+}
+
+static int
+ploop1_start_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+
+	err = delta->io.ops->start_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state)) {
+		printk(KERN_WARNING "ploop1_start_merge for ploop%d failed "
+		       "(state ABORT)\n", delta->plo->index);
+		return -EIO;
+	}
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	pvd_header_set_disk_in_use(vh);
+
+	/* keep hdr in ph->dyn_page and in map_node in sync */
+	ploop_update_map_hdr(&delta->plo->map, (u8 *)vh, sizeof(*vh));
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	ph->bd_size = get_SizeInSectors_from_le(vh, delta->plo->fmt_version);
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int ploop1_truncate(struct ploop_delta * delta, struct file * file,
+			   __u32 alloc_head)
+{
+	struct ploop1_private * ph = delta->priv;
+
+	/*
+	 * Maybe we should call ploop1_refresh() here and re-read the
+	 * PVD header from disk. This will become clear in the course
+	 * of porting ploop-shrink.c::shrink_in_place().
+	 */
+
+	ph->alloc_head = alloc_head;
+	delta->io.alloc_head = alloc_head;
+
+	return delta->io.ops->truncate(&delta->io,
+				       file ? file : delta->io.files.file,
+				       alloc_head);
+}
+
+static int
+ploop1_prepare_grow(struct ploop_delta * delta, u64 *new_size, int *reloc)
+{
+	struct ploop1_private * ph = delta->priv;
+	struct ploop_pvd_header *vh;
+	int idxs_per_iblk; /* # indices in one cluster-block */
+	iblock_t bdsize;   /* block-device size measured in cluster-blocks */
+	int n_present;     /* # cluster-blocks in L2-table (existent now) */
+	int n_needed;      /* # cluster-blocks in L2-table (for new_size) */
+	int n_alloced = 0; /* # cluster-blocks we can alloc right now */
+	int err;
+	iblock_t a_h = delta->io.alloc_head;
+	int	 log = delta->io.plo->cluster_log;
+
+	if (*new_size & ((1 << delta->cluster_log) - 1))
+		return -EINVAL;
+
+	if (*new_size > ploop1_max_size(1 << delta->plo->cluster_log,
+					delta->plo->fmt_version))
+		return -EFBIG;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	n_present  = le32_to_cpu(vh->m_FirstBlockOffset) >> log;
+	BUG_ON (!n_present);
+
+	bdsize = (*new_size + (1 << log) - 1) >> log;
+
+	idxs_per_iblk = (1 << (log + 9)) / sizeof(u32);
+	n_needed = (bdsize + PLOOP_MAP_OFFSET + idxs_per_iblk - 1) /
+		   idxs_per_iblk;
+
+	if (n_needed <= n_present)
+		return 0;
+
+	if (a_h < n_needed) {
+		n_alloced = n_needed - a_h;
+		err = delta->io.ops->alloc(&delta->io,
+					   (loff_t)a_h << (log + 9),
+					   (loff_t)(n_alloced) << (log + 9));
+		if (err)
+			return err;
+	}
+
+	*reloc = n_needed - n_present - n_alloced;
+	if (*reloc) {
+		/* Feeling irresistible infatuation to relocate ... */
+		delta->io.plo->grow_start = n_present;
+		delta->io.plo->grow_end = n_needed - n_alloced - 1;
+	}
+
+	return 0;
+}
+
+static int ploop1_complete_grow(struct ploop_delta * delta, u64 new_size)
+{
+	struct ploop_pvd_header *vh;
+	struct ploop1_private * ph = delta->priv;
+	int err;
+	u32 vh_bsize; /* block size in sectors */
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync_read(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	vh = (struct ploop_pvd_header *)page_address(ph->dyn_page);
+	vh_bsize = le32_to_cpu(vh->m_Sectors);
+
+	if (vh_bsize != (1 << delta->io.plo->cluster_log)) {
+		printk("grow: vh->m_Sectors=%u != 1<<plo->cluster_log=%u\n",
+		       vh_bsize, 1 << delta->io.plo->cluster_log);
+		return -EINVAL;
+	}
+
+	generate_pvd_header(vh, new_size, vh_bsize, delta->plo->fmt_version);
+
+	vh->m_Type             = cpu_to_le32(vh->m_Type);
+	cpu_to_le_SizeInSectors(vh, delta->plo->fmt_version);
+	vh->m_Sectors          = cpu_to_le32(vh->m_Sectors);
+	vh->m_Heads            = cpu_to_le32(vh->m_Heads);
+	vh->m_Cylinders        = cpu_to_le32(vh->m_Cylinders);
+	vh->m_Size             = cpu_to_le32(vh->m_Size);
+	vh->m_FirstBlockOffset = cpu_to_le32(vh->m_FirstBlockOffset);
+
+	/* keep hdr in ph->dyn_page and in map_node in sync */
+	ploop_update_map_hdr(&delta->plo->map, (u8 *)vh, sizeof(*vh));
+
+	err = delta->io.ops->sync_write(&delta->io, ph->dyn_page, 4096, 0, 0);
+	if (err)
+		return err;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		return err;
+
+	ph->bd_size = new_size;
+	ph->l1_off = le32_to_cpu(vh->m_FirstBlockOffset);
+
+	return 0;
+}
+
+static struct ploop_delta_ops ploop1_delta_ops =
+{
+	.id		=	PLOOP_FMT_PLOOP1,
+	.name		=	"ploop1",
+	.owner		=	THIS_MODULE,
+	.capability	=	PLOOP_FMT_CAP_WRITABLE | PLOOP_FMT_CAP_DELTA,
+
+	.map_index	=	ploop1_map_index,
+	.read_index	=	ploop1_read_index,
+
+	.allocate	=	ploop1_allocate,
+	.allocate_complete =	ploop1_allocate_complete,
+
+	.compose	=	ploop1_compose,
+	.open		=	ploop1_open,
+	.destroy	=	ploop1_destroy,
+	.start		=	ploop1_start,
+	.stop		=	ploop1_stop,
+	.refresh	=	ploop1_refresh,
+	.sync		=	ploop1_sync,
+	.prepare_snapshot =	ploop1_prepare_snapshot,
+	.complete_snapshot =	ploop1_complete_snapshot,
+	.prepare_merge	=	ploop1_prepare_merge,
+	.start_merge	=	ploop1_start_merge,
+	.truncate	=	ploop1_truncate,
+	.prepare_grow	=	ploop1_prepare_grow,
+	.complete_grow	=	ploop1_complete_grow,
+};
+
+static int __init pfmt_ploop1_mod_init(void)
+{
+	return ploop_register_format(&ploop1_delta_ops);
+}
+
+static void __exit pfmt_ploop1_mod_exit(void)
+{
+	ploop_unregister_format(&ploop1_delta_ops);
+}
+
+module_init(pfmt_ploop1_mod_init);
+module_exit(pfmt_ploop1_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/fmt_raw.c
@@ -0,0 +1,269 @@
+/*
+ *  drivers/block/ploop/fmt_raw.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+
+/* An implementation of the raw linear image format.
+ *
+ * Right now it is not quite optimal because we simulate
+ * a raw image as a ploop1-like image with dummy preallocated
+ * index tables. It is optimized only for the case when we have
+ * just one raw image without any deltas on top.
+ * Probably, this is all that we need.
+ */
+
+static int raw_stop(struct ploop_delta * delta)
+{
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+raw_compose(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	return ploop_io_init(delta, nchunks, pc);
+}
+
+static int
+raw_open(struct ploop_delta * delta)
+{
+	int err;
+	loff_t pos;
+	int cluster_log = list_empty(&delta->plo->map.delta_list) ?
+		delta->cluster_log : delta->plo->cluster_log;
+
+	err = ploop_io_open(&delta->io);
+	if (err)
+		return err;
+
+	if (delta->plo->bd_size) {
+		if (delta->plo->bd_size > (delta->io.ops->i_size_read(&delta->io) >> 9))
+			return -EINVAL;
+	} else {
+		delta->plo->bd_size = delta->io.ops->i_size_read(&delta->io) >> 9;
+	}
+
+	pos = delta->io.ops->i_size_read(&delta->io);
+	pos += (1 << (cluster_log + 9)) - 1;
+	delta->io.alloc_head = pos >> (cluster_log + 9);
+
+	/* no more allocations at all */
+	delta->flags |= PLOOP_FMT_PREALLOCATED;
+
+	return 0;
+}
+
+/*
+ * The sanity checks below assume that we can be called only by
+ * ploop_del_delta() or raw_start_merge(). Thus, there recently
+ * was a ploop1 delta above us. Adding a ploop1 delta on top
+ * of a raw delta is only supported if the raw delta is
+ * cluster-block aligned.
+ *
+ * Another assumption is that the size of the raw delta either was
+ * kept unchanged or was grown in user-space while merging.
+ */
+static int
+raw_refresh(struct ploop_delta * delta)
+{
+	loff_t pos;
+
+	pos = delta->io.ops->i_size_read(&delta->io);
+	if (pos & ((1 << (delta->plo->cluster_log + 9)) - 1)) {
+		printk("raw delta is not aligned (%llu bytes)\n", pos);
+		return -EINVAL;
+	}
+	if ((pos >> (delta->plo->cluster_log + 9)) < delta->io.alloc_head) {
+		printk("raw delta was corrupted "
+		       "(old_size=%u new_size=%llu iblocks)\n",
+		       delta->io.alloc_head,
+		       pos >> (delta->plo->cluster_log + 9));
+		return -EINVAL;
+	}
+
+	delta->io.alloc_head = pos >> (delta->plo->cluster_log + 9);
+	return 0;
+}
+
+static void
+raw_allocate(struct ploop_delta * delta, struct ploop_request * preq,
+		struct bio_list * sbl, unsigned int size)
+{
+	delta->io.ops->submit_alloc(&delta->io, preq, sbl, size);
+}
+
+int raw_map_index(struct ploop_delta * delta, unsigned long index, sector_t *sec)
+{
+	*sec = index;
+	return 1;
+}
+
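+/*
+ * Raw images carry no on-disk index, so synthesize an identity
+ * mapping: entry i resolves to iblock i (shifted into map format),
+ * PLOOP_ZERO_INDEX for block 0 (plain 0 would mean "not allocated"),
+ * and 0 for anything beyond alloc_head.
+ */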
+static void
+raw_read_index(struct ploop_delta * delta, struct ploop_request * preq,
+	       struct page * page, sector_t sec)
+{
+	int i;
+	u32 * ptr = page_address(page);
+	int skip = (sec == 0) ? PLOOP_MAP_OFFSET : 0;
+
+	for (i = skip; i < PAGE_SIZE/4; i++) {
+		if ((sec << delta->plo->cluster_log) >=
+		    (delta->io.alloc_head << delta->plo->cluster_log)) {
+			ptr[i] = 0;
+			sec++;
+		} else if (sec == 0) {
+			/* ptr[i]==0 would be interpreted as "iblock not alloced" */
+			ptr[i] = PLOOP_ZERO_INDEX;
+			sec++;
+		} else {
+			ptr[i] = sec++ << ploop_map_log(delta->plo);
+		}
+	}
+
+	ploop_complete_io_state(preq);
+}
+
+static void
+raw_destroy(struct ploop_delta * delta)
+{
+	ploop_io_destroy(&delta->io);
+}
+
+static int
+raw_start(struct ploop_delta * delta)
+{
+	return 0;
+//	return delta->io.ops->start(&delta->io);
+}
+
+static int
+raw_prepare_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	return delta->io.ops->prepare_snapshot(&delta->io, sd);
+}
+
+static int
+raw_complete_snapshot(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err = 0;
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		goto out;
+
+	err = -EIO;
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state))
+		goto out;
+
+	err = delta->io.ops->sync(&delta->io);
+	if (err)
+		goto out;
+
+	err = delta->io.ops->complete_snapshot(&delta->io, sd);
+	if (err)
+		goto out;
+
+	delta->flags |= PLOOP_FMT_RDONLY;
+	return 0;
+
+out:
+	if (sd->file) {
+		fput(sd->file);
+		sd->file = NULL;
+	}
+	return err;
+}
+
+static int
+raw_prepare_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+
+	err = delta->io.ops->prepare_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	delta->flags &= ~PLOOP_FMT_RDONLY;
+	return 0;
+}
+
+static int
+raw_start_merge(struct ploop_delta * delta, struct ploop_snapdata * sd)
+{
+	int err;
+
+	err = delta->io.ops->start_merge(&delta->io, sd);
+	if (err)
+		return err;
+
+	if (test_bit(PLOOP_S_ABORT, &delta->plo->state)) {
+		printk(KERN_WARNING "raw_start_merge for ploop%d failed "
+		       "(state ABORT)\n", delta->plo->index);
+		return -EIO;
+	}
+
+	err = raw_refresh(delta);
+	if (err)
+		return err;
+
+	return delta->io.ops->sync(&delta->io);
+}
+
+static int
+raw_prepare_grow(struct ploop_delta * delta, u64 *new_size, int *reloc)
+{
+	*new_size = (*new_size + (PAGE_SIZE >> 9) - 1) &
+		    ~((PAGE_SIZE >> 9) - 1);
+	return delta->io.ops->alloc(&delta->io,
+				    delta->plo->bd_size << 9,
+				    (*new_size - delta->plo->bd_size) << 9);
+}
+
+static struct ploop_delta_ops raw_delta_ops =
+{
+	.id		=	PLOOP_FMT_RAW,
+	.name		=	"raw",
+	.owner		=	THIS_MODULE,
+	.capability	=	PLOOP_FMT_CAP_WRITABLE|PLOOP_FMT_CAP_IDENTICAL,
+
+	.map_index	=	raw_map_index,
+	.read_index	=	raw_read_index,
+
+	.allocate	=	raw_allocate,
+
+	.compose	=	raw_compose,
+	.open		=	raw_open,
+	.destroy	=	raw_destroy,
+	.start		=	raw_start,
+	.stop		=	raw_stop,
+	.refresh	=	raw_refresh,
+	.prepare_snapshot =	raw_prepare_snapshot,
+	.complete_snapshot =	raw_complete_snapshot,
+	.prepare_merge	=	raw_prepare_merge,
+	.start_merge	=	raw_start_merge,
+	.prepare_grow	=	raw_prepare_grow,
+};
+
+static int __init pfmt_raw_mod_init(void)
+{
+	return ploop_register_format(&raw_delta_ops);
+}
+
+static void __exit pfmt_raw_mod_exit(void)
+{
+	ploop_unregister_format(&raw_delta_ops);
+}
+
+module_init(pfmt_raw_mod_init);
+module_exit(pfmt_raw_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/freeblks.c
@@ -0,0 +1,1100 @@
+/*
+ *  drivers/block/ploop/freeblks.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "freeblks.h"
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+struct ploop_freeblks_extent
+{
+	struct list_head list; /* List link */
+
+	cluster_t clu;
+	iblock_t  iblk;
+	u32	  len;
+};
+
+struct ploop_relocblks_extent
+{
+	struct list_head list; /* List link */
+
+	cluster_t clu;
+	iblock_t  iblk;
+	u32	  len;
+	u32	  free;	/* this extent is also present in freemap */
+};
+
+struct ploop_fextent_ptr {
+	struct ploop_freeblks_extent *ext;
+	u32 off;
+};
+
+struct ploop_rextent_ptr {
+	struct ploop_relocblks_extent *ext;
+	u32 off;
+};
+
+struct ploop_freeblks_desc {
+	struct ploop_device *plo;
+
+	int fbd_n_free;	       /* # free blocks remaining
+				  (i.e. "not re-used") */
+
+	/* fbd_ffb.ext->clu + fbd_ffb.off can be used as
+	 * 'clu of first free block to reuse' for WRITE ops */
+	struct ploop_fextent_ptr fbd_ffb; /* 'ffb' stands for
+					     'first free block' */
+
+	/* fbd_lfb.ext->clu + fbd_lfb.off can be used as
+	 * 'clu of first block to overwrite' (draining reloc range from end) */
+	struct ploop_fextent_ptr fbd_lfb; /* 'lfb' stands for
+					     'last free block for relocation'*/
+
+	/* fbd_reloc_extents[fbd->fbd_last_reloc_extent].clu +
+	 * fbd_last_reloc_off can be used as 'clu of first block to relocate'
+	 * (draining reloc range from end)
+	 * NB: ffb and lfb above deal with free_list, while lrb deals with
+	 * reloc_list! */
+	struct ploop_rextent_ptr fbd_lrb; /* 'lrb' stands for
+					     'last block to relocate' */
+
+	/* counters to trace the progress of relocation */
+	int fbd_n_relocated;  /* # blocks actually relocated */
+	int fbd_n_relocating; /* # blocks whose relocation was at
+				   least started */
+
+	/* lost_range: [fbd_first_lost_iblk ..
+	 *		fbd_first_lost_iblk + fbd_lost_range_len - 1] */
+	iblock_t fbd_first_lost_iblk;
+	int	 fbd_lost_range_len;
+	int	 fbd_lost_range_addon; /* :)) */
+
+	/* any reloc request resides there while it's "in progress" */
+	struct rb_root		reloc_tree;
+
+	/* list of ploop_request-s for PLOOP_REQ_ZERO ops: firstly zero index
+	 * for PLOOP_REQ_ZERO req_cluster, then schedule ordinary request
+	 * pinned to given PLOOP_REQ_ZERO request */
+	struct list_head	free_zero_list;
+
+	/* storage for free-block extents: list for now */
+	struct list_head	fbd_free_list;
+
+	/* storage for reloc-block extents: list for now */
+	struct list_head	fbd_reloc_list;
+
+	int	 fbd_freezed_level; /* for sanity - level on
+				     * PLOOP_IOC_FREEBLKS stage */
+
+	struct bio_list	fbd_dbl; /* dbl stands for 'discard bio list' */
+};
+
+int ploop_fb_get_n_relocated(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_relocated;
+}
+int ploop_fb_get_n_relocating(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_relocating;
+}
+int ploop_fb_get_n_free(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_n_free;
+}
+iblock_t ploop_fb_get_alloc_head(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_first_lost_iblk + fbd->fbd_lost_range_len;
+}
+int ploop_fb_get_lost_range_len(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lost_range_len;
+}
+iblock_t ploop_fb_get_first_lost_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_first_lost_iblk;
+}
+
+int ploop_fb_get_freezed_level(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_freezed_level;
+}
+void ploop_fb_set_freezed_level(struct ploop_freeblks_desc *fbd, int level)
+{
+	fbd->fbd_freezed_level = level;
+}
+
+void ploop_fb_add_reloc_req(struct ploop_freeblks_desc *fbd,
+			    struct ploop_request *preq)
+{
+	struct rb_node ** p;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+
+	if (fbd == NULL)
+		return;
+
+	p = &fbd->reloc_tree.rb_node;
+	while (*p) {
+		parent = *p;
+		pr = rb_entry(parent, struct ploop_request, reloc_link);
+		BUG_ON (preq->src_iblock == pr->src_iblock);
+
+		if (preq->src_iblock < pr->src_iblock)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	rb_link_node(&preq->reloc_link, parent, p);
+	rb_insert_color(&preq->reloc_link, &fbd->reloc_tree);
+}
+
+void ploop_fb_del_reloc_req(struct ploop_freeblks_desc *fbd,
+			    struct ploop_request *preq)
+{
+	BUG_ON (fbd == NULL);
+
+	rb_erase(&preq->reloc_link, &fbd->reloc_tree);
+}
+
+int ploop_fb_check_reloc_req(struct ploop_freeblks_desc *fbd,
+			     struct ploop_request *preq,
+			     unsigned long pin_state)
+{
+	struct rb_node *n;
+	struct ploop_request * p;
+
+	BUG_ON (fbd == NULL);
+	BUG_ON (preq->iblock == 0);
+	BUG_ON (preq->iblock >= fbd->fbd_first_lost_iblk);
+
+	n = fbd->reloc_tree.rb_node;
+	if (n == NULL)
+		return 0;
+
+	while (n) {
+		p = rb_entry(n, struct ploop_request, reloc_link);
+
+		if (preq->iblock < p->src_iblock)
+			n = n->rb_left;
+		else if (preq->iblock > p->src_iblock)
+			n = n->rb_right;
+		else {
+			spin_lock_irq(&fbd->plo->lock);
+			preq->eng_state = pin_state;
+			list_add_tail(&preq->list, &p->delay_list);
+			spin_unlock_irq(&fbd->plo->lock);
+			return 1;
+		}
+	}
+	return 0;
+}
+
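+/*
+ * Copy the free-extent list to userspace. If ctl->n_extents is zero,
+ * only the number of extents is reported, so the caller can size its
+ * buffer and retry; otherwise -ENOSPC is returned when the buffer is
+ * too small.
+ */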
+int ploop_fb_copy_freeblks_to_user(struct ploop_freeblks_desc *fbd, void *arg,
+				   struct ploop_freeblks_ctl *ctl)
+{
+	int   rc = 0;
+	int   n	 = 0;
+	struct ploop_freeblks_extent	 *fextent;
+	struct ploop_freeblks_ctl_extent  cext;
+
+	memset(&cext, 0, sizeof(cext));
+	list_for_each_entry(fextent, &fbd->fbd_free_list, list)
+		if (ctl->n_extents) {
+			int off = offsetof(struct ploop_freeblks_ctl,
+					   extents[n]);
+			if (n++ >= ctl->n_extents) {
+				rc = -ENOSPC;
+				break;
+			}
+
+			cext.clu  = fextent->clu;
+			cext.iblk = fextent->iblk;
+			cext.len  = fextent->len;
+
+			if (copy_to_user((u8*)arg + off, &cext,
+					 sizeof(cext))) {
+				rc = -EFAULT;
+				break;
+			}
+		} else {
+			n++;
+		}
+
+	if (!rc) {
+		ctl->n_extents = n;
+		rc = copy_to_user((void*)arg, ctl,
+				  sizeof(*ctl)) ? -EFAULT : 0;
+	}
+
+	return rc;
+}
+
+int ploop_fb_filter_freeblks(struct ploop_freeblks_desc *fbd, unsigned long minlen)
+{
+	struct ploop_freeblks_extent *fextent, *n;
+
+	list_for_each_entry_safe(fextent, n, &fbd->fbd_free_list, list)
+		if (fextent->len < minlen) {
+			list_del(&fextent->list);
+			fbd->fbd_n_free -= fextent->len;
+			kfree(fextent);
+		}
+
+	if (list_empty(&fbd->fbd_free_list))
+		fbd->fbd_ffb.ext = NULL;
+	else
+		fbd->fbd_ffb.ext = list_entry(fbd->fbd_free_list.next,
+						struct ploop_freeblks_extent,
+						list);
+	fbd->fbd_ffb.off = 0;
+
+	return fbd->fbd_n_free;
+}
+
+struct ploop_request *
+ploop_fb_get_zero_request(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_request * preq;
+
+	BUG_ON (fbd == NULL);
+	BUG_ON (list_empty(&fbd->free_zero_list));
+
+	preq = list_entry(fbd->free_zero_list.next,
+			  struct ploop_request, list);
+	list_del(&preq->list);
+	return preq;
+}
+
+void ploop_fb_put_zero_request(struct ploop_freeblks_desc *fbd,
+			       struct ploop_request *preq)
+{
+	list_add(&preq->list, &fbd->free_zero_list);
+}
+
+static iblock_t ffb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_ffb.ext->iblk + fbd->fbd_ffb.off;
+}
+static cluster_t ffb_clu(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_ffb.ext->clu + fbd->fbd_ffb.off;
+}
+static iblock_t lfb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lfb.ext->iblk + fbd->fbd_lfb.off;
+}
+static cluster_t lfb_clu(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lfb.ext->clu + fbd->fbd_lfb.off;
+}
+static iblock_t lrb_iblk(struct ploop_freeblks_desc *fbd)
+{
+	return fbd->fbd_lrb.ext->iblk + fbd->fbd_lrb.off;
+}
+
+static iblock_t get_first_reloc_iblk(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_relocblks_extent *r_extent;
+
+	BUG_ON (list_empty(&fbd->fbd_reloc_list));
+	r_extent = list_entry(fbd->fbd_reloc_list.next,
+			      struct ploop_relocblks_extent, list);
+	return r_extent->iblk;
+}
+
+static void advance_ffb_simple(struct ploop_freeblks_desc *fbd)
+{
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+
+	if (fbd->fbd_ffb.off < fbd->fbd_ffb.ext->len - 1) {
+		fbd->fbd_ffb.off++;
+	} else {
+		if (fbd->fbd_ffb.ext->list.next == &fbd->fbd_free_list)
+			fbd->fbd_ffb.ext = NULL;
+		else
+			fbd->fbd_ffb.ext = list_entry(fbd->fbd_ffb.ext->list.next,
+						      struct ploop_freeblks_extent,
+						      list);
+		fbd->fbd_ffb.off = 0;
+	}
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate ffb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+	}
+}
+
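+/*
+ * Step the 'last block to relocate' pointer one block towards the
+ * beginning of the reloc list. The block consumed (plus any gap
+ * 'skip' between extents) joins the lost range at the end of the
+ * image: fbd_first_lost_iblk decreases and fbd_lost_range_len grows
+ * by the same amount.
+ */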
+static void advance_lrb(struct ploop_freeblks_desc *fbd)
+{
+	iblock_t skip = 0;
+	BUG_ON (fbd->fbd_lrb.ext == NULL);
+
+	if (likely(fbd->fbd_lrb.off)) {
+		fbd->fbd_lrb.off--;
+	} else {
+		struct ploop_relocblks_extent *r_extent = fbd->fbd_lrb.ext;
+		/* here 'skip' means: [new_lrb_ext]<--skip-->[r_extent] */
+
+		if (fbd->fbd_lrb.ext->list.prev == &fbd->fbd_reloc_list) {
+			BUG_ON (fbd->fbd_lost_range_addon < 0);
+			skip = fbd->fbd_lost_range_addon;
+			fbd->fbd_lrb.ext = NULL;
+		} else {
+			fbd->fbd_lrb.ext = list_entry(fbd->fbd_lrb.ext->list.prev,
+						      struct ploop_relocblks_extent,
+						      list);
+			fbd->fbd_lrb.off = fbd->fbd_lrb.ext->len - 1;
+			BUG_ON (r_extent->iblk < fbd->fbd_lrb.ext->iblk +
+						 fbd->fbd_lrb.ext->len);
+			skip = r_extent->iblk - (fbd->fbd_lrb.ext->iblk +
+						 fbd->fbd_lrb.ext->len);
+		}
+	}
+
+	fbd->fbd_first_lost_iblk -= 1 + skip;
+	fbd->fbd_lost_range_len	 += 1 + skip;
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate ffb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+	}
+
+	BUG_ON(fbd->fbd_n_free <= 0);
+	fbd->fbd_n_free--;
+}
+
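+/*
+ * Split a free-block extent at offset *off_p: the tail starting at
+ * *off_p + 1 becomes a new extent inserted right after the original,
+ * while the original is shortened to *off_p blocks.
+ */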
+static int split_fb_extent(struct ploop_freeblks_extent *extent, u32 *off_p,
+			   struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_freeblks_extent *new_extent;
+
+	new_extent = kzalloc(sizeof(*new_extent), GFP_KERNEL);
+	if (new_extent == NULL) {
+		printk("Can't allocate new freeblks extent for splitting!\n");
+		return -ENOMEM;
+	}
+
+	new_extent->clu	 = extent->clu	+ *off_p + 1;
+	new_extent->iblk = extent->iblk + *off_p + 1;
+	new_extent->len	 = extent->len	- *off_p - 1;
+
+	extent->len  = *off_p;
+
+	list_add(&new_extent->list, &extent->list);
+
+	(*off_p)--;
+	return 0;
+}
+
+static int advance_lfb_left(struct ploop_freeblks_desc *fbd)
+{
+	int rc = 0;
+	struct ploop_freeblks_extent *lfb_ext = fbd->fbd_lfb.ext;
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (lfb_ext == NULL);
+	BUG_ON (ffb_iblk(fbd) > lfb_iblk(fbd));
+
+	if (ffb_iblk(fbd) == lfb_iblk(fbd)) {
+		/* invalidate lfb */
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+		advance_ffb_simple(fbd);
+		return 0;
+	}
+
+	if (fbd->fbd_lfb.off) {
+		if (fbd->fbd_lfb.off == lfb_ext->len - 1) {
+			lfb_ext->len--;
+			fbd->fbd_lfb.off--;
+		} else {
+			rc = split_fb_extent(lfb_ext, &fbd->fbd_lfb.off, fbd);
+		}
+	} else {
+		BUG_ON (lfb_ext->list.prev == &fbd->fbd_free_list);
+		BUG_ON (lfb_ext == fbd->fbd_ffb.ext);
+
+		lfb_ext->clu++;
+		lfb_ext->iblk++;
+		lfb_ext->len--;
+
+		fbd->fbd_lfb.ext = list_entry(lfb_ext->list.prev,
+					      struct ploop_freeblks_extent,
+					      list);
+		fbd->fbd_lfb.off = fbd->fbd_lfb.ext->len - 1;
+
+		if (lfb_ext->len == 0) {
+			list_del(&lfb_ext->list);
+			kfree(lfb_ext);
+		}
+	}
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (fbd->fbd_lfb.ext == NULL);
+	BUG_ON (lfb_iblk(fbd) < ffb_iblk(fbd));
+	return rc;
+}
+
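+/* Pick the next block to relocate (taken from the tail of the reloc
+ * range) and, unless that block is itself free, a free block to relocate
+ * it into.  Returns 0 on success, -1 when relocation cannot proceed.
+ * *free_p != 0 means the source block only needs its index zeroed; in
+ * that case *to_clu_p/*to_iblk_p are set to ~0U. */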
+int ploop_fb_get_reloc_block(struct ploop_freeblks_desc *fbd,
+			     cluster_t *from_clu_p, iblock_t *from_iblk_p,
+			     cluster_t *to_clu_p, iblock_t *to_iblk_p,
+			     u32 *free_p)
+{
+	cluster_t from_clu, to_clu;
+	iblock_t  from_iblk, to_iblk;
+	u32 free;
+	struct ploop_relocblks_extent *r_extent;
+
+	if (!fbd)
+		return -1;
+
+	/* Is the whole range drained already? */
+	r_extent = fbd->fbd_lrb.ext;
+	if (r_extent == NULL)
+		return -1;
+
+	BUG_ON (fbd->fbd_lrb.off >= r_extent->len);
+
+	from_clu  = r_extent->clu  + fbd->fbd_lrb.off;
+	from_iblk = r_extent->iblk + fbd->fbd_lrb.off;
+	free	  = r_extent->free;
+
+	/* from_iblk is in the range to relocate, but it's marked as free.
+	 * This means we only need to zero its index; no actual relocation
+	 * is needed. Such an operation doesn't consume the free block that
+	 * fbd_lfb (the last free block) refers to */
+	if (free) {
+		/* Was the block we're going to zero-index already re-used? */
+		if (fbd->fbd_ffb.ext == NULL || ffb_iblk(fbd) > from_iblk)
+			return -1;
+
+		BUG_ON (fbd->fbd_ffb.off  >= fbd->fbd_ffb.ext->len);
+
+		to_iblk = ~0U;
+		to_clu	= ~0U;
+	} else {
+		/* Have we run out of free blocks that can be used as the
+		 * destination for relocation? */
+		if (fbd->fbd_lfb.ext == NULL)
+			return -1;
+
+		BUG_ON (fbd->fbd_ffb.ext == NULL);
+		BUG_ON (fbd->fbd_ffb.off  >= fbd->fbd_ffb.ext->len);
+		BUG_ON (fbd->fbd_lfb.off  >= fbd->fbd_lfb.ext->len);
+		BUG_ON (ffb_iblk(fbd) > lfb_iblk(fbd));
+
+		to_clu	= lfb_clu(fbd);
+		to_iblk = lfb_iblk(fbd);
+
+		if (advance_lfb_left(fbd)) {
+			/* Error implies stopping relocation */
+			fbd->fbd_lrb.ext = NULL;
+			fbd->fbd_lrb.off = 0;
+			return -1;
+		}
+	}
+
+	/* consume one block from the end of reloc list */
+	advance_lrb(fbd);
+
+	fbd->fbd_n_relocating++;
+
+	*from_clu_p  = from_clu;
+	*from_iblk_p = from_iblk;
+	*to_clu_p    = to_clu;
+	*to_iblk_p   = to_iblk;
+	*free_p	     = free;
+	return 0;
+}
+
+void ploop_fb_relocate_req_completed(struct ploop_freeblks_desc *fbd)
+{
+	fbd->fbd_n_relocated++;
+}
+
+static void advance_lfb_right(struct ploop_freeblks_desc *fbd)
+{
+	iblock_t iblk = get_first_reloc_iblk(fbd);
+
+	if (fbd->fbd_lfb.off < fbd->fbd_lfb.ext->len - 1) {
+		if (fbd->fbd_lfb.ext->iblk + fbd->fbd_lfb.off + 1 < iblk) {
+			fbd->fbd_lfb.off++;
+		}
+	} else if (fbd->fbd_lfb.ext->list.next != &fbd->fbd_free_list) {
+		struct ploop_freeblks_extent *f_extent;
+		f_extent = list_entry(fbd->fbd_lfb.ext->list.next,
+				      struct ploop_freeblks_extent,
+				      list);
+		if (f_extent->iblk < iblk) {
+			fbd->fbd_lfb.ext = f_extent;
+			fbd->fbd_lfb.off = 0;
+		}
+	}
+
+	/* invalidating ffb always implies invalidating lfb */
+	BUG_ON (fbd->fbd_ffb.ext == NULL && fbd->fbd_lfb.ext != NULL);
+
+	/* caller has just advanced ffb, but we must keep lfb intact
+	 * if the next free block (the one following lfb) is in the
+	 * reloc range */
+	if (fbd->fbd_ffb.ext != NULL && fbd->fbd_lfb.ext != NULL &&
+	    lfb_iblk(fbd) < ffb_iblk(fbd)) {
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+	}
+}
+
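+/* Shrink the reloc range by one block from its head.  When a whole head
+ * extent is dropped, fbd_lost_range_addon records the gap up to the next
+ * extent, which advance_lrb() later folds into the lost range. */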
+static void trim_reloc_list_one_blk(struct ploop_freeblks_desc *fbd)
+{
+	struct ploop_relocblks_extent *r_extent_first;
+	iblock_t iblk = lrb_iblk(fbd);
+	int invalidate = 0;
+
+	BUG_ON (list_empty(&fbd->fbd_reloc_list));
+	r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+				    struct ploop_relocblks_extent, list);
+
+	if (r_extent_first->len > 1) {
+		fbd->fbd_lost_range_addon = 0;
+		r_extent_first->iblk++;
+		r_extent_first->clu++;
+		r_extent_first->len--;
+		if (iblk < r_extent_first->iblk) {
+			invalidate = 1;
+		} else if (r_extent_first == fbd->fbd_lrb.ext) {
+			BUG_ON (fbd->fbd_lrb.off == 0);
+			fbd->fbd_lrb.off--;
+		}
+	} else {
+		if (r_extent_first == fbd->fbd_lrb.ext) {
+			invalidate = 1;
+		} else {
+			struct ploop_relocblks_extent *r_extent;
+			BUG_ON (r_extent_first->list.next ==
+				&fbd->fbd_reloc_list);
+			r_extent = list_entry(r_extent_first->list.next,
+					      struct ploop_relocblks_extent,
+					      list);
+			fbd->fbd_lost_range_addon = r_extent->iblk -
+				(r_extent_first->iblk + r_extent_first->len);
+		}
+		list_del(&r_extent_first->list);
+		kfree(r_extent_first);
+	}
+
+	if (invalidate) {
+		/* invalidate both lfb and lrb */
+		fbd->fbd_lrb.ext = NULL;
+		fbd->fbd_lrb.off = 0;
+		if (fbd->fbd_lfb.ext != NULL) {
+			fbd->fbd_lfb.ext = NULL;
+			fbd->fbd_lfb.off = 0;
+		}
+	}
+}
+
+static void advance_ffb(struct ploop_freeblks_desc *fbd)
+{
+	BUG_ON (fbd->fbd_ffb.ext == NULL);
+	BUG_ON (fbd->fbd_lfb.ext != NULL && ffb_iblk(fbd) > lfb_iblk(fbd));
+
+	if (fbd->fbd_ffb.off < fbd->fbd_ffb.ext->len - 1) {
+		fbd->fbd_ffb.off++;
+	} else {
+		if (fbd->fbd_ffb.ext->list.next == &fbd->fbd_free_list) {
+			BUG_ON (fbd->fbd_lfb.ext != NULL &&
+				ffb_iblk(fbd) != lfb_iblk(fbd));
+			fbd->fbd_ffb.ext = NULL;
+		} else {
+			fbd->fbd_ffb.ext = list_entry(fbd->fbd_ffb.ext->list.next,
+						      struct ploop_freeblks_extent,
+						      list);
+		}
+		fbd->fbd_ffb.off = 0;
+	}
+
+	if (fbd->fbd_ffb.ext == NULL && fbd->fbd_lfb.ext != NULL) {
+		/* invalidate lfb */
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+		return;
+	}
+
+	if (fbd->fbd_ffb.ext != NULL &&
+	    ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk) {
+		/* invalidate both ffb and lfb */
+		fbd->fbd_ffb.ext = NULL;
+		fbd->fbd_ffb.off = 0;
+		fbd->fbd_lfb.ext = NULL;
+		fbd->fbd_lfb.off = 0;
+	}
+
+	/* nothing to do anymore if relocation process is completed */
+	if (fbd->fbd_lrb.ext == NULL)
+		return;
+
+	trim_reloc_list_one_blk(fbd);
+
+	/* trim could invalidate both lrb and lfb */
+	if (fbd->fbd_lrb.ext == NULL || fbd->fbd_lfb.ext == NULL)
+		return;
+
+	advance_lfb_right(fbd);
+}
+
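+/* Hand out a block for reuse.  Semantics, as inferred from the code:
+ * returns -1 if nothing is available, 0 for a block taken from the lost
+ * range (only *iblk is set), and 1 for an ordinary free block returned
+ * via *clu/*iblk. */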
+int ploop_fb_get_free_block(struct ploop_freeblks_desc *fbd,
+			    cluster_t *clu, iblock_t *iblk)
+{
+	if (!fbd)
+		return -1;
+
+	if (fbd->fbd_ffb.ext == NULL) {
+		BUG_ON (fbd->fbd_lfb.ext != NULL);
+		BUG_ON (fbd->fbd_lost_range_len < 0);
+
+		if (fbd->fbd_lost_range_len == 0)
+			return -1;
+
+		*iblk = fbd->fbd_first_lost_iblk++;
+		fbd->fbd_lost_range_len--;
+
+		if (fbd->fbd_lrb.ext != NULL) {
+			/* stop relocation process */
+			fbd->fbd_lrb.ext = NULL;
+			fbd->fbd_lrb.off = 0;
+		}
+
+		return 0;
+	}
+
+	BUG_ON (ffb_iblk(fbd) >= fbd->fbd_first_lost_iblk);
+	BUG_ON (fbd->fbd_n_free <= 0);
+
+	*clu  = ffb_clu(fbd);
+	*iblk = ffb_iblk(fbd);
+	fbd->fbd_n_free--;
+
+	if (fbd->plo->maintenance_type == PLOOP_MNTN_RELOC)
+		advance_ffb(fbd);
+	else
+		advance_ffb_simple(fbd);
+
+	BUG_ON (fbd->fbd_ffb.ext == NULL && fbd->fbd_n_free != 0);
+	BUG_ON (fbd->fbd_ffb.ext != NULL && fbd->fbd_n_free == 0);
+
+	return 1;
+}
+
+static void fbd_complete_bio(struct ploop_freeblks_desc *fbd, int err)
+{
+	struct ploop_device *plo = fbd->plo;
+	unsigned int nr_completed = 0;
+
+	while (fbd->fbd_dbl.head) {
+		struct bio * bio = fbd->fbd_dbl.head;
+		fbd->fbd_dbl.head = bio->bi_next;
+		bio->bi_next = NULL;
+		BIO_ENDIO(plo->queue, bio, err);
+		nr_completed++;
+	}
+	fbd->fbd_dbl.tail = NULL;
+
+	spin_lock_irq(&plo->lock);
+	plo->bio_total -= nr_completed;
+	if (!bio_list_empty(&plo->bio_discard_list) &&
+	    waitqueue_active(&plo->waitq))
+		wake_up_interruptible(&plo->waitq);
+	spin_unlock_irq(&plo->lock);
+}
+
+void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err)
+{
+	fbd_complete_bio(fbd, err);
+
+	while (!list_empty(&fbd->fbd_free_list)) {
+		struct ploop_freeblks_extent *fblk_extent;
+
+		fblk_extent = list_first_entry(&fbd->fbd_free_list,
+					       struct ploop_freeblks_extent,
+					       list);
+		list_del(&fblk_extent->list);
+		kfree(fblk_extent);
+	}
+
+	while (!list_empty(&fbd->fbd_reloc_list)) {
+		struct ploop_relocblks_extent *rblk_extent;
+
+		rblk_extent = list_first_entry(&fbd->fbd_reloc_list,
+					       struct ploop_relocblks_extent,
+					       list);
+		list_del(&rblk_extent->list);
+		kfree(rblk_extent);
+	}
+
+	fbd->fbd_n_free = 0;
+	fbd->fbd_ffb.ext = NULL;
+	fbd->fbd_lfb.ext = NULL;
+	fbd->fbd_lrb.ext = NULL;
+	fbd->fbd_ffb.off = 0;
+	fbd->fbd_lfb.off = 0;
+	fbd->fbd_lrb.off = 0;
+	fbd->fbd_n_relocated = fbd->fbd_n_relocating = 0;
+	fbd->fbd_lost_range_len = 0;
+	fbd->fbd_lost_range_addon = 0;
+
+	BUG_ON(!RB_EMPTY_ROOT(&fbd->reloc_tree));
+}
+
+struct ploop_freeblks_desc *ploop_fb_init(struct ploop_device *plo)
+{
+	struct ploop_freeblks_desc *fbd;
+	int i;
+
+	fbd = kmalloc(sizeof(struct ploop_freeblks_desc), GFP_KERNEL);
+	if (fbd == NULL)
+		return NULL;
+
+	fbd->fbd_dbl.tail = fbd->fbd_dbl.head = NULL;
+	INIT_LIST_HEAD(&fbd->fbd_free_list);
+	INIT_LIST_HEAD(&fbd->fbd_reloc_list);
+	fbd->reloc_tree = RB_ROOT;
+	fbd->fbd_freezed_level = -1;
+
+	fbd->plo = plo;
+
+	ploop_fb_reinit(fbd, 0);
+
+	INIT_LIST_HEAD(&fbd->free_zero_list);
+	for (i = 0; i < plo->tune.max_requests; i++) {
+		struct ploop_request * preq;
+		preq = kzalloc(sizeof(struct ploop_request), GFP_KERNEL);
+		if (preq == NULL)
+			goto fb_init_failed;
+
+		preq->plo = plo;
+		INIT_LIST_HEAD(&preq->delay_list);
+		list_add(&preq->list, &fbd->free_zero_list);
+	}
+
+	return fbd;
+
+fb_init_failed:
+	ploop_fb_fini(fbd, -ENOMEM);
+	return NULL;
+}
+
+void ploop_fb_fini(struct ploop_freeblks_desc *fbd, int err)
+{
+	struct ploop_device *plo;
+
+	if (fbd == NULL)
+		return;
+
+	plo = fbd->plo;
+	BUG_ON (plo == NULL);
+
+	fbd_complete_bio(fbd, err);
+
+	while (!list_empty(&fbd->fbd_free_list)) {
+		struct ploop_freeblks_extent *fblk_extent;
+
+		fblk_extent = list_first_entry(&fbd->fbd_free_list,
+					       struct ploop_freeblks_extent,
+					       list);
+		list_del(&fblk_extent->list);
+		kfree(fblk_extent);
+	}
+
+	while (!list_empty(&fbd->fbd_reloc_list)) {
+		struct ploop_relocblks_extent *rblk_extent;
+
+		rblk_extent = list_first_entry(&fbd->fbd_reloc_list,
+					       struct ploop_relocblks_extent,
+					       list);
+		list_del(&rblk_extent->list);
+		kfree(rblk_extent);
+	}
+
+	while (!list_empty(&fbd->free_zero_list)) {
+		struct ploop_request * preq;
+
+		preq = list_first_entry(&fbd->free_zero_list,
+					struct ploop_request,
+					list);
+		list_del(&preq->list);
+		kfree(preq);
+	}
+
+	kfree(fbd);
+	plo->fbd = NULL;
+}
+
+int ploop_fb_add_free_extent(struct ploop_freeblks_desc *fbd,
+			     cluster_t clu, iblock_t iblk, u32 len)
+{
+	struct ploop_freeblks_extent *fblk_extent;
+	struct ploop_freeblks_extent *ex;
+
+	if (len == 0) {
+		printk("ploop_fb_add_free_extent(): empty extent! (%u/%u)\n",
+		       clu, iblk);
+		return 0;
+	}
+
+	list_for_each_entry_reverse(ex, &fbd->fbd_free_list, list)
+		if (ex->iblk < iblk)
+			break;
+
+	if (ex->list.next != &fbd->fbd_free_list) {
+		struct ploop_freeblks_extent *tmp;
+		tmp = list_entry(ex->list.next, struct ploop_freeblks_extent, list);
+
+		if (iblk + len > tmp->iblk) {
+			printk("ploop_fb_add_free_extent(): intersected extents");
+			return -EINVAL;
+		}
+	}
+
+	if (&ex->list != &fbd->fbd_free_list) {
+		if (ex->iblk + ex->len > iblk) {
+			printk("ploop_fb_add_free_extent(): intersected extents");
+			return -EINVAL;
+		}
+	}
+
+	fblk_extent = kzalloc(sizeof(*fblk_extent), GFP_KERNEL);
+	if (fblk_extent == NULL)
+		return -ENOMEM;
+
+	fblk_extent->clu  = clu;
+	fblk_extent->iblk = iblk;
+	fblk_extent->len  = len;
+
+	list_add(&fblk_extent->list, &ex->list);
+
+	fbd->fbd_n_free	 += len;
+
+	fbd->fbd_ffb.ext = list_entry(fbd->fbd_free_list.next,
+				      struct ploop_freeblks_extent, list);
+	fbd->fbd_ffb.off = 0;
+
+	return 0;
+}
+
+int ploop_fb_add_reloc_extent(struct ploop_freeblks_desc *fbd,
+			      cluster_t clu, iblock_t iblk, u32 len, u32 free)
+{
+	struct ploop_relocblks_extent *rblk_extent;
+
+	if (len == 0) {
+		printk("ploop_fb_add_reloc_extent(): empty extent! (%u/%u)\n",
+		       clu, iblk);
+		return 0;
+	}
+
+	if (!list_empty(&fbd->fbd_reloc_list)) {
+		rblk_extent = list_entry(fbd->fbd_reloc_list.prev,
+					 struct ploop_relocblks_extent, list);
+		if (rblk_extent->iblk + rblk_extent->len > iblk) {
+			printk("ploop_fb_add_reloc_extent(): extents should be sorted");
+			return -EINVAL;
+		}
+
+		if (rblk_extent->list.next != &fbd->fbd_reloc_list) {
+			rblk_extent = list_entry(rblk_extent->list.next,
+					 struct ploop_relocblks_extent, list);
+			if (iblk + len > rblk_extent->iblk) {
+				printk("ploop_fb_add_reloc_extent(): intersected extents");
+				return -EINVAL;
+			}
+		}
+	}
+
+	rblk_extent = kzalloc(sizeof(*rblk_extent), GFP_KERNEL);
+	if (rblk_extent == NULL)
+		return -ENOMEM;
+
+	rblk_extent->clu  = clu;
+	rblk_extent->iblk = iblk;
+	rblk_extent->len  = len;
+	rblk_extent->free = free;
+
+	list_add_tail(&rblk_extent->list, &fbd->fbd_reloc_list);
+
+	return 0;
+}
+
+void ploop_fb_lost_range_init(struct ploop_freeblks_desc *fbd,
+			      iblock_t first_lost_iblk)
+{
+	fbd->fbd_first_lost_iblk = first_lost_iblk;
+	fbd->fbd_lost_range_len = 0;
+}
+
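+/* Called when relocation starts.  Computes where the alloc head (a_h)
+ * will land once all reloc blocks are freed or relocated (new_a_h),
+ * accounts the tail above new_a_h as the "lost range", trims reloc
+ * extents below new_a_h, and positions the lrb/lfb cursors.
+ * ploop-balloon scanned exactly [a_h - n_scanned .. a_h - 1]. */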
+void ploop_fb_relocation_start(struct ploop_freeblks_desc *fbd,
+			       __u32 n_scanned)
+{
+	iblock_t a_h = fbd->fbd_first_lost_iblk;
+	iblock_t new_a_h; /* where a_h will be after relocation
+			     if no WRITEs intervene */
+	struct ploop_relocblks_extent *r_extent;
+	struct ploop_relocblks_extent *r_extent_first;
+	int n_free = fbd->fbd_n_free;
+	u32 l;
+	struct ploop_freeblks_extent *fextent;
+
+	BUG_ON(fbd->fbd_lost_range_len != 0);
+	if (list_empty(&fbd->fbd_reloc_list)) {
+		fbd->fbd_first_lost_iblk -= n_scanned;
+		fbd->fbd_lost_range_len	 += n_scanned;
+		return;
+	}
+
+	r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+				    struct ploop_relocblks_extent, list);
+	r_extent = list_entry(fbd->fbd_reloc_list.prev,
+			      struct ploop_relocblks_extent, list);
+	new_a_h = r_extent->iblk + r_extent->len;
+
+	BUG_ON(fbd->fbd_first_lost_iblk < new_a_h);
+	fbd->fbd_lost_range_len = fbd->fbd_first_lost_iblk - new_a_h;
+	fbd->fbd_first_lost_iblk = new_a_h;
+
+	if (!n_free)
+		return;
+
+	while (1) {
+		l = MIN(n_free, r_extent->len);
+
+		n_free	-= l;
+		new_a_h -= l;
+
+		if (!n_free)
+			break;
+
+		if (r_extent->list.prev == &fbd->fbd_reloc_list) {
+			r_extent = NULL;
+			break;
+		} else {
+			r_extent = list_entry(r_extent->list.prev,
+					      struct ploop_relocblks_extent,
+					      list);
+		}
+		/* skip lost blocks */
+		new_a_h = r_extent->iblk + r_extent->len;
+	}
+
+	l = 0;
+
+	/* ploop-balloon scanned exactly range [a_h - n_scanned .. a_h - 1] */
+	if (n_free) {
+		l = r_extent_first->iblk - (a_h - n_scanned);
+	} else if (r_extent->iblk == new_a_h) {
+		if (r_extent == r_extent_first) {
+			l = r_extent->iblk - (a_h - n_scanned);
+		} else {
+			struct ploop_relocblks_extent *r_extent_prev;
+
+			BUG_ON (r_extent->list.prev == &fbd->fbd_reloc_list);
+			r_extent_prev = list_entry(r_extent->list.prev,
+						   struct ploop_relocblks_extent,
+						   list);
+			l = r_extent->iblk - (r_extent_prev->iblk +
+					      r_extent_prev->len);
+		}
+	}
+
+	new_a_h -= l;
+
+	/* let's trim reloc_list a bit based on new_a_h */
+	while (r_extent_first->iblk < new_a_h) {
+
+		if (r_extent_first->iblk + r_extent_first->len > new_a_h) {
+			l = new_a_h - r_extent_first->iblk;
+			r_extent_first->iblk += l;
+			r_extent_first->clu  += l;
+			r_extent_first->len  -= l;
+			break;
+		}
+
+		if (r_extent_first->list.next == &fbd->fbd_reloc_list) {
+			list_del(&r_extent_first->list);
+			kfree(r_extent_first);
+			break;
+		}
+
+		list_del(&r_extent_first->list);
+		kfree(r_extent_first);
+		r_extent_first = list_entry(fbd->fbd_reloc_list.next,
+					    struct ploop_relocblks_extent,
+					    list);
+	}
+
+	if (!list_empty(&fbd->fbd_reloc_list)) {
+		fbd->fbd_lrb.ext = list_entry(fbd->fbd_reloc_list.prev,
+					      struct ploop_relocblks_extent,
+					      list);
+		fbd->fbd_lrb.off = fbd->fbd_lrb.ext->len - 1;
+
+		fbd->fbd_lost_range_addon = r_extent_first->iblk - new_a_h;
+	}
+
+	/* new_a_h is calculated. now, let's find "last free block" position */
+	if (ffb_iblk(fbd) < new_a_h) {
+		list_for_each_entry_reverse(fextent, &fbd->fbd_free_list, list)
+			if (fextent->iblk < new_a_h)
+				break;
+
+		BUG_ON(&fextent->list == &fbd->fbd_free_list);
+	} else
+		fextent = NULL;
+
+	fbd->fbd_lfb.ext = fextent; /* NULL means
+				       "no free blocks for relocation" */
+	if (fextent != NULL)
+		fbd->fbd_lfb.off = MIN(new_a_h - fextent->iblk,
+				       fextent->len) - 1;
+}
+
+int ploop_discard_add_bio(struct ploop_freeblks_desc *fbd, struct bio *bio)
+{
+	struct ploop_device *plo;
+
+	if (!fbd)
+		return -EOPNOTSUPP;
+
+	plo = fbd->plo;
+
+	if (!test_bit(PLOOP_S_DISCARD, &plo->state))
+		return -EOPNOTSUPP;
+	if (fbd->plo->maintenance_type != PLOOP_MNTN_DISCARD)
+		return -EBUSY;
+	/* only one request can be processed simultaneously */
+	if (fbd->fbd_dbl.head)
+		return -EBUSY;
+
+	fbd->fbd_dbl.head = fbd->fbd_dbl.tail = bio;
+
+	return 0;
+}
+
+int ploop_discard_is_inprogress(struct ploop_freeblks_desc *fbd)
+{
+	return fbd && fbd->fbd_dbl.head != NULL;
+}
--- /dev/null
+++ b/drivers/block/ploop/freeblks.h
@@ -0,0 +1,58 @@
+/*
+ *  drivers/block/ploop/freeblks.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __FREEBLKS_H__
+#define __FREEBLKS_H__
+
+/* freeblks API - in-kernel balloon support */
+
+/* init/fini stuff */
+struct ploop_freeblks_desc *ploop_fb_init(struct ploop_device *plo);
+void ploop_fb_fini(struct ploop_freeblks_desc *fbd, int err);
+void ploop_fb_reinit(struct ploop_freeblks_desc *fbd, int err);
+int ploop_fb_add_free_extent(struct ploop_freeblks_desc *fbd, cluster_t clu, iblock_t iblk, u32 len);
+int ploop_fb_add_reloc_extent(struct ploop_freeblks_desc *fbd, cluster_t clu, iblock_t iblk, u32 len, u32 free);
+void ploop_fb_lost_range_init(struct ploop_freeblks_desc *fbd, iblock_t first_lost_iblk);
+void ploop_fb_relocation_start(struct ploop_freeblks_desc *fbd, __u32 n_scanned);
+int ploop_discard_add_bio(struct ploop_freeblks_desc *fbd, struct bio *bio);
+int ploop_discard_is_inprogress(struct ploop_freeblks_desc *fbd);
+
+/* avoid direct access to freeblks internals */
+int ploop_fb_get_n_relocated(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_n_relocating(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_n_free(struct ploop_freeblks_desc *fbd);
+iblock_t ploop_fb_get_alloc_head(struct ploop_freeblks_desc *fbd);
+int ploop_fb_get_lost_range_len(struct ploop_freeblks_desc *fbd);
+iblock_t ploop_fb_get_first_lost_iblk(struct ploop_freeblks_desc *fbd);
+
+/* get/set freezed level (for sanity checks) */
+int ploop_fb_get_freezed_level(struct ploop_freeblks_desc *fbd);
+void ploop_fb_set_freezed_level(struct ploop_freeblks_desc *fbd, int level);
+
+/* maintain rb-tree of "in progress" relocation requests */
+void ploop_fb_add_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+void ploop_fb_del_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+int ploop_fb_check_reloc_req(struct ploop_freeblks_desc *fbd, struct ploop_request *preq, unsigned long pin_state);
+
+/* helper for ioctl(PLOOP_IOC_FBGET) */
+int ploop_fb_copy_freeblks_to_user(struct ploop_freeblks_desc *fbd, void *arg,
+				   struct ploop_freeblks_ctl *ctl);
+int ploop_fb_filter_freeblks(struct ploop_freeblks_desc *fbd, unsigned long minlen);
+
+/* get/put "zero index" request */
+struct ploop_request *ploop_fb_get_zero_request(struct ploop_freeblks_desc *fbd);
+void ploop_fb_put_zero_request(struct ploop_freeblks_desc *fbd, struct ploop_request *preq);
+
+/* get/put block to relocate */
+int ploop_fb_get_reloc_block(struct ploop_freeblks_desc *fbd, cluster_t *from_clu, iblock_t *from_iblk,
+			     cluster_t *to_clu, iblock_t *to_iblk, u32 *free);
+void ploop_fb_relocate_req_completed(struct ploop_freeblks_desc *fbd);
+
+/* get free block to reuse */
+int ploop_fb_get_free_block(struct ploop_freeblks_desc *fbd, cluster_t *clu, iblock_t *iblk);
+
+#endif
--- /dev/null
+++ b/drivers/block/ploop/io.c
@@ -0,0 +1,150 @@
+/*
+ *  drivers/block/ploop/io.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+
+#include <linux/ploop/ploop.h>
+#include <linux/ploop/ploop_if.h>
+
+/* Generic IO routines. */
+
+static LIST_HEAD(ploop_ios);
+static DEFINE_MUTEX(ploop_ios_mutex);
+
+int ploop_register_io(struct ploop_io_ops * ops)
+{
+	mutex_lock(&ploop_ios_mutex);
+	list_add(&ops->list, &ploop_ios);
+	mutex_unlock(&ploop_ios_mutex);
+	return 0;
+}
+EXPORT_SYMBOL(ploop_register_io);
+
+void ploop_unregister_io(struct ploop_io_ops * ops)
+{
+	mutex_lock(&ploop_ios_mutex);
+	list_del(&ops->list);
+	mutex_unlock(&ploop_ios_mutex);
+}
+EXPORT_SYMBOL(ploop_unregister_io);
+
+static struct ploop_io_ops * ploop_io_get(struct ploop_io *io, unsigned int id)
+{
+	struct ploop_io_ops * ops;
+
+	mutex_lock(&ploop_ios_mutex);
+	list_for_each_entry(ops, &ploop_ios, list) {
+		if ((id == ops->id || id == PLOOP_IO_AUTO) &&
+		    !ops->autodetect(io) && try_module_get(ops->owner)) {
+			mutex_unlock(&ploop_ios_mutex);
+			return ops;
+		}
+	}
+	mutex_unlock(&ploop_ios_mutex);
+	return NULL;
+}
+
+void ploop_io_put(struct ploop_io_ops * ops)
+{
+	module_put(ops->owner);
+}
+
+int
+ploop_io_init(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc)
+{
+	int err;
+
+	if (nchunks != 1)
+		return -EINVAL;
+
+	if (pc[0].pctl_offset ||
+	    pc[0].pctl_start ||
+	    pc[0].pctl_len)
+		return -EINVAL;
+
+	memset(&delta->io, 0, sizeof(struct ploop_io));
+	delta->io.plo = delta->plo;
+	delta->io.n_chunks = 1;
+
+	err = -EBADF;
+	delta->io.files.file = fget(pc[0].pctl_fd);
+	if (!delta->io.files.file)
+		goto out_err;
+
+	err = -EOPNOTSUPP;
+	delta->io.ops = ploop_io_get(&delta->io, pc[0].pctl_type);
+	if (delta->io.ops == NULL)
+		goto out_err;
+
+	err = delta->io.ops->init(&delta->io);
+	if (err)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	if (delta->io.files.file)
+		fput(delta->io.files.file);
+	delta->io.files.file = NULL;
+	if (delta->io.ops)
+		ploop_io_put(delta->io.ops);
+	delta->io.ops = NULL;
+	return err;
+}
+EXPORT_SYMBOL(ploop_io_init);
+
+int ploop_io_open(struct ploop_io * io)
+{
+	struct file * file;
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+	if ((file = io->files.file) == NULL)
+		return -EBADF;
+
+	if ((delta->flags & PLOOP_FMT_RDONLY) &&
+	    (io->ops->f_mode(io) & FMODE_WRITE))
+		return -EINVAL;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY) &&
+	    !(io->ops->f_mode(io) & FMODE_WRITE))
+		return -EINVAL;
+
+	return io->ops->open(io);
+}
+EXPORT_SYMBOL(ploop_io_open);
+
+void ploop_io_destroy(struct ploop_io * io)
+{
+	if (io->ops) {
+		io->ops->destroy(io);
+		ploop_io_put(io->ops);
+		io->ops = NULL;
+	}
+}
+EXPORT_SYMBOL(ploop_io_destroy);
+
+void ploop_io_report_fn(struct file * file, char * msg)
+{
+	char *fn = "?";
+	char *path;
+
+	path = (char *)__get_free_page(GFP_KERNEL);
+	if (path) {
+		fn = d_path(&file->f_path, path, PAGE_SIZE);
+		if (IS_ERR(fn))
+			fn = "?";
+	}
+
+	printk("%s: %s\n", msg, fn);
+
+	if (path)
+		free_page((unsigned long)path);
+}
+EXPORT_SYMBOL(ploop_io_report_fn);
--- /dev/null
+++ b/drivers/block/ploop/io_direct.c
@@ -0,0 +1,1972 @@
+/*
+ *  drivers/block/ploop/io_direct.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/pagemap.h>
+#include <linux/blkdev.h>
+#include <linux/kthread.h>
+#include <linux/mount.h>
+#include <linux/buffer_head.h>
+#include <linux/falloc.h>
+#include <linux/magic.h>
+
+#include <linux/ploop/ploop.h>
+#include <linux/ploop/ploop_if.h>
+#include <linux/ploop/compat.h>
+#include "ploop_events.h"
+#include "io_direct_map.h"
+
+#define CREATE_TRACE_POINTS
+#include "io_direct_events.h"
+
+/* from fs/ext4/ext4.h */
+#define EXT4_EXTENTS_FL			0x00080000
+
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+#define PLOOP_MAX_PREALLOC(plo) (128 * 1024 * 1024) /* 128MB */
+
+#define PLOOP_MAX_EXTENT_MAP (64 * 1024 * 1024)    /* 64MB */
+int max_extent_map_pages __read_mostly;
+int min_extent_map_entries __read_mostly;
+
+/* total sum of m->size for all ploop_mapping structs */
+atomic_long_t ploop_io_images_size = ATOMIC_LONG_INIT(0);
+
+/* Direct IO from/to file.
+ *
+ * Holes in image file are not allowed.
+ */
+
+static inline sector_t
+dio_isec_to_phys(struct extent_map * em, sector_t isec)
+{
+	return (isec - em->start) + em->block_start;
+}
+
+DEFINE_BIO_CB(dio_endio_async)
+{
+	struct ploop_request * preq = bio->bi_private;
+
+	if (!err && !bio_flagged(bio, BIO_UPTODATE))
+		err = -EIO;
+	if (err)
+		PLOOP_REQ_SET_ERROR(preq, err);
+
+	ploop_complete_io_request(preq);
+
+	bio_put(bio);
+}
+END_BIO_CB(dio_endio_async)
+
+struct bio_list_walk
+{
+	struct bio * cur;
+	int idx;
+	int bv_off;
+};
+
+static int cached_submit(struct ploop_io *io, iblock_t iblk,
+	      struct ploop_request * preq,
+	      struct bio_list * sbl, unsigned int size, bool use_prealloc);
+
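+/* Main direct-IO data path: map the cluster through the extent tree and
+ * submit bios straight to the backing device.  A write landing on an
+ * uninitialized extent is diverted to cached_submit() if the whole
+ * cluster is uninitialized; a partially uninitialized cluster fails
+ * with -EIO. */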
+static void
+dio_submit(struct ploop_io *io, struct ploop_request * preq,
+	   unsigned long rw,
+	   struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	struct bio_list bl;
+	struct bio * bio = NULL;
+	struct extent_map * em;
+	sector_t sec, nsec;
+	int err;
+	struct bio_list_walk bw;
+	int write = !!(rw & REQ_WRITE);
+	int delayed_fua = 0;
+
+	trace_submit(preq);
+
+	if ((rw & REQ_FUA) && ploop_req_delay_fua_possible(preq)) {
+		/* Mark the request: a delayed flush is required */
+		preq->req_rw |= (REQ_FLUSH | REQ_FUA);
+		delayed_fua = 1;
+	}
+
+	rw &= ~(REQ_FLUSH | REQ_FUA);
+
+	bio_list_init(&bl);
+
+	if (iblk == PLOOP_ZERO_INDEX)
+		iblk = 0;
+
+	if ((rw & REQ_WRITE) &&
+	    !(io->files.file->f_mode & FMODE_WRITE)) {
+		err = -EBADF;
+		goto out;
+	}
+
+	sec = sbl->head->bi_sector;
+	sec = ((sector_t)iblk << preq->plo->cluster_log) | (sec & ((1<<preq->plo->cluster_log) - 1));
+
+	em = extent_lookup_create(io, sec, size);
+	if (IS_ERR(em))
+		goto out_em_err;
+
+	if (write && em->uninit) {
+		sector_t end = (sector_t)(iblk + 1) << preq->plo->cluster_log;
+		sec = (sector_t)iblk << preq->plo->cluster_log;
+
+		if (em->start <= sec)
+			sec = em->end;
+		ploop_extent_put(em);
+
+		while (sec < end) {
+			em = extent_lookup_create(io, sec, end - sec);
+			if (IS_ERR(em))
+				goto out_em_err;
+			if (!em->uninit)
+				goto write_unint_fail;
+
+			sec = em->end;
+			ploop_extent_put(em);
+		}
+
+		goto write_unint;
+	}
+
+	ploop_prepare_io_request(preq);
+	if (rw & REQ_WRITE)
+		ploop_prepare_tracker(preq, sec);
+
+	bw.cur = sbl->head;
+	bw.idx = 0;
+	bw.bv_off = 0;
+	BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+
+	bio = NULL;
+
+	while (size > 0) {
+		struct bio_vec * bv;
+		int copy;
+
+		bv = bw.cur->bi_io_vec + bw.idx;
+
+		if (bw.bv_off >= bv->bv_len) {
+			bw.idx++;
+			bv++;
+			bw.bv_off = 0;
+			if (bw.idx >= bw.cur->bi_vcnt) {
+				bw.cur = bw.cur->bi_next;
+				bw.idx = 0;
+				bv = bw.cur->bi_io_vec;
+			}
+			BUG_ON(bv->bv_len & 511);
+		}
+
+		if (sec >= em->end) {
+			ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, size);
+			if (IS_ERR(em))
+				goto out_em_err;
+			if (write && em->uninit)
+				goto write_unint_fail;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (!em->uninit &&
+		     (bio == NULL ||
+		     bio->bi_sector + (bio->bi_size>>9) != nsec)) {
+
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = bv->bv_len - bw.bv_off;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+
+		if (em->uninit) {
+			void *kaddr = kmap_atomic(bv->bv_page);
+			memset(kaddr + bv->bv_offset + bw.bv_off, 0, copy);
+			kunmap_atomic(kaddr);
+		} else if (bio_add_page(bio, bv->bv_page, copy,
+				 bv->bv_offset + bw.bv_off) != copy) {
+			/* Oops, this chunk does not fit. Flush and start
+			 * fresh bio.
+			 */
+			goto flush_bio;
+		}
+
+		bio->bi_rw |= bw.cur->bi_rw &
+			(REQ_FLUSH | (delayed_fua ? 0 : REQ_FUA));
+		bw.bv_off += copy;
+		size -= copy >> 9;
+		sec += copy >> 9;
+	}
+	ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		unsigned long rw2 = rw;
+
+		bl.head = b->bi_next;
+		atomic_inc(&preq->io_count);
+		b->bi_next = NULL;
+		b->bi_private = preq;
+		b->bi_end_io = dio_endio_async;
+
+		ploop_acc_ff_out(preq->plo, rw2 | b->bi_rw);
+		submit_bio(rw2, b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+write_unint:
+	spin_lock_irq(&preq->plo->lock);
+	ploop_add_lockout(preq, 0);
+	spin_unlock_irq(&preq->plo->lock);
+
+	err = cached_submit(io, iblk, preq, sbl, size, false);
+	goto out;
+
+write_unint_fail:
+	ploop_extent_put(em);
+	err = -EIO;
+	ploop_msg_once(io->plo, "A part of cluster is in uninitialized extent.");
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+
+	if (err)
+		PLOOP_FAIL_REQUEST(preq, err);
+}
+
+struct bio_iter {
+	struct bio     *bio;  /* traverses sbl */
+	struct bio_vec *bv;   /* traverses bio->bi_io_vec */
+	int             off;  /* offset in bv payload:
+			       * 0 <= off < bv->bv_len */
+};
+
+static inline void bio_iter_init(struct bio_iter *biter, struct bio_list *sbl)
+{
+	biter->bio  = sbl->head;
+	biter->bv   = biter->bio->bi_io_vec;
+	biter->off  = 0;
+}
+
+static inline void bio_iter_advance(struct bio_iter *biter, int len)
+{
+	if (biter->bv->bv_len - biter->off > len) {
+		biter->off += len;
+		return;
+	}
+
+	BUG_ON (biter->bv->bv_len - biter->off != len);
+
+	biter->bv++;
+	biter->off = 0;
+
+	if (biter->bv - biter->bio->bi_io_vec < biter->bio->bi_vcnt)
+		return;
+
+	biter->bio = biter->bio->bi_next;
+	if (biter->bio)
+		biter->bv = biter->bio->bi_io_vec;
+}
+
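+/* Copy copy_len bytes from the bio list into 'page' at dst_off, advancing
+ * the iterator across bio_vec boundaries as needed. */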
+static void bcopy_from_blist(struct page *page, int dst_off, /* dst */
+			     struct bio_iter *biter,         /* src */
+			     int copy_len)                   /* len */
+{
+	u8 *kdst = kmap_atomic(page);
+
+	while (copy_len > 0) {
+		u8 *ksrc;
+		int copy = MIN(copy_len, biter->bv->bv_len - biter->off);
+
+		ksrc = kmap_atomic(biter->bv->bv_page);
+		memcpy(kdst + dst_off,
+		       ksrc + biter->bv->bv_offset + biter->off,
+		       copy);
+		kunmap_atomic(ksrc);
+
+		copy_len -= copy;
+		dst_off  += copy;
+		bio_iter_advance(biter, copy);
+		BUG_ON (copy_len && !biter->bio);
+	}
+
+	kunmap_atomic(kdst);
+}
+
+static inline void bzero_page(struct page *page)
+{
+	void *kaddr = kmap_atomic(page);
+
+	memset(kaddr, 0, PAGE_SIZE);
+
+	kunmap_atomic(kaddr);
+}
+
+static void
+dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
+	       struct bio_list * sbl, unsigned int size,
+	       struct extent_map *em);
+
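+/* Write a whole cluster through the backing file.  With fallocate support
+ * on an extent-based file the cluster is provisioned (optionally with
+ * preallocation ahead) and written via dio_submit_pad(); otherwise we fall
+ * back to the page cache, start writeback and queue the request for the
+ * fsync thread. */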
+static int
+cached_submit(struct ploop_io *io, iblock_t iblk, struct ploop_request * preq,
+	      struct bio_list * sbl, unsigned int size, bool use_prealloc)
+{
+	struct ploop_device * plo = preq->plo;
+	int err = 0;
+	loff_t pos, end_pos, start, end;
+	loff_t clu_siz = 1 << (plo->cluster_log + 9);
+	struct bio_iter biter;
+	loff_t new_size;
+	loff_t used_pos;
+	bool may_fallocate = io->files.file->f_op->fallocate &&
+		(io->files.flags & EXT4_EXTENTS_FL);
+
+	trace_cached_submit(preq);
+
+	pos = (loff_t)iblk << (plo->cluster_log + 9);
+	end_pos = pos + clu_siz;
+	used_pos = (loff_t)(io->alloc_head - 1) << (io->plo->cluster_log + 9);
+
+	file_start_write(io->files.file);
+
+	if (use_prealloc && end_pos > used_pos && may_fallocate) {
+		if (unlikely(io->prealloced_size < clu_siz)) {
+			loff_t prealloc = end_pos;
+			if (prealloc > PLOOP_MAX_PREALLOC(plo))
+				prealloc = PLOOP_MAX_PREALLOC(plo);
+try_again:
+			err = io->files.file->f_op->fallocate(io->files.file, 0,
+							       pos, prealloc);
+			if (err) {
+				if (err == -ENOSPC && prealloc != clu_siz) {
+					prealloc = clu_siz;
+					goto try_again;
+				} else {
+					goto end_write;
+				}
+			}
+
+			/* flush new i_size to disk */
+			err = io->ops->sync(io);
+			if (err)
+				goto end_write;
+
+			io->prealloced_size = prealloc;
+		}
+
+		io->prealloced_size -= clu_siz;
+	}
+
+	if (may_fallocate) {
+		sector_t sec = (sector_t)iblk << preq->plo->cluster_log;
+		sector_t len = 1 << preq->plo->cluster_log;
+		struct extent_map * em = extent_lookup_create(io, sec, len);
+
+		if (unlikely(IS_ERR(em))) {
+			err = PTR_ERR(em);
+			goto end_write;
+		}
+
+		preq->iblock = iblk;
+		preq->eng_io = io;
+		BUG_ON(test_bit(PLOOP_REQ_ISSUE_FLUSH, &preq->state));
+		set_bit(PLOOP_REQ_POST_SUBMIT, &preq->state);
+		dio_submit_pad(io, preq, sbl, size, em);
+		err = 0;
+		goto end_write;
+	}
+
+	bio_iter_init(&biter, sbl);
+	mutex_lock(&io->files.inode->i_mutex);
+
+	start = pos + ((sbl->head->bi_sector & ((1<<plo->cluster_log)-1)) << 9);
+	end = start + (size << 9);
+	ploop_prepare_tracker(preq, start>>9);
+
+	while (pos < end_pos) {
+		struct page * page;
+		void * fsdata;
+
+		err = pagecache_write_begin(io->files.file, io->files.mapping,
+					    pos, PAGE_CACHE_SIZE, 0,
+					    &page, &fsdata);
+		if (err)
+			break;
+
+		if (pos < start || pos + PAGE_CACHE_SIZE > end)
+			bzero_page(page);
+
+		if (pos < end && pos + PAGE_CACHE_SIZE > start) {
+			int dst_off = 0;
+			int copy_len = PAGE_CACHE_SIZE;
+
+			if (pos < start) {
+				dst_off = start - pos;
+				copy_len -= dst_off;
+				if (pos + PAGE_CACHE_SIZE > end)
+					copy_len = end - start;
+			} else {
+				if (pos + PAGE_CACHE_SIZE > end)
+					copy_len = end - pos;
+			}
+
+			bcopy_from_blist(page, dst_off, &biter, copy_len);
+		}
+
+		err = pagecache_write_end(io->files.file, io->files.mapping,
+					  pos, PAGE_CACHE_SIZE, PAGE_CACHE_SIZE,
+					  page, &fsdata);
+		if (err != PAGE_CACHE_SIZE) {
+			if (err >= 0)
+				err = -EIO;
+			break;
+		}
+		err = 0;
+
+		pos += PAGE_CACHE_SIZE;
+	}
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	new_size = i_size_read(io->files.inode);
+	atomic_long_add(new_size - *io->size_ptr, &ploop_io_images_size);
+	*io->size_ptr = new_size;
+
+	if (!err)
+		err = filemap_fdatawrite(io->files.mapping);
+
+	if (!err) {
+		spin_lock_irq(&plo->lock);
+		ploop_acc_flush_skip_locked(plo, preq->req_rw);
+		preq->iblock = iblk;
+		list_add_tail(&preq->list, &io->fsync_queue);
+		io->fsync_qlen++;
+		plo->st.bio_syncwait++;
+		if ((test_bit(PLOOP_REQ_SYNC, &preq->state) ||
+		     io->fsync_qlen >= plo->tune.fsync_max) &&
+		    waitqueue_active(&io->fsync_waitq))
+			wake_up_interruptible(&io->fsync_waitq);
+		else if (!timer_pending(&io->fsync_timer))
+			mod_timer(&io->fsync_timer, jiffies + plo->tune.fsync_delay);
+		spin_unlock_irq(&plo->lock);
+	}
+end_write:
+	file_end_write(io->files.file);
+	return err;
+}
+
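+/* Convert the just-written cluster's unwritten extent to initialized.
+ * The io_count protocol appears to be: the counter is bumped before and
+ * after the conversion, so it is odd exactly while a conversion is in
+ * flight; dio_fsync_thread() clears PLOOP_IO_FSYNC_DELAYED only when the
+ * count is even and unchanged across its sync. */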
+static void
+dio_post_submit(struct ploop_io *io, struct ploop_request * preq)
+{
+	struct ploop_device *plo = preq->plo;
+	sector_t sec = (sector_t)preq->iblock << preq->plo->cluster_log;
+	loff_t clu_siz = 1 << (preq->plo->cluster_log + 9);
+	int force_sync = preq->req_rw & REQ_FUA;
+	int err;
+
+	file_start_write(io->files.file);
+
+	if (!force_sync) {
+		/* Here io->io_count is even ... */
+		spin_lock_irq(&plo->lock);
+		io->io_count++;
+		set_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
+		spin_unlock_irq(&plo->lock);
+	}
+	err = io->files.file->f_op->fallocate(io->files.file,
+					      FALLOC_FL_CONVERT_UNWRITTEN,
+					      (loff_t)sec << 9, clu_siz);
+
+	/* highly unlikely case: FUA coming to a block not provisioned yet */
+	if (!err && force_sync)
+		err = io->ops->sync(io);
+
+	if (!force_sync) {
+		spin_lock_irq(&plo->lock);
+		io->io_count++;
+		spin_unlock_irq(&plo->lock);
+	}
+	/* and here io->io_count is even (+2) again. */
+
+	file_end_write(io->files.file);
+	if (err) {
+		PLOOP_REQ_SET_ERROR(preq, err);
+		set_bit(PLOOP_S_ABORT, &preq->plo->state);
+	}
+}
+
+/* Submit the whole cluster. If preq contains only partial data
+ * within the cluster, pad the rest of cluster with zeros.
+ */
+static void
+dio_submit_pad(struct ploop_io *io, struct ploop_request * preq,
+	       struct bio_list * sbl, unsigned int size,
+	       struct extent_map *em)
+{
+	struct bio_list bl;
+	struct bio * bio = NULL;
+	sector_t sec, end_sec, nsec, start, end;
+	struct bio_list_walk bw;
+	int err;
+
+	bio_list_init(&bl);
+
+	/* sec..end_sec is the range which we are going to write */
+	sec = (sector_t)preq->iblock << preq->plo->cluster_log;
+	end_sec = sec + (1 << preq->plo->cluster_log);
+
+	/* start..end is data that we have. The rest must be zero padded. */
+	start = sec + (sbl->head->bi_sector & ((1<<preq->plo->cluster_log) - 1));
+	end = start + size;
+
+	if (IS_ERR(em))
+		goto out_em_err;
+
+#if 1
+	/* GCC, shut up! */
+	bw.cur = sbl->head;
+	bw.idx = 0;
+	bw.bv_off = 0;
+	BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+#endif
+
+	ploop_prepare_io_request(preq);
+	ploop_prepare_tracker(preq, start);
+
+	bio = NULL;
+
+	while (sec < end_sec) {
+		struct page * page;
+		unsigned int poff, plen;
+		bool zero_page;
+
+		if (sec < start) {
+			zero_page = true;
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = start - sec;
+			if (plen > (PAGE_SIZE>>9))
+				plen = (PAGE_SIZE>>9);
+		} else if (sec >= end) {
+			zero_page = true;
+			page = ZERO_PAGE(0);
+			poff = 0;
+			plen = end_sec - sec;
+			if (plen > (PAGE_SIZE>>9))
+				plen = (PAGE_SIZE>>9);
+		} else {
+			/* sec >= start && sec < end */
+			struct bio_vec * bv;
+			zero_page = false;
+
+			if (sec == start) {
+				bw.cur = sbl->head;
+				bw.idx = 0;
+				bw.bv_off = 0;
+				BUG_ON(bw.cur->bi_io_vec[0].bv_len & 511);
+			}
+			bv = bw.cur->bi_io_vec + bw.idx;
+
+			if (bw.bv_off >= bv->bv_len) {
+				bw.idx++;
+				bv++;
+				bw.bv_off = 0;
+				if (bw.idx >= bw.cur->bi_vcnt) {
+					bw.cur = bw.cur->bi_next;
+					bw.idx = 0;
+					bw.bv_off = 0;
+					bv = bw.cur->bi_io_vec;
+				}
+				BUG_ON(bv->bv_len & 511);
+			}
+
+			page = bv->bv_page;
+			poff = bv->bv_offset + bw.bv_off;
+			plen = (bv->bv_len - bw.bv_off) >> 9;
+		}
+
+		if (sec >= em->end) {
+			ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, end_sec - sec);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		if (plen > em->end - sec)
+			plen = em->end - sec;
+
+		if (bio_add_page(bio, page, plen<<9, poff) != (plen<<9)) {
+			/* Oops, this chunk does not fit. Flush and start
+			 * new bio
+			 */
+			goto flush_bio;
+		}
+
+		/* Handle FLUSH here, dio_post_submit will handle FUA */
+		if (!zero_page)
+			bio->bi_rw |= bw.cur->bi_rw & REQ_FLUSH;
+
+		bw.bv_off += (plen<<9);
+		BUG_ON(plen == 0);
+		sec += plen;
+	}
+	ploop_extent_put(em);
+
+	while (bl.head) {
+		unsigned long rw;
+		struct bio * b = bl.head;
+
+		bl.head = b->bi_next;
+		atomic_inc(&preq->io_count);
+		b->bi_next = NULL;
+		b->bi_private = preq;
+		b->bi_end_io = dio_endio_async;
+
+		rw = preq->req_rw & ~(REQ_FLUSH | REQ_FUA);
+		ploop_acc_ff_out(preq->plo, rw | b->bi_rw);
+		submit_bio(rw, b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	PLOOP_FAIL_REQUEST(preq, err);
+}
+
+static struct extent_map * dio_fallocate(struct ploop_io *io, u32 iblk, int nr)
+{
+	struct extent_map * em;
+	mutex_lock(&io->files.inode->i_mutex);
+	em = map_extent_get_block(io,
+				  io->files.mapping,
+				  (sector_t)iblk << io->plo->cluster_log,
+				  1 << io->plo->cluster_log,
+				  1, mapping_gfp_mask(io->files.mapping),
+				  NULL);
+	mutex_unlock(&io->files.inode->i_mutex);
+	return em;
+}
+
+
+static void
+dio_submit_alloc(struct ploop_io *io, struct ploop_request * preq,
+		 struct bio_list * sbl, unsigned int size)
+{
+	int err;
+	iblock_t iblk = io->alloc_head++;
+
+	trace_submit_alloc(preq);
+
+	if (!(io->files.file->f_mode & FMODE_WRITE)) {
+		PLOOP_FAIL_REQUEST(preq, -EBADF);
+		return;
+	}
+
+	/* io->fallocate is not a "posix" fallocate()!
+	 *
+	 * We require the backing fs to give us _uninitialized_ blocks;
+	 * otherwise it makes no sense to go this way.
+	 *
+	 * IMPORTANT: the file _grows_, and dio_submit_alloc() cannot
+	 * complete requests until i_size is committed to disk.
+	 * Read this as: there is no hope of doing this optimally;
+	 * linux updates i_size synchronously even when O_DIRECT AIO
+	 * is requested. Even in PCSS we have to update i_size synchronously.
+	 * Obviously, we will expand the file in larger pieces
+	 * and take measures to avoid initialization of the blocks
+	 * while at the same time preventing leakage of uninitialized
+	 * data to users of our device.
+	 */
+	if (io->files.em_tree->_get_extent) {
+		struct extent_map * em;
+
+		em = dio_fallocate(io, iblk, 1);
+		if (unlikely(IS_ERR(em))) {
+			PLOOP_FAIL_REQUEST(preq, PTR_ERR(em));
+			return;
+		}
+
+		preq->iblock = iblk;
+		preq->eng_state = PLOOP_E_DATA_WBI;
+
+		dio_submit_pad(io, preq, sbl, size, em);
+		return;
+	}
+
+	err = cached_submit(io, iblk, preq, sbl, size, true);
+	if (err) {
+		if (err == -ENOSPC)
+			io->alloc_head--;
+		PLOOP_FAIL_REQUEST(preq, err);
+	}
+	preq->eng_state = PLOOP_E_DATA_WBI;
+}
+
+/* When the backing fs does not export any method to allocate new blocks
+ * without initialization, we fall back to a cached write with a subsequent
+ * fsync. Obviously, this is going to be utterly inefficient.
+ *
+ * Here is a workaround. We start writeback, but do not fsync()
+ * immediately; instead we start a timer, which wakes up the ploop_sync
+ * thread.
+ *
+ * Requests are queued to ploop_sync, and when the timer expires or we
+ * have a lot of requests scheduled for sync, the thread calls the
+ * real fsync.
+ *
+ * Still not sure this is an improvement. :-)
+ */
+
+static int dio_fsync_thread(void * data)
+{
+	struct ploop_io * io = data;
+	struct ploop_device * plo = io->plo;
+	u64 io_count;
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	while (!kthread_should_stop() || !list_empty(&io->fsync_queue)) {
+		int err;
+		LIST_HEAD(list);
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&io->fsync_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (!list_empty(&io->fsync_queue) ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&plo->lock);
+			schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&io->fsync_waitq, &_wait);
+
+		if (list_empty(&io->fsync_queue) && kthread_should_stop())
+			break;
+
+		INIT_LIST_HEAD(&list);
+		list_splice_init(&io->fsync_queue, &list);
+		io_count = io->io_count;
+		spin_unlock_irq(&plo->lock);
+
+		/* filemap_fdatawrite() has been made already */
+		filemap_fdatawait(io->files.mapping);
+
+		err = io->ops->sync(io);
+
+		/* Do we need to invalidate page cache? Not really,
+		 * because we use it only to create full new pages,
+		 * which we overwrite completely. Probably, we should
+		 * invalidate in a non-blocking way to reclaim memory
+		 * faster than it happens with normal LRU logic.
+		 */
+
+		spin_lock_irq(&plo->lock);
+
+		if (io_count == io->io_count && !(io_count & 1))
+			clear_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state);
+
+		while (!list_empty(&list)) {
+			struct ploop_request * preq;
+			preq = list_entry(list.next, struct ploop_request, list);
+			list_del(&preq->list);
+			if (err)
+				PLOOP_REQ_SET_ERROR(preq, err);
+
+			__set_bit(PLOOP_REQ_FSYNC_DONE, &preq->state);
+			list_add_tail(&preq->list, &plo->ready_queue);
+			io->fsync_qlen--;
+		}
+		plo->st.bio_fsync++;
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irq(&plo->lock);
+	return 0;
+}
+
+/* Invalidate the page cache. It is called with the inode mutex taken,
+ * and the mapping must already be synced. If some dirty pages remain,
+ * it will fail.
+ *
+ * The retry with fs freeze is required to work around a race (bug?)
+ * in ext3, where some blocks can be held by an uncommitted transaction.
+ * The procedure is dangerous. No mutexes should be held, and ploop
+ * must not be quiesced.
+ */
+
+static int dio_invalidate_cache(struct address_space * mapping,
+				struct block_device * bdev)
+{
+	int err;
+	int attempt2 = 0;
+
+retry:
+	err = invalidate_inode_pages2(mapping);
+	if (err) {
+		struct ploop_device *plo = bdev->bd_disk->private_data;
+		struct block_device *dm_crypt_bdev;
+
+		printk("PLOOP: failed to invalidate page cache %d/%d\n", err, attempt2);
+		if (attempt2)
+			return err;
+		attempt2 = 1;
+
+		mutex_unlock(&mapping->host->i_mutex);
+
+		dm_crypt_bdev = ploop_get_dm_crypt_bdev(plo);
+		if (dm_crypt_bdev)
+			bdev = dm_crypt_bdev;
+		else
+			bdgrab(bdev);
+
+		thaw_bdev(bdev, freeze_bdev(bdev));
+		bdput(bdev);
+
+		mutex_lock(&mapping->host->i_mutex);
+		goto retry;
+	}
+	return err;
+}
+
+static int dio_truncate(struct ploop_io *, struct file *, __u32);
+
+static int dio_release_prealloced(struct ploop_io * io)
+{
+	int ret;
+
+	if (!io->prealloced_size)
+		return 0;
+
+	ret = dio_truncate(io, io->files.file, io->alloc_head);
+	if (ret)
+		printk("Can't release %llu prealloced bytes: "
+		       "truncate to %llu failed (%d)\n",
+		       io->prealloced_size,
+		       (loff_t)io->alloc_head << (io->plo->cluster_log + 9),
+		       ret);
+	else
+		io->prealloced_size = 0;
+
+	return ret;
+}
+
+static void dio_destroy(struct ploop_io * io)
+{
+	if (io->files.file) {
+		struct file * file;
+		struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+		(void)dio_release_prealloced(io);
+
+		if (io->files.em_tree) {
+			io->files.em_tree = NULL;
+			mutex_lock(&io->files.inode->i_mutex);
+			ploop_dio_close(io, delta->flags & PLOOP_FMT_RDONLY);
+			(void)dio_invalidate_cache(io->files.mapping, io->files.bdev);
+			mutex_unlock(&io->files.inode->i_mutex);
+		}
+
+		del_timer_sync(&io->fsync_timer);
+
+		if (io->fsync_thread) {
+			kthread_stop(io->fsync_thread);
+			io->fsync_thread = NULL;
+		}
+
+		file = io->files.file;
+		mutex_lock(&delta->plo->sysfs_mutex);
+		io->files.file = NULL;
+		mutex_unlock(&delta->plo->sysfs_mutex);
+		if (!(delta->flags & PLOOP_FMT_RDONLY))
+			file_update_time(file);
+		fput(file);
+	}
+}
+
+static int dio_sync(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+	int err = 0;
+
+	if (file)
+		err = file->f_op->fsync(file, 0, LLONG_MAX, 0);
+
+	return err;
+}
+
+static int dio_stop(struct ploop_io * io)
+{
+	return io->ops->sync(io);
+}
+
+static int dio_open(struct ploop_io * io)
+{
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+	int err = 0;
+	struct file * file = io->files.file;
+	struct extent_map_tree * em_tree;
+
+	if (file == NULL)
+		return -EBADF;
+
+	io->files.mapping = file->f_mapping;
+	io->files.inode = io->files.mapping->host;
+	io->files.bdev = io->files.inode->i_sb->s_bdev;
+
+	err = io->ops->sync(io);
+	if (err)
+		return err;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	em_tree = ploop_dio_open(io, (delta->flags & PLOOP_FMT_RDONLY));
+	err = PTR_ERR(em_tree);
+	if (IS_ERR(em_tree))
+		goto out;
+
+	io->files.em_tree = em_tree;
+
+	err = dio_invalidate_cache(io->files.mapping, io->files.bdev);
+	if (err) {
+		io->files.em_tree = NULL;
+		ploop_dio_close(io, 0);
+		goto out;
+	}
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY) && !io->files.em_tree->_get_extent) {
+		io->fsync_thread = kthread_create(dio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  delta->plo->index);
+		if (IS_ERR(io->fsync_thread)) {
+			err = PTR_ERR(io->fsync_thread);
+			io->fsync_thread = NULL;
+			io->files.em_tree = NULL;
+			ploop_dio_close(io, 0);
+			goto out;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+out:
+	mutex_unlock(&io->files.inode->i_mutex);
+	return err;
+}
+
+void fsync_timeout(unsigned long data)
+{
+	struct ploop_io * io = (void*)data;
+
+	wake_up_interruptible(&io->fsync_waitq);
+}
+
+static int
+dio_init(struct ploop_io * io)
+{
+	INIT_LIST_HEAD(&io->fsync_queue);
+	init_waitqueue_head(&io->fsync_waitq);
+	init_timer(&io->fsync_timer);
+	io->fsync_timer.function = fsync_timeout;
+	io->fsync_timer.data = (unsigned long)io;
+
+	return 0;
+}
+
+struct dio_comp
+{
+	struct completion comp;
+	atomic_t count;
+	int error;
+};
+
+DEFINE_BIO_CB(dio_endio_sync)
+{
+	struct dio_comp * comp = bio->bi_private;
+
+	if (!err && !bio_flagged(bio, BIO_UPTODATE))
+		err = -EIO;
+	if (err && !comp->error)
+		comp->error = err;
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+
+	bio_put(bio);
+}
+END_BIO_CB(dio_endio_sync)
+
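+/* Synchronous IO on a single page: build bios covering the extent-mapped
+ * range, submit them all, then wait on a shared completion. */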
+static int
+dio_sync_io(struct ploop_io * io, int rw, struct page * page,
+	    unsigned int len, unsigned int off, sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	struct dio_comp comp;
+	struct extent_map * em;
+	sector_t nsec;
+	int err;
+
+	BUG_ON(len & 511);
+	BUG_ON(off & 511);
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+
+	init_completion(&comp.comp);
+	atomic_set(&comp.count, 1);
+	comp.error = 0;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (bio_add_page(bio, page, copy, off) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_sync;
+		b->bi_private = &comp;
+		atomic_inc(&comp.count);
+		submit_bio(rw, b);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	return err;
+}
+
+static int
+dio_sync_read(struct ploop_io * io, struct page * page, unsigned int len,
+	      unsigned int off, sector_t pos)
+{
+	return dio_sync_io(io, READ_SYNC, page, len, off, pos);
+}
+
+static int
+dio_sync_write(struct ploop_io * io, struct page * page, unsigned int len,
+	       unsigned int off, sector_t sec)
+{
+	int err;
+
+	if (!(io->files.file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	err = dio_sync_io(io, WRITE_SYNC, page, len, off, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return err;
+}
+
+static int
+dio_sync_iovec(struct ploop_io * io, int rw, struct page ** pvec,
+	       unsigned int nr, sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	struct dio_comp comp;
+	unsigned int len = PAGE_SIZE * nr;
+	unsigned int off;
+	struct extent_map * em;
+	int err;
+	sector_t nsec;
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+	off = 0;
+
+	init_completion(&comp.comp);
+	atomic_set(&comp.count, 1);
+	comp.error = 0;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (off/PAGE_SIZE != (off + copy + 1)/PAGE_SIZE)
+			copy = PAGE_SIZE - (off & (PAGE_SIZE-1));
+		if (bio_add_page(bio, pvec[off/PAGE_SIZE], copy,
+				 off & (PAGE_SIZE-1) ) != copy) {
+			/* Oops. */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_sync;
+		b->bi_private = &comp;
+		atomic_inc(&comp.count);
+		submit_bio(rw, b);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	return comp.error;
+
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	return err;
+}
+
+static int
+dio_sync_readvec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		 sector_t sec)
+{
+	return dio_sync_iovec(io, READ_SYNC, pvec, nr, sec);
+}
+
+static int
+dio_sync_writevec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		  sector_t sec)
+{
+	int err;
+
+	if (!(io->files.file->f_mode & FMODE_WRITE))
+		return -EBADF;
+
+	err = dio_sync_iovec(io, WRITE_SYNC, pvec, nr, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return err;
+}
+
+/*
+ * Allocate and zero a new block in the file, going through the page cache.
+ * It is assumed there is no point in optimizing this: it is used
+ * (for the ploop1 format) only for allocation of index clusters. Another
+ * use-case is growing a raw delta, but that is assumed to be rare.
+ */
+static int dio_alloc_sync(struct ploop_io * io, loff_t pos, loff_t len)
+{
+	int err;
+	int ret;
+	struct page *pad = NULL;
+	int pad_len = pos & (PAGE_CACHE_SIZE - 1);
+
+	if (pos + len > i_size_read(io->files.inode) &&
+	    io->files.file->f_op->fallocate) {
+		err = io->files.file->f_op->fallocate(io->files.file, 0,
+						       pos, len);
+		if (err)
+			return err;
+	}
+
+	if (pad_len) {
+		BUILD_BUG_ON(PAGE_SIZE != PAGE_CACHE_SIZE);
+
+		pad = alloc_page(GFP_NOFS);
+		if (pad == NULL)
+			return -ENOMEM;
+
+		len += pad_len;
+		pos -= pad_len;
+
+		err = dio_sync_read(io, pad, pad_len, 0, pos >> 9);
+		if (err) {
+			put_page(pad);
+			return err;
+		}
+	}
+
+	err = 0;
+
+	mutex_lock(&io->files.inode->i_mutex);
+
+	while (len > 0) {
+		struct page *page;
+		void *fsdata;
+		ret = pagecache_write_begin(io->files.file, io->files.mapping,
+					    pos, PAGE_CACHE_SIZE, 0,
+					    &page, &fsdata);
+		if (ret) {
+			err = ret;
+			mutex_unlock(&io->files.inode->i_mutex);
+			goto fail;
+		}
+
+		bzero_page(page);
+
+		if (pad) {
+			memcpy(page_address(page), page_address(pad), pad_len);
+			put_page(pad);
+			pad = NULL;
+		}
+
+		ret = pagecache_write_end(io->files.file, io->files.mapping,
+					  pos, PAGE_CACHE_SIZE,
+					  PAGE_CACHE_SIZE, page, fsdata);
+		if (ret != PAGE_CACHE_SIZE) {
+			err = ret < 0 ? ret : -EIO;
+			mutex_unlock(&io->files.inode->i_mutex);
+			goto fail;
+		}
+
+		len -= PAGE_CACHE_SIZE;
+		pos += PAGE_CACHE_SIZE;
+	}
+
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	err = filemap_fdatawrite(io->files.mapping);
+	if (err)
+		goto fail;
+
+	err = io->ops->sync(io);
+	if (err)
+		goto fail;
+
+	err = filemap_fdatawait(io->files.mapping);
+
+fail:
+	if (pad)
+		put_page(pad);
+
+	if (!err)
+		io->alloc_head = pos >> (io->plo->cluster_log + 9);
+
+	return err;
+}
+
+static void
+dio_io_page(struct ploop_io * io, unsigned long rw,
+	    struct ploop_request * preq, struct page * page,
+	    sector_t sec)
+{
+	struct bio_list bl;
+	struct bio * bio;
+	unsigned int len;
+	struct extent_map * em;
+	sector_t nsec;
+	int err;
+	int off;
+
+	bio_list_init(&bl);
+	bio = NULL;
+	em = NULL;
+	off = 0;
+
+	ploop_prepare_io_request(preq);
+	if (rw & REQ_WRITE)
+		ploop_prepare_tracker(preq, sec);
+
+	len = PAGE_SIZE;
+
+	while (len > 0) {
+		int copy;
+
+		if (!em || sec >= em->end) {
+			if (em)
+				ploop_extent_put(em);
+			em = extent_lookup_create(io, sec, len>>9);
+			if (IS_ERR(em))
+				goto out_em_err;
+		}
+
+		nsec = dio_isec_to_phys(em, sec);
+
+		if (bio == NULL ||
+		    bio->bi_sector + (bio->bi_size>>9) != nsec) {
+flush_bio:
+			bio = bio_alloc(GFP_NOFS, 32);
+			if (bio == NULL)
+				goto enomem;
+			bio_list_add(&bl, bio);
+			bio->bi_bdev = io->files.bdev;
+			bio->bi_sector = nsec;
+		}
+
+		copy = len;
+		if (copy > ((em->end - sec) << 9))
+			copy = (em->end - sec) << 9;
+		if (bio_add_page(bio, page, copy, off) != copy) {
+			/* bio cannot take more data: start a new one */
+			goto flush_bio;
+		}
+
+		off += copy;
+		len -= copy;
+		sec += copy >> 9;
+	}
+
+	if (em)
+		ploop_extent_put(em);
+
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+
+		b->bi_next = NULL;
+		b->bi_end_io = dio_endio_async;
+		b->bi_private = preq;
+		atomic_inc(&preq->io_count);
+		ploop_acc_ff_out(preq->plo, rw | b->bi_rw);
+		submit_bio(rw, b);
+	}
+
+	ploop_complete_io_request(preq);
+	return;
+
+enomem:
+	err = -ENOMEM;
+	goto out;
+
+out_em_err:
+	err = PTR_ERR(em);
+out:
+	while (bl.head) {
+		struct bio * b = bl.head;
+		bl.head = b->bi_next;
+		b->bi_next = NULL;
+		bio_put(b);
+	}
+	PLOOP_FAIL_REQUEST(preq, err);
+}
+
+static void
+dio_read_page(struct ploop_io * io, struct ploop_request * preq,
+	      struct page * page, sector_t sec)
+{
+	dio_io_page(io, READ | REQ_SYNC, preq, page, sec);
+}
+
+static void
+dio_write_page(struct ploop_io * io, struct ploop_request * preq,
+	       struct page * page, sector_t sec, unsigned long rw)
+{
+	if (!(io->files.file->f_mode & FMODE_WRITE)) {
+		PLOOP_FAIL_REQUEST(preq, -EBADF);
+		return;
+	}
+
+	dio_io_page(io, rw | WRITE | REQ_SYNC, preq, page, sec);
+}
+
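+/*
+ * Try to remap orig_bio directly onto the backing block device.
+ * Returns 0 on success (bio is filled in accordingly) and 1 when the
+ * fast path cannot be used -- no cached extent, the extent is too
+ * short, a delayed fsync is pending for a FLUSH/FUA request, or the
+ * backing queue's merge_bvec_fn rejects a bvec.
+ */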
+static int
+dio_fastmap(struct ploop_io * io, struct bio * orig_bio,
+	    struct bio * bio, sector_t isec)
+{
+	struct request_queue * q;
+	struct extent_map * em;
+	int i;
+
+	if (unlikely((orig_bio->bi_rw & (REQ_FLUSH | REQ_FUA)) &&
+		     test_bit(PLOOP_IO_FSYNC_DELAYED, &io->io_state)))
+		return 1;
+
+	if (orig_bio->bi_size == 0) {
+		bio->bi_vcnt   = 0;
+		bio->bi_sector = 0;
+		bio->bi_size   = 0;
+		bio->bi_idx    = 0;
+
+		bio->bi_rw   = orig_bio->bi_rw;
+		bio->bi_bdev = io->files.bdev;
+		return 0;
+	}
+
+	em = extent_lookup(io->files.em_tree, isec);
+
+	if (em == NULL) {
+		io->plo->st.fast_neg_noem++;
+		return 1;
+	}
+
+	if (isec + (orig_bio->bi_size>>9) > em->end) {
+		io->plo->st.fast_neg_shortem++;
+		ploop_extent_put(em);
+		return 1;
+	}
+
+	BUG_ON(bio->bi_max_vecs < orig_bio->bi_vcnt);
+
+	memcpy(bio->bi_io_vec, orig_bio->bi_io_vec,
+	       orig_bio->bi_vcnt * sizeof(struct bio_vec));
+
+	bio->bi_sector = dio_isec_to_phys(em, isec);
+	ploop_extent_put(em);
+
+	bio->bi_bdev = io->files.bdev;
+	bio->bi_rw = orig_bio->bi_rw;
+	bio->bi_vcnt = orig_bio->bi_vcnt;
+	bio->bi_size = orig_bio->bi_size;
+	bio->bi_idx = orig_bio->bi_idx;
+
+	q = bdev_get_queue(bio->bi_bdev);
+
+	if (q->merge_bvec_fn == NULL)
+		return 0;
+
+	bio->bi_size = 0;
+	bio->bi_vcnt = 0;
+
+	for (i = 0; i < orig_bio->bi_vcnt; i++) {
+		struct bio_vec * bv = &bio->bi_io_vec[i];
+		struct bvec_merge_data bm_data = {
+			.bi_bdev = bio->bi_bdev,
+			.bi_sector = bio->bi_sector,
+			.bi_size = bio->bi_size,
+			.bi_rw = bio->bi_rw,
+		};
+		if (q->merge_bvec_fn(q, &bm_data, bv) < bv->bv_len) {
+			io->plo->st.fast_neg_backing++;
+			return 1;
+		}
+		bio->bi_size += bv->bv_len;
+		bio->bi_vcnt++;
+	}
+	return 0;
+}
+
+/* Merge is disabled _only_ if we _have_ a resolved mapping and
+ * we are sure the bio is going to be split in any case due to
+ * file-level fragmentation.
+ */
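+/*
+ * Example: if the cached extent covering isector ends 16 sectors past
+ * isector while len is a whole 2048-sector cluster, the write would
+ * have to be split at the extent boundary anyway, so merging is
+ * pointless and gets disabled.
+ */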
+static int
+dio_disable_merge(struct ploop_io * io, sector_t isector, unsigned int len)
+{
+	int ret = 0;
+	struct extent_map * em;
+
+	em = extent_lookup(io->files.em_tree, isector);
+	if (em) {
+		if (isector + len > em->end)
+			ret = 1;
+		ploop_extent_put(em);
+	}
+	return ret;
+}
+
+static int dio_prepare_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+	struct path	path;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDONLY|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host ||
+	    io->files.bdev != file->f_mapping->host->i_sb->s_bdev) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = io->ops->sync(io);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	mutex_lock(&io->files.inode->i_mutex);
+	err = dio_invalidate_cache(io->files.mapping, io->files.bdev);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int dio_complete_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int ret;
+
+	ret = dio_release_prealloced(io);
+	if (ret)
+		return ret;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	mutex_lock(&io->files.inode->i_mutex);
+	ploop_dio_downgrade(io->files.mapping);
+	BUG_ON((loff_t)io->alloc_head << (io->plo->cluster_log + 9) !=
+	       i_size_read(io->files.inode));
+	(void)invalidate_inode_pages2(io->files.mapping);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	fput(file);
+	return 0;
+}
+
+static int dio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	int err;
+	struct file * file = io->files.file;
+	struct path	path;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDWR|O_LARGEFILE, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host ||
+	    io->files.bdev != file->f_mapping->host->i_sb->s_bdev) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = io->ops->sync(io);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	mutex_lock(&io->files.inode->i_mutex);
+
+	err = dio_invalidate_cache(io->files.mapping, io->files.bdev);
+	if (err) {
+		mutex_unlock(&io->files.inode->i_mutex);
+		fput(file);
+		return err;
+	}
+
+	err = ploop_dio_upgrade(io);
+	if (err) {
+		mutex_unlock(&io->files.inode->i_mutex);
+		fput(file);
+		return err;
+	}
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (!io->files.em_tree->_get_extent) {
+		io->fsync_thread = kthread_create(dio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  io->plo->index);
+		if (io->fsync_thread == NULL) {
+			fput(file);
+			return -ENOMEM;
+		}
+		wake_up_process(io->fsync_thread);
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int dio_truncate(struct ploop_io * io, struct file * file,
+			__u32 alloc_head)
+{
+	int err;
+	struct iattr newattrs;
+	loff_t new_size;
+
+	if (file->f_mapping != io->files.mapping)
+		return -EINVAL;
+
+	newattrs.ia_size = (u64)alloc_head << (io->plo->cluster_log + 9);
+	newattrs.ia_valid = ATTR_SIZE;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	if (io->files.em_tree)
+		trim_extent_mappings(io->files.em_tree, newattrs.ia_size>>9);
+	io->files.inode->i_flags &= ~S_SWAPFILE;
+	err = notify_change(F_DENTRY(file), &newattrs, NULL);
+	io->files.inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	new_size = i_size_read(io->files.inode);
+	atomic_long_sub(*io->size_ptr - new_size, &ploop_io_images_size);
+	*io->size_ptr = new_size;
+
+	if (!err) {
+		if (io->files.file == file)
+			err = io->ops->sync(io);
+		else
+			err = file->f_op->fsync(file, 0, LLONG_MAX, 0);
+	}
+
+	return err;
+}
+
+static int dio_start_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	fput(file);
+	return 0;
+}
+
+static void dio_unplug(struct ploop_io * io)
+{
+	/* Need more thinking how to implement unplug */
+}
+
+static int dio_congested(struct ploop_io * io, int bits)
+{
+	struct request_queue *bq;
+
+	bq = bdev_get_queue(io->files.bdev);
+
+	return bdi_congested(&bq->backing_dev_info, bits);
+}
+
+static void dio_queue_settings(struct ploop_io * io, struct request_queue * q)
+{
+	blk_queue_stack_limits(q, bdev_get_queue(io->files.bdev));
+}
+
+static void dio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(GFP_NOFS, 0);
+	if (unlikely(!bio)) {
+		PLOOP_FAIL_REQUEST(preq, -ENOMEM);
+		return;
+	}
+
+	ploop_prepare_io_request(preq);
+	bio->bi_end_io = dio_endio_async;
+	bio->bi_bdev = io->files.bdev;
+	bio->bi_private = preq;
+
+	atomic_inc(&preq->io_count);
+	ploop_acc_ff_out(io->plo, preq->req_rw | bio->bi_rw);
+	submit_bio(WRITE_FLUSH, bio);
+	ploop_complete_io_request(preq);
+}
+
+static int dio_dump(struct ploop_io * io)
+{
+	extern void dump_extent_map(struct extent_map_tree *tree);
+
+	if (io->files.em_tree) {
+		dump_extent_map(io->files.em_tree);
+		return 0;
+	}
+	return -1;
+}
+
+static int dio_autodetect(struct ploop_io * io)
+{
+	struct file  * file  = io->files.file;
+	struct inode * inode = file->f_mapping->host;
+	char         * s_id  = inode->i_sb->s_id;
+
+	int err;
+	mm_segment_t fs;
+	unsigned int flags;
+
+	if (inode->i_sb->s_magic != EXT4_SUPER_MAGIC)
+		return -1; /* not mine */
+
+	if (inode->i_sb->s_bdev == NULL) {
+		printk("File on FS EXT(%s) without backing device\n", s_id);
+		return -1;
+	}
+
+	if (!file->f_op->fallocate)
+		ploop_io_report_fn(file, KERN_WARNING
+					"File on FS w/o fallocate");
+
+	if (!file->f_op->unlocked_ioctl) {
+		printk("Cannot run on EXT4(%s): no unlocked_ioctl\n", s_id);
+		return -1;
+	}
+
+	if (!file->f_op->fsync) {
+		printk("Cannot run on EXT4(%s): no fsync\n", s_id);
+		return -1;
+	}
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	flags = 0;
+	err = file->f_op->unlocked_ioctl(file, FS_IOC_GETFLAGS, (long)&flags);
+	set_fs(fs);
+
+	if (err != 0) {
+		printk("Cannot run on EXT4(%s): failed FS_IOC_GETFLAGS (%d)\n",
+		       s_id, err);
+		return -1;
+	}
+
+	io->files.flags = flags;
+	if (!(flags & EXT4_EXTENTS_FL))
+		ploop_io_report_fn(file, KERN_WARNING "File w/o extents");
+
+	return 0;
+}
+
+static struct ploop_io_ops ploop_io_ops_direct =
+{
+	.id		=	PLOOP_IO_DIRECT,
+	.name		=	"direct",
+	.owner		=	THIS_MODULE,
+
+	.unplug		=	dio_unplug,
+	.congested	=	dio_congested,
+
+	.alloc		=	dio_alloc_sync,
+	.submit		=	dio_submit,
+	.submit_alloc	=	dio_submit_alloc,
+	.post_submit	=	dio_post_submit,
+	.disable_merge	=	dio_disable_merge,
+	.fastmap	=	dio_fastmap,
+	.read_page	=	dio_read_page,
+	.write_page	=	dio_write_page,
+	.sync_read	=	dio_sync_read,
+	.sync_write	=	dio_sync_write,
+	.sync_readvec	=	dio_sync_readvec,
+	.sync_writevec	=	dio_sync_writevec,
+
+	.init		=	dio_init,
+	.destroy	=	dio_destroy,
+	.open		=	dio_open,
+	.sync		=	dio_sync,
+	.stop		=	dio_stop,
+	.prepare_snapshot =	dio_prepare_snapshot,
+	.complete_snapshot =	dio_complete_snapshot,
+	.prepare_merge  =	dio_prepare_merge,
+	.start_merge	=	dio_start_merge,
+	.truncate	=	dio_truncate,
+
+	.queue_settings	=	dio_queue_settings,
+	.issue_flush	=	dio_issue_flush,
+
+	.dump		=	dio_dump,
+
+	.i_size_read	=	generic_i_size_read,
+	.f_mode		=	generic_f_mode,
+
+	.autodetect     =       dio_autodetect,
+};
+
+module_param(max_extent_map_pages, int, 0644);
+MODULE_PARM_DESC(max_extent_map_pages, "Maximal amount of pages taken by all extent map caches");
+module_param(min_extent_map_entries, int, 0644);
+MODULE_PARM_DESC(min_extent_map_entries, "Minimal amount of entries in a single extent map cache");
+
+static int __init pio_direct_mod_init(void)
+{
+	int err;
+
+	if (max_extent_map_pages == 0)
+		max_extent_map_pages = PLOOP_MAX_EXTENT_MAP >> PAGE_SHIFT;
+
+	if (min_extent_map_entries == 0)
+		min_extent_map_entries = 64;
+
+	err = ploop_extent_map_init();
+	if (!err) {
+		err = ploop_register_io(&ploop_io_ops_direct);
+		if (err)
+			ploop_extent_map_exit();
+	}
+
+	return err;
+}
+
+static void __exit pio_direct_mod_exit(void)
+{
+	ploop_unregister_io(&ploop_io_ops_direct);
+	ploop_extent_map_exit();
+	BUG_ON(atomic_long_read(&ploop_io_images_size));
+}
+
+module_init(pio_direct_mod_init);
+module_exit(pio_direct_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/io_direct_events.h
@@ -0,0 +1,49 @@
+/*
+ *  drivers/block/ploop/io_direct_events.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#if !defined(_TRACE_IO_DIRECT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IO_DIRECT_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+#include "io_direct_map.h"
+
+TRACE_EVENT(add_extent_mapping,
+	TP_PROTO(struct extent_map *em),
+
+	TP_ARGS(em),
+
+	TP_STRUCT__entry(
+		__field(sector_t,  start)
+		__field(sector_t,  end)
+		__field(sector_t,  bstart)
+	),
+
+	TP_fast_assign(
+		__entry->start	= em->start;
+		__entry->end	= em->end;
+		__entry->bstart	= em->block_start;
+	),
+
+	TP_printk("start=0x%lx end=0x%lx block_start=0x%lx",
+			__entry->start, __entry->end, __entry->bstart)
+);
+
+#endif /* _TRACE_IO_DIRECT_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE io_direct_events
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- /dev/null
+++ b/drivers/block/ploop/io_direct_map.c
@@ -0,0 +1,863 @@
+/*
+ *  drivers/block/ploop/io_direct_map.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/err.h>
+#include <linux/gfp.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/version.h>
+#include <linux/fs.h>
+#include <linux/buffer_head.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include <linux/ploop/ploop_if.h>
+#include "io_direct_events.h"
+#include "io_direct_map.h"
+
+/* Part of io_direct shared between all the devices.
+ * This code is not good, but it is the best we can do without
+ * modifying the core.
+ *
+ * Keep track of images opened by ploop. Maintain shared extent
+ * maps for shared images, which are open read-only. Top level
+ * deltas, which are open for write, are open exclusively.
+ *
+ * Also take care about setting/clearing S_SWAPFILE and setting
+ * mapping gfp mask to GFP_NOFS.
+ */
+
+struct ploop_mapping
+{
+	struct list_head	list;
+	struct address_space	* mapping;
+	int			readers; /* >0: ro openers; -1: writer */
+	unsigned long		saved_gfp_mask;
+	loff_t			size;
+
+	struct extent_map_tree	extent_root;
+};
+
+static LIST_HEAD(ploop_mappings);
+static DEFINE_SPINLOCK(ploop_mappings_lock);
+
+/* total number of extent_map structures */
+static atomic_t ploop_extent_maps_count = ATOMIC_INIT(0);
+
+static void extent_map_tree_init(struct extent_map_tree *tree);
+static int drop_extent_map(struct extent_map_tree *tree);
+static int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em);
+
+extern atomic_long_t ploop_io_images_size;
+
+/*
+ * ploop_dio_* functions must be called with i_mutex taken.
+ */
+
+struct extent_map_tree *
+ploop_dio_open(struct ploop_io * io, int rdonly)
+{
+	int err;
+	struct ploop_mapping *m, *pm;
+	struct file * file = io->files.file;
+	struct address_space * mapping = file->f_mapping;
+
+	pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL);
+
+	err = 0;
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				if (m->readers < 0)
+					err = -ETXTBSY;
+				else
+					m->readers++;
+			} else {
+				if (m->readers)
+					err = -EBUSY;
+				else
+					m->readers = -1;
+			}
+
+out_unlock:
+			spin_unlock(&ploop_mappings_lock);
+			kfree(pm);
+			if (!err)
+				io->size_ptr = &m->size;
+			return err ? ERR_PTR(err) : &m->extent_root;
+		}
+	}
+
+	if (pm == NULL) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	if (mapping->host->i_flags & S_SWAPFILE) {
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	pm->mapping = mapping;
+	extent_map_tree_init(&pm->extent_root);
+	pm->extent_root.mapping = mapping;
+	pm->readers = rdonly ? 1 : -1;
+	list_add(&pm->list, &ploop_mappings);
+	mapping->host->i_flags |= S_SWAPFILE;
+	io->size_ptr = &pm->size;
+	*io->size_ptr = i_size_read(mapping->host);
+	atomic_long_add(*io->size_ptr, &ploop_io_images_size);
+
+	pm->saved_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping,
+			     pm->saved_gfp_mask & ~__GFP_FS);
+
+	spin_unlock(&ploop_mappings_lock);
+
+	if (strcmp(mapping->host->i_sb->s_type->name, "pcss") == 0) {
+		struct ploop_xops xops;
+		if (file->f_op->unlocked_ioctl) {
+			mm_segment_t fs = get_fs();
+
+			set_fs(KERNEL_DS);
+			xops.magic = 0;
+			err = file->f_op->unlocked_ioctl(file, PLOOP_IOC_INTERNAL, (long)&xops);
+			set_fs(fs);
+			if (err == 0 && xops.magic == PLOOP_INTERNAL_MAGIC)
+				pm->extent_root._get_extent = xops.get_extent;
+		}
+	}
+	return &pm->extent_root;
+}
+
+int
+ploop_dio_close(struct ploop_io * io, int rdonly)
+{
+	struct address_space * mapping = io->files.mapping;
+	struct ploop_mapping *m, *pm = NULL;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				m->readers--;
+			} else {
+				BUG_ON(m->readers != -1);
+				m->readers = 0;
+			}
+
+			if (m->readers == 0) {
+				atomic_long_sub(*io->size_ptr,
+						&ploop_io_images_size);
+				*io->size_ptr = 0;
+				mapping->host->i_flags &= ~S_SWAPFILE;
+				list_del(&m->list);
+				pm = m;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+
+	if (pm) {
+		drop_extent_map(&pm->extent_root);
+		BUG_ON(pm->extent_root.map_size);
+		kfree(pm);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+void ploop_dio_downgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			BUG_ON(m->readers != -1);
+			m->readers = 1;
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+}
+
+int ploop_dio_upgrade(struct ploop_io * io)
+{
+	struct address_space * mapping = io->files.mapping;
+	struct ploop_mapping * m;
+	int err = -ESRCH;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			err = -EBUSY;
+			if (m->readers == 1) {
+				loff_t new_size = i_size_read(io->files.inode);
+				atomic_long_add(new_size - *io->size_ptr,
+						&ploop_io_images_size);
+				*io->size_ptr = new_size;
+
+				m->readers = -1;
+				err = 0;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+	return err;
+}
+
+
+/* The rest of the file is written by Jens Axboe.
+ * I just fixed a few bugs (requests not aligned to the fs block size,
+ * because direct-io aligns to 512 bytes) and removed some useless
+ * functionality.
+ *
+ * In any case, it must be reworked: not only because of the GPL, but
+ * also because it is not good.
+ */
+
+static struct kmem_cache *extent_map_cache;
+
+int __init ploop_extent_map_init(void)
+{
+	extent_map_cache = kmem_cache_create("ploop_itree",
+						sizeof(struct extent_map), 0,
+						SLAB_MEM_SPREAD, NULL
+						);
+	if (!extent_map_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ploop_extent_map_exit(void)
+{
+	if (extent_map_cache)
+		kmem_cache_destroy(extent_map_cache);
+}
+
+static void extent_map_tree_init(struct extent_map_tree *tree)
+{
+	tree->map.rb_node = NULL;
+	INIT_LIST_HEAD(&tree->lru_list);
+	tree->map_size = 0;
+	rwlock_init(&tree->lock);
+}
+
+struct extent_map *ploop_alloc_extent_map(gfp_t mask)
+{
+	struct extent_map *em;
+
+	em = kmem_cache_alloc(extent_map_cache, GFP_NOFS);
+	if (em) {
+		atomic_set(&em->refs, 1);
+		INIT_LIST_HEAD(&em->lru_link);
+		atomic_inc(&ploop_extent_maps_count);
+		em->uninit = false;
+	}
+	return em;
+}
+
+void ploop_extent_put(struct extent_map *em)
+{
+	if (!em)
+		return;
+	if (atomic_dec_and_test(&em->refs)) {
+		atomic_dec(&ploop_extent_maps_count);
+		kmem_cache_free(extent_map_cache, em);
+	}
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, sector_t start,
+				   sector_t end, struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct extent_map *entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct extent_map, rb_node);
+
+		if (end <= entry->start)
+			p = &(*p)->rb_left;
+		else if (start >= entry->end)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+/* Find extent which contains "offset". If there is no such extent,
+ * prev_ret is the first extent following "offset".
+ */
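+/*
+ * Example: with extents [0,8) and [16,24) in the tree, a search for
+ * offset 10 returns NULL and sets *prev_ret to the [16,24) node, while
+ * a search for offset 4 returns the [0,8) node directly.
+ */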
+static struct rb_node *__tree_search(struct rb_root *root, sector_t offset,
+				     struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct extent_map *entry;
+	struct extent_map *prev_entry = NULL;
+
+	while (n) {
+		entry = rb_entry(n, struct extent_map, rb_node);
+		prev = n;
+		prev_entry = entry;
+
+		if (offset < entry->start)
+			n = n->rb_left;
+		else if (offset >= entry->end)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	while (prev && offset >= prev_entry->end) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct extent_map, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+/* Find the first extent which could intersect a range starting at offset.
+ * It does not necessarily contain "offset" itself.
+ */
+static inline struct rb_node *tree_search(struct rb_root *root, sector_t offset)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, offset, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+static int tree_delete(struct rb_root *root, sector_t offset)
+{
+	struct rb_node *node;
+
+	node = __tree_search(root, offset, NULL);
+	if (!node)
+		return -ENOENT;
+	rb_erase(node, root);
+	return 0;
+}
+
+static int mergable_maps(struct extent_map *prev, struct extent_map *next)
+{
+	if (prev->end == next->start &&
+	    next->block_start == extent_map_block_end(prev))
+		return 1;
+	return 0;
+}
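+/*
+ * Example: [0,8) mapped to blocks [100,108) and [8,16) mapped to
+ * [108,116) are mergable, since the logical ranges touch and the
+ * physical blocks continue where the previous extent ends.
+ */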
+
+static inline int purge_lru_mapping(struct extent_map_tree *tree)
+{
+	int max_entries = (max_extent_map_pages << PAGE_SHIFT) /
+		sizeof(struct extent_map);
+
+	return atomic_read(&ploop_extent_maps_count) > max_entries &&
+	       tree->map_size > max(1, min_extent_map_entries) &&
+	       (u64)tree->map_size * atomic_long_read(&ploop_io_images_size) >
+	       (u64)max_entries * i_size_read(tree->mapping->host);
+}
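+/*
+ * For illustration: with max_extent_map_pages allowing max_entries=1024
+ * and an image accounting for 10% of ploop_io_images_size, this tree is
+ * considered over quota once its map_size exceeds ~102 entries (and the
+ * global extent count is above max_entries).
+ */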
+
+static inline void purge_lru_warn(struct extent_map_tree *tree)
+{
+	int max_entries = (max_extent_map_pages << PAGE_SHIFT) /
+		sizeof(struct extent_map);
+
+	loff_t ratio = i_size_read(tree->mapping->host) * 100;
+	do_div(ratio, atomic_long_read(&ploop_io_images_size));
+
+	printk(KERN_WARNING "Purging lru entry from extent tree for inode %ld "
+	       "(map_size=%d ratio=%lld%%)\n",
+	       tree->mapping->host->i_ino, tree->map_size, ratio);
+
+	/* Claim FS as 'too fragmented' if average_extent_size < 8MB */
+	if ((u64)max_entries * (8 * 1024 * 1024) <
+	    atomic_long_read(&ploop_io_images_size))
+		printk(KERN_WARNING "max_extent_map_pages=%d is too low for "
+		       "ploop_io_images_size=%ld bytes\n",
+		       max_extent_map_pages,
+		       atomic_long_read(&ploop_io_images_size));
+	else {
+		loff_t avg_siz = i_size_read(tree->mapping->host);
+		do_div(avg_siz, tree->map_size);
+
+		printk(KERN_WARNING "host fs is too fragmented: average extent"
+		       " size is lesser than %lld bytes\n", avg_siz);
+	}
+}
+
+/*
+ * add_extent_mapping tries a simple forward/backward merge with existing
+ * mappings.  The extent_map struct passed in will be inserted into
+ * the tree directly (no copies made, just a reference taken).
+ */
+static int add_extent_mapping(struct extent_map_tree *tree,
+			      struct extent_map *em)
+{
+	int ret = 0;
+	struct rb_node *rb;
+
+	write_lock_irq(&tree->lock);
+
+	do {
+		rb = tree_insert(&tree->map, em->start, em->end, &em->rb_node);
+		/* A part of this extent can be in tree */
+		if (rb) {
+			struct extent_map *tmp =
+				rb_entry(rb, struct extent_map, rb_node);
+			BUG_ON(tmp->block_start - tmp->start !=
+					em->block_start - em->start);
+			if (tmp->start <= em->start &&
+			    tmp->end >= em->end) {
+				ret =  -EEXIST;
+				goto out;
+			}
+			if (tmp->start < em->start) {
+				em->start = tmp->start;
+				em->block_start = tmp->block_start;
+			}
+			if (tmp->end > em->end)
+				em->end = tmp->end;
+			rb_erase(rb, &tree->map);
+			list_del_init(&tmp->lru_link);
+			tree->map_size--;
+			ploop_extent_put(tmp);
+		} else {
+			list_add_tail(&em->lru_link, &tree->lru_list);
+			tree->map_size++;
+
+			if (purge_lru_mapping(tree)) {
+				struct extent_map *victim_em;
+				static unsigned long purge_lru_time;
+
+				/* Warn about this once per hour */
+				if (printk_timed_ratelimit(&purge_lru_time,
+							   60*60*HZ))
+					purge_lru_warn(tree);
+
+				victim_em = list_entry(tree->lru_list.next,
+						       struct extent_map,
+						       lru_link);
+
+				list_del_init(&victim_em->lru_link);
+				tree->map_size--;
+				rb_erase(&victim_em->rb_node, &tree->map);
+				ploop_extent_put(victim_em);
+			}
+		}
+	} while (rb);
+
+	atomic_inc(&em->refs);
+	if (em->start != 0) {
+		rb = rb_prev(&em->rb_node);
+		if (rb) {
+			struct extent_map *merge;
+
+			merge = rb_entry(rb, struct extent_map, rb_node);
+			if (mergable_maps(merge, em)) {
+				em->start = merge->start;
+				em->block_start = merge->block_start;
+				rb_erase(&merge->rb_node, &tree->map);
+				list_del_init(&merge->lru_link);
+				tree->map_size--;
+				ploop_extent_put(merge);
+			}
+		}
+	}
+	rb = rb_next(&em->rb_node);
+	if (rb) {
+		struct extent_map *merge;
+
+		merge = rb_entry(rb, struct extent_map, rb_node);
+		if (mergable_maps(em, merge)) {
+			em->end = merge->end;
+			rb_erase(&merge->rb_node, &tree->map);
+			list_del_init(&merge->lru_link);
+			tree->map_size--;
+			ploop_extent_put(merge);
+		}
+	}
+
+	trace_add_extent_mapping(em);
+out:
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+struct extent_map *
+extent_lookup(struct extent_map_tree *tree, sector_t start)
+{
+	struct extent_map *em = NULL;
+	struct rb_node *rb_node;
+
+	read_lock(&tree->lock);
+	rb_node = __tree_search(&tree->map, start, NULL);
+	if (rb_node) {
+		em = rb_entry(rb_node, struct extent_map, rb_node);
+		atomic_inc(&em->refs);
+	}
+	read_unlock(&tree->lock);
+
+	if (em) {
+		write_lock(&tree->lock);
+		/* em cannot have been freed (we hold a reference), but
+		 * it may have been removed from the LRU list before we
+		 * re-acquired the lock */
+		if (!list_empty(&em->lru_link)) {
+			list_del(&em->lru_link);
+			list_add_tail(&em->lru_link, &tree->lru_list);
+		}
+		write_unlock(&tree->lock);
+	}
+
+	return em;
+}
+
+/*
+ * lookup_extent_mapping returns the first extent_map struct in the
+ * tree that intersects the [start, start+len) range.  There may
+ * be additional objects in the tree that intersect, so check the object
+ * returned carefully to make sure you don't need additional lookups.
+ */
+static struct extent_map *
+lookup_extent_mapping(struct extent_map_tree *tree, sector_t start, sector_t len)
+{
+	struct extent_map *em;
+	struct rb_node *rb_node;
+
+	read_lock_irq(&tree->lock);
+	rb_node = tree_search(&tree->map, start);
+	if (!rb_node) {
+		em = NULL;
+		goto out;
+	}
+	em = rb_entry(rb_node, struct extent_map, rb_node);
+	if (em->end <= start || em->start >= start + len) {
+		em = NULL;
+		goto out;
+	}
+	atomic_inc(&em->refs);
+
+out:
+	read_unlock_irq(&tree->lock);
+	return em;
+}
+
+/*
+ * removes an extent_map struct from the tree.  No reference counts are
+ * dropped, and no checks are done to see if the range is in use
+ */
+static int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
+{
+	int ret;
+
+	write_lock_irq(&tree->lock);
+	ret = tree_delete(&tree->map, em->start);
+	if (!ret) {
+		list_del_init(&em->lru_link);
+		tree->map_size--;
+	}
+	write_unlock_irq(&tree->lock);
+	return ret;
+}
+
+static struct extent_map *__map_extent_get_extent(struct extent_map_tree *tree,
+						  struct address_space *mapping,
+						  sector_t start, sector_t len, int create,
+						  gfp_t gfp_mask)
+{
+	struct inode *inode = mapping->host;
+	struct extent_map *em;
+	sector_t nstart, result;
+	int ret;
+
+again:
+	em = lookup_extent_mapping(tree, start, len);
+	if (em) {
+		if (em->start <= start && em->end >= start + len)
+			return em;
+
+		/*
+		 * we may have found an extent that starts after the
+		 * requested range.  Double check and alter the length
+		 * appropriately
+		 */
+		if (em->start > start) {
+			len = em->start - start;
+		} else if (!create) {
+			return em;
+		}
+		ploop_extent_put(em);
+	}
+	BUG_ON(gfp_mask & GFP_ATOMIC);
+
+	em = ploop_alloc_extent_map(gfp_mask);
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * FIXME if there are errors later on, we end up exposing stale
+	 * data on disk while filling holes.
+	 *
+	 * _XXX_ Danger! len is reduced above, therefore _get_extent
+	 * does not allocate all that we need. It works only with pcss
+	 * and only when cluster size <= pcss block size and allocation
+	 * is aligned. If we relax those conditions, the code must be fixed.
+	 */
+	ret = tree->_get_extent(inode, start, len, &nstart, &result, create);
+	if (ret < 0) {
+		ploop_extent_put(em);
+		return ERR_PTR(ret);
+	}
+
+	em->start = nstart;
+	em->end = nstart + ret;
+	em->block_start = result;
+
+	ret = add_extent_mapping(tree, em);
+	if (ret == -EEXIST) {
+		ploop_extent_put(em);
+		goto again;
+	}
+	return em;
+}
+
+static struct extent_map *__map_extent_bmap(struct ploop_io *io,
+				       struct address_space *mapping,
+				       sector_t start, sector_t len, gfp_t gfp_mask)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+	struct inode *inode = mapping->host;
+	loff_t start_off = (loff_t)start << 9;
+	struct extent_map *em;
+	struct fiemap_extent_info fieinfo;
+	struct fiemap_extent fi_extent;
+	mm_segment_t old_fs;
+	int ret;
+
+again:
+	em = lookup_extent_mapping(tree, start, len);
+	if (em) {
+		/*
+		 * we may have found an extent that starts after the
+		 * requested range.  Double check and alter the length
+		 * appropriately
+		 */
+		if (em->start > start) {
+			len = em->start - start;
+		} else {
+			return em;
+		}
+		ploop_extent_put(em);
+	}
+
+	BUG_ON(gfp_mask & GFP_ATOMIC);
+
+	if (!inode->i_op->fiemap)
+		return ERR_PTR(-EINVAL);
+
+	em = ploop_alloc_extent_map(gfp_mask);
+	if (!em)
+		return ERR_PTR(-ENOMEM);
+
+	fieinfo.fi_extents_start = &fi_extent;
+	fieinfo.fi_extents_max = 1;
+	fieinfo.fi_flags = 0;
+	fieinfo.fi_extents_mapped = 0;
+	fi_extent.fe_flags = 0;
+
+	old_fs = get_fs();
+	set_fs(KERNEL_DS);
+	ret = inode->i_op->fiemap(inode, &fieinfo, start_off, 1);
+
+	/* chase for PSBM-26762: em->block_start == 0 */
+	if (!ret && fieinfo.fi_extents_mapped == 1 &&
+	    !(fi_extent.fe_flags & FIEMAP_EXTENT_UNWRITTEN) &&
+	    (fi_extent.fe_physical >> 9) == 0) {
+		/* see how ext4_fill_fiemap_extents() is implemented */
+		if (!(fi_extent.fe_flags & FIEMAP_EXTENT_DELALLOC)) {
+			printk("bad fiemap(%ld,%ld) on inode=%p &fieinfo=%p"
+			" i_size=%lld\n", start, len, inode, &fieinfo,
+			i_size_read(inode));
+			BUG();
+		}
+		/* complain about the delalloc case -- ploop always
+		 * fallocates before buffered writes */
+		WARN(1, "ploop%d: delalloc extent [%lld,%lld] for [%lld,%ld];"
+			" i_size=%lld\n", io->plo->index, fi_extent.fe_logical,
+			fi_extent.fe_length, start_off, len << 9, i_size_read(inode));
+		ret = -ENOENT;
+	}
+	set_fs(old_fs);
+
+	if (ret) {
+		ploop_extent_put(em);
+		return ERR_PTR(ret);
+	}
+
+	if (fieinfo.fi_extents_mapped != 1) {
+		if (start_off < i_size_read(inode))
+			ploop_msg_once(io->plo, "a hole in image file detected"
+				       " (mapped=%d i_size=%llu off=%llu)",
+				       fieinfo.fi_extents_mapped,
+				       i_size_read(inode), start_off);
+		ploop_extent_put(em);
+		return ERR_PTR(-EINVAL);
+	}
+
+	em->start = fi_extent.fe_logical >> 9;
+	em->end = (fi_extent.fe_logical + fi_extent.fe_length) >> 9;
+	em->block_start = fi_extent.fe_physical >> 9;
+
+	if (fi_extent.fe_flags & FIEMAP_EXTENT_UNWRITTEN) {
+		em->uninit = true;
+	} else {
+		ret = add_extent_mapping(tree, em);
+		if (ret == -EEXIST) {
+			ploop_extent_put(em);
+			goto again;
+		}
+	}
+	return em;
+}
+
+static struct extent_map *__map_extent(struct ploop_io *io,
+				       struct address_space *mapping,
+				       sector_t start, sector_t len, int create,
+				       gfp_t gfp_mask, get_block_t get_block)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+
+	if (tree->_get_extent)
+		return __map_extent_get_extent(tree, mapping, start, len, create,
+					       gfp_mask);
+	if (create)
+		/* create flag not supported by bmap implementation */
+		return ERR_PTR(-EINVAL);
+
+	return __map_extent_bmap(io, mapping, start, len, gfp_mask);
+}
+
+struct extent_map *map_extent_get_block(struct ploop_io *io,
+					struct address_space *mapping,
+					sector_t start, sector_t len, int create,
+					gfp_t gfp_mask, get_block_t get_block)
+{
+	struct extent_map *em;
+	sector_t last;
+	sector_t map_ahead_len = 0;
+
+	em = __map_extent(io, mapping, start, len, create,
+			  gfp_mask, get_block);
+
+	/*
+	 * if we're doing a write or we found a large extent, return it
+	 */
+	if (IS_ERR(em) || !em || create || start + len < em->end) {
+		return em;
+	}
+
+	/*
+	 * otherwise, try to walk forward a bit and see if we can build
+	 * something bigger.
+	 */
+	do {
+		/* avoid race with userspace merge */
+		if (em->end >=
+		    ((sector_t)io->alloc_head << io->plo->cluster_log))
+			break;
+
+		last = em->end;
+		ploop_extent_put(em);
+		em = __map_extent(io, mapping, last, len, create,
+				  gfp_mask, get_block);
+		if (IS_ERR(em) || !em)
+			break;
+		map_ahead_len += em->end - last;
+	} while (em->start <= start && start + len <= em->end &&
+		 map_ahead_len < 1024);
+
+	/* make sure we return the extent for this range */
+	if (!em || IS_ERR(em) || em->start > start ||
+	    start + len > em->end) {
+		if (em && !IS_ERR(em))
+			ploop_extent_put(em);
+		em = __map_extent(io, mapping, start, len, create,
+				  gfp_mask, get_block);
+	}
+	return em;
+}
+
+
+struct extent_map *extent_lookup_create(struct ploop_io *io,
+					sector_t start, sector_t len)
+{
+	struct extent_map_tree *tree = io->files.em_tree;
+
+	return map_extent_get_block(io, tree->mapping,
+				    start, len, 0, mapping_gfp_mask(tree->mapping),
+				    NULL);
+}
+
+static int drop_extent_map(struct extent_map_tree *tree)
+{
+	struct extent_map *em;
+	struct rb_node * node;
+
+	write_lock_irq(&tree->lock);
+	while ((node = tree->map.rb_node) != NULL) {
+		em = rb_entry(node, struct extent_map, rb_node);
+		rb_erase(node, &tree->map);
+		list_del_init(&em->lru_link);
+		tree->map_size--;
+		ploop_extent_put(em);
+	}
+	write_unlock_irq(&tree->lock);
+	return 0;
+}
+
+void trim_extent_mappings(struct extent_map_tree *tree, sector_t start)
+{
+	struct extent_map *em;
+
+	while ((em = lookup_extent_mapping(tree, start, ((sector_t)(-1ULL)) - start))) {
+		remove_extent_mapping(tree, em);
+		WARN_ON(atomic_read(&em->refs) != 2);
+		/* once for us */
+		ploop_extent_put(em);
+		/* No concurrent lookups due to ploop_quiesce(). See WARN_ON above */
+		/* once for the tree */
+		ploop_extent_put(em);
+	}
+}
+
+
+void dump_extent_map(struct extent_map_tree *tree)
+{
+	struct rb_node * r = rb_first(&tree->map);
+
+	while (r) {
+		struct extent_map *em0 = rb_entry(r, struct extent_map, rb_node);
+		printk("N=%ld %ld -> %ld\n", (long)em0->start, (long)(em0->end - em0->start), (long)em0->block_start);
+		r = rb_next(r);
+	}
+}
+
--- /dev/null
+++ b/drivers/block/ploop/io_direct_map.h
@@ -0,0 +1,68 @@
+/*
+ *  drivers/block/ploop/io_direct_map.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __INTERVAL_TREE_H__
+#define __INTERVAL_TREE_H__
+
+#include <linux/rbtree.h>
+
+struct extent_map_tree
+{
+	struct rb_root map;
+	struct list_head lru_list;
+	unsigned int map_size; /* # entries in map */
+	rwlock_t lock;
+	struct address_space * mapping;
+	int (*_get_extent)(struct inode *inode, sector_t isec,
+			   unsigned int nr, sector_t *start,
+			   sector_t *psec, int creat);
+};
+
+struct extent_map
+{
+	struct rb_node rb_node;
+	struct list_head lru_link;
+
+	sector_t	start;
+	sector_t	end;
+
+	sector_t	block_start;
+
+	atomic_t refs;
+
+	bool uninit;
+};
+
+extern int max_extent_map_pages;
+extern int min_extent_map_entries;
+
+static inline sector_t extent_map_block_end(struct extent_map *em)
+{
+	return em->block_start + (em->end - em->start);
+}
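+/*
+ * Example: an extent with start=0, end=8, block_start=100 maps logical
+ * sectors [0,8) to physical sectors [100,108), so its block end is 108.
+ */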
+
+struct extent_map *extent_lookup_create(struct ploop_io *io,
+					sector_t start, sector_t len);
+struct extent_map *extent_lookup(struct extent_map_tree *tree,
+				 sector_t start);
+void ploop_extent_put(struct extent_map *em);
+
+struct extent_map *map_extent_get_block(struct ploop_io *io,
+					struct address_space *mapping,
+					sector_t start, sector_t len, int create,
+					gfp_t gfp_mask, get_block_t get_block);
+void trim_extent_mappings(struct extent_map_tree *tree, sector_t start);
+
+int ploop_dio_close(struct ploop_io * io, int rdonly);
+struct extent_map_tree * ploop_dio_open(struct ploop_io * io, int rdonly);
+void ploop_dio_downgrade(struct address_space * mapping);
+int ploop_dio_upgrade(struct ploop_io * io);
+
+int __init ploop_extent_map_init(void);
+void ploop_extent_map_exit(void);
+
+#endif
--- /dev/null
+++ b/drivers/block/ploop/io_kaio.c
@@ -0,0 +1,1056 @@
+/*
+ *  drivers/block/ploop/io_kaio.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/file.h>
+#include <linux/pagemap.h>
+#include <linux/kthread.h>
+#include <linux/mount.h>
+#include <linux/aio.h>
+
+#include <linux/ploop/ploop.h>
+
+/* from fs/fuse/inode.c */
+#define FUSE_SUPER_MAGIC 0x65735546
+
+#define KAIO_PREALLOC (128 * 1024 * 1024) /* 128 MB */
+
+#define KAIO_MAX_PAGES_PER_REQ 32	  /* 128 KB */
+
+/* This will be used as flag "ploop_kaio_open() succeeded" */
+static struct extent_map_tree
+{
+} dummy_em_tree;
+
+int ploop_kaio_open(struct file * file, int rdonly);
+int ploop_kaio_close(struct address_space * mapping, int rdonly);
+void ploop_kaio_downgrade(struct address_space * mapping);
+int ploop_kaio_upgrade(struct address_space * mapping);
+
+static int __kaio_truncate(struct ploop_io * io, struct file * file, u64 pos);
+static int kaio_truncate(struct ploop_io * io, struct file * file, __u32 a_h);
+
+static void __kaio_queue_fsync_req(struct ploop_request * preq, int prio)
+{
+	struct ploop_device * plo   = preq->plo;
+	struct ploop_delta  * delta = ploop_top_delta(plo);
+	struct ploop_io     * io    = &delta->io;
+
+	if (prio)
+		list_add(&preq->list, &io->fsync_queue);
+	else
+		list_add_tail(&preq->list, &io->fsync_queue);
+
+	io->fsync_qlen++;
+	if (waitqueue_active(&io->fsync_waitq))
+		wake_up_interruptible(&io->fsync_waitq);
+}
+
+static void kaio_queue_fsync_req(struct ploop_request * preq)
+{
+	__kaio_queue_fsync_req(preq, 0);
+}
+
+static void kaio_queue_trunc_req(struct ploop_request * preq)
+{
+	__kaio_queue_fsync_req(preq, 1);
+}
+
+static void kaio_complete_io_state(struct ploop_request * preq)
+{
+	struct ploop_device * plo   = preq->plo;
+	unsigned long flags;
+	int post_fsync = 0;
+	int need_fua = !!(preq->req_rw & REQ_FUA);
+	unsigned long state = READ_ONCE(preq->state);
+	int reloc = !!(state & (PLOOP_REQ_RELOC_A_FL|
+				PLOOP_REQ_RELOC_S_FL|
+				PLOOP_REQ_RELOC_N_FL));
+
+	if (preq->error || !(preq->req_rw & REQ_FUA) ||
+	    preq->eng_state == PLOOP_E_INDEX_READ ||
+	    preq->eng_state == PLOOP_E_TRANS_INDEX_READ ||
+	    preq->eng_state == PLOOP_E_DELTA_READ ||
+	    preq->eng_state == PLOOP_E_TRANS_DELTA_READ) {
+		ploop_complete_io_state(preq);
+		return;
+	}
+
+	/* Convert requested fua to fsync */
+	if (test_and_clear_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state) ||
+	    (need_fua && !ploop_req_delay_fua_possible(preq)) ||
+	    (reloc && ploop_req_delay_fua_possible(preq))) {
+		post_fsync = 1;
+		preq->req_rw &= ~REQ_FUA;
+	}
+
+	if (post_fsync) {
+		spin_lock_irqsave(&plo->lock, flags);
+		kaio_queue_fsync_req(preq);
+		plo->st.bio_syncwait++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	} else {
+		ploop_complete_io_state(preq);
+	}
+}
+
+static void kaio_complete_io_request(struct ploop_request * preq)
+{
+	if (atomic_dec_and_test(&preq->io_count))
+		kaio_complete_io_state(preq);
+}
+
+struct kaio_req {
+	struct ploop_request *preq;
+	struct bio_vec	      bvecs[0];
+};
+
+static void kaio_rw_aio_complete(u64 data, long res)
+{
+	struct ploop_request * preq = (struct ploop_request *)data;
+
+	if (unlikely(res < 0)) {
+		struct bio *b = preq->aux_bio;
+		printk("kaio_rw_aio_complete: kaio failed with err=%ld "
+		       "(rw=%s; state=%ld/0x%lx; clu=%d; iblk=%d; aux=%ld)\n",
+		       res, (preq->req_rw & REQ_WRITE) ? "WRITE" : "READ",
+		       preq->eng_state, preq->state, preq->req_cluster,
+		       preq->iblock, b ? b->bi_sector : -1);
+		bio_list_for_each(b, &preq->bl)
+			printk(" bio=%p: bi_sector=%ld bi_size=%d\n",
+			       b, b->bi_sector, b->bi_size);
+		PLOOP_REQ_SET_ERROR(preq, res);
+	}
+
+	kaio_complete_io_request(preq);
+}
+
+static void kaio_rw_kreq_complete(u64 data, long res)
+{
+	struct kaio_req *kreq = (struct kaio_req *)data;
+	struct ploop_request *preq = kreq->preq;
+
+	kfree(kreq);
+	kaio_rw_aio_complete((u64)preq, res);
+}
+
+static struct kaio_req *kaio_kreq_alloc(struct ploop_request *preq, int *nr_p)
+{
+	static const int nr = KAIO_MAX_PAGES_PER_REQ;
+	struct kaio_req *kreq;
+
+	kreq = kmalloc(offsetof(struct kaio_req, bvecs[nr]), GFP_NOFS);
+	if (kreq) {
+		*nr_p = nr;
+		kreq->preq = preq;
+	}
+
+	return kreq;
+}
+
+static int kaio_kernel_submit(struct file *file, struct kaio_req *kreq,
+		size_t nr_segs, size_t count, loff_t pos, unsigned long rw)
+{
+	struct kiocb *iocb;
+	unsigned short op;
+	struct iov_iter iter;
+	int err;
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb)
+		return -ENOMEM;
+
+	if (rw & REQ_WRITE)
+		op = IOCB_CMD_WRITE_ITER;
+	else
+		op = IOCB_CMD_READ_ITER;
+
+	iov_iter_init_bvec(&iter, kreq->bvecs, nr_segs, count, 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_rw_kreq_complete, (u64)kreq);
+
+	err = aio_kernel_submit(iocb);
+	if (err)
+		printk("kaio_kernel_submit: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; state=%ld/0x%lx; pos=%lld; len=%ld)\n",
+		       err, (rw & REQ_WRITE) ? "WRITE" : "READ",
+		       kreq->preq->eng_state, kreq->preq->state, pos, count);
+	return err;
+}
+
+/*
+ * Pack as many bios from the list pointed to by '*bio_pp' into kreq as possible,
+ * but no more than 'size' bytes. Returns 'copy' equal to # bytes copied.
+ *
+ * <*bio_pp, *idx_p> plays the role of iterator to walk through bio list.
+ * NB: the iterator is valid only while 'size' > 'copy'
+ *
+ * NB: at enter, '*nr_segs' depicts capacity of kreq;
+ *     at return, it depicts actual payload
+ */
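+/*
+ * Example: with a freshly allocated kreq (*nr_segs == 32) and a first
+ * bio carrying 40 bvecs, one call copies the first 32 bvecs and leaves
+ * <*bio_pp, *idx_p> pointing at bvec 32 of the same bio, provided
+ * 'size' has not been reached earlier.
+ */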
+static size_t kaio_kreq_pack(struct kaio_req *kreq, int *nr_segs,
+			     struct bio **bio_pp, int *idx_p, size_t size)
+{
+	int kreq_nr_max = *nr_segs;
+	struct bio *b = *bio_pp;
+	int idx = *idx_p;
+	struct bio_vec *src_bv = b->bi_io_vec + idx;
+	struct bio_vec *dst_bv = kreq->bvecs;
+	size_t copy = 0;
+
+	BUG_ON(b->bi_idx);
+
+	while (1) {
+		int nr = min_t(int, kreq_nr_max, b->bi_vcnt - idx);
+		BUG_ON(!nr);
+
+		memcpy(dst_bv, src_bv, nr * sizeof(struct bio_vec));
+
+		copy += bvec_length(dst_bv, nr);
+		if (copy >= size) {
+			*nr_segs = dst_bv - kreq->bvecs + nr;
+			return size;
+		}
+
+		dst_bv += nr;
+		src_bv += nr;
+		idx += nr;
+
+		if (b->bi_vcnt == idx) {
+			b = b->bi_next;
+			BUG_ON(!b);
+			src_bv = b->bi_io_vec;
+			idx = 0;
+		}
+
+		kreq_nr_max -= nr;
+		if (kreq_nr_max == 0)
+			break;
+	}
+
+	*bio_pp = b;
+	*idx_p = idx;
+	return copy;
+}
+
+/*
+ * WRITE case:
+ *
+ * sbl is the list of bio; the first bio in the list and iblk specify
+ * destination file offset; the content of bios in sbl is scattered source
+ * buffer.
+ *
+ * The goal is to write source buffer to the file with given offset. We're
+ * doing it by stuffing as many bvecs from source to kreqs as possible and
+ * submitting kreqs to in-kernel aio.
+ *
+ * READ case:
+ *
+ * The same as WRITE, but here the file plays the role of source and the
+ * content of bios in sbl plays the role of destination.
+ */
+static void kaio_sbl_submit(struct file *file, struct ploop_request *preq,
+			    unsigned long rw, struct bio_list *sbl,
+			    iblock_t iblk, size_t size)
+{
+	struct bio *bio = sbl->head;
+	int idx = 0;
+
+	loff_t off = bio->bi_sector;
+	off = ((loff_t)iblk << preq->plo->cluster_log) |
+		(off & ((1<<preq->plo->cluster_log) - 1));
+
+	if (rw & REQ_WRITE)
+		ploop_prepare_tracker(preq, off);
+
+	off <<= 9;
+	/* since now 'off' always points to a position in the file to X-mit */
+
+	WARN_ONCE(!(file->f_flags & O_DIRECT), "File opened w/o O_DIRECT");
+
+	ploop_prepare_io_request(preq);
+
+	size <<= 9;
+	while (size > 0) {
+		struct kaio_req *kreq;
+		int nr_segs;
+		size_t copy;
+		int err;
+
+		kreq = kaio_kreq_alloc(preq, &nr_segs);
+		if (!kreq) {
+			PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+			break;
+		}
+
+		copy = kaio_kreq_pack(kreq, &nr_segs, &bio, &idx, size);
+
+		atomic_inc(&preq->io_count);
+		err = kaio_kernel_submit(file, kreq, nr_segs, copy, off, rw);
+		if (err) {
+			PLOOP_REQ_SET_ERROR(preq, err);
+			ploop_complete_io_request(preq);
+			kfree(kreq);
+			break;
+		}
+
+		off += copy;
+		size -= copy;
+	}
+
+	kaio_complete_io_request(preq);
+}
+
+static void
+kaio_submit(struct ploop_io *io, struct ploop_request * preq,
+	     unsigned long rw,
+	     struct bio_list *sbl, iblock_t iblk, unsigned int size)
+{
+	if (rw & REQ_FLUSH) {
+		spin_lock_irq(&io->plo->lock);
+		kaio_queue_fsync_req(preq);
+		io->plo->st.bio_syncwait++;
+		spin_unlock_irq(&io->plo->lock);
+		return;
+	}
+
+	if (iblk == PLOOP_ZERO_INDEX)
+		iblk = 0;
+
+	kaio_sbl_submit(io->files.file, preq, rw, sbl, iblk, size);
+}
+
+/* returns non-zero if and only if preq was resubmitted */
+static int kaio_resubmit(struct ploop_request * preq)
+{
+	struct ploop_delta * delta = ploop_top_delta(preq->plo);
+
+	switch (preq->eng_state) {
+	case PLOOP_E_ENTRY:
+		return 0;
+	case PLOOP_E_COMPLETE:
+	case PLOOP_E_RELOC_NULLIFY:
+	case PLOOP_E_DATA_WBI:
+		if (preq->aux_bio) {
+			struct bio_list tbl;
+			tbl.head = tbl.tail = preq->aux_bio;
+			kaio_submit(&delta->io, preq, preq->req_rw, &tbl,
+				    preq->iblock, 1<<preq->plo->cluster_log);
+		} else {
+			kaio_submit(&delta->io, preq, preq->req_rw, &preq->bl,
+				    preq->iblock, preq->req_size);
+		}
+		break;
+	case PLOOP_E_TRANS_DELTA_READ:
+		/* BUG_ON below guarantees that 'case PLOOP_E_DELTA_COPIED'
+		 * is equivalent to the part of 'case PLOOP_E_TRANS_DELTA_READ'
+		 * after bio_bcopy(). This is not trivial. */
+		BUG_ON(!test_bit(PLOOP_REQ_TRANS, &preq->state));
+		/* Fall through ... */
+	case PLOOP_E_DELTA_READ:
+		preq->eng_state = PLOOP_E_DELTA_COPIED; /* skip bcopy() */
+		return 0;
+	default:
+		printk("Resubmit bad state %lu\n", preq->eng_state);
+		BUG();
+	}
+
+	return 1;
+}
+
+static inline int io2level(struct ploop_io * io)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+	return delta->level;
+}
+
+static int kaio_fsync_thread(void * data)
+{
+	struct ploop_io * io = data;
+	struct ploop_device * plo = io->plo;
+
+	set_user_nice(current, -20);
+
+	spin_lock_irq(&plo->lock);
+	while (!kthread_should_stop() || !list_empty(&io->fsync_queue)) {
+		int err;
+		struct ploop_request * preq;
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&io->fsync_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (!list_empty(&io->fsync_queue) ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&plo->lock);
+			schedule();
+			spin_lock_irq(&plo->lock);
+		}
+		finish_wait(&io->fsync_waitq, &_wait);
+
+		if (list_empty(&io->fsync_queue) && kthread_should_stop())
+			break;
+
+		preq = list_entry(io->fsync_queue.next, struct ploop_request, list);
+		list_del(&preq->list);
+		io->fsync_qlen--;
+		if (!preq->prealloc_size)
+			plo->st.bio_fsync++;
+		spin_unlock_irq(&plo->lock);
+
+		/* trick: preq->prealloc_size is actually the new EOF position */
+		if (preq->prealloc_size) {
+			err = kaio_truncate(io, io->files.file,
+					    preq->prealloc_size >> (plo->cluster_log + 9));
+			if (err)
+				PLOOP_REQ_SET_ERROR(preq, -EIO);
+		} else {
+			struct file *file = io->files.file;
+			err = vfs_fsync(file, 1);
+			if (err) {
+				printk("kaio_fsync_thread: vfs_fsync failed "
+				       "with err=%d (i_ino=%ld of level=%d "
+				       "on ploop%d)\n",
+				       err, io->files.inode->i_ino,
+				       io2level(io), plo->index);
+				PLOOP_REQ_SET_ERROR(preq, -EIO);
+			} else if (preq->req_rw & REQ_FLUSH) {
+				BUG_ON(!preq->req_size);
+				preq->req_rw &= ~REQ_FLUSH;
+				if (kaio_resubmit(preq)) {
+					spin_lock_irq(&plo->lock);
+					continue;
+				}
+			}
+		}
+
+		spin_lock_irq(&plo->lock);
+		list_add_tail(&preq->list, &plo->ready_queue);
+
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+	}
+	spin_unlock_irq(&plo->lock);
+	return 0;
+}
+
+static void
+kaio_submit_alloc(struct ploop_io *io, struct ploop_request * preq,
+		 struct bio_list * sbl, unsigned int size)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+	iblock_t iblk;
+	int log = preq->plo->cluster_log + 9;
+	loff_t clu_siz = 1 << log;
+
+	if (delta->flags & PLOOP_FMT_RDONLY) {
+		PLOOP_FAIL_REQUEST(preq, -EBADF);
+		return;
+	}
+
+	iblk = io->alloc_head;
+
+	if (unlikely(preq->req_rw & REQ_FLUSH)) {
+		spin_lock_irq(&io->plo->lock);
+		kaio_queue_fsync_req(preq);
+		io->plo->st.bio_syncwait++;
+		spin_unlock_irq(&io->plo->lock);
+		return;
+	}
+
+	BUG_ON(preq->prealloc_size);
+
+	if (unlikely(io->prealloced_size < clu_siz)) {
+		if (!io->prealloc_preq) {
+			loff_t pos = (((loff_t)(iblk + 1)  << log) |
+				      (KAIO_PREALLOC - 1)) + 1;
+
+			BUG_ON(preq->prealloc_size);
+			preq->prealloc_size = pos;
+			io->prealloc_preq   = preq;
+
+			spin_lock_irq(&io->plo->lock);
+			kaio_queue_trunc_req(preq);
+			io->plo->st.bio_syncwait++;
+			spin_unlock_irq(&io->plo->lock);
+			return;
+		} else { /* we're not first */
+			list_add_tail(&preq->list,
+				      &io->prealloc_preq->delay_list);
+			return;
+		}
+	}
+
+	io->prealloced_size -= clu_siz;
+	io->alloc_head++;
+
+	preq->iblock = iblk;
+	preq->eng_state = PLOOP_E_DATA_WBI;
+
+	kaio_sbl_submit(io->files.file, preq, REQ_WRITE, sbl, iblk, size);
+}
+
+static int kaio_release_prealloced(struct ploop_io * io)
+{
+	int ret;
+
+	if (!io->prealloced_size)
+		return 0;
+
+	ret = kaio_truncate(io, io->files.file, io->alloc_head);
+	if (ret)
+		printk("Can't release %llu prealloced bytes: "
+		       "truncate to %llu failed (%d)\n",
+		       io->prealloced_size,
+		       (loff_t)io->alloc_head << (io->plo->cluster_log + 9),
+		       ret);
+	else
+		io->prealloced_size = 0;
+
+	return ret;
+}
+
+static void
+kaio_destroy(struct ploop_io * io)
+{
+	if (io->files.file) {
+		struct file * file;
+		struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+
+		if (io->fsync_thread) {
+			kthread_stop(io->fsync_thread);
+			io->fsync_thread = NULL;
+		}
+
+		(void)kaio_release_prealloced(io);
+
+		if (io->files.em_tree) {
+			mutex_lock(&io->files.inode->i_mutex);
+			ploop_kaio_close(io->files.mapping, delta->flags & PLOOP_FMT_RDONLY);
+			mutex_unlock(&io->files.inode->i_mutex);
+		}
+
+		file = io->files.file;
+		mutex_lock(&delta->plo->sysfs_mutex);
+		io->files.file = NULL;
+		mutex_unlock(&delta->plo->sysfs_mutex);
+		fput(file);
+	}
+}
+
+static int
+kaio_sync(struct ploop_io * io)
+{
+	struct file *file = io->files.file;
+
+	return vfs_fsync(file, 0);
+}
+
+static int
+kaio_stop(struct ploop_io * io)
+{
+	return 0;
+}
+
+static int
+kaio_init(struct ploop_io * io)
+{
+	INIT_LIST_HEAD(&io->fsync_queue);
+	init_waitqueue_head(&io->fsync_waitq);
+
+	return 0;
+}
+
+static void
+kaio_io_page(struct ploop_io * io, int op, struct ploop_request * preq,
+	     struct page * page, sector_t sec)
+{
+	struct kiocb *iocb;
+	struct iov_iter iter;
+	loff_t pos = (loff_t) sec << 9;
+	struct file *file = io->files.file;
+	int err;
+
+	ploop_prepare_io_request(preq);
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb) {
+		PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+		goto out;
+	}
+
+	iov_iter_init_page(&iter, page, PAGE_SIZE, 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_rw_aio_complete, (u64)preq);
+
+	atomic_inc(&preq->io_count);
+
+	err = aio_kernel_submit(iocb);
+	if (err) {
+		printk("kaio_io_page: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; state=%ld/0x%lx; pos=%lld)\n",
+		       err, (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       preq->eng_state, preq->state, pos);
+		PLOOP_REQ_SET_ERROR(preq, err);
+	}
+
+out:
+	ploop_complete_io_request(preq);
+}
+
+static void
+kaio_read_page(struct ploop_io * io, struct ploop_request * preq,
+		struct page * page, sector_t sec)
+{
+	kaio_io_page(io, IOCB_CMD_READ_ITER, preq, page, sec);
+}
+
+static void
+kaio_write_page(struct ploop_io * io, struct ploop_request * preq,
+		 struct page * page, sector_t sec, unsigned long rw)
+{
+	ploop_prepare_tracker(preq, sec);
+
+	/* No FUA in kaio, convert it to fsync. Don't care
+	   about REQ_FLUSH: only io_direct relies on it,
+	   io_kaio implements delay_fua in another way... */
+	if (rw & REQ_FUA)
+		set_bit(PLOOP_REQ_KAIO_FSYNC, &preq->state);
+
+	kaio_io_page(io, IOCB_CMD_WRITE_ITER, preq, page, sec);
+}
+
+static int
+kaio_sync_readvec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		   sector_t sec)
+{
+	return -EINVAL;
+}
+
+static int
+kaio_sync_writevec(struct ploop_io * io, struct page ** pvec, unsigned int nr,
+		    sector_t sec)
+{
+	return -EINVAL;
+}
+
+struct kaio_comp {
+	struct completion comp;
+	atomic_t count;
+	int error;
+};
+
+static inline void kaio_comp_init(struct kaio_comp * c)
+{
+	init_completion(&c->comp);
+	atomic_set(&c->count, 1);
+	c->error = 0;
+}
+
+static void kaio_sync_io_complete(u64 data, long err)
+{
+	struct kaio_comp *comp = (struct kaio_comp *) data;
+
+	if (unlikely(err < 0)) {
+		if (!comp->error)
+			comp->error = err;
+	}
+
+	if (atomic_dec_and_test(&comp->count))
+		complete(&comp->comp);
+}
+
+static int
+kaio_sync_io(struct ploop_io * io, int op, struct page * page,
+	     unsigned int len, unsigned int off, sector_t sec)
+{
+	struct kiocb *iocb;
+	struct iov_iter iter;
+	struct bio_vec bvec;
+	loff_t pos = (loff_t) sec << 9;
+	struct file *file = io->files.file;
+	struct kaio_comp comp;
+	int err;
+
+	kaio_comp_init(&comp);
+
+	iocb = aio_kernel_alloc(GFP_NOIO);
+	if (!iocb)
+		return -ENOMEM;
+
+	bvec.bv_page = page;
+	bvec.bv_len = len;
+	bvec.bv_offset = off;
+
+	iov_iter_init_bvec(&iter, &bvec, 1, bvec_length(&bvec, 1), 0);
+	aio_kernel_init_iter(iocb, file, op, &iter, pos);
+	aio_kernel_init_callback(iocb, kaio_sync_io_complete, (u64)&comp);
+
+	atomic_inc(&comp.count);
+
+	err = aio_kernel_submit(iocb);
+	if (err) {
+		printk("kaio_sync_io: aio_kernel_submit failed with "
+		       "err=%d (rw=%s; pos=%lld; len=%d off=%d)\n",
+		       err, (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       pos, len, off);
+		comp.error = err;
+		if (atomic_dec_and_test(&comp.count))
+			complete(&comp.comp);
+	}
+
+	if (atomic_dec_and_test(&comp.count))
+		complete(&comp.comp);
+
+	wait_for_completion(&comp.comp);
+
+	if (!err && comp.error)
+		printk("kaio_sync_io: kaio failed with err=%d "
+		       "(rw=%s; pos=%lld; len=%d off=%d)\n",
+		       comp.error,
+		       (op == IOCB_CMD_WRITE_ITER) ? "WRITE" : "READ",
+		       pos, len, off);
+
+	return comp.error;
+}
+
+static int
+kaio_sync_read(struct ploop_io * io, struct page * page, unsigned int len,
+		unsigned int off, sector_t sec)
+{
+	return kaio_sync_io(io, IOCB_CMD_READ_ITER, page, len, off, sec);
+}
+
+static int
+kaio_sync_write(struct ploop_io * io, struct page * page, unsigned int len,
+		 unsigned int off, sector_t sec)
+{
+	int ret;
+
+	ret = kaio_sync_io(io, IOCB_CMD_WRITE_ITER, page, len, off, sec);
+
+	if (sec < io->plo->track_end)
+		ploop_tracker_notify(io->plo, sec);
+
+	return ret;
+}
+
+static int kaio_alloc_sync(struct ploop_io * io, loff_t pos, loff_t len)
+{
+	return __kaio_truncate(io, io->files.file, pos + len);
+}
+
+static int kaio_open(struct ploop_io * io)
+{
+	struct file * file = io->files.file;
+	struct ploop_delta * delta = container_of(io, struct ploop_delta, io);
+	int err;
+
+	if (file == NULL)
+		return -EBADF;
+
+	io->files.mapping = file->f_mapping;
+	io->files.inode = io->files.mapping->host;
+	io->files.bdev = io->files.inode->i_sb->s_bdev;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	err = ploop_kaio_open(file, delta->flags & PLOOP_FMT_RDONLY);
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err)
+		return err;
+
+	io->files.em_tree = &dummy_em_tree;
+
+	if (!(delta->flags & PLOOP_FMT_RDONLY)) {
+		io->fsync_thread = kthread_create(kaio_fsync_thread,
+						  io, "ploop_fsync%d",
+						  delta->plo->index);
+		if (io->fsync_thread == NULL) {
+			ploop_kaio_close(io->files.mapping, 0);
+			return -ENOMEM;
+		}
+
+		wake_up_process(io->fsync_thread);
+	}
+
+	return 0;
+}
+
+static int kaio_prepare_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	struct path   path;
+	int err;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDONLY|O_LARGEFILE|O_DIRECT,
+			   current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host) {
+		fput(file);
+		return -EINVAL;
+	}
+
+	err = vfs_fsync(file, 0);
+	if (err) {
+		fput(file);
+		return err;
+	}
+
+	sd->file = file;
+	return 0;
+}
+
+static int kaio_complete_snapshot(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	int ret;
+
+	ret = kaio_release_prealloced(io);
+	if (ret)
+		return ret;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	ploop_kaio_downgrade(io->files.mapping);
+
+	if (io->fsync_thread) {
+		kthread_stop(io->fsync_thread);
+		io->fsync_thread = NULL;
+	}
+
+	fput(file);
+	return 0;
+}
+
+static int kaio_prepare_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+	struct path   path;
+	int err;
+
+	path.mnt = F_MNT(file);
+	path.dentry = F_DENTRY(file);
+
+	file = dentry_open(&path, O_RDWR|O_LARGEFILE|O_DIRECT, current_cred());
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	/* Sanity checks */
+	if (io->files.mapping != file->f_mapping ||
+	    io->files.inode != file->f_mapping->host) {
+		err = -EINVAL;
+		goto prep_merge_done;
+	}
+
+	err = vfs_fsync(file, 0);
+	if (err)
+		goto prep_merge_done;
+
+	err = ploop_kaio_upgrade(io->files.mapping);
+	if (err)
+		goto prep_merge_done;
+
+	io->fsync_thread = kthread_create(kaio_fsync_thread,
+					  io, "ploop_fsync%d",
+					  io->plo->index);
+	if (io->fsync_thread == NULL) {
+		err = -ENOMEM;
+		goto prep_merge_done;
+	}
+
+	wake_up_process(io->fsync_thread);
+
+	sd->file = file;
+
+prep_merge_done:
+	if (err)
+		fput(file);
+	return err;
+}
+
+static int kaio_start_merge(struct ploop_io * io, struct ploop_snapdata *sd)
+{
+	struct file * file = io->files.file;
+
+	mutex_lock(&io->plo->sysfs_mutex);
+	io->files.file = sd->file;
+	sd->file = NULL;
+	mutex_unlock(&io->plo->sysfs_mutex);
+
+	fput(file);
+	return 0;
+}
+
+static int __kaio_truncate(struct ploop_io * io, struct file * file, u64 pos)
+{
+	int err;
+	struct iattr newattrs;
+
+	if (file->f_mapping != io->files.mapping)
+		return -EINVAL;
+
+	newattrs.ia_size  = pos;
+	newattrs.ia_valid = ATTR_SIZE;
+
+	mutex_lock(&io->files.inode->i_mutex);
+	io->files.inode->i_flags &= ~S_SWAPFILE;
+	err = notify_change(F_DENTRY(file), &newattrs, NULL);
+	io->files.inode->i_flags |= S_SWAPFILE;
+	mutex_unlock(&io->files.inode->i_mutex);
+
+	if (err) {
+		printk("__kaio_truncate(i_ino=%ld of level=%d on ploop%d, "
+		       "pos=%lld): notify_change failed with err=%d "
+		       "(i_size=%lld)\n",
+		       io->files.inode->i_ino, io2level(io), io->plo->index,
+		       pos, err, i_size_read(io->files.inode));
+		return err;
+	}
+
+	err = vfs_fsync(file, 0);
+
+	if (err)
+		printk("__kaio_truncate(i_ino=%ld of level=%d on ploop%d, "
+		       "pos=%lld): vfs_fsync failed with err=%d\n",
+		       io->files.inode->i_ino, io2level(io), io->plo->index,
+		       pos, err);
+
+	return err;
+}
+
+static int kaio_truncate(struct ploop_io * io, struct file * file,
+			  __u32 alloc_head)
+{
+	return __kaio_truncate(io, file,
+			       (u64)alloc_head << (io->plo->cluster_log + 9));
+}
+
+static void kaio_unplug(struct ploop_io * io)
+{
+	/* Needs more thought on how to implement unplug */
+}
+
+static void kaio_queue_settings(struct ploop_io * io, struct request_queue * q)
+{
+	blk_set_stacking_limits(&q->limits);
+	blk_queue_max_write_same_sectors(q, 0);
+}
+
+static void kaio_issue_flush(struct ploop_io * io, struct ploop_request *preq)
+{
+	struct ploop_delta *delta = container_of(io, struct ploop_delta, io);
+
+	preq->req_rw &= ~REQ_FLUSH;
+
+	spin_lock_irq(&io->plo->lock);
+
+	if (delta->flags & PLOOP_FMT_RDONLY)
+		list_add_tail(&preq->list, &io->plo->ready_queue);
+	else
+		kaio_queue_fsync_req(preq);
+
+	spin_unlock_irq(&io->plo->lock);
+}
+
+static int kaio_autodetect(struct ploop_io * io)
+{
+	struct file  * file  = io->files.file;
+	struct inode * inode = file->f_mapping->host;
+
+	if (inode->i_sb->s_magic != FUSE_SUPER_MAGIC)
+		return -1; /* not mine */
+
+	if (!(file->f_flags & O_DIRECT)) {
+		ploop_io_report_fn(file, "File opened w/o O_DIRECT");
+		return -1;
+	}
+
+	if (file->f_mapping->a_ops->direct_IO_bvec == NULL) {
+		printk("Cannot run kaio over fs (%s) w/o direct_IO_bvec\n",
+		       file->f_mapping->host->i_sb->s_type->name);
+		return -1;
+	}
+
+	if (file->f_mapping->a_ops->direct_IO_page == NULL) {
+		printk("Cannot run kaio over fs (%s) w/o direct_IO_page\n",
+		       file->f_mapping->host->i_sb->s_type->name);
+		return -1;
+	}
+
+	return 0;
+}
+
+static struct ploop_io_ops ploop_io_ops_kaio =
+{
+	.id		=	PLOOP_IO_KAIO,
+	.name		=	"kaio",
+	.owner		=	THIS_MODULE,
+
+	.unplug		=	kaio_unplug,
+
+	.alloc		=	kaio_alloc_sync,
+	.submit		=	kaio_submit,
+	.submit_alloc	=	kaio_submit_alloc,
+	.read_page	=	kaio_read_page,
+	.write_page	=	kaio_write_page,
+	.sync_read	=	kaio_sync_read,
+	.sync_write	=	kaio_sync_write,
+	.sync_readvec	=	kaio_sync_readvec,
+	.sync_writevec	=	kaio_sync_writevec,
+
+	.init		=	kaio_init,
+	.destroy	=	kaio_destroy,
+	.open		=	kaio_open,
+	.sync		=	kaio_sync,
+	.stop		=	kaio_stop,
+	.prepare_snapshot =	kaio_prepare_snapshot,
+	.complete_snapshot =	kaio_complete_snapshot,
+	.prepare_merge	=	kaio_prepare_merge,
+	.start_merge	=	kaio_start_merge,
+	.truncate	=	kaio_truncate,
+
+	.queue_settings	=	kaio_queue_settings,
+	.issue_flush	=	kaio_issue_flush,
+
+	.i_size_read	=	generic_i_size_read,
+	.f_mode		=	generic_f_mode,
+
+	.autodetect     =       kaio_autodetect,
+};
+
+static int __init pio_kaio_mod_init(void)
+{
+	return ploop_register_io(&ploop_io_ops_kaio);
+}
+
+static void __exit pio_kaio_mod_exit(void)
+{
+	ploop_unregister_io(&ploop_io_ops_kaio);
+}
+
+module_init(pio_kaio_mod_init);
+module_exit(pio_kaio_mod_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/drivers/block/ploop/io_kaio_map.c
@@ -0,0 +1,133 @@
+/*
+ *  drivers/block/ploop/io_kaio_map.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ploop/ploop.h>
+
+struct ploop_mapping
+{
+	struct list_head	list;
+	struct address_space	* mapping;
+	int			readers;
+};
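+
+/* Convention for 'readers', as inferred from the helpers below: a positive
+ * value counts read-only openers sharing the file, -1 marks one exclusive
+ * writer, and 0 means the mapping is unused and can be freed. */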
+
+static LIST_HEAD(ploop_mappings);
+static DEFINE_SPINLOCK(ploop_mappings_lock);
+
+int ploop_kaio_open(struct file * file, int rdonly)
+{
+	int err = 0;
+	struct ploop_mapping *m, *pm;
+	struct address_space * mapping = file->f_mapping;
+
+	pm = kzalloc(sizeof(struct ploop_mapping), GFP_KERNEL);
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				if (m->readers < 0)
+					err = -ETXTBSY;
+				else
+					m->readers++;
+			} else {
+				if (m->readers)
+					err = -EBUSY;
+				else
+					m->readers = -1;
+			}
+			goto kaio_open_done;
+		}
+	}
+
+	if (pm == NULL) {
+		err = -ENOMEM;
+		goto kaio_open_done;
+	}
+
+	if (mapping->host->i_flags & S_SWAPFILE) {
+		err = -EBUSY;
+		goto kaio_open_done;
+	}
+
+	pm->mapping = mapping;
+	pm->readers = rdonly ? 1 : -1;
+	list_add(&pm->list, &ploop_mappings);
+	pm = NULL;
+	mapping->host->i_flags |= S_SWAPFILE;
+
+kaio_open_done:
+	spin_unlock(&ploop_mappings_lock);
+	kfree(pm);
+	return err;
+}
+
+int ploop_kaio_close(struct address_space * mapping, int rdonly)
+{
+	struct ploop_mapping *m, *pm = NULL;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			if (rdonly) {
+				m->readers--;
+			} else {
+				BUG_ON(m->readers != -1);
+				m->readers = 0;
+			}
+
+			if (m->readers == 0) {
+				mapping->host->i_flags &= ~S_SWAPFILE;
+				list_del(&m->list);
+				pm = m;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+
+	if (pm) {
+		kfree(pm);
+		return 0;
+	}
+	return -ENOENT;
+}
+
+void ploop_kaio_downgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			BUG_ON(m->readers != -1);
+			m->readers = 1;
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+}
+
+int ploop_kaio_upgrade(struct address_space * mapping)
+{
+	struct ploop_mapping * m;
+	int err = -ESRCH;
+
+	spin_lock(&ploop_mappings_lock);
+	list_for_each_entry(m, &ploop_mappings, list) {
+		if (m->mapping == mapping) {
+			err = -EBUSY;
+			if (m->readers == 1) {
+				m->readers = -1;
+				err = 0;
+			}
+			break;
+		}
+	}
+	spin_unlock(&ploop_mappings_lock);
+	return err;
+}
--- /dev/null
+++ b/drivers/block/ploop/map.c
@@ -0,0 +1,1339 @@
+/*
+ *  drivers/block/ploop/map.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Generic engine for mapping virtual blocks (cluster_t) to indices
+ * in the image (iblock_t).
+ *
+ * The mapping is global: it is defined not for some particular delta,
+ * but for the whole disk. Therefore it is abstract and does not depend
+ * on a particular virtual disk format. Of course, for some disk types
+ * it may not be so easy to fetch/update the backing store. Actually,
+ * this engine is tightly bound to the organization of index tables
+ * in ploop1.
+ *
+ * Technically, it is just an array of pages with some meta-information
+ * attached to each page. The array may be highly sparse, so it is kept
+ * in an rbtree keyed by array index cluster_no / (PAGE_SIZE / sizeof(map_index)).
+ *
+ * Sadly, it is completely similar to the linux page cache for a virtual
+ * mapping. "Sadly" is because the linux page cache provides only a
+ * crippled implementation of asynchronous read/writeback, which requires
+ * synchronous waits for completions and does not make any callbacks on
+ * completion. Therefore, we have to redo all the work here.
+ *
+ * Two words about synchronization. All the updates to the map are
+ * made from a single thread. Lookups can happen in an unserialized
+ * context, therefore we protect all critical updates with a spinlock.
+ * RCU could be used too.
+ *
+ * A mapping is UPTODATE when it is in sync with the top delta.
+ * When a mapping is accessed for the first time and there is no mapping
+ * in the top delta, we search for a lower level delta. We could create
+ * an empty mapping instead, and this would have an advantage: when whole
+ * blocks are rewritten, we would not even need the lower deltas (_XXX_).
+ */
+
+#include <linux/module.h>
+#include <linux/version.h>
+
+#include <linux/ploop/ploop.h>
+
+/* This defines a slot in a mapping page. Right now it is 32 bit
+ * and therefore directly matches the ploop1 structure. */
+typedef u32 map_index_t;
+
+#define INDEX_PER_PAGE	(PAGE_SIZE / sizeof(map_index_t))
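+
+/* For illustration: with 4K pages and a 32-bit map_index_t, INDEX_PER_PAGE
+ * is 1024, i.e. one map page translates up to 1024 clusters at once. */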
+
+static struct kmem_cache * ploop_map_cache;
+
+static LIST_HEAD(map_lru);
+static DEFINE_SPINLOCK(map_lru_lock);
+static atomic_t map_pages_nr = ATOMIC_INIT(0);
+
+/*
+ * Additional information kept for each page:
+ * 1. rb tree link
+ * 2. Page
+ * 3. mn_start, mn_end - the first and the last cluster index
+ *    (respectively) which the page maps to iblocks.
+ * 4. lru linkage
+ * 5. delta level of the whole page, i.e. the delta where this page
+ *    is backed.
+ * 6. Array of delta levels for each map_index in the page.
+ *    If the page is backed at level N, those levels cannot be >N.
+ *    If all the levels == N, the array of levels is not allocated.
+ *    When at least one level < N, it is stored in the array.
+ *    Note that in this case exporting the page to disk implies
+ *    clearing the irrelevant entries.
+ */
+
+struct map_node
+{
+	struct rb_node		rb_link;
+	cluster_t		mn_start;
+	cluster_t		mn_end;
+	unsigned long		state;
+	atomic_t		refcnt;
+	struct ploop_map	*parent;
+
+	struct page		*page;
+	struct list_head	lru;
+	u8			*levels;
+
+	/* List of preq's blocking on this mapping.
+	 *
+	 * We queue here several kinds of requests:
+	 * 1. If mapping is not uptodate, all the requests which need
+	 *    this mapping are queued here. preq state is ENTRY.
+	 * 2. If preq requires index update and it is delayed
+	 *    because writeback is in progress. preq state is INDEX_DELAY,
+	 *    new index is kept in preq->iblock.
+	 * 3. If preq's started index update, preq state is INDEX_WB,
+	 *    new indices are sent to io, but they are not inserted
+	 *    into mapping until writeback is complete.
+	 */
+	struct list_head	io_queue;
+};
+
+cluster_t map_get_mn_end(struct map_node *m)
+{
+	return m->mn_end;
+}
+
+#define MAP_LEVEL(m)		((m)->state & 0xFF)
+#define MAP_SET_LEVEL(m, l)	((m)->state = ((m)->state & ~0xFF) | (l))
+
+#define MAP_UPTODATE(m)		(((m)->state >> 8) & 0xFFUL)
+#define MAP_SET_UPTODATE(m, l)	((m)->state = ((m)->state & ~0xFF00UL) | ((l)<<8))
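+
+/* Layout of map_node::state implied by the macros above: bits 0..7 hold
+ * the backing delta level, bits 8..15 the level down to which lower-delta
+ * indices have been merged in (see map_index_fault()), and bits 16+ are
+ * the flag bits defined below. */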
+
+enum {
+	PLOOP_MAP_UPTODATE	= 16,	/* Mapping is in sync with top_delta,
+					 * we can write index. But zero entries
+					 * still require read lower delta indices.
+					 */
+	PLOOP_MAP_READ		= 17,	/* Mapping read is scheduled */
+	PLOOP_MAP_WRITEBACK	= 18,	/* Mapping is under writeback */
+	PLOOP_MAP_ERROR		= 19,	/* Mapping is baaad */
+};
+
+void map_init(struct ploop_device * plo, struct ploop_map * map)
+{
+	INIT_LIST_HEAD(&map->delta_list);
+	map->flags = 0;
+	map->last_activity = jiffies;
+	map->plo = plo;
+	map->rb_root = RB_ROOT;
+	map->lru_buffer_ptr = 0;
+	init_waitqueue_head(&map->destroy_waitq);
+}
+
+/* Deliver batch of LRU updates from buffer to global LRU.
+ * Everything, which has zero refcnt, is added to LRU or moved to tail
+ * of LRU. Everything, which has non-zero refcnt, is removed from LRU.
+ */
+static void flush_lru_buffer(struct ploop_map * map)
+{
+	int i;
+	unsigned long flags;
+
+	spin_lock_irqsave(&map_lru_lock, flags);
+	for (i = 0; i < map->lru_buffer_ptr; i++) {
+		struct map_node * m = map->lru_buffer[i];
+		if (atomic_dec_and_test(&m->refcnt))
+			list_move_tail(&m->lru, &map_lru);
+		else
+			list_del_init(&m->lru);
+	}
+	spin_unlock_irqrestore(&map_lru_lock, flags);
+
+	map->lru_buffer_ptr = 0;
+}
+
+/*
+ * map_release() must be called under plo->lock, because
+ * the pair atomic_read & atomic_dec_and_test is not atomic.
+ */
+void map_release(struct map_node * m)
+{
+	struct ploop_map * map = m->parent;
+
+	if (atomic_read(&m->refcnt) == 1) {
+		if (!list_empty(&m->lru))
+			return;
+		if (map->lru_buffer_ptr == PLOOP_LRU_BUFFER)
+			flush_lru_buffer(map);
+		map->lru_buffer[map->lru_buffer_ptr++] = m;
+		return;
+	}
+	if (atomic_dec_and_test(&m->refcnt))
+		BUG();
+}
+
+static inline void cond_flush_lru_buffer(struct ploop_map * map)
+{
+	if (map->lru_buffer_ptr == PLOOP_LRU_BUFFER)
+		flush_lru_buffer(map);
+}
+
+
+static struct map_node * map_lookup(struct ploop_map * map, cluster_t block)
+{
+	struct rb_node * n = map->rb_root.rb_node;
+	struct map_node * m;
+
+	while (n) {
+		m = rb_entry(n, struct map_node, rb_link);
+
+		if (block < m->mn_start)
+			n = n->rb_left;
+		else if (block > m->mn_end)
+			n = n->rb_right;
+		else
+			return m;
+	}
+	return NULL;
+}
+
+/* Lookup mapping atomically. */
+
+int ploop_fastmap(struct ploop_map * map, cluster_t block, iblock_t *result)
+{
+	struct map_node * m;
+	u32 idx;
+	map_index_t blk;
+
+	if (unlikely(block >= map->max_index))
+		return -1;
+
+	if (test_bit(PLOOP_MAP_IDENTICAL, &map->flags)) {
+		*result = block;
+		return 0;
+	}
+
+	m = map_lookup(map, block);
+	if (m == NULL)
+		return -1;
+
+	if (atomic_read(&m->refcnt) == 0) {
+		cond_flush_lru_buffer(map);
+		if (atomic_read(&m->refcnt) == 0) {
+			atomic_inc(&m->refcnt);
+			map->lru_buffer[map->lru_buffer_ptr++] = m;
+		}
+	}
+	map->last_activity = jiffies;
+
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return -1;
+
+	idx = (block + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	blk = ((map_index_t *)page_address(m->page))[idx] >>
+	       ploop_map_log(map->plo);
+
+	if (blk) {
+		*result = blk;
+		if (m->levels)
+			return m->levels[idx];
+		else
+			return MAP_LEVEL(m);
+	}
+	return -1;
+}
+
+static void map_node_destroy(struct map_node *m)
+{
+	rb_erase(&m->rb_link, &m->parent->rb_root);
+	list_del_init(&m->lru);
+	BUG_ON(atomic_read(&m->refcnt));
+	BUG_ON(!list_empty(&m->io_queue));
+	if (m->page)
+		put_page(m->page);
+	kfree(m->levels);
+	m->parent->pages--;
+	atomic_dec(&map_pages_nr);
+	kmem_cache_free(ploop_map_cache, m);
+}
+
+static void map_lru_scan(void)
+{
+	int max_loops = atomic_read(&map_pages_nr);
+
+	while (atomic_read(&map_pages_nr) > max_map_pages &&
+	       --max_loops >= 0) {
+		struct ploop_map * map;
+		struct map_node * candidate = NULL;
+
+		spin_lock_irq(&map_lru_lock);
+		if (!list_empty(&map_lru)) {
+			candidate = list_first_entry(&map_lru, struct map_node, lru);
+			atomic_inc(&candidate->refcnt);
+		}
+		spin_unlock_irq(&map_lru_lock);
+
+		if (!candidate)
+			break;
+
+		map = candidate->parent;
+
+		spin_lock_irq(&map->plo->lock);
+		spin_lock(&map_lru_lock);
+
+		if (waitqueue_active(&map->destroy_waitq)) {
+			atomic_dec(&candidate->refcnt);
+			wake_up(&map->destroy_waitq);
+			spin_unlock(&map_lru_lock);
+			spin_unlock_irq(&map->plo->lock);
+			return;
+		}
+
+		list_del_init(&candidate->lru);
+
+		if (atomic_dec_and_test(&candidate->refcnt)) {
+			/* This instance is within its limits, just
+			 * re-add the node back to the tail of the lru.
+			 */
+			if (map->pages <= map->plo->tune.min_map_pages &&
+			    time_after(map->last_activity +
+				       map->plo->tune.max_map_inactivity, jiffies) &&
+			    !test_bit(PLOOP_MAP_DEAD, &map->flags)) {
+				list_add_tail(&candidate->lru, &map_lru);
+			} else {
+				map_node_destroy(candidate);
+			}
+		}
+		spin_unlock(&map_lru_lock);
+		spin_unlock_irq(&map->plo->lock);
+
+		/* take a breath once per 16 iterations */
+		if (!(max_loops & 15))
+			cond_resched();
+	}
+}
+
+static struct map_node *
+map_create(struct ploop_map * map, cluster_t block)
+{
+	struct ploop_device * plo = map->plo;
+	struct rb_node **p, *parent;
+	struct map_node * m;
+	cluster_t ondisk_pageno = (block + PLOOP_MAP_OFFSET) / INDEX_PER_PAGE;
+
+	m = kmem_cache_alloc(ploop_map_cache, GFP_NOFS);
+	if (unlikely(m == NULL))
+		return ERR_PTR(-ENOMEM);
+
+	m->page = alloc_page(GFP_NOFS);
+	if (unlikely(m->page == NULL)) {
+		kmem_cache_free(ploop_map_cache, m);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	if (ondisk_pageno == 0) {
+		m->mn_start = 0;
+		m->mn_end = INDEX_PER_PAGE - PLOOP_MAP_OFFSET - 1;
+	} else {
+		m->mn_start = ondisk_pageno * INDEX_PER_PAGE - PLOOP_MAP_OFFSET;
+		m->mn_end = m->mn_start + INDEX_PER_PAGE - 1;
+	}
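+
+	/* Worked example, assuming INDEX_PER_PAGE == 1024 and
+	 * PLOOP_MAP_OFFSET == 16 (the 64-byte PVD header occupying the
+	 * first 16 slots of page 0): page 0 covers clusters 0..1007,
+	 * page 1 covers 1008..2031, and so on. */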
+
+	INIT_LIST_HEAD(&m->io_queue);
+	INIT_LIST_HEAD(&m->lru);
+	m->levels = NULL;
+	m->state = 0;
+	atomic_set(&m->refcnt, 1);
+	m->parent = map;
+
+	spin_lock_irq(&plo->lock);
+
+	p = &map->rb_root.rb_node;
+	parent = NULL;
+
+	while (*p) {
+		struct map_node * entry;
+		parent = *p;
+		entry = rb_entry(parent, struct map_node, rb_link);
+
+		/* Nodes can be deleted by any of ploop threads,
+		 * but they are inserted only in ploop thread.
+		 * Before calling map_create() we checked the node
+		 * is absent, therefore:
+		 */
+		BUG_ON(ondisk_pageno ==
+		       (entry->mn_start + PLOOP_MAP_OFFSET) / INDEX_PER_PAGE);
+
+		if (block < entry->mn_start)
+			p = &(*p)->rb_left;
+		else if (block > entry->mn_end)
+			p = &(*p)->rb_right;
+		else
+			printk("map_create: Oops! block=%u; mn_range=[%u..%u]\n",
+			       block, entry->mn_start, entry->mn_end);
+	}
+
+	rb_link_node(&m->rb_link, parent, p);
+	rb_insert_color(&m->rb_link, &map->rb_root);
+
+	map->pages++;
+	atomic_inc(&map_pages_nr);
+	spin_unlock_irq(&plo->lock);
+
+	if (atomic_read(&map_pages_nr) > max_map_pages)
+		map_lru_scan();
+
+	return m;
+}
+
+/* helper for trans_map_get_index() and map_get_index() */
+static iblock_t
+cluster2iblock(struct ploop_request *preq, struct map_node *m, cluster_t block,
+	       u32 *idx)
+{
+	iblock_t iblk;
+	char *fmt;
+
+	BUG_ON (block < INDEX_PER_PAGE - PLOOP_MAP_OFFSET && m->mn_start != 0);
+	BUG_ON (block >= INDEX_PER_PAGE - PLOOP_MAP_OFFSET && m->mn_start !=
+		((block + PLOOP_MAP_OFFSET) &
+		 ~(INDEX_PER_PAGE - 1)) - PLOOP_MAP_OFFSET);
+
+	*idx = (block + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	iblk = ((map_index_t *)page_address(m->page))[*idx];
+
+	if (likely(iblk != PLOOP_ZERO_INDEX))
+		iblk >>= ploop_map_log(preq->plo);
+
+	if (m == preq->trans_map)
+		fmt = "tmgi %u %d %u [ %u %u ]\n";
+	else if (m == preq->map)
+		fmt = "mgi %u %d %u [ %u %u ]\n";
+	else
+		BUG();
+
+	__TRACE(fmt, block, *idx, iblk,
+		((map_index_t *)page_address(m->page))[0],
+		((map_index_t *)page_address(m->page))[1]);
+
+	return iblk;
+}
+
+int trans_map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result)
+{
+	struct map_node * m = preq->trans_map;
+	u32 idx;
+	map_index_t blk;
+
+	if (m == NULL)
+		return -1;
+
+	blk = cluster2iblock(preq, m, block, &idx);
+
+	if (blk) {
+		*result = blk;
+		return 0;
+	}
+	return -1;
+}
+
+
+int map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result)
+{
+	struct map_node * m = preq->map;
+	u32 idx;
+	map_index_t blk;
+
+	if (m == NULL) {
+		*result = block;
+		return 0;
+	}
+
+	blk = cluster2iblock(preq, m, block, &idx);
+
+	if (blk) {
+		*result = blk;
+		if (m->levels)
+			return m->levels[idx];
+		else
+			return MAP_LEVEL(m);
+	}
+	return -1;
+}
+
+int map_index_fault(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct ploop_delta * top_delta, * delta, * ndelta;
+	struct map_node * m = preq->map;
+	int uptodate_level;
+	sector_t pos;
+	int err;
+
+	uptodate_level = MAP_UPTODATE(m);
+
+	/* All the levels are read, mapping is absent. */
+	if (uptodate_level == 0) {
+		__TRACE("MAP E %u\n", preq->req_cluster);
+		return -1;
+	}
+
+	top_delta = ploop_top_delta(plo);
+	delta = NULL;
+
+	list_for_each_entry(ndelta, &plo->map.delta_list, list) {
+		int rc;
+
+		if (ndelta->level >= uptodate_level)
+			continue;
+
+		rc = ndelta->ops->map_index(ndelta, m->mn_start, &pos);
+		if (rc != 0) {
+			delta = ndelta;
+			break;
+		}
+
+		MAP_SET_UPTODATE(m, ndelta->level);
+		__TRACE("MAP SKIP %u %d\n", preq->req_cluster, ndelta->level);
+	}
+
+	/* Not found anywhere. */
+	if (!delta) {
+		__TRACE("MAP NF %u\n", preq->req_cluster);
+		return -1;
+	}
+
+	/* Mapping is present in lower delta, start merge */
+	spin_lock_irq(&plo->lock);
+	ploop_add_lockout(preq, 0);
+
+	if (test_and_set_bit(PLOOP_MAP_READ, &m->state)) {
+		__TRACE("r %p %u %p\n", preq, preq->req_cluster, m);
+		list_add_tail(&preq->list, &m->io_queue);
+		plo->st.merge_lockouts++;
+		spin_unlock_irq(&plo->lock);
+		/* Someone already scheduled read. */
+		return 0;
+	}
+	spin_unlock_irq(&plo->lock);
+
+	err = -EIO;
+	if (test_bit(PLOOP_MAP_ERROR, &m->state))
+		goto err_out;
+
+	err = -ENOMEM;
+	preq->sinfo.ri.tpage = alloc_page(GFP_NOFS);
+	if (preq->sinfo.ri.tpage == NULL)
+		goto err_out;
+
+	preq->sinfo.ri.level = delta->level;
+	preq->eng_state = PLOOP_E_INDEX_READ;
+
+	plo->st.map_merges++;
+	delta->ops->read_index(delta, preq, preq->sinfo.ri.tpage, pos);
+	return 0;
+
+err_out:
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	PLOOP_FAIL_REQUEST(preq, err);
+	return 0;
+}
+
+static void map_read_endio(struct ploop_request * preq, struct map_node * m)
+{
+	struct ploop_device * plo = preq->plo;
+	struct list_head * n, *pn;
+	LIST_HEAD(list);
+
+	spin_lock_irq(&plo->lock);
+
+	if (!preq->error) {
+		set_bit(PLOOP_MAP_UPTODATE, &m->state);
+	} else {
+		set_bit(PLOOP_MAP_ERROR, &m->state);
+	}
+	clear_bit(PLOOP_MAP_READ, &m->state);
+
+	__TRACE(">E %p %u %p\n", preq, preq->req_cluster, m);
+
+	list_for_each_safe(n, pn, &m->io_queue) {
+		preq = list_entry(n, struct ploop_request, list);
+		if (preq->eng_state == PLOOP_E_ENTRY) {
+			list_del(&preq->list);
+			list_add_tail(&preq->list, &list);
+		}
+	}
+	if (!list_empty(&list))
+		list_splice(&list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+}
+
+static void map_merge_endio(struct ploop_request * preq, struct map_node * m)
+{
+	struct ploop_device * plo = preq->plo;
+	struct list_head *n, *pn;
+	LIST_HEAD(list);
+	int i;
+	u32 * map;
+	u32 * merged;
+	int skip = m->mn_start == 0 ? PLOOP_MAP_OFFSET : 0;
+
+	__TRACE(">M %p %u %p\n", preq, preq->req_cluster, m);
+
+	if (unlikely(preq->error))
+		goto abort_update;
+
+	map = page_address(m->page);
+	merged = page_address(preq->sinfo.ri.tpage);
+
+	for (i = skip; i < INDEX_PER_PAGE; i++) {
+		if (map[i] != 0)
+			continue;
+		if (merged[i] == 0)
+			continue;
+		if (preq->sinfo.ri.level != MAP_LEVEL(m)) {
+			if (!m->levels) {
+				m->levels = kmalloc(INDEX_PER_PAGE, GFP_NOFS);
+				if (unlikely(m->levels == NULL)) {
+					preq->error = -ENOMEM;
+					goto abort_update;
+				}
+				memset(m->levels, MAP_LEVEL(m), INDEX_PER_PAGE);
+			}
+			m->levels[i] = preq->sinfo.ri.level;
+		}
+		map[i] = merged[i];
+	}
+
+	put_page(preq->sinfo.ri.tpage);
+	preq->sinfo.ri.tpage = NULL;
+
+	spin_lock_irq(&plo->lock);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	MAP_SET_UPTODATE(m, preq->sinfo.ri.level);
+	__TRACE("MAP U %u %d\n", preq->req_cluster, preq->sinfo.ri.level);
+	preq->eng_state = PLOOP_E_ENTRY;
+
+flush_queue:
+	list_for_each_safe(n, pn, &m->io_queue) {
+		preq = list_entry(n, struct ploop_request, list);
+		if (preq->eng_state == PLOOP_E_ENTRY) {
+			list_del(&preq->list);
+			list_add_tail(&preq->list, &list);
+		}
+	}
+	if (!list_empty(&list))
+		list_splice(&list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+	return;
+
+abort_update:
+	put_page(preq->sinfo.ri.tpage);
+	preq->sinfo.ri.tpage = NULL;
+	preq->eng_state = PLOOP_E_COMPLETE;
+
+	spin_lock_irq(&plo->lock);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	set_bit(PLOOP_MAP_ERROR, &m->state);
+	goto flush_queue;
+}
+
+
+void map_read_complete(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+
+	if (preq->eng_state == PLOOP_E_TRANS_INDEX_READ)
+		m = preq->trans_map;
+
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		map_read_endio(preq, m);
+	else
+		map_merge_endio(preq, m);
+}
+
+static int
+ploop_map_start_read(struct ploop_map * map, struct ploop_request * preq,
+		     struct map_node * m)
+{
+	struct ploop_device * plo = map->plo;
+	struct ploop_delta * top_delta, * delta, * ndelta;
+	sector_t pos;
+
+	top_delta = map_top_delta(map);
+	delta = NULL;
+
+	list_for_each_entry(ndelta, &map->delta_list, list) {
+		int rc;
+
+		rc = ndelta->ops->map_index(ndelta, m->mn_start, &pos);
+		if (rc != 0) {
+			delta = ndelta;
+			break;
+		}
+	}
+
+	if (delta) {
+		__TRACE("MAP R0 %u %d %lu %d\n", preq->req_cluster, delta->level, pos, m->index);
+		/* We know delta, we know position. We can read. */
+		MAP_SET_LEVEL(m, delta->level);
+		MAP_SET_UPTODATE(m, delta->level);
+		if (map == &plo->map)
+			preq->eng_state = PLOOP_E_INDEX_READ;
+		else
+			preq->eng_state = PLOOP_E_TRANS_INDEX_READ;
+		delta->ops->read_index(delta, preq, m->page, pos);
+		plo->st.map_reads++;
+		return 1;
+	}
+
+	/* Otherwise mapping does not exist. */
+	memset(page_address(m->page), 0, PAGE_SIZE);
+	__TRACE("MAP R1 %u %d\n", preq->req_cluster, top_delta->level);
+	MAP_SET_LEVEL(m, top_delta->level);
+	MAP_SET_UPTODATE(m, 0);
+	clear_bit(PLOOP_MAP_READ, &m->state);
+	set_bit(PLOOP_MAP_UPTODATE, &m->state);
+	return 0;
+}
+
+static int ploop_read_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = (map == &plo->map) ? preq->map : preq->trans_map;
+	int err = 0;
+
+	spin_lock_irq(&plo->lock);
+	if (!test_bit(PLOOP_MAP_UPTODATE, &m->state)) {
+		if (test_bit(PLOOP_MAP_ERROR, &m->state)) {
+			err = -EIO;
+			goto out;
+		}
+
+		if (!test_and_set_bit(PLOOP_MAP_READ, &m->state)) {
+			spin_unlock_irq(&plo->lock);
+
+			return ploop_map_start_read(map, preq, m);
+		} else {
+			__TRACE("g %p %u %p\n", preq, preq->req_cluster, m);
+			plo->st.map_lockouts++;
+			list_add_tail(&preq->list, &m->io_queue);
+			err = 1;
+		}
+	}
+
+out:
+	spin_unlock_irq(&plo->lock);
+	return err;
+}
+
+void ploop_update_map(struct ploop_map * map, int level,
+		      cluster_t block, iblock_t iblk)
+{
+	struct map_node * m;
+	u32 idx;
+	map_index_t *p;
+
+	spin_lock_irq(&map->plo->lock);
+
+	m = map_lookup(map, block);
+	if (!m || !test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		goto out;
+
+	p = (map_index_t *)page_address(m->page);
+	idx = (block  + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+
+	if (p[idx]) {
+		int lvl = m->levels ? m->levels[idx] : MAP_LEVEL(m);
+
+		if (lvl == level)
+			p[idx] = iblk << ploop_map_log(map->plo);
+		else if (lvl < level)
+			printk("Unexpected condition: uptodate map_node %p "
+			       "covering range %u..%u maps %u to %u on level "
+			       "%d, while user-space merge detected mapping "
+			       "on level %d\n", m, m->mn_start, m->mn_end,
+			       block, p[idx] >> map->plo->cluster_log, lvl,
+			       level);
+	}
+out:
+	spin_unlock_irq(&map->plo->lock);
+}
+
+void ploop_update_map_hdr(struct ploop_map * map, u8 *hdr, int hdr_size)
+{
+	struct map_node * m;
+
+	spin_lock_irq(&map->plo->lock);
+
+	m = map_lookup(map, 0);
+	if (m && test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		memcpy(page_address(m->page), hdr, hdr_size);
+
+	spin_unlock_irq(&map->plo->lock);
+}
+EXPORT_SYMBOL(ploop_update_map_hdr);
+
+int ploop_find_trans_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct map_node * m;
+	cluster_t block;
+
+	block = preq->req_cluster;
+
+	if (unlikely(block >= map->max_index))
+		return -ERANGE;
+
+	map->last_activity = jiffies;
+
+	m = preq->trans_map;
+	if (m == NULL) {
+		spin_lock_irq(&map->plo->lock);
+		m = map_lookup(map, block);
+		if (m) {
+			atomic_inc(&m->refcnt);
+			if (!list_empty(&m->lru) && atomic_read(&m->refcnt) == 1) {
+				cond_flush_lru_buffer(map);
+				if (atomic_read(&m->refcnt) == 1) {
+					atomic_inc(&m->refcnt);
+					map->lru_buffer[map->lru_buffer_ptr++] = m;
+				}
+			}
+		}
+		spin_unlock_irq(&map->plo->lock);
+
+		if (m == NULL) {
+			struct ploop_delta * mdelta = map_top_delta(map);
+			sector_t sec;
+			if (mdelta->ops->map_index(mdelta, block, &sec) == 0)
+				return 0;
+
+			m = map_create(map, block);
+			if (IS_ERR(m))
+				return PTR_ERR(m);
+		}
+
+		preq->trans_map = m;
+	}
+
+	if (test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return 0;
+
+	return ploop_read_map(map, preq);
+}
+
+/* Find mapping for this request. Mapping can be not uptodate. */
+
+int ploop_find_map(struct ploop_map * map, struct ploop_request * preq)
+{
+	struct map_node * m;
+	cluster_t block;
+
+	block = preq->req_cluster;
+
+	if (unlikely(block >= map->max_index))
+		return -ERANGE;
+
+	if (test_bit(PLOOP_MAP_IDENTICAL, &map->flags))
+		return 0;
+
+	map->last_activity = jiffies;
+
+	m = preq->map;
+	if (m == NULL) {
+		spin_lock_irq(&map->plo->lock);
+		m = map_lookup(map, block);
+		if (m) {
+			atomic_inc(&m->refcnt);
+			if (!list_empty(&m->lru) && atomic_read(&m->refcnt) == 1) {
+				cond_flush_lru_buffer(map);
+				if (atomic_read(&m->refcnt) == 1) {
+					atomic_inc(&m->refcnt);
+					map->lru_buffer[map->lru_buffer_ptr++] = m;
+				}
+			}
+		}
+		spin_unlock_irq(&map->plo->lock);
+
+		if (m == NULL) {
+			m = map_create(map, block);
+			if (IS_ERR(m))
+				return PTR_ERR(m);
+		}
+
+		preq->map = m;
+	}
+
+	if (test_bit(PLOOP_MAP_UPTODATE, &m->state))
+		return 0;
+
+	return ploop_read_map(map, preq);
+}
+
+
+/* Blank the entries which refer to another delta.
+ * _XXX_ a little more brain stress could detect the case when we do not
+ * have such entries. Also, the copy cries for an optimization.
+ */
+
+static void copy_index_for_wb(struct page * page, struct map_node * m, int level)
+{
+	int i;
+	u32 * s = page_address(m->page);
+	u32 * d = page_address(page);
+	int skip = 0;
+
+	if (m->mn_start == 0) {
+		skip = PLOOP_MAP_OFFSET;
+		memcpy(d, s, skip * sizeof(u32));
+	}
+
+	for (i = skip; i < INDEX_PER_PAGE; i++) {
+		if (level != (m->levels ? m->levels[i] : MAP_LEVEL(m)))
+			d[i] = 0;
+		else
+			d[i] = s[i];
+	}
+}
+
+
+void ploop_index_wb_proceed(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	struct page * page = preq->sinfo.wi.tpage;
+	unsigned long rw = preq->req_index_update_rw;
+	sector_t sec;
+
+	preq->eng_state = PLOOP_E_INDEX_WB;
+
+	top_delta->ops->map_index(top_delta, m->mn_start, &sec);
+
+	__TRACE("wbi-proceed %p %u %p\n", preq, preq->req_cluster, m);
+	top_delta->io.ops->write_page(&top_delta->io, preq, page, sec, rw);
+
+	put_page(page);
+}
+
+static void ploop_index_wb_proceed_or_delay(struct ploop_request * preq,
+					    int do_fsync_if_delayed)
+{
+	if (do_fsync_if_delayed) {
+		struct map_node * m = preq->map;
+		struct ploop_delta * top_delta = map_top_delta(m->parent);
+		struct ploop_io * top_io = &top_delta->io;
+
+		if (test_bit(PLOOP_IO_FSYNC_DELAYED, &top_io->io_state)) {
+			preq->eng_state = PLOOP_E_FSYNC_PENDED;
+			ploop_add_req_to_fsync_queue(preq);
+			return;
+		}
+	}
+
+	ploop_index_wb_proceed(preq);
+}
+
+/* Data write is committed. Now we need to update the index. */
+
+void ploop_index_update(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = preq->map;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	u32 idx;
+	map_index_t blk;
+	int old_level;
+	struct page * page;
+	unsigned long state = READ_ONCE(preq->state);
+	int do_fsync_if_delayed = 0;
+
+	/* No way back, we are going to initiate index write. */
+
+	idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+	blk = ((map_index_t *)page_address(m->page))[idx]  >> ploop_map_log(plo);
+	old_level = m->levels ? m->levels[idx] : MAP_LEVEL(m);
+
+	if (top_delta->level != old_level) {
+		if (m->levels == NULL) {
+			u8 * levels = kmalloc(INDEX_PER_PAGE, GFP_NOFS);
+			if (levels == NULL)
+				goto enomem;
+			memset(levels, MAP_LEVEL(m), INDEX_PER_PAGE);
+			m->levels = levels;
+		}
+	}
+
+	BUG_ON (test_bit(PLOOP_REQ_ZERO, &preq->state) && preq->iblock);
+	if (test_bit(PLOOP_REQ_ZERO, &preq->state) && !blk) {
+		printk("Either map_node is corrupted or bug in "
+		       "ploop-balloon (%u)\n", preq->req_cluster);
+		PLOOP_REQ_SET_ERROR(preq, -EIO);
+		goto corrupted;
+	}
+
+	if (blk == preq->iblock && top_delta->level == old_level)
+		goto out;
+
+	if (test_and_set_bit(PLOOP_MAP_WRITEBACK, &m->state)) {
+		preq->eng_state = PLOOP_E_INDEX_DELAY;
+		list_add_tail(&preq->list, &m->io_queue);
+		__TRACE("d %p %u %p\n", preq, preq->req_cluster, m);
+		return;
+	}
+
+	page = alloc_page(GFP_NOFS);
+	if (page == NULL) {
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		goto enomem;
+	}
+
+	copy_index_for_wb(page, m, top_delta->level);
+
+	((map_index_t*)page_address(page))[idx] = preq->iblock << ploop_map_log(plo);
+
+	get_page(page);
+	preq->sinfo.wi.tpage = page;
+
+	__TRACE("wbi %p %u %p\n", preq, preq->req_cluster, m);
+	plo->st.map_single_writes++;
+
+	preq->req_index_update_rw = (preq->req_rw & (REQ_FUA | REQ_FLUSH));
+
+	/* REQ_FLUSH, if it was set, has been transferred to
+	   req_index_update_rw, so the ->write_page() below will do
+	   the FLUSH */
+	preq->req_rw &= ~REQ_FLUSH;
+
+	/* Relocate requires consistent index update */
+	if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
+		preq->req_index_update_rw |= (REQ_FLUSH | REQ_FUA);
+		do_fsync_if_delayed = 1;
+	}
+
+	ploop_index_wb_proceed_or_delay(preq, do_fsync_if_delayed);
+	return;
+
+enomem:
+	PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+corrupted:
+	set_bit(PLOOP_S_ABORT, &plo->state);
+out:
+	preq->eng_state = PLOOP_E_COMPLETE;
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &plo->ready_queue);
+	spin_unlock_irq(&plo->lock);
+	return;
+}
+EXPORT_SYMBOL(ploop_index_update);
+
+
+int map_index(struct ploop_delta * delta, struct ploop_request * preq, unsigned long *sec)
+{
+	return delta->ops->map_index(delta, preq->map->mn_start, sec);
+}
+EXPORT_SYMBOL(map_index);
+
+struct ploop_delta * map_writable_delta(struct ploop_request * preq)
+{
+	struct map_node * m = preq->map;
+
+	if (m == NULL)
+		return ploop_top_delta(preq->plo);
+	else
+		return map_top_delta(m->parent);
+}
+EXPORT_SYMBOL(map_writable_delta);
+
+static void map_idx_swap(struct map_node *m, unsigned int idx,
+			 iblock_t *iblk, int log)
+{
+	iblock_t iblk2 = ((map_index_t*)page_address(m->page))[idx] >> log;
+	((map_index_t*)page_address(m->page))[idx] = *iblk << log;
+	*iblk = iblk2;
+}
+
+static inline void requeue_req(struct ploop_request *preq,
+			       unsigned long new_eng_state)
+{
+	preq->eng_state = new_eng_state;
+	spin_lock_irq(&preq->plo->lock);
+	list_del(&preq->list);
+	list_add_tail(&preq->list, &preq->plo->ready_queue);
+	spin_unlock_irq(&preq->plo->lock);
+}
+
+/*
+ * Index write-back for given preq happened, map_wb_complete()
+ * found preq in m->io_queue in PLOOP_E_INDEX_WB eng_state and
+ * updated in-core page of L2-table with preq->iblock. Now, it's
+ * time to either finalize preq (main case) setting eng_state to
+ * PLOOP_E_COMPLETE or process it further (RELOC_[A|S] case)
+ */
+static void map_wb_complete_post_process(struct ploop_map *map,
+					 struct ploop_request *preq, int err)
+{
+	struct ploop_device *plo       = map->plo;
+
+	if (likely(err ||
+		   (!test_bit(PLOOP_REQ_RELOC_A, &preq->state) &&
+		    !test_bit(PLOOP_REQ_RELOC_S, &preq->state)))) {
+
+		requeue_req(preq, PLOOP_E_COMPLETE);
+		return;
+	}
+
+	if (test_bit(PLOOP_REQ_RELOC_S, &preq->state)) {
+		spin_lock_irq(&plo->lock);
+		del_lockout(preq);
+		map_release(preq->map);
+		preq->map = NULL;
+		spin_unlock_irq(&plo->lock);
+
+		requeue_req(preq, PLOOP_E_RELOC_COMPLETE);
+		return;
+	}
+
+	BUG_ON (!test_bit(PLOOP_REQ_RELOC_A, &preq->state));
+	BUG_ON (!preq->aux_bio);
+
+	if (++plo->grow_relocated > plo->grow_end - plo->grow_start) {
+		requeue_req(preq, PLOOP_E_COMPLETE);
+		return;
+	}
+
+	del_lockout(preq);
+	preq->req_cluster++;
+	requeue_req(preq, PLOOP_E_ENTRY);
+}
+
+static void map_wb_complete(struct map_node * m, int err)
+{
+	struct ploop_device * plo = m->parent->plo;
+	struct ploop_delta * top_delta = map_top_delta(m->parent);
+	struct list_head * cursor, * tmp;
+	struct ploop_request * main_preq;
+	struct page * page = NULL;
+	int delayed = 0;
+	unsigned int idx;
+	unsigned long rw;
+	int do_fsync_if_delayed = 0;
+
+	/* First, complete processing of written back indices,
+	 * finally instantiate indices in mapping cache.
+	 */
+	list_for_each_safe(cursor, tmp, &m->io_queue) {
+		struct ploop_request * preq;
+
+		preq = list_entry(cursor, struct ploop_request, list);
+
+		switch (preq->eng_state) {
+		case PLOOP_E_ENTRY:
+			break;
+		case PLOOP_E_INDEX_WB:
+			idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+			if (!err) {
+				struct ploop_request *pr = preq;
+
+				if (unlikely(test_bit(PLOOP_REQ_ZERO, &preq->state))) {
+					BUG_ON (list_empty(&preq->delay_list));
+					pr = list_first_entry(&preq->delay_list,
+							      struct ploop_request,
+							      list);
+				}
+
+				if (unlikely(test_bit(PLOOP_REQ_RELOC_A, &preq->state) ||
+					     test_bit(PLOOP_REQ_ZERO, &preq->state)))
+					map_idx_swap(m, idx, &pr->iblock,
+						     ploop_map_log(plo));
+				else
+					((map_index_t*)page_address(m->page))[idx] =
+						pr->iblock << ploop_map_log(plo);
+
+				if (m->levels) {
+					m->levels[idx] = top_delta->level;
+				} else {
+					BUG_ON(MAP_LEVEL(m) != top_delta->level);
+				}
+			} else {
+				PLOOP_REQ_SET_ERROR(preq, err);
+			}
+			put_page(preq->sinfo.wi.tpage);
+			preq->sinfo.wi.tpage = NULL;
+			map_wb_complete_post_process(m->parent, preq, err);
+			break;
+		case PLOOP_E_INDEX_DELAY:
+			if (err) {
+				PLOOP_REQ_SET_ERROR(preq, err);
+				preq->eng_state = PLOOP_E_COMPLETE;
+				spin_lock_irq(&plo->lock);
+				list_del(cursor);
+				list_add_tail(cursor, &preq->plo->ready_queue);
+				spin_unlock_irq(&plo->lock);
+			} else {
+				delayed++;
+			}
+			break;
+		}
+	}
+
+	if (!delayed) {
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		return;
+	}
+
+	page = alloc_page(GFP_NOFS);
+	if (page)
+		copy_index_for_wb(page, m, top_delta->level);
+
+	main_preq = NULL;
+	rw = 0;
+
+	list_for_each_safe(cursor, tmp, &m->io_queue) {
+		struct ploop_request * preq;
+		unsigned long state;
+
+		preq = list_entry(cursor, struct ploop_request, list);
+
+		switch (preq->eng_state) {
+		case PLOOP_E_INDEX_DELAY:
+			if (page == NULL) {
+				PLOOP_REQ_SET_ERROR(preq, -ENOMEM);
+				preq->eng_state = PLOOP_E_COMPLETE;
+				spin_lock_irq(&plo->lock);
+				list_del(cursor);
+				list_add_tail(cursor, &plo->ready_queue);
+				spin_unlock_irq(&plo->lock);
+				break;
+			}
+
+			rw |= (preq->req_rw & (REQ_FLUSH | REQ_FUA));
+
+			/* REQ_FLUSH, if it was set, has been transferred to
+			   'rw', so the ->write_page() below will do the FLUSH */
+			preq->req_rw &= ~REQ_FLUSH;
+
+			state = READ_ONCE(preq->state);
+			/* Relocate requires consistent index update */
+			if (state & (PLOOP_REQ_RELOC_A_FL|PLOOP_REQ_RELOC_S_FL)) {
+				rw |= (REQ_FLUSH | REQ_FUA);
+				do_fsync_if_delayed = 1;
+			}
+
+			preq->eng_state = PLOOP_E_INDEX_WB;
+			get_page(page);
+			preq->sinfo.wi.tpage = page;
+			idx = (preq->req_cluster + PLOOP_MAP_OFFSET) & (INDEX_PER_PAGE - 1);
+
+			((map_index_t*)page_address(page))[idx] = preq->iblock << ploop_map_log(plo);
+
+			if (!main_preq) {
+				main_preq = preq;
+				list_del_init(&main_preq->list);
+			}
+			plo->st.map_multi_updates++;
+		}
+	}
+
+	if (!page) {
+		/* Writes are discarded */
+		clear_bit(PLOOP_MAP_WRITEBACK, &m->state);
+		return;
+	}
+
+	__TRACE("wbi2 %p %u %p\n", main_preq, main_preq->req_cluster, m);
+	plo->st.map_multi_writes++;
+
+	main_preq->req_index_update_rw = rw;
+	ploop_index_wb_proceed_or_delay(main_preq, do_fsync_if_delayed);
+}
+
+void
+ploop_index_wb_complete(struct ploop_request * preq)
+{
+	struct ploop_device * plo = preq->plo;
+	struct map_node * m = preq->map;
+
+	spin_lock_irq(&plo->lock);
+	list_add_tail(&preq->list, &m->io_queue);
+	spin_unlock_irq(&plo->lock);
+
+	map_wb_complete(m, preq->error);
+}
+
+void ploop_map_start(struct ploop_map * map, u64 bd_size)
+{
+	struct ploop_device * plo = map->plo;
+
+	map->max_index = (bd_size + (1 << plo->cluster_log) - 1 ) >> plo->cluster_log;
+	map->flags = 0;
+}
+
+
+static void map_wait(struct ploop_map * map)
+{
+	DEFINE_WAIT(_wait);
+	prepare_to_wait(&map->destroy_waitq, &_wait, TASK_UNINTERRUPTIBLE);
+
+	spin_unlock(&map_lru_lock);
+	spin_unlock_irq(&map->plo->lock);
+	io_schedule();
+	spin_lock_irq(&map->plo->lock);
+	spin_lock(&map_lru_lock);
+
+	finish_wait(&map->destroy_waitq, &_wait);
+}
+
+void ploop_map_destroy(struct ploop_map * map)
+{
+	int i;
+	struct rb_node * node;
+
+	spin_lock_irq(&map->plo->lock);
+	set_bit(PLOOP_MAP_DEAD, &map->flags);
+
+	for (i = 0; i < map->lru_buffer_ptr; i++)
+		atomic_dec(&map->lru_buffer[i]->refcnt);
+
+	map->lru_buffer_ptr = 0;
+
+	spin_lock(&map_lru_lock);
+	while ((node = map->rb_root.rb_node) != NULL) {
+		struct map_node * m = rb_entry(node, struct map_node, rb_link);
+		/* refcnt can be non-zero if and only if this node is grabbed
+		 * by map_lru_scan() and in flight between releasing
+		 * map_lru_lock and taking plo->lock. We can skip this entry:
+		 * it will be destroyed by map_lru_scan(), because we
+		 * have set PLOOP_MAP_DEAD.
+		 */
+		if (atomic_read(&m->refcnt) == 0)
+			map_node_destroy(m);
+		else
+			map_wait(map);
+	}
+	spin_unlock(&map_lru_lock);
+	spin_unlock_irq(&map->plo->lock);
+	BUG_ON(map->pages);
+}
+
+void ploop_map_remove_delta(struct ploop_map * map, int level)
+{
+	/* For now. */
+	ploop_map_destroy(map);
+}
+
+
+int __init ploop_map_init(void)
+{
+	ploop_map_cache = kmem_cache_create("ploop_map",
+						sizeof(struct map_node), 0,
+						SLAB_MEM_SPREAD, NULL
+						);
+	if (!ploop_map_cache)
+		return -ENOMEM;
+	return 0;
+}
+
+void ploop_map_exit(void)
+{
+	if (ploop_map_cache)
+		kmem_cache_destroy(ploop_map_cache);
+}
--- /dev/null
+++ b/drivers/block/ploop/ploop1_image.h
@@ -0,0 +1,429 @@
+/*
+ *  drivers/block/ploop/ploop1_image.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __PLOOP1_IMAGE_H__
+#define __PLOOP1_IMAGE_H__ 1
+
+/* Definition of PVD (Parallels Virtual Disk) format
+ *
+ * 1. All the data are in little-endian format.
+ * 2. All the data except for the first cluster are aligned and padded
+ *    to the size of a cluster. The first cluster is an exception - it
+ *    combines the PVD header (first 64 bytes of the cluster) with the
+ *    L2 index table (an array of indices of blocks).
+ * 3. Image size must be a multiple of the cluster size. If it is not,
+ *    we assume it is the result of an image extension which failed in
+ *    the middle of a transaction; therefore new allocations start at
+ *    the size rounded down to the cluster size.
+ * 4. Updates of indices must be done only after the data clusters
+ *    are committed to reliable storage. If we fail to update an index,
+ *    we can get an unused and, maybe, uninitialized or partially
+ *    initialized data cluster. It is lost, forgotten and ignored
+ *    until repair or image rebuild.
+ */
+
+/*
+ * copy/paste of IMAGE_PARAMETERS from DiskImageComp.h
+ */
+#pragma pack(push,1)
+struct ploop_pvd_header
+{
+	__u8  m_Sig[16];          /* Signature */
+	__u32 m_Type;             /* Disk type */
+	__u32 m_Heads;            /* heads count */
+	__u32 m_Cylinders;        /* tracks count */
+	__u32 m_Sectors;          /* Sectors per track count */
+	__u32 m_Size;             /* Size of disk in tracks */
+	union {                   /* Size of disk in 512-byte sectors */
+		struct {
+			__u32 m_SizeInSectors_v1;
+			__u32 Unused;
+		};
+		__u64 m_SizeInSectors_v2;
+	};
+	__u32 m_DiskInUse;        /* Disk in use */
+	__u32 m_FirstBlockOffset; /* First data block offset (in sectors) */
+	__u32 m_Flags;            /* Misc flags */
+	__u8  m_Reserved[8];      /* Reserved */
+};
+#pragma pack(pop)
+
+/* Compressed disk (version 1) */
+#define PRL_IMAGE_COMPRESSED            2
+
+/* Compressed disk v1 signature */
+#define SIGNATURE_STRUCTURED_DISK_V1 "WithoutFreeSpace"
+
+/* Compressed disk v2 signature */
+#define SIGNATURE_STRUCTURED_DISK_V2 "WithouFreSpacExt"
+
+/* Sign that the disk is in "using" state */
+#define SIGNATURE_DISK_IN_USE		0x746F6E59
+
+/* Disk was closed by software which conformed to specification 2.0 */
+#define SIGNATURE_DISK_CLOSED_V20	0x0
+
+/* Disk was closed by software which conformed to specification 2.1 */
+#define SIGNATURE_DISK_CLOSED_V21	0x312e3276
+
+/**
+ * Compressed disk image flags
+ */
+#define	CIF_NoFlags		0x00000000 /* No flags */
+#define	CIF_Empty		0x00000001 /* No data has been written */
+#define	CIF_Invalid		0xFFFFFFFF /* Invalid flag */
+
+
+#define PLOOP1_SECTOR_LOG	9
+#define PLOOP1_DEF_CLUSTER_LOG	9 /* 256K cluster-block */
+#define CLUSTER (1UL << (PLOOP1_DEF_CLUSTER_LOG + PLOOP1_SECTOR_LOG))
+
+/* Helpers to generate PVD-header based on requested bdsize */
+
+#define DEFAULT_HEADS_COUNT   16
+#define DEFAULT_SECTORS_COUNT 63
+#define SECTOR_SIZE (1 << 9)
+
+struct CHSData
+{
+	__u32 Sectors;
+	__u32 Heads;
+	__u32 Cylinders;
+};
+
+#ifdef __KERNEL__
+# define ploop_do_div(n, base) do_div(n, base)
+#else
+# define ploop_do_div(n, base) ({		\
+	__u32 __rem = n % base;			\
+	n /= base;				\
+	__rem;					\
+ })
+#endif
+/*
+ * Try to compute the disk's sectors-per-track value
+ */
+static inline __u32
+CalcSectors(const __u64 uiSize)
+{
+	__u64 size = uiSize;
+
+	/* Try to determine sector count */
+	if (!ploop_do_div(size, DEFAULT_SECTORS_COUNT))
+		return DEFAULT_SECTORS_COUNT;
+
+	if (!(uiSize % 32))
+		return 32;
+
+	if (!(uiSize % 16))
+		return 16;
+
+	if (!(uiSize % 8))
+		return 8;
+
+	return ~0;
+}
+
+/*
+ * Try to compute the disk's heads count
+ */
+static inline __u32
+CalcHeads(const __u64 uiSize)
+{
+	__u64 size = uiSize;
+
+	/* Try to determine heads count */
+	if (!ploop_do_div(size, DEFAULT_HEADS_COUNT))
+		return DEFAULT_HEADS_COUNT;
+
+	if (!(uiSize % 8))
+		return 8;
+
+	if (!(uiSize % 4))
+		return 4;
+
+	if (!(uiSize % 2))
+		return 2;
+
+	return ~0;
+}
+
+/*
+ * Convert size to CHS for disks from 504 Mb to 8 Gb
+ */
+static inline void
+ConvertToCHSLow(__u64 From, struct CHSData *chs)
+{
+	chs->Sectors = DEFAULT_SECTORS_COUNT;
+	chs->Heads = DEFAULT_HEADS_COUNT;
+	ploop_do_div(From, DEFAULT_SECTORS_COUNT * DEFAULT_HEADS_COUNT);
+	chs->Cylinders = From;
+}
+
+/*
+ * Convert size to pure LBA config
+ */
+static inline void
+ConvertToPureLBA(__u64 From, struct CHSData *chs)
+{
+	chs->Sectors = 1;
+	chs->Heads = 1;
+	chs->Cylinders = From;
+}
+
+static inline void
+ConvertToCHS(__u64 From, struct CHSData *chs)
+{
+	__u64 Size;
+
+	/*
+	 * According to the ATA2 specs:
+	 *  - If the device is above 1,032,192 sectors, then the value should be 63.
+	 *    This value does not exceed 63 (3Fh). But note that if the device size
+	 *    is above 16,777,216, the HDD reports the proper 'magic' number in CHS
+	 *    values, so the situation in the middle must be handled separately.
+	 */
+	if ((From > 1032192) && (From < 16777216))
+	{
+		ConvertToCHSLow(From, chs);
+		return;
+	}
+
+	Size = From;
+
+	/* Store size */
+	chs->Sectors = CalcSectors(Size);
+
+	if (chs->Sectors == (__u32)~0)
+		goto PureLBA;
+
+	ploop_do_div(Size, chs->Sectors);
+
+	chs->Heads = CalcHeads(Size);
+
+	if (chs->Heads == (__u32)~0)
+		goto PureLBA;
+
+	ploop_do_div(Size, chs->Heads);
+
+	chs->Cylinders = Size;
+
+	return;
+
+PureLBA:
+	ConvertToPureLBA(From, chs);
+}
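+
+/* Worked example (illustrative): From = 2,097,152 sectors (1 GiB) falls in
+ * the (1,032,192 .. 16,777,216) window, so ConvertToCHSLow() gives S = 63,
+ * H = 16, C = 2097152 / (63 * 16) = 2080. */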
+
+static inline __u32
+GetHeaderSize(__u32 m_Size)
+{
+	__u32 Size = sizeof(struct ploop_pvd_header);
+
+	/* Add BAT */
+	Size += m_Size * sizeof(__u32);
+	/* Align to size of sector */
+	Size = (Size + SECTOR_SIZE - 1) & ~(SECTOR_SIZE - 1);
+
+	return Size;
+}
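+
+/* E.g. for m_Size == 2048 blocks: a 64-byte header plus 2048 * 4 bytes of
+ * BAT is 8256 bytes, rounded up to the 512-byte sector boundary: 8704. */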
+
+static inline char *
+ploop1_signature(int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return SIGNATURE_STRUCTURED_DISK_V1;
+	case PLOOP_FMT_V2:
+		return SIGNATURE_STRUCTURED_DISK_V2;
+#ifdef __KERNEL__
+	default:
+		BUG();
+#endif
+	}
+
+	return NULL;
+}
+
+static inline int
+ploop1_version(struct ploop_pvd_header *vh)
+{
+	if (!memcmp(vh->m_Sig, SIGNATURE_STRUCTURED_DISK_V1, sizeof(vh->m_Sig)))
+		return PLOOP_FMT_V1;
+
+	if (!memcmp(vh->m_Sig, SIGNATURE_STRUCTURED_DISK_V2, sizeof(vh->m_Sig)))
+		return PLOOP_FMT_V2;
+
+	return -1;
+}
+
+static inline __u64
+ploop1_max_size(__u32 blocksize, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return (__u32)-1;
+	case PLOOP_FMT_V2:
+		return 0xffffffffUL * blocksize;
+	}
+
+	return 0;
+}
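+
+/* Both limits are in sectors (assuming the convention used for bdsize
+ * elsewhere in this header): v1 tops out at 2^32 sectors (2 TiB), while
+ * v2 allows 2^32 cluster-blocks; with blocksize == 2048 sectors that is
+ * 2^43 sectors, i.e. 4 PiB. */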
+
+#ifdef __KERNEL__
+static inline u64
+get_SizeInSectors_from_le(struct ploop_pvd_header *vh, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		return le32_to_cpu(vh->m_SizeInSectors_v1);
+	case PLOOP_FMT_V2:
+		return le64_to_cpu(vh->m_SizeInSectors_v2);
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static inline void
+put_SizeInSectors(u64 SizeInSectors, struct ploop_pvd_header *vh,
+		  int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		vh->m_SizeInSectors_v1 = SizeInSectors;
+		break;
+	case PLOOP_FMT_V2:
+		vh->m_SizeInSectors_v2 = SizeInSectors;
+		break;
+	default:
+		BUG();
+	}
+}
+
+static inline void
+cpu_to_le_SizeInSectors(struct ploop_pvd_header *vh, int version)
+{
+	switch (version) {
+	case PLOOP_FMT_V1:
+		vh->m_SizeInSectors_v1 = cpu_to_le32(vh->m_SizeInSectors_v1);
+		break;
+	case PLOOP_FMT_V2:
+		vh->m_SizeInSectors_v2 = cpu_to_le64(vh->m_SizeInSectors_v2);
+		break;
+	default:
+		BUG();
+	}
+}
+#endif
+
+/*
+ * Returns: "size to fill" (in bytes)
+ *
+ * NB: m_Flags and m_DiskInUse are being kept as is; our caller
+ * should take care of them.
+ *
+ * NB: Both bdsize and blocksize are measured in sectors.
+ */
+static inline __u32
+generate_pvd_header(struct ploop_pvd_header *vh, __u64 bdsize, __u32 blocksize,
+		    int version)
+{
+	struct CHSData chs;
+	__u32 SizeToFill;
+	__u32 uiAlignmentSize;
+	__u64 SizeInSectors;
+
+	memcpy(vh->m_Sig, ploop1_signature(version) , sizeof(vh->m_Sig));
+	vh->m_Type = PRL_IMAGE_COMPRESSED;
+
+	/* Round up to block size */
+	SizeInSectors = bdsize + blocksize - 1;
+	ploop_do_div(SizeInSectors, blocksize);
+	SizeInSectors *= blocksize;
+	put_SizeInSectors(SizeInSectors, vh, version);
+
+	ConvertToCHS(SizeInSectors, &chs);
+
+	vh->m_Sectors = blocksize;
+	vh->m_Heads = chs.Heads;
+	vh->m_Cylinders = chs.Cylinders;
+
+	ploop_do_div(SizeInSectors, blocksize);
+	vh->m_Size = SizeInSectors;
+
+	uiAlignmentSize = blocksize << 9;
+	SizeToFill = GetHeaderSize(vh->m_Size);
+	/* Align to block size */
+	if (SizeToFill % uiAlignmentSize)
+		SizeToFill += uiAlignmentSize - (SizeToFill % uiAlignmentSize);
+
+	vh->m_FirstBlockOffset = SizeToFill >> 9;
+
+	return SizeToFill;
+}
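+
+/* Worked example (illustrative): bdsize = 4,194,304 sectors (2 GiB) with
+ * blocksize = 2048 is already block-aligned, so m_Size = 2048 blocks and
+ * GetHeaderSize() yields 8704 bytes, padded up to one full 1 MiB cluster:
+ * SizeToFill = 1,048,576 bytes and m_FirstBlockOffset = 2048 sectors. */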
+
+static inline bool pvd_header_is_disk_in_use(struct ploop_pvd_header *vh)
+{
+	return vh->m_DiskInUse == cpu_to_le32(SIGNATURE_DISK_IN_USE);
+}
+
+static inline void pvd_header_set_disk_in_use(struct ploop_pvd_header *vh)
+{
+	vh->m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_IN_USE);
+}
+
+static inline void pvd_header_set_disk_closed(struct ploop_pvd_header *vh)
+{
+	vh->m_DiskInUse = cpu_to_le32(SIGNATURE_DISK_CLOSED_V20);
+}
+
+/* Translation of sector number to offset in image */
+
+#if 0
+
+/* These functions are not really used */
+
+/* Calculate virtual cluster number from virtual sector number */
+
+static inline __u32
+ploop1_cluster(struct ploop_img_header * info, __u64 sector)
+{
+	return sector >> info->cluster_log;
+}
+
+/* Get the number of clusters covered by one L2 table: 32K by default,
+ * which can map 4G of data.
+ */
+static inline __u32
+ploop1_clusters_per_l2(struct ploop_img_header * info)
+{
+	return 1 << (info->cluster_log + info->sector_log - 2);
+}
+
+/* Calculate index in L1 table mapping a cluster. */
+
+static inline __u32
+ploop1_l1_index(struct ploop_img_header * info, __u32 cluster)
+{
+	return cluster >> (info->cluster_log + info->sector_log - 2);
+}
+
+/* Calculate index in L2 table mapping a cluster. */
+
+static inline __u32
+ploop1_l2_index(struct ploop_img_header * info, __u32 cluster)
+{
+	return cluster & (ploop1_clusters_per_l2(info) - 1);
+}
+
+/* That's all, simple and stupid */
+
+#endif
+
+#endif /* __PLOOP1_IMAGE_H__ */
--- /dev/null
+++ b/drivers/block/ploop/ploop_events.c
@@ -0,0 +1,16 @@
+/*
+ *  drivers/block/ploop/ploop_events.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+#define CREATE_TRACE_POINTS
+#include "ploop_events.h"
+
+EXPORT_TRACEPOINT_SYMBOL(submit);
+EXPORT_TRACEPOINT_SYMBOL(submit_alloc);
+EXPORT_TRACEPOINT_SYMBOL(cached_submit);
--- /dev/null
+++ b/drivers/block/ploop/ploop_events.h
@@ -0,0 +1,100 @@
+/*
+ *  drivers/block/ploop/ploop_events.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#if !defined(_TRACE_PLOOP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PLOOP_H
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM ploop
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#include <linux/ploop/ploop.h>
+#include "events.h"
+
+DEFINE_EVENT(preq_template, submit,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, submit_alloc,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, cached_submit,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, complete_request,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, req_state_process,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, bio_queue,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, add_lockout,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+DEFINE_EVENT(preq_template, del_lockout,
+	TP_PROTO(struct ploop_request *preq),
+	TP_ARGS(preq));
+
+TRACE_EVENT(preq_lockout,
+	TP_PROTO(struct ploop_request *preq,
+		struct ploop_request *ppreq),
+
+	TP_ARGS(preq, ppreq),
+
+	TP_STRUCT__entry(
+		__field(void *,		ppreq)
+		__field(void *,		preq)
+		__field(cluster_t,	clu)
+		__field(iblock_t,	iblk)
+		__field(unsigned int,	size)
+		__field(unsigned long,	eng_state)
+		__field(unsigned long,	state)
+		__field(unsigned int,	rw)
+	),
+
+	TP_fast_assign(
+		__entry->preq		= preq;
+		__entry->ppreq		= ppreq;
+		__entry->clu		= preq->req_cluster;
+		__entry->iblk		= preq->iblock;
+		__entry->size		= preq->req_size;
+		__entry->eng_state	= preq->eng_state;
+		__entry->state		= preq->state;
+		__entry->rw		= preq->req_rw;
+	),
+
+	TP_printk("ppreq=%p "PREQ_FORMAT, __entry->ppreq, PREQ_ARGS)
+);
+
+DEFINE_EVENT(bio_template, make_request,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio));
+
+DEFINE_EVENT(bio_template, bio_fast_map,
+	TP_PROTO(struct bio *bio),
+	TP_ARGS(bio));
+
+#endif /* _TRACE_PLOOP_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE ploop_events
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
--- /dev/null
+++ b/drivers/block/ploop/push_backup.c
@@ -0,0 +1,1097 @@
+/*
+ *  drivers/block/ploop/push_backup.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+
+#include <trace/events/block.h>
+
+#include <linux/ploop/ploop.h>
+#include "push_backup.h"
+
+#define NR_PAGES(bits) (((bits) + PAGE_SIZE*8 - 1) / (PAGE_SIZE*8))
+#define BITS_PER_PAGE  (1UL << (PAGE_SHIFT + 3))
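+/* Example: with 4K pages (PAGE_SHIFT == 12), BITS_PER_PAGE is 32768, so
+ * one bitmap page tracks 32768 blocks and NR_PAGES(n) rounds n up to a
+ * whole number of such pages.
+ */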
+
+struct pb_set {
+	struct rb_root	   tree;
+	struct list_head   list;
+	struct timer_list  timer;
+	char		  *name;
+	struct ploop_pushbackup_desc *pbd; /* points to parent pbd */
+};
+
+enum {
+	PLOOP_PB_ALIVE,
+	PLOOP_PB_STOPPING,
+	PLOOP_PB_DEAD,
+};
+
+struct ploop_pushbackup_desc {
+	struct ploop_device *plo;
+	struct page **cbt_map; /* a 'snapshot' copy of CBT mask */
+	blkcnt_t cbt_block_max;
+	blkcnt_t cbt_block_bits;
+	__u8 	 cbt_uuid[16];
+
+	struct page **ppb_map; /* Ploop Push Backup mask */
+	struct page **reported_map; /* what userspace reported as backed up */
+	cluster_t ppb_block_max; /* first invalid index in ppb_map */
+
+	spinlock_t	      ppb_lock;
+	struct completion     ppb_comp;
+	bool                  ppb_waiting;
+
+	struct pb_set	      pending_set;
+	struct pb_set	      reported_set;
+
+	struct bio_list	      bio_pending_list;
+
+	struct task_struct   *health_monitor_thread;
+	wait_queue_head_t     ppb_waitq;
+	int		      ppb_state; /* see enum above */
+};
+
+int ploop_pb_check_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid)
+{
+	if (memcmp(pbd->cbt_uuid, uuid, sizeof(pbd->cbt_uuid)))
+		return -1;
+	return 0;
+}
+
+int ploop_pb_get_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid)
+{
+	if (!pbd)
+		return -1;
+
+	memcpy(uuid, pbd->cbt_uuid, sizeof(pbd->cbt_uuid));
+	return 0;
+}
+
+static struct page **ploop_pb_map_alloc(unsigned long block_max)
+{
+	unsigned long npages = NR_PAGES(block_max);
+	struct page **map = vzalloc(npages * sizeof(void *));
+	unsigned long i;
+
+	if (!map)
+		return NULL;
+
+	for (i = 0; i < npages; i++) {
+		map[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!map[i]) {
+			while (i--)
+				__free_page(map[i]);
+			vfree(map);
+			return NULL;
+		}
+	}
+
+	return map;
+}
+
+static void ploop_pb_map_free(struct page **map, unsigned long block_max)
+{
+	if (map) {
+		unsigned long i;
+		for (i = 0; i < NR_PAGES(block_max); i++)
+			if (map[i])
+				__free_page(map[i]);
+
+		vfree(map);
+	}
+}
+
+int ploop_pb_cbt_map_release(struct ploop_pushbackup_desc *pbd, bool do_merge)
+{
+	int ret = 0;
+
+	if (pbd->cbt_map == NULL)
+		return 0;
+
+	if (do_merge) {
+		ret = blk_cbt_map_merge(pbd->plo->queue,
+					pbd->cbt_uuid,
+					pbd->cbt_map,
+					pbd->cbt_block_max,
+					pbd->cbt_block_bits);
+		if (ret)
+			printk("ploop(%d): blk_cbt_map_merge() failed with "
+			       "%d\n", pbd->plo->index, ret);
+	}
+
+	ploop_pb_map_free(pbd->cbt_map, pbd->cbt_block_max);
+	pbd->cbt_map = NULL;
+	return ret;
+}
+
+static void ploop_pb_timeout_func(unsigned long data);
+
+static void ploop_pbs_init(struct pb_set *pbs,
+		struct ploop_pushbackup_desc *pbd, char *name)
+{
+	pbs->pbd = pbd;
+	pbs->name = name;
+	pbs->tree = RB_ROOT;
+	INIT_LIST_HEAD(&pbs->list);
+
+	init_timer(&pbs->timer);
+	pbs->timer.function = ploop_pb_timeout_func;
+	pbs->timer.data = (unsigned long)pbs;
+}
+
+static void ploop_pbs_fini(struct pb_set *pbs)
+{
+	del_timer_sync(&pbs->timer);
+}
+
+struct ploop_pushbackup_desc *ploop_pb_alloc(struct ploop_device *plo)
+{
+	struct ploop_pushbackup_desc *pbd;
+
+	pbd = kzalloc(sizeof(struct ploop_pushbackup_desc), GFP_KERNEL);
+	if (pbd == NULL)
+		return NULL;
+
+	pbd->ppb_block_max = (plo->bd_size + (1 << plo->cluster_log) - 1)
+		>> plo->cluster_log;
+
+	pbd->ppb_map = ploop_pb_map_alloc(pbd->ppb_block_max);
+	if (!pbd->ppb_map) {
+		kfree(pbd);
+		return NULL;
+	}
+
+	pbd->reported_map = ploop_pb_map_alloc(pbd->ppb_block_max);
+	if (!pbd->reported_map) {
+		ploop_pb_map_free(pbd->ppb_map, pbd->ppb_block_max);
+		kfree(pbd);
+		return NULL;
+	}
+
+	spin_lock_init(&pbd->ppb_lock);
+	init_completion(&pbd->ppb_comp);
+	ploop_pbs_init(&pbd->pending_set, pbd, "pending");
+	ploop_pbs_init(&pbd->reported_set, pbd, "reported");
+	init_waitqueue_head(&pbd->ppb_waitq);
+	bio_list_init(&pbd->bio_pending_list);
+	pbd->plo = plo;
+
+	return pbd;
+}
+
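+/* Scan the paged bitmap for the first set bit at or after *blk_p.
+ * Missing (NULL) pages are treated as all-zero.  On success the bit
+ * number is stored back via *blk_p and 0 is returned; -1 means no set
+ * bit exists below map_max. */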
+static int find_first_blk_in_map(struct page **map, u64 map_max, u64 *blk_p)
+{
+	u64 blk = *blk_p;
+	unsigned long idx = blk >> (PAGE_SHIFT + 3);
+
+	while (blk < map_max) {
+		unsigned long off = blk & (BITS_PER_PAGE -1);
+		unsigned long next_bit;
+		struct page *page = map[idx];
+
+		if (!page)
+			goto next;
+
+		next_bit = find_next_bit(page_address(page), BITS_PER_PAGE, off);
+		if (next_bit != BITS_PER_PAGE) {
+			*blk_p = ((u64)idx << (PAGE_SHIFT + 3)) + next_bit;
+			return 0;
+		}
+
+	next:
+		idx++;
+		blk = (u64)idx << (PAGE_SHIFT + 3);
+	}
+
+	return -1;
+}
+
+enum {
+	SET_BIT,
+	CLEAR_BIT,
+	CHECK_BIT,
+};
+
+static bool do_bit_in_map(struct page **map, u64 map_max, u64 blk, int action)
+{
+	unsigned long idx = blk >> (PAGE_SHIFT + 3);
+	unsigned long off = blk & (BITS_PER_PAGE -1);
+	struct page *page = map[idx];
+
+	BUG_ON(blk >= map_max);
+
+	switch (action) {
+	case SET_BIT:
+		__set_bit(off, page_address(page));
+		break;
+	case CLEAR_BIT:
+		__clear_bit(off, page_address(page));
+		break;
+	case CHECK_BIT:
+		return test_bit(off, page_address(page));
+	default:
+		BUG();
+	}
+
+	return false;
+}
+
+static void set_bit_in_map(struct page **map, u64 map_max, u64 blk)
+{
+	do_bit_in_map(map, map_max, blk, SET_BIT);
+}
+
+static void clear_bit_in_map(struct page **map, u64 map_max, u64 blk)
+{
+	do_bit_in_map(map, map_max, blk, CLEAR_BIT);
+}
+
+static bool check_bit_in_map(struct page **map, u64 map_max, u64 blk)
+{
+	return do_bit_in_map(map, map_max, blk, CHECK_BIT);
+}
+
+static void set_bits_in_map(struct page **map, u64 map_max, u64 blk, u64 cnt)
+{
+	if (blk + cnt > map_max) {
+		printk("set_bits_in_map: extent [%llu, %llu) is out of range"
+		       " [0, %llu)\n", blk, blk + cnt, map_max);
+		return;
+	}
+
+	while (cnt) {
+		unsigned long idx = blk >> (PAGE_SHIFT + 3);
+		unsigned long off = blk & (BITS_PER_PAGE -1);
+		unsigned long len;
+		void *addr = page_address(map[idx]);
+
+		len = min_t(unsigned long, BITS_PER_PAGE - off, cnt);
+		cnt -= len;
+		blk += len;
+
+		while (len) {
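+			/* Fast path: if the current bit offset is 32-bit
+			 * aligned and at least 32 bits remain, fill a whole
+			 * 32-bit word at once; otherwise set bits one by
+			 * one. */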
+			if ((off & 31) == 0 && len >= 32) {
+				*(u32 *)(addr + (off >> 3)) = -1;
+				off += 32;
+				len -= 32;
+			} else {
+				__set_bit(off, addr);
+				off += 1;
+				len -= 1;
+			}
+		}
+	}
+}
+
+/* intentionally lockless */
+void ploop_pb_clear_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu)
+{
+	BUG_ON(!pbd);
+	clear_bit_in_map(pbd->ppb_map, pbd->ppb_block_max, clu);
+}
+
+/* intentionally lockless */
+bool ploop_pb_check_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu)
+{
+	if (!pbd)
+		return false;
+
+	return check_bit_in_map(pbd->ppb_map, pbd->ppb_block_max, clu);
+}
+
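+/* Convert the CBT 'snapshot' bitmap (cbt_map, one bit per block of
+ * 2^cbt_block_bits bytes) into the push-backup bitmap (ppb_map, one bit
+ * per cluster of 2^(cluster_log + 9) bytes): any dirty CBT block marks
+ * the cluster covering it.  For example, assuming 64K CBT blocks and 1M
+ * clusters, a dirty block at index 37 maps to cluster
+ * (37 << 16) >> 20 == 2. */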
+static int convert_map_to_map(struct ploop_pushbackup_desc *pbd)
+{
+	struct page **from_map = pbd->cbt_map;
+	blkcnt_t from_max = pbd->cbt_block_max;
+	blkcnt_t from_bits = pbd->cbt_block_bits;
+
+	struct page **to_map = pbd->ppb_map;
+	cluster_t to_max = pbd->ppb_block_max;
+	int to_bits = pbd->plo->cluster_log + 9;
+
+	u64 from_blk, to_blk;
+
+	if ((u64)from_max << from_bits != (u64)to_max << to_bits) {
+		printk("mismatch in map convert: %lu %lu ---> %u %d\n",
+		       from_max, from_bits, to_max, to_bits);
+		return -EINVAL;
+	}
+
+	for (from_blk = 0; from_blk < from_max;
+	     from_blk = (++to_blk << to_bits) >> from_bits) {
+
+		if (find_first_blk_in_map(from_map, from_max, &from_blk))
+			break;
+
+		to_blk = (from_blk << from_bits) >> to_bits;
+		set_bit_in_map(to_map, to_max, to_blk);
+	}
+
+	return 0;
+}
+
+static int ploop_pb_health_monitor(void * data)
+{
+	struct ploop_pushbackup_desc *pbd = data;
+	struct ploop_device	     *plo = pbd->plo;
+
+	spin_lock_irq(&pbd->ppb_lock);
+	while (!kthread_should_stop() || pbd->ppb_state == PLOOP_PB_STOPPING) {
+
+		DEFINE_WAIT(_wait);
+		for (;;) {
+			prepare_to_wait(&pbd->ppb_waitq, &_wait, TASK_INTERRUPTIBLE);
+			if (pbd->ppb_state == PLOOP_PB_STOPPING ||
+			    kthread_should_stop())
+				break;
+
+			spin_unlock_irq(&pbd->ppb_lock);
+			schedule();
+			spin_lock_irq(&pbd->ppb_lock);
+		}
+		finish_wait(&pbd->ppb_waitq, &_wait);
+
+		if (pbd->ppb_state == PLOOP_PB_STOPPING) {
+			spin_unlock_irq(&pbd->ppb_lock);
+			mutex_lock(&plo->ctl_mutex);
+			ploop_pb_stop(pbd, true);
+			mutex_unlock(&plo->ctl_mutex);
+			spin_lock_irq(&pbd->ppb_lock);
+		}
+	}
+	spin_unlock_irq(&pbd->ppb_lock);
+	return 0;
+}
+
+int ploop_pb_init(struct ploop_pushbackup_desc *pbd, __u8 *uuid, bool full)
+{
+	struct task_struct *ts;
+
+	memcpy(pbd->cbt_uuid, uuid, sizeof(pbd->cbt_uuid));
+
+	if (full) {
+		int i, off;
+		for (i = 0; i < NR_PAGES(pbd->ppb_block_max); i++)
+			memset(page_address(pbd->ppb_map[i]), 0xff, PAGE_SIZE);
+
+		/* nullify bits beyond [0, pbd->ppb_block_max) range */
+		off = pbd->ppb_block_max & (BITS_PER_PAGE -1);
+		i = pbd->ppb_block_max >> (PAGE_SHIFT + 3);
+		while (off && off < BITS_PER_PAGE) {
+			__clear_bit(off, page_address(pbd->ppb_map[i]));
+			off++;
+		}
+	} else {
+		int rc = blk_cbt_map_copy_once(pbd->plo->queue,
+					       uuid,
+					       &pbd->cbt_map,
+					       &pbd->cbt_block_max,
+					       &pbd->cbt_block_bits);
+		if (rc)
+			return rc;
+
+		rc = convert_map_to_map(pbd);
+		if (rc)
+			return rc;
+	}
+
+	ts = kthread_create(ploop_pb_health_monitor, pbd, "ploop_pb_hm%d",
+			    pbd->plo->index);
+	if (IS_ERR(ts))
+		return PTR_ERR(ts);
+
+	pbd->health_monitor_thread = ts;
+	wake_up_process(ts);
+	return 0;
+}
+
+void ploop_pb_fini(struct ploop_pushbackup_desc *pbd)
+{
+	if (pbd == NULL)
+		return;
+
+	if (!RB_EMPTY_ROOT(&pbd->pending_set.tree))
+		printk("ploop_pb_fini: pending_tree is not empty!\n");
+	if (!RB_EMPTY_ROOT(&pbd->reported_set.tree))
+		printk("ploop_pb_fini: reported_tree is not empty!\n");
+
+	if (pbd->health_monitor_thread) {
+		kthread_stop(pbd->health_monitor_thread);
+		pbd->health_monitor_thread = NULL;
+	}
+
+	if (pbd->plo) {
+		struct ploop_device *plo = pbd->plo;
+		mutex_lock(&plo->sysfs_mutex);
+		plo->pbd = NULL;
+		mutex_unlock(&plo->sysfs_mutex);
+	}
+
+	ploop_pb_cbt_map_release(pbd, true);
+	ploop_pb_map_free(pbd->ppb_map, pbd->ppb_block_max);
+	ploop_pb_map_free(pbd->reported_map, pbd->ppb_block_max);
+
+	kfree(pbd);
+}
+
+int ploop_pb_copy_cbt_to_user(struct ploop_pushbackup_desc *pbd, char *user_addr)
+{
+	unsigned long i;
+
+	for (i = 0; i < NR_PAGES(pbd->cbt_block_max); i++) {
+		struct page *page = pbd->cbt_map[i] ? : ZERO_PAGE(0);
+
+		if (copy_to_user(user_addr, page_address(page), PAGE_SIZE))
+			return -EFAULT;
+
+		user_addr += PAGE_SIZE;
+	}
+
+	return 0;
+}
+
+static void ploop_pb_add_req_to_tree(struct ploop_request *preq,
+				     struct pb_set *pbs)
+{
+	struct rb_root *tree = &pbs->tree;
+	struct rb_node ** p = &tree->rb_node;
+	struct rb_node *parent = NULL;
+	struct ploop_request * pr;
+	unsigned long timeout = preq->plo->tune.push_backup_timeout * HZ;
+
+	while (*p) {
+		parent = *p;
+		pr = rb_entry(parent, struct ploop_request, reloc_link);
+		BUG_ON (preq->req_cluster == pr->req_cluster);
+
+		if (preq->req_cluster < pr->req_cluster)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+
+	preq->tstamp = jiffies;
+	if (timeout && list_empty(&pbs->list) &&
+	    pbs->pbd->ppb_state == PLOOP_PB_ALIVE)
+		mod_timer(&pbs->timer, preq->tstamp + timeout + 1);
+
+	list_add_tail(&preq->list, &pbs->list);
+
+	rb_link_node(&preq->reloc_link, parent, p);
+	rb_insert_color(&preq->reloc_link, tree);
+}
+
+static void ploop_pb_add_req_to_pending(struct ploop_pushbackup_desc *pbd,
+					struct ploop_request *preq)
+{
+	ploop_pb_add_req_to_tree(preq, &pbd->pending_set);
+}
+
+static void ploop_pb_add_req_to_reported(struct ploop_pushbackup_desc *pbd,
+					 struct ploop_request *preq)
+{
+	ploop_pb_add_req_to_tree(preq, &pbd->reported_set);
+}
+
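+/* Remove preq from the set's tree and list.  If it was the oldest entry,
+ * re-arm the timeout timer for the new oldest one (if any). */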
+static void remove_req_from_pbs(struct pb_set *pbs,
+					 struct ploop_request *preq)
+{
+	unsigned long timeout = preq->plo->tune.push_backup_timeout * HZ;
+	bool oldest_deleted = false;
+
+	if (preq == list_first_entry(&pbs->list, struct ploop_request, list))
+		oldest_deleted = true;
+
+	rb_erase(&preq->reloc_link, &pbs->tree);
+	list_del_init(&preq->list);
+
+	if (timeout && oldest_deleted && !list_empty(&pbs->list) &&
+	    pbs->pbd->ppb_state == PLOOP_PB_ALIVE) {
+		preq = list_first_entry(&pbs->list, struct ploop_request,
+					list);
+		mod_timer(&pbs->timer, preq->tstamp + timeout + 1);
+	}
+}
+
+
+static inline bool preq_match(struct ploop_request *preq, cluster_t clu,
+			      cluster_t len)
+{
+	return preq &&
+		clu <= preq->req_cluster &&
+		preq->req_cluster < clu + len;
+}
+
+/* Return (and remove from the tree) the leftmost preq whose req_cluster
+ * lies in [clu, clu + len); *npreq is set to its in-order successor. */
+static struct ploop_request *ploop_pb_get_req_from_tree(struct pb_set *pbs,
+						cluster_t clu, cluster_t len,
+						struct ploop_request **npreq)
+{
+	struct rb_root *tree = &pbs->tree;
+	struct rb_node *n = tree->rb_node;
+	struct ploop_request *p = NULL;
+
+	*npreq = NULL;
+
+	while (n) {
+		p = rb_entry(n, struct ploop_request, reloc_link);
+
+		if (clu < p->req_cluster)
+			n = n->rb_left;
+		else if (clu > p->req_cluster)
+			n = n->rb_right;
+		else { /* perfect match */
+			n = rb_next(n);
+			if (n)
+				*npreq = rb_entry(n, struct ploop_request,
+						  reloc_link);
+			remove_req_from_pbs(pbs, p);
+			return p;
+		}
+	}
+	/* here p is not perfect, but it's closest */
+
+	if (p && p->req_cluster < clu) {
+		n = rb_next(&p->reloc_link);
+		if (n)
+			p = rb_entry(n, struct ploop_request, reloc_link);
+	}
+
+	if (preq_match(p, clu, len)) {
+		n = rb_next(&p->reloc_link);
+		if (n)
+			*npreq = rb_entry(n, struct ploop_request, reloc_link);
+		remove_req_from_pbs(pbs, p);
+		return p;
+	}
+
+	return NULL;
+}
+
+static struct ploop_request *
+ploop_pb_get_first_req_from_tree(struct pb_set *pbs,
+				 struct ploop_request **npreq)
+{
+	struct rb_root *tree = &pbs->tree;
+	struct ploop_request *p;
+	struct rb_node *n = rb_first(tree);
+
+	if (!n)
+		return NULL;
+
+	if (npreq) {
+		struct rb_node *next = rb_next(n);
+		if (next)
+			*npreq = rb_entry(next, struct ploop_request,
+					  reloc_link);
+		else
+			*npreq = NULL;
+	}
+
+	p = rb_entry(n, struct ploop_request, reloc_link);
+	remove_req_from_pbs(pbs, p);
+	return p;
+}
+
+static struct ploop_request *
+ploop_pb_get_first_req_from_pending(struct ploop_pushbackup_desc *pbd)
+{
+	return ploop_pb_get_first_req_from_tree(&pbd->pending_set, NULL);
+}
+
+static struct ploop_request *
+ploop_pb_get_first_reqs_from_pending(struct ploop_pushbackup_desc *pbd,
+				     struct ploop_request **npreq)
+{
+	return ploop_pb_get_first_req_from_tree(&pbd->pending_set, npreq);
+}
+
+static struct ploop_request *
+ploop_pb_get_first_req_from_reported(struct ploop_pushbackup_desc *pbd)
+{
+	return ploop_pb_get_first_req_from_tree(&pbd->reported_set, NULL);
+}
+
+int ploop_pb_preq_add_pending(struct ploop_pushbackup_desc *pbd,
+			       struct ploop_request *preq)
+{
+	BUG_ON(!pbd);
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return -ESTALE;
+	}
+
+	if (!test_bit(PLOOP_S_PUSH_BACKUP, &pbd->plo->state)) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return -EINTR;
+	}
+
+	if (check_bit_in_map(pbd->reported_map, pbd->ppb_block_max,
+			     preq->req_cluster)) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return -EALREADY;
+	}
+
+	ploop_pb_add_req_to_pending(pbd, preq);
+
+	if (pbd->ppb_waiting)
+		complete(&pbd->ppb_comp);
+
+	spin_unlock_irq(&pbd->ppb_lock);
+	return 0;
+}
+
+bool ploop_pb_check_and_clear_bit(struct ploop_pushbackup_desc *pbd,
+				  cluster_t clu)
+{
+	if (!pbd)
+		return false;
+
+	if (!check_bit_in_map(pbd->ppb_map, pbd->ppb_block_max, clu))
+		return false;
+
+	spin_lock(&pbd->ppb_lock);
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE ||
+	    check_bit_in_map(pbd->reported_map, pbd->ppb_block_max, clu)) {
+		spin_unlock(&pbd->ppb_lock);
+		ploop_pb_clear_bit(pbd, clu);
+		return false;
+	}
+
+	spin_unlock(&pbd->ppb_lock);
+	return true;
+}
+
+static void return_bios_back_to_plo(struct ploop_device *plo,
+				    struct bio_list *bl)
+{
+	if (!bl->head)
+		return;
+
+	if (plo->bio_tail)
+		plo->bio_tail->bi_next = bl->head;
+	else
+		plo->bio_head = bl->head;
+
+	plo->bio_tail = bl->tail;
+
+	bio_list_init(bl);
+}
+
+/* Always serialized by plo->ctl_mutex */
+unsigned long ploop_pb_stop(struct ploop_pushbackup_desc *pbd, bool do_merge)
+{
+	unsigned long ret = 0;
+	int merge_status = 0;
+	LIST_HEAD(drop_list);
+
+	if (pbd == NULL)
+		return 0;
+
+	spin_lock_irq(&pbd->ppb_lock);
+	if (pbd->ppb_state == PLOOP_PB_DEAD) {
+		spin_unlock_irq(&pbd->ppb_lock);
+		return 0;
+	}
+	pbd->ppb_state = PLOOP_PB_DEAD;
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	ploop_pbs_fini(&pbd->pending_set);
+	ploop_pbs_fini(&pbd->reported_set);
+
+	merge_status = ploop_pb_cbt_map_release(pbd, do_merge);
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	while (!RB_EMPTY_ROOT(&pbd->pending_set.tree)) {
+		struct ploop_request *preq =
+			ploop_pb_get_first_req_from_pending(pbd);
+		list_add(&preq->list, &drop_list);
+		ret++;
+	}
+
+	while (!RB_EMPTY_ROOT(&pbd->reported_set.tree)) {
+		struct ploop_request *preq =
+			ploop_pb_get_first_req_from_reported(pbd);
+		list_add(&preq->list, &drop_list);
+		ret++;
+	}
+
+	if (pbd->ppb_waiting)
+		complete(&pbd->ppb_comp);
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	if (!list_empty(&drop_list) || !ploop_pb_bio_list_empty(pbd)) {
+		struct ploop_device *plo = pbd->plo;
+
+		BUG_ON(!plo);
+		spin_lock_irq(&plo->lock);
+		list_splice_init(&drop_list, plo->ready_queue.prev);
+		return_bios_back_to_plo(plo, &pbd->bio_pending_list);
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+		spin_unlock_irq(&plo->lock);
+	}
+
+	return merge_status ? : ret;
+}
+
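+/* Report the next extent of pending push-backup requests to userspace.
+ * If the pending tree is empty and this is the first call of a batch
+ * (n_done == 0), block until a request arrives.  Requests with adjacent
+ * req_cluster values are coalesced into one [*clu_p, *clu_p + *len_p)
+ * extent and moved to the reported set. */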
+int ploop_pb_get_pending(struct ploop_pushbackup_desc *pbd,
+			 cluster_t *clu_p, cluster_t *len_p, unsigned n_done)
+{
+	bool blocking  = !n_done;
+	struct ploop_request *preq, *npreq;
+	int err = 0;
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	preq = ploop_pb_get_first_reqs_from_pending(pbd, &npreq);
+	if (!preq) {
+		struct ploop_device *plo = pbd->plo;
+
+		if (!blocking) {
+			err = -ENOENT;
+			goto get_pending_unlock;
+		}
+
+		/* blocking case */
+		if (pbd->ppb_state != PLOOP_PB_ALIVE) {
+			err = -ESTALE;
+			goto get_pending_unlock;
+		}
+		if (unlikely(pbd->ppb_waiting)) {
+			/* Other task is already waiting for event */
+			err = -EBUSY;
+			goto get_pending_unlock;
+		}
+		pbd->ppb_waiting = true;
+		spin_unlock_irq(&pbd->ppb_lock);
+
+		mutex_unlock(&plo->ctl_mutex);
+		err = wait_for_completion_interruptible(&pbd->ppb_comp);
+		mutex_lock(&plo->ctl_mutex);
+
+		if (plo->pbd != pbd)
+			return -EINTR;
+
+		spin_lock_irq(&pbd->ppb_lock);
+		pbd->ppb_waiting = false;
+		init_completion(&pbd->ppb_comp);
+
+		preq = ploop_pb_get_first_reqs_from_pending(pbd, &npreq);
+		if (!preq) {
+			if (!test_bit(PLOOP_S_PUSH_BACKUP, &plo->state))
+				err = -EINTR;
+			else if (pbd->ppb_state != PLOOP_PB_ALIVE)
+				err =  -ESTALE;
+			else if (signal_pending(current))
+				err = -ERESTARTSYS;
+			else
+				err = -ENOENT;
+
+			goto get_pending_unlock;
+		}
+	}
+
+	ploop_pb_add_req_to_reported(pbd, preq);
+
+	*clu_p = preq->req_cluster;
+	*len_p = 1;
+
+	while (npreq && npreq->req_cluster == *clu_p + *len_p) {
+		struct rb_node *next = rb_next(&npreq->reloc_link);
+
+		preq = npreq;
+		if (next)
+			npreq = rb_entry(next, struct ploop_request,
+					 reloc_link);
+		else
+			npreq = NULL;
+
+		remove_req_from_pbs(&pbd->pending_set, preq);
+		ploop_pb_add_req_to_reported(pbd, preq);
+
+		(*len_p)++;
+	}
+
+get_pending_unlock:
+	spin_unlock_irq(&pbd->ppb_lock);
+	return err;
+}
+
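+/* Compose the "still to be backed up" mask for one bitmap page:
+ * bits set in ppb_map but not yet set in reported_map. */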
+static void fill_page_to_backup(struct ploop_pushbackup_desc *pbd,
+				unsigned long idx, struct page *page)
+{
+	u32 *dst = page_address(page);
+	u32 *fin = page_address(page) + PAGE_SIZE;
+	u32 *map = page_address(pbd->ppb_map[idx]);
+	u32 *rep = page_address(pbd->reported_map[idx]);
+
+	while (dst < fin) {
+		*dst = *map & ~*rep;
+		dst++;
+		map++;
+		rep++;
+	}
+}
+
+int ploop_pb_peek(struct ploop_pushbackup_desc *pbd,
+		  cluster_t *clu_p, cluster_t *len_p, unsigned n_done)
+{
+	unsigned long block = *clu_p + *len_p;
+	unsigned long idx = block >> (PAGE_SHIFT + 3);
+	unsigned long clu = 0;
+	unsigned long len = 0;
+	unsigned long off, off2;
+	struct page *page;
+	bool found = false;
+
+	if (block >= pbd->ppb_block_max)
+		return -ENOENT;
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE)
+		return -ESTALE;
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	spin_lock_irq(&pbd->ppb_lock);
+	while (block < pbd->ppb_block_max) {
+		fill_page_to_backup(pbd, idx, page);
+		off = block & (BITS_PER_PAGE -1);
+
+		if (!found) {
+			clu = find_next_bit(page_address(page),
+					       BITS_PER_PAGE, off);
+			if (clu == BITS_PER_PAGE)
+				goto next;
+
+			off = clu;
+			clu += idx << (PAGE_SHIFT + 3);
+			found = true;
+		}
+
+		if (found) {
+			off2 = find_next_zero_bit(page_address(page),
+						  BITS_PER_PAGE, off);
+			len += off2 - off;
+			if (off2 != BITS_PER_PAGE)
+				break;
+		}
+
+	next:
+		idx++;
+		block = idx << (PAGE_SHIFT + 3);
+	}
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	__free_page(page);
+
+	if (!found)
+		return -ENOENT;
+
+	*clu_p = clu;
+	*len_p = len;
+	return 0;
+}
+
+static void ploop_pb_process_extent(struct pb_set *pbs, cluster_t clu,
+				    cluster_t len, struct list_head *ready_list,
+				    int *n_found)
+{
+	struct ploop_request *preq, *npreq;
+
+	preq = ploop_pb_get_req_from_tree(pbs, clu, len, &npreq);
+
+	while (preq) {
+		struct rb_node *n;
+
+		set_bit(PLOOP_REQ_PUSH_BACKUP, &preq->ppb_state);
+		list_add(&preq->list, ready_list);
+
+		if (n_found)
+			(*n_found)++;
+
+		if (!preq_match(npreq, clu, len))
+			break;
+
+		preq = npreq;
+		n = rb_next(&preq->reloc_link);
+		if (n)
+			npreq = rb_entry(n, struct ploop_request, reloc_link);
+		else
+			npreq = NULL;
+		remove_req_from_pbs(pbs, preq);
+	}
+}
+
+void ploop_pb_put_reported(struct ploop_pushbackup_desc *pbd,
+			   cluster_t clu, cluster_t len)
+{
+	int n_found = 0;
+	LIST_HEAD(ready_list);
+
+	spin_lock_irq(&pbd->ppb_lock);
+
+	ploop_pb_process_extent(&pbd->reported_set, clu, len, &ready_list, &n_found);
+	ploop_pb_process_extent(&pbd->pending_set, clu, len, &ready_list, NULL);
+
+	/*
+	 * If no preq was found above, this is an unsolicited report. Then it's
+	 * enough to have the corresponding bit set in reported_map because if
+	 * any WRITE-request comes afterwards, ploop_pb_preq_add_pending()
+	 * fails and ploop_thread will clear corresponding bit in ppb_map
+	 * -- see "push_backup special processing" in ploop_entry_request()
+	 * for details.
+	 */
+	set_bits_in_map(pbd->reported_map, pbd->ppb_block_max, clu, len);
+
+	spin_unlock_irq(&pbd->ppb_lock);
+
+	if (!list_empty(&ready_list)) {
+		struct ploop_device *plo = pbd->plo;
+
+		spin_lock_irq(&plo->lock);
+		list_splice(&ready_list, plo->ready_queue.prev);
+		if (test_bit(PLOOP_S_WAIT_PROCESS, &plo->state))
+			wake_up_interruptible(&plo->waitq);
+		spin_unlock_irq(&plo->lock);
+	}
+}
+
+int ploop_pb_destroy(struct ploop_device *plo, __u32 *status)
+{
+	struct ploop_pushbackup_desc *pbd = plo->pbd;
+	unsigned long ret;
+	bool do_merge;
+
+	if (!test_and_clear_bit(PLOOP_S_PUSH_BACKUP, &plo->state))
+		return -EINVAL;
+
+	BUG_ON (!pbd);
+	do_merge = status ? *status : true;
+	ret = ploop_pb_stop(pbd, do_merge);
+
+	if (status)
+		*status = ret;
+
+	ploop_quiesce(plo);
+	ploop_pb_fini(plo->pbd);
+	plo->maintenance_type = PLOOP_MNTN_OFF;
+	ploop_relax(plo);
+
+	return 0;
+}
+
+static bool ploop_pb_set_expired(struct pb_set *pbs)
+{
+	struct ploop_pushbackup_desc *pbd = pbs->pbd;
+	struct ploop_device          *plo = pbd->plo;
+	unsigned long timeout = plo->tune.push_backup_timeout * HZ;
+	unsigned long tstamp = 0;
+	cluster_t clu = 0;
+	bool ret = false;
+	unsigned long flags;
+
+	if (!timeout)
+		return false;
+
+	spin_lock_irqsave(&pbd->ppb_lock, flags);
+
+	if (pbd->ppb_state != PLOOP_PB_ALIVE) {
+		spin_unlock_irqrestore(&pbd->ppb_lock, flags);
+		return false;
+	}
+
+	/* No need to scan the whole list: the first preq is the oldest! */
+	if (!list_empty(&pbs->list)) {
+		struct ploop_request *preq = list_first_entry(&pbs->list,
+							      struct ploop_request, list);
+		if (time_before(preq->tstamp + timeout, jiffies)) {
+			tstamp = preq->tstamp;
+			clu = preq->req_cluster;
+			ret = true;
+		} else
+			mod_timer(&pbs->timer, preq->tstamp + timeout + 1);
+	}
+
+	spin_unlock_irqrestore(&pbd->ppb_lock, flags);
+
+	if (ret)
+		printk(KERN_WARNING "Abort push_backup for ploop%d: found "
+		       "preq (clu=%d) in %s tree delayed for %u msecs\n",
+		       plo->index, clu, pbs->name,
+		       jiffies_to_msecs(jiffies - tstamp));
+
+	return ret;
+}
+
+static void ploop_pb_timeout_func(unsigned long data)
+{
+	struct pb_set                *pbs = (void*)data;
+	struct ploop_pushbackup_desc *pbd = pbs->pbd;
+	struct ploop_device          *plo = pbd->plo;
+	unsigned long flags;
+
+	if (!plo->tune.push_backup_timeout ||
+	    !test_bit(PLOOP_S_RUNNING, &plo->state) ||
+	    !test_bit(PLOOP_S_PUSH_BACKUP, &plo->state) ||
+	    !ploop_pb_set_expired(pbs))
+		return;
+
+	spin_lock_irqsave(&pbd->ppb_lock, flags);
+	if (pbd->ppb_state == PLOOP_PB_ALIVE) {
+		pbd->ppb_state = PLOOP_PB_STOPPING;
+		if (waitqueue_active(&pbd->ppb_waitq))
+			wake_up_interruptible(&pbd->ppb_waitq);
+	}
+	spin_unlock_irqrestore(&pbd->ppb_lock, flags);
+}
+
+/* Return true if bio was detained, false otherwise */
+bool ploop_pb_bio_detained(struct ploop_pushbackup_desc *pbd, struct bio *bio)
+{
+	cluster_t   clu = bio->bi_sector >> pbd->plo->cluster_log;
+
+	if (ploop_pb_check_and_clear_bit(pbd, clu)) {
+		bio_list_add(&pbd->bio_pending_list, bio);
+		return true;
+	}
+
+	return false;
+}
+
+/* Return true if no detained bios are present, false otherwise */
+bool ploop_pb_bio_list_empty(struct ploop_pushbackup_desc *pbd)
+{
+	return !pbd || bio_list_empty(&pbd->bio_pending_list);
+}
+
+struct bio *ploop_pb_bio_get(struct ploop_pushbackup_desc *pbd)
+{
+	return bio_list_pop(&pbd->bio_pending_list);
+}
+
+void ploop_pb_bio_list_merge(struct ploop_pushbackup_desc *pbd,
+			     struct bio_list *tmp)
+{
+	bio_list_merge(&pbd->bio_pending_list, tmp);
+}
--- /dev/null
+++ b/drivers/block/ploop/push_backup.h
@@ -0,0 +1,37 @@
+/*
+ *  drivers/block/ploop/push_backup.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+struct ploop_pushbackup_desc;
+
+struct ploop_pushbackup_desc *ploop_pb_alloc(struct ploop_device *plo);
+int ploop_pb_init(struct ploop_pushbackup_desc *pbd, __u8 *uuid, bool full);
+void ploop_pb_fini(struct ploop_pushbackup_desc *pbd);
+int ploop_pb_copy_cbt_to_user(struct ploop_pushbackup_desc *pbd, char *user_addr);
+unsigned long ploop_pb_stop(struct ploop_pushbackup_desc *pbd, bool do_merge);
+int ploop_pb_check_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid);
+int ploop_pb_get_uuid(struct ploop_pushbackup_desc *pbd, __u8 *uuid);
+
+int ploop_pb_get_pending(struct ploop_pushbackup_desc *pbd,
+			 cluster_t *clu_p, cluster_t *len_p, unsigned n_done);
+int ploop_pb_peek(struct ploop_pushbackup_desc *pbd,
+		  cluster_t *clu_p, cluster_t *len_p, unsigned n_done);
+void ploop_pb_put_reported(struct ploop_pushbackup_desc *pbd,
+			   cluster_t clu, cluster_t len);
+
+void ploop_pb_clear_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu);
+bool ploop_pb_check_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu);
+bool ploop_pb_check_and_clear_bit(struct ploop_pushbackup_desc *pbd, cluster_t clu);
+
+int ploop_pb_preq_add_pending(struct ploop_pushbackup_desc *pbd,
+			       struct ploop_request *preq);
+
+int ploop_pb_destroy(struct ploop_device *plo, __u32 *status);
+
+bool ploop_pb_bio_detained(struct ploop_pushbackup_desc *pbd, struct bio *bio);
+bool ploop_pb_bio_list_empty(struct ploop_pushbackup_desc *pbd);
+struct bio *ploop_pb_bio_get(struct ploop_pushbackup_desc *pbd);
+void ploop_pb_bio_list_merge(struct ploop_pushbackup_desc *pbd, struct bio_list *tmp);
--- /dev/null
+++ b/drivers/block/ploop/sysfs.c
@@ -0,0 +1,690 @@
+/*
+ *  drivers/block/ploop/sysfs.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/interrupt.h>
+#include <linux/buffer_head.h>
+#include <linux/kthread.h>
+#include <asm/uaccess.h>
+
+#include <linux/ploop/ploop.h>
+#include "push_backup.h"
+
+struct delta_sysfs_entry {
+	struct attribute attr;
+	ssize_t (*show)(struct ploop_delta *, char *);
+	ssize_t (*store)(struct ploop_delta *, const char *, size_t);
+};
+
+static ssize_t
+delta_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct delta_sysfs_entry *entry = container_of(attr, struct delta_sysfs_entry, attr);
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (!entry->show)
+		return -EIO;
+	return entry->show(delta, page);
+}
+
+static ssize_t
+delta_attr_store(struct kobject *kobj, struct attribute *attr,
+		 const char *page, size_t length)
+{
+	struct delta_sysfs_entry *entry = container_of(attr, struct delta_sysfs_entry, attr);
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (!entry->store)
+		return -EIO;
+
+	return entry->store(delta, page, length);
+}
+
+
+static struct sysfs_ops delta_sysfs_ops = {
+	.show	= delta_attr_show,
+	.store	= delta_attr_store,
+};
+
+static void release_delta(struct kobject *kobj)
+{
+	struct ploop_delta *delta = container_of(kobj, struct ploop_delta, kobj);
+
+	if (delta->ops)
+		ploop_format_put(delta->ops);
+	module_put(THIS_MODULE);
+	kfree(delta);
+}
+
+static ssize_t
+delta_var_show(unsigned int var, char *page)
+{
+	return sprintf(page, "%d\n", var);
+}
+
+static ssize_t
+delta_string_show(char * str, char *page)
+{
+	return sprintf(page, "%s\n", str);
+}
+
+static ssize_t delta_level_show(struct ploop_delta *delta, char *page)
+{
+	return delta_var_show(delta->level, page);
+}
+
+static ssize_t delta_image_show(struct ploop_delta *delta, char *page)
+{
+	char * res;
+	int len = -ENOENT;
+
+	mutex_lock(&delta->plo->sysfs_mutex);
+	if (delta->io.files.file) {
+		res = d_path(&delta->io.files.file->f_path, page, PAGE_SIZE-1);
+		len = PTR_ERR(res);
+		if (!IS_ERR(res)) {
+			len = strlen(res);
+			if (res != page)
+				memmove(page, res, len);
+			page[len] = '\n';
+			len++;
+		}
+	}
+	mutex_unlock(&delta->plo->sysfs_mutex);
+	return len;
+}
+
+static ssize_t delta_format_show(struct ploop_delta *delta, char *page)
+{
+	return delta_string_show(delta->ops->name, page);
+}
+
+static ssize_t delta_io_show(struct ploop_delta *delta, char *page)
+{
+	return delta_string_show(delta->io.ops->name, page);
+}
+
+static ssize_t delta_ro_show(struct ploop_delta *delta, char *page)
+{
+	return sprintf(page, "%d\n", !!(delta->flags & PLOOP_FMT_RDONLY));
+}
+
+static ssize_t delta_trans_show(struct ploop_delta *delta, char *page)
+{
+	struct ploop_device * plo = delta->plo;
+	int trans = 0;
+
+	mutex_lock(&delta->plo->sysfs_mutex);
+	if (plo->trans_map && map_top_delta(plo->trans_map) == delta)
+		trans = 1;
+	mutex_unlock(&delta->plo->sysfs_mutex);
+	return sprintf(page, "%d\n", trans);
+}
+
+static ssize_t delta_dump(struct ploop_delta *delta, char *page)
+{
+	int ret = delta->io.ops->dump ? delta->io.ops->dump(&delta->io) : -1;
+	return sprintf(page, "%d\n", ret);
+}
+
+static struct delta_sysfs_entry delta_level_entry = {
+	.attr = {.name = "level", .mode = S_IRUGO },
+	.show = delta_level_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_image_entry = {
+	.attr = {.name = "image", .mode = S_IRUGO },
+	.show = delta_image_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_format_entry = {
+	.attr = {.name = "format", .mode = S_IRUGO },
+	.show = delta_format_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_io_entry = {
+	.attr = {.name = "io", .mode = S_IRUGO },
+	.show = delta_io_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_ro_entry = {
+	.attr = {.name = "ro", .mode = S_IRUGO },
+	.show = delta_ro_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_trans_entry = {
+	.attr = {.name = "transparent", .mode = S_IRUGO },
+	.show = delta_trans_show,
+	.store = NULL,
+};
+
+static struct delta_sysfs_entry delta_dump_entry = {
+	.attr = {.name = "dump", .mode = S_IRUGO },
+	.show = delta_dump,
+};
+
+static struct attribute *default_attrs[] = {
+	&delta_level_entry.attr,
+	&delta_image_entry.attr,
+	&delta_format_entry.attr,
+	&delta_io_entry.attr,
+	&delta_ro_entry.attr,
+	&delta_trans_entry.attr,
+	&delta_dump_entry.attr,
+	NULL,
+};
+
+struct kobj_type ploop_delta_ktype = {
+	.sysfs_ops	= &delta_sysfs_ops,
+	.default_attrs	= default_attrs,
+	.release	= release_delta,
+};
+
+
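+/* Generate the pstat attribute table from linux/ploop/ploop_stat.h by
+ * including it three times with different definitions of __DO(): once
+ * for the struct fields, once for the initializers and once for the
+ * attribute pointer array.  pstat_show()/pstat_store() then derive the
+ * index of a statistic from the attribute's offset within _attr_arr. */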
+static struct {
+#define __DO(_at)	struct attribute _at;
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+} _attr_arr = {
+#define __DO(_at)	._at = { .name = __stringify(_at), .mode = S_IRUGO|S_IWUSR, },
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+};
+
+static struct attribute *stats_attributes[] = {
+#define __DO(_at) &_attr_arr._at,
+#include <linux/ploop/ploop_stat.h>
+#undef __DO
+	NULL
+};
+
+static const struct attribute_group stats_group = {
+	.attrs = stats_attributes,
+};
+
+
+
+#define to_disk(obj) dev_to_disk(container_of(obj, struct device, kobj))
+
+static ssize_t pstat_show(struct kobject *kobj, struct attribute *attr,
+			  char *page)
+{
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	int n;
+
+	n = attr - (struct attribute *)&_attr_arr;
+
+	return sprintf(page, "%u\n", ((u32*)&plo->st)[n]);
+}
+
+static ssize_t pstat_store(struct kobject * kobj, struct attribute * attr,
+			   const char *page, size_t count)
+{
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	char *p = (char *) page;
+	unsigned long var;
+	int n;
+
+	var = simple_strtoul(p, &p, 10);
+
+	n = attr - (struct attribute *)&_attr_arr;
+	((u32*)&plo->st)[n] = var;
+	return count;
+}
+
+static u32 show_block_size(struct ploop_device * plo)
+{
+	return 1 << plo->cluster_log;
+}
+
+static u32 show_fmt_version(struct ploop_device * plo)
+{
+	return plo->fmt_version;
+}
+
+static u32 show_total_bios(struct ploop_device * plo)
+{
+	return plo->bio_total;
+}
+
+static u32 show_queued_bios(struct ploop_device * plo)
+{
+	return plo->bio_qlen;
+}
+
+static u32 show_discard_bios(struct ploop_device * plo)
+{
+	return plo->bio_discard_qlen;
+}
+
+static u32 show_active_reqs(struct ploop_device * plo)
+{
+	return plo->active_reqs;
+}
+
+static u32 show_entry_read_sync_reqs(struct ploop_device * plo)
+{
+	return plo->read_sync_reqs;
+}
+
+static u32 show_entry_reqs(struct ploop_device * plo)
+{
+	return plo->entry_qlen;
+}
+
+static u32 show_barrier_reqs(struct ploop_device * plo)
+{
+	return plo->barrier_reqs;
+}
+
+static u32 show_fsync_reqs(struct ploop_device * plo)
+{
+	u32 qlen = 0;
+	mutex_lock(&plo->sysfs_mutex);
+	if (!list_empty(&plo->map.delta_list))
+		qlen = ploop_top_delta(plo)->io.fsync_qlen;
+	mutex_unlock(&plo->sysfs_mutex);
+	return qlen;
+}
+
+static u32 show_fastpath_reqs(struct ploop_device * plo)
+{
+	return plo->fastpath_reqs;
+}
+
+static u32 show_map_pages(struct ploop_device * plo)
+{
+	return plo->map.pages;
+}
+
+static u32 show_running(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_RUNNING, &plo->state);
+}
+
+static u32 show_locked(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_LOCKED, &plo->locking_state);
+}
+
+static u32 show_aborted(struct ploop_device * plo)
+{
+	return test_bit(PLOOP_S_ABORT, &plo->state);
+}
+
+static int store_aborted(struct ploop_device * plo, u32 val)
+{
+	printk(KERN_INFO "ploop: Force %s aborted state for ploop%d\n",
+	       val ? "set" : "clear", plo->index);
+
+	if (val)
+		set_bit(PLOOP_S_ABORT, &plo->state);
+	else
+		clear_bit(PLOOP_S_ABORT, &plo->state);
+	return 0;
+}
+
+static u32 show_top(struct ploop_device * plo)
+{
+	int top = -1;
+
+	mutex_lock(&plo->sysfs_mutex);
+	if (!list_empty(&plo->map.delta_list))
+		top = ploop_top_delta(plo)->level;
+	if (plo->trans_map)
+		top++;
+	mutex_unlock(&plo->sysfs_mutex);
+	return (u32)top;
+}
+
+static inline u32 get_event_locked(struct ploop_device * plo)
+{
+	if (test_and_clear_bit(PLOOP_S_ENOSPC_EVENT, &plo->state))
+		return PLOOP_EVENT_ENOSPC;
+	else if (test_bit(PLOOP_S_ABORT, &plo->state))
+		return PLOOP_EVENT_ABORTED;
+	else if (!test_bit(PLOOP_S_RUNNING, &plo->state))
+		return PLOOP_EVENT_STOPPED;
+
+	return 0;
+}
+
+static u32 show_event(struct ploop_device * plo)
+{
+	u32 ret;
+
+	DEFINE_WAIT(_wait);
+	spin_lock_irq(&plo->lock);
+
+	ret = get_event_locked(plo);
+	if (ret) {
+		spin_unlock_irq(&plo->lock);
+		return ret;
+	}
+
+	prepare_to_wait(&plo->event_waitq, &_wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&plo->lock);
+	schedule();
+	spin_lock_irq(&plo->lock);
+	finish_wait(&plo->event_waitq, &_wait);
+
+	ret = get_event_locked(plo);
+
+	spin_unlock_irq(&plo->lock);
+	return ret;
+}
+
+static u32 show_open_count(struct ploop_device * plo)
+{
+	return atomic_read(&plo->open_count);
+}
+
+static ssize_t print_cookie(struct ploop_device * plo, char * page)
+{
+	return sprintf(page, "%s\n", plo->cookie);
+}
+
+static ssize_t print_push_backup_uuid(struct ploop_device * plo, char * page)
+{
+	__u8 uuid[16];
+	int err;
+
+	mutex_lock(&plo->sysfs_mutex);
+	err = ploop_pb_get_uuid(plo->pbd, uuid);
+	mutex_unlock(&plo->sysfs_mutex);
+
+	page[0] = '\0';
+	if (err)
+		return 0;
+
+	return snprintf(page, PAGE_SIZE, "%pUB\n", uuid);
+}
+
+static u32 show_free_reqs(struct ploop_device * plo)
+{
+	return plo->free_qlen;
+}
+
+static u32 show_free_qmax(struct ploop_device * plo)
+{
+	return plo->free_qmax;
+}
+
+static u32 show_blockable_reqs(struct ploop_device * plo)
+{
+	return plo->blockable_reqs;
+}
+
+static u32 show_blocked_bios(struct ploop_device * plo)
+{
+	return plo->blocked_bios;
+}
+
+static u32 show_freeze_state(struct ploop_device * plo)
+{
+	return plo->freeze_state;
+}
+
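+/* The _TUNE_* macros below generate a show_<name>()/store_<name>() pair
+ * for each tunable; _TUNE_JIFFIES additionally converts between
+ * milliseconds (as exposed via sysfs) and jiffies (as stored in
+ * plo->tune). */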
+#define _TUNE_U32(_name)				\
+static u32 show_##_name(struct ploop_device * plo)	\
+{							\
+	return plo->tune._name;				\
+}							\
+							\
+static int store_##_name(struct ploop_device * plo, u32 val) \
+{							\
+	plo->tune._name = val;				\
+	return 0;					\
+}
+
+#define _TUNE_JIFFIES(_name)				\
+static u32 show_##_name(struct ploop_device * plo)	\
+{							\
+	return (plo->tune._name * 1000) / HZ;		\
+}							\
+							\
+static int store_##_name(struct ploop_device * plo, u32 val) \
+{							\
+	plo->tune._name = (val * HZ) / 1000;		\
+	return 0;					\
+}
+
+#define _TUNE_BOOL	_TUNE_U32
+
+_TUNE_U32(max_requests);
+_TUNE_U32(batch_entry_qlen);
+_TUNE_JIFFIES(batch_entry_delay);
+_TUNE_U32(fsync_max);
+_TUNE_JIFFIES(fsync_delay);
+_TUNE_BOOL(pass_flushes);
+_TUNE_BOOL(pass_fuas);
+_TUNE_BOOL(congestion_detection);
+_TUNE_BOOL(check_zeros);
+_TUNE_U32(min_map_pages);
+_TUNE_JIFFIES(max_map_inactivity);
+_TUNE_BOOL(disable_root_threshold);
+_TUNE_BOOL(disable_user_threshold);
+_TUNE_U32(congestion_high_watermark);
+_TUNE_U32(congestion_low_watermark);
+_TUNE_U32(max_active_requests);
+_TUNE_U32(push_backup_timeout);
+
+
+struct pattr_sysfs_entry {
+	struct attribute attr;
+	u32 (*show)(struct ploop_device *);
+	int (*store)(struct ploop_device *, __u32 val);
+	ssize_t (*print)(struct ploop_device *, char *page);
+};
+
+#define _A(_name) \
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO }, .show = show_##_name, }).attr
+
+#define _A2(_name) \
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO|S_IWUSR }, .show = show_##_name, .store = store_##_name, }).attr
+
+#define _A3(_name)							\
+&((struct pattr_sysfs_entry){ .attr = { .name = __stringify(_name), .mode = S_IRUGO }, .print = print_##_name, }).attr
+
+static struct attribute *state_attributes[] = {
+	_A(block_size),
+	_A(fmt_version),
+	_A(total_bios),
+	_A(queued_bios),
+	_A(discard_bios),
+	_A(active_reqs),
+	_A(entry_reqs),
+	_A(entry_read_sync_reqs),
+	_A(barrier_reqs),
+	_A(fastpath_reqs),
+	_A(fsync_reqs),
+	_A(map_pages),
+	_A(running),
+	_A(locked),
+	_A2(aborted),
+	_A(top),
+	_A(event),
+	_A3(cookie),
+	_A3(push_backup_uuid),
+	_A(open_count),
+	_A(free_reqs),
+	_A(free_qmax),
+	_A(blockable_reqs),
+	_A(blocked_bios),
+	_A(freeze_state),
+	NULL
+};
+
+static struct attribute *tune_attributes[] = {
+	_A2(max_requests),
+	_A2(batch_entry_qlen),
+	_A2(batch_entry_delay),
+	_A2(fsync_max),
+	_A2(fsync_delay),
+	_A2(min_map_pages),
+	_A2(max_map_inactivity),
+	_A2(pass_flushes),
+	_A2(pass_fuas),
+	_A2(congestion_detection),
+	_A2(check_zeros),
+	_A2(disable_root_threshold),
+	_A2(disable_user_threshold),
+	_A2(congestion_high_watermark),
+	_A2(congestion_low_watermark),
+	_A2(max_active_requests),
+	_A2(push_backup_timeout),
+	NULL
+};
+
+static const struct attribute_group state_group = {
+	.attrs = state_attributes,
+};
+
+static const struct attribute_group tune_group = {
+	.attrs = tune_attributes,
+};
+
+static ssize_t
+pattr_show(struct kobject *kobj, struct attribute *attr, char *page)
+{
+	struct pattr_sysfs_entry *entry = container_of(attr, struct pattr_sysfs_entry, attr);
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	u32 val;
+
+	if (entry->print)
+		return entry->print(plo, page);
+
+	if (!entry->show)
+		return -EIO;
+	val = entry->show(plo);
+	return sprintf(page, "%u\n", val);
+}
+
+static ssize_t
+pattr_store(struct kobject *kobj, struct attribute *attr,
+	    const char *page, size_t length)
+{
+	struct pattr_sysfs_entry *entry = container_of(attr, struct pattr_sysfs_entry, attr);
+	struct gendisk *disk = to_disk(kobj->parent);
+	struct ploop_device * plo = disk->private_data;
+	char *p = (char *) page;
+	unsigned long var;
+	int err;
+
+	if (!entry->store)
+		return -EIO;
+
+	var = simple_strtoul(p, &p, 10);
+
+	err = entry->store(plo, var);
+	return err ? : length;
+}
+
+static struct sysfs_ops pattr_sysfs_ops = {
+	.show	= &pattr_show,
+	.store	= &pattr_store,
+};
+
+static struct sysfs_ops pstat_sysfs_ops = {
+	.show	= &pstat_show,
+	.store	= &pstat_store,
+};
+
+static void pattr_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static struct kobj_type pattr_ktype = {
+	.release	= pattr_release,
+	.sysfs_ops	= &pattr_sysfs_ops,
+};
+
+static struct kobj_type pstat_ktype = {
+	.release	= pattr_release,
+	.sysfs_ops	= &pstat_sysfs_ops,
+};
+
+struct kobject *kobject_add_attr(struct gendisk *gd, const char *name,
+				 struct kobj_type * type)
+{
+	struct kobject *k;
+	int err;
+	struct kobject * parent = &disk_to_dev(gd)->kobj;
+
+	k = kzalloc(sizeof(*k), GFP_KERNEL);
+	if (!k)
+		return NULL;
+
+	kobject_init(k, type);
+
+	err = kobject_add(k, parent, "%s", name);
+	if (err) {
+		kobject_put(k);
+		return NULL;
+	}
+	return k;
+}
+
+void ploop_sysfs_init(struct ploop_device * plo)
+{
+	plo->pstat_dir = kobject_add_attr(plo->disk, "pstat", &pstat_ktype);
+	if (plo->pstat_dir) {
+		if (sysfs_create_group(plo->pstat_dir, &stats_group))
+			printk("ploop: were not able to create pstat dir\n");
+	}
+	plo->pstate_dir = kobject_add_attr(plo->disk, "pstate", &pattr_ktype);
+	if (plo->pstate_dir) {
+		if (sysfs_create_group(plo->pstate_dir, &state_group))
+			printk("ploop: were not able to create pstate dir\n");
+	}
+	plo->ptune_dir = kobject_add_attr(plo->disk, "ptune", &pattr_ktype);
+	if (plo->ptune_dir) {
+		if (sysfs_create_group(plo->ptune_dir, &tune_group))
+			printk("ploop: were not able to create ptune dir\n");
+	}
+
+	if (kobject_add(&plo->kobj, kobject_get(&disk_to_dev(plo->disk)->kobj), "%s", "pdelta"))
+		printk("ploop: were not able to create pdelta dir\n");
+}
+
+void ploop_sysfs_uninit(struct ploop_device * plo)
+{
+	if (plo->pstat_dir) {
+		sysfs_remove_group(plo->pstat_dir, &stats_group);
+		kobject_del(plo->pstat_dir);
+		kobject_put(plo->pstat_dir);
+		plo->pstat_dir = NULL;
+	}
+	if (plo->pstate_dir) {
+		sysfs_remove_group(plo->pstate_dir, &state_group);
+		kobject_del(plo->pstate_dir);
+		kobject_put(plo->pstate_dir);
+		plo->pstate_dir = NULL;
+	}
+	if (plo->ptune_dir) {
+		sysfs_remove_group(plo->ptune_dir, &tune_group);
+		kobject_del(plo->ptune_dir);
+		kobject_put(plo->ptune_dir);
+		plo->ptune_dir = NULL;
+	}
+	kobject_del(&plo->kobj);
+
+	kobject_put(&disk_to_dev(plo->disk)->kobj);
+}
--- /dev/null
+++ b/drivers/block/ploop/tracker.c
@@ -0,0 +1,293 @@
+/*
+ *  drivers/block/ploop/tracker.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/* Tracker engine detects and records changed clusters.
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/interrupt.h>
+#include <asm/uaccess.h>
+
+#include <linux/ploop/ploop.h>
+
+struct track_record
+{
+	struct rb_node	rb_node;
+	u32		start;
+	u32		end;
+};
+
+static int tree_insert(struct rb_root *root, struct track_record *m)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct track_record * entry;
+
+	while (*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct track_record, rb_node);
+
+		if (m->start < entry->start)
+			p = &(*p)->rb_left;
+		else if (m->start >= entry->end)
+			p = &(*p)->rb_right;
+		else
+			return -EEXIST;
+	}
+
+	rb_link_node(&m->rb_node, parent, p);
+	rb_insert_color(&m->rb_node, root);
+	return 0;
+}
+
+void ploop_tracker_notify(struct ploop_device * plo, sector_t sec)
+{
+	struct track_record * m;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return;
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		return;
+
+	sec >>= plo->cluster_log;
+
+	m = kmalloc(sizeof(struct track_record), GFP_NOFS);
+	if (m == NULL) {
+		set_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		return;
+	}
+
+	m->start = sec;
+	m->end = sec + 1;
+
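+	/* Insert [sec, sec + 1) and, if insertion succeeded, try to
+	 * coalesce the new record with its neighbours so the tree keeps
+	 * maximal non-overlapping extents. */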
+	spin_lock(&plo->track_lock);
+	if (tree_insert(&plo->track_tree, m)) {
+		kfree(m);
+	} else {
+		struct rb_node * rb;
+		struct track_record * merge;
+
+		if (m->start != 0) {
+			rb = rb_prev(&m->rb_node);
+			if (rb) {
+				merge = rb_entry(rb, struct track_record, rb_node);
+				if (m->start == merge->end) {
+					m->start = merge->start;
+					rb_erase(&merge->rb_node, &plo->track_tree);
+					kfree(merge);
+				}
+			}
+		}
+
+		rb = rb_next(&m->rb_node);
+		if (rb) {
+			merge = rb_entry(rb, struct track_record, rb_node);
+			if (m->end == merge->start) {
+				m->end = merge->end;
+				rb_erase(&merge->rb_node, &plo->track_tree);
+				kfree(merge);
+			}
+		}
+	}
+	spin_unlock(&plo->track_lock);
+}
+EXPORT_SYMBOL(ploop_tracker_notify);
+
+int ploop_tracker_init(struct ploop_device * plo, unsigned long arg)
+{
+	struct ploop_track_extent e;
+
+	if (plo->maintenance_type != PLOOP_MNTN_OFF)
+		return -EBUSY;
+	if (list_empty(&plo->map.delta_list))
+		return -ENOENT;
+
+	ploop_quiesce(plo);
+
+	e.start = 0;
+	e.end = (u64)ploop_top_delta(plo)->io.alloc_head << (plo->cluster_log + 9);
+	if (copy_to_user((void*)arg, &e, sizeof(struct ploop_track_extent))) {
+		ploop_relax(plo);
+		return -EFAULT;
+	}
+
+	set_bit(PLOOP_S_TRACK, &plo->state);
+	plo->maintenance_type = PLOOP_MNTN_TRACK;
+	plo->track_end = 0;
+	plo->track_ptr = 0;
+	ploop_relax(plo);
+	return 0;
+}
+
+int ploop_tracker_setpos(struct ploop_device * plo, unsigned long arg)
+{
+	u64 pos;
+
+	if (copy_from_user(&pos, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return -EINVAL;
+
+	pos >>= 9;
+
+	if (pos < plo->track_end) {
+		/* _XXX_ It would be good to trim the tail of the track
+		 * tree and to rewind tracking. We'll implement this if
+		 * it proves really useful.
+		 */
+		if (pos)
+			return -EINVAL;
+
+		ploop_quiesce(plo);
+
+		clear_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		ploop_tracker_destroy(plo, 1);
+
+		plo->track_end = pos;
+		plo->track_ptr = 0;
+
+		ploop_relax(plo);
+	} else
+		plo->track_end = pos;
+
+	return 0;
+}
+
+static struct track_record * find_record(struct rb_root * root, u32 start)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node * prev = NULL;
+
+	while (n) {
+		struct track_record * m;
+
+		m = rb_entry(n, struct track_record, rb_node);
+		prev = n;
+
+		if (start < m->start)
+			n = n->rb_left;
+		else if (start >= m->end)
+			n = n->rb_right;
+		else
+			return m;
+	}
+
+	while (prev && start >= rb_entry(prev, struct track_record, rb_node)->end)
+		prev = rb_next(prev);
+
+	if (!prev)
+		return NULL;
+
+	return rb_entry(prev, struct track_record, rb_node);
+}
+
+
+int ploop_tracker_read(struct ploop_device * plo, unsigned long arg)
+{
+	u64 ptr;
+	struct track_record * m;
+	struct ploop_delta * delta;
+	struct ploop_track_extent e;
+	int err;
+
+	if (copy_from_user(&ptr, (void*)arg, sizeof(u64)))
+		return -EFAULT;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return -EINVAL;
+
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state)) {
+		ploop_tracker_destroy(plo, 1);
+		return -ECONNABORTED;
+	}
+
+	delta = ploop_top_delta(plo);
+
+	spin_lock(&plo->track_lock);
+	m = find_record(&plo->track_tree, plo->track_ptr);
+	if (m == NULL) {
+		if (plo->track_end >= ((sector_t)delta->io.alloc_head << plo->cluster_log) &&
+		    plo->track_ptr)
+			m = find_record(&plo->track_tree, 0);
+	}
+
+	if (m) {
+		rb_erase(&m->rb_node, &plo->track_tree);
+		plo->track_ptr = m->end;
+	} else {
+		plo->track_ptr = 0;
+	}
+	spin_unlock(&plo->track_lock);
+
+	err = -EAGAIN;
+	if (m) {
+		e.start = (u64)m->start << (plo->cluster_log + 9);
+		e.end = (u64)m->end << (plo->cluster_log + 9);
+		kfree(m);
+		err = 0;
+	} else if (plo->track_end < ((sector_t)delta->io.alloc_head << plo->cluster_log)) {
+		e.start = (u64)plo->track_end << 9;
+		e.end = (u64)delta->io.alloc_head << (plo->cluster_log + 9);
+		err = 0;
+	}
+
+	if (!err && copy_to_user((void *)arg, &e, sizeof(e))) {
+		set_bit(PLOOP_S_TRACK_ABORT, &plo->state);
+		err = -EFAULT;
+	}
+
+	return err;
+}
+
+int ploop_tracker_stop(struct ploop_device * plo, int force)
+{
+	int err;
+
+	if (!test_bit(PLOOP_S_TRACK, &plo->state))
+		return 0;
+
+	ploop_quiesce(plo);
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		force = 1;
+	err = ploop_tracker_destroy(plo, force);
+	if (!err) {
+		clear_bit(PLOOP_S_TRACK, &plo->state);
+		plo->maintenance_type = PLOOP_MNTN_OFF;
+	}
+	ploop_relax(plo);
+	if (test_bit(PLOOP_S_TRACK_ABORT, &plo->state))
+		return -ECONNABORTED;
+	return err;
+}
+
+int ploop_tracker_destroy(struct ploop_device *plo, int force)
+{
+	struct rb_node * n;
+
+	if (RB_EMPTY_ROOT(&plo->track_tree))
+		return 0;
+
+	if (!force)
+		return -EBUSY;
+
+	spin_lock(&plo->track_lock);
+	while ((n = rb_first(&plo->track_tree)) != NULL) {
+		rb_erase(n, &plo->track_tree);
+		kfree(rb_entry(n, struct track_record, rb_node));
+	}
+	spin_unlock(&plo->track_lock);
+	return 0;
+}
+
+void track_init(struct ploop_device * plo)
+{
+	plo->track_tree = RB_ROOT;
+	spin_lock_init(&plo->track_lock);
+}
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -1347,6 +1347,9 @@ again:
 		xenbus_dev_fatal(dev, err, "%s", message);
  destroy_blkring:
 	blkif_free(info, 0);
+
+	kfree(info);
+	dev_set_drvdata(&dev->dev, NULL);
  out:
 	return err;
 }
@@ -1429,13 +1432,6 @@ static int blkfront_probe(struct xenbus_device *dev,
 	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
 	dev_set_drvdata(&dev->dev, info);
 
-	err = talk_to_blkback(dev, info);
-	if (err) {
-		kfree(info);
-		dev_set_drvdata(&dev->dev, NULL);
-		return err;
-	}
-
 	return 0;
 }
 
@@ -1905,8 +1901,12 @@ static void blkback_changed(struct xenbus_device *dev,
 	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
 
 	switch (backend_state) {
-	case XenbusStateInitialising:
 	case XenbusStateInitWait:
+		if (dev->state != XenbusStateInitialising)
+			break;
+		if (talk_to_blkback(dev, info))
+			break;
+	case XenbusStateInitialising:
 	case XenbusStateInitialised:
 	case XenbusStateReconfiguring:
 	case XenbusStateReconfigured:
@@ -1914,6 +1914,23 @@ static void blkback_changed(struct xenbus_device *dev,
 		break;
 
 	case XenbusStateConnected:
+		/*
+		 * talk_to_blkback sets state to XenbusStateInitialised
+		 * and blkfront_connect sets it to XenbusStateConnected
+		 * (if connection went OK).
+		 *
+		 * If the backend (or toolstack) decides to poke at backend
+		 * state (and re-trigger the watch by setting the state repeatedly
+		 * to XenbusStateConnected (4)) we need to deal with this.
+		 * This is allowed as this is used to communicate to the guest
+		 * that the size of disk has changed!
+		 */
+		if ((dev->state != XenbusStateInitialised) &&
+		    (dev->state != XenbusStateConnected)) {
+			if (talk_to_blkback(dev, info))
+				break;
+		}
+
 		blkfront_connect(info);
 		break;
 
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -1335,6 +1335,11 @@ static ssize_t random_write(struct file *file, const char __user *buffer,
 {
 	size_t ret;
 
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()))
+		return count;
+#endif
+
 	ret = write_pool(&blocking_pool, buffer, count);
 	if (ret)
 		return ret;
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -122,18 +122,6 @@ struct menu_device {
 	int		interval_ptr;
 };
 
-
-#define LOAD_INT(x) ((x) >> FSHIFT)
-#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
-
-static int get_loadavg(void)
-{
-	unsigned long this = this_cpu_load();
-
-
-	return LOAD_INT(this) * 10 + LOAD_FRAC(this) / 10;
-}
-
 static inline int which_bucket(unsigned int duration)
 {
 	int bucket = 0;
@@ -173,7 +161,7 @@ static inline int performance_multiplier(void)
 
 	/* for higher loadavg, we are more reluctant */
 
-	mult += 2 * get_loadavg();
+	mult += 10 * nr_active_cpu();
 
 	/* for IO wait tasks (per cpu!) we add 5x each */
 	mult += 10 * nr_iowait_cpu(smp_processor_id());
--- a/drivers/gpu/drm/drm_backport.c
+++ b/drivers/gpu/drm/drm_backport.c
@@ -8,41 +8,6 @@
 
 #include <drm/drm_backport.h>
 
-/*
- * shrinker
- */
-
-#undef shrinker
-#undef register_shrinker
-#undef unregister_shrinker
-
-static int shrinker2_shrink(struct shrinker *shrinker, struct shrink_control *sc)
-{
-	struct shrinker2 *s2 = container_of(shrinker, struct shrinker2, compat);
-	int count;
-
-	s2->scan_objects(s2, sc);
-	count = s2->count_objects(s2, sc);
-	shrinker->seeks = s2->seeks;
-
-	return count;
-}
-
-int register_shrinker2(struct shrinker2 *s2)
-{
-	s2->compat.shrink = shrinker2_shrink;
-	s2->compat.seeks = s2->seeks;
-	register_shrinker(&s2->compat);
-	return 0;
-}
-EXPORT_SYMBOL(register_shrinker2);
-
-void unregister_shrinker2(struct shrinker2 *s2)
-{
-	unregister_shrinker(&s2->compat);
-}
-EXPORT_SYMBOL(unregister_shrinker2);
-
 int __init drm_backport_init(void)
 {
 	return 0;
--- a/drivers/gpu/drm/ttm/ttm_page_alloc.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c
@@ -389,6 +389,10 @@ out:
  *
  * XXX: (dchinner) Deadlock warning!
  *
+ * ttm_page_pool_free() does memory allocation using GFP_KERNEL.  That means
+ * this can deadlock when called with a sc->gfp_mask that is not equal to
+ * GFP_KERNEL.
+ *
  * This code is crying out for a shrinker per pool....
  */
 static unsigned long
--- a/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
+++ b/drivers/gpu/drm/ttm/ttm_page_alloc_dma.c
@@ -1005,6 +1005,11 @@ EXPORT_SYMBOL_GPL(ttm_dma_unpopulate);
  *
  * XXX: (dchinner) Deadlock warning!
  *
+ * ttm_dma_page_pool_free() does GFP_KERNEL memory allocation, and so attention
+ * needs to be paid to sc->gfp_mask to determine if this can be done or not.
+ * GFP_KERNEL memory allocation in a GFP_ATOMIC reclaim context would be really
+ * bad.
+ *
  * I'm getting sadder as I hear more pathetical whimpers about needing per-pool
  * shrinkers
  */
--- a/drivers/hv/hyperv_vmbus.h
+++ b/drivers/hv/hyperv_vmbus.h
@@ -73,10 +73,6 @@ enum hv_cpuid_function {
 /* Define version of the synthetic interrupt controller. */
 #define HV_SYNIC_VERSION		(1)
 
-/* Define synthetic interrupt controller message constants. */
-#define HV_MESSAGE_SIZE			(256)
-#define HV_MESSAGE_PAYLOAD_BYTE_COUNT	(240)
-#define HV_MESSAGE_PAYLOAD_QWORD_COUNT	(30)
 #define HV_ANY_VP			(0xFFFFFFFF)
 
 /* Define synthetic interrupt controller flag constants. */
@@ -84,48 +80,9 @@ enum hv_cpuid_function {
 #define HV_EVENT_FLAGS_BYTE_COUNT	(256)
 #define HV_EVENT_FLAGS_DWORD_COUNT	(256 / sizeof(u32))
 
-/* Define hypervisor message types. */
-enum hv_message_type {
-	HVMSG_NONE			= 0x00000000,
-
-	/* Memory access messages. */
-	HVMSG_UNMAPPED_GPA		= 0x80000000,
-	HVMSG_GPA_INTERCEPT		= 0x80000001,
-
-	/* Timer notification messages. */
-	HVMSG_TIMER_EXPIRED			= 0x80000010,
-
-	/* Error messages. */
-	HVMSG_INVALID_VP_REGISTER_VALUE	= 0x80000020,
-	HVMSG_UNRECOVERABLE_EXCEPTION	= 0x80000021,
-	HVMSG_UNSUPPORTED_FEATURE		= 0x80000022,
-
-	/* Trace buffer complete messages. */
-	HVMSG_EVENTLOG_BUFFERCOMPLETE	= 0x80000040,
-
-	/* Platform-specific processor intercept messages. */
-	HVMSG_X64_IOPORT_INTERCEPT		= 0x80010000,
-	HVMSG_X64_MSR_INTERCEPT		= 0x80010001,
-	HVMSG_X64_CPUID_INTERCEPT		= 0x80010002,
-	HVMSG_X64_EXCEPTION_INTERCEPT	= 0x80010003,
-	HVMSG_X64_APIC_EOI			= 0x80010004,
-	HVMSG_X64_LEGACY_FP_ERROR		= 0x80010005
-};
-
-#define HV_SYNIC_STIMER_COUNT		(4)
-
 /* Define invalid partition identifier. */
 #define HV_PARTITION_ID_INVALID		((u64)0x0)
 
-/* Define port identifier type. */
-union hv_port_id {
-	u32 asu32;
-	struct {
-		u32 id:24;
-		u32 reserved:8;
-	} u ;
-};
-
 /* Define port type. */
 enum hv_port_type {
 	HVPORT_MSG	= 1,
@@ -173,27 +130,6 @@ struct hv_connection_info {
 	};
 };
 
-/* Define synthetic interrupt controller message flags. */
-union hv_message_flags {
-	u8 asu8;
-	struct {
-		u8 msg_pending:1;
-		u8 reserved:7;
-	};
-};
-
-/* Define synthetic interrupt controller message header. */
-struct hv_message_header {
-	u32 message_type;
-	u8 payload_size;
-	union hv_message_flags message_flags;
-	u8 reserved[2];
-	union {
-		u64 sender;
-		union hv_port_id port;
-	};
-};
-
 /*
  * Timer configuration register.
  */
@@ -210,31 +146,9 @@ union hv_timer_config {
 	};
 };
 
-
-/* Define timer message payload structure. */
-struct hv_timer_message_payload {
-	u32 timer_index;
-	u32 reserved;
-	u64 expiration_time;	/* When the timer expired */
-	u64 delivery_time;	/* When the message was delivered */
-};
-
-/* Define synthetic interrupt controller message format. */
-struct hv_message {
-	struct hv_message_header header;
-	union {
-		u64 payload[HV_MESSAGE_PAYLOAD_QWORD_COUNT];
-	} u ;
-};
-
 /* Define the number of message buffers associated with each port. */
 #define HV_PORT_MESSAGE_BUFFER_COUNT	(16)
 
-/* Define the synthetic interrupt message page layout. */
-struct hv_message_page {
-	struct hv_message sint_message[HV_SYNIC_SINT_COUNT];
-};
-
 /* Define the synthetic interrupt controller event flags format. */
 union hv_synic_event_flags {
 	u8 flags8[HV_EVENT_FLAGS_BYTE_COUNT];
@@ -347,12 +261,6 @@ struct hv_monitor_page {
 	u8 rsvdz4[1984];
 };
 
-/* Declare the various hypercall operations. */
-enum hv_call_code {
-	HVCALL_POST_MESSAGE	= 0x005c,
-	HVCALL_SIGNAL_EVENT	= 0x005d,
-};
-
 /* Definition of the hv_post_message hypercall input structure. */
 struct hv_input_post_message {
 	union hv_connection_id connectionid;
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -679,6 +679,20 @@ static struct cpuidle_state dnv_cstates[] = {
 		.enter = NULL }
 };
 
+static int force_auto_demotion = 0;
+
+static int __init parse_intel_auto_demotion(char *arg)
+{
+	if (!arg)
+		return -EINVAL;
+	if (strcmp(arg, "force") == 0)
+		force_auto_demotion = 1;
+	else
+		return -EINVAL;
+	return 0;
+}
+early_param("intel_auto_demotion", parse_intel_auto_demotion);
+
 /**
  * intel_idle
  * @dev: cpuidle_device
@@ -1131,7 +1145,7 @@ static int intel_idle_cpu_init(int cpu)
 		return -EIO;
 	}
 
-	if (icpu->auto_demotion_disable_flags)
+	if (icpu->auto_demotion_disable_flags && !force_auto_demotion)
 		smp_call_function_single(cpu, auto_demotion_disable, NULL, 1);
 
 	if (icpu->disable_promotion_to_c1e)
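For reference, the new parameter is consumed from the kernel command line: booting with intel_auto_demotion=force makes the driver skip the auto_demotion_disable() IPI above, so hardware C-state auto-demotion stays enabled even on CPUs whose table sets auto_demotion_disable_flags. Any other value is rejected by the parser with -EINVAL.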
--- a/drivers/input/input.c
+++ b/drivers/input/input.c
@@ -46,7 +46,7 @@ static LIST_HEAD(input_handler_list);
  * be mutually exclusive which simplifies locking in drivers implementing
  * input handlers.
  */
-static DEFINE_MUTEX(input_mutex);
+DEFINE_MUTEX(input_mutex);
 
 static const struct input_value input_value_sync = { EV_SYN, SYN_REPORT, 1 };
 
--- a/drivers/md/bcache/btree.c
+++ b/drivers/md/bcache/btree.c
@@ -613,24 +613,19 @@ static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order)
 	return 0;
 }
 
-static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long bch_mca_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
 {
 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
 	struct btree *b, *t;
 	unsigned long i, nr = sc->nr_to_scan;
+	unsigned long freed = 0;
 
 	if (c->shrinker_disabled)
-		return 0;
+		return SHRINK_STOP;
 
 	if (c->try_harder)
-		return 0;
-
-	/*
-	 * If nr == 0, we're supposed to return the number of items we have
-	 * cached. Not allowed to return -1.
-	 */
-	if (!nr)
-		return mca_can_free(c) * c->btree_pages;
+		return SHRINK_STOP;
 
 	/* Return -1 if we can't do anything right now */
 	if (sc->gfp_mask & __GFP_IO)
@@ -643,14 +638,14 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 	i = 0;
 	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
-		if (!nr)
+		if (freed >= nr)
 			break;
 
 		if (++i > 3 &&
 		    !mca_reap(b, NULL, 0)) {
 			mca_data_free(b);
 			rw_unlock(true, b);
-			--nr;
+			freed++;
 		}
 	}
 
@@ -661,7 +656,7 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 	if (list_empty(&c->btree_cache))
 		goto out;
 
-	for (i = 0; nr && i < c->bucket_cache_used; i++) {
+	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
 		b = list_first_entry(&c->btree_cache, struct btree, list);
 		list_rotate_left(&c->btree_cache);
 
@@ -670,14 +665,27 @@ static int bch_mca_shrink(struct shrinker *shrink, struct shrink_control *sc)
 			mca_bucket_free(b);
 			mca_data_free(b);
 			rw_unlock(true, b);
-			--nr;
+			freed++;
 		} else
 			b->accessed = 0;
 	}
 out:
-	nr = mca_can_free(c) * c->btree_pages;
 	mutex_unlock(&c->bucket_lock);
-	return nr;
+	return freed;
+}
+
+static unsigned long bch_mca_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
+
+	if (c->shrinker_disabled)
+		return 0;
+
+	if (c->try_harder)
+		return 0;
+
+	return mca_can_free(c) * c->btree_pages;
 }
 
 void bch_btree_cache_free(struct cache_set *c)
@@ -746,7 +754,8 @@ int bch_btree_cache_alloc(struct cache_set *c)
 		c->verify_data = NULL;
 #endif
 
-	c->shrink.shrink = bch_mca_shrink;
+	c->shrink.count_objects = bch_mca_count;
+	c->shrink.scan_objects = bch_mca_scan;
 	c->shrink.seeks = 4;
 	c->shrink.batch = c->btree_pages * 2;
 	register_shrinker(&c->shrink);
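The bcache conversion above is the template every shrinker in this series follows: count_objects() reports how many objects are freeable, while scan_objects() frees up to sc->nr_to_scan of them and returns the number actually freed, or SHRINK_STOP when it cannot make progress right now. A self-contained sketch of the pattern, with my_cache and its fields purely illustrative:

	#include <linux/kernel.h>
	#include <linux/shrinker.h>

	struct my_cache {
		struct shrinker shrink;
		unsigned long nr_cached;	/* freeable objects, tracked elsewhere */
	};

	static unsigned long my_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
	{
		struct my_cache *c = container_of(shrink, struct my_cache, shrink);

		return c->nr_cached;
	}

	static unsigned long my_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
	{
		struct my_cache *c = container_of(shrink, struct my_cache, shrink);
		unsigned long freed = 0;

		while (freed < sc->nr_to_scan && c->nr_cached) {
			c->nr_cached--;		/* stands in for a real eviction */
			freed++;
		}
		return freed ? freed : SHRINK_STOP;
	}

	static void my_cache_shrinker_init(struct my_cache *c)
	{
		c->shrink.count_objects = my_cache_count;
		c->shrink.scan_objects = my_cache_scan;
		c->shrink.seeks = DEFAULT_SEEKS;
		register_shrinker(&c->shrink);
	}

Unlike the old ->shrink() callback, scan_objects() is never invoked merely to count, which is why the nr == 0 special case could be deleted from bch_mca_scan() above.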
--- a/drivers/md/bcache/sysfs.c
+++ b/drivers/md/bcache/sysfs.c
@@ -541,7 +541,7 @@ STORE(__bch_cache_set)
 		struct shrink_control sc;
 		sc.gfp_mask = GFP_KERNEL;
 		sc.nr_to_scan = strtoul_or_return(buf);
-		c->shrink.shrink(&c->shrink, &sc);
+		c->shrink.scan_objects(&c->shrink, &sc);
 	}
 
 	sysfs_strtoul(congested_read_threshold_us,
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -1551,8 +1551,8 @@ static unsigned get_retain_buffers(struct dm_bufio_client *c)
         return retain_bytes / c->block_size;
 }
 
-static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
-		   struct shrink_control *sc)
+static unsigned long __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
+			    gfp_t gfp_mask)
 {
 	int l;
 	struct dm_buffer *b, *tmp;
@@ -1562,37 +1562,48 @@ static void __scan(struct dm_bufio_client *c, unsigned long nr_to_scan,
 
 	for (l = 0; l < LIST_SIZE; l++) {
 		list_for_each_entry_safe_reverse(b, tmp, &c->lru[l], lru_list) {
-			if (__try_evict_buffer(b, sc->gfp_mask))
+			if (__try_evict_buffer(b, gfp_mask))
 				freed++;
 			if (!--nr_to_scan || ((count - freed) <= retain_target))
-				return;
+				return freed;
 			dm_bufio_cond_resched();
 		}
 	}
+	return freed;
 }
 
-static int shrink(struct shrinker *shrinker, struct shrink_control *sc)
+static unsigned long
+dm_bufio_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	struct dm_bufio_client *c =
-	    container_of(shrinker, struct dm_bufio_client, shrinker);
-	unsigned long r;
-	unsigned long nr_to_scan = sc->nr_to_scan;
+	struct dm_bufio_client *c;
+	unsigned long freed;
 
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
 	if (sc->gfp_mask & __GFP_FS)
 		dm_bufio_lock(c);
 	else if (!dm_bufio_trylock(c))
-		return !nr_to_scan ? 0 : -1;
+		return SHRINK_STOP;
 
-	if (nr_to_scan)
-		__scan(c, nr_to_scan, sc);
+	freed  = __scan(c, sc->nr_to_scan, sc->gfp_mask);
+	dm_bufio_unlock(c);
+	return freed;
+}
 
-	r = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
-	if (r > INT_MAX)
-		r = INT_MAX;
+static unsigned long
+dm_bufio_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct dm_bufio_client *c;
+	unsigned long count;
 
-	dm_bufio_unlock(c);
+	c = container_of(shrink, struct dm_bufio_client, shrinker);
+	if (sc->gfp_mask & __GFP_FS)
+		dm_bufio_lock(c);
+	else if (!dm_bufio_trylock(c))
+		return 0;
 
-	return r;
+	count = c->n_buffers[LIST_CLEAN] + c->n_buffers[LIST_DIRTY];
+	dm_bufio_unlock(c);
+	return count;
 }
 
 /*
@@ -1689,7 +1700,8 @@ struct dm_bufio_client *dm_bufio_client_create(struct block_device *bdev, unsign
 	__cache_size_refresh();
 	mutex_unlock(&dm_bufio_clients_lock);
 
-	c->shrinker.shrink = shrink;
+	c->shrinker.count_objects = dm_bufio_shrink_count;
+	c->shrinker.scan_objects = dm_bufio_shrink_scan;
 	c->shrinker.seeks = 1;
 	c->shrinker.batch = 0;
 	register_shrinker(&c->shrinker);
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
+#include <linux/key.h>
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/mempool.h>
@@ -28,11 +29,14 @@
 #include <crypto/hash.h>
 #include <crypto/md5.h>
 #include <crypto/algapi.h>
+#include <keys/user-type.h>
 
 #include <linux/device-mapper.h>
 
 #define DM_MSG_PREFIX "crypt"
 
+#include <linux/ploop/ploop.h>
+#include "dm.h"
 /*
  * context holding the current state of a multi-part conversion
  */
@@ -1486,21 +1490,112 @@ static int crypt_setkey_allcpus(struct crypt_config *cc)
 	return err;
 }
 
+#ifdef CONFIG_KEYS
+static struct key *crypt_decode_get_keyring_key(char *key_desc)
+{
+	int ret;
+	struct key *key;
+	char *decoded_key_desc;
+	int key_desc_size = strlen(key_desc) >> 1;
+
+	decoded_key_desc = kmalloc(key_desc_size + 1, GFP_KERNEL);
+	if (!decoded_key_desc)
+		return ERR_PTR(-ENOMEM);
+
+	if (crypt_decode_key(decoded_key_desc, key_desc, key_desc_size) < 0) {
+		kfree(decoded_key_desc);
+		return ERR_PTR(-EINVAL);
+	}
+
+	decoded_key_desc[key_desc_size] = '\0';
+
+	key = request_key(&key_type_user, decoded_key_desc, NULL);
+	kfree(decoded_key_desc);
+	if (IS_ERR(key))
+		return key;
+
+	ret = key_validate(key);
+	if (ret < 0)
+		return ERR_PTR(ret);
+
+	return key;
+}
+
+static int crypt_set_keyring_key(struct crypt_config *cc, char *key_desc)
+{
+	int ret = 0;
+	struct key *key;
+	const struct user_key_payload *ukp;
+
+	key = crypt_decode_get_keyring_key(key_desc);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	rcu_read_lock();
+	ukp = user_key_payload(key);
+	if (cc->key_size != ukp->datalen) {
+		ret = -EINVAL;
+		goto out;
+	}
+	memcpy(cc->key, ukp->data, cc->key_size);
+out:
+	rcu_read_unlock();
+	key_put(key);
+	return ret;
+}
+
+static int get_key_size(char *key_desc)
+{
+	int ret;
+	struct key *key;
+
+	if (key_desc[0] != ':')
+		return strlen(key_desc) >> 1;
+
+	key = crypt_decode_get_keyring_key(key_desc + 1);
+	if (IS_ERR(key))
+		return PTR_ERR(key);
+
+	rcu_read_lock();
+	ret = user_key_payload(key)->datalen;
+	rcu_read_unlock();
+	key_put(key);
+	return ret;
+}
+#else
+static int crypt_set_keyring_key(struct crypt_config *cc, char *key_desc)
+{
+	return -EINVAL;
+}
+
+static int get_key_size(const char *key)
+{
+	return strlen(key) >> 1;
+}
+#endif
+
 static int crypt_set_key(struct crypt_config *cc, char *key)
 {
 	int r = -EINVAL;
 	int key_string_len = strlen(key);
 
-	/* The key size may not be changed. */
-	if (cc->key_size != (key_string_len >> 1))
-		goto out;
-
 	/* Hyphen (which gives a key_size of zero) means there is no key. */
 	if (!cc->key_size && strcmp(key, "-"))
 		goto out;
 
-	if (cc->key_size && crypt_decode_key(cc->key, key, cc->key_size) < 0)
-		goto out;
+	/* ':' means that the key is in kernel keyring */
+	if (key[0] == ':') {
+		if (crypt_set_keyring_key(cc, key + 1))
+			goto out;
+	} else {
+		/* The key size may not be changed. */
+		if (cc->key_size != (key_string_len >> 1))
+			goto out;
+
+		if (cc->key_size &&
+			crypt_decode_key(cc->key, key, cc->key_size) < 0)
+			goto out;
+	}
 
 	set_bit(DM_CRYPT_KEY_VALID, &cc->flags);
 
@@ -1727,12 +1822,13 @@ bad_mem:
 
 /*
  * Construct an encryption mapping:
- * <cipher> <key> <iv_offset> <dev_path> <start>
+ * <cipher> [<key>|:<key description>] <iv_offset> <dev_path> <start>
  */
 static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
 	struct crypt_config *cc;
-	unsigned int key_size, opt_params;
+	int key_size;
+	unsigned int opt_params;
 	unsigned long long tmpll;
 	int ret;
 	size_t iv_size_padding;
@@ -1749,7 +1845,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 		return -EINVAL;
 	}
 
-	key_size = strlen(argv[1]) >> 1;
+	key_size = get_key_size(argv[1]);
+	if (key_size < 0) {
+		ti->error = "Cannot get the key";
+		return -EINVAL;
+	}
 
 	cc = kzalloc(sizeof(*cc) + key_size * sizeof(u8), GFP_KERNEL);
 	if (!cc) {
@@ -2073,6 +2173,24 @@ static void crypt_io_hints(struct dm_target *ti, struct queue_limits *limits)
 	limits->max_segment_size = PAGE_SIZE;
 }
 
+static void crypt_ploop_modify(struct dm_target *ti, int action)
+{
+	struct crypt_config *cc = ti->private;
+
+	if (cc && cc->dev)
+		switch (action) {
+		case DM_PLOOP_ATTACH:
+			ploop_set_dm_crypt_bdev(cc->dev->bdev,
+				dm_md_get_bdev(dm_table_get_md(ti->table)));
+			break;
+		case DM_PLOOP_DETACH:
+			ploop_set_dm_crypt_bdev(cc->dev->bdev, NULL);
+			break;
+		default:
+			BUG();
+		}
+}
+
 static struct target_type crypt_target = {
 	.name   = "crypt",
 	.version = {1, 14, 1},
@@ -2088,6 +2206,7 @@ static struct target_type crypt_target = {
 	.merge  = crypt_merge,
 	.iterate_devices = crypt_iterate_devices,
 	.io_hints = crypt_io_hints,
+	.ploop_modify = crypt_ploop_modify,
 };
 
 static int __init dm_crypt_init(void)
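To illustrate the new ":<key description>" syntax: the text after the colon is the hex-encoded description of a "user" type key, which crypt_decode_get_keyring_key() decodes and hands to request_key(). A hypothetical userspace counterpart using keyutils (the key name and size are made up):

	#include <keyutils.h>
	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		unsigned char key[32] = { 0 };	/* 256-bit volume key */
		const char *desc = "dmcrypt-vol0";
		size_t i;

		/* Load the key into the session keyring as a "user" key. */
		if (add_key("user", desc, key, sizeof(key),
			    KEY_SPEC_SESSION_KEYRING) < 0) {
			perror("add_key");
			return 1;
		}

		/* The dm table then carries ":<hex(desc)>" in the <key>
		 * slot; for this description that is
		 * ":646d63727970742d766f6c30". */
		printf(":");
		for (i = 0; i < strlen(desc); i++)
			printf("%02x", desc[i]);
		printf("\n");
		return 0;
	}

Note that the key's payload length must match the cipher's key size, or crypt_set_keyring_key() above fails with -EINVAL.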
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -36,14 +36,6 @@ struct hash_cell {
 	struct dm_table *new_map;
 };
 
-/*
- * A dummy definition to make RCU happy.
- * struct dm_table should never be dereferenced in this file.
- */
-struct dm_table {
-	int undefined__;
-};
-
 struct vers_iter {
     size_t param_size;
     struct dm_target_versions *vers, *old_vers;
@@ -1037,6 +1029,9 @@ static int do_resume(struct dm_ioctl *param)
 			return PTR_ERR(old_map);
 		}
 
+		dm_table_ploop_modify(old_map, DM_PLOOP_DETACH);
+		dm_table_ploop_modify(new_map, DM_PLOOP_ATTACH);
+
 		if (dm_table_get_mode(new_map) & FMODE_WRITE)
 			set_disk_ro(dm_disk(md), 0);
 		else
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -1745,3 +1745,18 @@ void dm_table_run_md_queue_async(struct dm_table *t)
 }
 EXPORT_SYMBOL(dm_table_run_md_queue_async);
 
+void dm_table_ploop_modify(struct dm_table *t, int action)
+{
+	unsigned int i;
+
+	if (!t)
+		return;
+
+	/* attach or detach the targets */
+	for (i = 0; i < t->num_targets; i++) {
+		struct dm_target *tgt = t->targets + i;
+
+		if (tgt->type->ploop_modify)
+			tgt->type->ploop_modify(tgt, action);
+	}
+}
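Tying the ploop pieces together: dm_table_ploop_modify() walks every target of a table and calls its ->ploop_modify() method where one is set. do_resume() uses it to detach the old map and attach the new one on a table swap, __dm_destroy() detaches before the final table is torn down, and dm-crypt's crypt_ploop_modify() translates the action into ploop_set_dm_crypt_bdev() calls on the underlying and mapped block devices.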
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -165,6 +165,12 @@ static unsigned dm_get_numa_node(void)
 					 DM_NUMA_NODE, num_online_nodes() - 1);
 }
 
+struct block_device *dm_md_get_bdev(struct mapped_device *md)
+{
+	return md->bdev;
+}
+EXPORT_SYMBOL_GPL(dm_md_get_bdev);
+
 static int __init local_init(void)
 {
 	int r = -ENOMEM;
@@ -1776,7 +1782,7 @@ static void __set_size(struct mapped_device *md, sector_t size)
 {
 	set_capacity(md->disk, size);
 
-	i_size_write(md->bdev->bd_inode, (loff_t)size << SECTOR_SHIFT);
+	bd_write_size(md->bdev, (loff_t)size << SECTOR_SHIFT);
 }
 
 /*
@@ -2114,7 +2120,9 @@ static void __dm_destroy(struct mapped_device *md, bool wait)
 		       dm_device_name(md), atomic_read(&md->holders));
 
 	dm_sysfs_exit(md);
-	dm_table_destroy(__unbind(md));
+	map = __unbind(md);
+	dm_table_ploop_modify(map, DM_PLOOP_DETACH);
+	dm_table_destroy(map);
 	free_dev(md);
 }
 
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -79,9 +79,12 @@ bool dm_table_request_based(struct dm_table *t);
 bool dm_table_mq_request_based(struct dm_table *t);
 void dm_table_free_md_mempools(struct dm_table *t);
 struct dm_md_mempools *dm_table_get_md_mempools(struct dm_table *t);
+void dm_table_ploop_modify(struct dm_table *t, int action);
 
 int dm_queue_merge_is_compulsory(struct request_queue *q);
 
+struct block_device *dm_md_get_bdev(struct mapped_device *md);
+
 void dm_lock_md_type(struct mapped_device *md);
 void dm_unlock_md_type(struct mapped_device *md);
 void dm_set_md_type(struct mapped_device *md, unsigned type);
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -128,6 +128,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	struct md_rdev *rdev;
 	int i, cnt;
 	bool discard_supported = false;
+	bool sg_gaps_disabled = false;
 
 	conf = kzalloc (sizeof (*conf) + raid_disks*sizeof(struct dev_info),
 			GFP_KERNEL);
@@ -163,6 +164,9 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 
 		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 			discard_supported = true;
+
+		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+			sg_gaps_disabled = true;
 	}
 	if (cnt != raid_disks) {
 		printk(KERN_ERR "md/linear:%s: not enough drives present. Aborting!\n",
@@ -175,6 +179,11 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
 	else
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 
+	if (!sg_gaps_disabled)
+		queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
+	else
+		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
+
 	/*
 	 * Here we calculate the device offsets.
 	 */
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -436,6 +436,7 @@ static int raid0_run(struct mddev *mddev)
 	if (mddev->queue) {
 		struct md_rdev *rdev;
 		bool discard_supported = false;
+		bool sg_gaps_disabled = false;
 
 		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
 		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
@@ -450,6 +451,8 @@ static int raid0_run(struct mddev *mddev)
 					  rdev->data_offset << 9);
 			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 				discard_supported = true;
+			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+				sg_gaps_disabled = true;
 		}
 
 		/* Unfortunately, some devices have awful discard performance,
@@ -471,6 +474,11 @@ static int raid0_run(struct mddev *mddev)
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
 		else
 			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+
+		if (!sg_gaps_disabled)
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
+		else
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
 	}
 
 	/* calculate array device size */
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1665,6 +1665,8 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	}
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
 	print_conf(conf);
 	return err;
 }
@@ -2916,6 +2918,7 @@ static int raid1_run(struct mddev *mddev)
 	struct md_rdev *rdev;
 	int ret;
 	bool discard_supported = false;
+	bool sg_gaps_disabled = false;
 
 	if (mddev->level != 1) {
 		printk(KERN_ERR "md/raid1:%s: raid level not set to mirroring (%d)\n",
@@ -2950,6 +2953,8 @@ static int raid1_run(struct mddev *mddev)
 				  rdev->data_offset << 9);
 		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 			discard_supported = true;
+		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+			sg_gaps_disabled = true;
 	}
 
 	mddev->degraded = 0;
@@ -2987,6 +2992,12 @@ static int raid1_run(struct mddev *mddev)
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
 						  mddev->queue);
+		if (sg_gaps_disabled)
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
+						  mddev->queue);
 	}
 
 	ret =  md_integrity_register(mddev);
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1870,6 +1870,8 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
 	}
 	if (mddev->queue && blk_queue_discard(bdev_get_queue(rdev->bdev)))
 		queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	if (mddev->queue && blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+		queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
 
 	print_conf(conf);
 	return err;
@@ -3682,6 +3684,7 @@ static int raid10_run(struct mddev *mddev)
 	sector_t min_offset_diff = 0;
 	int first = 1;
 	bool discard_supported = false;
+	bool sg_gaps_disabled = false;
 
 	if (mddev->private == NULL) {
 		conf = setup_conf(mddev);
@@ -3749,6 +3752,9 @@ static int raid10_run(struct mddev *mddev)
 
 		if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
 			discard_supported = true;
+
+		if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+			sg_gaps_disabled = true;
 	}
 
 	if (mddev->queue) {
@@ -3758,6 +3764,12 @@ static int raid10_run(struct mddev *mddev)
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
 						  mddev->queue);
+		if (sg_gaps_disabled)
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
+						  mddev->queue);
 	}
 	/* need to check that every block has at least one working mirror */
 	if (!enough(conf, -1)) {
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6451,14 +6451,14 @@ static unsigned long raid5_cache_scan(struct shrinker *shrink,
 				      struct shrink_control *sc)
 {
 	struct r5conf *conf = container_of(shrink, struct r5conf, shrinker);
-	unsigned long ret = ~0UL; /* SHRINK_STOP */
+	unsigned long ret = SHRINK_STOP;
 
 	if (mutex_trylock(&conf->cache_size_mutex)) {
 		ret= 0;
 		while (ret < sc->nr_to_scan &&
 		       conf->max_nr_stripes > conf->min_nr_stripes) {
 			if (drop_one_stripe(conf) == 0) {
-				ret = ~0UL; /* SHRINK_STOP */
+				ret = SHRINK_STOP;
 				break;
 			}
 			ret++;
@@ -6479,14 +6479,6 @@ static unsigned long raid5_cache_count(struct shrinker *shrink,
 	return conf->max_nr_stripes - conf->min_nr_stripes;
 }
 
-static int raid5_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
-{
-	if (sc->nr_to_scan)
-		(void) raid5_cache_scan(shrink, sc);
-
-	return raid5_cache_count(shrink, sc);
-}
-
 static struct r5conf *setup_conf(struct mddev *mddev)
 {
 	struct r5conf *conf;
@@ -6661,8 +6653,10 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	 * So set it rather large, scaled by number of devices.
 	 */
 	conf->shrinker.seeks = DEFAULT_SEEKS * conf->raid_disks * 4;
-	conf->shrinker.shrink = raid5_cache_shrink;
+	conf->shrinker.scan_objects = raid5_cache_scan;
+	conf->shrinker.count_objects = raid5_cache_count;
 	conf->shrinker.batch = 128;
+	conf->shrinker.flags = 0;
 	register_shrinker(&conf->shrinker);
 
 	sprintf(pers_name, "raid%d", mddev->new_level);
@@ -6981,6 +6975,7 @@ static int raid5_run(struct mddev *mddev)
 	if (mddev->queue) {
 		int chunk_size;
 		bool discard_supported = true;
+		bool sg_gaps_disabled = false;
 		/* read-ahead size must cover two whole stripes, which
 		 * is 2 * (datadisks) * chunksize where 'n' is the
 		 * number of raid devices
@@ -7045,6 +7040,8 @@ static int raid5_run(struct mddev *mddev)
 				}
 				discard_supported = false;
 			}
+			if (blk_queue_sg_gaps(bdev_get_queue(rdev->bdev)))
+				sg_gaps_disabled = true;
 		}
 
 		if (discard_supported &&
@@ -7055,6 +7052,13 @@ static int raid5_run(struct mddev *mddev)
 		else
 			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD,
 						mddev->queue);
+
+		if (sg_gaps_disabled)
+			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
+		else
+			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS,
+						mddev->queue);
 	}
 
 	if (journal_dev) {
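The same propagate-the-worst-case logic now appears in linear, raid0, raid1, raid10 and raid5: if any member device sets QUEUE_FLAG_SG_GAPS (it cannot merge requests across scatter-gather gaps), the array queue must advertise the flag as well. A hypothetical helper showing the shared shape, which the series itself open-codes per personality:

	static void md_propagate_sg_gaps(struct mddev *mddev, bool sg_gaps_disabled)
	{
		/* One member that cannot handle SG gaps makes the whole
		 * array unable to handle them. */
		if (sg_gaps_disabled)
			queue_flag_set_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
		else
			queue_flag_clear_unlocked(QUEUE_FLAG_SG_GAPS, mddev->queue);
	}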
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -26,6 +26,9 @@ obj-$(CONFIG_VXLAN) += vxlan.o
 obj-$(CONFIG_GENEVE) += geneve.o
 obj-$(CONFIG_NLMON) += nlmon.o
 
+obj-$(CONFIG_VE_NETDEV) += vznetdev.o
+vznetdev-objs := venetdev.o veip_mgmt.o
+
 #
 # Networking Drivers
 #
--- a/drivers/net/bonding/bond_procfs.c
+++ b/drivers/net/bonding/bond_procfs.c
@@ -284,7 +284,7 @@ void bond_create_proc_entry(struct bonding *bond)
 	struct bond_net *bn = net_generic(dev_net(bond_dev), bond_net_id);
 
 	if (bn->proc_dir) {
-		bond->proc_entry = proc_create_data(bond_dev->name,
+		bond->proc_entry = proc_net_create_data(bond_dev->name,
 						    S_IRUGO, bn->proc_dir,
 						    &bond_info_fops, bond);
 		if (bond->proc_entry == NULL)
@@ -313,7 +313,8 @@ void bond_remove_proc_entry(struct bonding *bond)
 void __net_init bond_create_proc_dir(struct bond_net *bn)
 {
 	if (!bn->proc_dir) {
-		bn->proc_dir = proc_mkdir(DRV_NAME, bn->net->proc_net);
+		bn->proc_dir = proc_net_mkdir(bn->net, DRV_NAME,
+					      bn->net->proc_net);
 		if (!bn->proc_dir)
 			pr_warn("Warning: Cannot create /proc/net/%s\n",
 				DRV_NAME);
--- a/drivers/net/dummy.c
+++ b/drivers/net/dummy.c
@@ -134,6 +134,7 @@ static void dummy_setup(struct net_device *dev)
 	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_NO_QUEUE;
 	dev->features	|= NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_TSO;
 	dev->features	|= NETIF_F_HW_CSUM | NETIF_F_HIGHDMA | NETIF_F_LLTX;
+	dev->features	|= NETIF_F_VIRTUAL;
 	eth_hw_addr_random(dev);
 }
 
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -26,7 +26,6 @@
 #include <net/vxlan.h>
 
 MODULE_VERSION(DRV_VER);
-MODULE_DEVICE_TABLE(pci, be_dev_ids);
 MODULE_DESCRIPTION(DRV_DESC " " DRV_VER);
 MODULE_AUTHOR("Emulex Corporation");
 MODULE_LICENSE("GPL");
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -399,119 +399,87 @@ struct ixgbe_q_vector {
 	char name[IFNAMSIZ + 9];
 
 #ifdef CONFIG_NET_RX_BUSY_POLL
-	unsigned int state;
-#define IXGBE_QV_STATE_IDLE        0
-#define IXGBE_QV_STATE_NAPI	   1     /* NAPI owns this QV */
-#define IXGBE_QV_STATE_POLL	   2     /* poll owns this QV */
-#define IXGBE_QV_STATE_DISABLED	   4     /* QV is disabled */
-#define IXGBE_QV_OWNED (IXGBE_QV_STATE_NAPI | IXGBE_QV_STATE_POLL)
-#define IXGBE_QV_LOCKED (IXGBE_QV_OWNED | IXGBE_QV_STATE_DISABLED)
-#define IXGBE_QV_STATE_NAPI_YIELD  8     /* NAPI yielded this QV */
-#define IXGBE_QV_STATE_POLL_YIELD  16    /* poll yielded this QV */
-#define IXGBE_QV_YIELD (IXGBE_QV_STATE_NAPI_YIELD | IXGBE_QV_STATE_POLL_YIELD)
-#define IXGBE_QV_USER_PEND (IXGBE_QV_STATE_POLL | IXGBE_QV_STATE_POLL_YIELD)
-	spinlock_t lock;
 #endif  /* CONFIG_NET_RX_BUSY_POLL */
+	atomic_t state;
 
 	/* for dynamic allocation of rings associated with this q_vector */
 	struct ixgbe_ring ring[0] ____cacheline_internodealigned_in_smp;
 };
+
 #ifdef CONFIG_NET_RX_BUSY_POLL
+enum ixgbe_qv_state_t {
+	IXGBE_QV_STATE_IDLE = 0,
+	IXGBE_QV_STATE_NAPI,
+	IXGBE_QV_STATE_POLL,
+	IXGBE_QV_STATE_DISABLE
+};
+
 static inline void ixgbe_qv_init_lock(struct ixgbe_q_vector *q_vector)
 {
-
-	spin_lock_init(&q_vector->lock);
-	q_vector->state = IXGBE_QV_STATE_IDLE;
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* called from the device poll routine to get ownership of a q_vector */
 static inline bool ixgbe_qv_lock_napi(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if (q_vector->state & IXGBE_QV_LOCKED) {
-		WARN_ON(q_vector->state & IXGBE_QV_STATE_NAPI);
-		q_vector->state |= IXGBE_QV_STATE_NAPI_YIELD;
-		rc = false;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_NAPI);
 #ifdef BP_EXTENDED_STATS
+	if (rc != IXGBE_QV_STATE_IDLE)
 		q_vector->tx.ring->stats.yields++;
 #endif
-	} else {
-		/* we don't care if someone yielded */
-		q_vector->state = IXGBE_QV_STATE_NAPI;
-	}
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
-/* returns true is someone tried to get the qv while napi had it */
+/* called from the device poll routine to release ownership of a q_vector */
-static inline bool ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
+static inline void ixgbe_qv_unlock_napi(struct ixgbe_q_vector *q_vector)
 {
-	int rc = false;
-	spin_lock_bh(&q_vector->lock);
-	WARN_ON(q_vector->state & (IXGBE_QV_STATE_POLL |
-			       IXGBE_QV_STATE_NAPI_YIELD));
-
-	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
-		rc = true;
-	/* will reset state to idle, unless QV is disabled */
-	q_vector->state &= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	WARN_ON(atomic_read(&q_vector->state) != IXGBE_QV_STATE_NAPI);
+
+	/* flush any outstanding Rx frames */
+	if (q_vector->napi.gro_list)
+		napi_gro_flush(&q_vector->napi, false);
+
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* called from ixgbe_low_latency_poll() */
 static inline bool ixgbe_qv_lock_poll(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if ((q_vector->state & IXGBE_QV_LOCKED)) {
-		q_vector->state |= IXGBE_QV_STATE_POLL_YIELD;
-		rc = false;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_POLL);
 #ifdef BP_EXTENDED_STATS
+	if (rc != IXGBE_QV_STATE_IDLE)
 		q_vector->rx.ring->stats.yields++;
 #endif
-	} else {
-		/* preserve yield marks */
-		q_vector->state |= IXGBE_QV_STATE_POLL;
-	}
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
-/* returns true if someone tried to get the qv while it was locked */
+/* releases busy-poll ownership of a q_vector */
-static inline bool ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
+static inline void ixgbe_qv_unlock_poll(struct ixgbe_q_vector *q_vector)
 {
-	int rc = false;
-	spin_lock_bh(&q_vector->lock);
-	WARN_ON(q_vector->state & (IXGBE_QV_STATE_NAPI));
-
-	if (q_vector->state & IXGBE_QV_STATE_POLL_YIELD)
-		rc = true;
-	/* will reset state to idle, unless QV is disabled */
-	q_vector->state &= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-	return rc;
+	WARN_ON(atomic_read(&q_vector->state) != IXGBE_QV_STATE_POLL);
+
+	/* reset state to idle */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_IDLE);
 }
 
 /* true if a socket is polling, even if it did not get the lock */
 static inline bool ixgbe_qv_busy_polling(struct ixgbe_q_vector *q_vector)
 {
-	WARN_ON(!(q_vector->state & IXGBE_QV_OWNED));
-	return q_vector->state & IXGBE_QV_USER_PEND;
+	return atomic_read(&q_vector->state) == IXGBE_QV_STATE_POLL;
 }
 
 /* false if QV is currently owned */
 static inline bool ixgbe_qv_disable(struct ixgbe_q_vector *q_vector)
 {
-	int rc = true;
-	spin_lock_bh(&q_vector->lock);
-	if (q_vector->state & IXGBE_QV_OWNED)
-		rc = false;
-	q_vector->state |= IXGBE_QV_STATE_DISABLED;
-	spin_unlock_bh(&q_vector->lock);
-
-	return rc;
+	int rc = atomic_cmpxchg(&q_vector->state, IXGBE_QV_STATE_IDLE,
+				IXGBE_QV_STATE_DISABLE);
+
+	return rc == IXGBE_QV_STATE_IDLE;
 }
 
 #else /* CONFIG_NET_RX_BUSY_POLL */
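The rewrite above collapses the spinlock-plus-flag-bits state machine into a single atomic word: each owner (NAPI, busy poll, disable) claims the vector with one atomic_cmpxchg() from IDLE and releases it with atomic_set() back to IDLE, so the yield bookkeeping disappears. The bare pattern, extracted for clarity (qv_try_own() and qv_release() are illustrative names, not driver functions):

	static inline bool qv_try_own(atomic_t *state, int owner)
	{
		/* Succeeds only when nobody currently owns the vector. */
		return atomic_cmpxchg(state, IXGBE_QV_STATE_IDLE, owner) ==
		       IXGBE_QV_STATE_IDLE;
	}

	static inline void qv_release(atomic_t *state, int owner)
	{
		WARN_ON(atomic_read(state) != owner);
		atomic_set(state, IXGBE_QV_STATE_IDLE);
	}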
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -846,6 +846,11 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
 	netif_napi_add(adapter->netdev, &q_vector->napi,
 		       ixgbe_poll, 64);
 
+#ifdef CONFIG_NET_RX_BUSY_POLL
+	/* initialize busy poll */
+	atomic_set(&q_vector->state, IXGBE_QV_STATE_DISABLE);
+
+#endif
 	/* tie q_vector and adapter together */
 	adapter->q_vector[v_idx] = q_vector;
 	q_vector->adapter = adapter;
--- a/drivers/net/loopback.c
+++ b/drivers/net/loopback.c
@@ -58,6 +58,7 @@
 #include <linux/percpu.h>
 #include <net/net_namespace.h>
 #include <linux/u64_stats_sync.h>
+#include <linux/ve.h>
 
 struct pcpu_lstats {
 	u64			packets;
@@ -75,6 +76,12 @@ static netdev_tx_t loopback_xmit(struct sk_buff *skb,
 	struct pcpu_lstats *lb_stats;
 	int len;
 
+#ifdef CONFIG_VE
+	if (unlikely(dev_net(dev)->owner_ve->disable_net)) {
+		kfree_skb(skb);
+		return NETDEV_TX_OK;
+	}
+#endif
 	skb_orphan(skb);
 
 	/* Before queueing this packet to netif_rx(),
@@ -179,6 +186,7 @@ static void loopback_setup(struct net_device *dev)
 		| NETIF_F_HIGHDMA
 		| NETIF_F_LLTX
 		| NETIF_F_NETNS_LOCAL
+		| NETIF_F_VIRTUAL
 		| NETIF_F_VLAN_CHALLENGED
 		| NETIF_F_LOOPBACK;
 	dev->ethtool_ops	= &loopback_ethtool_ops;
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -2676,7 +2676,7 @@ static netdev_tx_t macsec_start_xmit(struct sk_buff *skb,
 }
 
 #define MACSEC_FEATURES \
-	(NETIF_F_SG | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST)
+	(NETIF_F_SG | NETIF_F_HIGHDMA)
 static int macsec_dev_init(struct net_device *dev)
 {
 	struct macsec_dev *macsec = macsec_priv(dev);
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -966,8 +966,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 
 	err = netdev_upper_dev_link(lowerdev, dev);
 	if (err)
-		goto destroy_port;
-
+		goto unregister_netdev;
 
 	dev->priv_flags |= IFF_MACVLAN;
 	list_add_tail_rcu(&vlan->list, &port->vlans);
@@ -975,6 +974,8 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
 
 	return 0;
 
+unregister_netdev:
+	unregister_netdevice(dev);
 destroy_port:
 	port->count -= 1;
 	if (!port->count)
--- a/drivers/net/ppp/ppp_generic.c
+++ b/drivers/net/ppp/ppp_generic.c
@@ -54,6 +54,9 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 
+#include <linux/ve.h>
+#include <uapi/linux/vzcalluser.h>
+
 #define PPP_VERSION	"2.4.2"
 
 /*
@@ -379,8 +382,10 @@ static int ppp_open(struct inode *inode, struct file *file)
 	/*
 	 * This could (should?) be enforced by the permissions on /dev/ppp.
 	 */
-	if (!capable(CAP_NET_ADMIN))
+	if (!ns_capable(current_user_ns(), CAP_NET_ADMIN))
 		return -EPERM;
+	if (!net_generic(current->nsproxy->net_ns, ppp_net_id)) /* no VE_FEATURE_PPP */
+		return -EACCES;
 	return 0;
 }
 
@@ -879,6 +884,9 @@ static __net_init int ppp_init_net(struct net *net)
 {
 	struct ppp_net *pn = net_generic(net, ppp_net_id);
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return net_assign_generic(net, ppp_net_id, NULL);
+
 	idr_init(&pn->units_idr);
 	mutex_init(&pn->all_ppp_mutex);
 
@@ -894,6 +902,9 @@ static __net_exit void ppp_exit_net(struct net *net)
 {
 	struct ppp_net *pn = net_generic(net, ppp_net_id);
 
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
+
 	idr_destroy(&pn->units_idr);
 }
 
@@ -1081,7 +1092,7 @@ static void ppp_setup(struct net_device *dev)
 	dev->tx_queue_len = 3;
 	dev->type = ARPHRD_PPP;
 	dev->flags = IFF_POINTOPOINT | IFF_NOARP | IFF_MULTICAST;
-	dev->features |= NETIF_F_NETNS_LOCAL;
+	dev->features |= NETIF_F_NETNS_LOCAL | NETIF_F_VIRTUAL;
 	netif_keep_dst(dev);
 }
 
@@ -2214,12 +2225,14 @@ int ppp_register_net_channel(struct net *net, struct ppp_channel *chan)
 	struct channel *pch;
 	struct ppp_net *pn;
 
+	pn = ppp_pernet(net);
+	if (!pn)
+		return -EACCES;
+
 	pch = kzalloc(sizeof(struct channel), GFP_KERNEL);
 	if (!pch)
 		return -ENOMEM;
 
-	pn = ppp_pernet(net);
-
 	pch->ppp = NULL;
 	pch->chan = chan;
 	pch->chan_net = net;
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -77,6 +77,8 @@
 #include <linux/file.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
 
 #include <linux/nsproxy.h>
 #include <net/net_namespace.h>
@@ -283,6 +285,8 @@ static void pppoe_flush_dev(struct net_device *dev)
 	int i;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
 	write_lock_bh(&pn->hash_lock);
 	for (i = 0; i < PPPOE_HASH_SIZE; i++) {
 		struct pppox_sock *po = pn->hash_table[i];
@@ -439,6 +443,8 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto drop;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		goto drop;
 
 	/* Note that get_item does a sock_hold(), so sk_pppox(po)
 	 * is known to be safe.
@@ -497,6 +503,9 @@ static int pppoe_disc_rcv(struct sk_buff *skb, struct net_device *dev,
 		goto abort;
 
 	pn = pppoe_pernet(dev_net(dev));
+	if (!pn) /* no VE_FEATURE_PPP */
+		goto abort;
+
 	po = get_item(pn, ph->sid, eth_hdr(skb)->h_source, dev->ifindex);
 	if (po) {
 		struct sock *sk = sk_pppox(po);
@@ -551,6 +560,9 @@ static int pppoe_create(struct net *net, struct socket *sock)
 {
 	struct sock *sk;
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return -EACCES;
+
 	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppoe_sk_proto);
 	if (!sk)
 		return -ENOMEM;
@@ -1149,6 +1161,9 @@ static __net_init int pppoe_init_net(struct net *net)
 	struct pppoe_net *pn = pppoe_pernet(net);
 	struct proc_dir_entry *pde;
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return net_assign_generic(net, pppoe_net_id, NULL);
+
 	rwlock_init(&pn->hash_lock);
 
 	pde = proc_create("pppoe", S_IRUGO, net->proc_net, &pppoe_seq_fops);
@@ -1162,6 +1177,12 @@ static __net_init int pppoe_init_net(struct net *net)
 
 static __net_exit void pppoe_exit_net(struct net *net)
 {
+	struct pppoe_net *pn;
+
+	pn = net_generic(net, pppoe_net_id);
+	if (!pn) /* no VE_FEATURE_PPP */
+		return;
+
 	remove_proc_entry("pppoe", net->proc_net);
 }
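The convention across the PPP changes is uniform: when a container lacks VE_FEATURE_PPP, the per-net generic pointer is set to NULL at init time, and every other entry point (packet receive, device flush, socket create, channel registration, net exit) treats a NULL per-net structure as "feature disabled" and backs out with -EACCES or a silent drop.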
 
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -72,6 +72,10 @@
 
 #include <asm/uaccess.h>
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+#include <linux/vznetstat.h>
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 /* Uncomment to enable debugging */
 /* #define TUN_DEBUG 1 */
 
@@ -215,6 +219,9 @@ struct tun_struct {
 	void *security;
 	u32 flow_count;
 	struct tun_pcpu_stats __percpu *pcpu_stats;
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	struct venet_stat *vestat;
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
 };
 
 #ifdef CONFIG_TUN_VNET_CROSS_LE
@@ -1383,6 +1390,12 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 	skb_reset_network_header(skb);
 	skb_probe_transport_header(skb, 0);
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (tun->vestat)
+		venet_acct_classify_add_outgoing(tun->vestat, skb);
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	rxhash = skb_get_hash(skb);
 	netif_rx_ni(skb);
 
@@ -1540,6 +1553,12 @@ done:
 	u64_stats_update_end(&stats->syncp);
 	put_cpu_ptr(tun->pcpu_stats);
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (tun->vestat)
+		venet_acct_classify_add_incoming(tun->vestat, skb);
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	return total;
 }
 
@@ -1625,6 +1644,14 @@ static void tun_free_netdev(struct net_device *dev)
 	free_percpu(tun->pcpu_stats);
 	tun_flow_uninit(tun);
 	security_tun_dev_free_security(tun->security);
+
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (tun->vestat) {
+		venet_acct_put_stat(tun->vestat);
+		tun->vestat = NULL;
+	}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	free_netdev(dev);
 }
 
@@ -1885,7 +1912,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
 		dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
 				   TUN_USER_FEATURES | NETIF_F_HW_VLAN_CTAG_TX |
 				   NETIF_F_HW_VLAN_STAG_TX;
-		dev->features = dev->hw_features | NETIF_F_LLTX;
+		dev->features = dev->hw_features | NETIF_F_LLTX |
+				   NETIF_F_VIRTUAL;
 		dev->vlan_features = dev->features &
 				     ~(NETIF_F_HW_VLAN_CTAG_TX |
 				       NETIF_F_HW_VLAN_STAG_TX);
@@ -2057,11 +2085,38 @@ unlock:
 	return ret;
 }
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+/* tun_set_acctid() must be called under rtnl_lock */
+static int tun_set_acctid(struct net *net, struct ifreq *ifr)
+{
+	struct net_device *dev;
+	struct tun_struct *tun;
+
+	dev = __dev_get_by_name(net, ifr->ifr_name);
+	if (dev == NULL)
+		return -ENOENT;
+
+	/* This check may be dropped to allow tun devices */
+	if (dev->netdev_ops != &tap_netdev_ops)
+		return -EINVAL;
+
+	tun = netdev_priv(dev);
+	if (tun->vestat)
+		venet_acct_put_stat(tun->vestat);
+	tun->vestat = venet_acct_find_create_stat(ifr->ifr_acctid);
+	if (tun->vestat == NULL)
+		return -ENOMEM;
+
+	return 0;
+}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 			    unsigned long arg, int ifreq_len)
 {
 	struct tun_file *tfile = file->private_data;
-	struct tun_struct *tun;
+	struct tun_struct *tun = NULL;
 	void __user* argp = (void __user*)arg;
 	struct ifreq ifr;
 	kuid_t owner;
@@ -2072,7 +2127,8 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	int le;
 	int ret;
 
-	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == 0x89) {
+	if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || cmd == TUNSETACCTID ||
+			_IOC_TYPE(cmd) == 0x89) {
 		if (copy_from_user(&ifr, argp, ifreq_len))
 			return -EFAULT;
 	} else {
@@ -2091,6 +2147,13 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
 	ret = 0;
 	rtnl_lock();
 
+#ifdef CONFIG_VE_TUNTAP_ACCOUNTING
+	if (cmd == TUNSETACCTID) {
+		ret = tun_set_acctid(tfile->net, &ifr);
+		goto unlock;
+	}
+#endif /* CONFIG_VE_TUNTAP_ACCOUNTING */
+
 	tun = __tun_get(tfile);
 	if (cmd == TUNSETIFF && !tun) {
 		ifr.ifr_name[IFNAMSIZ-1] = '\0';
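A hypothetical userspace counterpart of the new ioctl; TUNSETACCTID and the ifr_acctid member are VZ-specific UAPI additions assumed to be exported by the headers this series relies on:

	#include <fcntl.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/if.h>
	#include <linux/if_tun.h>

	/* Bind an existing tap device to a VE traffic accounting id. */
	static int tap_set_acctid(const char *name, unsigned int acctid)
	{
		struct ifreq ifr;
		int fd, ret;

		fd = open("/dev/net/tun", O_RDWR);
		if (fd < 0)
			return -1;
		memset(&ifr, 0, sizeof(ifr));
		strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
		ifr.ifr_acctid = acctid;	/* VZ-specific ifreq field */
		ret = ioctl(fd, TUNSETACCTID, &ifr);
		close(fd);
		return ret;
	}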
--- /dev/null
+++ b/drivers/net/veip_mgmt.c
@@ -0,0 +1,174 @@
+/*
+ *  drivers/net/veip_mgmt.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Virtual networking device used to change VE ownership of packets
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+
+#include <linux/inet.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <linux/venet.h>
+#include <linux/ve.h>
+
+static void veip_free(struct veip_struct *veip)
+{
+	kfree(veip);
+}
+
+static void veip_release(struct ve_struct *ve)
+{
+	struct veip_struct *veip;
+
+	veip = ve->veip;
+	ve->veip = NULL;
+	barrier();
+	veip_put(veip);
+}
+
+static int veip_create(struct ve_struct *ve)
+{
+	struct veip_struct *veip;
+
+	veip = veip_findcreate(ve->veid);
+	if (veip == NULL)
+		return -ENOMEM;
+	if (IS_ERR(veip))
+		return PTR_ERR(veip);
+
+	ve->veip = veip;
+	return 0;
+}
+
+static int skb_extract_addr(struct sk_buff *skb,
+		struct ve_addr_struct *addr, int dir)
+{
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		addr->family = AF_INET;
+		addr->key[0] = 0;
+		addr->key[1] = 0;
+		addr->key[2] = 0;
+		addr->key[3] = (dir ? ip_hdr(skb)->daddr : ip_hdr(skb)->saddr);
+		return 0;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	case __constant_htons(ETH_P_IPV6):
+		addr->family = AF_INET6;
+		memcpy(&addr->key, dir ?
+				ipv6_hdr(skb)->daddr.s6_addr32 :
+				ipv6_hdr(skb)->saddr.s6_addr32,
+				sizeof(addr->key));
+		return 0;
+#endif
+	}
+
+	return -EAFNOSUPPORT;
+}
+
+static struct ve_struct *venet_find_ve(struct ve_addr_struct *addr, int dir)
+{
+	struct ip_entry_struct *entry;
+	struct ve_struct *ve = NULL;
+
+	entry = venet_entry_lookup(addr);
+	if (entry != NULL)
+		ve = ACCESS_ONCE(entry->active_env);
+
+	return ve;
+}
+
+static struct ve_struct *
+veip_lookup(struct ve_struct *ve_old, struct sk_buff *skb)
+{
+	struct ve_struct *ve;
+	int dir;
+	struct ve_addr_struct addr;
+
+	dir = ve_is_super(ve_old);
+	if (skb_extract_addr(skb, &addr, dir) < 0)
+		goto out_drop_nolock;
+
+	rcu_read_lock();
+	if (!dir) {
+		/* from VE to host */
+		ve = venet_find_ve(&addr, 0);
+		if (ve == NULL) {
+			if (!venet_ext_lookup(ve_old, &addr))
+				goto out_drop;
+		} else {
+			if (ve != ve_old)
+				goto out_source;
+		}
+
+		ve = get_ve0();
+	} else {
+		/* from host to VE */
+		ve = venet_find_ve(&addr, 1);
+		if (ve == NULL)
+			goto out_drop;
+	}
+	rcu_read_unlock();
+
+	return ve;
+
+out_drop:
+	rcu_read_unlock();
+out_drop_nolock:
+	return ERR_PTR(-ESRCH);
+
+out_source:
+	rcu_read_unlock();
+	if (net_ratelimit() && skb->protocol == __constant_htons(ETH_P_IP)) {
+		printk(KERN_WARNING "Dropped packet, source wrong "
+		       "veid=%s src-IP=%u.%u.%u.%u "
+		       "dst-IP=%u.%u.%u.%u\n",
+		       ve_name(ve_old),
+		       NIPQUAD(ip_hdr(skb)->saddr),
+		       NIPQUAD(ip_hdr(skb)->daddr));
+	}
+	return ERR_PTR(-EACCES);
+}
+
+void veip_cleanup(void)
+{
+	int i;
+	struct veip_struct *veip;
+
+	spin_lock(&veip_lock);
+	for (i = 0; i < VEIP_HASH_SZ; i++)
+		while (!hlist_empty(ip_entry_hash_table + i)) {
+			struct ip_entry_struct *entry;
+
+			entry = hlist_entry(ip_entry_hash_table[i].first,
+					struct ip_entry_struct, ip_hash);
+			hlist_del(&entry->ip_hash);
+			list_del(&entry->ve_list);
+			kfree(entry);
+		}
+
+	/* vzredir may leave some veip-s behind */
+	while (!list_empty(&veip_lh)) {
+		veip = list_first_entry(&veip_lh, struct veip_struct, list);
+		veip_put(veip);
+	}
+	spin_unlock(&veip_lock);
+}
+
+static struct veip_pool_ops open_pool_ops = {
+	.veip_create = veip_create,
+	.veip_release = veip_release,
+	.veip_free = veip_free,
+	.veip_lookup = veip_lookup,
+};
+
+struct veip_pool_ops *veip_pool_ops = &open_pool_ops;
+EXPORT_SYMBOL(veip_pool_ops);
--- /dev/null
+++ b/drivers/net/venetdev.c
@@ -0,0 +1,1252 @@
+/*
+ *  drivers/net/venetdev.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Common part for Virtuozzo virtual network devices
+ */
+
+#include <linux/kernel.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/socket.h>
+#include <linux/errno.h>
+#include <linux/fcntl.h>
+#include <linux/in.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/nsproxy.h>
+#include <linux/tcp.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <net/addrconf.h>
+
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/unistd.h>
+
+#include <linux/inet.h>
+#include <linux/netdevice.h>
+#include <linux/etherdevice.h>
+#include <net/ip.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/if_ether.h>	/* For the statistics structure. */
+#include <linux/if_arp.h>	/* For ARPHRD_ETHER */
+#include <linux/ethtool.h>
+#include <linux/venet.h>
+#include <linux/ve_proto.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzctl_venet.h>
+#include <linux/ve.h>
+#include <linux/venet-netlink.h>
+
+struct hlist_head ip_entry_hash_table[VEIP_HASH_SZ];
+DEFINE_SPINLOCK(veip_lock);
+LIST_HEAD(veip_lh);
+static struct rtnl_link_ops venet_link_ops;
+
+#define ip_entry_hash_function(ip)  (ntohl(ip) & (VEIP_HASH_SZ - 1))
+
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip)
+{
+	hlist_add_head_rcu(&entry->ip_hash,
+			ip_entry_hash_table +
+			ip_entry_hash_function(entry->addr.key[3]));
+	list_add(&entry->ve_list, &veip->ip_lh);
+}
+
+static void ip_entry_free(struct rcu_head *rcu)
+{
+	struct ip_entry_struct *e;
+
+	e = container_of(rcu, struct ip_entry_struct, rcu);
+	kfree(e);
+}
+
+void ip_entry_unhash(struct ip_entry_struct *entry)
+{
+	list_del(&entry->ve_list);
+	hlist_del_rcu(&entry->ip_hash);
+	call_rcu(&entry->rcu, ip_entry_free);
+}
+
+static void veip_free(struct rcu_head *rcu)
+{
+	struct veip_struct *veip;
+
+	veip = container_of(rcu, struct veip_struct, rcu);
+	veip_pool_ops->veip_free(veip);
+}
+
+int veip_put(struct veip_struct *veip)
+{
+	if (!list_empty(&veip->ip_lh))
+		return 0;
+	if (!list_empty(&veip->src_lh))
+		return 0;
+	if (!list_empty(&veip->dst_lh))
+		return 0;
+
+	list_del(&veip->list);
+	call_rcu(&veip->rcu, veip_free);
+	return 1;
+}
+
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *entry;
+
+	hlist_for_each_entry_rcu(entry, ip_entry_hash_table +
+			ip_entry_hash_function(addr->key[3]), ip_hash)
+		if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+			return entry;
+	return NULL;
+}
+
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+		struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *entry;
+	struct veip_struct *veip;
+
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		return NULL;
+
+	list_for_each_entry_rcu (entry, &veip->ext_lh, list)
+		if (memcmp(&entry->addr, addr, sizeof(*addr)) == 0)
+			return entry;
+	return NULL;
+}
+
+static int venet_ext_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *entry, *found;
+	int err;
+
+	if (ve->veip == NULL)
+		return -ENONET;
+
+	entry = kzalloc(sizeof(struct ext_entry_struct), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	spin_lock(&veip_lock);
+	err = -EADDRINUSE;
+	found = venet_ext_lookup(ve, addr);
+	if (found != NULL)
+		goto out_unlock;
+
+	entry->addr = *addr;
+	list_add_rcu(&entry->list, &ve->veip->ext_lh);
+	err = 0;
+	entry = NULL;
+out_unlock:
+	spin_unlock(&veip_lock);
+	kfree(entry);
+	return err;
+}
+
+static void venet_ext_free(struct rcu_head *rcu)
+{
+	struct ext_entry_struct *e;
+
+	e = container_of(rcu, struct ext_entry_struct, rcu);
+	kfree(e);
+}
+
+static void venet_ext_release(struct ext_entry_struct *e)
+{
+	list_del_rcu(&e->list);
+	call_rcu(&e->rcu, venet_ext_free);
+}
+
+static int venet_ext_del(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ext_entry_struct *found;
+	int err;
+
+	if (ve->veip == NULL)
+		return -ENONET;
+
+	err = -EADDRNOTAVAIL;
+	spin_lock(&veip_lock);
+	found = venet_ext_lookup(ve, addr);
+	if (found == NULL)
+		goto out;
+
+	venet_ext_release(found);
+	err = 0;
+out:
+	spin_unlock(&veip_lock);
+	return err;
+}
+
+static void __venet_ext_clean(struct ve_struct *ve)
+{
+	struct ext_entry_struct *entry, *tmp;
+
+	list_for_each_entry_safe (entry, tmp, &ve->veip->ext_lh, list)
+		venet_ext_release(entry);
+}
+
+static struct veip_struct *veip_find(envid_t veid)
+{
+	struct veip_struct *ptr;
+
+	list_for_each_entry(ptr, &veip_lh, list) {
+		if (ptr->veid != veid)
+			continue;
+		return ptr;
+	}
+	return NULL;
+}
+
+struct veip_struct *veip_findcreate(envid_t veid)
+{
+	struct veip_struct *ptr;
+
+	ptr = veip_find(veid);
+	if (ptr != NULL)
+		return ERR_PTR(-EEXIST);
+
+	ptr = kzalloc(sizeof(struct veip_struct), GFP_ATOMIC);
+	if (ptr == NULL)
+		return NULL;
+	INIT_LIST_HEAD(&ptr->ip_lh);
+	INIT_LIST_HEAD(&ptr->src_lh);
+	INIT_LIST_HEAD(&ptr->dst_lh);
+	INIT_LIST_HEAD(&ptr->ext_lh);
+	ptr->veid = veid;
+	list_add(&ptr->list, &veip_lh);
+	return ptr;
+}
+
+static int veip_start(struct ve_struct *ve)
+{
+	int err, get;
+
+	spin_lock(&veip_lock);
+
+	get = ve->veip == NULL;
+	err = veip_pool_ops->veip_create(ve);
+	if (!err && get && !ve_is_super(ve))
+		__module_get(THIS_MODULE);
+
+	spin_unlock(&veip_lock);
+
+	return err;
+}
+
+static void __veip_stop(struct ve_struct *ve)
+{
+	struct list_head *p, *tmp;
+
+	list_for_each_safe(p, tmp, &ve->veip->ip_lh) {
+		struct ip_entry_struct *ptr;
+		ptr = list_entry(p, struct ip_entry_struct, ve_list);
+		ptr->active_env = NULL;
+
+		if (ptr->tgt_veip == NULL)
+			ip_entry_unhash(ptr);
+	}
+
+	veip_pool_ops->veip_release(ve);
+	if (!ve_is_super(ve))
+		module_put(THIS_MODULE);
+}
+
+static void veip_stop(struct ve_struct *ve)
+{
+	spin_lock(&veip_lock);
+	if (ve->veip)
+		__veip_stop(ve);
+	spin_unlock(&veip_lock);
+}
+
+static int veip_entry_conflict(struct ip_entry_struct *entry, struct ve_struct *ve)
+{
+	if (entry->active_env != NULL)
+		return -EADDRINUSE;
+	if (entry->tgt_veip && entry->tgt_veip->veid != ve->veid)
+		return -EADDRNOTAVAIL;
+
+	entry->active_env = ve;
+	return 0;
+}
+
+static int veip_entry_add(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *entry, *found;
+	int err;
+
+	entry = kzalloc(sizeof(struct ip_entry_struct), GFP_KERNEL);
+	if (entry == NULL)
+		return -ENOMEM;
+
+	if (ve->veip == NULL) {
+		/* This can happen if we load venet AFTER ve was started */
+		err = veip_start(ve);
+		if (err < 0)
+			goto out;
+	}
+
+	spin_lock(&veip_lock);
+	found = venet_entry_lookup(addr);
+	if (found != NULL) {
+		err = veip_entry_conflict(found, ve);
+		goto out_unlock;
+	}
+
+	entry->active_env = ve;
+	entry->addr = *addr;
+	ip_entry_hash(entry, ve->veip);
+
+	err = 0;
+	entry = NULL;
+out_unlock:
+	spin_unlock(&veip_lock);
+out:
+	kfree(entry);
+
+	return err;
+}
+
+static int veip_entry_del(struct ve_struct *ve, struct ve_addr_struct *addr)
+{
+	struct ip_entry_struct *found;
+	int err;
+
+	err = -EADDRNOTAVAIL;
+	spin_lock(&veip_lock);
+	found = venet_entry_lookup(addr);
+	if (found == NULL)
+		goto out;
+	if (found->active_env == NULL)
+		goto out;
+	if (found->active_env->veid != ve->veid)
+		goto out;
+
+	err = 0;
+	found->active_env = NULL;
+
+	if (found->tgt_veip == NULL)
+		ip_entry_unhash(found);
+out:
+	spin_unlock(&veip_lock);
+	return err;
+}
+
+static int convert_sockaddr(struct sockaddr *addr, int addrlen,
+		struct ve_addr_struct *veaddr)
+{
+	int err;
+
+	switch (addr->sa_family) {
+	case AF_INET: {
+		struct sockaddr_in *sin;
+
+		err = -EINVAL;
+		if (addrlen != sizeof(struct sockaddr_in))
+			break;
+
+		err = 0;
+		sin = (struct sockaddr_in *)addr;
+		veaddr->family = AF_INET;
+		veaddr->key[0] = 0;
+		veaddr->key[1] = 0;
+		veaddr->key[2] = 0;
+		veaddr->key[3] = sin->sin_addr.s_addr;
+		break;
+	}
+	case AF_INET6: {
+		struct sockaddr_in6 *sin;
+
+		err = -EINVAL;
+		if (addrlen != sizeof(struct sockaddr_in6))
+			break;
+
+		err = 0;
+		sin = (struct sockaddr_in6 *)addr;
+		veaddr->family = AF_INET6;
+		memcpy(veaddr->key, &sin->sin6_addr, sizeof(veaddr->key));
+		break;
+	}
+	default:
+		err = -EAFNOSUPPORT;
+	}
+	return err;
+}
+
+int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+		struct ve_addr_struct *veaddr)
+{
+	int err;
+	char addr[MAX_SOCK_ADDR];
+
+	err = move_addr_to_kernel(uaddr, addrlen, (struct sockaddr_storage *)&addr);
+	if (err < 0)
+		goto out;
+
+	err = convert_sockaddr((struct sockaddr *)&addr, addrlen, veaddr);
+out:
+	return err;
+}
+
+int in4_to_veaddr(const char *addr, struct ve_addr_struct *veaddr)
+{
+	veaddr->family = AF_INET;
+	if (!in4_pton(addr, -1, (u8 *)(&veaddr->key[3]), -1, NULL))
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(in4_to_veaddr);
+
+int in6_to_veaddr(const char *addr, struct ve_addr_struct *veaddr)
+{
+	veaddr->family = AF_INET6;
+	if (!in6_pton(addr, -1, (u8 *)(veaddr->key), -1, NULL))
+		return -EINVAL;
+	return 0;
+}
+EXPORT_SYMBOL(in6_to_veaddr);
+
+void veaddr_print(char *str, int len, struct ve_addr_struct *a)
+{
+	if (a->family == AF_INET)
+		snprintf(str, len, "%u.%u.%u.%u", NIPQUAD(a->key[3]));
+	else
+		snprintf(str, len, "%x:%x:%x:%x:%x:%x:%x:%x",
+				ntohl(a->key[0])>>16, ntohl(a->key[0])&0xFFFF,
+				ntohl(a->key[1])>>16, ntohl(a->key[1])&0xFFFF,
+				ntohl(a->key[2])>>16, ntohl(a->key[2])&0xFFFF,
+				ntohl(a->key[3])>>16, ntohl(a->key[3])&0xFFFF
+			);
+}
+
+/*
+ * Device functions
+ */
+
+static int venet_open(struct net_device *dev)
+{
+	if (!ve_is_super(get_exec_env()) && !try_module_get(THIS_MODULE))
+		return -EBUSY;
+	return 0;
+}
+
+static int venet_close(struct net_device *master)
+{
+	if (!ve_is_super(get_exec_env()))
+		module_put(THIS_MODULE);
+	return 0;
+}
+
+void (*venet_free_stat)(struct ve_struct *) = NULL;
+EXPORT_SYMBOL(venet_free_stat);
+
+static void venet_destructor(struct net_device *dev)
+{
+	struct venet_stats *stats = (struct venet_stats *)dev->ml_priv;
+	if (stats == NULL)
+		return;
+	free_percpu(stats->real_stats);
+	kfree(stats);
+	dev->ml_priv = NULL;
+
+	if (venet_free_stat)
+		venet_free_stat(dev->nd_net->owner_ve);
+}
+
+/*
+ * The higher levels take care of making this non-reentrant (it's
+ * called with bh's disabled).
+ */
+static int venet_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct net_device_stats *stats;
+	struct net_device *rcv = NULL;
+	struct ve_struct *ve;
+	int length;
+
+	stats = venet_stats(dev, smp_processor_id());
+	ve = dev_net(dev)->owner_ve;
+	if (unlikely(ve->disable_net))
+		goto outf;
+
+	if (skb->protocol == __constant_htons(ETH_P_IP)) {
+		struct iphdr *iph;
+		iph = ip_hdr(skb);
+		if (ipv4_is_multicast(iph->daddr))
+			goto outf;
+	} else if (skb->protocol == __constant_htons(ETH_P_IPV6)) {
+		struct ipv6hdr *ip6h;
+		ip6h = ipv6_hdr(skb);
+		if (ipv6_addr_is_multicast(&ip6h->daddr))
+			goto outf;
+		skb_orphan(skb);
+	} else {
+		goto outf;
+	}
+
+	ve = veip_pool_ops->veip_lookup(ve, skb);
+	if (IS_ERR(ve))
+		goto outf;
+
+	if (unlikely(ve->disable_net))
+		goto outf;
+
+	rcv = ve->_venet_dev;
+	if (!rcv)
+		/* VE going down */
+		goto outf;
+
+	dev_hold(rcv);
+
+	if (!(rcv->flags & IFF_UP)) {
+		/* Target VE does not want to receive packets */
+		dev_put(rcv);
+		goto outf;
+	}
+
+	skb->pkt_type = PACKET_HOST;
+	skb->dev = rcv;
+
+	/*
+	 * If there is not enough space for the header, we allocate one.
+	 * Remember that traffic can reach a VE from the outside world,
+	 * and as a result we have to clean up the MAC address of such
+	 * packets.  The same applies to traffic which comes from inside
+	 * a VE: if TUN is used and traffic gets fragmented, we might
+	 * reach a point where there is no L2 header at all and
+	 * hard_header_len is simply ignored (this parameter is only a
+	 * hint for the upper net layers, never a guarantee that a header
+	 * will be provided).  To unify the way packets are seen after
+	 * venet, we always produce an L2 header with a zeroed MAC.
+	 */
+	if (unlikely(skb_headroom(skb) < dev->hard_header_len)) {
+		struct sk_buff *skb2;
+
+		skb2 = skb_realloc_headroom(skb, LL_RESERVED_SPACE(dev));
+		if (!skb2) {
+			dev_put(rcv);
+			goto outf;
+		}
+		if (skb->sk)
+			skb_set_owner_w(skb2, skb->sk);
+		kfree_skb(skb);
+		skb = skb2;
+	}
+
+	skb_reset_mac_header(skb);
+	memset(skb->data - dev->hard_header_len, 0, dev->hard_header_len);
+
+	nf_reset(skb);
+	length = skb->len;
+
+	netif_rx(skb);
+
+	stats->tx_bytes += length;
+	stats->tx_packets++;
+	if (rcv) {
+		struct net_device_stats *rcv_stats;
+
+		rcv_stats = venet_stats(rcv, smp_processor_id());
+		rcv_stats->rx_bytes += length;
+		rcv_stats->rx_packets++;
+		dev_put(rcv);
+	}
+
+	return 0;
+
+outf:
+	kfree_skb(skb);
+	++stats->tx_dropped;
+	return 0;
+}
+
+static struct net_device_stats *get_stats(struct net_device *dev)
+{
+	int i;
+	struct venet_stats *stats;
+
+	stats = (struct venet_stats *)dev->ml_priv;
+	memset(&stats->stats, 0, sizeof(struct net_device_stats));
+	for_each_possible_cpu(i) {
+		struct net_device_stats *dev_stats;
+
+		dev_stats = venet_stats(dev, i);
+		stats->stats.rx_bytes   += dev_stats->rx_bytes;
+		stats->stats.tx_bytes   += dev_stats->tx_bytes;
+		stats->stats.rx_packets += dev_stats->rx_packets;
+		stats->stats.tx_packets += dev_stats->tx_packets;
+		stats->stats.tx_dropped += dev_stats->tx_dropped;
+	}
+
+	return &stats->stats;
+}
+
+/* Initialize the rest of the venet device. */
+static int venet_init_dev(struct net_device *dev)
+{
+	struct venet_stats *stats;
+
+	stats = kzalloc(sizeof(struct venet_stats), GFP_KERNEL);
+	if (stats == NULL)
+		goto fail;
+	stats->real_stats = alloc_percpu(struct net_device_stats);
+	if (stats->real_stats == NULL)
+		goto fail_free;
+	dev->ml_priv = stats;
+
+	/*
+	 *	Fill in the generic fields of the device structure.
+	 */
+	dev->type		= ARPHRD_VOID;
+	dev->hard_header_len 	= ETH_HLEN;
+	dev->mtu		= 1500; /* eth_mtu */
+	dev->tx_queue_len	= 0;
+
+	memset(dev->broadcast, 0xFF, ETH_ALEN);
+
+	/* New-style flags. */
+	dev->flags		= IFF_BROADCAST|IFF_NOARP|IFF_POINTOPOINT;
+	return 0;
+
+fail_free:
+	kfree(stats);
+fail:
+	return -ENOMEM;
+}
+
+static netdev_features_t common_features;
+static const struct net_device_ops venet_netdev_ops;
+
+static int venet_set_features(struct net_device *dev,
+			      netdev_features_t features)
+{
+	struct net *net;
+
+	common_features = features;
+	for_each_net(net) {
+		for_each_netdev(net, dev) {
+			if (dev->netdev_ops == &venet_netdev_ops)
+				dev->features = features;
+		}
+	}
+	return 0;
+}
+
+#define DRV_NAME	"vz-venet"
+#define DRV_VERSION	"1.0"
+
+/*
+ * ethtool interface
+ */
+
+static struct {
+	const char string[ETH_GSTRING_LEN];
+} ethtool_stats_keys[] = {
+	{ "ifindex" },
+};
+
+static int venet_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+{
+	cmd->supported		= 0;
+	cmd->advertising	= 0;
+	ethtool_cmd_speed_set(cmd, SPEED_10000);
+	cmd->duplex		= DUPLEX_FULL;
+	cmd->port		= PORT_TP;
+	cmd->phy_address	= 0;
+	cmd->transceiver	= XCVR_INTERNAL;
+	cmd->autoneg		= AUTONEG_DISABLE;
+	cmd->maxtxpkt		= 0;
+	cmd->maxrxpkt		= 0;
+	return 0;
+}
+
+static void venet_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *info)
+{
+	strlcpy(info->driver, DRV_NAME, sizeof(info->driver));
+	strlcpy(info->version, DRV_VERSION, sizeof(info->version));
+}
+
+static void venet_get_strings(struct net_device *dev, u32 stringset, u8 *buf)
+{
+	switch (stringset) {
+	case ETH_SS_STATS:
+		memcpy(buf, &ethtool_stats_keys, sizeof(ethtool_stats_keys));
+		break;
+	}
+}
+
+static int venet_get_sset_count(struct net_device *dev, int sset)
+{
+	switch (sset) {
+	case ETH_SS_STATS:
+		return ARRAY_SIZE(ethtool_stats_keys);
+	default:
+		return -EOPNOTSUPP;
+	}
+}
+
+static void venet_get_ethtool_stats(struct net_device *dev,
+		struct ethtool_stats *stats, u64 *data)
+{
+	/*
+	 * TODO: copy proper statistics here.
+	 */
+	data[0] = dev->ifindex;
+}
+
+static const struct ethtool_ops venet_ethtool_ops = {
+	.get_settings		= venet_get_settings,
+	.get_drvinfo		= venet_get_drvinfo,
+	.get_link		= ethtool_op_get_link,
+	.get_strings		= venet_get_strings,
+	.get_sset_count		= venet_get_sset_count,
+	.get_ethtool_stats	= venet_get_ethtool_stats,
+};
+
+static const struct net_device_ops venet_netdev_ops = {
+	.ndo_start_xmit = venet_xmit,
+	.ndo_get_stats = get_stats,
+	.ndo_open = venet_open,
+	.ndo_stop = venet_close,
+	.ndo_init = venet_init_dev,
+	.ndo_set_features = venet_set_features,
+};
+
+static void venet_setup(struct net_device *dev)
+{
+	/*
+	 * No other features are set, since:
+	 *  - checksumming is required, and nobody else will do our job
+	 */
+	dev->features |= NETIF_F_VENET | NETIF_F_VIRTUAL | NETIF_F_LLTX |
+	       NETIF_F_HIGHDMA | NETIF_F_VLAN_CHALLENGED;
+
+	dev->netdev_ops = &venet_netdev_ops;
+	dev->destructor = venet_destructor;
+
+	dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
+
+	dev->features |= common_features;
+
+	SET_ETHTOOL_OPS(dev, &venet_ethtool_ops);
+}
+
+static void veip_shutdown(void *data)
+{
+	struct ve_struct *ve = data;
+
+	spin_lock(&veip_lock);
+	if (ve->veip) {
+		__venet_ext_clean(ve);
+		__veip_stop(ve);
+	}
+	spin_unlock(&veip_lock);
+}
+
+static void venet_dellink(struct net_device *dev, struct list_head *head)
+{
+	struct ve_struct *env = dev->nd_net->owner_ve;
+
+	/* We check ve_netns to avoid races with the veip SHUTDOWN hook,
+	 * which is called from ve_exit_ns().
+	 */
+	if (env->ve_netns)
+		veip_shutdown(env);
+
+	env->_venet_dev = NULL;
+	unregister_netdevice_queue(dev, head);
+}
+
+static int venet_newlink(struct net *src_net, struct net_device *dev,
+		  struct nlattr *tb[], struct nlattr *data[])
+{
+	struct ve_struct *env = src_net->owner_ve;
+	int err;
+
+	if (!env->ve_netns)
+		return -EBUSY;
+
+	if (src_net != env->ve_netns)
+		/* Don't create venet devices in sub net namespaces */
+		return -ENOSYS;
+
+	if (env->veip)
+		return -EEXIST;
+
+	err = veip_start(env);
+	if (err)
+		return err;
+
+	dev->features |= NETIF_F_NETNS_LOCAL;
+
+	err = register_netdevice(dev);
+	if (err)
+		goto err_stop;
+
+	env->_venet_dev = dev;
+	return 0;
+
+err_stop:
+	veip_stop(env);
+	return err;
+}
+
+#ifdef CONFIG_PROC_FS
+static void veaddr_seq_print(struct seq_file *m, struct ve_struct *ve)
+{
+	struct ip_entry_struct *entry;
+	struct veip_struct *veip;
+
+	spin_lock(&veip_lock);
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		goto unlock;
+	list_for_each_entry (entry, &veip->ip_lh, ve_list) {
+		char addr[40];
+
+		if (entry->active_env == NULL)
+			continue;
+
+		veaddr_print(addr, sizeof(addr), &entry->addr);
+		if (entry->addr.family == AF_INET)
+			seq_printf(m, " %15s", addr);
+		else
+			seq_printf(m, " %39s", addr);
+	}
+unlock:
+	spin_unlock(&veip_lock);
+}
+
+static void *veip_seq_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t l;
+	struct ip_entry_struct *s;
+	int i;
+
+	l = *pos;
+	rcu_read_lock();
+	if (l == 0) {
+		m->private = (void *)0;
+		return SEQ_START_TOKEN;
+	}
+
+	for (i = 0; i < VEIP_HASH_SZ; i++) {
+		hlist_for_each_entry_rcu(s, ip_entry_hash_table + i, ip_hash) {
+			if (--l == 0) {
+				m->private = (void *)(long)(i + 1);
+				return &s->ip_hash;
+			}
+		}
+	}
+	return NULL;
+}
+
+static void *veip_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct hlist_node *p;
+	int i;
+
+	if (v == SEQ_START_TOKEN)
+		goto find;
+
+	p = rcu_dereference(((struct hlist_node *)v)->next);
+	if (p != NULL)
+		goto found;
+
+find:
+	for (i = (int)(long)m->private; i < VEIP_HASH_SZ; i++) {
+		p = rcu_dereference(ip_entry_hash_table[i].first);
+		if (p != NULL) {
+			m->private = (void *)(long)(i + 1);
+found:
+			(*pos)++;
+			return p;
+		}
+	}
+
+	return NULL;
+}
+
+static void veip_seq_stop(struct seq_file *m, void *v)
+{
+	rcu_read_unlock();
+}
+
+static int veip_seq_show(struct seq_file *m, void *v)
+{
+	struct hlist_node *p;
+	struct ip_entry_struct *entry;
+	struct veip_struct *veip;
+	char s[40];
+
+	if (v == SEQ_START_TOKEN) {
+		seq_puts(m, "Version: 2.5\n");
+		return 0;
+	}
+
+	p = (struct hlist_node *)v;
+	entry = hlist_entry(p, struct ip_entry_struct, ip_hash);
+	veaddr_print(s, sizeof(s), &entry->addr);
+	veip = ACCESS_ONCE(entry->tgt_veip);
+	seq_printf(m, "%39s %10u\n", s, veip == NULL ? 0 : veip->veid);
+	return 0;
+}
+
+static struct seq_operations veip_seq_op = {
+	.start	= veip_seq_start,
+	.next	= veip_seq_next,
+	.stop	= veip_seq_stop,
+	.show	= veip_seq_show,
+};
+
+static int veip_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &veip_seq_op);
+}
+
+static struct file_operations proc_veip_operations = {
+	.open		= veip_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
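
For orientation: this seq_file is registered as "veip" under the vz proc
directory in venet_init() below, so it normally surfaces as /proc/vz/veip.
It prints a version header followed by one "%39s %10u" line per hashed
entry; illustrative output (addresses and container ids are made up):

	Version: 2.5
	                              192.0.2.1        101
	                            2001:db8::1        102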
+
+static int do_ve_ip_map(struct ve_struct *ve, int op, struct ve_addr_struct *addr)
+{
+	int err;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	down_read(&ve->op_sem);
+	switch (op) {
+		case VE_IP_ADD:
+			/*
+			 * FIXME: We should check that the VE is
+			 * either running or in the restore state
+			 * instead of allowing addresses to be
+			 * added arbitrarily.
+			 */
+			err = veip_entry_add(ve, addr);
+			break;
+
+		case VE_IP_DEL:
+			err = veip_entry_del(ve, addr);
+			break;
+		case VE_IP_EXT_ADD:
+			err = venet_ext_add(ve, addr);
+			break;
+		case VE_IP_EXT_DEL:
+			err = venet_ext_del(ve, addr);
+			break;
+		default:
+			err = -EINVAL;
+	}
+	up_read(&ve->op_sem);
+	return err;
+}
+
+static int real_ve_ip_map(envid_t veid, int op,
+			  struct sockaddr __user *uaddr, int addrlen)
+{
+	int err;
+	struct ve_addr_struct addr;
+	struct ve_struct *ve;
+
+	err = sockaddr_to_veaddr(uaddr, addrlen, &addr);
+	if (err < 0)
+		return err;
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -ESRCH;
+
+	err = do_ve_ip_map(ve, op, &addr);
+	put_ve(ve);
+	return err;
+}
+
+int venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = -ENOTTY;
+	switch (cmd) {
+	case VENETCTL_VE_IP_MAP: {
+		struct vzctl_ve_ip_map s;
+		err = -EFAULT;
+		if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+			break;
+		err = real_ve_ip_map(s.veid, s.op, s.addr, s.addrlen);
+		break;
+	}
+	}
+	return err;
+}
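
For illustration, a hedged userspace sketch of driving VENETCTL_VE_IP_MAP.
The struct fields mirror the dispatch above; the /dev/vzctl node and the
<linux/venet.h> header are assumptions about the surrounding OpenVZ
userland ABI, not something this patch defines:

	/* Hypothetical userspace sketch -- device node and header
	 * locations are assumptions.
	 */
	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <arpa/inet.h>
	#include <linux/venet.h>	/* assumed to carry the ioctl ABI */

	int main(void)
	{
		struct sockaddr_in sin = {
			.sin_family = AF_INET,
			.sin_addr.s_addr = inet_addr("192.0.2.1"),
		};
		struct vzctl_ve_ip_map s = {
			.veid	 = 101,		/* example container id */
			.op	 = VE_IP_ADD,
			.addr	 = (struct sockaddr *)&sin,
			.addrlen = sizeof(sin),
		};
		int fd = open("/dev/vzctl", O_RDWR);	/* assumed node */

		if (fd < 0 || ioctl(fd, VENETCTL_VE_IP_MAP, &s))
			perror("VENETCTL_VE_IP_MAP");
		return 0;
	}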
+
+#ifdef CONFIG_COMPAT
+int compat_venet_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case VENETCTL_COMPAT_VE_IP_MAP: {
+		struct compat_vzctl_ve_ip_map cs;
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+
+		err = real_ve_ip_map(cs.veid, cs.op, compat_ptr(cs.addr),
+				cs.addrlen);
+		break;
+	}
+	default:
+		err = venet_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo venetcalls = {
+	.type		= VENETCTLTYPE,
+	.ioctl		= venet_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_venet_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+static int ve_ip_access_write(struct cgroup *cgrp, struct cftype *cft,
+			      const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cgrp);
+	struct ve_addr_struct addr;
+	int ret;
+
+	if (!ve->veid)
+		return -ENOENT;
+
+	memset(&addr, 0, sizeof(addr));
+	if (strncmp(cft->name, "ip6", 3)) {
+		if ((ret = in4_to_veaddr(buffer, &addr)) != 0)
+			return ret;
+	} else {
+		if ((ret = in6_to_veaddr(buffer, &addr)) != 0)
+			return ret;
+	}
+
+	return do_ve_ip_map(ve, cft->private, &addr);
+}
+
+static int ve_ip_access_seq_read(struct cgroup *cgrp, struct cftype *cft,
+				 struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cgrp);
+	struct ip_entry_struct *s;
+	char buf[40];
+	int family = strncmp(cft->name, "ip6", 3) ? AF_INET : AF_INET6;
+	int i;
+
+	if (!ve->veid)
+		return -ENOENT;
+
+	rcu_read_lock();
+	for (i = 0; i < VEIP_HASH_SZ; i++) {
+		hlist_for_each_entry_rcu(s, ip_entry_hash_table + i,
+					 ip_hash) {
+			if (s->addr.family == family &&
+			    s->active_env && s->active_env->veid == ve->veid) {
+				veaddr_print(buf, sizeof(buf), &s->addr);
+				seq_printf(m, "%s\n", buf);
+			}
+		}
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static struct cftype venet_cftypes[] = {
+	{
+		.name = "ip_allow",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_ADD,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip_deny",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_DEL,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip_list",
+		.read_seq_string = ve_ip_access_seq_read,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip6_allow",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_ADD,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip6_deny",
+		.write_string = ve_ip_access_write,
+		.private = VE_IP_DEL,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{
+		.name = "ip6_list",
+		.read_seq_string = ve_ip_access_seq_read,
+		.flags = CFTYPE_NOT_ON_ROOT,
+	},
+	{ }
+};
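
A hedged sketch of exercising these knobs from userspace.  Only the file
names come from the cftypes above; the mount point and the "ve." prefix
(normally added by the cgroup core for a named subsystem) are assumptions
about the running setup:

	#include <stdio.h>

	/* Hypothetical: allow an IPv4 address for container <ct>. */
	static int ve_allow_ip(const char *ct, const char *ip)
	{
		char path[256];
		FILE *f;

		snprintf(path, sizeof(path),
			 "/sys/fs/cgroup/ve/%s/ve.ip_allow", ct);
		f = fopen(path, "w");
		if (!f)
			return -1;
		fprintf(f, "%s\n", ip);
		return fclose(f);
	}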
+
+static int venet_changelink(struct net_device *dev, struct nlattr *tb[],
+			    struct nlattr *data[])
+{
+	struct venetaddrmsg *vamp;
+	struct nlattr *nla_addr;
+	struct ve_struct *ve;
+	struct ve_addr_struct addr;
+	int cmd;
+
+	ve = dev_net(dev)->owner_ve;
+	if (ve_is_super(ve))
+		return -EINVAL;
+
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	if (!data[VENET_INFO_CMD])
+		return -EINVAL;
+
+	nla_addr = data[VENET_INFO_CMD];
+	vamp = nla_data(nla_addr);
+
+	memset(&addr, 0, sizeof(addr));
+	addr.family = vamp->va_family;
+
+	if (addr.family == AF_INET)
+		memcpy(&addr.key[3], &vamp->va_addr[0], 4);
+	else if (addr.family == AF_INET6)
+		memcpy(&addr.key[0], &vamp->va_addr[0], sizeof(addr.key));
+	else
+		return -EINVAL;
+
+	if (vamp->va_cmd == VENET_IP_ADD)
+		cmd = VE_IP_ADD;
+	else if (vamp->va_cmd == VENET_IP_DEL)
+		cmd = VE_IP_DEL;
+	else
+		return -EINVAL;
+
+	return do_ve_ip_map(ve, cmd, &addr);
+}
+
+static const struct nla_policy venet_policy[VENET_INFO_MAX + 1] = {
+	[VENET_INFO_CMD]	= { .len = sizeof(struct venetaddrmsg) },
+};
+
+static struct rtnl_link_ops venet_link_ops = {
+	.kind		= "venet",
+	.priv_size	= sizeof(struct veip_struct),
+	.newlink	= venet_newlink,
+	.dellink	= venet_dellink,
+	.setup		= venet_setup,
+	.changelink	= venet_changelink,
+	.policy		= venet_policy,
+	.maxtype	= VENET_INFO_MAX,
+};
+
+static struct ve_hook veip_shutdown_hook = {
+	.fini		= veip_shutdown,
+	.priority	= HOOK_PRIO_FINISHING,
+	.owner		= THIS_MODULE,
+};
+
+__init int venet_init(void)
+{
+	struct proc_dir_entry *de;
+	int i, err;
+
+	if (get_ve0()->_venet_dev != NULL)
+		return -EEXIST;
+
+	for (i = 0; i < VEIP_HASH_SZ; i++)
+		INIT_HLIST_HEAD(ip_entry_hash_table + i);
+
+	de = proc_create("veip", S_IFREG | S_IRUSR, proc_vz_dir,
+			&proc_veip_operations);
+	if (!de)
+		return -EINVAL;
+
+	err = cgroup_add_cftypes(&ve_subsys, venet_cftypes);
+	if (err)
+		goto err_proc;
+
+	vzioctl_register(&venetcalls);
+	vzmon_register_veaddr_print_cb(veaddr_seq_print);
+	ve_hook_register(VE_SHUTDOWN_CHAIN, &veip_shutdown_hook);
+
+	return rtnl_link_register(&venet_link_ops);
+
+err_proc:
+	remove_proc_entry("veip", proc_vz_dir);
+	return err;
+}
+
+__exit void venet_exit(void)
+{
+	cgroup_rm_cftypes(&ve_subsys, venet_cftypes);
+	vzmon_unregister_veaddr_print_cb(veaddr_seq_print);
+	vzioctl_unregister(&venetcalls);
+	remove_proc_entry("veip", proc_vz_dir);
+	veip_cleanup();
+
+	/* Ensure there are no outstanding rcu callbacks */
+	rcu_barrier();
+
+	BUG_ON(!list_empty(&veip_lh));
+	rtnl_link_unregister(&venet_link_ops);
+}
+
+module_init(venet_init);
+module_exit(venet_exit);
+
+MODULE_AUTHOR("Parallels <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo Virtual Network Device");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("vznet");
+
+EXPORT_SYMBOL(veip_lock);
+EXPORT_SYMBOL(ip_entry_hash);
+EXPORT_SYMBOL(ip_entry_unhash);
+EXPORT_SYMBOL(sockaddr_to_veaddr);
+EXPORT_SYMBOL(veaddr_print);
+EXPORT_SYMBOL(venet_entry_lookup);
+EXPORT_SYMBOL(veip_findcreate);
+EXPORT_SYMBOL(veip_put);
+EXPORT_SYMBOL(venet_ext_lookup);
+EXPORT_SYMBOL(veip_lh);
+EXPORT_SYMBOL(ip_entry_hash_table);
--- a/drivers/net/veth.c
+++ b/drivers/net/veth.c
@@ -19,6 +19,7 @@
 #include <net/xfrm.h>
 #include <linux/veth.h>
 #include <linux/module.h>
+#include "../../net/bridge/br_private.h"
 
 #define DRV_NAME	"veth"
 #define DRV_VERSION	"1.0"
@@ -106,6 +107,29 @@ static const struct ethtool_ops veth_ethtool_ops = {
 	.get_ethtool_stats	= veth_get_ethtool_stats,
 };
 
+static int vzethdev_filter(struct sk_buff *skb, struct net_device *dev, struct net_device *rcv)
+{
+	/* Filtering */
+	if (ve_is_super(dev_net(dev)->owner_ve) &&
+	    dev->features & NETIF_F_FIXED_ADDR) {
+		/* from VE0 to VEX */
+		if (ve_is_super(dev_net(rcv)->owner_ve))
+			return 1;
+		if (is_multicast_ether_addr(
+					((struct ethhdr *)skb->data)->h_dest))
+			return 1;
+		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_dest, rcv->dev_addr))
+			return 0;
+	} else if (!ve_is_super(dev_net(dev)->owner_ve) &&
+		   dev->features & NETIF_F_FIXED_ADDR) {
+		/* from VEX to VE0 */
+		if (compare_ether_addr(((struct ethhdr *)skb->data)->h_source, dev->dev_addr))
+			return 0;
+	}
+
+	return 1;
+}
+
 static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct veth_priv *priv = netdev_priv(dev);
@@ -119,6 +143,12 @@ static netdev_tx_t veth_xmit(struct sk_buff *skb, struct net_device *dev)
 		goto drop;
 	}
 
+	if (dev->features & NETIF_F_VENET && !vzethdev_filter(skb, dev, rcv)) {
+		kfree_skb(skb);
+		goto drop;
+	}
+
 	if (likely(dev_forward_skb(rcv, skb) == NET_RX_SUCCESS)) {
 		struct pcpu_vstats *stats = this_cpu_ptr(dev->vstats);
 
@@ -276,6 +306,59 @@ out:
 	rcu_read_unlock();
 }
 
+static int veth_mac_addr(struct net_device *dev, void *p)
+{
+	if (dev->features & NETIF_F_VENET &&
+	    dev->features & NETIF_F_FIXED_ADDR)
+		return -EPERM;
+	return eth_mac_addr(dev, p);
+}
+
+static int vzethdev_net_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
+	switch (cmd) {
+	case SIOCSVENET:
+	{
+		struct veth_priv *priv = netdev_priv(dev);
+		struct net_device *rcv;
+
+		rcu_read_lock();
+		rcv = rcu_dereference(priv->peer);
+		if (rcv)
+			rcv->features |= NETIF_F_VENET;
+		dev->features |= NETIF_F_VENET;
+		rcu_read_unlock();
+
+		return 0;
+	}
+	case SIOCSFIXEDADDR:
+		if (ifr->ifr_ifru.ifru_flags)
+			dev->features |= NETIF_F_FIXED_ADDR;
+		else
+			dev->features &= ~NETIF_F_FIXED_ADDR;
+		return 0;
+	}
+	return -ENOTTY;
+}
+
+#ifdef CONFIG_NET_POLL_CONTROLLER
+static void veth_poll_controller(struct net_device *dev)
+{
+	/* veth only receives frames when its peer sends one.
+	 * Since it's a synchronous operation, we are guaranteed
+	 * never to have pending data when we poll for it, so
+	 * there is nothing to do here.
+	 *
+	 * We need this though so netpoll recognizes us as an interface that
+	 * supports polling, which enables bridge devices in virt setups to
+	 * still use netconsole.
+	 */
+}
+#endif	/* CONFIG_NET_POLL_CONTROLLER */
+
 static const struct net_device_ops veth_netdev_ops = {
 	.ndo_init            = veth_dev_init,
 	.ndo_open            = veth_open,
@@ -283,10 +366,14 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_start_xmit      = veth_xmit,
 	.ndo_change_mtu      = veth_change_mtu,
 	.ndo_get_stats64     = veth_get_stats64,
-	.ndo_set_mac_address = eth_mac_addr,
+	.ndo_set_mac_address = veth_mac_addr,
 	.ndo_get_iflink		= veth_get_iflink,
 	.ndo_size		= sizeof(struct net_device_ops),
 	.extended.ndo_set_rx_headroom	= veth_set_rx_headroom,
+	.ndo_do_ioctl        = vzethdev_net_ioctl,
+#ifdef CONFIG_NET_POLL_CONTROLLER
+	.ndo_poll_controller	= veth_poll_controller,
+#endif
 };
 
 #define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
@@ -307,7 +394,7 @@ static void veth_setup(struct net_device *dev)
 	dev->netdev_ops = &veth_netdev_ops;
 	dev->ethtool_ops = &veth_ethtool_ops;
 	dev->features |= NETIF_F_LLTX;
-	dev->features |= VETH_FEATURES;
+	dev->features |= VETH_FEATURES | NETIF_F_VIRTUAL;
 	dev->vlan_features = dev->features &
 			     ~(NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX);
 	dev->destructor = veth_dev_free;
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2578,6 +2578,7 @@ static void vxlan_setup(struct net_device *dev)
 
 	dev->vlan_features = dev->features;
 	dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
+	dev->features |= NETIF_F_VIRTUAL;
 	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
 	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
 	dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
--- a/drivers/scsi/be2iscsi/be_main.c
+++ b/drivers/scsi/be2iscsi/be_main.c
@@ -49,7 +49,6 @@ static unsigned int be_iopoll_budget = 10;
 static unsigned int be_max_phys_size = 64;
 static unsigned int enable_msix = 1;
 
-MODULE_DEVICE_TABLE(pci, beiscsi_pci_id_table);
 MODULE_DESCRIPTION(DRV_DESC " " BUILD_STR);
 MODULE_VERSION(BUILD_STR);
 MODULE_AUTHOR("Emulex Corporation");
--- a/drivers/scsi/hosts.c
+++ b/drivers/scsi/hosts.c
@@ -40,6 +40,7 @@
 
 #include "scsi_priv.h"
 #include "scsi_logging.h"
+#include "scsi_dbg.h"
 
 
 static atomic_t scsi_host_next_hn = ATOMIC_INIT(0);	/* host_no for next new host */
@@ -140,12 +141,13 @@ int scsi_host_set_state(struct Scsi_Host *shost, enum scsi_host_state state)
 	return 0;
 
  illegal:
-	SCSI_LOG_ERROR_RECOVERY(1,
-				shost_printk(KERN_ERR, shost,
-					     "Illegal host state transition"
-					     "%s->%s\n",
-					     scsi_host_state_name(oldstate),
-					     scsi_host_state_name(state)));
+	shost_printk(KERN_ERR, shost,
+		     "Illegal host state transition "
+		     "%s->%s\n",
+		     scsi_host_state_name(oldstate),
+		     scsi_host_state_name(state));
+	dump_stack();
+
 	return -EINVAL;
 }
 EXPORT_SYMBOL(scsi_host_set_state);
@@ -358,6 +360,7 @@ static void scsi_host_dev_release(struct device *dev)
 
 	if (parent)
 		put_device(parent);
+	kfree(SHOST_TO_SDBG(shost));
 	kfree(shost);
 }
 
@@ -388,6 +391,7 @@ static struct device_type scsi_host_type = {
 struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 {
 	struct Scsi_Host *shost;
+	struct scsi_host_dbg *sdbg;
 	gfp_t gfp_mask = GFP_KERNEL;
 
 	if (sht->unchecked_isa_dma && privsize)
@@ -397,6 +401,15 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
 	if (!shost)
 		return NULL;
 
+	sdbg = kzalloc(sizeof(struct scsi_host_dbg), gfp_mask);
+	if (!sdbg) {
+		kfree(shost);
+		return NULL;
+	}
+
+	SHOST_TO_SDBG(shost) = sdbg;
+	spin_lock_init(&sdbg->sdbg_lock);
+
 	shost->host_lock = &shost->default_lock;
 	spin_lock_init(shost->host_lock);
 	shost->shost_state = SHOST_CREATED;
@@ -514,6 +527,7 @@ struct Scsi_Host *scsi_host_alloc(struct scsi_host_template *sht, int privsize)
  fail_kthread:
 	kthread_stop(shost->ehandler);
  fail_kfree:
+	kfree(SHOST_TO_SDBG(shost));
 	kfree(shost);
 	return NULL;
 }
--- a/drivers/scsi/libsas/sas_scsi_host.c
+++ b/drivers/scsi/libsas/sas_scsi_host.c
@@ -41,6 +41,7 @@
 #include "../scsi_sas_internal.h"
 #include "../scsi_transport_api.h"
 #include "../scsi_priv.h"
+#include "../scsi_dbg.h"
 
 #include <linux/err.h>
 #include <linux/blkdev.h>
@@ -234,6 +235,7 @@ static void sas_eh_finish_cmd(struct scsi_cmnd *cmd)
 	 * handler done list, this also takes it off the
 	 * error handler pending list.
 	 */
+	scsi_debug_log_cmnd(SAS_EH_FINISH_CMD_CALLS_EH_FINISH, cmd);
 	scsi_eh_finish_cmd(cmd, &sas_ha->eh_done_q);
 }
 
@@ -465,6 +467,7 @@ static int sas_queue_reset(struct domain_device *dev, int reset_type, int lun, i
 			set_bit(SAS_DEV_EH_PENDING, &dev->state);
 			set_bit(reset_type, &dev->state);
 			int_to_scsilun(lun, &dev->ssp_dev.reset_lun);
+			scsi_debug_log_shost(SAS_QUEUE_RESET_CALLS_SCHEDULE_EH, ha->core.shost);
 			scsi_schedule_eh(ha->core.shost);
 		}
 		spin_unlock_irq(&ha->lock);
@@ -789,6 +792,7 @@ out:
 	/* check if any new eh work was scheduled during the last run */
 	spin_lock_irq(&ha->lock);
 	if (ha->eh_active == 0) {
+		scsi_debug_log_shost(SAS_SCSI_RECOVER_HOST_ZERO_EH_SCHEDULED, shost);
 		shost->host_eh_scheduled = 0;
 		retry = false;
 	}
--- a/drivers/scsi/scsi.c
+++ b/drivers/scsi/scsi.c
@@ -68,6 +68,7 @@
 
 #include "scsi_priv.h"
 #include "scsi_logging.h"
+#include "scsi_dbg.h"
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/scsi.h>
@@ -684,6 +685,7 @@ void scsi_finish_command(struct scsi_cmnd *cmd)
 	struct scsi_driver *drv;
 	unsigned int good_bytes;
 
+	scsi_debug_log_cmnd(SCSI_FINISH_COMMAND_CALLS_UNBUSY, cmd);
 	scsi_device_unbusy(sdev);
 
 	/*
--- /dev/null
+++ b/drivers/scsi/scsi_dbg.h
@@ -0,0 +1,133 @@
+/*
+ *  drivers/scsi/scsi_dbg.h
+ *
+ *  Copyright (c) 2016 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _SCSI_DBG_H
+#define _SCSI_DBG_H
+
+#include <scsi/scsi_cmnd.h>
+
+/*
+ * Temporary debug stuff to chase missed ehandler wakeup.
+ */
+
+#define SCSI_HOST_DBG_N_ENTRIES 45 /* fit in one page */
+
+enum scsi_dbg_type {
+	SCSI_HOST_QUEUE_READY_INC_HOST_BUSY = 1, /* scsi_host_queue_ready() */
+	SCSI_HOST_QUEUE_READY_DEC_HOST_BUSY,
+	SCSI_KILL_REQUEST_INC_HOST_BUSY,
+	SCSI_QUEUE_RQ_DEC_HOST_BUSY,
+	SCSI_FINISH_COMMAND_CALLS_UNBUSY,
+	SCSI_QUEUE_INSERT_CALLS_UNBUSY,
+	SCSI_EH_SCMD_ADD_INC_HOST_FAILED,
+	ATA_SCSI_CMD_ERROR_HANDLER_CALLS_EH_FINISH,
+	ATA_EH_QC_COMPLETE_CALLS_EH_FINISH,
+	SAS_EH_FINISH_CMD_CALLS_EH_FINISH,
+	SCSI_EH_GET_SENSE_CALLS_EH_FINISH,
+	SCSI_EH_TEST_DEVICES_CALLS_EH_FINISH,
+	SCSI_EH_ABORT_CMDS_CALLS_EH_FINISH,
+	SCSI_EH_STU_CALLS_EH_FINISH,
+	SCSI_EH_BUS_DEVICE_RESET_CALLS_EH_FINISH,
+	SCSI_EH_TARGET_RESET_CALLS_EH_FINISH,
+	SCSI_EH_BUS_RESET_CALLS_EH_FINISH,
+	SCSI_EH_HOST_RESET_CALLS_EH_FINISH,
+	SCSI_EH_OFFLINE_SDEVS_CALLS_EH_FINISH,
+	ATA_STD_END_EH_ZERO_EH_SCHEDULED,
+	SAS_SCSI_RECOVER_HOST_ZERO_EH_SCHEDULED,
+	ATA_STD_SCHED_EH_CALLS_SCHEDULE_EH,
+	SAS_QUEUE_RESET_CALLS_SCHEDULE_EH,
+	SCSI_EH_WAKEUP_EHANDLER,
+	SCSI_SCHEDULE_EH_CALLS_EH_WAKEUP,
+	SCSI_DEVICE_UNBUSY_CALLS_EH_WAKEUP,
+	SCSI_ERROR_HANDLER_SLEEP,
+	SCSI_ERROR_HANDLER_WAKEUP,
+	SCSI_ERROR_HANDLER_CALLS_HANDLER,
+};
+
+struct scsi_host_log_entry {
+	enum scsi_dbg_type sle_type;
+	enum scsi_host_state sle_shost_state;
+
+	int sle_host_failed;
+	int sle_host_busy;
+	int sle_host_blocked;
+	int sle_host_eh_scheduled;
+
+	struct task_struct *sle_task;
+	char sle_comm[TASK_COMM_LEN];
+
+	struct scsi_device *sle_sdev;
+	struct scsi_cmnd   *sle_cmnd;
+	struct request     *sle_req;
+
+	ktime_t sle_ktime;
+	u64     sle_jiffies;
+};
+
+struct scsi_host_dbg {
+	spinlock_t		   sdbg_lock;
+	struct scsi_host_log_entry sdbg_entries[SCSI_HOST_DBG_N_ENTRIES];
+	int                        sdbg_next_entry;
+};
+
+#define SHOST_TO_SDBG(shost) (shost)->scsi_mq_reserved3
+
+static inline void
+scsi_debug_log(struct Scsi_Host *shost, enum scsi_dbg_type type,
+	       struct scsi_device *sdev, struct scsi_cmnd *cmnd,
+	       struct request *req)
+{
+	struct scsi_host_dbg *s = SHOST_TO_SDBG(shost);
+	struct scsi_host_log_entry *e;
+	unsigned long irq_flags;
+
+	spin_lock_irqsave(&s->sdbg_lock, irq_flags);
+	e = &s->sdbg_entries[s->sdbg_next_entry];
+
+	e->sle_type = type;
+	e->sle_sdev = sdev;
+	e->sle_cmnd = cmnd;
+	e->sle_req  = req;
+
+	e->sle_shost_state       = shost->shost_state;
+	e->sle_host_failed       = shost->host_failed;
+	e->sle_host_busy         = atomic_read(&shost->host_busy);
+	e->sle_host_blocked      = atomic_read(&shost->host_blocked);
+	e->sle_host_eh_scheduled = shost->host_eh_scheduled;
+
+	e->sle_task = current;
+	memcpy(e->sle_comm, current->comm, TASK_COMM_LEN);
+
+	e->sle_ktime   = ktime_get();
+	e->sle_jiffies = jiffies;
+
+	s->sdbg_next_entry++;
+	if (s->sdbg_next_entry == SCSI_HOST_DBG_N_ENTRIES)
+		s->sdbg_next_entry = 0;
+	spin_unlock_irqrestore(&s->sdbg_lock, irq_flags);
+}
+
+static inline void
+scsi_debug_log_cmnd(enum scsi_dbg_type type, struct scsi_cmnd *cmnd)
+{
+	scsi_debug_log(cmnd->device->host, type, cmnd->device, cmnd,
+		       cmnd->request);
+}
+
+static inline void
+scsi_debug_log_shost(enum scsi_dbg_type type, struct Scsi_Host *shost)
+{
+	scsi_debug_log(shost, type, NULL, NULL, NULL);
+}
+
+static inline void
+scsi_debug_log_sdev(enum scsi_dbg_type type, struct scsi_device *sdev)
+{
+	scsi_debug_log(sdev->host, type, sdev, NULL, NULL);
+}
+
+#endif /* _SCSI_DBG_H */
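
Since sdbg_next_entry always points at the slot that will be overwritten
next, it is also the oldest entry once the ring has wrapped.  A hedged
sketch of a dumper built on that invariant (scsi_debug_dump() is
hypothetical, not part of the patch); it relies on enum scsi_dbg_type
starting at 1, so a zeroed slot means "never used":

	/* Hypothetical helper: print the debug ring oldest-first. */
	static void scsi_debug_dump(struct Scsi_Host *shost)
	{
		struct scsi_host_dbg *s = SHOST_TO_SDBG(shost);
		unsigned long flags;
		int i;

		spin_lock_irqsave(&s->sdbg_lock, flags);
		for (i = 0; i < SCSI_HOST_DBG_N_ENTRIES; i++) {
			struct scsi_host_log_entry *e =
				&s->sdbg_entries[(s->sdbg_next_entry + i) %
						 SCSI_HOST_DBG_N_ENTRIES];

			if (!e->sle_type)	/* zeroed slot, never used */
				continue;
			printk(KERN_INFO "%llu: ev %d comm %s busy %d failed %d\n",
			       (unsigned long long)e->sle_jiffies,
			       e->sle_type, e->sle_comm,
			       e->sle_host_busy, e->sle_host_failed);
		}
		spin_unlock_irqrestore(&s->sdbg_lock, flags);
	}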
--- a/drivers/scsi/scsi_error.c
+++ b/drivers/scsi/scsi_error.c
@@ -38,6 +38,7 @@
 #include <scsi/scsi_ioctl.h>
 
 #include "scsi_priv.h"
+#include "scsi_dbg.h"
 #include "scsi_logging.h"
 #include "scsi_transport_api.h"
 
@@ -61,6 +62,7 @@ void scsi_eh_wakeup(struct Scsi_Host *shost)
 {
 	if (atomic_read(&shost->host_busy) == shost->host_failed) {
 		trace_scsi_eh_wakeup(shost);
+		scsi_debug_log_shost(SCSI_EH_WAKEUP_EHANDLER, shost);
 		wake_up_process(shost->ehandler);
 		SCSI_LOG_ERROR_RECOVERY(5, shost_printk(KERN_INFO, shost,
 			"Waking error handler thread\n"));
@@ -82,6 +84,7 @@ void scsi_schedule_eh(struct Scsi_Host *shost)
 	if (scsi_host_set_state(shost, SHOST_RECOVERY) == 0 ||
 	    scsi_host_set_state(shost, SHOST_CANCEL_RECOVERY) == 0) {
 		shost->host_eh_scheduled++;
+		scsi_debug_log_shost(SCSI_SCHEDULE_EH_CALLS_EH_WAKEUP, shost);
 		scsi_eh_wakeup(shost);
 	}
 
@@ -247,6 +250,7 @@ int scsi_eh_scmd_add(struct scsi_cmnd *scmd, int eh_flag)
 		eh_flag &= ~SCSI_EH_CANCEL_CMD;
 	scmd->eh_eflags |= eh_flag;
 	list_add_tail(&scmd->eh_entry, &shost->eh_cmd_q);
+	scsi_debug_log_cmnd(SCSI_EH_SCMD_ADD_INC_HOST_FAILED, scmd);
 	shost->host_failed++;
 	scsi_eh_wakeup(shost);
  out_unlock:
@@ -1215,6 +1219,7 @@ int scsi_eh_get_sense(struct list_head *work_q,
 		else if (rtn != NEEDS_RETRY)
 			continue;
 
+		scsi_debug_log_cmnd(SCSI_EH_GET_SENSE_CALLS_EH_FINISH, scmd);
 		scsi_eh_finish_cmd(scmd, done_q);
 	}
 
@@ -1299,8 +1304,10 @@ static int scsi_eh_test_devices(struct list_head *cmd_list,
 			if (scmd->device == sdev) {
 				if (finish_cmds &&
 				    (try_stu ||
-				     scsi_eh_action(scmd, SUCCESS) == SUCCESS))
+				     scsi_eh_action(scmd, SUCCESS) == SUCCESS)) {
+					scsi_debug_log_cmnd(SCSI_EH_TEST_DEVICES_CALLS_EH_FINISH, scmd);
 					scsi_eh_finish_cmd(scmd, done_q);
+				}
 				else
 					list_move_tail(&scmd->eh_entry, work_q);
 			}
@@ -1354,9 +1361,10 @@ static int scsi_eh_abort_cmds(struct list_head *work_q,
 			return list_empty(work_q);
 		}
 		scmd->eh_eflags &= ~SCSI_EH_CANCEL_CMD;
-		if (rtn == FAST_IO_FAIL)
+		if (rtn == FAST_IO_FAIL) {
+			scsi_debug_log_cmnd(SCSI_EH_ABORT_CMDS_CALLS_EH_FINISH, scmd);
 			scsi_eh_finish_cmd(scmd, done_q);
-		else
+		} else
 			list_move_tail(&scmd->eh_entry, &check_list);
 	}
 
@@ -1434,8 +1442,10 @@ static int scsi_eh_stu(struct Scsi_Host *shost,
 				list_for_each_entry_safe(scmd, next,
 							  work_q, eh_entry) {
 					if (scmd->device == sdev &&
-					    scsi_eh_action(scmd, SUCCESS) == SUCCESS)
+					    scsi_eh_action(scmd, SUCCESS) == SUCCESS) {
+						scsi_debug_log_cmnd(SCSI_EH_STU_CALLS_EH_FINISH, scmd);
 						scsi_eh_finish_cmd(scmd, done_q);
+					}
 				}
 			}
 		} else {
@@ -1499,9 +1509,11 @@ static int scsi_eh_bus_device_reset(struct Scsi_Host *shost,
 				list_for_each_entry_safe(scmd, next,
 							 work_q, eh_entry) {
 					if (scmd->device == sdev &&
-					    scsi_eh_action(scmd, rtn) != FAILED)
+					    scsi_eh_action(scmd, rtn) != FAILED) {
+						scsi_debug_log_cmnd(SCSI_EH_BUS_DEVICE_RESET_CALLS_EH_FINISH, scmd);
 						scsi_eh_finish_cmd(scmd,
 								   done_q);
+					}
 				}
 			}
 		} else {
@@ -1568,9 +1580,10 @@ static int scsi_eh_target_reset(struct Scsi_Host *shost,
 
 			if (rtn == SUCCESS)
 				list_move_tail(&scmd->eh_entry, &check_list);
-			else if (rtn == FAST_IO_FAIL)
+			else if (rtn == FAST_IO_FAIL) {
+				scsi_debug_log_cmnd(SCSI_EH_TARGET_RESET_CALLS_EH_FINISH, scmd);
 				scsi_eh_finish_cmd(scmd, done_q);
-			else
+			} else
 				/* push back on work queue for further processing */
 				list_move(&scmd->eh_entry, work_q);
 		}
@@ -1633,10 +1646,11 @@ static int scsi_eh_bus_reset(struct Scsi_Host *shost,
 		if (rtn == SUCCESS || rtn == FAST_IO_FAIL) {
 			list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
 				if (channel == scmd_channel(scmd)) {
-					if (rtn == FAST_IO_FAIL)
+					if (rtn == FAST_IO_FAIL) {
+						scsi_debug_log_cmnd(SCSI_EH_BUS_RESET_CALLS_EH_FINISH, scmd);
 						scsi_eh_finish_cmd(scmd,
 								   done_q);
-					else
+					} else
 						list_move_tail(&scmd->eh_entry,
 							       &check_list);
 				}
@@ -1679,6 +1693,7 @@ static int scsi_eh_host_reset(struct Scsi_Host *shost,
 			list_splice_init(work_q, &check_list);
 		} else if (rtn == FAST_IO_FAIL) {
 			list_for_each_entry_safe(scmd, next, work_q, eh_entry) {
+				scsi_debug_log_cmnd(SCSI_EH_HOST_RESET_CALLS_EH_FINISH, scmd);
 					scsi_eh_finish_cmd(scmd, done_q);
 			}
 		} else {
@@ -1710,6 +1725,7 @@ static void scsi_eh_offline_sdevs(struct list_head *work_q,
 			 * FIXME: Handle lost cmds.
 			 */
 		}
+		scsi_debug_log_cmnd(SCSI_EH_OFFLINE_SDEVS_CALLS_EH_FINISH, scmd);
 		scsi_eh_finish_cmd(scmd, done_q);
 	}
 	return;
@@ -2198,7 +2214,9 @@ int scsi_error_handler(void *data)
 				shost_printk(KERN_INFO, shost,
 					     "scsi_eh_%d: sleeping\n",
 					     shost->host_no));
+			scsi_debug_log_shost(SCSI_ERROR_HANDLER_SLEEP, shost);
 			schedule();
+			scsi_debug_log_shost(SCSI_ERROR_HANDLER_WAKEUP, shost);
 			continue;
 		}
 
@@ -2223,6 +2241,7 @@ int scsi_error_handler(void *data)
 			continue;
 		}
 
+		scsi_debug_log_shost(SCSI_ERROR_HANDLER_CALLS_HANDLER, shost);
 		if (shost->transportt->eh_strategy_handler)
 			shost->transportt->eh_strategy_handler(shost);
 		else
--- a/drivers/scsi/scsi_lib.c
+++ b/drivers/scsi/scsi_lib.c
@@ -38,6 +38,7 @@
 
 #include "scsi_priv.h"
 #include "scsi_logging.h"
+#include "scsi_dbg.h"
 
 
 #define SG_MEMPOOL_NR		ARRAY_SIZE(scsi_sg_pools)
@@ -155,8 +156,10 @@ static void __scsi_queue_insert(struct scsi_cmnd *cmd, int reason, int unbusy)
 	 * Decrement the counters, since these commands are no longer
 	 * active on the host/device.
 	 */
-	if (unbusy)
+	if (unbusy) {
+		scsi_debug_log_cmnd(SCSI_QUEUE_INSERT_CALLS_UNBUSY, cmd);
 		scsi_device_unbusy(device);
+	}
 
 	/*
 	 * Requeue this command.  It will go before all other commands
@@ -320,6 +323,7 @@ void scsi_device_unbusy(struct scsi_device *sdev)
 	if (unlikely(scsi_host_in_recovery(shost) &&
 		     (shost->host_failed || shost->host_eh_scheduled))) {
 		spin_lock_irqsave(shost->host_lock, flags);
+		scsi_debug_log_shost(SCSI_DEVICE_UNBUSY_CALLS_EH_WAKEUP, shost);
 		scsi_eh_wakeup(shost);
 		spin_unlock_irqrestore(shost->host_lock, flags);
 	}
@@ -1523,6 +1527,8 @@ static inline int scsi_host_queue_ready(struct request_queue *q,
 	if (scsi_host_in_recovery(shost))
 		return 0;
 
+	scsi_debug_log_sdev(SCSI_HOST_QUEUE_READY_INC_HOST_BUSY, sdev);
+
 	busy = atomic_inc_return(&shost->host_busy) - 1;
 	if (atomic_read(&shost->host_blocked) > 0) {
 		if (busy)
@@ -1560,6 +1566,7 @@ starved:
 		list_add_tail(&sdev->starved_entry, &shost->starved_list);
 	spin_unlock_irq(shost->host_lock);
 out_dec:
+	scsi_debug_log_sdev(SCSI_HOST_QUEUE_READY_DEC_HOST_BUSY, sdev);
 	atomic_dec(&shost->host_busy);
 	return 0;
 }
@@ -1619,6 +1626,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q)
 	cmd->result = DID_NO_CONNECT << 16;
 	atomic_inc(&cmd->device->iorequest_cnt);
 
+	scsi_debug_log_cmnd(SCSI_KILL_REQUEST_INC_HOST_BUSY, cmd);
 	/*
 	 * SCSI request completion path will do scsi_device_unbusy(),
 	 * bump busy counts.  To bump the counters, we need to dance
@@ -1948,6 +1956,7 @@ static int scsi_queue_rq(struct blk_mq_hw_ctx *hctx,
 	return BLK_MQ_RQ_QUEUE_OK;
 
 out_dec_host_busy:
+	scsi_debug_log_sdev(SCSI_QUEUE_RQ_DEC_HOST_BUSY, sdev);
 	atomic_dec(&shost->host_busy);
 out_dec_target_busy:
 	if (scsi_target(sdev)->can_queue > 0)
--- a/drivers/scsi/virtio_scsi.c
+++ b/drivers/scsi/virtio_scsi.c
@@ -61,6 +61,13 @@ struct virtio_scsi_vq {
 	struct virtqueue *vq;
 };
 
+#define __check_ret(val) do {				\
+		if ((val) == FAILED) {			\
+			printk(KERN_WARNING		\
+			       "virtscsi_failure\n");	\
+			dump_stack();			\
+		}					\
+	} while (0)
+
 /*
  * Per-target queue state.
  *
@@ -489,6 +496,7 @@ static int virtscsi_add_cmd(struct virtqueue *vq,
 	return virtqueue_add_sgs(vq, sgs, out_num, in_num, cmd, GFP_ATOMIC);
 }
 
+
 static int virtscsi_kick_cmd(struct virtio_scsi_vq *vq,
 			     struct virtio_scsi_cmd *cmd,
 			     size_t req_size, size_t resp_size)
@@ -633,6 +641,7 @@ static int virtscsi_tmf(struct virtio_scsi *vscsi, struct virtio_scsi_cmd *cmd)
 	virtscsi_poll_requests(vscsi);
 
 out:
+	__check_ret(ret);
 	mempool_free(cmd, virtscsi_cmd_pool);
 	return ret;
 }
@@ -644,8 +653,10 @@ static int virtscsi_device_reset(struct scsi_cmnd *sc)
 
 	sdev_printk(KERN_INFO, sc->device, "device reset\n");
 	cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
-	if (!cmd)
+	if (!cmd) {
+		__check_ret(FAILED);
 		return FAILED;
+	}
 
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->sc = sc;
@@ -666,11 +677,12 @@ static int virtscsi_abort(struct scsi_cmnd *sc)
 	struct virtio_scsi *vscsi = shost_priv(sc->device->host);
 	struct virtio_scsi_cmd *cmd;
 
-	scmd_printk(KERN_INFO, sc, "abort\n");
+	scmd_printk(KERN_INFO, sc, "%s: abort\n", __func__);
 	cmd = mempool_alloc(virtscsi_cmd_pool, GFP_NOIO);
-	if (!cmd)
+	if (!cmd) {
+		__check_ret(FAILED);
 		return FAILED;
-
+	}
 	memset(cmd, 0, sizeof(*cmd));
 	cmd->sc = sc;
 	cmd->req.tmf = (struct virtio_scsi_ctrl_tmf_req){
--- a/drivers/staging/android/ashmem.c
+++ b/drivers/staging/android/ashmem.c
@@ -339,29 +339,28 @@ out:
 }
 
 /*
- * ashmem_shrink - our cache shrinker, called from mm/vmscan.c :: shrink_slab
+ * ashmem_shrink - our cache shrinker, called from mm/vmscan.c
  *
- * 'nr_to_scan' is the number of objects (pages) to prune, or 0 to query how
- * many objects (pages) we have in total.
+ * 'nr_to_scan' is the number of objects to scan for freeing.
  *
  * 'gfp_mask' is the mask of the allocation that got us into this mess.
  *
- * Return value is the number of objects (pages) remaining, or -1 if we cannot
+ * Return value is the number of objects freed, or SHRINK_STOP if we cannot
  * proceed without risk of deadlock (due to gfp_mask).
  *
  * We approximate LRU via least-recently-unpinned, jettisoning unpinned partial
  * chunks of ashmem regions LRU-wise one-at-a-time until we hit 'nr_to_scan'
  * pages freed.
  */
-static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
+static unsigned long
+ashmem_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct ashmem_range *range, *next;
+	unsigned long freed = 0;
 
 	/* We might recurse into filesystem code, so bail out if necessary */
-	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
-		return -1;
-	if (!sc->nr_to_scan)
-		return lru_count;
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
 
 	mutex_lock(&ashmem_mutex);
 	list_for_each_entry_safe(range, next, &ashmem_lru_list, lru) {
@@ -374,17 +373,32 @@ static int ashmem_shrink(struct shrinker *s, struct shrink_control *sc)
 		range->purged = ASHMEM_WAS_PURGED;
 		lru_del(range);
 
-		sc->nr_to_scan -= range_size(range);
-		if (sc->nr_to_scan <= 0)
+		freed += range_size(range);
+		if (--sc->nr_to_scan <= 0)
 			break;
 	}
 	mutex_unlock(&ashmem_mutex);
+	return freed;
+}
 
+static unsigned long
+ashmem_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	/*
+	 * Note that lru_count is a count of pages on the LRU, not a count
+	 * of objects on the list.  This means the scan function needs to
+	 * return the number of pages freed, not the number of objects
+	 * scanned.
+	 */
 	return lru_count;
 }
 
 static struct shrinker ashmem_shrinker = {
-	.shrink = ashmem_shrink,
+	.count_objects = ashmem_shrink_count,
+	.scan_objects = ashmem_shrink_scan,
+	/*
+	 * XXX (dchinner): I wish people would comment on why they need
+	 * such significant changes to the default value here.
+	 */
 	.seeks = DEFAULT_SEEKS * 4,
 };
 
@@ -690,11 +704,10 @@ static long ashmem_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 		if (capable(CAP_SYS_ADMIN)) {
 			struct shrink_control sc = {
 				.gfp_mask = GFP_KERNEL,
-				.nr_to_scan = 0,
+				.nr_to_scan = LONG_MAX,
 			};
-			ret = ashmem_shrink(&ashmem_shrinker, &sc);
-			sc.nr_to_scan = ret;
-			ashmem_shrink(&ashmem_shrinker, &sc);
+
+			ashmem_shrink_scan(&ashmem_shrinker, &sc);
 		}
 		break;
 	}
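
The conversion above is an instance of the split shrinker API:
count_objects() reports a cheap estimate of freeable objects, while
scan_objects() does the actual freeing and returns the number freed (or
SHRINK_STOP when it cannot proceed).  A minimal hedged sketch of the
pattern; my_lru_count and my_free_one() are placeholders, and the
shrinker is registered with register_shrinker() as usual:

	static unsigned long my_count(struct shrinker *s,
				      struct shrink_control *sc)
	{
		return my_lru_count;		/* cheap, best-effort */
	}

	static unsigned long my_scan(struct shrinker *s,
				     struct shrink_control *sc)
	{
		unsigned long freed = 0;

		if (!(sc->gfp_mask & __GFP_FS))
			return SHRINK_STOP;	/* avoid FS recursion */
		while (sc->nr_to_scan-- && my_free_one())
			freed++;
		return freed;
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_count,
		.scan_objects	= my_scan,
		.seeks		= DEFAULT_SEEKS,
	};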
--- a/drivers/staging/android/lowmemorykiller.c
+++ b/drivers/staging/android/lowmemorykiller.c
@@ -66,11 +66,20 @@ static unsigned long lowmem_deathpending_timeout;
 			pr_info(x);			\
 	} while (0)
 
-static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
+static unsigned long lowmem_count(struct shrinker *s,
+				  struct shrink_control *sc)
+{
+	return global_page_state(NR_ACTIVE_ANON) +
+		global_page_state(NR_ACTIVE_FILE) +
+		global_page_state(NR_INACTIVE_ANON) +
+		global_page_state(NR_INACTIVE_FILE);
+}
+
+static unsigned long lowmem_scan(struct shrinker *s, struct shrink_control *sc)
 {
 	struct task_struct *tsk;
 	struct task_struct *selected = NULL;
-	int rem = 0;
+	unsigned long rem = 0;
 	int tasksize;
 	int i;
 	short min_score_adj = OOM_SCORE_ADJ_MAX + 1;
@@ -92,19 +101,17 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 			break;
 		}
 	}
-	if (sc->nr_to_scan > 0)
-		lowmem_print(3, "lowmem_shrink %lu, %x, ofree %d %d, ma %hd\n",
-				sc->nr_to_scan, sc->gfp_mask, other_free,
-				other_file, min_score_adj);
-	rem = global_page_state(NR_ACTIVE_ANON) +
-		global_page_state(NR_ACTIVE_FILE) +
-		global_page_state(NR_INACTIVE_ANON) +
-		global_page_state(NR_INACTIVE_FILE);
-	if (sc->nr_to_scan <= 0 || min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
-		lowmem_print(5, "lowmem_shrink %lu, %x, return %d\n",
-			     sc->nr_to_scan, sc->gfp_mask, rem);
-		return rem;
+
+	lowmem_print(3, "lowmem_scan %lu, %x, ofree %d %d, ma %hd\n",
+			sc->nr_to_scan, sc->gfp_mask, other_free,
+			other_file, min_score_adj);
+
+	if (min_score_adj == OOM_SCORE_ADJ_MAX + 1) {
+		lowmem_print(5, "lowmem_scan %lu, %x, return 0\n",
+			     sc->nr_to_scan, sc->gfp_mask);
+		return 0;
 	}
+
 	selected_oom_score_adj = min_score_adj;
 
 	rcu_read_lock();
@@ -152,18 +159,25 @@ static int lowmem_shrink(struct shrinker *s, struct shrink_control *sc)
 			     selected->pid, selected->comm,
 			     selected_oom_score_adj, selected_tasksize);
 		lowmem_deathpending_timeout = jiffies + HZ;
+		/*
+		 * FIXME: lowmemorykiller shouldn't abuse global OOM killer
+		 * infrastructure. There is no real reason why the selected
+		 * task should have access to the memory reserves.
+		 */
+		mark_oom_victim(selected);
 		send_sig(SIGKILL, selected, 0);
-		set_tsk_thread_flag(selected, TIF_MEMDIE);
-		rem -= selected_tasksize;
+		rem += selected_tasksize;
 	}
-	lowmem_print(4, "lowmem_shrink %lu, %x, return %d\n",
+
+	lowmem_print(4, "lowmem_scan %lu, %x, return %lu\n",
 		     sc->nr_to_scan, sc->gfp_mask, rem);
 	rcu_read_unlock();
 	return rem;
 }
 
 static struct shrinker lowmem_shrinker = {
-	.shrink = lowmem_shrink,
+	.scan_objects = lowmem_scan,
+	.count_objects = lowmem_count,
 	.seeks = DEFAULT_SEEKS * 16
 };
 
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -961,7 +961,7 @@ static int zcache_get_swap_cache_page(int type, pgoff_t offset,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 		/* FIXME: is it possible to get here without err==-ENOMEM?
 		 * If not, we can dispense with the do loop, use goto retry */
 	} while (err != -ENOMEM);
--- a/drivers/tty/n_hdlc.c
+++ b/drivers/tty/n_hdlc.c
@@ -158,7 +158,6 @@ struct n_hdlc {
  */
 static void n_hdlc_buf_return(struct n_hdlc_buf_list *buf_list,
 						struct n_hdlc_buf *buf);
-static void n_hdlc_buf_list_init(struct n_hdlc_buf_list *list);
 static void n_hdlc_buf_put(struct n_hdlc_buf_list *list,
 			   struct n_hdlc_buf *buf);
 static struct n_hdlc_buf *n_hdlc_buf_get(struct n_hdlc_buf_list *list);
@@ -844,11 +843,16 @@ static struct n_hdlc *n_hdlc_alloc(void)
 
 	memset(n_hdlc, 0, sizeof(*n_hdlc));
 
-	n_hdlc_buf_list_init(&n_hdlc->rx_free_buf_list);
-	n_hdlc_buf_list_init(&n_hdlc->tx_free_buf_list);
-	n_hdlc_buf_list_init(&n_hdlc->rx_buf_list);
-	n_hdlc_buf_list_init(&n_hdlc->tx_buf_list);
-	
+	spin_lock_init(&n_hdlc->rx_free_buf_list.spinlock);
+	spin_lock_init(&n_hdlc->tx_free_buf_list.spinlock);
+	spin_lock_init(&n_hdlc->rx_buf_list.spinlock);
+	spin_lock_init(&n_hdlc->tx_buf_list.spinlock);
+
+	INIT_LIST_HEAD(&n_hdlc->rx_free_buf_list.list);
+	INIT_LIST_HEAD(&n_hdlc->tx_free_buf_list.list);
+	INIT_LIST_HEAD(&n_hdlc->rx_buf_list.list);
+	INIT_LIST_HEAD(&n_hdlc->tx_buf_list.list);
+
 	/* allocate free rx buffer list */
 	for(i=0;i<DEFAULT_RX_BUF_COUNT;i++) {
 		buf = kmalloc(N_HDLC_BUF_SIZE, GFP_KERNEL);
@@ -876,17 +880,6 @@ static struct n_hdlc *n_hdlc_alloc(void)
 }	/* end of n_hdlc_alloc() */
 
 /**
- * n_hdlc_buf_list_init - initialize specified HDLC buffer list
- * @list - pointer to buffer list
- */
-static void n_hdlc_buf_list_init(struct n_hdlc_buf_list *list)
-{
-	memset(list, 0, sizeof(*list));
-	spin_lock_init(&list->spinlock);
-	INIT_LIST_HEAD(&list->list);
-}	/* end of n_hdlc_buf_list_init() */
-
-/**
  * n_hdlc_buf_return - put the HDLC buffer after the head of the specified list
  * @buf_list - pointer to the buffer list
  * @buf - pointer to the buffer
--- a/drivers/tty/n_tty.c
+++ b/drivers/tty/n_tty.c
@@ -50,6 +50,7 @@
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ratelimit.h>
+#include <linux/ve.h>
 
 
 /* number of characters left in xmit buffer before select has we have room */
@@ -2041,7 +2042,12 @@ static ssize_t n_tty_write(struct tty_struct *tty, struct file *file,
 			retval = -ERESTARTSYS;
 			break;
 		}
+#ifdef CONFIG_VE
+		if (tty_hung_up_p(file) ||
+		    (tty->link && !tty->link->count && !vtty_is_master(tty->link))) {
+#else
 		if (tty_hung_up_p(file) || (tty->link && !tty->link->count)) {
+#endif
 			retval = -EIO;
 			break;
 		}
--- a/drivers/tty/pty.c
+++ b/drivers/tty/pty.c
@@ -13,7 +13,6 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/fcntl.h>
-#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/major.h>
 #include <linux/mm.h>
@@ -25,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/mutex.h>
 
+#include <bc/misc.h>
 
 #ifdef CONFIG_UNIX98_PTYS
 static struct tty_driver *ptm_driver;
@@ -35,6 +35,8 @@ static DEFINE_MUTEX(devpts_mutex);
 static void pty_close(struct tty_struct *tty, struct file *filp)
 {
 	BUG_ON(!tty);
+
+	ub_pty_uncharge(tty);
 	if (tty->driver->subtype == PTY_TYPE_MASTER)
 		WARN_ON(tty->count > 1);
 	else {
@@ -242,9 +244,12 @@ static void pty_flush_buffer(struct tty_struct *tty)
 
 static int pty_open(struct tty_struct *tty, struct file *filp)
 {
+	int retval;
+
 	if (!tty || !tty->link)
 		return -ENODEV;
 
+	retval = -EIO;
 	if (test_bit(TTY_OTHER_CLOSED, &tty->flags))
 		goto out;
 	if (test_bit(TTY_PTY_LOCK, &tty->link->flags))
@@ -252,6 +257,10 @@ static int pty_open(struct tty_struct *tty, struct file *filp)
 	if (tty->driver->subtype == PTY_TYPE_SLAVE && tty->link->count != 1)
 		goto out;
 
+	retval = -ENOMEM;
+	if (ub_pty_charge(tty))
+		goto out;
+
 	clear_bit(TTY_IO_ERROR, &tty->flags);
 	clear_bit(TTY_OTHER_CLOSED, &tty->link->flags);
 	set_bit(TTY_THROTTLED, &tty->flags);
@@ -259,7 +268,7 @@ static int pty_open(struct tty_struct *tty, struct file *filp)
 
 out:
 	set_bit(TTY_IO_ERROR, &tty->flags);
-	return -EIO;
+	return retval;
 }
 
 static void pty_set_termios(struct tty_struct *tty,
@@ -538,6 +547,7 @@ static void __init legacy_pty_init(void)
 	if (tty_register_driver(pty_slave_driver))
 		panic("Couldn't register pty slave driver");
 }
+
 #else
 static inline void legacy_pty_init(void) { }
 #endif
@@ -831,10 +841,535 @@ static void __init unix98_pty_init(void)
 static inline void unix98_pty_init(void) { }
 #endif
 
+#if defined(CONFIG_VE)
+
+/*
+ * VTTY architecture overview.
+ *
+ * With VTTY we make /dev/console and /dev/tty[X] virtualized
+ * per container (note that the real names may vary, because the
+ * kernel itself uses major:minor numbers to distinguish
+ * devices and doesn't care how they are named inside /dev).
+ * /dev/console stands for TTYAUX_MAJOR:1 while /dev/tty[X]
+ * stands for TTY_MAJOR:[0:12].  That said, from inside of
+ * VTTY, /dev/console is the same as /dev/tty0.
+ *
+ * For every container there is a tty map represented by
+ * vtty_map_t.  It carries the @veid of the VE and the
+ * associated slave tty peers.
+ *
+ * map
+ *  veid -> CTID
+ *    vttys -> [ 0 ]
+ *               `- @slave -> link -> @master
+ *             [ 1 ]
+ *               `- @slave -> link -> @master
+ */
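
In code terms, resolving a container console per the picture above boils
down to the following (a hedged restatement of vtty_lookup() below, for
orientation only):

	/* Conceptual sketch -- the real code is vtty_lookup(). */
	vtty_map_t *map = idr_find(&vtty_idr, veid);
	struct tty_struct *slave  = map ? map->vttys[idx] : NULL;
	struct tty_struct *master = slave ? slave->link : NULL;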
+
+#include <linux/ve.h>
+#include <linux/file.h>
+#include <linux/anon_inodes.h>
+
+static struct tty_driver *vttym_driver;
+static struct tty_driver *vttys_driver;
+static DEFINE_IDR(vtty_idr);
+
+static struct file_operations vtty_fops;
+
+#define vtty_match_index(idx)	((idx) >= 0 && (idx) < MAX_NR_VTTY_CONSOLES)
+
+bool vtty_is_master(struct tty_struct *tty)
+{
+	return tty->driver == vttym_driver;
+}
+
+typedef struct {
+	envid_t			veid;
+	struct tty_struct	*vttys[MAX_NR_VTTY_CONSOLES];
+} vtty_map_t;
+
+static vtty_map_t *vtty_map_lookup(envid_t veid)
+{
+	lockdep_assert_held(&tty_mutex);
+	return idr_find(&vtty_idr, veid);
+}
+
+static void vtty_map_set(vtty_map_t *map, struct tty_struct *tty)
+{
+	lockdep_assert_held(&tty_mutex);
+	WARN_ON(map->vttys[tty->index]);
+
+	tty->driver_data = tty->link->driver_data = map;
+	map->vttys[tty->index] = tty;
+}
+
+static void vtty_map_free(vtty_map_t *map)
+{
+	lockdep_assert_held(&tty_mutex);
+	idr_remove(&vtty_idr, map->veid);
+	kfree(map);
+}
+
+static void vtty_map_clear(struct tty_struct *tty)
+{
+	vtty_map_t *map = tty->driver_data;
+
+	lockdep_assert_held(&tty_mutex);
+	if (map) {
+		struct tty_struct *p = map->vttys[tty->index];
+		int i;
+
+		WARN_ON(p != (tty->driver == vttys_driver ? tty : tty->link));
+		map->vttys[tty->index] = NULL;
+		tty->driver_data = tty->link->driver_data = NULL;
+
+		for (i = 0; i < MAX_NR_VTTY_CONSOLES; i++) {
+			if (map->vttys[i])
+				break;
+		}
+
+		if (i >= MAX_NR_VTTY_CONSOLES)
+			vtty_map_free(map);
+	}
+}
+
+static vtty_map_t *vtty_map_alloc(envid_t veid)
+{
+	vtty_map_t *map = kzalloc(sizeof(*map), GFP_KERNEL);
+
+	lockdep_assert_held(&tty_mutex);
+	if (map) {
+		int id;	/* idr_alloc() result; envid_t may be unsigned */
+
+		map->veid = veid;
+		id = idr_alloc(&vtty_idr, map, veid, veid + 1, GFP_KERNEL);
+		if (id < 0) {
+			kfree(map);
+			return ERR_PTR(id);
+		}
+	} else
+		map = ERR_PTR(-ENOMEM);
+	return map;
+}
+
+/*
+ * vttys are never supposed to be opened from inside
+ * of VE0 except via a special ioctl call, so treat
+ * zero as an "unused" sign.
+ */
+static envid_t vtty_context_veid;
+
+static void vtty_set_context(envid_t veid)
+{
+	lockdep_assert_held(&tty_mutex);
+	WARN_ON(!veid);
+	vtty_context_veid = veid;
+}
+
+static void vtty_drop_context(void)
+{
+	lockdep_assert_held(&tty_mutex);
+	vtty_context_veid = 0;
+}
+
+static envid_t vtty_get_context(void)
+{
+	lockdep_assert_held(&tty_mutex);
+	return vtty_context_veid ?: get_exec_env()->veid;
+}
+
+static struct tty_struct *vtty_lookup(struct tty_driver *driver,
+				      struct inode *inode, int idx)
+{
+	vtty_map_t *map = vtty_map_lookup(vtty_get_context());
+	struct tty_struct *tty;
+
+	if (!vtty_match_index(idx))
+		return ERR_PTR(-EIO);
+
+	/*
+	 * Nothing has been opened yet; allocate a new
+	 * tty map together with both peers from scratch
+	 * in the install procedure.
+	 */
+	if (!map)
+		return NULL;
+
+	tty = map->vttys[idx];
+	if (tty) {
+		if (driver == vttym_driver)
+			tty = tty->link;
+		WARN_ON(!tty);
+	}
+	return tty;
+}
+
+static void vtty_standard_install(struct tty_driver *driver,
+				  struct tty_struct *tty)
+{
+	WARN_ON(tty_init_termios(tty));
+
+	tty_driver_kref_get(driver);
+	tty_port_init(tty->port);
+	tty->port->itty = tty;
+}
+
+static struct tty_struct *vtty_install_peer(struct tty_driver *driver,
+					    struct tty_port *port, int index)
+{
+	struct tty_struct *tty;
+
+	tty = alloc_tty_struct(driver, index);
+	if (!tty)
+		return ERR_PTR(-ENOMEM);
+	tty->port = port;
+	vtty_standard_install(driver, tty);
+	return tty;
+}
+
+static int vtty_install(struct tty_driver *driver, struct tty_struct *tty)
+{
+	envid_t veid = vtty_get_context();
+	struct tty_port *peer_port;
+	struct tty_struct *peer;
+	vtty_map_t *map;
+	int ret;
+
+	WARN_ON_ONCE(driver != vttys_driver);
+
+	map = vtty_map_lookup(veid);
+	if (!map) {
+		map = vtty_map_alloc(veid);
+		if (IS_ERR(map))
+			return PTR_ERR(map);
+	}
+
+	tty->port = kzalloc(sizeof(*tty->port), GFP_KERNEL);
+	peer_port = kzalloc(sizeof(*peer_port), GFP_KERNEL);
+	if (!tty->port || !peer_port) {
+		ret = -ENOMEM;
+		goto err_free;
+	}
+
+	peer = vtty_install_peer(vttym_driver, peer_port, tty->index);
+	if (IS_ERR(peer)) {
+		ret = PTR_ERR(peer);
+		goto err_free;
+	}
+
+	vtty_standard_install(vttys_driver, tty);
+	tty->count++;
+
+	tty->link = peer;
+	peer->link = tty;
+
+	vtty_map_set(map, tty);
+	return 0;
+
+err_free:
+	kfree(tty->port);
+	kfree(peer_port);
+	return ret;
+}
+
+static int vtty_open(struct tty_struct *tty, struct file *filp)
+{
+	set_bit(TTY_THROTTLED, &tty->flags);
+	return 0;
+}
+
+static void vtty_close(struct tty_struct *tty, struct file *filp)
+{
+	if (tty->count <= ((tty->driver == vttys_driver) ? 2 : 1)) {
+		wake_up_interruptible(&tty->read_wait);
+		wake_up_interruptible(&tty->write_wait);
+
+		wake_up_interruptible(&tty->link->read_wait);
+		wake_up_interruptible(&tty->link->write_wait);
+	}
+}
+
+static void vtty_shutdown(struct tty_struct *tty)
+{
+	vtty_map_clear(tty);
+}
+
+static int vtty_write(struct tty_struct *tty,
+		      const unsigned char *buf, int count)
+{
+	struct tty_struct *peer = tty->link;
+
+	if (tty->stopped)
+		return 0;
+
+	if (count > 0) {
+		count = tty_insert_flip_string(peer->port, buf, count);
+		if (count) {
+			tty_flip_buffer_push(peer->port);
+			tty_wakeup(tty);
+		} else {
+			/*
+			 * Flush the slave reader if no one is
+			 * actually hooked on.  Otherwise wait
+			 * until the reader fetches all data.
+			 */
+			if (peer->count <
+			    ((tty->driver == vttym_driver) ? 2 : 1))
+				tty_perform_flush(peer, TCIFLUSH);
+		}
+	}
+
+	return count;
+}
+
+static int vtty_write_room(struct tty_struct *tty)
+{
+	struct tty_struct *peer = tty->link;
+
+	if (tty->stopped)
+		return 0;
+
+	if (peer->count <
+	    ((tty->driver == vttym_driver) ? 2 : 1))
+		return 2048;
+
+	return pty_space(peer);
+}
+
+static void vtty_remove(struct tty_driver *driver, struct tty_struct *tty)
+{
+}
+
+static int vtty_resize(struct tty_struct *tty, struct winsize *ws)
+{
+	if (tty->driver == vttym_driver)
+		return pty_resize(tty, ws);
+	return tty_do_resize(tty, ws);
+}
+
+static const struct tty_operations vtty_ops = {
+	.lookup		= vtty_lookup,
+	.install	= vtty_install,
+	.open		= vtty_open,
+	.close		= vtty_close,
+	.shutdown	= vtty_shutdown,
+	.cleanup	= pty_cleanup,
+	.write		= vtty_write,
+	.write_room	= vtty_write_room,
+	.chars_in_buffer= pty_chars_in_buffer,
+	.set_termios	= pty_set_termios,
+	.unthrottle	= pty_unthrottle,
+	.flush_buffer	= pty_flush_buffer,
+	.remove		= vtty_remove,
+	.resize		= vtty_resize,
+};
+
+struct tty_driver *vtty_console_driver(int *index)
+{
+	*index = 0;
+	return vttys_driver;
+}
+
+struct tty_driver *vtty_driver(dev_t dev, int *index)
+{
+	if (MAJOR(dev) == TTY_MAJOR &&
+	    MINOR(dev) <= MAX_NR_VTTY_CONSOLES) {
+		if (MINOR(dev))
+			*index = MINOR(dev) - 1;
+		else
+			*index = 0;
+		return vttys_driver;
+	}
+	return NULL;
+}
+
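+/*
+ * Illustrative mapping (not part of the patch): inside a CT,
+ * TTY_MAJOR minor 1 (/dev/tty1) resolves to vtty index 0,
+ * minor 2 to index 1, and minor 0 falls back to index 0.
+ */
+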
+void vtty_release(struct tty_struct *tty, struct tty_struct *o_tty,
+		  int *tty_closing, int *o_tty_closing)
+{
+	int pty_master;
+	lockdep_assert_held(&tty_mutex);
+
+	if (tty->driver != vttym_driver &&
+	    tty->driver != vttys_driver)
+		return;
+
+	pty_master = (tty->driver == vttym_driver);
+
+	/*
+	 * Do not close the master while a slave is active.
+	 */
+	if (!*o_tty_closing && pty_master)
+		*tty_closing = 0;
+
+	/*
+	 * Do not close the master if we're closing
+	 * a slave that is not the last one, even if
+	 * there are no readers on the master.
+	 */
+	if (*o_tty_closing && !*tty_closing && !pty_master)
+		*o_tty_closing = 0;
+}
+
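+/*
+ * Example scenario (illustrative): with the master and two slaves open,
+ * releasing the master keeps it alive (*tty_closing is cleared) because
+ * a slave is still active; releasing a slave that is not the last one
+ * likewise clears *o_tty_closing so the master's side stays open.
+ */
+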
+static int __init vtty_init(void)
+{
+#define VTTY_DRIVER_ALLOC_FLAGS			\
+	(TTY_DRIVER_REAL_RAW		|	\
+	 TTY_DRIVER_RESET_TERMIOS	|	\
+	 TTY_DRIVER_DYNAMIC_DEV		|	\
+	 TTY_DRIVER_INSTALLED		|	\
+	 TTY_DRIVER_DEVPTS_MEM)
+
+	vttym_driver = tty_alloc_driver(MAX_NR_VTTY_CONSOLES,
+					VTTY_DRIVER_ALLOC_FLAGS);
+	if (IS_ERR(vttym_driver))
+		panic(pr_fmt("Can't allocate master vtty driver\n"));
+
+	vttys_driver = tty_alloc_driver(MAX_NR_VTTY_CONSOLES,
+					VTTY_DRIVER_ALLOC_FLAGS);
+	if (IS_ERR(vttys_driver))
+		panic(pr_fmt("Can't allocate slave vtty driver\n"));
+
+	vttym_driver->driver_name		= "vtty_master";
+	vttym_driver->name			= "vttym";
+	vttym_driver->name_base			= 0;
+	vttym_driver->major			= 0;
+	vttym_driver->minor_start		= 0;
+	vttym_driver->type			= TTY_DRIVER_TYPE_PTY;
+	vttym_driver->subtype			= PTY_TYPE_MASTER;
+	vttym_driver->init_termios		= tty_std_termios;
+	vttym_driver->init_termios.c_iflag	= 0;
+	vttym_driver->init_termios.c_oflag	= 0;
+
+	/* 38400 baud rate, 8-bit char size, enable receiver */
+	vttym_driver->init_termios.c_cflag	= B38400 | CS8 | CREAD;
+	vttym_driver->init_termios.c_lflag	= 0;
+	tty_set_operations(vttym_driver, &vtty_ops);
+
+	vttys_driver->driver_name		= "vtty_slave";
+	vttys_driver->name			= "vttys";
+	vttys_driver->name_base			= 0;
+	vttys_driver->major			= 0;
+	vttys_driver->minor_start		= 0;
+	vttys_driver->type			= TTY_DRIVER_TYPE_PTY;
+	vttys_driver->subtype			= PTY_TYPE_SLAVE;
+	vttys_driver->init_termios		= tty_std_termios;
+	vttys_driver->init_termios.c_cflag	= B38400 | CS8 | CREAD;
+	tty_set_operations(vttys_driver, &vtty_ops);
+
+	if (tty_register_driver(vttym_driver))
+		panic(pr_fmt("Can't register master vtty driver\n"));
+
+	if (tty_register_driver(vttys_driver))
+		panic(pr_fmt("Can't register slave vtty driver\n"));
+
+	tty_default_fops(&vtty_fops);
+	return 0;
+}
+
+int vtty_open_master(envid_t veid, int idx)
+{
+	struct tty_struct *tty;
+	struct file *file;
+	char devname[64];
+	int fd, ret;
+
+	if (!vtty_match_index(idx))
+		return -EIO;
+
+	fd = get_unused_fd_flags(0);
+	if (fd < 0)
+		return fd;
+
+	snprintf(devname, sizeof(devname), "v%utty%d", veid, idx);
+	file = anon_inode_getfile(devname, &vtty_fops, NULL, O_RDWR);
+	if (IS_ERR(file)) {
+		ret = PTR_ERR(file);
+		goto err_put_unused_fd;
+	}
+	nonseekable_open(NULL, file);
+
+	ret = tty_alloc_file(file);
+	if (ret)
+		goto err_fput;
+
+	/*
+	 * The open comes from the ve0 context, so
+	 * set up the VE's context until the master
+	 * is fetched.  This is done under @tty_mutex,
+	 * so no one else can access it while we're
+	 * holding the lock.
+	 */
+	mutex_lock(&tty_mutex);
+	vtty_set_context(veid);
+
+	tty = vtty_lookup(vttym_driver, NULL, idx);
+	if (!tty) {
+		/*
+		 * FIXME: Previously we tested for the
+		 * TTY_CLOSING bit, which is no longer
+		 * here. Review and handle.
+		 */
+		/*
+		 * No live connection for this index, so
+		 * allocate a new master/slave pair.
+		 */
+		tty = tty_init_dev(vttys_driver, idx);
+		if (IS_ERR(tty)) {
+			ret = PTR_ERR(tty);
+			goto err_install;
+		}
+		tty->count--;
+		tty_unlock(tty);
+		tty = tty->link;
+	}
+
+	/* One master at a time */
+	if (tty->count >= 1) {
+		ret = -EBUSY;
+		goto err_install;
+	}
+
+	vtty_drop_context();
+
+	/*
+	 * FIXME: code will be dropped anyway
+	 * WARN_ON(!test_bit(TTY_LDISC, &tty->flags));
+	 */
+
+	/*
+	 * We're the master peer, so increment
+	 * the slave counter as well.
+	 */
+	tty_add_file(tty, file);
+	tty->count++;
+	tty->link->count++;
+	fd_install(fd, file);
+	vtty_open(tty, file);
+
+	mutex_unlock(&tty_mutex);
+	ret = fd;
+out:
+	return ret;
+
+err_install:
+	vtty_drop_context();
+	mutex_unlock(&tty_mutex);
+	tty_free_file(file);
+err_fput:
+	file->f_op = NULL;
+	fput(file);
+err_put_unused_fd:
+	put_unused_fd(fd);
+	goto out;
+}
+EXPORT_SYMBOL(vtty_open_master);
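+
+/*
+ * Minimal usage sketch (illustrative; the surrounding ioctl handler and
+ * the veid source are assumptions, not part of this patch).  The fd is
+ * already installed in the calling ve0 task's file table:
+ *
+ *	fd = vtty_open_master(veid, 0);
+ *	if (fd < 0)
+ *		return fd;
+ *	... read()/write() on fd now proxy the CT console ...
+ */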
+#else
+static int __init vtty_init(void) { return 0; }
+#endif /* CONFIG_VE */
+
 static int __init pty_init(void)
 {
 	legacy_pty_init();
 	unix98_pty_init();
+	vtty_init();
 	return 0;
 }
 module_init(pty_init);
--- a/drivers/tty/sysrq.c
+++ b/drivers/tty/sysrq.c
@@ -45,6 +45,9 @@
 #include <linux/moduleparam.h>
 #include <linux/jiffies.h>
 #include <linux/rcupdate.h>
+#include <linux/ve.h>
+
+#include <bc/vmpages.h>
 
 #include <asm/ptrace.h>
 #include <asm/irq_regs.h>
@@ -360,7 +363,7 @@ static struct sysrq_key_op sysrq_term_op = {
 static void moom_callback(struct work_struct *ignored)
 {
 	out_of_memory(node_zonelist(first_memory_node, GFP_KERNEL), GFP_KERNEL,
-		      0, NULL, true);
+		      0, NULL);
 }
 
 static DECLARE_WORK(moom_work, moom_callback);
@@ -1039,10 +1042,16 @@ static ssize_t write_sysrq_trigger(struct file *file, const char __user *buf,
 {
 	if (count) {
 		char c;
+		struct ve_struct *cur = get_exec_env();
+		static int pnum = 10;
 
 		if (get_user(c, buf))
 			return -EFAULT;
-		__handle_sysrq(c, false);
+		if (ve_is_super(cur)) {
+			__handle_sysrq(c, false);
+		} else if (pnum > 0) {
+			pnum--;
+			printk(KERN_INFO "SysRq: CT#%s sent '%c' magic key.\n",
+			       cur->ve_name, c);
+		}
 	}
 
 	return count;
@@ -1055,7 +1064,7 @@ static const struct file_operations proc_sysrq_trigger_operations = {
 
 static void sysrq_init_procfs(void)
 {
-	if (!proc_create("sysrq-trigger", S_IWUSR, NULL,
+	if (!proc_create("sysrq-trigger", S_ISVTX | S_IWUSR, NULL,
 			 &proc_sysrq_trigger_operations))
 		pr_err("Failed to register proc interface\n");
 }
--- a/drivers/tty/tty_io.c
+++ b/drivers/tty/tty_io.c
@@ -69,7 +69,6 @@
 #include <linux/errno.h>
 #include <linux/signal.h>
 #include <linux/fcntl.h>
-#include <linux/sched.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
@@ -104,6 +103,7 @@
 
 #include <linux/kmod.h>
 #include <linux/nsproxy.h>
+#include <linux/ve.h>
 
 #undef TTY_DEBUG_HANGUP
 
@@ -1555,7 +1555,7 @@ void tty_free_termios(struct tty_struct *tty)
 	/* Stash the termios data */
 	tp = tty->driver->termios[idx];
 	if (tp == NULL) {
-		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL);
+		tp = kmalloc(sizeof(struct ktermios), GFP_KERNEL_ACCOUNT);
 		if (tp == NULL) {
 			pr_warn("tty: no memory to save termios state.\n");
 			return;
@@ -1602,13 +1602,14 @@ static void release_one_tty(struct work_struct *work)
 	struct tty_struct *tty =
 		container_of(work, struct tty_struct, hangup_work);
 	struct tty_driver *driver = tty->driver;
+	struct module *owner = driver->owner;
 
 	if (tty->ops->cleanup)
 		tty->ops->cleanup(tty);
 
 	tty->magic = 0;
 	tty_driver_kref_put(driver);
-	module_put(driver->owner);
+	module_put(owner);
 
 	spin_lock(&tty_files_lock);
 	list_del_init(&tty->tty_files);
@@ -1791,6 +1792,15 @@ int tty_release(struct inode *inode, struct file *filp)
 	while (1) {
 		do_sleep = 0;
 
+		/*
+		 * FIXME: Need to figure out how to prevent closing
+		 * peers while one is still active; unlike traditional
+		 * PTYs, we don't close the master when the slave is
+		 * closed.
+		 */
+#if 0
+		vtty_release(tty, o_tty, &tty_closing, &o_tty_closing);
+#endif
+
 		if (tty->count <= 1) {
 			if (waitqueue_active(&tty->read_wait)) {
 				wake_up_poll(&tty->read_wait, POLLIN);
@@ -1951,6 +1961,19 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
 {
 	struct tty_driver *driver;
 
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+
+	if (!ve_is_super(ve)) {
+		driver = vtty_driver(device, index);
+		if (driver) {
+			if (MINOR(device) == 0)
+				*noctty = 1;
+			return tty_driver_kref_get(driver);
+		}
+	}
+#endif
+
 	switch (device) {
 #ifdef CONFIG_VT
 	case MKDEV(TTY_MAJOR, 0): {
@@ -1963,6 +1986,17 @@ static struct tty_driver *tty_lookup_driver(dev_t device, struct file *filp,
 #endif
 	case MKDEV(TTYAUX_MAJOR, 1): {
 		struct tty_driver *console_driver = console_device(index);
+#ifdef CONFIG_VE
+		if (!ve_is_super(ve)) {
+			console_driver = vtty_console_driver(index);
+			/*
+			 * Reset fops: console_fops might have been
+			 * picked up from inode->i_cdev in
+			 * chrdev_open().
+			 */
+			filp->f_op = &tty_fops;
+		}
+#endif
 		if (console_driver) {
 			driver = tty_driver_kref_get(console_driver);
 			if (driver) {
@@ -2600,6 +2634,28 @@ static int tiocsetd(struct tty_struct *tty, int __user *p)
 }
 
 /**
+ *	tiocgetd	-	get line discipline
+ *	@tty: tty device
+ *	@p: pointer to user data
+ *
+ *	Retrieves the line discipline id directly from the ldisc.
+ *
+ *	Locking: waits for ldisc reference (in case the line discipline
+ *		is changing or the tty is being hungup)
+ */
+
+static int tiocgetd(struct tty_struct *tty, int __user *p)
+{
+	struct tty_ldisc *ld;
+	int ret;
+
+	ld = tty_ldisc_ref_wait(tty);
+	ret = put_user(ld->ops->num, p);
+	tty_ldisc_deref(ld);
+	return ret;
+}
+
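+/*
+ * Userspace view (illustrative): the ioctl itself is unchanged,
+ *
+ *	int d;
+ *	ioctl(fd, TIOCGETD, &d);
+ *
+ * but it now waits for a stable ldisc reference instead of
+ * dereferencing tty->ldisc while it may be changing under us.
+ */
+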
+/**
  *	send_break	-	performed time break
  *	@tty: device to break on
  *	@duration: timeout in mS
@@ -2813,7 +2869,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case TIOCGSID:
 		return tiocgsid(tty, real_tty, p);
 	case TIOCGETD:
-		return put_user(tty->ldisc->ops->num, (int __user *)p);
+		return tiocgetd(tty, p);
 	case TIOCSETD:
 		return tiocsetd(tty, p);
 	case TIOCVHANGUP:
@@ -2869,6 +2925,11 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 			break;
 		}
 		break;
+	case TIOSAK:
+		if (real_tty == tty && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		__do_SAK(real_tty);
+		return 0;
 	}
 	if (tty->ops->ioctl) {
 		retval = (tty->ops->ioctl)(tty, cmd, arg);
@@ -3038,7 +3099,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx)
 {
 	struct tty_struct *tty;
 
-	tty = kzalloc(sizeof(*tty), GFP_KERNEL);
+	tty = kzalloc(sizeof(*tty), GFP_KERNEL_ACCOUNT);
 	if (!tty)
 		return NULL;
 
--- a/drivers/tty/vt/keyboard.c
+++ b/drivers/tty/vt/keyboard.c
@@ -42,6 +42,7 @@
 #include <linux/notifier.h>
 #include <linux/jiffies.h>
 #include <linux/uaccess.h>
+#include <linux/device.h>
 
 #include <asm/irq_regs.h>
 
@@ -1423,7 +1424,7 @@ static bool kbd_match(struct input_handler *handler, struct input_dev *dev)
  * likes it, it can open it and get events from it. In this (kbd_connect)
  * function, we should decide which VT to bind that keyboard to initially.
  */
-static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
+static int __kbd_connect(struct input_handler *handler, struct input_dev *dev,
 			const struct input_device_id *id)
 {
 	struct input_handle *handle;
@@ -1454,13 +1455,82 @@ static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
 	return error;
 }
 
-static void kbd_disconnect(struct input_handle *handle)
+static void __kbd_disconnect(struct input_handle *handle)
 {
 	input_close_device(handle);
 	input_unregister_handle(handle);
 	kfree(handle);
 }
 
+extern struct mutex input_mutex;
+/*
+ * To unbind a keyboard, write "unbind" to kbd_bind.
+ * To bind a keyboard to all TTYs, write "all" to kbd_bind (the default).
+ * Binding a keyboard to a specific TTY is not implemented.
+ */
+static ssize_t kbd_bind_store(struct device *dev,
+			      struct device_attribute *attr,
+			      const char *buf, size_t len)
+{
+	struct list_head *node;
+	int ret = -EINVAL;
+	struct input_dev *idev;
+	char *s;
+
+	if (buf[len] != '\0')
+		return -EINVAL;
+
+#if 0
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+#endif
+	s = strchr(buf, '\n');
+	if (s)
+		*s = '\0';
+
+	mutex_lock(&input_mutex);
+	if (!strcmp(buf, "unbind")) {
+		list_for_each(node, &kbd_handler.h_list) {
+			struct input_handle *handle = container_of(node,
+					struct input_handle, h_node);
+			idev = handle->dev;
+			if (&idev->dev == dev) {
+				__kbd_disconnect(handle);
+				ret = len;
+				break;
+			}
+		}
+	} else if (!strcmp(buf, "all")) {
+		idev = container_of(dev, struct input_dev, dev);
+		ret = __kbd_connect(&kbd_handler, idev, NULL);
+		if (!ret)
+			ret = len;
+	}
+	mutex_unlock(&input_mutex);
+
+	return ret;
+}
+
+static DEVICE_ATTR(kbd_bind, S_IWUSR, NULL, kbd_bind_store);
+
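+/*
+ * Illustrative usage (the sysfs paths are assumptions based on where
+ * input devices appear):
+ *
+ *	echo unbind > /sys/class/input/inputN/kbd_bind
+ *	echo all   > /sys/class/input/inputN/kbd_bind
+ *
+ * detach and re-attach the keyboard handler for that device.
+ */
+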
+static int kbd_connect(struct input_handler *handler, struct input_dev *dev,
+			const struct input_device_id *id)
+{
+	int error;
+	error = device_create_file(&dev->dev, &dev_attr_kbd_bind);
+	if (error)
+		return error;
+	error = __kbd_connect(handler, dev, id);
+	if (error)
+		device_remove_file(&dev->dev, &dev_attr_kbd_bind);
+	return error;
+}
+
+static void kbd_disconnect(struct input_handle *handle)
+{
+	device_remove_file(&handle->dev->dev, &dev_attr_kbd_bind);
+	__kbd_disconnect(handle);
+}
+
 /*
  * Start keyboard handler on the new keyboard by refreshing LED state to
  * match the rest of the system.
--- a/drivers/tty/vt/vc_screen.c
+++ b/drivers/tty/vt/vc_screen.c
@@ -32,12 +32,12 @@
 #include <linux/kbd_kern.h>
 #include <linux/console.h>
 #include <linux/device.h>
-#include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/poll.h>
 #include <linux/signal.h>
 #include <linux/slab.h>
 #include <linux/notifier.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/byteorder.h>
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -30,7 +30,7 @@
 
 #include "vhost.h"
 
-static int experimental_zcopytx;
+static int experimental_zcopytx = 1;
 module_param(experimental_zcopytx, int, 0444);
 MODULE_PARM_DESC(experimental_zcopytx, "Enable Zero Copy TX;"
 		                       " 1 -Enable; 0 - Disable");
@@ -823,7 +823,7 @@ static int vhost_net_open(struct inode *inode, struct file *f)
 	}
 	r = vhost_dev_init(dev, vqs, VHOST_NET_VQ_MAX);
 	if (r < 0) {
-		kfree(n);
+		vhost_net_free(n);
 		kfree(vqs);
 		return r;
 	}
--- a/drivers/xen/tmem.c
+++ b/drivers/xen/tmem.c
@@ -397,13 +397,15 @@ static int xen_tmem_init(void)
 #ifdef CONFIG_CLEANCACHE
 	BUG_ON(sizeof(struct cleancache_filekey) != sizeof(struct tmem_oid));
 	if (tmem_enabled && cleancache) {
-		char *s = "";
-		struct cleancache_ops *old_ops =
-			cleancache_register_ops(&tmem_cleancache_ops);
-		if (old_ops)
-			s = " (WARNING: cleancache_ops overridden)";
-		pr_info("cleancache enabled, RAM provided by Xen Transcendent Memory%s\n",
-			s);
+		int err;
+
+		err = cleancache_register_ops(&tmem_cleancache_ops);
+		if (err)
+			pr_warn("xen-tmem: failed to enable cleancache: %d\n",
+				err);
+		else
+			pr_info("cleancache enabled, RAM provided by "
+				"Xen Transcendent Memory\n");
 	}
 #endif
 #ifdef CONFIG_XEN_SELFBALLOONING
--- a/fs/9p/acl.c
+++ b/fs/9p/acl.c
@@ -320,32 +320,26 @@ static int v9fs_xattr_set_acl(struct dentry *dentry, const char *name,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			umode_t mode = inode->i_mode;
-			retval = posix_acl_equiv_mode(acl, &mode);
-			if (retval < 0)
+			struct iattr iattr;
+
+			retval = posix_acl_update_mode(inode, &iattr.ia_mode, &acl);
+			if (retval)
 				goto err_out;
-			else {
-				struct iattr iattr;
-				if (retval == 0) {
-					/*
-					 * ACL can be represented
-					 * by the mode bits. So don't
-					 * update ACL.
-					 */
-					acl = NULL;
-					value = NULL;
-					size = 0;
-				}
-				/* Updte the mode bits */
-				iattr.ia_mode = ((mode & S_IALLUGO) |
-						 (inode->i_mode & ~S_IALLUGO));
-				iattr.ia_valid = ATTR_MODE;
-				/* FIXME should we update ctime ?
-				 * What is the following setxattr update the
-				 * mode ?
+			if (!acl) {
+				/*
+				 * ACL can be represented
+				 * by the mode bits. So don't
+				 * update ACL.
 				 */
-				v9fs_vfs_setattr_dotl(dentry, &iattr);
+				value = NULL;
+				size = 0;
 			}
+			iattr.ia_valid = ATTR_MODE;
+			/* FIXME: should we update ctime?
+			 * Will the following setxattr update
+			 * the mode?
+			 */
+			v9fs_vfs_setattr_dotl(dentry, &iattr);
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -574,7 +574,7 @@ static int v9fs_init_inode_cache(void)
 	v9fs_inode_cache = kmem_cache_create("v9fs_inode_cache",
 					  sizeof(struct v9fs_inode),
 					  0, (SLAB_RECLAIM_ACCOUNT|
-					      SLAB_MEM_SPREAD),
+					      SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					  v9fs_inode_init_once);
 	if (!v9fs_inode_cache)
 		return -ENOMEM;
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -482,7 +482,7 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid *fid,
 	if (invalidate && (total > 0)) {
 		pg_start = origin >> PAGE_CACHE_SHIFT;
 		pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
-		if (inode->i_mapping && inode->i_mapping->nrpages)
+		if (inode->i_mapping)
 			invalidate_inode_pages2_range(inode->i_mapping,
 						      pg_start, pg_end);
 		*offset += total;
@@ -688,7 +688,7 @@ v9fs_direct_write(struct file *filp, const char __user * data,
 	 * about to write.  We do this *before* the write so that if we fail
 	 * here we fall back to buffered write
 	 */
-	if (mapping->nrpages) {
+	{
 		pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
 		pgoff_t pg_end   = (offset + count - 1) >> PAGE_CACHE_SHIFT;
 
@@ -735,7 +735,6 @@ v9fs_cached_file_write(struct file *filp, const char __user * data,
 static const struct vm_operations_struct v9fs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = v9fs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 
--- a/fs/adfs/super.c
+++ b/fs/adfs/super.c
@@ -271,7 +271,7 @@ static int init_inodecache(void)
 	adfs_inode_cachep = kmem_cache_create("adfs_inode_cache",
 					     sizeof(struct adfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (adfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/affs/super.c
+++ b/fs/affs/super.c
@@ -138,7 +138,7 @@ static int init_inodecache(void)
 	affs_inode_cachep = kmem_cache_create("affs_inode_cache",
 					     sizeof(struct affs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (affs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/afs/super.c
+++ b/fs/afs/super.c
@@ -91,7 +91,7 @@ int __init afs_fs_init(void)
 	afs_inode_cachep = kmem_cache_create("afs_inode_cache",
 					     sizeof(struct afs_vnode),
 					     0,
-					     SLAB_HWCACHE_ALIGN,
+					     SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					     afs_i_init_once);
 	if (!afs_inode_cachep) {
 		printk(KERN_NOTICE "kAFS: Failed to allocate inode cache\n");
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -29,6 +29,7 @@
 #include <linux/slab.h>
 #include <linux/timer.h>
 #include <linux/aio.h>
+#include <linux/ve.h>
 #include <linux/highmem.h>
 #include <linux/workqueue.h>
 #include <linux/security.h>
@@ -122,14 +123,9 @@ struct kioctx {
 
 	struct page		*internal_pages[AIO_RING_PAGES];
 	struct file		*aio_ring_file;
+	struct ve_struct	*ve;
 };
 
-/*------ sysctl variables----*/
-static DEFINE_SPINLOCK(aio_nr_lock);
-unsigned long aio_nr;		/* current system wide number of aio requests */
-unsigned long aio_max_nr = 0x10000; /* system wide maximum number of aio requests */
-/*----end sysctl variables---*/
-
 static struct kmem_cache	*kiocb_cachep;
 static struct kmem_cache	*kioctx_cachep;
 
@@ -518,6 +514,9 @@ static int kiocb_cancel(struct kioctx *ctx, struct kiocb *kiocb,
 static void free_ioctx_rcu(struct rcu_head *head)
 {
 	struct kioctx *ctx = container_of(head, struct kioctx, rcu_head);
+	struct ve_struct *ve = ctx->ve;
+
+	put_ve(ve);
 	kmem_cache_free(kioctx_cachep, ctx);
 }
 
@@ -594,8 +593,12 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 {
 	struct mm_struct *mm = current->mm;
 	struct kioctx *ctx;
+	struct ve_struct *ve = get_exec_env();
 	int err = -ENOMEM;
 
+	/* Kernels since commit e1bdd5f27a5b do this, and CRIU is tuned to that */
+	nr_events *= 2;
+
 	/* Prevent overflows */
 	if ((nr_events > (0x10000000U / sizeof(struct io_event))) ||
 	    (nr_events > (0x10000000U / sizeof(struct kiocb)))) {
@@ -603,7 +606,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-EINVAL);
 	}
 
-	if (!nr_events || (unsigned long)nr_events > aio_max_nr)
+	if (!nr_events || (unsigned long)nr_events > ve->aio_max_nr)
 		return ERR_PTR(-EAGAIN);
 
 	ctx = kmem_cache_zalloc(kioctx_cachep, GFP_KERNEL);
@@ -611,6 +614,7 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		return ERR_PTR(-ENOMEM);
 
 	ctx->max_reqs = nr_events;
+	ctx->ve = get_ve(ve);
 
 	spin_lock_init(&ctx->ctx_lock);
 	spin_lock_init(&ctx->completion_lock);
@@ -631,14 +635,14 @@ static struct kioctx *ioctx_alloc(unsigned nr_events)
 		goto out_freectx;
 
 	/* limit the number of system wide aios */
-	spin_lock(&aio_nr_lock);
-	if (aio_nr + nr_events > aio_max_nr ||
-	    aio_nr + nr_events < aio_nr) {
-		spin_unlock(&aio_nr_lock);
+	spin_lock(&ve->aio_nr_lock);
+	if (ve->aio_nr + ctx->nr_events > ve->aio_max_nr ||
+	    ve->aio_nr + ctx->nr_events < ve->aio_nr) {
+		spin_unlock(&ve->aio_nr_lock);
 		goto out_cleanup;
 	}
-	aio_nr += ctx->max_reqs;
-	spin_unlock(&aio_nr_lock);
+	ve->aio_nr += ctx->nr_events;
+	spin_unlock(&ve->aio_nr_lock);
 
 	/* now link into global list. */
 	spin_lock(&mm->ioctx_lock);
@@ -659,6 +663,7 @@ out_cleanup:
 		vm_munmap(ctx->mmap_base, ctx->mmap_size);
 	aio_free_ring(ctx);
 out_freectx:
+	put_ve(ctx->ve);
 	mutex_unlock(&ctx->ring_lock);
 	put_aio_ring_file(ctx);
 	kmem_cache_free(kioctx_cachep, ctx);
@@ -691,6 +696,8 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		struct completion *requests_done)
 {
 	if (!atomic_xchg(&ctx->dead, 1)) {
+		struct ve_struct *ve = ctx->ve;
+
 		spin_lock(&mm->ioctx_lock);
 		hlist_del_rcu(&ctx->list);
 		spin_unlock(&mm->ioctx_lock);
@@ -702,10 +709,10 @@ static int kill_ioctx(struct mm_struct *mm, struct kioctx *ctx,
 		 * -EAGAIN with no ioctxs actually in use (as far as userspace
 		 *  could tell).
 		 */
-		spin_lock(&aio_nr_lock);
-		BUG_ON(aio_nr - ctx->max_reqs > aio_nr);
-		aio_nr -= ctx->max_reqs;
-		spin_unlock(&aio_nr_lock);
+		spin_lock(&ve->aio_nr_lock);
+		BUG_ON(ve->aio_nr - ctx->nr_events > ve->aio_nr);
+		ve->aio_nr -= ctx->nr_events;
+		spin_unlock(&ve->aio_nr_lock);
 
 		if (ctx->mmap_size)
 			vm_munmap(ctx->mmap_base, ctx->mmap_size);
@@ -932,6 +939,10 @@ void aio_complete(struct kiocb *iocb, long res, long res2)
 		atomic_set(&iocb->ki_users, 0);
 		wake_up_process(iocb->ki_obj.tsk);
 		return;
+	} else if (is_kernel_kiocb(iocb)) {
+		iocb->ki_obj.complete(iocb->ki_user_data, res);
+		aio_kernel_free(iocb);
+		return;
 	}
 
 	/*
@@ -1373,6 +1384,51 @@ static ssize_t aio_setup_single_vector(int rw, struct kiocb *kiocb)
 	return 0;
 }
 
+static ssize_t aio_read_iter(struct kiocb *iocb)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	if (unlikely(!is_kernel_kiocb(iocb)))
+		return -EINVAL;
+
+	if (unlikely(!(file->f_mode & FMODE_READ)))
+		return -EBADF;
+
+	ret = security_file_permission(file, MAY_READ);
+	if (unlikely(ret))
+		return ret;
+
+	if (!file->f_op->read_iter)
+		return -EINVAL;
+
+	return file->f_op->read_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+}
+
+static ssize_t aio_write_iter(struct kiocb *iocb)
+{
+	struct file *file = iocb->ki_filp;
+	ssize_t ret;
+
+	if (unlikely(!is_kernel_kiocb(iocb)))
+		return -EINVAL;
+
+	if (unlikely(!(file->f_mode & FMODE_WRITE)))
+		return -EBADF;
+
+	ret = security_file_permission(file, MAY_WRITE);
+	if (unlikely(ret))
+		return ret;
+
+	if (!file->f_op->write_iter)
+		return -EINVAL;
+
+	file_start_write(file);
+	ret = file->f_op->write_iter(iocb, iocb->ki_iter, iocb->ki_pos);
+	file_end_write(file);
+	return ret;
+}
+
 /*
  * aio_setup_iocb:
  *	Performs the initial checks and aio retry method
@@ -1424,6 +1480,14 @@ rw_common:
 		ret = aio_rw_vect_retry(req, rw, rw_op);
 		break;
 
+	case IOCB_CMD_READ_ITER:
+		ret = aio_read_iter(req);
+		break;
+
+	case IOCB_CMD_WRITE_ITER:
+		ret = aio_write_iter(req);
+		break;
+
 	case IOCB_CMD_FDSYNC:
 		if (!file->f_op->aio_fsync)
 			return -EINVAL;
@@ -1458,6 +1522,89 @@ rw_common:
 	return 0;
 }
 
+/*
+ * This allocates an iocb that will be used to submit and track completion of
+ * an IO that is issued from kernel space.
+ *
+ * The caller is expected to call the appropriate aio_kernel_init_() functions
+ * and then call aio_kernel_submit().  From that point forward progress is
+ * guaranteed by the file system aio method.  Eventually the caller's
+ * completion callback will be called.
+ *
+ * These iocbs are special.  They don't have a context, we don't limit the
+ * number pending, they can't be canceled, and can't be retried.  In the
+ * short term, callers need to avoid operations that might retry, by calling
+ * only new ops which never add retry support.  In the long term
+ * retry-based AIO should be removed.
+ */
+struct kiocb *aio_kernel_alloc(gfp_t gfp)
+{
+	struct kiocb *iocb = kzalloc(sizeof(struct kiocb), gfp);
+
+	if (iocb)
+		iocb->ki_ctx = (void *)-1;
+	return iocb;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_alloc);
+
+void aio_kernel_free(struct kiocb *iocb)
+{
+	kfree(iocb);
+}
+EXPORT_SYMBOL_GPL(aio_kernel_free);
+
+/*
+ * The iter count must be set before calling here.  Some filesystems use
+ * iocb->ki_left as an indicator of the size of an IO.
+ */
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+			  unsigned short op, struct iov_iter *iter, loff_t off)
+{
+	iocb->ki_filp = filp;
+	iocb->ki_iter = iter;
+	iocb->ki_opcode = op;
+	iocb->ki_pos = off;
+	iocb->ki_nbytes = iov_iter_count(iter);
+	iocb->ki_left = iocb->ki_nbytes;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_iter);
+
+void aio_kernel_init_callback(struct kiocb *iocb,
+			      void (*complete)(u64 user_data, long res),
+			      u64 user_data)
+{
+	iocb->ki_obj.complete = complete;
+	iocb->ki_user_data = user_data;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_init_callback);
+
+/*
+ * The iocb is our responsibility once this is called.  The caller must not
+ * reference it.  This comes from aio_setup_iocb() modifying the iocb.
+ *
+ * Callers must be prepared for their iocb completion callback to be called the
+ * moment they enter this function.  The completion callback may be called from
+ * any context.
+ *
+ * Returns: 0: the iocb completion callback will be called with the op result
+ * negative errno: the operation was not submitted and the iocb was freed
+ */
+int aio_kernel_submit(struct kiocb *iocb)
+{
+	int ret;
+
+	BUG_ON(!is_kernel_kiocb(iocb));
+	BUG_ON(!iocb->ki_obj.complete);
+	BUG_ON(!iocb->ki_filp);
+
+	ret = aio_run_iocb(iocb, 0);
+
+	if (ret)
+		aio_kernel_free(iocb);
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(aio_kernel_submit);
+
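+/*
+ * Minimal usage sketch of the API above (illustrative only; the caller,
+ * file and iov_iter setup are assumptions, not part of this patch):
+ *
+ *	static void my_done(u64 data, long res)
+ *	{
+ *		... runs once the IO completes, possibly from any context ...
+ *	}
+ *
+ *	iocb = aio_kernel_alloc(GFP_KERNEL);
+ *	if (!iocb)
+ *		return -ENOMEM;
+ *	aio_kernel_init_iter(iocb, filp, IOCB_CMD_READ_ITER, iter, pos);
+ *	aio_kernel_init_callback(iocb, my_done, 0);
+ *	err = aio_kernel_submit(iocb);	(frees the iocb on error)
+ */
+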
 static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb,
 			 struct iocb *iocb, bool compat)
 {
@@ -1703,3 +1850,73 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id,
 	}
 	return ret;
 }
+
+#ifdef CONFIG_VE
+static bool has_reqs_active(struct kioctx *ctx)
+{
+	unsigned long flags;
+	unsigned nr;
+
+	spin_lock_irqsave(&ctx->completion_lock, flags);
+	nr = atomic_read(&ctx->reqs_active);
+	nr -= ctx->completed_events;
+	spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+	return !!nr;
+}
+
+static int ve_aio_wait_inflight_reqs(struct task_struct *p)
+{
+	struct mm_struct *mm;
+	struct kioctx *ctx;
+	int ret;
+
+	if (p->flags & PF_KTHREAD)
+		return -EINVAL;
+
+	task_lock(p);
+	mm = p->mm;
+	if (mm)
+		atomic_inc(&mm->mm_count);
+	task_unlock(p);
+	if (!mm)
+		return -ESRCH;
+
+again:
+	spin_lock_irq(&mm->ioctx_lock);
+	hlist_for_each_entry_rcu(ctx, &mm->ioctx_list, list) {
+		if (!has_reqs_active(ctx))
+			continue;
+
+		atomic_inc(&ctx->users);
+		spin_unlock_irq(&mm->ioctx_lock);
+
+		ret = wait_event_interruptible(ctx->wait, !has_reqs_active(ctx));
+		put_ioctx(ctx);
+
+		if (ret)
+			goto mmdrop;
+		goto again;
+	}
+	spin_unlock_irq(&mm->ioctx_lock);
+	ret = 0;
+mmdrop:
+	mmdrop(mm);
+	return ret;
+}
+
+int ve_aio_ioctl(struct task_struct *task, unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	switch (cmd) {
+	case VE_AIO_IOC_WAIT_ACTIVE:
+		ret = ve_aio_wait_inflight_reqs(task);
+		break;
+	default:
+		ret = -EINVAL;
+	}
+
+	return ret;
+}
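+
+/*
+ * Illustrative call path (an assumption, not shown in this patch): a
+ * checkpoint/restore tool resolves the target task and issues
+ * VE_AIO_IOC_WAIT_ACTIVE against it, landing in ve_aio_ioctl() above,
+ * which blocks until all of that task's in-flight AIO completes.
+ */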
+#endif
--- a/fs/autofs4/autofs_i.h
+++ b/fs/autofs4/autofs_i.h
@@ -123,6 +123,7 @@ struct autofs_sb_info {
 	struct list_head active_list;
 	struct list_head expiring_list;
 	struct rcu_head rcu;
+	unsigned is32bit:1;
 };
 
 static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb)
--- a/fs/autofs4/dev-ioctl.c
+++ b/fs/autofs4/dev-ioctl.c
@@ -622,7 +622,7 @@ static int _autofs_dev_ioctl(unsigned int command, struct autofs_dev_ioctl __use
 	int err = 0;
 
 	/* only root can play with this */
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST);
--- a/fs/autofs4/init.c
+++ b/fs/autofs4/init.c
@@ -25,6 +25,7 @@ static struct file_system_type autofs_fs_type = {
 	.name		= "autofs",
 	.mount		= autofs_mount,
 	.kill_sb	= autofs4_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("autofs");
 
--- a/fs/autofs4/inode.c
+++ b/fs/autofs4/inode.c
@@ -77,6 +77,10 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root)
 		return 0;
 
 	seq_printf(m, ",fd=%d", sbi->pipefd);
+	if (sbi->pipe)
+		seq_printf(m, ",pipe_ino=%lu", sbi->pipe->f_inode->i_ino);
+	else
+		seq_puts(m, ",pipe_ino=-1");
 	if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID))
 		seq_printf(m, ",uid=%u",
 			from_kuid_munged(&init_user_ns, root_inode->i_uid));
@@ -123,7 +127,7 @@ static const match_table_t tokens = {
 	{Opt_indirect, "indirect"},
 	{Opt_direct, "direct"},
 	{Opt_offset, "offset"},
-	{Opt_err, NULL}
+	{Opt_err, NULL}
 };
 
 static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid,
@@ -309,7 +313,7 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent)
 
 	if (!pipe) {
 		printk("autofs: could not open pipe file descriptor\n");
-		goto fail_dput;
+		goto fail_put_pid;
 	}
 	ret = autofs_prepare_pipe(pipe);
 	if (ret < 0)
@@ -331,6 +335,8 @@ fail_fput:
 	printk("autofs: pipe file descriptor does not contain proper ops\n");
 	fput(pipe);
 	/* fall through */
+fail_put_pid:
+	put_pid(sbi->oz_pgrp);
 fail_dput:
 	dput(root);
 	goto fail_free;
--- a/fs/autofs4/root.c
+++ b/fs/autofs4/root.c
@@ -588,7 +588,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry)
 	struct autofs_info *p_ino;
 	
 	/* This allows root to remove symlinks */
-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+	if (!autofs4_oz_mode(sbi) && !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (atomic_dec_and_test(&ino->count)) {
@@ -836,7 +836,7 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp,
 	     _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT)
 		return -ENOTTY;
 	
-	if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN))
+	if (!autofs4_oz_mode(sbi) && !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	
 	switch(cmd) {
--- a/fs/bad_inode.c
+++ b/fs/bad_inode.c
@@ -121,7 +121,7 @@ static unsigned long bad_file_get_unmapped_area(struct file *file,
 	return -EIO;
 }
 
-static int bad_file_check_flags(int flags)
+static int bad_file_set_flags(struct file *file, int flags)
 {
 	return -EIO;
 }
@@ -166,7 +166,7 @@ static const struct file_operations bad_file_ops =
 	.lock		= bad_file_lock,
 	.sendpage	= bad_file_sendpage,
 	.get_unmapped_area = bad_file_get_unmapped_area,
-	.check_flags	= bad_file_check_flags,
+	.set_flags	= bad_file_set_flags,
 	.flock		= bad_file_flock,
 	.splice_write	= bad_file_splice_write,
 	.splice_read	= bad_file_splice_read,
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -439,7 +439,7 @@ befs_init_inodecache(void)
 	befs_inode_cachep = kmem_cache_create("befs_inode_cache",
 					      sizeof (struct befs_inode_info),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      init_once);
 	if (befs_inode_cachep == NULL) {
 		printk(KERN_ERR "befs_init_inodecache: "
--- a/fs/bfs/inode.c
+++ b/fs/bfs/inode.c
@@ -271,7 +271,7 @@ static int init_inodecache(void)
 	bfs_inode_cachep = kmem_cache_create("bfs_inode_cache",
 					     sizeof(struct bfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (bfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/binfmt_aout.c
+++ b/fs/binfmt_aout.c
@@ -296,12 +296,12 @@ static int load_aout_binary(struct linux_binprm * bprm)
 		if ((ex.a_text & 0xfff || ex.a_data & 0xfff) &&
 		    (N_MAGIC(ex) != NMAGIC) && printk_ratelimit())
 		{
-			printk(KERN_NOTICE "executable not page aligned\n");
+			ve_printk(VE_LOG, KERN_NOTICE "executable not page aligned\n");
 		}
 
 		if ((fd_offset & ~PAGE_MASK) != 0 && printk_ratelimit())
 		{
-			printk(KERN_WARNING 
+			ve_printk(VE_LOG, KERN_WARNING
 			       "fd_offset is not page aligned. Please convert program: %s\n",
 			       bprm->file->f_path.dentry->d_name.name);
 		}
@@ -390,7 +390,7 @@ static int load_aout_library(struct file *file)
 	if ((N_TXTOFF(ex) & ~PAGE_MASK) != 0) {
 		if (printk_ratelimit())
 		{
-			printk(KERN_WARNING 
+			ve_printk(VE_LOG, KERN_WARNING
 			       "N_TXTOFF is not page aligned. Please convert library: %s\n",
 			       file->f_path.dentry->d_name.name);
 		}
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -33,8 +33,8 @@
 #include <linux/elf.h>
 #include <linux/utsname.h>
 #include <linux/coredump.h>
-#include <linux/sched.h>
 #include <linux/dax.h>
+#include <linux/ve.h>
 #include <asm/uaccess.h>
 #include <asm/param.h>
 #include <asm/page.h>
@@ -1530,20 +1530,12 @@ static void do_thread_regset_writeback(struct task_struct *task,
 		regset->writeback(task, regset, 1);
 }
 
-#ifndef PR_REG_SIZE
-#define PR_REG_SIZE(S) sizeof(S)
-#endif
-
 #ifndef PRSTATUS_SIZE
-#define PRSTATUS_SIZE(S) sizeof(S)
-#endif
-
-#ifndef PR_REG_PTR
-#define PR_REG_PTR(S) (&((S)->pr_reg))
+#define PRSTATUS_SIZE(S, R) sizeof(S)
 #endif
 
 #ifndef SET_PR_FPVALID
-#define SET_PR_FPVALID(S, V) ((S)->pr_fpvalid = (V))
+#define SET_PR_FPVALID(S, V, R) ((S)->pr_fpvalid = (V))
 #endif
 
 static int fill_thread_core_info(struct elf_thread_core_info *t,
@@ -1551,6 +1543,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 				 long signr, size_t *total)
 {
 	unsigned int i;
+	unsigned int regset_size = view->regsets[0].n * view->regsets[0].size;
 
 	/*
 	 * NT_PRSTATUS is the one special case, because the regset data
@@ -1559,12 +1552,11 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 	 * We assume that regset 0 is NT_PRSTATUS.
 	 */
 	fill_prstatus(&t->prstatus, t->task, signr);
-	(void) view->regsets[0].get(t->task, &view->regsets[0],
-				    0, PR_REG_SIZE(t->prstatus.pr_reg),
-				    PR_REG_PTR(&t->prstatus), NULL);
+	(void) view->regsets[0].get(t->task, &view->regsets[0], 0, regset_size,
+				    &t->prstatus.pr_reg, NULL);
 
 	fill_note(&t->notes[0], "CORE", NT_PRSTATUS,
-		  PRSTATUS_SIZE(t->prstatus), &t->prstatus);
+		  PRSTATUS_SIZE(t->prstatus, regset_size), &t->prstatus);
 	*total += notesize(&t->notes[0]);
 
 	do_thread_regset_writeback(t->task, &view->regsets[0]);
@@ -1594,7 +1586,8 @@ static int fill_thread_core_info(struct elf_thread_core_info *t,
 						  regset->core_note_type,
 						  size, data);
 				else {
-					SET_PR_FPVALID(&t->prstatus, 1);
+					SET_PR_FPVALID(&t->prstatus,
+							1, regset_size);
 					fill_note(&t->notes[i], "CORE",
 						  NT_PRFPREG, size, data);
 				}
--- a/fs/binfmt_misc.c
+++ b/fs/binfmt_misc.c
@@ -18,7 +18,6 @@
 
 #include <linux/module.h>
 #include <linux/init.h>
-#include <linux/sched.h>
 #include <linux/magic.h>
 #include <linux/binfmts.h>
 #include <linux/slab.h>
@@ -30,6 +29,7 @@
 #include <linux/mount.h>
 #include <linux/syscalls.h>
 #include <linux/fs.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 
@@ -37,9 +37,6 @@ enum {
 	VERBOSE_STATUS = 1 /* make it zero to save 400 bytes kernel memory */
 };
 
-static LIST_HEAD(entries);
-static int enabled = 1;
-
 enum {Enabled, Magic};
 #define MISC_FMT_PRESERVE_ARGV0 (1<<31)
 #define MISC_FMT_OPEN_BINARY (1<<30)
@@ -57,22 +54,30 @@ typedef struct {
 	struct dentry *dentry;
 } Node;
 
-static DEFINE_RWLOCK(entries_lock);
 static struct file_system_type bm_fs_type;
-static struct vfsmount *bm_mnt;
-static int entry_count;
+
+struct binfmt_misc {
+	struct list_head entries;
+	int enabled;
+
+	rwlock_t entries_lock;
+	struct vfsmount *bm_mnt;
+	int entry_count;
+};
+
+#define BINFMT_MISC(sb)		(((struct ve_struct *)(sb)->s_fs_info)->binfmt_misc)
 
 /* 
  * Check if we support the binfmt
  * if we do, return the node, else NULL
  * locking is done in load_misc_binary
  */
-static Node *check_file(struct linux_binprm *bprm)
+static Node *check_file(struct binfmt_misc *bm_data, struct linux_binprm *bprm)
 {
 	char *p = strrchr(bprm->interp, '.');
 	struct list_head *l;
 
-	list_for_each(l, &entries) {
+	list_for_each(l, &bm_data->entries) {
 		Node *e = list_entry(l, Node, list);
 		char *s;
 		int j;
@@ -113,17 +118,18 @@ static int load_misc_binary(struct linux_binprm *bprm)
 	const char *iname_addr = iname;
 	int retval;
 	int fd_binary = -1;
+	struct binfmt_misc *bm_data = get_exec_env()->binfmt_misc;
 
 	retval = -ENOEXEC;
-	if (!enabled)
+	if (!bm_data || !bm_data->enabled)
 		goto _ret;
 
 	/* to keep locking time low, we copy the interpreter string */
-	read_lock(&entries_lock);
-	fmt = check_file(bprm);
+	read_lock(&bm_data->entries_lock);
+	fmt = check_file(bm_data, bprm);
 	if (fmt)
 		strlcpy(iname, fmt->interpreter, BINPRM_BUF_SIZE);
-	read_unlock(&entries_lock);
+	read_unlock(&bm_data->entries_lock);
 	if (!fmt)
 		goto _ret;
 
@@ -490,23 +496,23 @@ static void bm_evict_inode(struct inode *inode)
 	kfree(inode->i_private);
 }
 
-static void kill_node(Node *e)
+static void kill_node(struct binfmt_misc *bm_data, Node *e)
 {
 	struct dentry *dentry;
 
-	write_lock(&entries_lock);
+	write_lock(&bm_data->entries_lock);
 	dentry = e->dentry;
 	if (dentry) {
 		list_del_init(&e->list);
 		e->dentry = NULL;
 	}
-	write_unlock(&entries_lock);
+	write_unlock(&bm_data->entries_lock);
 
 	if (dentry) {
 		drop_nlink(dentry->d_inode);
 		d_drop(dentry);
 		dput(dentry);
-		simple_release_fs(&bm_mnt, &entry_count);
+		simple_release_fs(&bm_data->bm_mnt, &bm_data->entry_count);
 	}
 }
 
@@ -536,16 +542,18 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer,
 	struct dentry *root;
 	Node *e = file_inode(file)->i_private;
 	int res = parse_command(buffer, count);
+	struct super_block *sb = file->f_path.dentry->d_sb;
+	struct binfmt_misc *bm_data = BINFMT_MISC(sb);
 
 	switch (res) {
 		case 1: clear_bit(Enabled, &e->flags);
 			break;
 		case 2: set_bit(Enabled, &e->flags);
 			break;
-		case 3: root = dget(file->f_path.dentry->d_sb->s_root);
+		case 3: root = dget(sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
 
-			kill_node(e);
+			kill_node(bm_data, e);
 
 			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
@@ -570,6 +578,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	struct inode *inode;
 	struct dentry *root, *dentry;
 	struct super_block *sb = file->f_path.dentry->d_sb;
+	struct binfmt_misc *bm_data = BINFMT_MISC(sb);
 	int err = 0;
 
 	e = create_entry(buffer, count);
@@ -594,7 +603,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	if (!inode)
 		goto out2;
 
-	err = simple_pin_fs(&bm_fs_type, &bm_mnt, &entry_count);
+	err = simple_pin_fs(&bm_fs_type, &bm_data->bm_mnt, &bm_data->entry_count);
 	if (err) {
 		iput(inode);
 		inode = NULL;
@@ -606,9 +615,9 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer,
 	inode->i_fop = &bm_entry_operations;
 
 	d_instantiate(dentry, inode);
-	write_lock(&entries_lock);
-	list_add(&e->list, &entries);
-	write_unlock(&entries_lock);
+	write_lock(&bm_data->entries_lock);
+	list_add(&e->list, &bm_data->entries);
+	write_unlock(&bm_data->entries_lock);
 
 	err = 0;
 out2:
@@ -634,7 +643,8 @@ static const struct file_operations bm_register_operations = {
 static ssize_t
 bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 {
-	char *s = enabled ? "enabled\n" : "disabled\n";
+	struct binfmt_misc *bm_data = BINFMT_MISC(file->f_dentry->d_sb);
+	char *s = bm_data->enabled ? "enabled\n" : "disabled\n";
 
 	return simple_read_from_buffer(buf, nbytes, ppos, s, strlen(s));
 }
@@ -642,17 +652,19 @@ bm_status_read(struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
 static ssize_t bm_status_write(struct file * file, const char __user * buffer,
 		size_t count, loff_t *ppos)
 {
+	struct binfmt_misc *bm_data = BINFMT_MISC(file->f_dentry->d_sb);
 	int res = parse_command(buffer, count);
 	struct dentry *root;
 
 	switch (res) {
-		case 1: enabled = 0; break;
-		case 2: enabled = 1; break;
+		case 1: bm_data->enabled = 0; break;
+		case 2: bm_data->enabled = 1; break;
 		case 3: root = dget(file->f_path.dentry->d_sb->s_root);
 			mutex_lock(&root->d_inode->i_mutex);
 
-			while (!list_empty(&entries))
-				kill_node(list_entry(entries.next, Node, list));
+			while (!list_empty(&bm_data->entries))
+				kill_node(bm_data, list_first_entry(
+					&bm_data->entries, Node, list));
 
 			mutex_unlock(&root->d_inode->i_mutex);
 			dput(root);
@@ -669,9 +681,19 @@ static const struct file_operations bm_status_operations = {
 
 /* Superblock handling */
 
+static void bm_put_super(struct super_block *sb)
+{
+	struct binfmt_misc *bm_data = BINFMT_MISC(sb);
+	struct ve_struct *ve = sb->s_fs_info;
+
+	bm_data->enabled = 0;
+	put_ve(ve);
+}
+
 static const struct super_operations s_ops = {
 	.statfs		= simple_statfs,
 	.evict_inode	= bm_evict_inode,
+	.put_super	= bm_put_super,
 };
 
 static int bm_fill_super(struct super_block * sb, void * data, int silent)
@@ -681,16 +703,41 @@ static int bm_fill_super(struct super_block * sb, void * data, int silent)
 		[3] = {"register", &bm_register_operations, S_IWUSR},
 		/* last one */ {""}
 	};
-	int err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
-	if (!err)
-		sb->s_op = &s_ops;
-	return err;
+	struct ve_struct *ve = data;
+	struct binfmt_misc *bm_data = ve->binfmt_misc;
+	int err;
+
+	if (!bm_data) {
+		bm_data = kzalloc(sizeof(struct binfmt_misc), GFP_KERNEL);
+		if (!bm_data)
+			return -ENOMEM;
+
+		INIT_LIST_HEAD(&bm_data->entries);
+		rwlock_init(&bm_data->entries_lock);
+
+		ve->binfmt_misc = bm_data;
+	}
+
+	err = simple_fill_super(sb, BINFMTFS_MAGIC, bm_files);
+	if (err) {
+		/*
+		 * Keep bm_data: ve->binfmt_misc still points to it and
+		 * it may carry entries from a previous mount; freeing
+		 * it here would leave a dangling pointer behind.
+		 */
+		return err;
+	}
+
+	sb->s_op = &s_ops;
+
+	bm_data->enabled = 1;
+	get_ve(ve);
+
+	return 0;
 }
 
 static struct dentry *bm_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
-	return mount_single(fs_type, flags, data, bm_fill_super);
+	if (!current_user_ns_initial() && !capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+	return mount_ns(fs_type, flags, get_exec_env(), bm_fill_super);
 }
 
 static struct linux_binfmt misc_format = {
@@ -703,19 +750,46 @@ static struct file_system_type bm_fs_type = {
 	.name		= "binfmt_misc",
 	.mount		= bm_mount,
 	.kill_sb	= kill_litter_super,
+	.fs_flags	= FS_VIRTUALIZED | FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("binfmt_misc");
 
+static void ve_binfmt_fini(void *data)
+{
+	struct ve_struct *ve = data;
+	struct binfmt_misc *bm_data = ve->binfmt_misc;
+
+	if (!bm_data)
+		return;
+
+	/*
+	 * XXX: Note we don't take any locks here. This is safe as long as
+	 * nobody uses binfmt_misc outside the owner ve.
+	 */
+	while (!list_empty(&bm_data->entries))
+		kill_node(bm_data, list_first_entry(
+			&bm_data->entries, Node, list));
+}
+
+static struct ve_hook ve_binfmt_hook = {
+	.fini		= ve_binfmt_fini,
+	.priority	= HOOK_PRIO_DEFAULT,
+	.owner		= THIS_MODULE,
+};
+
 static int __init init_misc_binfmt(void)
 {
 	int err = register_filesystem(&bm_fs_type);
-	if (!err)
+	if (!err) {
 		insert_binfmt(&misc_format);
+		ve_hook_register(VE_SS_CHAIN, &ve_binfmt_hook);
+	}
 	return err;
 }
 
 static void __exit exit_misc_binfmt(void)
 {
+	ve_hook_unregister(&ve_binfmt_hook);
 	unregister_binfmt(&misc_format);
 	unregister_filesystem(&bm_fs_type);
 }
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -108,12 +108,12 @@ void invalidate_bdev(struct block_device *bdev)
 {
 	struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-	if (mapping->nrpages == 0)
-		return;
-
-	invalidate_bh_lrus();
-	lru_add_drain_all();	/* make sure all lru add caches are flushed */
-	invalidate_mapping_pages(mapping, 0, -1);
+	/* FIXME: Shouldn't we add '|| mapping->nrexceptional' ? */
+	if (mapping->nrpages) {
+		invalidate_bh_lrus();
+		lru_add_drain_all();	/* make sure all lru add caches are flushed */
+		invalidate_mapping_pages(mapping, 0, -1);
+	}
 	/* 99% of the time, we don't need to flush the cleancache on the bdev.
 	 * But, for the strange corners, lets be cautious
 	 */
@@ -257,7 +257,8 @@ struct super_block *freeze_bdev(struct block_device *bdev)
 		 * thaw_bdev drops it.
 		 */
 		sb = get_super(bdev);
-		drop_super(sb);
+		if (sb)
+			drop_super(sb);
 		mutex_unlock(&bdev->bd_fsfreeze_mutex);
 		return sb;
 	}
@@ -679,7 +680,7 @@ void __init bdev_cache_init(void)
 
 	bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
 			0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-				SLAB_MEM_SPREAD|SLAB_PANIC),
+				SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
 			init_once);
 	err = register_filesystem(&bd_type);
 	if (err)
@@ -1235,12 +1236,19 @@ int check_disk_change(struct block_device *bdev)
 
 EXPORT_SYMBOL(check_disk_change);
 
+void bd_write_size(struct block_device *bdev, loff_t size)
+{
+	i_size_write(bdev->bd_inode, size);
+	blk_cbt_update_size(bdev);
+}
+EXPORT_SYMBOL(bd_write_size);
+
 void bd_set_size(struct block_device *bdev, loff_t size)
 {
 	unsigned bsize = bdev_logical_block_size(bdev);
 
 	mutex_lock(&bdev->bd_inode->i_mutex);
-	i_size_write(bdev->bd_inode, size);
+	bd_write_size(bdev, size);
 	mutex_unlock(&bdev->bd_inode->i_mutex);
 	while (bsize < PAGE_CACHE_SIZE) {
 		if (size & bsize)
@@ -1273,6 +1281,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part)
 		perm |= MAY_READ;
 	if (mode & FMODE_WRITE)
 		perm |= MAY_WRITE;
+	if (mode & FMODE_MOUNT)
+		perm |= MAY_MOUNT;
 	/*
 	 * hooks: /n/, see "layering violations".
 	 */
--- a/fs/btrfs/acl.c
+++ b/fs/btrfs/acl.c
@@ -111,11 +111,9 @@ static int btrfs_set_acl(struct btrfs_trans_handle *trans,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			ret = posix_acl_equiv_mode(acl, &inode->i_mode);
-			if (ret < 0)
+			ret = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+			if (ret)
 				return ret;
-			if (ret == 0)
-				acl = NULL;
 		}
 		ret = 0;
 		break;
--- a/fs/btrfs/async-thread.c
+++ b/fs/btrfs/async-thread.c
@@ -70,6 +70,14 @@ void btrfs_##name(struct work_struct *arg)				\
 	normal_work_helper(work);					\
 }
 
+bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq)
+{
+	int thresh = wq->normal->thresh != NO_THRESHOLD ?
+		wq->normal->thresh : num_possible_cpus();
+
+	return atomic_read(&wq->normal->pending) > thresh * 2;
+}
+
 BTRFS_WORK_HELPER(worker_helper);
 BTRFS_WORK_HELPER(delalloc_helper);
 BTRFS_WORK_HELPER(flush_delalloc_helper);
--- a/fs/btrfs/async-thread.h
+++ b/fs/btrfs/async-thread.h
@@ -79,4 +79,5 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
 void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
 void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
 void btrfs_set_work_high_priority(struct btrfs_work *work);
+bool btrfs_workqueue_normal_congested(struct btrfs_workqueue *wq);
 #endif
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -1375,7 +1375,8 @@ release_path:
 	total_done++;
 
 	btrfs_release_prepared_delayed_node(delayed_node);
-	if (async_work->nr == 0 || total_done < async_work->nr)
+	if ((async_work->nr == 0 && total_done < BTRFS_DELAYED_WRITEBACK) ||
+	    total_done < async_work->nr)
 		goto again;
 
 free_path:
@@ -1391,7 +1392,8 @@ static int btrfs_wq_run_delayed_node(struct btrfs_delayed_root *delayed_root,
 {
 	struct btrfs_async_delayed_work *async_work;
 
-	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND)
+	if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND ||
+	    btrfs_workqueue_normal_congested(fs_info->delayed_workers))
 		return 0;
 
 	async_work = kmalloc(sizeof(*async_work), GFP_NOFS);
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2129,7 +2129,6 @@ out:
 static const struct vm_operations_struct btrfs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= btrfs_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -9186,7 +9186,8 @@ int btrfs_init_cachep(void)
 {
 	btrfs_inode_cachep = kmem_cache_create("btrfs_inode",
 			sizeof(struct btrfs_inode), 0,
-			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, init_once);
+			SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD | SLAB_ACCOUNT,
+			init_once);
 	if (!btrfs_inode_cachep)
 		goto fail;
 
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -635,6 +635,11 @@ static void __set_page_dirty(struct page *page,
 		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
+		if (mapping_cap_account_dirty(mapping) &&
+				!radix_tree_prev_tag_get(
+					&mapping->page_tree,
+					PAGECACHE_TAG_DIRTY))
+			ub_io_account_dirty(mapping);
 	}
 	spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
@@ -2931,7 +2936,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
 
 /*
  * This allows us to do IO even on the odd last sectors
- * of a device, even if the bh block size is some multiple
+ * of a device, even if the block size is some multiple
  * of the physical sector size.
  *
  * We'll just truncate the bio to the size of the device,
@@ -2941,10 +2946,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err)
  * errors, this only handles the "we need to be able to
  * do IO at the final sector" case.
  */
-static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
+void guard_bio_eod(int rw, struct bio *bio)
 {
 	sector_t maxsector;
-	unsigned bytes;
+	struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
+	unsigned truncated_bytes;
 
 	maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9;
 	if (!maxsector)
@@ -2959,23 +2965,21 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh)
 		return;
 
 	maxsector -= bio->bi_sector;
-	bytes = bio->bi_size;
-	if (likely((bytes >> 9) <= maxsector))
+	if (likely((bio->bi_size >> 9) <= maxsector))
 		return;
 
-	/* Uhhuh. We've got a bh that straddles the device size! */
-	bytes = maxsector << 9;
+	/* Uhhuh. We've got a bio that straddles the device size! */
+	truncated_bytes = bio->bi_size - (maxsector << 9);
 
 	/* Truncate the bio.. */
-	bio->bi_size = bytes;
-	bio->bi_io_vec[0].bv_len = bytes;
+	bio->bi_size -= truncated_bytes;
+	BUG_ON(truncated_bytes > bvec->bv_len);
+	bvec->bv_len -= truncated_bytes;
 
 	/* ..and clear the end of the buffer for reads */
 	if ((rw & RW_MASK) == READ) {
-		void *kaddr = kmap_atomic(bh->b_page);
-		memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes);
-		kunmap_atomic(kaddr);
-		flush_dcache_page(bh->b_page);
+		zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len,
+				truncated_bytes);
 	}
 }
 
@@ -3016,7 +3020,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags)
 	bio->bi_flags |= bio_flags;
 
 	/* Take care of bh's that straddle the end of the device */
-	guard_bh_eod(rw, bio, bh);
+	guard_bio_eod(rw, bio);
 
 	if (buffer_meta(bh))
 		rw |= REQ_META;
--- a/fs/ceph/addr.c
+++ b/fs/ceph/addr.c
@@ -1649,7 +1649,6 @@ out:
 static struct vm_operations_struct ceph_vmops = {
 	.fault		= ceph_filemap_fault,
 	.page_mkwrite	= ceph_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int ceph_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -40,8 +40,8 @@
  */
 static size_t dio_get_pagev_size(const struct iov_iter *it)
 {
-    const struct iovec *iov = it->iov;
-    const struct iovec *iovend = iov + it->nr_segs;
+    const struct iovec *iov = iov_iter_iovec(it);
+    size_t total = iov_iter_count(it);
     size_t size;
 
     size = iov->iov_len - it->iov_offset;
@@ -50,8 +50,10 @@ static size_t dio_get_pagev_size(const struct iov_iter *it)
      * and the next base are page aligned.
      */
     while (PAGE_ALIGNED((iov->iov_base + iov->iov_len)) &&
-           (++iov < iovend && PAGE_ALIGNED((iov->iov_base)))) {
-        size += iov->iov_len;
+           PAGE_ALIGNED(((iov++)->iov_base))) {
+        size_t n = min(iov->iov_len, total);
+        size += n;
+        total -= n;
     }
     dout("dio_get_pagevlen len = %zu\n", size);
     return size;
@@ -71,7 +73,7 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
 	struct page **pages;
 	int ret = 0, idx, npages;
 
-	align = (unsigned long)(it->iov->iov_base + it->iov_offset) &
+	align = (unsigned long)(iov_iter_iovec(it)->iov_base + it->iov_offset) &
 		(PAGE_SIZE - 1);
 	npages = calc_pages_for(align, nbytes);
 	pages = kmalloc(sizeof(*pages) * npages, GFP_KERNEL);
@@ -82,10 +84,11 @@ dio_get_pages_alloc(const struct iov_iter *it, size_t nbytes,
 	}
 
 	for (idx = 0; idx < npages; ) {
-		void __user *data = tmp_it.iov->iov_base + tmp_it.iov_offset;
+		struct iovec *tmp_iov = iov_iter_iovec(&tmp_it);
+		void __user *data = tmp_iov->iov_base + tmp_it.iov_offset;
 		size_t off = (unsigned long)data & (PAGE_SIZE - 1);
 		size_t len = min_t(size_t, nbytes,
-				   tmp_it.iov->iov_len - tmp_it.iov_offset);
+				   tmp_iov->iov_len - tmp_it.iov_offset);
 		int n = (len + off + PAGE_SIZE - 1) >> PAGE_SHIFT;
 		ret = get_user_pages_fast((unsigned long)data, n, write,
 					   pages + idx);
@@ -522,10 +525,9 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i,
 		size_t left = len = ret;
 
 		while (left) {
-			void __user *data = i->iov[0].iov_base +
-					    i->iov_offset;
-			l = min(i->iov[0].iov_len - i->iov_offset,
-				left);
+			struct iovec *iov = (struct iovec *)i->data;
+			void __user *data = iov->iov_base + i->iov_offset;
+			l = min(iov->iov_len - i->iov_offset, left);
 
 			ret = ceph_copy_page_vector_to_user(&pages[k],
 							    data, off, l);
@@ -1120,8 +1122,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
 			zero_user_segment(inline_page, inline_len, end);
 
 		while (left) {
-			void __user *udata = i->iov->iov_base + i->iov_offset;
-			size_t n = min(i->iov->iov_len - i->iov_offset, left);
+			struct iovec *iov = iov_iter_iovec(i);
+			void __user *udata = iov->iov_base + i->iov_offset;
+			size_t n = min(iov->iov_len - i->iov_offset, left);
 
 			if (__copy_to_user(udata, kdata, n)) {
 				ret = -EFAULT;
@@ -1138,8 +1141,9 @@ static ssize_t inline_to_iov(struct kiocb *iocb, struct iov_iter *i,
 		size_t left = min_t(loff_t, iocb->ki_pos + len, i_size) - pos;
 
 		while (left) {
-			void __user *udata = i->iov->iov_base + i->iov_offset;
-			size_t n = min(i->iov->iov_len - i->iov_offset, left);
+			struct iovec *iov = (struct iovec *)i->data;
+			void __user *udata = iov->iov_base + i->iov_offset;
+			size_t n = min(iov->iov_len - i->iov_offset, left);
 
 			if (__clear_user(udata, n)) {
 				ret = -EFAULT;
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -634,8 +634,8 @@ static int __init init_caches(void)
 	ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
 				      sizeof(struct ceph_inode_info),
 				      __alignof__(struct ceph_inode_info),
-				      (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
-				      ceph_inode_init_once);
+				      SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				      SLAB_ACCOUNT, ceph_inode_init_once);
 	if (ceph_inode_cachep == NULL)
 		return -ENOMEM;
 
--- a/fs/char_dev.c
+++ b/fs/char_dev.c
@@ -21,6 +21,7 @@
 #include <linux/mutex.h>
 #include <linux/backing-dev.h>
 #include <linux/tty.h>
+#include <linux/device_cgroup.h>
 
 #include "internal.h"
 
@@ -72,8 +73,12 @@ void chrdev_show(struct seq_file *f, off_t offset)
 
 	if (offset < CHRDEV_MAJOR_HASH_SIZE) {
 		mutex_lock(&chrdevs_lock);
-		for (cd = chrdevs[offset]; cd; cd = cd->next)
+		for (cd = chrdevs[offset]; cd; cd = cd->next) {
+			if (!devcgroup_device_visible(S_IFCHR, cd->major,
+						cd->baseminor, cd->minorct))
+				continue;
 			seq_printf(f, "%3d %s\n", cd->major, cd->name);
+		}
 		mutex_unlock(&chrdevs_lock);
 	}
 }
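
Together with the compat_sys_ustat() hunk below, this makes device
visibility honor the device cgroup: /proc/devices listings are filtered
with devcgroup_device_visible(), and direct per-device queries are gated
with devcgroup_device_permission(). Both helpers come from this patch
set's <linux/device_cgroup.h>, not from an upstream API; a hedged sketch
of the query-side gate:

	#include <linux/device_cgroup.h>	/* patch-provided helpers */

	/*
	 * Sketch: refuse to report statistics for a block device the
	 * caller's device cgroup may not read.  Assumes the patch's
	 * devcgroup_device_permission() returns 0 or a -errno.
	 */
	static int may_ustat_dev(dev_t kdev)
	{
		return devcgroup_device_permission(S_IFBLK, kdev, MAY_READ);
	}
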
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -1063,7 +1063,7 @@ cifs_init_inodecache(void)
 	cifs_inode_cachep = kmem_cache_create("cifs_inode_cache",
 					      sizeof(struct cifsInodeInfo),
 					      0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					      cifs_init_once);
 	if (cifs_inode_cachep == NULL)
 		return -ENOMEM;
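
The SLAB_ACCOUNT addition here and in the ceph hunk above recurs
throughout this series: OR the flag into kmem_cache_create() so that
objects from the cache are charged to the kmem counters of the allocating
task's memory cgroup. Sketched for a hypothetical filesystem "foo"
(foo_inode_info and foo_init_once are stand-ins):

	static struct kmem_cache *foo_inode_cachep;

	static int __init foo_init_inodecache(void)
	{
		foo_inode_cachep = kmem_cache_create("foo_inode_cache",
					sizeof(struct foo_inode_info), 0,
					SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
					SLAB_ACCOUNT,	/* charge to memcg */
					foo_init_once);
		return foo_inode_cachep ? 0 : -ENOMEM;
	}
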
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -2434,8 +2434,9 @@ wdata_fill_from_iovec(struct cifs_writedata *wdata, struct iov_iter *from,
 	save_len = cur_len;
 	for (i = 0; i < nr_pages; i++) {
 		bytes = min_t(const size_t, cur_len, PAGE_SIZE);
-		copied = copy_page_from_iter(wdata->pages[i], 0, bytes, from);
+		copied = iov_iter_copy_from_user(wdata->pages[i], from, 0, bytes);
 		cur_len -= copied;
+		iov_iter_advance(from, copied);
 		/*
 		 * If we didn't copy as much as we expected, then that
 		 * may mean we trod into an unmapped area. Stop copying
@@ -2852,8 +2853,10 @@ cifs_readdata_to_iov(struct cifs_readdata *rdata, struct iov_iter *iter)
 	for (i = 0; i < rdata->nr_pages; i++) {
 		struct page *page = rdata->pages[i];
 		size_t copy = min_t(size_t, remaining, PAGE_SIZE);
-		size_t written = copy_page_to_iter(page, 0, copy, iter);
+		size_t written = iov_iter_copy_to_user(page, iter, 0, copy);
+
 		remaining -= written;
+		iov_iter_advance(iter, written);
 		if (written < copy && iov_iter_count(iter) > 0)
 			break;
 	}
@@ -3252,7 +3255,6 @@ cifs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static struct vm_operations_struct cifs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = cifs_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 int cifs_file_strict_mmap(struct file *file, struct vm_area_struct *vma)
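
The two cifs hunks above replace copy_page_from_iter() and
copy_page_to_iter(), which copy and advance the iterator in one call,
with this branch's older iov_iter_copy_from_user()/iov_iter_copy_to_user()
helpers plus an explicit iov_iter_advance(). A sketch of the split
pattern, assuming the (page, iter, offset, bytes) signature used above:

	/*
	 * Copy @bytes into @page from the user memory behind @from, then
	 * advance the iterator by what was actually copied so the next
	 * round resumes at the right place.  Mirrors the one-call
	 * copy_page_from_iter() semantics.
	 */
	static size_t fill_page_from_iter(struct page *page, size_t bytes,
					  struct iov_iter *from)
	{
		size_t copied = iov_iter_copy_from_user(page, from, 0, bytes);

		iov_iter_advance(from, copied);
		return copied;
	}
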
--- a/fs/cifs/inode.c
+++ b/fs/cifs/inode.c
@@ -22,7 +22,6 @@
 #include <linux/stat.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
-#include <linux/freezer.h>
 #include <asm/div64.h>
 #include "cifsfs.h"
 #include "cifspdu.h"
@@ -1874,7 +1873,7 @@ cifs_invalidate_mapping(struct inode *inode)
 static int
 cifs_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
--- a/fs/coda/inode.c
+++ b/fs/coda/inode.c
@@ -76,9 +76,9 @@ static void init_once(void *foo)
 int coda_init_inodecache(void)
 {
 	coda_inode_cachep = kmem_cache_create("coda_inode_cache",
-				sizeof(struct coda_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct coda_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (coda_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -48,6 +48,7 @@
 #include <linux/slab.h>
 #include <linux/pagemap.h>
 #include <linux/aio.h>
+#include <linux/device_cgroup.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -68,6 +69,18 @@ int compat_printk(const char *fmt, ...)
 	return ret;
 }
 
+int ve_compat_printk(int dst, const char *fmt, ...)
+{
+	va_list ap;
+	int ret;
+	if (!compat_log)
+		return 0;
+	va_start(ap, fmt);
+	ret = ve_vprintk(dst, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+
 /*
  * Not all architectures have sys_utime, so implement this in terms
  * of sys_utimes.
@@ -333,9 +346,16 @@ asmlinkage long compat_sys_fstatfs64(unsigned int fd, compat_size_t sz, struct c
  */
 asmlinkage long compat_sys_ustat(unsigned dev, struct compat_ustat __user *u)
 {
+	dev_t kdev = new_decode_dev(dev);
 	struct compat_ustat tmp;
 	struct kstatfs sbuf;
-	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	int err;
+
+	err = devcgroup_device_permission(S_IFBLK, kdev, MAY_READ);
+	if (err)
+		return err;
+
+	err = vfs_ustat(kdev, &sbuf);
 	if (err)
 		return err;
 
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -1,6 +1,7 @@
 #include <linux/slab.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/freezer.h>
 #include <linux/mm.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
@@ -32,6 +33,7 @@
 #include <linux/pipe_fs_i.h>
 #include <linux/oom.h>
 #include <linux/compat.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -45,7 +47,6 @@
 #include <trace/events/sched.h>
 
 int core_uses_pid;
-char core_pattern[CORENAME_MAX_SIZE] = "core";
 unsigned int core_pipe_limit;
 
 struct core_name {
@@ -152,7 +153,7 @@ put_exe_file:
 static int format_corename(struct core_name *cn, struct coredump_params *cprm)
 {
 	const struct cred *cred = current_cred();
-	const char *pat_ptr = core_pattern;
+	const char *pat_ptr = get_exec_env()->core_pattern;
 	int ispipe = (*pat_ptr == '|');
 	int pid_in_pattern = 0;
 	int err = 0;
@@ -388,7 +389,9 @@ static int coredump_wait(int exit_code, struct core_state *core_state)
 	if (core_waiters > 0) {
 		struct core_thread *ptr;
 
+		freezer_do_not_count();
 		wait_for_completion(&core_state->startup);
+		freezer_count();
 		/*
 		 * Wait for all the threads to become inactive, so that
 		 * all the thread context (extended register state, like
@@ -530,6 +533,9 @@ void do_coredump(siginfo_t *siginfo)
 	if (!__get_dumpable(cprm.mm_flags))
 		goto fail;
 
+	/* Avoid dumping sensitive tasks */
+	if (mm->vps_dumpable != VD_PTRACE_COREDUMP)
+		goto fail;
 	cred = prepare_creds();
 	if (!cred)
 		goto fail;
@@ -557,7 +563,6 @@ void do_coredump(siginfo_t *siginfo)
 	if (ispipe) {
 		int dump_count;
 		char **helper_argv;
-		struct subprocess_info *sub_info;
 
 		if (ispipe < 0) {
 			printk(KERN_WARNING "format_corename failed\n");
@@ -605,12 +610,9 @@ void do_coredump(siginfo_t *siginfo)
 		}
 
 		retval = -ENOMEM;
-		sub_info = call_usermodehelper_setup(helper_argv[0],
-						helper_argv, NULL, GFP_KERNEL,
-						umh_pipe_setup, NULL, &cprm);
-		if (sub_info)
-			retval = call_usermodehelper_exec(sub_info,
-							  UMH_WAIT_EXEC);
+		retval = call_usermodehelper_fns_ve(get_exec_env(), helper_argv[0],
+		                                    helper_argv, NULL, UMH_WAIT_EXEC,
+						    umh_pipe_setup, NULL, &cprm);
 
 		argv_free(helper_argv);
 		if (retval) {
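
The coredump changes replace the global core_pattern buffer with a
per-container pattern hanging off the VE, which format_corename() reads
through get_exec_env(); a leading '|' still selects the pipe-to-helper
mode, and the helper is then spawned inside the dumping task's container
via call_usermodehelper_fns_ve(). A hedged sketch of the lookup, assuming
this patch set's struct ve_struct carries the core_pattern array shown in
the hunk:

	/*
	 * Sketch (assumes ve_struct::core_pattern from this patch set):
	 * fetch the pattern of the container the current task runs in
	 * and report whether it requests piping the dump to a helper.
	 */
	static const char *ve_core_pattern(bool *ispipe)
	{
		const char *pat = get_exec_env()->core_pattern;

		*ispipe = (*pat == '|');
		return pat;
	}
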
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -35,11 +35,16 @@
 #include <linux/hardirq.h>
 #include <linux/bit_spinlock.h>
 #include <linux/rculist_bl.h>
+#include <linux/kasan.h>
 #include <linux/prefetch.h>
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
+#include <linux/vzstat.h>
+#include <linux/ve.h>
 #include "internal.h"
 #include "mount.h"
 
 /*
  * Usage:
  * dcache->d_inode->i_lock protects:
@@ -48,7 +53,7 @@
  *   - the dcache hash table
  * s_anon bl list spinlock protects:
  *   - the s_anon list (see __d_drop)
- * dcache_lru_lock protects:
+ * dentry->d_sb->s_dentry_lru_lock protects:
  *   - the dcache lru lists and counters
  * d_lock protects:
  *   - d_flags
@@ -63,7 +68,7 @@
  * Ordering:
  * dentry->d_inode->i_lock
  *   dentry->d_lock
- *     dcache_lru_lock
+ *     dentry->d_sb->s_dentry_lru_lock
  *     dcache_hash_bucket lock
  *     s_anon lock
  *
@@ -81,12 +86,13 @@
 int sysctl_vfs_cache_pressure __read_mostly = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
-static __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lru_lock);
+int sysctl_vfs_cache_min_ratio __read_mostly = 2;
+
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(rename_lock);
 
-static struct kmem_cache *dentry_cache __read_mostly;
+struct kmem_cache *dentry_cache __read_mostly;
 
 /*
  * This is the single most critical data structure when it comes
@@ -117,23 +123,47 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry);
+static DEFINE_PER_CPU(long, nr_dentry_unused);
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
-static int get_nr_dentry(void)
+
+/*
+ * Here we resort to our own counters instead of using generic per-cpu counters
+ * for consistency with what the vfs inode code does. We expect to get
+ * better code and performance by having our own specialized counters.
+ *
+ * Please note that the loop is done over all possible CPUs, not over all online
+ * CPUs. The reason for this is that we don't want to play games with CPUs going
+ * on and off. If one of them goes off, we will just keep their counters.
+ *
+ * glommer: See cffbc8a for details, and if you ever intend to change this,
+ * please update all vfs counters to match.
+ */
+static long get_nr_dentry(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_dentry, i);
 	return sum < 0 ? 0 : sum;
 }
 
+static long get_nr_dentry_unused(void)
+{
+	int i;
+	long sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
+
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
 	dentry_stat.nr_dentry = get_nr_dentry();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	dentry_stat.nr_unused = get_nr_dentry_unused();
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -224,22 +254,38 @@ static void __d_free(struct rcu_head *head)
 	kmem_cache_free(dentry_cache, dentry); 
 }
 
+static void dentry_free(struct dentry *dentry)
+{
+	struct rcu_head *p = (struct rcu_head *)&dentry->d_alias;
+
+	/* if dentry was never visible to RCU, immediate free is OK */
+	if (!(dentry->d_flags & DCACHE_RCUACCESS))
+		__d_free(p);
+	else
+		call_rcu(p, __d_free);
+}
+
 /*
  * no locks, please.
  */
 static void d_free(struct dentry *dentry)
 {
-	struct rcu_head *p = (struct rcu_head *)&dentry->d_alias;
+	bool can_free = true;
+
 	BUG_ON((int)dentry->d_lockref.count > 0);
 	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
-	/* if dentry was never visible to RCU, immediate free is OK */
-	if (!(dentry->d_flags & DCACHE_RCUACCESS))
-		__d_free(p);
-	else
-		call_rcu(p, __d_free);
+	spin_lock(&dentry->d_lock);
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
+		dentry->d_flags |= DCACHE_MAY_FREE;
+		can_free = false;
+	}
+	spin_unlock(&dentry->d_lock);
+
+	if (likely(can_free))
+		dentry_free(dentry);
 }
 
 /**
@@ -306,86 +352,81 @@ static void dentry_unlink_inode(struct dentry * dentry)
 }
 
 /*
- * dentry_lru_(add|del|prune|move_tail) must be called with d_lock held.
+ * The DCACHE_LRU_LIST bit is set whenever the 'd_lru' entry
+ * is in use - which includes both the "real" per-superblock
+ * LRU list _and_ the DCACHE_SHRINK_LIST use.
+ *
+ * The DCACHE_SHRINK_LIST bit is set whenever the dentry is
+ * on the shrink list (ie not on the superblock LRU list).
+ *
+ * The per-cpu "nr_dentry_unused" counters are updated with
+ * the DCACHE_LRU_LIST bit.
+ *
+ * These helper functions make sure we always follow the
+ * rules. d_lock must be held by the caller.
  */
-static void dentry_lru_add(struct dentry *dentry)
+#define D_FLAG_VERIFY(dentry,x) WARN_ON_ONCE(((dentry)->d_flags & (DCACHE_LRU_LIST | DCACHE_SHRINK_LIST)) != (x))
+static void d_lru_add(struct dentry *dentry)
 {
-	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST))) {
-		spin_lock(&dcache_lru_lock);
-		dentry->d_flags |= DCACHE_LRU_LIST;
-		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, 0);
+	dentry->d_flags |= DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_add(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
 }
 
-static void __dentry_lru_del(struct dentry *dentry)
+static void d_lru_del(struct dentry *dentry)
 {
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	WARN_ON_ONCE(!list_lru_del(&dentry->d_sb->s_dentry_lru, &dentry->d_lru));
+}
+
+static void d_shrink_del(struct dentry *dentry)
+{
+	D_FLAG_VERIFY(dentry, DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
 	list_del_init(&dentry->d_lru);
 	dentry->d_flags &= ~(DCACHE_SHRINK_LIST | DCACHE_LRU_LIST);
-	dentry->d_sb->s_nr_dentry_unused--;
-	dentry_stat.nr_unused--;
+	this_cpu_dec(nr_dentry_unused);
+}
+
+static void d_shrink_add(struct dentry *dentry, struct list_head *list)
+{
+	D_FLAG_VERIFY(dentry, 0);
+	list_add(&dentry->d_lru, list);
+	dentry->d_flags |= DCACHE_SHRINK_LIST | DCACHE_LRU_LIST;
+	this_cpu_inc(nr_dentry_unused);
 }
 
 /*
- * Remove a dentry with references from the LRU.
+ * These can only be called under the global LRU lock, i.e. during the
+ * callback for freeing the LRU list. "isolate" removes the dentry from the
+ * LRU lists entirely, while shrink_move moves it to the indicated
+ * private list.
  */
-static void dentry_lru_del(struct dentry *dentry)
+static void d_lru_isolate(struct list_lru_one *lru, struct dentry *dentry)
 {
-	if (!list_empty(&dentry->d_lru)) {
-		spin_lock(&dcache_lru_lock);
-		__dentry_lru_del(dentry);
-		spin_unlock(&dcache_lru_lock);
-	}
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags &= ~DCACHE_LRU_LIST;
+	this_cpu_dec(nr_dentry_unused);
+	list_lru_isolate(lru, &dentry->d_lru);
 }
 
-static void dentry_lru_move_list(struct dentry *dentry, struct list_head *list)
+static void d_lru_shrink_move(struct list_lru_one *lru, struct dentry *dentry,
+			      struct list_head *list)
 {
-	spin_lock(&dcache_lru_lock);
-	if (list_empty(&dentry->d_lru)) {
-		dentry->d_flags |= DCACHE_LRU_LIST;
-		list_add_tail(&dentry->d_lru, list);
-		dentry->d_sb->s_nr_dentry_unused++;
-		dentry_stat.nr_unused++;
-	} else {
-		list_move_tail(&dentry->d_lru, list);
-	}
-	spin_unlock(&dcache_lru_lock);
+	D_FLAG_VERIFY(dentry, DCACHE_LRU_LIST);
+	dentry->d_flags |= DCACHE_SHRINK_LIST;
+	list_lru_isolate_move(lru, &dentry->d_lru, list);
 }
 
-/**
- * d_kill - kill dentry and return parent
- * @dentry: dentry to kill
- * @parent: parent dentry
- *
- * The dentry must already be unhashed and removed from the LRU.
- *
- * If this is the root of the dentry tree, return NULL.
- *
- * dentry->d_lock and parent->d_lock must be held by caller, and are dropped by
- * d_kill.
+/*
+ * dentry_lru_(add|del) must be called with d_lock held.
  */
-static struct dentry *d_kill(struct dentry *dentry, struct dentry *parent)
-	__releases(dentry->d_lock)
-	__releases(parent->d_lock)
-	__releases(dentry->d_inode->i_lock)
+static void dentry_lru_add(struct dentry *dentry)
 {
-	__list_del_entry(&dentry->d_u.d_child);
-	/*
-	 * Inform d_walk() that we are no longer attached to the
-	 * dentry tree
-	 */
-	dentry->d_flags |= DCACHE_DENTRY_KILLED;
-	if (parent)
-		spin_unlock(&parent->d_lock);
-	dentry_iput(dentry);
-	/*
-	 * dentry_iput drops the locks, at which point nobody (except
-	 * transient RCU lookups) can reach this dentry.
-	 */
-	d_free(dentry);
-	return parent;
+	if (unlikely(!(dentry->d_flags & DCACHE_LRU_LIST)))
+		d_lru_add(dentry);
 }
 
 /*
@@ -441,34 +482,12 @@ void d_drop(struct dentry *dentry)
 }
 EXPORT_SYMBOL(d_drop);
 
-/*
- * Finish off a dentry we've decided to kill.
- * dentry->d_lock must be held, returns with it unlocked.
- * If ref is non-zero, then decrement the refcount too.
- * Returns dentry requiring refcount drop, or NULL if we're done.
- */
-static inline struct dentry *dentry_kill(struct dentry *dentry)
-	__releases(dentry->d_lock)
+static void __dentry_kill(struct dentry *dentry)
 {
-	struct inode *inode;
-	struct dentry *parent;
+	struct dentry *parent = NULL;
 
-	inode = dentry->d_inode;
-	if (inode && !spin_trylock(&inode->i_lock)) {
-relock:
-		spin_unlock(&dentry->d_lock);
-		cpu_relax();
-		return dentry; /* try again with same dentry */
-	}
-	if (IS_ROOT(dentry))
-		parent = NULL;
-	else
+	if (!IS_ROOT(dentry))
 		parent = dentry->d_parent;
-	if (parent && !spin_trylock(&parent->d_lock)) {
-		if (inode)
-			spin_unlock(&inode->i_lock);
-		goto relock;
-	}
 
 	/*
 	 * The dentry is now unrecoverably dead to the world.
@@ -482,10 +501,91 @@ relock:
 	if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry))
 		dentry->d_op->d_prune(dentry);
 
-	dentry_lru_del(dentry);
+	if (dentry->d_flags & DCACHE_LRU_LIST) {
+		if (!(dentry->d_flags & DCACHE_SHRINK_LIST))
+			d_lru_del(dentry);
+	}
 	/* if it was on the hash then remove it */
 	__d_drop(dentry);
-	return d_kill(dentry, parent);
+	__list_del_entry(&dentry->d_u.d_child);
+	/*
+	 * Inform d_walk() that we are no longer attached to the
+	 * dentry tree
+	 */
+	dentry->d_flags |= DCACHE_DENTRY_KILLED;
+	if (parent)
+		spin_unlock(&parent->d_lock);
+	dentry_iput(dentry);
+	/*
+	 * dentry_iput drops the locks, at which point nobody (except
+	 * transient RCU lookups) can reach this dentry.
+	 */
+	d_free(dentry);
+}
+
+/*
+ * Finish off a dentry we've decided to kill.
+ * dentry->d_lock must be held, returns with it unlocked.
+ * If ref is non-zero, then decrement the refcount too.
+ * Returns dentry requiring refcount drop, or NULL if we're done.
+ */
+static struct dentry *dentry_kill(struct dentry *dentry)
+	__releases(dentry->d_lock)
+{
+	struct inode *inode = dentry->d_inode;
+	struct dentry *parent = NULL;
+
+	if (inode && unlikely(!spin_trylock(&inode->i_lock)))
+		goto failed;
+
+	if (!IS_ROOT(dentry)) {
+		parent = dentry->d_parent;
+		if (unlikely(!spin_trylock(&parent->d_lock))) {
+			if (inode)
+				spin_unlock(&inode->i_lock);
+			goto failed;
+		}
+	}
+
+	__dentry_kill(dentry);
+	return parent;
+
+failed:
+	spin_unlock(&dentry->d_lock);
+	cpu_relax();
+	return dentry; /* try again with same dentry */
+}
+
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+	struct dentry *parent = dentry->d_parent;
+	if (IS_ROOT(dentry))
+		return NULL;
+	if (likely(spin_trylock(&parent->d_lock)))
+		return parent;
+	spin_unlock(&dentry->d_lock);
+	rcu_read_lock();
+again:
+	parent = ACCESS_ONCE(dentry->d_parent);
+	spin_lock(&parent->d_lock);
+	/*
+	 * We can't blindly lock dentry until we are sure
+	 * that we won't violate the locking order.
+	 * Any changes of dentry->d_parent must have
+	 * been done with parent->d_lock held, so
+	 * spin_lock() above is enough of a barrier
+	 * for checking if it's still our child.
+	 */
+	if (unlikely(parent != dentry->d_parent)) {
+		spin_unlock(&parent->d_lock);
+		goto again;
+	}
+	rcu_read_unlock();
+	if (parent != dentry)
+		spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+	else
+		parent = NULL;
+	return parent;
 }
 
 /* 
@@ -753,129 +853,190 @@ restart:
 }
 EXPORT_SYMBOL(d_prune_aliases);
 
-/*
- * Try to throw away a dentry - free the inode, dput the parent.
- * Requires dentry->d_lock is held, and dentry->d_count == 0.
- * Releases dentry->d_lock.
- *
- * This may fail if locks cannot be acquired no problem, just try again.
- */
-static void try_prune_one_dentry(struct dentry *dentry)
-	__releases(dentry->d_lock)
+static void shrink_dentry_list(struct list_head *list)
 {
-	struct dentry *parent;
+	struct dentry *dentry, *parent;
 
-	parent = dentry_kill(dentry);
-	/*
-	 * If dentry_kill returns NULL, we have nothing more to do.
-	 * if it returns the same dentry, trylocks failed. In either
-	 * case, just loop again.
-	 *
-	 * Otherwise, we need to prune ancestors too. This is necessary
-	 * to prevent quadratic behavior of shrink_dcache_parent(), but
-	 * is also expected to be beneficial in reducing dentry cache
-	 * fragmentation.
-	 */
-	if (!parent)
-		return;
-	if (parent == dentry)
-		return;
+	while (!list_empty(list)) {
+		struct inode *inode;
+		dentry = list_entry(list->prev, struct dentry, d_lru);
+		spin_lock(&dentry->d_lock);
+		parent = lock_parent(dentry);
 
-	/* Prune ancestors. */
-	dentry = parent;
-	while (dentry) {
-		if (lockref_put_or_lock(&dentry->d_lockref))
-			return;
-		dentry = dentry_kill(dentry);
-	}
-}
+		/*
+		 * The dispose list is isolated and its dentries are not
+		 * accounted to the LRU here, so we can simply remove the
+		 * dentry from the list regardless of whether it is
+		 * referenced or not.
+		 */
+		d_shrink_del(dentry);
 
-static void shrink_dentry_list(struct list_head *list)
-{
-	struct dentry *dentry;
+		/*
+		 * We found an inuse dentry which was not removed from
+		 * the LRU because of laziness during lookup. Do not free it.
+		 */
+		if ((int)dentry->d_lockref.count > 0) {
+			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			continue;
+		}
 
-	rcu_read_lock();
-	for (;;) {
-		dentry = list_entry_rcu(list->prev, struct dentry, d_lru);
-		if (&dentry->d_lru == list)
-			break; /* empty */
-		spin_lock(&dentry->d_lock);
-		if (dentry != list_entry(list->prev, struct dentry, d_lru)) {
+		if (unlikely(dentry->d_flags & DCACHE_DENTRY_KILLED)) {
+			bool can_free = dentry->d_flags & DCACHE_MAY_FREE;
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
+			if (can_free)
+				dentry_free(dentry);
 			continue;
 		}
 
-		/*
-		 * We found an inuse dentry which was not removed from
-		 * the LRU because of laziness during lookup.  Do not free
-		 * it - just keep it off the LRU list.
-		 */
-		if (dentry->d_lockref.count) {
-			dentry_lru_del(dentry);
+		inode = dentry->d_inode;
+		if (inode && unlikely(!spin_trylock(&inode->i_lock))) {
+			d_shrink_add(dentry, list);
 			spin_unlock(&dentry->d_lock);
+			if (parent)
+				spin_unlock(&parent->d_lock);
 			continue;
 		}
 
-		rcu_read_unlock();
+		__dentry_kill(dentry);
+
+		/*
+		 * We need to prune ancestors too. This is necessary to prevent
+		 * quadratic behavior of shrink_dcache_parent(), but is also
+		 * expected to be beneficial in reducing dentry cache
+		 * fragmentation.
+		 */
+		dentry = parent;
+		while (dentry && !lockref_put_or_lock(&dentry->d_lockref)) {
+			parent = lock_parent(dentry);
+			if (dentry->d_lockref.count != 1) {
+				dentry->d_lockref.count--;
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				break;
+			}
+			inode = dentry->d_inode;	/* can't be NULL */
+			if (unlikely(!spin_trylock(&inode->i_lock))) {
+				spin_unlock(&dentry->d_lock);
+				if (parent)
+					spin_unlock(&parent->d_lock);
+				cpu_relax();
+				continue;
+			}
+			__dentry_kill(dentry);
+			dentry = parent;
+		}
+	}
+}
+
+static enum lru_status dentry_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
 
-		try_prune_one_dentry(dentry);
 
-		rcu_read_lock();
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	/*
+	 * Referenced dentries are still in use. If they have active
+	 * counts, just remove them from the LRU. Otherwise give them
+	 * another pass through the LRU.
+	 */
+	if (dentry->d_lockref.count) {
+		d_lru_isolate(lru, dentry);
+		spin_unlock(&dentry->d_lock);
+		return LRU_REMOVED;
 	}
-	rcu_read_unlock();
+
+	if (dentry->d_flags & DCACHE_REFERENCED) {
+		dentry->d_flags &= ~DCACHE_REFERENCED;
+		spin_unlock(&dentry->d_lock);
+
+		/*
+		 * The list move itself will be made by the common LRU code. At
+		 * this point, we've dropped the dentry->d_lock but keep the
+		 * lru lock. This is safe to do, since every list movement is
+		 * protected by the lru lock even if both locks are held.
+		 *
+		 * This is guaranteed by the fact that all LRU management
+		 * functions are mediated by LRU API calls like
+		 * list_lru_add and list_lru_del. List movement in this file
+		 * only ever occurs through these functions or through
+		 * callbacks like this one, which are called from the LRU API.
+		 *
+		 * The only exceptions to this are functions like
+		 * shrink_dentry_list, and code that first checks for the
+		 * DCACHE_SHRINK_LIST flag.  Those are guaranteed to be
+		 * operating only with stack-provided lists after they are
+		 * properly isolated from the main list.  Access is thus
+		 * always local.
+		 */
+		return LRU_ROTATE;
+	}
+
+	d_lru_shrink_move(lru, dentry, freeable);
+	spin_unlock(&dentry->d_lock);
+
+	return LRU_REMOVED;
 }
 
 /**
  * prune_dcache_sb - shrink the dcache
  * @sb: superblock
- * @count: number of entries to try to free
+ * @sc: shrink control, passed to list_lru_shrink_walk()
  *
- * Attempt to shrink the superblock dcache LRU by @count entries. This is
- * done when we need more memory an called from the superblock shrinker
+ * Attempt to shrink the superblock dcache LRU by @sc->nr_to_scan entries. This
+ * is done when we need more memory and called from the superblock shrinker
  * function.
  *
  * This function may fail to free any resources if all the dentries are in
  * use.
  */
-void prune_dcache_sb(struct super_block *sb, int count)
+long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc)
 {
-	struct dentry *dentry;
-	LIST_HEAD(referenced);
-	LIST_HEAD(tmp);
-
-relock:
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		dentry = list_entry(sb->s_dentry_lru.prev,
-				struct dentry, d_lru);
-		BUG_ON(dentry->d_sb != sb);
-
-		if (!spin_trylock(&dentry->d_lock)) {
-			spin_unlock(&dcache_lru_lock);
-			cpu_relax();
-			goto relock;
-		}
+	LIST_HEAD(dispose);
+	long freed;
 
-		if (dentry->d_flags & DCACHE_REFERENCED) {
-			dentry->d_flags &= ~DCACHE_REFERENCED;
-			list_move(&dentry->d_lru, &referenced);
-			spin_unlock(&dentry->d_lock);
-		} else {
-			list_move_tail(&dentry->d_lru, &tmp);
-			dentry->d_flags |= DCACHE_SHRINK_LIST;
-			spin_unlock(&dentry->d_lock);
-			if (!--count)
-				break;
-		}
-		cond_resched_lock(&dcache_lru_lock);
-	}
-	if (!list_empty(&referenced))
-		list_splice(&referenced, &sb->s_dentry_lru);
-	spin_unlock(&dcache_lru_lock);
+	KSTAT_PERF_ENTER(shrink_dcache);
+	freed = list_lru_shrink_walk(&sb->s_dentry_lru, sc,
+				     dentry_lru_isolate, &dispose);
+	shrink_dentry_list(&dispose);
+	KSTAT_PERF_LEAVE(shrink_dcache);
+	return freed;
+}
+
+static enum lru_status dentry_lru_isolate_shrink(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
+{
+	struct list_head *freeable = arg;
+	struct dentry	*dentry = container_of(item, struct dentry, d_lru);
+
+	/*
+	 * we are inverting the lru lock/dentry->d_lock here,
+	 * so use a trylock. If we fail to get the lock, just skip
+	 * it
+	 */
+	if (!spin_trylock(&dentry->d_lock))
+		return LRU_SKIP;
+
+	d_lru_shrink_move(lru, dentry, freeable);
+	spin_unlock(&dentry->d_lock);
 
-	shrink_dentry_list(&tmp);
+	return LRU_REMOVED;
 }
 
 /**
  * shrink_dcache_sb - shrink dcache for a superblock
  * @sb: superblock
@@ -885,16 +1046,17 @@ relock:
  */
 void shrink_dcache_sb(struct super_block *sb)
 {
-	LIST_HEAD(tmp);
+	long freed;
 
-	spin_lock(&dcache_lru_lock);
-	while (!list_empty(&sb->s_dentry_lru)) {
-		list_splice_init(&sb->s_dentry_lru, &tmp);
-		spin_unlock(&dcache_lru_lock);
-		shrink_dentry_list(&tmp);
-		spin_lock(&dcache_lru_lock);
-	}
-	spin_unlock(&dcache_lru_lock);
+	do {
+		LIST_HEAD(dispose);
+
+		freed = list_lru_walk(&sb->s_dentry_lru,
+			dentry_lru_isolate_shrink, &dispose, UINT_MAX);
+
+		this_cpu_sub(nr_dentry_unused, freed);
+		shrink_dentry_list(&dispose);
+	} while (freed > 0);
 }
 EXPORT_SYMBOL(shrink_dcache_sb);
 
@@ -928,7 +1090,9 @@ static void shrink_dcache_for_umount_subtree(struct dentry *dentry)
 			    !d_unhashed(dentry))
 				dentry->d_op->d_prune(dentry);
 
-			dentry_lru_del(dentry);
+			WARN_ON_ONCE(dentry->d_flags & DCACHE_SHRINK_LIST);
+			if (dentry->d_flags & DCACHE_LRU_LIST)
+				d_lru_del(dentry);
 			__d_shrink(dentry);
 
 			if (dentry->d_lockref.count != 0) {
@@ -1233,29 +1397,23 @@ static enum d_walk_ret select_collect(void *_data, struct dentry *dentry)
 	if (data->start == dentry)
 		goto out;
 
-	/*
-	 * move only zero ref count dentries to the dispose list.
-	 *
-	 * Those which are presently on the shrink list, being processed
-	 * by shrink_dentry_list(), shouldn't be moved.  Otherwise the
-	 * loop in shrink_dcache_parent() might not make any progress
-	 * and loop forever.
-	 */
-	if (dentry->d_lockref.count) {
-		dentry_lru_del(dentry);
-	} else if (!(dentry->d_flags & DCACHE_SHRINK_LIST)) {
-		dentry_lru_move_list(dentry, &data->dispose);
-		dentry->d_flags |= DCACHE_SHRINK_LIST;
+	if (dentry->d_flags & DCACHE_SHRINK_LIST) {
 		data->found++;
-		ret = D_WALK_NORETRY;
+	} else {
+		if (dentry->d_flags & DCACHE_LRU_LIST)
+			d_lru_del(dentry);
+		if (!dentry->d_lockref.count) {
+			d_shrink_add(dentry, &data->dispose);
+			data->found++;
+		}
 	}
 	/*
 	 * We can return to the caller if we have found some (this
 	 * ensures forward progress). We'll be coming back to find
 	 * the rest.
 	 */
-	if (data->found && need_resched())
-		ret = D_WALK_QUIT;
+	if (!list_empty(&data->dispose))
+		ret = need_resched() ? D_WALK_QUIT : D_WALK_NORETRY;
 out:
 	return ret;
 }
@@ -1377,11 +1535,16 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name)
 	 */
 	dentry->d_iname[DNAME_INLINE_LEN-1] = 0;
 	if (name->len > DNAME_INLINE_LEN-1) {
-		dname = kmalloc(name->len + 1, GFP_KERNEL);
+		dname = kmalloc(name->len + 1, GFP_KERNEL_ACCOUNT);
 		if (!dname) {
 			kmem_cache_free(dentry_cache, dentry); 
 			return NULL;
 		}
+		if (IS_ENABLED(CONFIG_DCACHE_WORD_ACCESS))
+			kasan_unpoison_shadow(dname,
+					round_up(name->len + 1,
+						sizeof(unsigned long)));
 	} else  {
 		dname = dentry->d_iname;
 	}	
@@ -2758,7 +2921,7 @@ static int prepend_path(const struct path *path,
 	struct dentry *dentry;
 	struct vfsmount *vfsmnt;
 	struct mount *mnt;
-	int error = 0;
+	int error;
 	unsigned seq = 0;
 	char *bptr;
 	int blen;
@@ -3009,7 +3172,6 @@ restart:
 	read_seqbegin_or_lock(&rename_lock, &seq);
 	while (!IS_ROOT(dentry)) {
 		struct dentry *parent = dentry->d_parent;
-		int error;
 
 		prefetch(parent);
 		error = prepend_name(&end, &len, &dentry->d_name);
@@ -3262,7 +3424,7 @@ static void __init dcache_init(void)
 	 * of the dcache. 
 	 */
 	dentry_cache = KMEM_CACHE(dentry,
-		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD);
+		SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|SLAB_MEM_SPREAD|SLAB_ACCOUNT);
 
 	/* Hash may have been set up in dcache_init_early */
 	if (!hashdist)
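
The dcache rework above is built around list_lru walk callbacks: each
callback runs with the LRU lock held, must take d_lock with a trylock
because the usual lock order is inverted, and steers the walker through
its enum lru_status return value. A skeleton condensed from
dentry_lru_isolate() above:

	/*
	 * Condensed isolate callback: LRU_SKIP when the trylock fails,
	 * LRU_REMOVED when the dentry leaves the LRU, either because it
	 * is still referenced or because it moved to the private dispose
	 * list handed in through @arg.
	 */
	static enum lru_status isolate_one(struct list_head *item,
			struct list_lru_one *lru, spinlock_t *lru_lock,
			void *arg)
	{
		struct list_head *freeable = arg;
		struct dentry *dentry = container_of(item, struct dentry,
						     d_lru);

		if (!spin_trylock(&dentry->d_lock))
			return LRU_SKIP;	/* inverted lock order */

		if (dentry->d_lockref.count) {
			/* still in use: just drop it from the LRU */
			d_lru_isolate(lru, dentry);
			spin_unlock(&dentry->d_lock);
			return LRU_REMOVED;
		}

		/* unreferenced: move to the caller's dispose list */
		d_lru_shrink_move(lru, dentry, freeable);
		spin_unlock(&dentry->d_lock);
		return LRU_REMOVED;
	}
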
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -13,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/init.h>
 #include <linux/fs.h>
-#include <linux/sched.h>
 #include <linux/namei.h>
 #include <linux/slab.h>
 #include <linux/mount.h>
@@ -25,6 +24,7 @@
 #include <linux/parser.h>
 #include <linux/fsnotify.h>
 #include <linux/seq_file.h>
+#include <linux/ve.h>
 
 #define DEVPTS_DEFAULT_MODE 0600
 /*
@@ -140,7 +140,7 @@ static inline struct super_block *pts_sb_from_inode(struct inode *inode)
 	if (inode->i_sb->s_magic == DEVPTS_SUPER_MAGIC)
 		return inode->i_sb;
 #endif
-	return devpts_mnt->mnt_sb;
+	return get_exec_env()->devpts_sb;
 }
 
 #define PARSE_MOUNT	0
@@ -402,11 +402,19 @@ fail:
 }
 
 #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-static int compare_init_pts_sb(struct super_block *s, void *p)
+static int test_devpts_sb(struct super_block *s, void *p)
 {
-	if (devpts_mnt)
-		return devpts_mnt->mnt_sb == s;
-	return 0;
+	return get_exec_env()->devpts_sb == s;
+}
+
+static int set_devpts_sb(struct super_block *s, void *p)
+{
+	int error = set_anon_super(s, p);
+	if (!error && !get_exec_env()->devpts_sb) {
+		atomic_inc(&s->s_active);
+		get_exec_env()->devpts_sb = s;
+	}
+	return error;
 }
 
 /*
@@ -450,14 +458,14 @@ static struct dentry *devpts_mount(struct file_system_type *fs_type,
 	/* Require newinstance for all user namespace mounts to ensure
 	 * the mount options are not changed.
 	 */
-	if ((current_user_ns() != &init_user_ns) && !opts.newinstance)
+	if (!IS_ENABLED(CONFIG_VE) &&
+	    (current_user_ns() != &init_user_ns) && !opts.newinstance)
 		return ERR_PTR(-EINVAL);
 
 	if (opts.newinstance)
-		s = sget(fs_type, NULL, set_anon_super, flags, NULL);
+		s = sget(fs_type, NULL, set_devpts_sb, flags, NULL);
 	else
-		s = sget(fs_type, compare_init_pts_sb, set_anon_super, flags,
-			 NULL);
+		s = sget(fs_type, test_devpts_sb, set_devpts_sb, flags, NULL);
 
 	if (IS_ERR(s))
 		return ERR_CAST(s);
@@ -508,7 +516,7 @@ static struct file_system_type devpts_fs_type = {
 	.mount		= devpts_mount,
 	.kill_sb	= devpts_kill_sb,
 #ifdef CONFIG_DEVPTS_MULTIPLE_INSTANCES
-	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_USERNS_DEV_MOUNT | FS_VIRTUALIZED,
 #endif
 };
 
@@ -520,10 +528,16 @@ static struct file_system_type devpts_fs_type = {
 int devpts_new_index(struct inode *ptmx_inode)
 {
 	struct super_block *sb = pts_sb_from_inode(ptmx_inode);
-	struct pts_fs_info *fsi = DEVPTS_SB(sb);
+	struct pts_fs_info *fsi;
 	int index;
 	int ida_ret;
 
+	/* devpts not mounted yet */
+	if (!sb)
+		return -ENODEV;
+
+	fsi = DEVPTS_SB(sb);
+
 retry:
 	if (!ida_pre_get(&fsi->allocated_ptys, GFP_KERNEL))
 		return -ENOMEM;
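
The devpts conversion leans on sget()'s test/set callback pair: @test
decides whether an existing superblock may be reused (here, whether it is
the one recorded in the calling VE), and @set initialises a freshly
allocated one (here, recording it in the VE and pinning it with an extra
s_active reference). The call shape, condensed from devpts_mount() above:

	/*
	 * Condensed from the hunk above: reuse the container's devpts
	 * superblock when one is recorded, otherwise allocate one and
	 * let set_devpts_sb() record it in the VE.
	 */
	static struct super_block *get_ve_devpts_sb(
			struct file_system_type *fs_type, int flags)
	{
		return sget(fs_type, test_devpts_sb, set_devpts_sb,
			    flags, NULL);
	}
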
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -38,6 +38,7 @@
 #include <linux/atomic.h>
 #include <linux/prefetch.h>
 #include <linux/aio.h>
+#include <linux/virtinfo.h>
 
 /*
  * How many user pages to map in one call to get_user_pages().  This determines
@@ -821,6 +822,8 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page,
 {
 	int ret = 0;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE,
+			       bdev_get_queue(map_bh->b_bdev));
+
 	if (dio->rw & WRITE) {
 		/*
 		 * Read accounting is performed in submit_bio()
--- a/fs/drop_caches.c
+++ b/fs/drop_caches.c
@@ -37,18 +37,6 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused)
 	iput(toput_inode);
 }
 
-static void drop_slab(void)
-{
-	int nr_objects;
-	struct shrink_control shrink = {
-		.gfp_mask = GFP_KERNEL,
-	};
-
-	do {
-		nr_objects = shrink_slab(&shrink, 1000, 1000);
-	} while (nr_objects > 10);
-}
-
 int drop_caches_sysctl_handler(ctl_table *table, int write,
 	void __user *buffer, size_t *length, loff_t *ppos)
 {
--- a/fs/ecryptfs/main.c
+++ b/fs/ecryptfs/main.c
@@ -655,6 +655,7 @@ static struct ecryptfs_cache_info {
 	struct kmem_cache **cache;
 	const char *name;
 	size_t size;
+	unsigned long flags;
 	void (*ctor)(void *obj);
 } ecryptfs_cache_infos[] = {
 	{
@@ -676,6 +677,7 @@ static struct ecryptfs_cache_info {
 		.cache = &ecryptfs_inode_info_cache,
 		.name = "ecryptfs_inode_cache",
 		.size = sizeof(struct ecryptfs_inode_info),
+		.flags = SLAB_ACCOUNT,
 		.ctor = inode_info_init_once,
 	},
 	{
@@ -747,8 +749,8 @@ static int ecryptfs_init_kmem_caches(void)
 		struct ecryptfs_cache_info *info;
 
 		info = &ecryptfs_cache_infos[i];
-		*(info->cache) = kmem_cache_create(info->name, info->size,
-				0, SLAB_HWCACHE_ALIGN, info->ctor);
+		*(info->cache) = kmem_cache_create(info->name, info->size, 0,
+				SLAB_HWCACHE_ALIGN | info->flags, info->ctor);
 		if (!*(info->cache)) {
 			ecryptfs_free_kmem_caches();
 			ecryptfs_printk(KERN_WARNING, "%s: "
--- a/fs/efs/super.c
+++ b/fs/efs/super.c
@@ -87,9 +87,9 @@ static void init_once(void *foo)
 static int init_inodecache(void)
 {
 	efs_inode_cachep = kmem_cache_create("efs_inode_cache",
-				sizeof(struct efs_inode_info),
-				0, SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-				init_once);
+				sizeof(struct efs_inode_info), 0,
+				SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+				SLAB_ACCOUNT, init_once);
 	if (efs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -312,7 +312,7 @@ ctl_table epoll_table[] = {
 };
 #endif /* CONFIG_SYSCTL */
 
 static const struct file_operations eventpoll_fops;
 
 static inline int is_file_epoll(struct file *f)
 {
@@ -879,21 +879,19 @@ static int ep_show_fdinfo(struct seq_file *m, struct file *f)
 {
 	struct eventpoll *ep = f->private_data;
 	struct rb_node *rbp;
-	int ret = 0;
 
 	mutex_lock(&ep->mtx);
 	for (rbp = rb_first(&ep->rbr); rbp; rbp = rb_next(rbp)) {
 		struct epitem *epi = rb_entry(rbp, struct epitem, rbn);
 
-		ret = seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
-				 epi->ffd.fd, epi->event.events,
-				 (long long)epi->event.data);
-		if (ret)
+		seq_printf(m, "tfd: %8d events: %8x data: %16llx\n",
+			   epi->ffd.fd, epi->event.events,
+			   (long long)epi->event.data);
+		if (m->count == m->size)
 			break;
 	}
 	mutex_unlock(&ep->mtx);
-
-	return ret;
+	return 0;
 }
 #endif
 
@@ -915,7 +913,7 @@ static const struct file_operations eventpoll_fops = {
 void eventpoll_release_file(struct file *file)
 {
 	struct eventpoll *ep;
-	struct epitem *epi;
+	struct epitem *epi, *next;
 
 	/*
 	 * We don't want to get "file->f_lock" because it is not
@@ -931,7 +929,7 @@ void eventpoll_release_file(struct file *file)
 	 * Besides, ep_remove() acquires the lock, so we can't hold it here.
 	 */
 	mutex_lock(&epmutex);
-	list_for_each_entry_rcu(epi, &file->f_ep_links, fllink) {
+	list_for_each_entry_safe(epi, next, &file->f_ep_links, fllink) {
 		ep = epi->ep;
 		mutex_lock_nested(&ep->mtx, 0);
 		ep_remove(ep, epi);
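
The fdinfo hunk above stops relying on seq_printf()'s return value, which
was never a reliable error indicator (and later became void upstream),
and instead detects a full buffer by comparing m->count with m->size;
newer kernels wrap the same test as seq_has_overflowed(). A sketch with a
hypothetical item list:

	/*
	 * Sketch: emit one line per item until the seq_file buffer
	 * fills.  "struct foo_item" is a stand-in; the overflow test is
	 * the one used in the hunk above.
	 */
	struct foo_item {
		int id;
		struct foo_item *next;
	};

	static int foo_show(struct seq_file *m, struct foo_item *items)
	{
		struct foo_item *it;

		for (it = items; it; it = it->next) {
			seq_printf(m, "id: %8d\n", it->id);
			if (m->count == m->size)	/* buffer full */
				break;
		}
		return 0;
	}
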
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -26,6 +26,7 @@
 #include <linux/file.h>
 #include <linux/fdtable.h>
 #include <linux/mm.h>
+#include <linux/virtinfo.h>
 #include <linux/stat.h>
 #include <linux/fcntl.h>
 #include <linux/swap.h>
@@ -56,6 +57,8 @@
 #include <linux/oom.h>
 #include <linux/compat.h>
 
+#include <bc/vmpages.h>
+
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
 #include <asm/tlb.h>
@@ -249,9 +252,14 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 	struct vm_area_struct *vma = NULL;
 	struct mm_struct *mm = bprm->mm;
 
-	bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	err = -ENOMEM;
+	if (ub_memory_charge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags,
+				NULL, UB_SOFT))
+		goto err_charge;
+
+	bprm->vma = vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (!vma)
-		return -ENOMEM;
+		goto err_alloc;
 
 	down_write(&mm->mmap_sem);
 	vma->vm_mm = mm;
@@ -281,7 +289,10 @@ static int __bprm_mm_init(struct linux_binprm *bprm)
 err:
 	up_write(&mm->mmap_sem);
 	bprm->vma = NULL;
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
+err_alloc:
+	ub_memory_uncharge(mm, PAGE_SIZE, VM_STACK_FLAGS | mm->def_flags, NULL);
+err_charge:
 	return err;
 }
 
@@ -583,6 +594,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	unsigned long new_start = old_start - shift;
 	unsigned long new_end = old_end - shift;
 	struct mmu_gather tlb;
+	unsigned long moved;
+	struct vm_area_struct *prev;
 
 	BUG_ON(new_start > new_end);
 
@@ -600,12 +613,11 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 		return -ENOMEM;
 
 	/*
-	 * move the page tables downwards, on failure we rely on
-	 * process cleanup to remove whatever mess we made.
+	 * move the page tables downwards, on failure undo changes.
 	 */
-	if (length != move_page_tables(vma, old_start,
-				       vma, new_start, length, false))
-		return -ENOMEM;
+	moved = move_page_tables(vma, old_start, vma, new_start, length, false);
+	if (length != moved)
+		goto undo;
 
 	lru_add_drain();
 	tlb_gather_mmu(&tlb, mm, old_start, old_end);
@@ -633,6 +645,36 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
 	vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
 
 	return 0;
+
+undo:
+	/*
+	 * move the page tables back.
+	 */
+	length = move_page_tables(vma, new_start, vma, old_start, moved, false);
+	if (WARN_ON(length != moved))
+		return -EFAULT;
+
+	/*
+	 * release unused page tables.
+	 */
+	find_vma_prev(mm, vma->vm_start, &prev);
+	tlb_gather_mmu(&tlb, mm, new_start, new_end);
+	if (new_end > old_start)
+		free_pgd_range(&tlb, new_start, old_start,
+				prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				old_start);
+	else
+		free_pgd_range(&tlb, new_start, new_end,
+				prev ? prev->vm_end : FIRST_USER_ADDRESS,
+				old_start);
+	tlb_finish_mmu(&tlb, new_start, new_end);
+
+	/*
+	 * shrink the vma to the old range.
+	 */
+	vma_adjust(vma, old_start, old_end, vma->vm_pgoff, NULL);
+
+	return -ENOMEM;
 }
 
 /*
@@ -823,10 +865,10 @@ ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len)
 }
 EXPORT_SYMBOL(read_code);
 
-static int exec_mmap(struct mm_struct *mm)
+static int exec_mmap(struct linux_binprm *bprm)
 {
 	struct task_struct *tsk;
-	struct mm_struct * old_mm, *active_mm;
+	struct mm_struct *old_mm, *active_mm, *mm;
 
 	/* Notify parent that we're no longer interested in the old VM */
 	tsk = current;
@@ -847,6 +889,9 @@ static int exec_mmap(struct mm_struct *mm)
 			return -EINTR;
 		}
 	}
+
+	mm = bprm->mm;
+	mm->vps_dumpable = VD_PTRACE_COREDUMP;
 	task_lock(tsk);
 	active_mm = tsk->active_mm;
 	tsk->mm = mm;
@@ -854,6 +899,8 @@ static int exec_mmap(struct mm_struct *mm)
 	activate_mm(active_mm, mm);
 	task_unlock(tsk);
 	arch_pick_mmap_layout(mm);
+	bprm->mm = NULL;		/* We're using it now */
+
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
 		BUG_ON(active_mm != old_mm);
@@ -1096,12 +1143,10 @@ int flush_old_exec(struct linux_binprm * bprm)
 	 * Release all of the old mmap stuff
 	 */
 	acct_arg_size(bprm, 0);
-	retval = exec_mmap(bprm->mm);
+	retval = exec_mmap(bprm);
 	if (retval)
 		goto out;
 
-	bprm->mm = NULL;		/* We're using it now */
-
 	set_fs(USER_DS);
 	current->flags &=
 		~(PF_RANDOMIZE | PF_FORKNOEXEC | PF_KTHREAD | PF_NOFREEZE);
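
The __bprm_mm_init() hunk above shows the beancounter pattern this series
uses throughout: charge the resource first, allocate second, and unwind
in reverse order on failure. ub_memory_charge()/ub_memory_uncharge() and
allocate_vma()/free_vma() are this patch set's helpers, not upstream
APIs. A sketch of the same shape:

	/*
	 * Error-unwind sketch with the patch's UBC helpers: the charge
	 * taken first is the last thing undone on failure.
	 */
	static int setup_stack_vma(struct mm_struct *mm,
				   struct vm_area_struct **vmap)
	{
		struct vm_area_struct *vma;

		if (ub_memory_charge(mm, PAGE_SIZE,
				     VM_STACK_FLAGS | mm->def_flags,
				     NULL, UB_SOFT))
			return -ENOMEM;

		vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
		if (!vma) {
			ub_memory_uncharge(mm, PAGE_SIZE,
					   VM_STACK_FLAGS | mm->def_flags,
					   NULL);
			return -ENOMEM;
		}

		*vmap = vma;
		return 0;
	}
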
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -194,8 +194,8 @@ static int init_inodecache(void)
 {
 	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
 				sizeof(struct exofs_i_info), 0,
-				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-				exofs_init_once);
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+				SLAB_ACCOUNT, exofs_init_once);
 	if (exofs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/ext2/acl.c
+++ b/fs/ext2/acl.c
@@ -206,15 +206,11 @@ ext2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 		case ACL_TYPE_ACCESS:
 			name_index = EXT2_XATTR_INDEX_POSIX_ACL_ACCESS;
 			if (acl) {
-				error = posix_acl_equiv_mode(acl, &inode->i_mode);
-				if (error < 0)
+				error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+				if (error)
 					return error;
-				else {
-					inode->i_ctime = CURRENT_TIME_SEC;
-					mark_inode_dirty(inode);
-					if (error == 0)
-						acl = NULL;
-				}
+				inode->i_ctime = CURRENT_TIME_SEC;
+				mark_inode_dirty(inode);
 			}
 			break;
 
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -199,7 +199,7 @@ static int init_inodecache(void)
 	ext2_inode_cachep = kmem_cache_create("ext2_inode_cache",
 					     sizeof(struct ext2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext2_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1536,7 +1536,7 @@ static struct file_system_type ext2_fs_type = {
 	.name		= "ext2",
 	.mount		= ext2_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("ext2");
 
--- a/fs/ext3/ioctl.c
+++ b/fs/ext3/ioctl.c
@@ -73,7 +73,7 @@ long ext3_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT3_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_ADMIN))
 				goto flags_out;
 		}
 
--- a/fs/ext3/namei.c
+++ b/fs/ext3/namei.c
@@ -1320,7 +1320,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
 	if (err)
 		ext3_std_error(dir->i_sb, err);
 	brelse(bh);
-	return 0;
+	return err;
 }
 
 /*
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -3062,7 +3062,7 @@ static struct file_system_type ext3_fs_type = {
 	.name		= "ext3",
 	.mount		= ext3_mount,
 	.kill_sb	= kill_block_super,
-	.fs_flags	= FS_REQUIRES_DEV,
+	.fs_flags	= FS_REQUIRES_DEV | FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("ext3");
 
--- a/fs/ext4/Makefile
+++ b/fs/ext4/Makefile
@@ -8,7 +8,7 @@ ext4-y	:= balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o page-io.o \
 		ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
 		ext4_jbd2.o migrate.o mballoc.o block_validity.o move_extent.o \
 		mmp.o indirect.o extents_status.o xattr.o xattr_user.o \
-		xattr_trusted.o inline.o
+		xattr_trusted.o inline.o pfcache.o
 
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)	+= acl.o
 ext4-$(CONFIG_EXT4_FS_SECURITY)		+= xattr_security.o
--- a/fs/ext4/acl.c
+++ b/fs/ext4/acl.c
@@ -211,15 +211,11 @@ ext4_set_acl(handle_t *handle, struct inode *inode, int type,
 	case ACL_TYPE_ACCESS:
 		name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl) {
-			error = posix_acl_equiv_mode(acl, &inode->i_mode);
-			if (error < 0)
+			error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+			if (error)
 				return error;
-			else {
-				inode->i_ctime = ext4_current_time(inode);
-				ext4_mark_inode_dirty(handle, inode);
-				if (error == 0)
-					acl = NULL;
-			}
+			inode->i_ctime = ext4_current_time(inode);
+			ext4_mark_inode_dirty(handle, inode);
 		}
 		break;
 
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -255,6 +255,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb,
 	return num_clusters_in_group(sb, block_group) - 
 		ext4_num_overhead_clusters(sb, block_group, gdp);
 }
+EXPORT_SYMBOL(ext4_get_group_desc);
 
 /*
  * The free blocks are managed by bitmaps.  A file system contains several
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -103,6 +103,14 @@ int __ext4_check_dir_entry(const char *function, unsigned int line,
 	return 1;
 }
 
+static inline int ext4_balloon(struct super_block *sb, unsigned ino)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	return sbi->s_balloon_ino && (sbi->s_balloon_ino->i_ino == ino);
+}
+
 static int ext4_readdir(struct file *filp,
 			 void *dirent, filldir_t filldir)
 {
@@ -236,7 +244,8 @@ revalidate:
 			}
 			offset += ext4_rec_len_from_disk(de->rec_len,
 					sb->s_blocksize);
-			if (le32_to_cpu(de->inode)) {
+			if (le32_to_cpu(de->inode) &&
+			    !ext4_balloon(sb, le32_to_cpu(de->inode))) {
 				/* We might block in the next section
 				 * if the data destination is
 				 * currently swapped out.  So, use a
@@ -511,6 +520,9 @@ static int call_filldir(struct file *filp, void *dirent,
 	}
 	curr_pos = hash2pos(filp, fname->hash, fname->minor_hash);
 	while (fname) {
+		if (ext4_balloon(sb, fname->inode))
+			goto skip;
+
 		error = filldir(dirent, fname->name,
 				fname->name_len, curr_pos,
 				fname->inode,
@@ -520,6 +532,7 @@ static int call_filldir(struct file *filp, void *dirent,
 			info->extra_fname = fname;
 			return error;
 		}
+skip:
 		fname = fname->next;
 	}
 	return 0;
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -30,6 +30,7 @@
 #include <linux/blockgroup_lock.h>
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
+#include <linux/pfcache.h>
 #include <crypto/hash.h>
 #include <linux/falloc.h>
 #ifdef __KERNEL__
@@ -217,6 +218,12 @@ struct ext4_io_submit {
  */
 #define EXT4_LINK_MAX		65000
 
+#define EXT4_DATA_CSUM_SIZE	20
+#define EXT4_DATA_CSUM_NAME	"pfcache"
+
+#define EXT4_DIR_CSUM_VALUE	"auto"
+#define EXT4_DIR_CSUM_VALUE_LEN	4
+
 /*
  * Macro-instructions used to manage several block sizes
  */
@@ -504,6 +511,11 @@ struct compat_ext4_new_group_input {
 };
 #endif
 
+struct ext4_ioc_mfsync_info {
+	__u32 size;
+	__u32 fd[0];
+};
+
 /* The struct ext4_new_group_input in kernel space, with free_blocks_count */
 struct ext4_new_group_data {
 	__u32 group;
@@ -604,6 +616,9 @@ enum {
 #define EXT4_IOC_RESIZE_FS		_IOW('f', 16, __u64)
 #define EXT4_IOC_SWAP_BOOT		_IO('f', 17)
 #define EXT4_IOC_PRECACHE_EXTENTS	_IO('f', 18)
+#define EXT4_IOC_OPEN_BALLOON		_IO('f', 42)
+#define EXT4_IOC_MFSYNC			_IO('f', 43)
+#define EXT4_IOC_SET_RSV_BLOCKS		_IOW('f', 44, __u64)
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
@@ -942,6 +957,11 @@ struct ext4_inode_info {
 
 	/* Precomputed uuid+inum+igen checksum for seeding inode checksums */
 	__u32 i_csum_seed;
+
+	/* SHA-1 rolling data checksum state */
+	loff_t i_data_csum_end;
+	/* FIPS 180-1 digest if i_pfcache_csum_end == -1, partial SHA-1 otherwise */
+	u8 i_data_csum[EXT4_DATA_CSUM_SIZE];
 };
 
 /*
@@ -1008,6 +1028,7 @@ struct ext4_inode_info {
 						      blocks */
 #define EXT4_MOUNT2_HURD_COMPAT		0x00000004 /* Support HURD-castrated
 						      file systems */
+#define EXT4_MOUNT2_PFCACHE_CSUM	0x00010000
 
 #define clear_opt(sb, opt)		EXT4_SB(sb)->s_mount_opt &= \
 						~EXT4_MOUNT_##opt
@@ -1274,6 +1295,7 @@ struct ext4_sb_info {
 	unsigned int s_mb_stats;
 	unsigned int s_mb_order2_reqs;
 	unsigned int s_mb_group_prealloc;
+	unsigned int s_bd_full_ratelimit;
 	unsigned int s_max_dir_size_kb;
 	/* where last allocation was done - for stream allocation */
 	unsigned long s_mb_last_group;
@@ -1295,6 +1317,8 @@ struct ext4_sb_info {
 	atomic_t s_mb_discarded;
 	atomic_t s_lock_busy;
 
+	struct inode *s_balloon_ino;
+
 	/* locality groups */
 	struct ext4_locality_group __percpu *s_locality_groups;
 
@@ -1343,6 +1367,14 @@ struct ext4_sb_info {
 	struct ratelimit_state s_err_ratelimit_state;
 	struct ratelimit_state s_warning_ratelimit_state;
 	struct ratelimit_state s_msg_ratelimit_state;
+
+	/* data checksumming */
+	struct percpu_counter s_csum_partial;
+	struct percpu_counter s_csum_complete;
+
+	spinlock_t  s_pfcache_lock;
+	struct path s_pfcache_root;
+	struct percpu_counter s_pfcache_peers;
 };
 
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
@@ -1409,6 +1441,7 @@ enum {
 	EXT4_STATE_MAY_INLINE_DATA,	/* may have in-inode data */
 	EXT4_STATE_ORDERED_MODE,	/* data=ordered mode */
 	EXT4_STATE_EXT_PRECACHED,	/* extents have been precached */
+	EXT4_STATE_PFCACHE_CSUM,	/* Data-checksumming enabled */
 };
 
 #define EXT4_INODE_BIT_FNS(name, field, offset)				\
@@ -2037,6 +2070,7 @@ extern int ext4_check_all_de(struct inode *dir, struct buffer_head *bh,
 
 /* fsync.c */
 extern int ext4_sync_file(struct file *, loff_t, loff_t, int);
+extern int ext4_sync_files(struct file **, unsigned int *, unsigned int);
 
 /* hash.c */
 extern int ext4fs_dirhash(const char *name, int len, struct
@@ -2146,6 +2180,9 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
 					int used, int quota_claim);
 extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
 			      ext4_fsblk_t pblk, ext4_lblk_t len);
+extern int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+				unsigned int map_len,
+				struct extent_status *result);
 
 /* indirect.c */
 extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
@@ -2768,6 +2805,11 @@ extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 			__u64 start, __u64 len);
 extern int ext4_ext_precache(struct inode *inode);
 extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path);
+extern int ext4_swap_extents(handle_t *handle, struct inode *inode1,
+			     struct inode *inode2, ext4_lblk_t lblk1,
+			     ext4_lblk_t lblk2, ext4_lblk_t count,
+			     int mark_unwritten, int *err);
 
 /* move_extent.c */
 extern void ext4_double_down_write_data_sem(struct inode *first,
@@ -2777,8 +2819,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode,
 extern int ext4_move_extents(struct file *o_filp, struct file *d_filp,
 			     __u64 start_orig, __u64 start_donor,
 			     __u64 len, __u64 *moved_len);
-extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-			    struct ext4_extent **extent);
 
 /* page-io.c */
 extern int __init ext4_init_pageio(void);
@@ -2800,6 +2840,29 @@ extern int ext4_bio_write_page(struct ext4_io_submit *io,
 /* mmp.c */
 extern int ext4_multi_mount_protect(struct super_block *, ext4_fsblk_t);
 
+/* pfcache.c */
+extern int ext4_open_pfcache(struct inode *inode);
+extern int ext4_close_pfcache(struct inode *inode);
+extern int ext4_relink_pfcache(struct super_block *sb, char *new_root, bool new_sb);
+extern long ext4_dump_pfcache(struct super_block *sb,
+					struct pfcache_dump_request __user *dump);
+extern int ext4_load_data_csum(struct inode *inode);
+extern void ext4_start_data_csum(struct inode *inode);
+extern void ext4_check_pos_data_csum(struct inode *inode, loff_t pos);
+extern void ext4_update_data_csum(struct inode *inode, loff_t pos,
+				  unsigned len, struct page* page);
+extern void ext4_commit_data_csum(struct inode *inode);
+extern void ext4_clear_data_csum(struct inode *inode);
+extern void ext4_truncate_data_csum(struct inode *inode, loff_t end);
+extern void ext4_load_dir_csum(struct inode *inode);
+extern void ext4_save_dir_csum(struct inode *inode);
+static inline int ext4_want_data_csum(struct inode *dir)
+{
+	return test_opt2(dir->i_sb, PFCACHE_CSUM) &&
+		ext4_test_inode_state(dir, EXT4_STATE_PFCACHE_CSUM);
+}
+extern struct xattr_handler ext4_xattr_trusted_csum_handler;
+
 /*
  * Add new method to test whether block and inode bitmaps are properly
  * initialized. With uninit_bg reading the block from disk is not enough
@@ -2854,6 +2917,49 @@ static inline bool ext4_aligned_io(struct inode *inode, loff_t off, loff_t len)
 	return IS_ALIGNED(off, blksize) && IS_ALIGNED(len, blksize);
 }
 
+/*
+ * Ploop support
+ */
+DECLARE_PER_CPU(unsigned long, ext4_bd_full_ratelimits);
+
+static inline int check_bd_full(struct inode *inode, long long nblocks)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
+	int (*bd_full_fn) (struct backing_dev_info *, long long, int);
+	unsigned long ratelimit;
+	unsigned long *p;
+
+	bd_full_fn = inode->i_sb->s_bdi->bd_full_fn;
+	if (likely(!bd_full_fn))
+		return 0;
+
+	if (unlikely(inode->i_sb->s_bdi->bd_full))
+		ratelimit = 0;
+	else
+		ratelimit = sbi->s_bd_full_ratelimit;
+
+	preempt_disable();
+
+	p = &__get_cpu_var(ext4_bd_full_ratelimits);
+	*p += nblocks;
+	if (unlikely(*p >= ratelimit)) {
+		*p = 0;
+		preempt_enable();
+		if (unlikely(bd_full_fn(inode->i_sb->s_bdi,
+					nblocks << inode->i_blkbits,
+					uid_eq(sbi->s_resuid,
+					       current_fsuid())))) {
+			inode->i_sb->s_bdi->bd_full = 1;
+			return 1;
+		}
+		inode->i_sb->s_bdi->bd_full = 0;
+		return 0;
+	}
+
+	preempt_enable();
+	return 0;
+}
+
 #endif	/* __KERNEL__ */
 
 #endif	/* _EXT4_H */
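
check_bd_full() rate-limits how often ext4 consults the backing device's
"disk full" callback, a ploop extension this series hangs off
backing_dev_info: each CPU accumulates requested blocks and only calls
into bd_full_fn() once the per-superblock threshold is crossed, except
when the device is already flagged full. A hedged sketch of a call site,
assuming a block-reservation path:

	/*
	 * Hypothetical caller: fail fast with ENOSPC when the virtual
	 * backing device reports it cannot grow by @nblocks, instead of
	 * surfacing a late I/O error.
	 */
	static int foo_reserve_blocks(struct inode *inode, long long nblocks)
	{
		if (check_bd_full(inode, nblocks))
			return -ENOSPC;
		/* ... continue with the normal block reservation ... */
		return 0;
	}
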
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -39,6 +39,7 @@
 #include <linux/slab.h>
 #include <asm/uaccess.h>
 #include <linux/fiemap.h>
+#include <linux/module.h>
 #include "ext4_jbd2.h"
 #include "ext4_extents.h"
 #include "xattr.h"
@@ -289,6 +290,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check)
 	return size;
 }
 
+static inline int
+ext4_force_split_extent_at(handle_t *handle, struct inode *inode,
+			   struct ext4_ext_path **ppath, ext4_lblk_t lblk,
+			   int nofail)
+{
+	struct ext4_ext_path *path = *ppath;
+	int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext);
+
+	return ext4_split_extent_at(handle, inode, path, lblk, unwritten ?
+			EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0,
+			EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO |
+			(nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL : 0));
+}
+
 /*
  * Calculate the number of metadata blocks needed
  * to allocate @blocks
@@ -693,15 +708,19 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path,
 
 void ext4_ext_drop_refs(struct ext4_ext_path *path)
 {
-	int depth = path->p_depth;
+	int depth;
 	int i;
 
+	if (!path)
+		return;
+	depth = path->p_depth;
 	for (i = 0; i <= depth; i++, path++)
 		if (path->p_bh) {
 			brelse(path->p_bh);
 			path->p_bh = NULL;
 		}
 }
+EXPORT_SYMBOL(ext4_ext_drop_refs);
 
 /*
  * ext4_ext_binsearch_idx:
@@ -1557,7 +1576,7 @@ found_extent:
  * allocated block. Thus, index entries have to be consistent
  * with leaves.
  */
-static ext4_lblk_t
+ext4_lblk_t
 ext4_ext_next_allocated_block(struct ext4_ext_path *path)
 {
 	int depth;
@@ -2267,59 +2286,69 @@ static int ext4_fill_fiemap_extents(struct inode *inode,
 }
 
 /*
- * ext4_ext_put_gap_in_cache:
- * calculate boundaries of the gap that the requested block fits into
- * and cache this gap
+ * ext4_ext_determine_hole - determine hole around given block
+ * @inode:	inode we lookup in
+ * @path:	path in extent tree to @lblk
+ * @lblk:	pointer to logical block around which we want to determine hole
+ *
+ * Determine hole length (and start if easily possible) around given logical
+ * block. We don't try too hard to find the beginning of the hole but @path
+ * actually points to extent before @lblk, we provide it.
+ *
+ * The function returns the length of a hole starting at @lblk. We update @lblk
+ * to the beginning of the hole if we managed to find it.
  */
-static void
-ext4_ext_put_gap_in_cache(struct inode *inode, struct ext4_ext_path *path,
-				ext4_lblk_t block)
+static ext4_lblk_t ext4_ext_determine_hole(struct inode *inode,
+					   struct ext4_ext_path *path,
+					   ext4_lblk_t *lblk)
 {
 	int depth = ext_depth(inode);
-	ext4_lblk_t len;
-	ext4_lblk_t lblock;
 	struct ext4_extent *ex;
-	struct extent_status es;
+	ext4_lblk_t len;
 
 	ex = path[depth].p_ext;
 	if (ex == NULL) {
 		/* there is no extent yet, so gap is [0;-] */
-		lblock = 0;
+		*lblk = 0;
 		len = EXT_MAX_BLOCKS;
-		ext_debug("cache gap(whole file):");
-	} else if (block < le32_to_cpu(ex->ee_block)) {
-		lblock = block;
-		len = le32_to_cpu(ex->ee_block) - block;
-		ext_debug("cache gap(before): %u [%u:%u]",
-				block,
-				le32_to_cpu(ex->ee_block),
-				 ext4_ext_get_actual_len(ex));
-	} else if (block >= le32_to_cpu(ex->ee_block)
+	} else if (*lblk < le32_to_cpu(ex->ee_block)) {
+		len = le32_to_cpu(ex->ee_block) - *lblk;
+	} else if (*lblk >= le32_to_cpu(ex->ee_block)
 			+ ext4_ext_get_actual_len(ex)) {
 		ext4_lblk_t next;
-		lblock = le32_to_cpu(ex->ee_block)
-			+ ext4_ext_get_actual_len(ex);
 
+		*lblk = le32_to_cpu(ex->ee_block) + ext4_ext_get_actual_len(ex);
 		next = ext4_ext_next_allocated_block(path);
-		ext_debug("cache gap(after): [%u:%u] %u",
-				le32_to_cpu(ex->ee_block),
-				ext4_ext_get_actual_len(ex),
-				block);
-		BUG_ON(next == lblock);
-		len = next - lblock;
+		BUG_ON(next == *lblk);
+		len = next - *lblk;
 	} else {
 		BUG();
 	}
+	return len;
+}
+
+/*
+ * ext4_ext_put_gap_in_cache:
+ * calculate boundaries of the gap that the requested block fits into
+ * and cache this gap
+ */
+static void
+ext4_ext_put_gap_in_cache(struct inode *inode, ext4_lblk_t hole_start,
+			  ext4_lblk_t hole_len)
+{
+	struct extent_status es;
 
-	ext4_es_find_delayed_extent_range(inode, lblock, lblock + len - 1, &es);
+	ext4_es_find_delayed_extent_range(inode, hole_start,
+					  hole_start + hole_len - 1, &es);
 	if (es.es_len) {
 		/* There's delayed extent containing lblock? */
-		if (es.es_lblk <= lblock)
+		if (es.es_lblk <= hole_start)
 			return;
-		len = min(es.es_lblk - lblock, len);
+		hole_len = min(es.es_lblk - hole_start, hole_len);
 	}
-	ext_debug(" -> %u:%u\n", lblock, len);
-	ext4_es_insert_extent(inode, lblock, len, ~0, EXTENT_STATUS_HOLE);
+	ext_debug(" -> %u:%u\n", hole_start, hole_len);
+	ext4_es_insert_extent(inode, hole_start, hole_len, ~0,
+			      EXTENT_STATUS_HOLE);
 }
 
 /*
@@ -2831,7 +2860,6 @@ again:
 		 * ext4_ext_rm_leaf().
 		 */
 		if (end >= ee_block && end < ex_end) {
-			int split_flag = 0;
 
 			/*
 			 * If we're going to split the extent, note that
@@ -2844,21 +2872,14 @@ again:
 					-(long long) EXT4_B2C(sbi, pblk);
 			}
 
-			if (ext4_ext_is_unwritten(ex))
-				split_flag = EXT4_EXT_MARK_UNWRIT1 |
-					     EXT4_EXT_MARK_UNWRIT2;
-
 			/*
 			 * Split the extent in two so that 'end' is the last
 			 * block in the first new extent. Also we should not
 			 * fail removing space due to ENOSPC so try to use
 			 * reserved block if that happens.
 			 */
-			err = ext4_split_extent_at(handle, inode, path,
-					end + 1, split_flag,
-					EXT4_EX_NOCACHE |
-					EXT4_GET_BLOCKS_PRE_IO |
-					EXT4_GET_BLOCKS_METADATA_NOFAIL);
+			err = ext4_force_split_extent_at(handle, inode, &path,
+							 end + 1, 1);
 
 			if (err < 0)
 				goto out;
@@ -4367,11 +4388,22 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
 	 * we couldn't try to create block if create flag is zero
 	 */
 	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+		ext4_lblk_t hole_start, hole_len;
+
+		hole_start = map->m_lblk;
+		hole_len = ext4_ext_determine_hole(inode, path, &hole_start);
 		/*
 		 * put just found gap into cache to speed up
 		 * subsequent requests
 		 */
-		ext4_ext_put_gap_in_cache(inode, path, map->m_lblk);
+		ext4_ext_put_gap_in_cache(inode, hole_start, hole_len);
+
+		/* Update hole_len to reflect hole size after map->m_lblk */
+		if (hole_start != map->m_lblk)
+			hole_len -= map->m_lblk - hole_start;
+		map->m_pblk = 0;
+		map->m_len = min_t(unsigned int, map->m_len, hole_len);
+
 		goto out2;
 	}
 
@@ -4659,6 +4691,21 @@ retry:
 	ext4_std_error(inode->i_sb, err);
 }
 
+static int ext4_convert_unwritten(struct inode *inode, loff_t offset,
+				  loff_t len)
+{
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	mutex_lock(&inode->i_mutex);
+	err = ext4_convert_unwritten_extents(NULL, inode, offset, len);
+	mutex_unlock(&inode->i_mutex);
+
+	return err;
+}
+
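From userspace the new mode is reached through fallocate(2). A hedged
sketch, assuming the series exports FALLOC_FL_CONVERT_UNWRITTEN in the
uapi falloc.h (the numeric value is whatever the series assigns):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/falloc.h>

	/* Convert unwritten extents in the first 1 MiB to written.
	 * Needs CAP_SYS_ADMIN, per ext4_convert_unwritten() above. */
	static int convert_first_mib(int fd)
	{
		return fallocate(fd, FALLOC_FL_CONVERT_UNWRITTEN, 0, 1 << 20);
	}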
 static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset,
 				  ext4_lblk_t len, loff_t new_size,
 				  int flags, int mode)
@@ -4914,12 +4961,21 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+		     FALLOC_FL_CONVERT_UNWRITTEN))
 		return -EOPNOTSUPP;
 
+	/* If data is about to change we must drop csum */
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    ((mode & ~FALLOC_FL_KEEP_SIZE) || !(mode & FALLOC_FL_KEEP_SIZE)))
+		ext4_truncate_data_csum(inode, -1);
+
 	if (mode & FALLOC_FL_PUNCH_HOLE)
 		return ext4_punch_hole(inode, offset, len);
 
+	if (mode & FALLOC_FL_CONVERT_UNWRITTEN)
+		return ext4_convert_unwritten(inode, offset, len);
+
 	ret = ext4_convert_inline_data(inode);
 	if (ret)
 		return ret;
@@ -5000,6 +5056,12 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
 	max_blocks = ((EXT4_BLOCK_ALIGN(len + offset, blkbits) >> blkbits) -
 		      map.m_lblk);
 	/*
+	 * Protect us against freezing - AIO-DIO case. The caller is not
+	 * required to hold any protection against it.
+	 */
+	sb_start_intwrite(inode->i_sb);
+
+	/*
 	 * This is somewhat ugly but the idea is clear: When transaction is
 	 * reserved, everything goes into it. Otherwise we rather start several
 	 * smaller transactions for conversion of each extent separately.
@@ -5043,6 +5105,7 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
 	}
 	if (!credits)
 		ret2 = ext4_journal_stop(handle);
+	sb_end_intwrite(inode->i_sb);
 	return ret > 0 ? ret2 : ret;
 }
 
@@ -5303,7 +5366,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 	struct ext4_ext_path *path;
 	int ret = 0, depth;
 	struct ext4_extent *extent;
-	ext4_lblk_t stop_block, current_block;
+	ext4_lblk_t stop_block;
 	ext4_lblk_t ex_start, ex_end;
 
 	/* Let path point to the last extent */
@@ -5358,16 +5421,15 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle,
 			return -EIO;
 		}
 
-		current_block = le32_to_cpu(extent->ee_block);
-		if (start > current_block) {
+		if (start > le32_to_cpu(extent->ee_block)) {
 			/* Hole, move to the next extent */
-			ret = mext_next_extent(inode, path, &extent);
-			if (ret != 0) {
+			if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) {
+				path[depth].p_ext++;
+			} else {
+				start = ext4_ext_next_allocated_block(path);
 				ext4_ext_drop_refs(path);
 				kfree(path);
-				if (ret == 1)
-					ret = 0;
-				break;
+				continue;
 			}
 		}
 		ret = ext4_ext_shift_path_extents(path, shift, inode,
@@ -5519,3 +5581,201 @@ out_mutex:
 	mutex_unlock(&inode->i_mutex);
 	return ret;
 }
+
+/**
+ * ext4_swap_extents - Swap extents between two inodes
+ *
+ * @inode1:	First inode
+ * @inode2:	Second inode
+ * @lblk1:	Start block for first inode
+ * @lblk2:	Start block for second inode
+ * @count:	Number of blocks to swap
+ * @unwritten:	Mark second inode's extents as unwritten after swap
+ * @erp:	Pointer to save error value
+ *
+ * This helper routine does exactly what its name promises: swap extents.
+ * Everything else - page-cache locking consistency, bh mapping consistency,
+ * copying of the extents' data - must be performed by the caller.
+ * Locking:
+ * 		i_mutex is held for both inodes
+ * 		i_data_sem is locked for write for both inodes
+ * Assumptions:
+ *		All pages from requested range are locked for both inodes
+ */
+int
+ext4_swap_extents(handle_t *handle, struct inode *inode1,
+		  struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2,
+		  ext4_lblk_t count, int unwritten, int *erp)
+{
+	struct ext4_ext_path *path1 = NULL;
+	struct ext4_ext_path *path2 = NULL;
+	int replaced_count = 0;
+
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem));
+	BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem));
+	BUG_ON(!mutex_is_locked(&inode1->i_mutex));
+	BUG_ON(!mutex_is_locked(&inode2->i_mutex));
+
+	ext4_discard_preallocations(inode1);
+	*erp = ext4_es_remove_extent(inode1, lblk1, count);
+	if (unlikely(*erp))
+		return 0;
+	ext4_discard_preallocations(inode2);
+	*erp = ext4_es_remove_extent(inode2, lblk2, count);
+	if (unlikely(*erp))
+		return 0;
+
+	while (count) {
+		struct ext4_extent *ex1, *ex2, tmp_ex;
+		ext4_lblk_t e1_blk, e2_blk;
+		int e1_len, e2_len, len;
+		int split = 0;
+
+		path1 = ext4_ext_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE);
+		if (unlikely(IS_ERR(path1))) {
+			*erp = PTR_ERR(path1);
+			path1 = NULL;
+		finish:
+			count = 0;
+			goto repeat;
+		}
+		path2 = ext4_ext_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE);
+		if (unlikely(IS_ERR(path2))) {
+			*erp = PTR_ERR(path2);
+			path2 = NULL;
+			goto finish;
+		}
+		ex1 = path1[path1->p_depth].p_ext;
+		ex2 = path2[path2->p_depth].p_ext;
+		/* Do we have something to swap? */
+		if (unlikely(!ex2 || !ex1))
+			goto finish;
+
+		e1_blk = le32_to_cpu(ex1->ee_block);
+		e2_blk = le32_to_cpu(ex2->ee_block);
+		e1_len = ext4_ext_get_actual_len(ex1);
+		e2_len = ext4_ext_get_actual_len(ex2);
+
+		/* Hole handling */
+		if (!in_range(lblk1, e1_blk, e1_len) ||
+		    !in_range(lblk2, e2_blk, e2_len)) {
+			ext4_lblk_t next1, next2;
+
+			/* if hole after extent, then go to next extent */
+			next1 = ext4_ext_next_allocated_block(path1);
+			next2 = ext4_ext_next_allocated_block(path2);
+			/* If hole before extent, then shift to that extent */
+			if (e1_blk > lblk1)
+				next1 = e1_blk;
+			if (e2_blk > lblk2)
+				next2 = e2_blk;
+			/* Do we have something to swap? */
+			if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS)
+				goto finish;
+			/* Move to the rightmost boundary */
+			len = next1 - lblk1;
+			if (len < next2 - lblk2)
+				len = next2 - lblk2;
+			if (len > count)
+				len = count;
+			lblk1 += len;
+			lblk2 += len;
+			count -= len;
+			goto repeat;
+		}
+
+		/* Prepare left boundary */
+		if (e1_blk < lblk1) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		if (e2_blk < lblk2) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2, lblk2, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		/* ext4_split_extent_at() may result in a leaf extent split,
+		 * so the path must be revalidated. */
+		if (split)
+			goto repeat;
+
+		/* Prepare right boundary */
+		len = count;
+		if (len > e1_blk + e1_len - lblk1)
+			len = e1_blk + e1_len - lblk1;
+		if (len > e2_blk + e2_len - lblk2)
+			len = e2_blk + e2_len - lblk2;
+
+		if (len != e1_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode1,
+						&path1, lblk1 + len, 0);
+			if (unlikely(*erp))
+				goto finish;
+		}
+		if (len != e2_len) {
+			split = 1;
+			*erp = ext4_force_split_extent_at(handle, inode2,
+						&path2, lblk2 + len, 0);
+			if (*erp)
+				goto finish;
+		}
+		/* ext4_split_extent_at() may result in a leaf extent split,
+		 * so the path must be revalidated. */
+		if (split)
+			goto repeat;
+
+		BUG_ON(e2_len != e1_len);
+		*erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+
+		/* Both extents are fully inside boundaries. Swap them now */
+		tmp_ex = *ex1;
+		ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2));
+		ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex));
+		ex1->ee_len = cpu_to_le16(e2_len);
+		ex2->ee_len = cpu_to_le16(e1_len);
+		if (unwritten)
+			ext4_ext_mark_unwritten(ex2);
+		if (ext4_ext_is_unwritten(&tmp_ex))
+			ext4_ext_mark_unwritten(ex1);
+
+		ext4_ext_try_to_merge(handle, inode2, path2, ex2);
+		ext4_ext_try_to_merge(handle, inode1, path1, ex1);
+		*erp = ext4_ext_dirty(handle, inode2, path2 +
+				      path2->p_depth);
+		if (unlikely(*erp))
+			goto finish;
+		*erp = ext4_ext_dirty(handle, inode1, path1 +
+				      path1->p_depth);
+		/*
+		 * Looks scary, ah? The second inode already points to the new
+		 * blocks and was successfully dirtied. Luckily, an error here
+		 * can only be a journal error, in which case the whole
+		 * transaction will be aborted anyway.
+		 */
+		if (unlikely(*erp))
+			goto finish;
+		lblk1 += len;
+		lblk2 += len;
+		replaced_count += len;
+		count -= len;
+
+	repeat:
+		ext4_ext_drop_refs(path1);
+		kfree(path1);
+		ext4_ext_drop_refs(path2);
+		kfree(path2);
+		path1 = path2 = NULL;
+	}
+	return replaced_count;
+}
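
The BUG_ON()s at the top enforce the locking contract from the header
comment. A sketch of the ordering a caller is expected to follow
(lock_two_nondirectories() is the stock VFS helper; the data-sem helpers
are the ones used elsewhere in move_extent.c; illustrative only, not part
of the patch):

	lock_two_nondirectories(inode1, inode2);
	ext4_double_down_write_data_sem(inode1, inode2);

	replaced = ext4_swap_extents(handle, inode1, inode2, lblk1, lblk2,
				     count, 0 /* unwritten */, &err);

	ext4_double_up_write_data_sem(inode1, inode2);
	unlock_two_nondirectories(inode1, inode2);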
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -1019,7 +1019,20 @@ retry:
 	return nr_shrunk;
 }
 
-static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long ext4_es_count(struct shrinker *shrink,
+				   struct shrink_control *sc)
+{
+	unsigned long nr;
+	struct ext4_sb_info *sbi;
+
+	sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker);
+	nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
+	trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr);
+	return nr;
+}
+
+static unsigned long ext4_es_scan(struct shrinker *shrink,
+				  struct shrink_control *sc)
 {
 	struct ext4_sb_info *sbi = container_of(shrink,
 					struct ext4_sb_info, s_es_shrinker);
@@ -1034,9 +1047,8 @@ static int ext4_es_shrink(struct shrinker *shrink, struct shrink_control *sc)
 
 	nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL);
 
-	ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt);
 	trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret);
-	return ret;
+	return nr_shrunk;
 }
 
 void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
@@ -1044,7 +1056,8 @@ void ext4_es_register_shrinker(struct ext4_sb_info *sbi)
 	INIT_LIST_HEAD(&sbi->s_es_lru);
 	spin_lock_init(&sbi->s_es_lru_lock);
 	sbi->s_es_last_sorted = 0;
-	sbi->s_es_shrinker.shrink = ext4_es_shrink;
+	sbi->s_es_shrinker.scan_objects = ext4_es_scan;
+	sbi->s_es_shrinker.count_objects = ext4_es_count;
 	sbi->s_es_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&sbi->s_es_shrinker);
 }
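
This hunk converts the extent-status shrinker from the old single
->shrink() callback to the count_objects/scan_objects pair introduced in
v3.12: ->count_objects() reports how many objects are freeable, while
->scan_objects() frees up to sc->nr_to_scan of them and returns the number
actually freed. The general shape of the new API, as a generic sketch
(the foo_* names and foo_free_some() are illustrative):

	static atomic_long_t foo_nr_cached;	/* hypothetical object count */

	static unsigned long foo_count(struct shrinker *s,
				       struct shrink_control *sc)
	{
		return atomic_long_read(&foo_nr_cached);
	}

	static unsigned long foo_scan(struct shrinker *s,
				      struct shrink_control *sc)
	{
		/* Free up to sc->nr_to_scan objects, return how many. */
		return foo_free_some(sc->nr_to_scan);
	}

	static struct shrinker foo_shrinker = {
		.count_objects	= foo_count,
		.scan_objects	= foo_scan,
		.seeks		= DEFAULT_SEEKS,
	};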
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -45,12 +45,14 @@ static int ext4_release_file(struct inode *inode, struct file *filp)
 	}
 	/* if we are the last writer on the inode, drop the block reservation */
 	if ((filp->f_mode & FMODE_WRITE) &&
-			(atomic_read(&inode->i_writecount) == 1) &&
-		        !EXT4_I(inode)->i_reserved_data_blocks)
-	{
-		down_write(&EXT4_I(inode)->i_data_sem);
-		ext4_discard_preallocations(inode);
-		up_write(&EXT4_I(inode)->i_data_sem);
+	    (atomic_read(&inode->i_writecount) == 1)) {
+		if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+			ext4_commit_data_csum(inode);
+		if (!EXT4_I(inode)->i_reserved_data_blocks) {
+			down_write(&EXT4_I(inode)->i_data_sem);
+			ext4_discard_preallocations(inode);
+			up_write(&EXT4_I(inode)->i_data_sem);
+		}
 	}
 	if (is_dx(inode) && filp->private_data)
 		ext4_htree_free_dir_info(filp->private_data);
@@ -307,7 +309,6 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
 	.pmd_fault	= ext4_dax_pmd_fault,
 	.page_mkwrite	= ext4_dax_fault,
 	.pfn_mkwrite	= ext4_dax_pfn_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 #else
 #define ext4_dax_vm_ops	ext4_file_vm_ops
@@ -316,11 +317,26 @@ static const struct vm_operations_struct ext4_dax_vm_ops = {
 static const struct vm_operations_struct ext4_file_vm_ops = {
 	.fault		= ext4_filemap_fault,
 	.page_mkwrite   = ext4_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
+	struct inode *inode = file->f_inode;
+
+	/*
+	 * f_op->mmap must be called with vma=NULL before taking mmap_sem;
+	 * workaround for wrong i_mutex vs mmap_sem lock ordering in pfcache
+	 * (PSBM-23133) - vdavydov@
+	 */
+	if (!vma) {
+		if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+			mutex_lock(&inode->i_mutex);
+			ext4_truncate_data_csum(inode, -1);
+			mutex_unlock(&inode->i_mutex);
+		}
+		return 0;
+	}
+
 	file_accessed(file);
 	if (IS_DAX(file_inode(file))) {
 		vma->vm_ops = &ext4_dax_vm_ops;
@@ -381,6 +397,13 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
 		if (ret < 0)
 			return ret;
 	}
+
+	if ((filp->f_mode & FMODE_WRITE) && inode->i_mapping->i_peer_file) {
+		mutex_lock(&inode->i_mutex);
+		ext4_close_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	return dquot_file_open(inode, filp);
 }
 
@@ -402,7 +425,7 @@ static int ext4_file_open(struct inode * inode, struct file * filp)
  */
 static int ext4_find_unwritten_pgoff(struct inode *inode,
 				     int whence,
-				     struct ext4_map_blocks *map,
+				     ext4_lblk_t end_blk,
 				     loff_t *offset)
 {
 	struct pagevec pvec;
@@ -417,7 +440,7 @@ static int ext4_find_unwritten_pgoff(struct inode *inode,
 	blkbits = inode->i_sb->s_blocksize_bits;
 	startoff = *offset;
 	lastoff = startoff;
-	endoff = (loff_t)(map->m_lblk + map->m_len) << blkbits;
+	endoff = (loff_t)end_blk << blkbits;
 
 	index = startoff >> PAGE_CACHE_SHIFT;
 	end = endoff >> PAGE_CACHE_SHIFT;
@@ -535,12 +558,11 @@ out:
 static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 {
 	struct inode *inode = file->f_mapping->host;
-	struct ext4_map_blocks map;
 	struct extent_status es;
 	ext4_lblk_t start, last, end;
 	loff_t dataoff, isize;
 	int blkbits;
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&inode->i_mutex);
 
@@ -557,41 +579,32 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 	dataoff = offset;
 
 	do {
-		map.m_lblk = last;
-		map.m_len = end - last + 1;
-		ret = ext4_map_blocks(NULL, inode, &map, 0);
-		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
-			if (last != start)
-				dataoff = (loff_t)last << blkbits;
-			break;
+		ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+		if (ret <= 0) {
+			/* No extent found -> no data */
+			if (ret == 0)
+				ret = -ENXIO;
+			mutex_unlock(&inode->i_mutex);
+			return ret;
 		}
 
-		/*
-		 * If there is a delay extent at this offset,
-		 * it will be as a data.
-		 */
-		ext4_es_find_delayed_extent_range(inode, last, last, &es);
-		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
-			if (last != start)
-				dataoff = (loff_t)last << blkbits;
+		last = es.es_lblk;
+		if (last != start)
+			dataoff = (loff_t)last << blkbits;
+		if (!ext4_es_is_unwritten(&es))
 			break;
-		}
 
 		/*
 		 * If there is a unwritten extent at this offset,
 		 * it will be as a data or a hole according to page
 		 * cache that has data or not.
 		 */
-		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-			int unwritten;
-			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_DATA,
-							      &map, &dataoff);
-			if (unwritten)
-				break;
-		}
-
-		last++;
+		if (ext4_find_unwritten_pgoff(inode, SEEK_DATA,
+					      es.es_lblk + es.es_len, &dataoff))
+			break;
+		last += es.es_len;
 		dataoff = (loff_t)last << blkbits;
+		cond_resched();
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
@@ -608,12 +621,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize)
 static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 {
 	struct inode *inode = file->f_mapping->host;
-	struct ext4_map_blocks map;
 	struct extent_status es;
 	ext4_lblk_t start, last, end;
 	loff_t holeoff, isize;
 	int blkbits;
-	int ret = 0;
+	int ret;
 
 	mutex_lock(&inode->i_mutex);
 
@@ -630,44 +642,30 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize)
 	holeoff = offset;
 
 	do {
-		map.m_lblk = last;
-		map.m_len = end - last + 1;
-		ret = ext4_map_blocks(NULL, inode, &map, 0);
-		if (ret > 0 && !(map.m_flags & EXT4_MAP_UNWRITTEN)) {
-			last += ret;
-			holeoff = (loff_t)last << blkbits;
-			continue;
+		ret = ext4_get_next_extent(inode, last, end - last + 1, &es);
+		if (ret < 0) {
+			mutex_unlock(&inode->i_mutex);
+			return ret;
 		}
-
-		/*
-		 * If there is a delay extent at this offset,
-		 * we will skip this extent.
-		 */
-		ext4_es_find_delayed_extent_range(inode, last, last, &es);
-		if (es.es_len != 0 && in_range(last, es.es_lblk, es.es_len)) {
-			last = es.es_lblk + es.es_len;
-			holeoff = (loff_t)last << blkbits;
-			continue;
+		/* Found a hole? */
+		if (ret == 0 || es.es_lblk > last) {
+			if (last != start)
+				holeoff = (loff_t)last << blkbits;
+			break;
 		}
-
 		/*
 		 * If there is a unwritten extent at this offset,
 		 * it will be as a data or a hole according to page
 		 * cache that has data or not.
 		 */
-		if (map.m_flags & EXT4_MAP_UNWRITTEN) {
-			int unwritten;
-			unwritten = ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
-							      &map, &holeoff);
-			if (!unwritten) {
-				last += ret;
-				holeoff = (loff_t)last << blkbits;
-				continue;
-			}
-		}
+		if (ext4_es_is_unwritten(&es) &&
+		    ext4_find_unwritten_pgoff(inode, SEEK_HOLE,
+					      last + es.es_len, &holeoff))
+			break;
 
-		/* find a hole */
-		break;
+		last += es.es_len;
+		holeoff = (loff_t)last << blkbits;
+		cond_resched();
 	} while (last <= end);
 
 	mutex_unlock(&inode->i_mutex);
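
Both loops implement the lseek(2) SEEK_DATA/SEEK_HOLE semantics on top of
ext4_get_next_extent(): unwritten extents count as data only where the page
cache actually holds data. From userspace this is exercised the usual way:

	#define _GNU_SOURCE
	#include <unistd.h>
	#include <fcntl.h>

	off_t data = lseek(fd, 0, SEEK_DATA);	  /* first data at/after 0 */
	off_t hole = lseek(fd, data, SEEK_HOLE);  /* first hole after that */
	/* lseek() fails with ENXIO when no data (or no hole) exists past
	 * the given offset, matching the -ENXIO return above. */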
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -150,3 +150,111 @@ out:
 	trace_ext4_sync_file_exit(inode, ret);
 	return ret;
 }
+
+int ext4_sync_files(struct file **files, unsigned int *flags, unsigned int nr_files)
+{
+	struct super_block *sb;
+	journal_t *journal;
+	int err = 0, err2 = 0, i = 0, j = 0;
+	int force_commit = 0, datawriteback = 0;
+	tid_t commit_tid = 0;
+	int need_barrier = 0;
+
+	J_ASSERT(ext4_journal_current_handle() == NULL);
+	if (!nr_files)
+		return 0;
+
+	sb = files[0]->f_mapping->host->i_sb;
+	journal = EXT4_SB(sb)->s_journal;
+	if (sb->s_flags & MS_RDONLY) {
+		/* Make sure that we read the updated s_mount_flags value */
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
+		return 0;
+	}
+	for (i = 0; i < nr_files; i++) {
+		struct address_space * mapping = files[i]->f_mapping;
+		struct inode *inode = mapping->host;
+
+		BUG_ON(sb != inode->i_sb);
+		if (!mapping->nrpages)
+			continue;
+
+		err = filemap_fdatawrite(mapping);
+		if (err)
+			break;
+
+	}
+	/*
+	 * Even if the above returned an error, the pages may have been
+	 * written partially (e.g. -ENOSPC), so we wait for them.
+	 * But -EIO is a special case: it may indicate that the worst
+	 * thing (e.g. a bug) has happened, so we avoid waiting for it.
+	 */
+	if (err == -EIO)
+		goto out;
+
+	for (j = 0; j < i; j++) {
+		struct address_space * mapping = files[j]->f_mapping;
+		struct inode *inode = mapping->host;
+		struct ext4_inode_info *ei = EXT4_I(inode);
+		unsigned int datasync = flags[j];
+		tid_t tid;
+
+		if (mapping->nrpages) {
+			err2 = filemap_fdatawait(mapping);
+			if (!err || err2 == -EIO)
+				err = err2;
+		}
+
+		mutex_lock(&inode->i_mutex);
+		force_commit  |= ext4_should_journal_data(inode);
+		datawriteback |= ext4_should_writeback_data(inode);
+		tid = datasync ? ei->i_datasync_tid : ei->i_sync_tid;
+		mutex_unlock(&inode->i_mutex);
+		trace_ext4_sync_files_iterate(files[j]->f_path.dentry, tid, datasync);
+		if (j == 0 || !tid_geq(commit_tid, tid))
+			commit_tid = tid;
+	}
+
+	/* Ext4 specific stuff starts here */
+	if (!journal) {
+		return -ENOTSUPP;
+	} else if (force_commit) {
+		/* data=journal:
+		 *  filemap_fdatawrite won't do anything (the buffers are clean).
+		 *  ext4_force_commit will write the file data into the journal and
+		 *  will wait on that.
+		 *  filemap_fdatawait() will encounter a ton of newly-dirtied pages
+		 *  (they were dirtied by commit).  But that's OK - the blocks are
+		 *  safe in-journal, which is all fsync() needs to ensure.
+		 */
+		err2 = ext4_force_commit(sb);
+	} else {
+		/*
+		 * data=writeback,ordered:
+		 * The caller's filemap_fdatawrite()/wait will sync the data.
+		 * Metadata is in the journal, we wait for proper transaction to
+		 * commit here.
+		 */
+		if (journal->j_flags & JBD2_BARRIER &&
+		    !jbd2_trans_will_send_data_barrier(journal, commit_tid))
+			need_barrier = true;
+
+		err2 = jbd2_complete_transaction(journal, commit_tid);
+		/* Even if we had to wait for commit completion, that does not
+		 * mean a flush was issued after the data demanded by this
+		 * fsync had been written back. The commit could have been in
+		 * a state where it was effectively already done, yet not far
+		 * enough along for us to skip waiting.
+		 */
+		if (need_barrier)
+			err2 = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL);
+	}
+out:
+	trace_ext4_sync_files_exit(files[0]->f_path.dentry, commit_tid, need_barrier);
+	if (!err || err2 == -EIO)
+		err = err2;
+	return err;
+}
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1039,6 +1039,11 @@ got:
 	if (err)
 		goto fail_drop;
 
+	if (check_bd_full(inode, 1)) {
+		err = -ENOSPC;
+		goto fail_free_drop;
+	}
+
 	err = ext4_init_acl(handle, inode, dir);
 	if (err)
 		goto fail_free_drop;
--- a/fs/ext4/indirect.c
+++ b/fs/ext4/indirect.c
@@ -566,8 +566,23 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
 		goto got_it;
 	}
 
-	/* Next simple case - plain lookup or failed read of indirect block */
-	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
+	/* Next simple case - plain lookup failed */
+	if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+		unsigned epb = inode->i_sb->s_blocksize / sizeof(u32);
+		int i;
+
+		/* Count the number of blocks in the subtree under 'partial' */
+		count = 1;
+		for (i = 0; partial + i != chain + depth - 1; i++)
+			count *= epb;
+		/* Fill in the size of the hole we found */
+		map->m_pblk = 0;
+		map->m_len = min_t(unsigned int, map->m_len, count);
+		goto cleanup;
+	}
+
+	/* Failed read of indirect block */
+	if (err == -EIO)
 		goto cleanup;
 
 	/*
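
As a worked example of the hole-size computation above: with 4 KiB blocks,
epb = 4096 / 4 = 1024 entries per indirect block. If the chain broke at the
leaf level itself, the loop never runs and count stays 1 (a single-block
hole); one level above the leaf (a missing singly-indirect block) it runs
once and count = 1024; a missing doubly-indirect block gives
count = 1024 * 1024. In every case map->m_len is then clamped to the
caller's requested length, so the reported hole never exceeds the request.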
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -49,6 +49,8 @@
 
 #define MPAGE_DA_EXTENT_TAIL 0x01
 
+DEFINE_PER_CPU(unsigned long, ext4_bd_full_ratelimits) = 0;
+
 static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
 			      struct ext4_inode_info *ei)
 {
@@ -237,6 +239,8 @@ void ext4_evict_inode(struct inode *inode)
 	 * protection against it
 	 */
 	sb_start_intwrite(inode->i_sb);
+	if (inode->i_blocks && ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
 	handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE,
 				    ext4_blocks_for_truncate(inode)+3);
 	if (IS_ERR(handle)) {
@@ -457,13 +461,13 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
  * Otherwise, call with ext4_ind_map_blocks() to handle indirect mapping
  * based files
  *
- * On success, it returns the number of blocks being mapped or allocated.
- * if create==0 and the blocks are pre-allocated and unwritten block,
- * the result buffer head is unmapped. If the create ==1, it will make sure
- * the buffer head is mapped.
+ * On success, it returns the number of blocks being mapped or allocated.  If
+ * create==0 and the blocks are pre-allocated and unwritten, the resulting @map
+ * is marked as unwritten. If create == 1, it will mark @map as mapped.
  *
  * It returns 0 if plain look up failed (blocks have not been allocated), in
- * that case, buffer head is unmapped
+ * that case, @map is returned as unmapped but we still fill in map->m_len to
+ * indicate the length of the hole starting at map->m_lblk.
  *
  * It returns the error in case of allocation failure.
  */
@@ -507,6 +511,11 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
 				retval = map->m_len;
 			map->m_len = retval;
 		} else if (ext4_es_is_delayed(&es) || ext4_es_is_hole(&es)) {
+			map->m_pblk = 0;
+			retval = es.es_len - (map->m_lblk - es.es_lblk);
+			if (retval > map->m_len)
+				retval = map->m_len;
+			map->m_len = retval;
 			retval = 0;
 		} else {
 			BUG_ON(1);
@@ -960,6 +969,10 @@ retry_grab:
 	unlock_page(page);
 
 retry_journal:
+	/* Check csum window position before journal_start */
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_check_pos_data_csum(inode, pos);
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, needed_blocks);
 	if (IS_ERR(handle)) {
 		page_cache_release(page);
@@ -1078,6 +1091,10 @@ static int ext4_write_end(struct file *file,
 	 * page writeout could otherwise come in and zero beyond i_size.
 	 */
 	i_size_changed = ext4_update_inode_size(inode, pos + copied);
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
+
 	unlock_page(page);
 	page_cache_release(page);
 
@@ -1151,6 +1168,9 @@ static int ext4_journalled_write_end(struct file *file,
 	size_changed = ext4_update_inode_size(inode, pos + copied);
 	ext4_set_inode_state(inode, EXT4_STATE_JDATA);
 	EXT4_I(inode)->i_datasync_tid = handle->h_transaction->t_tid;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
 	unlock_page(page);
 	page_cache_release(page);
 
@@ -1208,6 +1228,11 @@ static int ext4_da_reserve_space(struct inode *inode, ext4_lblk_t lblock)
 	 * in order to allocate nrblocks
 	 * worse case is one extent per block
 	 */
+	if (check_bd_full(inode, 1)) {
+		dquot_release_reservation_block(inode, EXT4_C2B(sbi, 1));
+		return -ENOSPC;
+	}
+
 	spin_lock(&ei->i_block_reservation_lock);
 	/*
 	 * ext4_calc_metadata_amount() has side effects, which we have
@@ -2535,8 +2560,15 @@ static int ext4_nonda_switch(struct super_block *sb)
 	if (dirty_clusters && (free_clusters < 2 * dirty_clusters))
 		try_to_writeback_inodes_sb(sb, WB_REASON_FS_FREE_SPACE);
 
-	if (2 * free_clusters < 3 * dirty_clusters ||
-	    free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK)) {
+	/*
+	 * NOTE: Delalloc makes data=writeback mode safer, similar to ordered
+	 * mode, so stale blocks after a power failure are no longer an issue.
+	 * Do not disable delalloc here, so data=writeback keeps that
+	 * data-security guarantee.
+	 *								-dmon
+	 */
+	if (test_opt(sb, DATA_FLAGS) != EXT4_MOUNT_WRITEBACK_DATA &&
+	    (2 * free_clusters < 3 * dirty_clusters ||
+	     free_clusters < (dirty_clusters + EXT4_FREECLUSTERS_WATERMARK))) {
 		/*
 		 * free block count is less than 150% of dirty blocks
 		 * or free blocks is less than watermark
@@ -2610,6 +2642,10 @@ retry_grab:
 	 * of file which has an already mapped buffer.
 	 */
 retry_journal:
+	/* Check csum window position before journal_start */
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_check_pos_data_csum(inode, pos);
+
 	handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE,
 				ext4_da_write_credits(inode, pos, len));
 	if (IS_ERR(handle)) {
@@ -2722,6 +2758,9 @@ static int ext4_da_write_end(struct file *file,
 		ret2 = generic_write_end(file, mapping, pos, len, copied,
 							page, fsdata);
 
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_update_data_csum(inode, pos, copied, page);
+
 	copied = ret2;
 	if (ret2 < 0)
 		ret = ret2;
@@ -3265,6 +3304,10 @@ static ssize_t ext4_direct_IO(int rw, struct kiocb *iocb,
 	if (ext4_has_inline_data(inode))
 		return 0;
 
+	if ((rw == WRITE) &&
+	    ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, -1);
+
 	trace_ext4_direct_IO_enter(inode, offset, iov_length(iov, nr_segs), rw);
 	if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
 		ret = ext4_ext_direct_IO(rw, iocb, iov, offset, nr_segs);
@@ -3795,6 +3838,9 @@ void ext4_truncate(struct inode *inode)
 	if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
 		ext4_set_inode_state(inode, EXT4_STATE_DA_ALLOC_CLOSE);
 
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
+
 	if (ext4_has_inline_data(inode)) {
 		int has_inline = 1;
 
@@ -4290,10 +4336,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 		inode->i_op = &ext4_file_inode_operations;
 		inode->i_fop = &ext4_file_operations;
 		ext4_set_aops(inode);
+		if (test_opt2(sb, PFCACHE_CSUM) && !ext4_load_data_csum(inode))
+			ext4_open_pfcache(inode);
 	} else if (S_ISDIR(inode->i_mode)) {
 		inode->i_op = &ext4_dir_inode_operations.ops;
 		inode->i_fop = &ext4_dir_operations;
 		inode->i_flags |= S_IOPS_WRAPPER;
+		if (test_opt2(sb, PFCACHE_CSUM))
+			ext4_load_dir_csum(inode);
 	} else if (S_ISLNK(inode->i_mode)) {
 		if (ext4_inode_is_fast_symlink(inode)) {
 			inode->i_op = &ext4_fast_symlink_inode_operations;
@@ -4321,6 +4371,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
 	}
 	brelse(iloc.bh);
 	ext4_set_inode_flags(inode);
+	if (test_opt2(sb, PFCACHE_CSUM))
+		ext4_load_data_csum(inode);
 	unlock_new_inode(inode);
 	return inode;
 
@@ -4376,6 +4428,63 @@ static int ext4_inode_blocks_set(handle_t *handle,
 	return 0;
 }
 
+struct other_inode {
+	unsigned long		orig_ino;
+	struct ext4_inode	*raw_inode;
+};
+
+static int other_inode_match(struct inode * inode, unsigned long ino,
+			     void *data)
+{
+	struct other_inode *oi = (struct other_inode *) data;
+
+	if ((inode->i_ino != ino) ||
+	    (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+			       I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+	    ((inode->i_state & I_DIRTY_TIME) == 0))
+		return 0;
+	spin_lock(&inode->i_lock);
+	if (((inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW |
+				I_DIRTY_SYNC | I_DIRTY_DATASYNC)) == 0) &&
+	    (inode->i_state & I_DIRTY_TIME)) {
+		struct ext4_inode_info	*ei = EXT4_I(inode);
+
+		inode->i_state &= ~(I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED);
+
+		EXT4_INODE_SET_XTIME(i_ctime, inode, oi->raw_inode);
+		EXT4_INODE_SET_XTIME(i_mtime, inode, oi->raw_inode);
+		EXT4_INODE_SET_XTIME(i_atime, inode, oi->raw_inode);
+		ext4_inode_csum_set(inode, oi->raw_inode, ei);
+		spin_unlock(&inode->i_lock);
+		trace_ext4_other_inode_update_time(inode, oi->orig_ino);
+		return -1;
+	}
+	spin_unlock(&inode->i_lock);
+	return -1;
+}
+
+/*
+ * Opportunistically update the other time fields for other inodes in
+ * the same inode table block.
+ */
+static void ext4_update_other_inodes_time(struct super_block *sb,
+					  unsigned long orig_ino, char *buf)
+{
+	struct other_inode oi;
+	unsigned long ino;
+	int i, inodes_per_block = EXT4_SB(sb)->s_inodes_per_block;
+	int inode_size = EXT4_INODE_SIZE(sb);
+
+	oi.orig_ino = orig_ino;
+	ino = ((orig_ino - 1) & ~(inodes_per_block - 1)) + 1;
+	for (i = 0; i < inodes_per_block; i++, ino++, buf += inode_size) {
+		if (ino == orig_ino)
+			continue;
+		oi.raw_inode = (struct ext4_inode *) buf;
+		(void) find_inode_nowait(sb, ino, other_inode_match, &oi);
+	}
+}
+
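Inode numbers are one-based, so with inodes_per_block = 16 the table block
holding inode 16 spans inodes 1-16, and the computation above yields
((16 - 1) & ~15) + 1 = 1 as that block's first inode. The unadjusted form
(orig_ino & ~15) + 1 would wrongly give 17 for that case, which is why the
-1/+1 dance is needed.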
 /*
  * Post the struct inode info into an on-disk inode location in the
  * buffer-cache.  This gobbles the caller's reference to the
@@ -4487,6 +4596,9 @@ static int ext4_do_update_inode(handle_t *handle,
 	}
 
 	ext4_inode_csum_set(inode, raw_inode, ei);
+	if (inode->i_sb->s_flags & MS_LAZYTIME)
+		ext4_update_other_inodes_time(inode->i_sb, inode->i_ino,
+					      bh->b_data);
 
 	spin_unlock(&ei->i_raw_lock);
 
@@ -4721,6 +4833,9 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
 				goto err_out;
 		}
 		if (attr->ia_size != inode->i_size) {
+			if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+				ext4_truncate_data_csum(inode, attr->ia_size);
+
 			handle = ext4_journal_start(inode, EXT4_HT_INODE, 3);
 			if (IS_ERR(handle)) {
 				error = PTR_ERR(handle);
@@ -5082,11 +5197,17 @@ int ext4_mark_inode_dirty(handle_t *handle, struct inode *inode)
  * If the inode is marked synchronous, we don't honour that here - doing
  * so would cause a commit on atime updates, which we don't bother doing.
  * We handle synchronous inodes at the highest possible level.
+ *
+ * If only the I_DIRTY_TIME flag is set, we can skip everything.  If
+ * I_DIRTY_TIME and I_DIRTY_SYNC are set, the only inode fields we need
+ * to copy into the on-disk inode structure are the timestamp fields.
  */
 void ext4_dirty_inode(struct inode *inode, int flags)
 {
 	handle_t *handle;
 
+	if (flags == I_DIRTY_TIME)
+		return;
 	handle = ext4_journal_start(inode, EXT4_HT_INODE, 2);
 	if (IS_ERR(handle))
 		goto out;
@@ -5312,3 +5433,70 @@ int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 
 	return err;
 }
+
+/*
+ * Find the first extent at or after @lblk in an inode that is not a hole.
+ * Search for @map_len blocks at most. The extent is returned in @result.
+ *
+ * The function returns 1 if we found an extent. The function returns 0 in
+ * case there is no extent at or after @lblk and in that case also sets
+ * @result->es_len to 0. In case of error, the error code is returned.
+ */
+int ext4_get_next_extent(struct inode *inode, ext4_lblk_t lblk,
+			 unsigned int map_len, struct extent_status *result)
+{
+	struct ext4_map_blocks map;
+	struct extent_status es = {};
+	int ret;
+
+	map.m_lblk = lblk;
+	map.m_len = map_len;
+
+	/*
+	 * For non-extent based files this loop may iterate several times since
+	 * we do not determine full hole size.
+	 */
+	while (map.m_len > 0) {
+		ret = ext4_map_blocks(NULL, inode, &map, 0);
+		if (ret < 0)
+			return ret;
+		/* There's extent covering m_lblk? Just return it. */
+		if (ret > 0) {
+			int status;
+
+			ext4_es_store_pblock(result, map.m_pblk);
+			result->es_lblk = map.m_lblk;
+			result->es_len = map.m_len;
+			if (map.m_flags & EXT4_MAP_UNWRITTEN)
+				status = EXTENT_STATUS_UNWRITTEN;
+			else
+				status = EXTENT_STATUS_WRITTEN;
+			ext4_es_store_status(result, status);
+			return 1;
+		}
+		ext4_es_find_delayed_extent_range(inode, map.m_lblk,
+						  map.m_lblk + map.m_len - 1,
+						  &es);
+		/* Is delalloc data before next block in extent tree? */
+		if (es.es_len && es.es_lblk < map.m_lblk + map.m_len) {
+			ext4_lblk_t offset = 0;
+
+			if (es.es_lblk < lblk)
+				offset = lblk - es.es_lblk;
+			result->es_lblk = es.es_lblk + offset;
+			ext4_es_store_pblock(result,
+					     ext4_es_pblock(&es) + offset);
+			result->es_len = es.es_len - offset;
+			ext4_es_store_status(result, ext4_es_status(&es));
+
+			return 1;
+		}
+		/* There's a hole at m_lblk, advance us after it */
+		map.m_lblk += map.m_len;
+		map_len -= map.m_len;
+		map.m_len = map_len;
+		cond_resched();
+	}
+	result->es_len = 0;
+	return 0;
+}
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -198,6 +198,60 @@ journal_err_out:
 	return err;
 }
 
+static int ext4_open_balloon(struct super_block *sb, struct vfsmount *mnt)
+{
+	struct inode *balloon_ino;
+	int err, fd;
+	struct file *filp;
+	struct dentry *de;
+	struct path path;
+	fmode_t mode;
+
+	balloon_ino = EXT4_SB(sb)->s_balloon_ino;
+	err = -ENOENT;
+	if (balloon_ino == NULL)
+		goto err;
+
+	err = fd = get_unused_fd();
+	if (err < 0)
+		goto err_fd;
+
+	__iget(balloon_ino);
+	de = d_obtain_alias(balloon_ino);
+	err = PTR_ERR(de);
+	if (IS_ERR(de))
+		goto err_de;
+
+	path.dentry = de;
+	path.mnt = mntget(mnt);
+	err = mnt_want_write(path.mnt);
+	if (err)
+		mode = FMODE_READ;
+	else
+		mode = FMODE_READ | FMODE_WRITE;
+	filp = alloc_file(&path, mode,
+			&ext4_file_operations);
+	if (mode & FMODE_WRITE)
+		mnt_drop_write(path.mnt);
+	if (IS_ERR(filp)) {
+		err = PTR_ERR(filp);
+		goto err_filp;
+	}
+
+	filp->f_flags |= O_LARGEFILE;
+	fd_install(fd, filp);
+	return fd;
+
+err_filp:
+	path_put(&path);
+err_de:
+	put_unused_fd(fd);
+err_fd:
+	/* nothing */
+err:
+	return err;
+}
+
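Userspace obtains a descriptor for the hidden balloon inode through the
EXT4_IOC_OPEN_BALLOON ioctl wired up further down. A hedged sketch; the
inflate-by-fallocate convention is the ploop one and is assumed here, not
shown in this hunk:

	int mntfd = open("/mnt/ext4", O_RDONLY | O_DIRECTORY);
	int bfd = ioctl(mntfd, EXT4_IOC_OPEN_BALLOON, 0); /* CAP_SYS_ADMIN */

	/* Grow the balloon by 64 MiB, pinning that much space: */
	fallocate(bfd, FALLOC_FL_KEEP_SIZE, 0, 64 << 20);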
 long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
 	struct inode *inode = file_inode(filp);
@@ -258,7 +312,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 		 * the relevant capability.
 		 */
 		if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) {
-			if (!capable(CAP_SYS_RESOURCE))
+			if (!capable(CAP_SYS_ADMIN))
 				goto flags_out;
 		}
 		if ((flags ^ oldflags) & EXT4_EXTENTS_FL)
@@ -591,6 +645,66 @@ resizefs_out:
 		ext4_resize_end(sb);
 		return err;
 	}
+	case EXT4_IOC_SET_RSV_BLOCKS: {
+		ext4_fsblk_t n_blocks_count;
+		struct super_block *sb = inode->i_sb;
+		handle_t *handle;
+		int err = 0, err2 = 0;
+
+		if (copy_from_user(&n_blocks_count, (__u64 __user *)arg,
+				   sizeof(__u64))) {
+			return -EFAULT;
+		}
+
+		if (n_blocks_count > MAX_32_NUM &&
+		    !EXT4_HAS_INCOMPAT_FEATURE(sb,
+					       EXT4_FEATURE_INCOMPAT_64BIT)) {
+			ext4_msg(sb, KERN_ERR,
+				 "File system only supports 32-bit block numbers");
+			return -EOPNOTSUPP;
+		}
+
+		if (n_blocks_count > ext4_blocks_count(EXT4_SB(sb)->s_es))
+			return -EINVAL;
+
+		err = ext4_resize_begin(sb);
+		if (err)
+			return err;
+
+		err = mnt_want_write(filp->f_path.mnt);
+		if (err)
+			goto resize_out;
+
+		handle = ext4_journal_start_sb(sb, EXT4_HT_MISC, 1);
+		if (IS_ERR(handle)) {
+			err = PTR_ERR(handle);
+			goto mnt_out;
+		}
+		err = ext4_journal_get_write_access(handle, EXT4_SB(sb)->s_sbh);
+		if (err)
+			goto journal_out;
+		ext4_r_blocks_count_set(EXT4_SB(sb)->s_es, n_blocks_count);
+		ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
+journal_out:
+		err2 = ext4_journal_stop(handle);
+		if (err == 0)
+			err = err2;
+
+		if (!err && EXT4_SB(sb)->s_journal) {
+			jbd2_journal_lock_updates(EXT4_SB(sb)->s_journal);
+			err2 = jbd2_journal_flush(EXT4_SB(sb)->s_journal);
+			jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
+		}
+		if (err == 0)
+			err = err2;
+mnt_out:
+		mnt_drop_write(filp->f_path.mnt);
+resize_out:
+		ext4_resize_end(sb);
+		return err;
+	}
+
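EXT4_IOC_SET_RSV_BLOCKS takes a plain __u64 with the new reserved block
count, stores it in the superblock's r_blocks_count, and flushes the
journal so the change is durable. Usage sketch:

	__u64 rsv = 16384;	/* reserve 16384 blocks */

	if (ioctl(fd, EXT4_IOC_SET_RSV_BLOCKS, &rsv) < 0)
		perror("EXT4_IOC_SET_RSV_BLOCKS");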
 
 	case FITRIM:
 	{
@@ -623,6 +737,100 @@ resizefs_out:
 	case EXT4_IOC_PRECACHE_EXTENTS:
 		return ext4_ext_precache(inode);
 
+	case EXT4_IOC_OPEN_BALLOON:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EACCES;
+
+		return ext4_open_balloon(inode->i_sb, filp->f_path.mnt);
+
+	case FS_IOC_PFCACHE_OPEN:
+	{
+		int err;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		mutex_lock(&inode->i_mutex);
+		err = ext4_open_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+
+		return err;
+	}
+	case FS_IOC_PFCACHE_CLOSE:
+	{
+		int err;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		mutex_lock(&inode->i_mutex);
+		err = ext4_close_pfcache(inode);
+		mutex_unlock(&inode->i_mutex);
+
+		return err;
+	}
+	case FS_IOC_PFCACHE_DUMP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		return ext4_dump_pfcache(inode->i_sb,
+				(struct pfcache_dump_request __user *) arg);
+	case EXT4_IOC_MFSYNC:
+	{
+		struct ext4_ioc_mfsync_info mfsync;
+		struct file **filpp;
+		unsigned int *flags;
+		__u32 __user *usr_fd;
+		int i, err;
+
+		if (!ve_is_super(get_exec_env()))
+			return -ENOTSUPP;
+		if (copy_from_user(&mfsync, (struct ext4_ioc_mfsync_info *)arg,
+				   sizeof(mfsync)))
+			return -EFAULT;
+
+		if (mfsync.size == 0)
+			return 0;
+		if (mfsync.size > NR_FILE)
+			return -ENFILE;
+
+		usr_fd = (__u32 __user *) (arg + sizeof(__u32));
+
+		filpp = kzalloc(mfsync.size * sizeof(*filpp), GFP_KERNEL);
+		if (!filpp)
+			return -ENOMEM;
+		flags = kzalloc(mfsync.size * sizeof(*flags), GFP_KERNEL);
+		if (!flags) {
+			kfree(filpp);
+			return -ENOMEM;
+		}
+		for (i = 0; i < mfsync.size; i++) {
+			int fd;
+			int ret;
+
+			err = -EFAULT;
+			ret = get_user(fd, usr_fd + i);
+			if (ret)
+				goto mfsync_fput;
+
+			/* bit 31 set (a negative fd) requests fdatasync */
+			flags[i] = (fd & (1 << 31)) != 0;
+			fd &= ~(1 << 31);
+
+			err = -EBADF;
+			filpp[i] = fget(fd);
+			if (!filpp[i])
+				goto mfsync_fput;
+		}
+		err = ext4_sync_files(filpp, flags, mfsync.size);
+mfsync_fput:
+		for (i = 0; i < mfsync.size; i++)
+			if (filpp[i])
+				fput(filpp[i]);
+		kfree(filpp);
+		kfree(flags);
+		return err;
+	}
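Judging by the usr_fd arithmetic above, the EXT4_IOC_MFSYNC argument is a
__u32 count immediately followed by that many __u32 file descriptors, with
bit 31 of each descriptor requesting fdatasync instead of fsync. A hedged
userspace sketch of syncing two files in one call (layout assumed from the
code above):

	struct {
		__u32 size;
		__u32 fd[2];
	} m = {
		.size = 2,
		.fd = { fd0, fd1 | (1U << 31) },	/* fd1: fdatasync only */
	};

	if (ioctl(fd0, EXT4_IOC_MFSYNC, &m) < 0)
		perror("EXT4_IOC_MFSYNC");

All files passed in one call must live on the same ext4 superblock (the
code BUG()s otherwise), and the caller must be in the host environment
per the ve_is_super() check.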
 	default:
 		return -ENOTTY;
 	}
@@ -688,6 +896,10 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
 	case EXT4_IOC_RESIZE_FS:
 	case EXT4_IOC_PRECACHE_EXTENTS:
 		break;
+	case FS_IOC_PFCACHE_OPEN:
+	case FS_IOC_PFCACHE_CLOSE:
+	case FS_IOC_PFCACHE_DUMP:
+		break;
 	default:
 		return -ENOIOCTLCMD;
 	}
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4447,6 +4447,12 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
 			*errp = -EDQUOT;
 			goto out;
 		}
+
+		if (check_bd_full(ar->inode, inquota)) {
+			ar->len = 0;
+			*errp = -ENOSPC;
+			goto out;
+		}
 	}
 
 	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -27,120 +27,26 @@
  * @lblock:	logical block number to find an extent path
  * @path:	pointer to an extent path pointer (for output)
  *
- * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value
+ * ext4_find_extent wrapper. Return 0 on success, or a negative error value
  * on failure.
  */
 static inline int
 get_ext_path(struct inode *inode, ext4_lblk_t lblock,
-		struct ext4_ext_path **orig_path)
+		struct ext4_ext_path **ppath)
 {
-	int ret = 0;
 	struct ext4_ext_path *path;
 
-	path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE);
+	path = ext4_ext_find_extent(inode, lblock, *ppath, EXT4_EX_NOCACHE);
 	if (IS_ERR(path))
-		ret = PTR_ERR(path);
-	else if (path[ext_depth(inode)].p_ext == NULL)
-		ret = -ENODATA;
-	else
-		*orig_path = path;
-
-	return ret;
-}
-
-/**
- * copy_extent_status - Copy the extent's initialization status
- *
- * @src:	an extent for getting initialize status
- * @dest:	an extent to be set the status
- */
-static void
-copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest)
-{
-	if (ext4_ext_is_unwritten(src))
-		ext4_ext_mark_unwritten(dest);
-	else
-		dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest));
-}
-
-/**
- * mext_next_extent - Search for the next extent and set it to "extent"
- *
- * @inode:	inode which is searched
- * @path:	this will obtain data for the next extent
- * @extent:	pointer to the next extent we have just gotten
- *
- * Search the next extent in the array of ext4_ext_path structure (@path)
- * and set it to ext4_extent structure (@extent). In addition, the member of
- * @path (->p_ext) also points the next extent. Return 0 on success, 1 if
- * ext4_ext_path structure refers to the last extent, or a negative error
- * value on failure.
- */
-int
-mext_next_extent(struct inode *inode, struct ext4_ext_path *path,
-		      struct ext4_extent **extent)
-{
-	struct ext4_extent_header *eh;
-	int ppos, leaf_ppos = path->p_depth;
-
-	ppos = leaf_ppos;
-	if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) {
-		/* leaf block */
-		*extent = ++path[ppos].p_ext;
-		path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext);
-		return 0;
-	}
-
-	while (--ppos >= 0) {
-		if (EXT_LAST_INDEX(path[ppos].p_hdr) >
-		    path[ppos].p_idx) {
-			int cur_ppos = ppos;
-
-			/* index block */
-			path[ppos].p_idx++;
-			path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx);
-			if (path[ppos+1].p_bh)
-				brelse(path[ppos+1].p_bh);
-			path[ppos+1].p_bh =
-				sb_bread(inode->i_sb, path[ppos].p_block);
-			if (!path[ppos+1].p_bh)
-				return -EIO;
-			path[ppos+1].p_hdr =
-				ext_block_hdr(path[ppos+1].p_bh);
-
-			/* Halfway index block */
-			while (++cur_ppos < leaf_ppos) {
-				path[cur_ppos].p_idx =
-					EXT_FIRST_INDEX(path[cur_ppos].p_hdr);
-				path[cur_ppos].p_block =
-					ext4_idx_pblock(path[cur_ppos].p_idx);
-				if (path[cur_ppos+1].p_bh)
-					brelse(path[cur_ppos+1].p_bh);
-				path[cur_ppos+1].p_bh = sb_bread(inode->i_sb,
-					path[cur_ppos].p_block);
-				if (!path[cur_ppos+1].p_bh)
-					return -EIO;
-				path[cur_ppos+1].p_hdr =
-					ext_block_hdr(path[cur_ppos+1].p_bh);
-			}
-
-			path[leaf_ppos].p_ext = *extent = NULL;
-
-			eh = path[leaf_ppos].p_hdr;
-			if (le16_to_cpu(eh->eh_entries) == 0)
-				/* empty leaf is found */
-				return -ENODATA;
-
-			/* leaf block */
-			path[leaf_ppos].p_ext = *extent =
-				EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr);
-			path[leaf_ppos].p_block =
-					ext4_ext_pblock(path[leaf_ppos].p_ext);
-			return 0;
-		}
+		return PTR_ERR(path);
+	if (path[ext_depth(inode)].p_ext == NULL) {
+		ext4_ext_drop_refs(path);
+		kfree(path);
+		*ppath = NULL;
+		return -ENODATA;
 	}
-	/* We found the last extent */
-	return 1;
+	*ppath = path;
+	return 0;
 }
 
 /**
@@ -178,417 +84,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode,
 }
 
 /**
- * mext_insert_across_blocks - Insert extents across leaf block
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @o_start:		first original extent to be changed
- * @o_end:		last original extent to be changed
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
- *
- * Allocate a new leaf block and insert extents into it. Return 0 on success,
- * or a negative error value on failure.
- */
-static int
-mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode,
-		struct ext4_extent *o_start, struct ext4_extent *o_end,
-		struct ext4_extent *start_ext, struct ext4_extent *new_ext,
-		struct ext4_extent *end_ext)
-{
-	struct ext4_ext_path *orig_path = NULL;
-	ext4_lblk_t eblock = 0;
-	int new_flag = 0;
-	int end_flag = 0;
-	int err = 0;
-
-	if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) {
-		if (o_start == o_end) {
-
-			/*       start_ext   new_ext    end_ext
-			 * donor |---------|-----------|--------|
-			 * orig  |------------------------------|
-			 */
-			end_flag = 1;
-		} else {
-
-			/*       start_ext   new_ext   end_ext
-			 * donor |---------|----------|---------|
-			 * orig  |---------------|--------------|
-			 */
-			o_end->ee_block = end_ext->ee_block;
-			o_end->ee_len = end_ext->ee_len;
-			ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-		}
-
-		o_start->ee_len = start_ext->ee_len;
-		eblock = le32_to_cpu(start_ext->ee_block);
-		new_flag = 1;
-
-	} else if (start_ext->ee_len && new_ext->ee_len &&
-		   !end_ext->ee_len && o_start == o_end) {
-
-		/*	 start_ext	new_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_start->ee_len = start_ext->ee_len;
-		eblock = le32_to_cpu(start_ext->ee_block);
-		new_flag = 1;
-
-	} else if (!start_ext->ee_len && new_ext->ee_len &&
-		   end_ext->ee_len && o_start == o_end) {
-
-		/*	  new_ext	end_ext
-		 * donor |--------------|---------------|
-		 * orig  |------------------------------|
-		 */
-		o_end->ee_block = end_ext->ee_block;
-		o_end->ee_len = end_ext->ee_len;
-		ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext));
-
-		/*
-		 * Set 0 to the extent block if new_ext was
-		 * the first block.
-		 */
-		if (new_ext->ee_block)
-			eblock = le32_to_cpu(new_ext->ee_block);
-
-		new_flag = 1;
-	} else {
-		ext4_debug("ext4 move extent: Unexpected insert case\n");
-		return -EIO;
-	}
-
-	if (new_flag) {
-		err = get_ext_path(orig_inode, eblock, &orig_path);
-		if (err)
-			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					orig_path, new_ext, 0))
-			goto out;
-	}
-
-	if (end_flag) {
-		err = get_ext_path(orig_inode,
-				le32_to_cpu(end_ext->ee_block) - 1, &orig_path);
-		if (err)
-			goto out;
-
-		if (ext4_ext_insert_extent(handle, orig_inode,
-					   orig_path, end_ext, 0))
-			goto out;
-	}
-out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-
-	return err;
-
-}
-
-/**
- * mext_insert_inside_block - Insert new extent to the extent block
- *
- * @o_start:		first original extent to be moved
- * @o_end:		last original extent to be moved
- * @start_ext:		first new extent to be inserted
- * @new_ext:		middle of new extent to be inserted
- * @end_ext:		last new extent to be inserted
- * @eh:			extent header of target leaf block
- * @range_to_move:	used to decide how to insert extent
- *
- * Insert extents into the leaf block. The extent (@o_start) is overwritten
- * by inserted extents.
- */
-static void
-mext_insert_inside_block(struct ext4_extent *o_start,
-			      struct ext4_extent *o_end,
-			      struct ext4_extent *start_ext,
-			      struct ext4_extent *new_ext,
-			      struct ext4_extent *end_ext,
-			      struct ext4_extent_header *eh,
-			      int range_to_move)
-{
-	int i = 0;
-	unsigned long len;
-
-	/* Move the existing extents */
-	if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) {
-		len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) -
-			(unsigned long)(o_end + 1);
-		memmove(o_end + 1 + range_to_move, o_end + 1, len);
-	}
-
-	/* Insert start entry */
-	if (start_ext->ee_len)
-		o_start[i++].ee_len = start_ext->ee_len;
-
-	/* Insert new entry */
-	if (new_ext->ee_len) {
-		o_start[i] = *new_ext;
-		ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext));
-	}
-
-	/* Insert end entry */
-	if (end_ext->ee_len)
-		o_start[i] = *end_ext;
-
-	/* Increment the total entries counter on the extent block */
-	le16_add_cpu(&eh->eh_entries, range_to_move);
-}
-
-/**
- * mext_insert_extents - Insert new extent
- *
- * @handle:	journal handle
- * @orig_inode:	original inode
- * @orig_path:	path indicates first extent to be changed
- * @o_start:	first original extent to be changed
- * @o_end:	last original extent to be changed
- * @start_ext:	first new extent to be inserted
- * @new_ext:	middle of new extent to be inserted
- * @end_ext:	last new extent to be inserted
- *
- * Call the function to insert extents. If we cannot add more extents into
- * the leaf block, we call mext_insert_across_blocks() to create a
- * new leaf block. Otherwise call mext_insert_inside_block(). Return 0
- * on success, or a negative error value on failure.
- */
-static int
-mext_insert_extents(handle_t *handle, struct inode *orig_inode,
-			 struct ext4_ext_path *orig_path,
-			 struct ext4_extent *o_start,
-			 struct ext4_extent *o_end,
-			 struct ext4_extent *start_ext,
-			 struct ext4_extent *new_ext,
-			 struct ext4_extent *end_ext)
-{
-	struct  ext4_extent_header *eh;
-	unsigned long need_slots, slots_range;
-	int	range_to_move, depth, ret;
-
-	/*
-	 * The extents need to be inserted
-	 * start_extent + new_extent + end_extent.
-	 */
-	need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) +
-		(new_ext->ee_len ? 1 : 0);
-
-	/* The number of slots between start and end */
-	slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1)
-		/ sizeof(struct ext4_extent);
-
-	/* Range to move the end of extent */
-	range_to_move = need_slots - slots_range;
-	depth = orig_path->p_depth;
-	orig_path += depth;
-	eh = orig_path->p_hdr;
-
-	if (depth) {
-		/* Register to journal */
-		BUFFER_TRACE(orig_path->p_bh, "get_write_access");
-		ret = ext4_journal_get_write_access(handle, orig_path->p_bh);
-		if (ret)
-			return ret;
-	}
-
-	/* Expansion */
-	if (range_to_move > 0 &&
-		(range_to_move > le16_to_cpu(eh->eh_max)
-			- le16_to_cpu(eh->eh_entries))) {
-
-		ret = mext_insert_across_blocks(handle, orig_inode, o_start,
-					o_end, start_ext, new_ext, end_ext);
-		if (ret < 0)
-			return ret;
-	} else
-		mext_insert_inside_block(o_start, o_end, start_ext, new_ext,
-						end_ext, eh, range_to_move);
-
-	return ext4_ext_dirty(handle, orig_inode, orig_path);
-}
-
-/**
- * mext_leaf_block - Move one leaf extent block into the inode.
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @orig_path:		path indicates first extent to be changed
- * @dext:		donor extent
- * @from:		start offset on the target file
- *
- * In order to insert extents into the leaf block, we must divide the extent
- * in the leaf block into three extents. The one is located to be inserted
- * extents, and the others are located around it.
- *
- * Therefore, this function creates structures to save extents of the leaf
- * block, and inserts extents by calling mext_insert_extents() with
- * created extents. Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_leaf_block(handle_t *handle, struct inode *orig_inode,
-		     struct ext4_ext_path *orig_path, struct ext4_extent *dext,
-		     ext4_lblk_t *from)
-{
-	struct ext4_extent *oext, *o_start, *o_end, *prev_ext;
-	struct ext4_extent new_ext, start_ext, end_ext;
-	ext4_lblk_t new_ext_end;
-	int oext_alen, new_ext_alen, end_ext_alen;
-	int depth = ext_depth(orig_inode);
-	int ret;
-
-	start_ext.ee_block = end_ext.ee_block = 0;
-	o_start = o_end = oext = orig_path[depth].p_ext;
-	oext_alen = ext4_ext_get_actual_len(oext);
-	start_ext.ee_len = end_ext.ee_len = 0;
-
-	new_ext.ee_block = cpu_to_le32(*from);
-	ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext));
-	new_ext.ee_len = dext->ee_len;
-	new_ext_alen = ext4_ext_get_actual_len(&new_ext);
-	new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1;
-
-	/*
-	 * Case: original extent is first
-	 * oext      |--------|
-	 * new_ext      |--|
-	 * start_ext |--|
-	 */
-	if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) &&
-		le32_to_cpu(new_ext.ee_block) <
-		le32_to_cpu(oext->ee_block) + oext_alen) {
-		start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) -
-					       le32_to_cpu(oext->ee_block));
-		start_ext.ee_block = oext->ee_block;
-		copy_extent_status(oext, &start_ext);
-	} else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) {
-		prev_ext = oext - 1;
-		/*
-		 * We can merge new_ext into previous extent,
-		 * if these are contiguous and same extent type.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode, prev_ext,
-					       &new_ext)) {
-			o_start = prev_ext;
-			start_ext.ee_len = cpu_to_le16(
-				ext4_ext_get_actual_len(prev_ext) +
-				new_ext_alen);
-			start_ext.ee_block = oext->ee_block;
-			copy_extent_status(prev_ext, &start_ext);
-			new_ext.ee_len = 0;
-		}
-	}
-
-	/*
-	 * Case: new_ext_end must be less than oext
-	 * oext      |-----------|
-	 * new_ext       |-------|
-	 */
-	if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) {
-		EXT4_ERROR_INODE(orig_inode,
-			"new_ext_end(%u) should be less than or equal to "
-			"oext->ee_block(%u) + oext_alen(%d) - 1",
-			new_ext_end, le32_to_cpu(oext->ee_block),
-			oext_alen);
-		ret = -EIO;
-		goto out;
-	}
-
-	/*
-	 * Case: new_ext is smaller than original extent
-	 * oext    |---------------|
-	 * new_ext |-----------|
-	 * end_ext             |---|
-	 */
-	if (le32_to_cpu(oext->ee_block) <= new_ext_end &&
-		new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) {
-		end_ext.ee_len =
-			cpu_to_le16(le32_to_cpu(oext->ee_block) +
-			oext_alen - 1 - new_ext_end);
-		copy_extent_status(oext, &end_ext);
-		end_ext_alen = ext4_ext_get_actual_len(&end_ext);
-		ext4_ext_store_pblock(&end_ext,
-			(ext4_ext_pblock(o_end) + oext_alen - end_ext_alen));
-		end_ext.ee_block =
-			cpu_to_le32(le32_to_cpu(o_end->ee_block) +
-			oext_alen - end_ext_alen);
-	}
-
-	ret = mext_insert_extents(handle, orig_inode, orig_path, o_start,
-				o_end, &start_ext, &new_ext, &end_ext);
-out:
-	return ret;
-}
-
-/**
- * mext_calc_swap_extents - Calculate extents for extent swapping.
- *
- * @tmp_dext:		the extent that will belong to the original inode
- * @tmp_oext:		the extent that will belong to the donor inode
- * @orig_off:		block offset of original inode
- * @donor_off:		block offset of donor inode
- * @max_count:		the maximum length of extents
- *
- * Return 0 on success, or a negative error value on failure.
- */
-static int
-mext_calc_swap_extents(struct ext4_extent *tmp_dext,
-			      struct ext4_extent *tmp_oext,
-			      ext4_lblk_t orig_off, ext4_lblk_t donor_off,
-			      ext4_lblk_t max_count)
-{
-	ext4_lblk_t diff, orig_diff;
-	struct ext4_extent dext_old, oext_old;
-
-	BUG_ON(orig_off != donor_off);
-
-	/* original and donor extents have to cover the same block offset */
-	if (orig_off < le32_to_cpu(tmp_oext->ee_block) ||
-	    le32_to_cpu(tmp_oext->ee_block) +
-			ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off)
-		return -ENODATA;
-
-	if (orig_off < le32_to_cpu(tmp_dext->ee_block) ||
-	    le32_to_cpu(tmp_dext->ee_block) +
-			ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off)
-		return -ENODATA;
-
-	dext_old = *tmp_dext;
-	oext_old = *tmp_oext;
-
-	/* When tmp_dext is too large, pick up the target range. */
-	diff = donor_off - le32_to_cpu(tmp_dext->ee_block);
-
-	ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff);
-	le32_add_cpu(&tmp_dext->ee_block, diff);
-	le16_add_cpu(&tmp_dext->ee_len, -diff);
-
-	if (max_count < ext4_ext_get_actual_len(tmp_dext))
-		tmp_dext->ee_len = cpu_to_le16(max_count);
-
-	orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block);
-	ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff);
-
-	/* Adjust extent length if donor extent is larger than orig */
-	if (ext4_ext_get_actual_len(tmp_dext) >
-	    ext4_ext_get_actual_len(tmp_oext) - orig_diff)
-		tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) -
-						orig_diff);
-
-	tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext));
-
-	copy_extent_status(&oext_old, tmp_dext);
-	copy_extent_status(&dext_old, tmp_oext);
-
-	return 0;
-}
-
-/**
  * mext_check_coverage - Check that all extents in range have the same type
  *
  * @inode:		inode in question
@@ -619,171 +114,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count,
 	}
 	ret = 1;
 out:
-	if (path) {
-		ext4_ext_drop_refs(path);
-		kfree(path);
-	}
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	return ret;
 }
 
 /**
- * mext_replace_branches - Replace original extents with new extents
- *
- * @handle:		journal handle
- * @orig_inode:		original inode
- * @donor_inode:	donor inode
- * @from:		block offset of orig_inode
- * @count:		block count to be replaced
- * @err:		pointer to save return value
- *
- * Replace original inode extents and donor inode extents page by page.
- * We implement this replacement in the following three steps:
- * 1. Save the block information of original and donor inodes into
- *    dummy extents.
- * 2. Change the block information of original inode to point at the
- *    donor inode blocks.
- * 3. Change the block information of donor inode to point at the saved
- *    original inode blocks in the dummy extents.
- *
- * Return replaced block count.
- */
-static int
-mext_replace_branches(handle_t *handle, struct inode *orig_inode,
-			   struct inode *donor_inode, ext4_lblk_t from,
-			   ext4_lblk_t count, int *err)
-{
-	struct ext4_ext_path *orig_path = NULL;
-	struct ext4_ext_path *donor_path = NULL;
-	struct ext4_extent *oext, *dext;
-	struct ext4_extent tmp_dext, tmp_oext;
-	ext4_lblk_t orig_off = from, donor_off = from;
-	int depth;
-	int replaced_count = 0;
-	int dext_alen;
-
-	*err = ext4_es_remove_extent(orig_inode, from, count);
-	if (*err)
-		goto out;
-
-	*err = ext4_es_remove_extent(donor_inode, from, count);
-	if (*err)
-		goto out;
-
-	/* Get the original extent for the block "orig_off" */
-	*err = get_ext_path(orig_inode, orig_off, &orig_path);
-	if (*err)
-		goto out;
-
-	/* Get the donor extent for the head */
-	*err = get_ext_path(donor_inode, donor_off, &donor_path);
-	if (*err)
-		goto out;
-	depth = ext_depth(orig_inode);
-	oext = orig_path[depth].p_ext;
-	tmp_oext = *oext;
-
-	depth = ext_depth(donor_inode);
-	dext = donor_path[depth].p_ext;
-	if (unlikely(!dext))
-		goto missing_donor_extent;
-	tmp_dext = *dext;
-
-	*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-				      donor_off, count);
-	if (*err)
-		goto out;
-
-	/* Loop for the donor extents */
-	while (1) {
-		/* The extent for donor must be found. */
-		if (unlikely(!dext)) {
-		missing_donor_extent:
-			EXT4_ERROR_INODE(donor_inode,
-				   "The extent for donor must be found");
-			*err = -EIO;
-			goto out;
-		} else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) {
-			EXT4_ERROR_INODE(donor_inode,
-				"Donor offset(%u) and the first block of donor "
-				"extent(%u) should be equal",
-				donor_off,
-				le32_to_cpu(tmp_dext.ee_block));
-			*err = -EIO;
-			goto out;
-		}
-
-		/* Set donor extent to orig extent */
-		*err = mext_leaf_block(handle, orig_inode,
-					   orig_path, &tmp_dext, &orig_off);
-		if (*err)
-			goto out;
-
-		/* Set orig extent to donor extent */
-		*err = mext_leaf_block(handle, donor_inode,
-					   donor_path, &tmp_oext, &donor_off);
-		if (*err)
-			goto out;
-
-		dext_alen = ext4_ext_get_actual_len(&tmp_dext);
-		replaced_count += dext_alen;
-		donor_off += dext_alen;
-		orig_off += dext_alen;
-
-		BUG_ON(replaced_count > count);
-		/* Already moved the expected blocks */
-		if (replaced_count >= count)
-			break;
-
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		*err = get_ext_path(orig_inode, orig_off, &orig_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(orig_inode);
-		oext = orig_path[depth].p_ext;
-		tmp_oext = *oext;
-
-		if (donor_path)
-			ext4_ext_drop_refs(donor_path);
-		*err = get_ext_path(donor_inode, donor_off, &donor_path);
-		if (*err)
-			goto out;
-		depth = ext_depth(donor_inode);
-		dext = donor_path[depth].p_ext;
-		tmp_dext = *dext;
-
-		*err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off,
-					   donor_off, count - replaced_count);
-		if (*err)
-			goto out;
-	}
-
-out:
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (donor_path) {
-		ext4_ext_drop_refs(donor_path);
-		kfree(donor_path);
-	}
-
-	return replaced_count;
-}
-
-/**
  * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2
  *
  * @inode1:	the inode structure
  * @inode2:	the inode structure
- * @index:	page index
+ * @index1:	page index on @inode1
+ * @index2:	page index on @inode2
  * @page:	result page vector
  *
  * Grab two locked pages for the inodes, in inode order
  */
 static int
 mext_page_double_lock(struct inode *inode1, struct inode *inode2,
-		      pgoff_t index, struct page *page[2])
+		      pgoff_t index1, pgoff_t index2, struct page *page[2])
 {
 	struct address_space *mapping[2];
 	unsigned fl = AOP_FLAG_NOFS;
@@ -793,15 +142,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
 		mapping[0] = inode1->i_mapping;
 		mapping[1] = inode2->i_mapping;
 	} else {
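+		/* inode2 locks first here: swap the indexes to match the mappings */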
+		pgoff_t tmp = index1;
+		index1 = index2;
+		index2 = tmp;
 		mapping[0] = inode2->i_mapping;
 		mapping[1] = inode1->i_mapping;
 	}
 
-	page[0] = grab_cache_page_write_begin(mapping[0], index, fl);
+	page[0] = grab_cache_page_write_begin(mapping[0], index1, fl);
 	if (!page[0])
 		return -ENOMEM;
 
-	page[1] = grab_cache_page_write_begin(mapping[1], index, fl);
+	page[1] = grab_cache_page_write_begin(mapping[1], index2, fl);
 	if (!page[1]) {
 		unlock_page(page[0]);
 		page_cache_release(page[0]);
@@ -893,25 +245,27 @@ out:
  * @o_filp:			file structure of original file
  * @donor_inode:		donor inode
  * @orig_page_offset:		page index on original file
+ * @donor_page_offset:		page index on donor file
  * @data_offset_in_page:	block index where data swapping starts
  * @block_len_in_page:		the number of blocks to be swapped
  * @unwritten:			orig extent is unwritten or not
  * @err:			pointer to save return value
  *
  * Save the data in original inode blocks and replace original inode extents
- * with donor inode extents by calling mext_replace_branches().
+ * with donor inode extents by calling ext4_swap_extents().
  * Finally, write out the saved data in new original inode blocks. Return
  * replaced block count.
  */
 static int
 move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
-		  pgoff_t orig_page_offset, int data_offset_in_page,
-		  int block_len_in_page, int unwritten, int *err)
+		     pgoff_t orig_page_offset, pgoff_t donor_page_offset,
+		     int data_offset_in_page,
+		     int block_len_in_page, int unwritten, int *err)
 {
 	struct inode *orig_inode = file_inode(o_filp);
 	struct page *pagep[2] = {NULL, NULL};
 	handle_t *handle;
-	ext4_lblk_t orig_blk_offset;
+	ext4_lblk_t orig_blk_offset, donor_blk_offset;
 	unsigned long blocksize = orig_inode->i_sb->s_blocksize;
 	unsigned int tmp_data_size, data_size, replaced_size;
 	int err2, jblocks, retries = 0;
@@ -936,6 +290,9 @@ again:
 	orig_blk_offset = orig_page_offset * blocks_per_page +
 		data_offset_in_page;
 
+	donor_blk_offset = donor_page_offset * blocks_per_page +
+		data_offset_in_page;
+
 	/* Calculate data_size */
 	if ((orig_blk_offset + block_len_in_page - 1) ==
 	    ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) {
@@ -956,7 +313,7 @@ again:
 	replaced_size = data_size;
 
 	*err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset,
-				     pagep);
+				     donor_page_offset, pagep);
 	if (unlikely(*err < 0))
 		goto stop_journal;
 	/*
@@ -975,7 +332,7 @@ again:
 		if (*err)
 			goto drop_data_sem;
 
-		unwritten &= mext_check_coverage(donor_inode, orig_blk_offset,
+		unwritten &= mext_check_coverage(donor_inode, donor_blk_offset,
 						 block_len_in_page, 1, err);
 		if (*err)
 			goto drop_data_sem;
@@ -991,9 +348,10 @@ again:
 			*err = -EBUSY;
 			goto drop_data_sem;
 		}
-		replaced_count = mext_replace_branches(handle, orig_inode,
-						donor_inode, orig_blk_offset,
-						block_len_in_page, err);
+		replaced_count = ext4_swap_extents(handle, orig_inode,
+						   donor_inode, orig_blk_offset,
+						   donor_blk_offset,
+						   block_len_in_page, 1, err);
 	drop_data_sem:
 		ext4_double_up_write_data_sem(orig_inode, donor_inode);
 		goto unlock_pages;
@@ -1011,9 +369,9 @@ data_copy:
 		goto unlock_pages;
 	}
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
-	replaced_count = mext_replace_branches(handle, orig_inode, donor_inode,
-					       orig_blk_offset,
-					       block_len_in_page, err);
+	replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode,
+					   orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 1, err);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (*err) {
 		if (replaced_count) {
@@ -1061,9 +419,9 @@ repair_branches:
 	 * Try to swap extents back to their original places
 	 */
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
-	replaced_count = mext_replace_branches(handle, donor_inode, orig_inode,
-					       orig_blk_offset,
-					       block_len_in_page, &err2);
+	replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode,
+					       orig_blk_offset, donor_blk_offset,
+					   block_len_in_page, 0, &err2);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	if (replaced_count != block_len_in_page) {
 		EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset),
@@ -1093,10 +451,14 @@ mext_check_arguments(struct inode *orig_inode,
 		     struct inode *donor_inode, __u64 orig_start,
 		     __u64 donor_start, __u64 *len)
 {
-	ext4_lblk_t orig_blocks, donor_blocks;
+	__u64 orig_eof, donor_eof;
 	unsigned int blkbits = orig_inode->i_blkbits;
 	unsigned int blocksize = 1 << blkbits;
 
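+	/* i_size of each file in blocks, rounded up to a whole block */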
+	orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits;
+	donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits;
+
 	if (donor_inode->i_mode & (S_ISUID|S_ISGID)) {
 		ext4_debug("ext4 move extent: suid or sgid is set"
 			   " to donor file [ino:orig %lu, donor %lu]\n",
@@ -1112,7 +474,7 @@ mext_check_arguments(struct inode *orig_inode,
 		ext4_debug("ext4 move extent: The argument files should "
 			"not be swapfile [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
-		return -EINVAL;
+		return -EBUSY;
 	}
 
 	/* Ext4 move extent supports only extent based file */
@@ -1132,67 +494,28 @@ mext_check_arguments(struct inode *orig_inode,
 	}
 
 	/* Start offset should be same */
-	if (orig_start != donor_start) {
+	if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) !=
+	    (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) {
 		ext4_debug("ext4 move extent: orig and donor's start "
-			"offset are not same [ino:orig %lu, donor %lu]\n",
+			"offsets are not aligned [ino:orig %lu, donor %lu]\n",
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
 
 	if ((orig_start >= EXT_MAX_BLOCKS) ||
+	    (donor_start >= EXT_MAX_BLOCKS) ||
 	    (*len > EXT_MAX_BLOCKS) ||
+	    (donor_start + *len >= EXT_MAX_BLOCKS) ||
 	    (orig_start + *len >= EXT_MAX_BLOCKS))  {
 		ext4_debug("ext4 move extent: Can't handle over [%u] blocks "
 			"[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS,
 			orig_inode->i_ino, donor_inode->i_ino);
 		return -EINVAL;
 	}
-
-	if (orig_inode->i_size > donor_inode->i_size) {
-		donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits;
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start >= donor_blocks) {
-			ext4_debug("ext4 move extent: orig start offset "
-			"[%llu] should be less than donor file blocks "
-			"[%u] [ino:orig %lu, donor %lu]\n",
-			orig_start, donor_blocks,
-			orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		/* TODO: eliminate this artificial restriction */
-		if (orig_start + *len > donor_blocks) {
-			ext4_debug("ext4 move extent: End offset [%llu] should "
-				"be less than donor file blocks [%u]."
-				"So adjust length from %llu to %llu "
-				"[ino:orig %lu, donor %lu]\n",
-				orig_start + *len, donor_blocks,
-				*len, donor_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = donor_blocks - orig_start;
-		}
-	} else {
-		orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits;
-		if (orig_start >= orig_blocks) {
-			ext4_debug("ext4 move extent: start offset [%llu] "
-				"should be less than original file blocks "
-				"[%u] [ino:orig %lu, donor %lu]\n",
-				 orig_start, orig_blocks,
-				orig_inode->i_ino, donor_inode->i_ino);
-			return -EINVAL;
-		}
-
-		if (orig_start + *len > orig_blocks) {
-			ext4_debug("ext4 move extent: Adjust length "
-				"from %llu to %llu. Because it should be "
-				"less than original file blocks "
-				"[ino:orig %lu, donor %lu]\n",
-				*len, orig_blocks - orig_start,
-				orig_inode->i_ino, donor_inode->i_ino);
-			*len = orig_blocks - orig_start;
-		}
-	}
-
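+	/* Clamp the requested length so it does not run past either file's EOF */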
+	if (orig_eof < orig_start + *len - 1)
+		*len = orig_eof - orig_start;
+	if (donor_eof < donor_start + *len - 1)
+		*len = donor_eof - donor_start;
 	if (!*len) {
 		ext4_debug("ext4 move extent: len should not be 0 "
 			"[ino:orig %lu, donor %lu]\n", orig_inode->i_ino,
@@ -1208,60 +531,27 @@ mext_check_arguments(struct inode *orig_inode,
  *
  * @o_filp:		file structure of the original file
  * @d_filp:		file structure of the donor file
- * @orig_start:		start offset in block for orig
- * @donor_start:	start offset in block for donor
+ * @orig_blk:		start offset in block for orig
+ * @donor_blk:		start offset in block for donor
  * @len:		the number of blocks to be moved
  * @moved_len:		moved block length
  *
  * This function returns 0 and the moved block length is set in moved_len
  * if it succeeds, otherwise it returns a negative error value.
  *
- * Note: ext4_move_extents() proceeds the following order.
- * 1:ext4_move_extents() calculates the last block number of moving extent
- *   function by the start block number (orig_start) and the number of blocks
- *   to be moved (len) specified as arguments.
- *   If the {orig, donor}_start points a hole, the extent's start offset
- *   pointed by ext_cur (current extent), holecheck_path, orig_path are set
- *   after hole behind.
- * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent
- *   or the ext_cur exceeds the block_end which is last logical block number.
- * 3:To get the length of continues area, call mext_next_extent()
- *   specified with the ext_cur (initial value is holecheck_path) re-cursive,
- *   until find un-continuous extent, the start logical block number exceeds
- *   the block_end or the extent points to the last extent.
- * 4:Exchange the original inode data with donor inode data
- *   from orig_page_offset to seq_end_page.
- *   The start indexes of data are specified as arguments.
- *   That of the original inode is orig_page_offset,
- *   and the donor inode is also orig_page_offset
- *   (To easily handle blocksize != pagesize case, the offset for the
- *   donor inode is block unit).
- * 5:Update holecheck_path and orig_path to points a next proceeding extent,
- *   then returns to step 2.
- * 6:Release holecheck_path, orig_path and set the len to moved_len
- *   which shows the number of moved blocks.
- *   The moved_len is useful for the command to calculate the file offset
- *   for starting next move extent ioctl.
- * 7:Return 0 on success, or a negative error value on failure.
  */
 int
-ext4_move_extents(struct file *o_filp, struct file *d_filp,
-		 __u64 orig_start, __u64 donor_start, __u64 len,
-		 __u64 *moved_len)
+ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk,
+		  __u64 donor_blk, __u64 len, __u64 *moved_len)
 {
 	struct inode *orig_inode = file_inode(o_filp);
 	struct inode *donor_inode = file_inode(d_filp);
-	struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL;
-	struct ext4_extent *ext_prev, *ext_cur, *ext_dummy;
-	ext4_lblk_t block_start = orig_start;
-	ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0;
-	ext4_lblk_t rest_blocks;
-	pgoff_t orig_page_offset = 0, seq_end_page;
-	int ret, depth, last_extent = 0;
+	struct ext4_ext_path *path = NULL;
 	int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits;
-	int data_offset_in_page;
-	int block_len_in_page;
-	int unwritten;
+	ext4_lblk_t o_end, o_start = orig_blk;
+	ext4_lblk_t d_start = donor_blk;
+	int ret;
+	__u64 m_len = *moved_len;
 
 	if (orig_inode->i_sb != donor_inode->i_sb) {
 		ext4_debug("ext4 move extent: The argument files "
@@ -1303,121 +593,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 	/* Protect extent tree against block allocations via delalloc */
 	ext4_double_down_write_data_sem(orig_inode, donor_inode);
 	/* Check the filesystem environment whether move_extent can be done */
-	ret = mext_check_arguments(orig_inode, donor_inode, orig_start,
-				    donor_start, &len);
+	ret = mext_check_arguments(orig_inode, donor_inode, orig_blk,
+				    donor_blk, &len);
 	if (ret)
 		goto out;
+	o_end = o_start + len;
 
-	file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits;
-	block_end = block_start + len - 1;
-	if (file_end < block_end)
-		len -= block_end - file_end;
-
-	ret = get_ext_path(orig_inode, block_start, &orig_path);
-	if (ret)
-		goto out;
-
-	/* Get path structure to check the hole */
-	ret = get_ext_path(orig_inode, block_start, &holecheck_path);
-	if (ret)
-		goto out;
-
-	depth = ext_depth(orig_inode);
-	ext_cur = holecheck_path[depth].p_ext;
-
-	/*
-	 * Get proper starting location of block replacement if block_start was
-	 * within the hole.
-	 */
-	if (le32_to_cpu(ext_cur->ee_block) +
-		ext4_ext_get_actual_len(ext_cur) - 1 < block_start) {
-		/*
-		 * The hole exists between extents or the tail of
-		 * original file.
-		 */
-		last_extent = mext_next_extent(orig_inode,
-					holecheck_path, &ext_cur);
-		if (last_extent < 0) {
-			ret = last_extent;
-			goto out;
-		}
-		last_extent = mext_next_extent(orig_inode, orig_path,
-							&ext_dummy);
-		if (last_extent < 0) {
-			ret = last_extent;
-			goto out;
-		}
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	} else if (le32_to_cpu(ext_cur->ee_block) > block_start)
-		/* The hole exists at the beginning of original file. */
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-	else
-		seq_start = block_start;
-
-	/* No blocks within the specified range. */
-	if (le32_to_cpu(ext_cur->ee_block) > block_end) {
-		ext4_debug("ext4 move extent: The specified range of file "
-							"may be the hole\n");
-		ret = -EINVAL;
-		goto out;
-	}
-
-	/* Adjust start blocks */
-	add_blocks = min(le32_to_cpu(ext_cur->ee_block) +
-			 ext4_ext_get_actual_len(ext_cur), block_end + 1) -
-		     max(le32_to_cpu(ext_cur->ee_block), block_start);
-
-	while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) {
-		seq_blocks += add_blocks;
+	while (o_start < o_end) {
+		struct ext4_extent *ex;
+		ext4_lblk_t cur_blk, next_blk;
+		pgoff_t orig_page_index, donor_page_index;
+		int offset_in_page;
+		int unwritten, cur_len;
 
-		/* Adjust tail blocks */
-		if (seq_start + seq_blocks - 1 > block_end)
-			seq_blocks = block_end - seq_start + 1;
-
-		ext_prev = ext_cur;
-		last_extent = mext_next_extent(orig_inode, holecheck_path,
-						&ext_cur);
-		if (last_extent < 0) {
-			ret = last_extent;
+		ret = get_ext_path(orig_inode, o_start, &path);
+		if (ret)
 			break;
-		}
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-
-		/*
-		 * Extend the length of contiguous block (seq_blocks)
-		 * if extents are contiguous.
-		 */
-		if (ext4_can_extents_be_merged(orig_inode,
-					       ext_prev, ext_cur) &&
-		    block_end >= le32_to_cpu(ext_cur->ee_block) &&
-		    !last_extent)
+		ex = path[path->p_depth].p_ext;
+		next_blk = ext4_ext_next_allocated_block(path);
+		cur_blk = le32_to_cpu(ex->ee_block);
+		cur_len = ext4_ext_get_actual_len(ex);
+		/* Check hole before the start pos */
+		if (cur_blk + cur_len - 1 < o_start) {
+			if (next_blk == EXT_MAX_BLOCKS) {
+				o_start = o_end;
+				ret = -ENODATA;
+				break;
+			}
+			d_start += next_blk - o_start;
+			o_start = next_blk;
 			continue;
-
-		/* Is original extent is unwritten */
-		unwritten = ext4_ext_is_unwritten(ext_prev);
-
-		data_offset_in_page = seq_start % blocks_per_page;
-
-		/*
-		 * Calculate data blocks count that should be swapped
-		 * at the first page.
-		 */
-		if (data_offset_in_page + seq_blocks > blocks_per_page) {
-			/* Swapped blocks are across pages */
-			block_len_in_page =
-					blocks_per_page - data_offset_in_page;
-		} else {
-			/* Swapped blocks are in a page */
-			block_len_in_page = seq_blocks;
+		/* Check hole after the start pos */
+		} else if (cur_blk > o_start) {
+			/* Skip hole */
+			d_start += cur_blk - o_start;
+			o_start = cur_blk;
+			/* Extent inside requested range? */
+			if (cur_blk >= o_end)
+				break;
+		} else { /* in_range(o_start, o_blk, o_len) */
+			cur_len += cur_blk - o_start;
 		}
-
-		orig_page_offset = seq_start >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_end_page = (seq_start + seq_blocks - 1) >>
-				(PAGE_CACHE_SHIFT - orig_inode->i_blkbits);
-		seq_start = le32_to_cpu(ext_cur->ee_block);
-		rest_blocks = seq_blocks;
-
+		unwritten = ext4_ext_is_unwritten(ex);
+		if (o_end - o_start < cur_len)
+			cur_len = o_end - o_start;
+
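+		/* Move at most one page worth of blocks per iteration */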
+		orig_page_index = o_start >> (PAGE_CACHE_SHIFT -
+					       orig_inode->i_blkbits);
+		donor_page_index = d_start >> (PAGE_CACHE_SHIFT -
+					       donor_inode->i_blkbits);
+		offset_in_page = o_start % blocks_per_page;
+		if (cur_len > blocks_per_page - offset_in_page)
+			cur_len = blocks_per_page - offset_in_page;
 		/*
 		 * Up semaphore to avoid following problems:
 		 * a. transaction deadlock among ext4_journal_start,
@@ -1426,77 +653,24 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp,
 		 *    in move_extent_per_page
 		 */
 		ext4_double_up_write_data_sem(orig_inode, donor_inode);
-
-		while (orig_page_offset <= seq_end_page) {
-
-			/* Swap original branches with new branches */
-			block_len_in_page = move_extent_per_page(
-						o_filp, donor_inode,
-						orig_page_offset,
-						data_offset_in_page,
-						block_len_in_page,
-						unwritten, &ret);
-
-			/* Count how many blocks we have exchanged */
-			*moved_len += block_len_in_page;
-			if (ret < 0)
-				break;
-			if (*moved_len > len) {
-				EXT4_ERROR_INODE(orig_inode,
-					"We replaced blocks too much! "
-					"sum of replaced: %llu requested: %llu",
-					*moved_len, len);
-				ret = -EIO;
-				break;
-			}
-
-			orig_page_offset++;
-			data_offset_in_page = 0;
-			rest_blocks -= block_len_in_page;
-			if (rest_blocks > blocks_per_page)
-				block_len_in_page = blocks_per_page;
-			else
-				block_len_in_page = rest_blocks;
-		}
-
+		/* Swap original branches with new branches */
+		move_extent_per_page(o_filp, donor_inode,
+				     orig_page_index, donor_page_index,
+				     offset_in_page, cur_len,
+				     unwritten, &ret);
 		ext4_double_down_write_data_sem(orig_inode, donor_inode);
 		if (ret < 0)
 			break;
-
-		/* Decrease buffer counter */
-		if (holecheck_path)
-			ext4_ext_drop_refs(holecheck_path);
-		ret = get_ext_path(orig_inode, seq_start, &holecheck_path);
-		if (ret)
-			break;
-		depth = holecheck_path->p_depth;
-
-		/* Decrease buffer counter */
-		if (orig_path)
-			ext4_ext_drop_refs(orig_path);
-		ret = get_ext_path(orig_inode, seq_start, &orig_path);
-		if (ret)
-			break;
-
-		ext_cur = holecheck_path[depth].p_ext;
-		add_blocks = ext4_ext_get_actual_len(ext_cur);
-		seq_blocks = 0;
-
+		o_start += cur_len;
+		d_start += cur_len;
+		m_len += cur_len;
 	}
 out:
-	if (*moved_len) {
-		ext4_discard_preallocations(orig_inode);
-		ext4_discard_preallocations(donor_inode);
-	}
-
-	if (orig_path) {
-		ext4_ext_drop_refs(orig_path);
-		kfree(orig_path);
-	}
-	if (holecheck_path) {
-		ext4_ext_drop_refs(holecheck_path);
-		kfree(holecheck_path);
-	}
+	WARN_ON(m_len > len);
+	if (ret == 0)
+		*moved_len = m_len;
+	ext4_ext_drop_refs(path);
+	kfree(path);
 	ext4_double_up_write_data_sem(orig_inode, donor_inode);
 	ext4_inode_resume_unlocked_dio(orig_inode);
 	ext4_inode_resume_unlocked_dio(donor_inode);
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -34,6 +34,7 @@
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
+#include <linux/virtinfo.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
@@ -96,6 +97,8 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode,
 	struct ext4_dir_entry *dirent;
 	int err = 0, is_dx_block = 0;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	bh = ext4_bread(NULL, inode, block, 0, &err);
 	if (!bh) {
 		if (err == 0) {
@@ -1441,6 +1444,11 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi
 					 ino);
 			return ERR_PTR(-EIO);
 		}
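+		/* The balloon inode is kernel-internal; hide it from lookups */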
+		if (!IS_ERR(inode) &&
+		    inode == EXT4_SB(inode->i_sb)->s_balloon_ino) {
+			iput(inode);
+			return ERR_PTR(-EPERM);
+		}
 	}
 	return d_splice_alias(inode, dentry);
 }
@@ -2271,6 +2279,8 @@ retry:
 		ext4_journal_stop(handle);
 	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
 		goto retry;
+	if (!err && S_ISREG(mode) && ext4_want_data_csum(dir))
+		ext4_start_data_csum(inode);
 	return err;
 }
 
@@ -2423,6 +2433,8 @@ out_clear_inode:
 	err = ext4_mark_inode_dirty(handle, dir);
 	if (err)
 		goto out_clear_inode;
+	if (ext4_test_inode_state(dir, EXT4_STATE_PFCACHE_CSUM))
+		ext4_save_dir_csum(inode);
 	unlock_new_inode(inode);
 	d_instantiate(dentry, inode);
 	if (IS_DIRSYNC(dir))
@@ -2786,6 +2798,10 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry)
 	if (IS_DIRSYNC(dir))
 		ext4_handle_sync(handle);
 
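+	/* The balloon inode is kernel-internal and must not be unlinked */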
+	retval = -EPERM;
+	if (inode == EXT4_SB(dir->i_sb)->s_balloon_ino)
+		goto end_unlink;
+
 	if (!inode->i_nlink) {
 		ext4_warning(inode->i_sb,
 			     "Deleting nonexistent file (%lu), %d",
@@ -3220,7 +3236,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	 */
 	retval = -ENOENT;
 	if (!old.bh || le32_to_cpu(old.de->inode) != old.inode->i_ino)
-		goto end_rename;
+		goto out_release;
 
 	new.bh = ext4_find_entry(new.dir, &new.dentry->d_name,
 				 &new.de, &new.inlined);
@@ -3354,6 +3370,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
 	retval = 0;
 
 end_rename:
+	if (handle)
+		ext4_journal_stop(handle);
+out_release:
 	brelse(old.dir_bh);
 	brelse(old.bh);
 	brelse(new.bh);
@@ -3363,8 +3382,6 @@ end_rename:
 		unlock_new_inode(whiteout);
 		iput(whiteout);
 	}
-	if (handle)
-		ext4_journal_stop(handle);
 	return retval;
 }
 
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -397,6 +397,23 @@ submit_and_retry:
 	return 0;
 }
 
+
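+/*
+ * Sleep on the bdi congestion waitqueue until the backing device is
+ * no longer write-congested.
+ */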
+static void bdi_congestion_wait(struct backing_dev_info *bdi)
+{
+	DEFINE_WAIT(_wait);
+
+	for (;;) {
+		prepare_to_wait(&bdi->cong_waitq, &_wait,
+				TASK_UNINTERRUPTIBLE);
+		if (!bdi_write_congested2(bdi))
+			break;
+
+		io_schedule();
+	}
+
+	finish_wait(&bdi->cong_waitq, &_wait);
+}
+
 int ext4_bio_write_page(struct ext4_io_submit *io,
 			struct page *page,
 			int len,
@@ -462,6 +479,10 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
 		set_buffer_async_write(bh);
 	} while ((bh = bh->b_this_page) != head);
 
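+	/* Throttle non-reclaim writeback while the device is congested */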
+	if (!wbc->for_reclaim &&
+	    bdi_write_congested2(page->mapping->backing_dev_info))
+		bdi_congestion_wait(page->mapping->backing_dev_info);
+
 	/* Now submit buffers to write */
 	bh = head = page_buffers(page);
 	do {
--- /dev/null
+++ b/fs/ext4/pfcache.c
@@ -0,0 +1,774 @@
+/*
+ *  fs/ext4/pfcache.c
+ *
+ *  Automatic SHA-1 (FIPS 180-1) data checksumming
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *  Author: Konstantin Khlebnikov
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/jbd2.h>
+#include <linux/cryptohash.h>
+#include <linux/namei.h>
+#include <linux/exportfs.h>
+#include <linux/init_task.h>	/* for init_cred */
+#include <linux/memcontrol.h>
+#include "ext4.h"
+#include "xattr.h"
+#include "../internal.h"
+
+#define PFCACHE_MAX_PATH	(EXT4_DATA_CSUM_SIZE * 2 + 2)
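+
+/*
+ * Build the cache object path for the inode's data csum: two hex chars
+ * per byte, with the first byte split off as a directory, e.g. a digest
+ * starting with 0xd3 0xad... becomes "d3/ad...".  PFCACHE_MAX_PATH covers
+ * the hex digits plus the '/' separator and the NUL terminator.
+ */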
+static void pfcache_path(struct inode *inode, char *path)
+{
+	char *p;
+	int i;
+
+	/* like .git/objects hex[0]/hex[1..] */
+	p = pack_hex_byte(path, EXT4_I(inode)->i_data_csum[0]);
+	*p++ = '/';
+	for (i = 1; i < EXT4_DATA_CSUM_SIZE; i++)
+		p = pack_hex_byte(p, EXT4_I(inode)->i_data_csum[i]);
+	*p = 0;
+}
+
+/* require inode->i_mutex held or unreachable inode */
+int ext4_open_pfcache(struct inode *inode)
+{
+	struct super_block *sb = inode->i_sb;
+	const struct cred *cur_cred;
+	char name[PFCACHE_MAX_PATH];
+	struct path root, path;
+	int ret;
+
+	if (inode->i_mapping->i_peer_file)
+		return -EBUSY;
+
+	if (!(ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	      EXT4_I(inode)->i_data_csum_end < 0))
+		return -ENODATA;
+
+	if (!EXT4_SB(sb)->s_pfcache_root.mnt)
+		return -ENODEV;
+
+	spin_lock(&EXT4_SB(sb)->s_pfcache_lock);
+	root = EXT4_SB(sb)->s_pfcache_root;
+	path_get(&root);
+	spin_unlock(&EXT4_SB(sb)->s_pfcache_lock);
+
+	if (!root.mnt)
+		return -ENODEV;
+
+	pfcache_path(inode, name);
+
+	/*
+	 * Lookups over the shared area shouldn't be accounted to any
+	 * particular memory cgroup, otherwise a cgroup could stay pinned
+	 * indefinitely after destruction, because a file or directory in
+	 * this area is likely to be in use by other containers or the host.
+	 */
+	memcg_stop_kmem_account();
+
+	cur_cred = override_creds(&init_cred);
+	/*
+	 * Files in the cache area must not have csum attributes, or
+	 * pfcache must be disabled for the underlying filesystem;
+	 * otherwise real lock recursion can happen on i_mutex.
+	 * Here we disable lockdep to avoid false-positive reports.
+	 */
+	lockdep_off();
+	ret = vfs_path_lookup(root.dentry, root.mnt, name, 0, &path);
+	lockdep_on();
+	revert_creds(cur_cred);
+	path_put(&root);
+	if (ret)
+		goto out;
+
+	ret = open_mapping_peer(inode->i_mapping, &path, &init_cred);
+	if (!ret)
+		percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_pfcache_peers);
+	path_put(&path);
+out:
+	memcg_resume_kmem_account();
+	return ret;
+}
+
+/* require inode->i_mutex held or unreachable inode */
+int ext4_close_pfcache(struct inode *inode)
+{
+	if (!inode->i_mapping->i_peer_file)
+		return -ENOENT;
+	close_mapping_peer(inode->i_mapping);
+	percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_pfcache_peers);
+	return 0;
+}
+
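+/*
+ * Switch the pfcache root: point the superblock at @new_root (or detach
+ * when NULL), then walk the inode list and reopen peer files for inodes
+ * with complete data csums under the new root.
+ */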
+/* under sb->s_umount write lock */
+int ext4_relink_pfcache(struct super_block *sb, char *new_root, bool new_sb)
+{
+	int old_root = !!EXT4_SB(sb)->s_pfcache_root.mnt;
+	struct inode *inode, *old_inode = NULL;
+	struct file *file;
+	long nr_opened = 0, nr_closed = 0, nr_total;
+	bool reload_csum = false;
+	struct path root, path;
+
+	if (new_root) {
+		int err;
+
+		err = kern_path(new_root, LOOKUP_DIRECTORY, &root);
+		if (err) {
+			printk(KERN_ERR "PFCache: lookup \"%s\" failed %d\n",
+					new_root, err);
+			return new_sb ? 0 : err;
+		}
+		if (!test_opt2(sb, PFCACHE_CSUM)) {
+			set_opt2(sb, PFCACHE_CSUM);
+			reload_csum = true;
+		}
+	} else {
+		root.mnt = NULL;
+		root.dentry = NULL;
+	}
+
+	if (new_sb) {
+		path_put(&EXT4_SB(sb)->s_pfcache_root);
+		EXT4_SB(sb)->s_pfcache_root = root;
+		return 0;
+	}
+
+	path_get(&root);
+	spin_lock(&EXT4_SB(sb)->s_pfcache_lock);
+	path = EXT4_SB(sb)->s_pfcache_root;
+	EXT4_SB(sb)->s_pfcache_root = root;
+	spin_unlock(&EXT4_SB(sb)->s_pfcache_lock);
+	path_put(&path);
+
+	spin_lock(&inode_sb_list_lock);
+
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode))
+			continue;
+		if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+			if (!reload_csum)
+				continue;
+		} else if (!(EXT4_I(inode)->i_data_csum_end < 0))
+			continue;
+		__iget(inode);
+		spin_unlock(&inode_sb_list_lock);
+		iput(old_inode);
+		old_inode = inode;
+
+		path.mnt = NULL;
+		path.dentry = NULL;
+
+		mutex_lock(&inode->i_mutex);
+
+		if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+			if (!reload_csum)
+				goto next;
+			if (S_ISDIR(inode->i_mode)) {
+				ext4_load_dir_csum(inode);
+				goto next;
+			}
+			if (ext4_load_data_csum(inode))
+				goto next;
+		} else if (!(EXT4_I(inode)->i_data_csum_end < 0) ||
+				S_ISDIR(inode->i_mode))
+			goto next;
+
+		if (new_root) {
+			char name[PFCACHE_MAX_PATH];
+			const struct cred *cur_cred;
+			int err;
+
+			pfcache_path(inode, name);
+			cur_cred = override_creds(&init_cred);
+			err = vfs_path_lookup(root.dentry, root.mnt,
+					name, 0, &path);
+			revert_creds(cur_cred);
+			if (err) {
+				path.mnt = NULL;
+				path.dentry = NULL;
+			}
+		}
+
+		file = inode->i_mapping->i_peer_file;
+		if ((!path.mnt && !file) || (path.mnt && file &&
+		     file->f_mapping == path.dentry->d_inode->i_mapping))
+			goto next;
+
+		if (file) {
+			close_mapping_peer(inode->i_mapping);
+			nr_closed++;
+		}
+
+		if (path.mnt) {
+			if (!open_mapping_peer(inode->i_mapping,
+						&path, &init_cred))
+				nr_opened++;
+		}
+next:
+		mutex_unlock(&inode->i_mutex);
+		path_put(&path);
+		cond_resched();
+		spin_lock(&inode_sb_list_lock);
+	}
+	spin_unlock(&inode_sb_list_lock);
+	iput(old_inode);
+
+	percpu_counter_add(&EXT4_SB(sb)->s_pfcache_peers,
+			   nr_opened - nr_closed);
+	nr_total = percpu_counter_sum(&EXT4_SB(sb)->s_pfcache_peers);
+
+	if (new_root && (old_root || nr_total))
+		printk(KERN_INFO "PFCache: relink %u:%u to \"%s\""
+				" +%ld -%ld =%ld peers\n",
+				MAJOR(sb->s_dev), MINOR(sb->s_dev), new_root,
+				nr_opened, nr_closed, nr_total);
+	if (!new_root && nr_total)
+		printk(KERN_ERR "PFCache: %ld peers lost\n", nr_total);
+
+	path_put(&root);
+
+	return 0;
+}
+
+#define MAX_LOCK_BATCH	256
+
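+/*
+ * Dump per-inode pfcache state to userspace.  The inode list is walked
+ * under inode_sb_list_lock, which is dropped every MAX_LOCK_BATCH inodes
+ * or when rescheduling is needed, so the scan stays preemptible.
+ */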
+long ext4_dump_pfcache(struct super_block *sb,
+		      struct pfcache_dump_request __user *user_req)
+{
+	struct inode *inode, *old_inode = NULL;
+	struct pfcache_dump_request req;
+	u8 __user *user_buffer;
+	u64 state, *x;
+	void *buffer, *p;
+	long ret, size;
+	int lock_batch = 0;
+
+	if (copy_from_user(&req, user_req, sizeof(req)))
+		return -EFAULT;
+
+	if (!access_ok(VERIFY_WRITE, user_req,
+		       req.header_size + req.buffer_size))
+		return -EFAULT;
+
+	/* check for unknown flags */
+	if ((req.filter & ~PFCACHE_FILTER_MASK) ||
+	    (req.payload & ~PFCACHE_PAYLOAD_MASK))
+		return -EINVAL;
+
+	buffer = kzalloc(PFCACHE_PAYLOAD_MAX_SIZE, GFP_KERNEL);
+	if (!buffer)
+		return -ENOMEM;
+
+	ret = 0;
+	/* skip all new fields in the user request header */
+	user_buffer = (void *)user_req + req.header_size;
+
+	spin_lock(&inode_sb_list_lock);
+	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
+		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
+			continue;
+		if (!S_ISREG(inode->i_mode) ||
+		    inode == EXT4_SB(sb)->s_balloon_ino)
+			goto next;
+
+		/* evaluate the inode state */
+		state = 0;
+
+		if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+		    EXT4_I(inode)->i_data_csum_end < 0)
+			state |= PFCACHE_FILTER_WITH_CSUM;
+		else
+			state |= PFCACHE_FILTER_WITHOUT_CSUM;
+
+		if (inode->i_mapping->i_peer_file)
+			state |= PFCACHE_FILTER_WITH_PEER;
+		else
+			state |= PFCACHE_FILTER_WITHOUT_PEER;
+
+		/* check state-filter */
+		if (req.filter & state)
+			goto next;
+
+		/* check csum-filter */
+		if ((req.filter & PFCACHE_FILTER_COMPARE_CSUM) &&
+		    memcmp(EXT4_I(inode)->i_data_csum,
+			    req.csum_filter, EXT4_DATA_CSUM_SIZE))
+			goto next;
+
+		/* -- add new filters above this line -- */
+
+		/* check the offset-filter last */
+		if (req.offset > 0) {
+			req.offset--;
+			goto next;
+		}
+
+		/* construct the payload */
+		p = buffer;
+
+		if (req.payload & PFCACHE_PAYLOAD_CSUM) {
+			BUILD_BUG_ON(PFCACHE_CSUM_SIZE != EXT4_DATA_CSUM_SIZE);
+			if (state & PFCACHE_FILTER_WITH_CSUM)
+				memcpy(p, EXT4_I(inode)->i_data_csum,
+						EXT4_DATA_CSUM_SIZE);
+			else
+				memset(p, 0, EXT4_DATA_CSUM_SIZE);
+			p += ALIGN(PFCACHE_CSUM_SIZE, sizeof(u64));
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_FHANDLE) {
+			unsigned *x = p;
+
+			*x++ = 8;
+			*x++ = FILEID_INO32_GEN;
+			*x++ = inode->i_ino;
+			*x++ = inode->i_generation;
+			p += 16;
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_STATE) {
+			x = p;
+			*x = state;
+			p += sizeof(u64);
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_FSIZE) {
+			x = p;
+			*x = i_size_read(inode);
+			p += sizeof(u64);
+		}
+
+		if (req.payload & PFCACHE_PAYLOAD_PAGES) {
+			x = p;
+			*x = inode->i_mapping->nrpages;
+			p += sizeof(u64);
+		}
+
+		/* -- add new payloads above this line -- */
+
+		size = p - buffer;
+		BUG_ON(!IS_ALIGNED(size, sizeof(u64)));
+		BUG_ON(size > PFCACHE_PAYLOAD_MAX_SIZE);
+
+		if (size > req.buffer_size)
+			goto out;
+
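+		/*
+		 * Try a non-faulting copy under the spinlock first; if it
+		 * faults, pin the inode, drop the lock and retry with a
+		 * sleeping copy_to_user().
+		 */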
+		pagefault_disable();
+		if (!__copy_to_user_inatomic(user_buffer, buffer, size)) {
+			pagefault_enable();
+		} else {
+			pagefault_enable();
+			__iget(inode);
+			spin_unlock(&inode_sb_list_lock);
+			iput(old_inode);
+			old_inode = inode;
+			if (copy_to_user(user_buffer, buffer, size)) {
+				ret = -EFAULT;
+				goto out_nolock;
+			}
+			cond_resched();
+			lock_batch = 0;
+			spin_lock(&inode_sb_list_lock);
+		}
+
+		ret++;
+		user_buffer += size;
+		req.buffer_size -= size;
+next:
+		if (signal_pending(current)) {
+			if (!ret)
+				ret = -EINTR;
+			goto out;
+		}
+		if (++lock_batch > MAX_LOCK_BATCH || need_resched() ||
+				spin_needbreak(&inode_sb_list_lock)) {
+			__iget(inode);
+			spin_unlock(&inode_sb_list_lock);
+			iput(old_inode);
+			old_inode = inode;
+			cond_resched();
+			lock_batch = 0;
+			spin_lock(&inode_sb_list_lock);
+		}
+	}
+out:
+	spin_unlock(&inode_sb_list_lock);
+out_nolock:
+	iput(old_inode);
+
+	kfree(buffer);
+
+	return ret;
+}
+
+static void ext4_init_data_csum(struct inode *inode)
+{
+	EXT4_I(inode)->i_data_csum_end = 0;
+	sha_init((__u32 *)EXT4_I(inode)->i_data_csum);
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_partial);
+}
+
+void ext4_clear_data_csum(struct inode *inode)
+{
+	ext4_clear_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	if (!S_ISREG(inode->i_mode))
+		return;
+	if (EXT4_I(inode)->i_data_csum_end < 0)
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	else
+		percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_csum_partial);
+}
+
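+/*
+ * Begin checksumming from offset zero; the state is re-checked under
+ * i_lock so that concurrent callers initialize the digest only once.
+ */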
+void ext4_start_data_csum(struct inode *inode)
+{
+	if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+		spin_lock(&inode->i_lock);
+		if (!ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+			ext4_init_data_csum(inode);
+		spin_unlock(&inode->i_lock);
+	}
+}
+
+int ext4_load_data_csum(struct inode *inode)
+{
+	int ret;
+
+	ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME, EXT4_I(inode)->i_data_csum,
+			EXT4_DATA_CSUM_SIZE);
+	if (ret < 0)
+		return ret;
+	if (ret != EXT4_DATA_CSUM_SIZE)
+		return -EIO;
+
+	EXT4_I(inode)->i_data_csum_end = -1;
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	return 0;
+}
+
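+/*
+ * Publish a completed data csum: drop a stale pfcache peer if the csum
+ * changed, mark the csum complete under i_lock, reopen the peer, and
+ * persist the csum in a trusted xattr once delalloc blocks are allocated.
+ */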
+static int ext4_save_data_csum(struct inode *inode, u8 *csum)
+{
+	int ret;
+
+	WARN_ON(journal_current_handle());
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    EXT4_I(inode)->i_data_csum_end < 0 &&
+	    memcmp(EXT4_I(inode)->i_data_csum, csum, EXT4_DATA_CSUM_SIZE))
+		ext4_close_pfcache(inode);
+
+	spin_lock(&inode->i_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_clear_data_csum(inode);
+	memcpy(EXT4_I(inode)->i_data_csum, csum, EXT4_DATA_CSUM_SIZE);
+	EXT4_I(inode)->i_data_csum_end = -1;
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_csum_complete);
+	spin_unlock(&inode->i_lock);
+
+	ext4_open_pfcache(inode);
+
+	/* To guarantee csum consistency, force block allocation first */
+	ret = ext4_alloc_da_blocks(inode);
+	if (ret)
+		return ret;
+
+	return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME, EXT4_I(inode)->i_data_csum,
+			EXT4_DATA_CSUM_SIZE, 0);
+}
+
+void ext4_load_dir_csum(struct inode *inode)
+{
+	char value[EXT4_DIR_CSUM_VALUE_LEN];
+	int ret;
+
+	ret = ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+			     EXT4_DATA_CSUM_NAME, value, sizeof(value));
+	if (ret == EXT4_DIR_CSUM_VALUE_LEN &&
+	    !strncmp(value, EXT4_DIR_CSUM_VALUE, sizeof(value)))
+		ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+}
+
+void ext4_save_dir_csum(struct inode *inode)
+{
+	ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+	ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+			EXT4_DATA_CSUM_NAME,
+			EXT4_DIR_CSUM_VALUE,
+			EXT4_DIR_CSUM_VALUE_LEN, 0);
+}
+
+void ext4_truncate_data_csum(struct inode *inode, loff_t pos)
+{
+	if (!S_ISREG(inode->i_mode))
+		return;
+
+	if (EXT4_I(inode)->i_data_csum_end < 0) {
+		WARN_ON(journal_current_handle());
+		ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+				EXT4_DATA_CSUM_NAME, NULL, 0, 0);
+		ext4_close_pfcache(inode);
+	}
+	spin_lock(&inode->i_lock);
+	ext4_clear_data_csum(inode);
+	if (!pos && test_opt2(inode->i_sb, PFCACHE_CSUM))
+		ext4_init_data_csum(inode);
+	spin_unlock(&inode->i_lock);
+}
+
+void ext4_check_pos_data_csum(struct inode *inode, loff_t pos)
+{
+	if ((pos & ~(loff_t)(SHA_MESSAGE_BYTES-1)) !=
+	    EXT4_I(inode)->i_data_csum_end)
+		ext4_truncate_data_csum(inode, pos);
+}
+
+static void sha_batch_transform(__u32 *digest, const char *data, unsigned rounds)
+{
+	__u32 temp[SHA_WORKSPACE_WORDS];
+
+	while (rounds--) {
+		sha_transform(digest, data, temp);
+		data += SHA_MESSAGE_BYTES;
+	}
+}
+
+void ext4_update_data_csum(struct inode *inode, loff_t pos,
+			   unsigned len, struct page *page)
+{
+	__u32 *digest = (__u32 *)EXT4_I(inode)->i_data_csum;
+	u8 *kaddr, *data;
+
+	if (!len)
+		return;
+
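+	/*
+	 * Round [pos, pos + len) to whole SHA_MESSAGE_BYTES blocks; the
+	 * unaligned tail is hashed later, by ext4_finish_data_csum().
+	 */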
+	len += pos & (SHA_MESSAGE_BYTES-1);
+	len &= ~(SHA_MESSAGE_BYTES-1);
+	pos &= ~(loff_t)(SHA_MESSAGE_BYTES-1);
+
+	BUG_ON(pos != EXT4_I(inode)->i_data_csum_end);
+	EXT4_I(inode)->i_data_csum_end += len;
+
+	kaddr = kmap_atomic(page);
+	data = kaddr + (pos & (PAGE_CACHE_SIZE - 1));
+	sha_batch_transform(digest, data, len / SHA_MESSAGE_BYTES);
+	kunmap_atomic(kaddr);
+}
+
+static int ext4_finish_data_csum(struct inode *inode, u8 *csum)
+{
+	__u32 *digest = (__u32 *)csum;
+	__u8 data[SHA_MESSAGE_BYTES * 2];
+	loff_t end;
+	unsigned tail;
+	__be64 bits;
+
+	BUILD_BUG_ON(EXT4_DATA_CSUM_SIZE != SHA_DIGEST_WORDS * 4);
+
+	memcpy(csum, EXT4_I(inode)->i_data_csum, EXT4_DATA_CSUM_SIZE);
+
+	end = EXT4_I(inode)->i_data_csum_end;
+	if (end < 0)
+		return 0;
+
+	if (!inode->i_size)
+		return -ENODATA;
+
+	tail = inode->i_size - end;
+	if (tail >= SHA_MESSAGE_BYTES)
+		return -EIO;
+
+	if (tail) {
+		struct page *page;
+		u8 *kaddr;
+
+		page = read_cache_page_gfp(inode->i_mapping,
+					   end >> PAGE_CACHE_SHIFT,
+					   GFP_NOFS);
+		if (IS_ERR(page))
+			return PTR_ERR(page);
+
+		kaddr = kmap_atomic(page);
+		memcpy(data, kaddr + (end & (PAGE_CACHE_SIZE-1)), tail);
+		kunmap_atomic(kaddr);
+		page_cache_release(page);
+	}
+
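+	/*
+	 * Standard SHA-1 finalization: append the 0x80 terminator, zero-pad,
+	 * and store the message length in bits as a big-endian 64-bit value
+	 * at the end of the final block (using a second block when the tail
+	 * leaves no room for the length field).
+	 */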
+	memset(data + tail, 0, sizeof(data) - tail);
+	data[tail] = 0x80;
+
+	bits = cpu_to_be64((end + tail) << 3);
+	if (tail >= SHA_MESSAGE_BYTES - sizeof(bits)) {
+		memcpy(data + SHA_MESSAGE_BYTES * 2 - sizeof(bits),
+				&bits, sizeof(bits));
+		sha_batch_transform(digest, data, 2);
+	} else {
+		memcpy(data + SHA_MESSAGE_BYTES - sizeof(bits),
+				&bits, sizeof(bits));
+		sha_batch_transform(digest, data, 1);
+	}
+
+	for (tail = 0; tail < SHA_DIGEST_WORDS; tail++)
+		digest[tail] = cpu_to_be32(digest[tail]);
+
+	return 0;
+}
+
+void ext4_commit_data_csum(struct inode *inode)
+{
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+
+	if (!S_ISREG(inode->i_mode) || EXT4_I(inode)->i_data_csum_end < 0)
+		return;
+
+	mutex_lock(&inode->i_mutex);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    !ext4_finish_data_csum(inode, csum))
+		ext4_save_data_csum(inode, csum);
+	else
+		ext4_truncate_data_csum(inode, 0);
+	mutex_unlock(&inode->i_mutex);
+}
+
+static int ext4_xattr_trusted_csum_get(struct dentry *dentry, const char *name,
+				       void *buffer, size_t size, int handler_flags)
+{
+	struct inode *inode = dentry->d_inode;
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+	int i;
+
+	if (strcmp(name, ""))
+		return -ENODATA;
+
+	if (!test_opt2(inode->i_sb, PFCACHE_CSUM))
+		return -EOPNOTSUPP;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (S_ISDIR(inode->i_mode))
+		return ext4_xattr_get(inode, EXT4_XATTR_INDEX_TRUSTED,
+				      EXT4_DATA_CSUM_NAME, buffer, size);
+
+	if (!S_ISREG(inode->i_mode))
+		return -ENODATA;
+
+	if (!buffer)
+		return EXT4_DATA_CSUM_SIZE * 2;
+
+	spin_lock(&inode->i_lock);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM) &&
+	    EXT4_I(inode)->i_data_csum_end < 0) {
+		memcpy(csum, EXT4_I(inode)->i_data_csum, EXT4_DATA_CSUM_SIZE);
+	} else {
+		spin_unlock(&inode->i_lock);
+		return -ENODATA;
+	}
+	spin_unlock(&inode->i_lock);
+
+	if (size == EXT4_DATA_CSUM_SIZE) {
+		memcpy(buffer, csum, EXT4_DATA_CSUM_SIZE);
+		return EXT4_DATA_CSUM_SIZE;
+	}
+
+	if (size >= EXT4_DATA_CSUM_SIZE * 2) {
+		for (i = 0; i < EXT4_DATA_CSUM_SIZE; i++)
+			buffer = pack_hex_byte(buffer, csum[i]);
+		return EXT4_DATA_CSUM_SIZE * 2;
+	}
+
+	return -ERANGE;
+}
+
+static int ext4_xattr_trusted_csum_set(struct dentry *dentry, const char *name,
+				const void *value, size_t size, int flags, int handler_flags)
+{
+	struct inode *inode = dentry->d_inode;
+	const char *text = value;
+	u8 csum[EXT4_DATA_CSUM_SIZE];
+	int i;
+
+	if (strcmp(name, ""))
+		return -ENODATA;
+
+	if (!test_opt2(inode->i_sb, PFCACHE_CSUM))
+		return -EOPNOTSUPP;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (S_ISDIR(inode->i_mode)) {
+		if (!value)
+			ext4_clear_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+		else if (size == EXT4_DIR_CSUM_VALUE_LEN &&
+			 !strncmp(value, EXT4_DIR_CSUM_VALUE, size))
+			ext4_set_inode_state(inode, EXT4_STATE_PFCACHE_CSUM);
+		else
+			return -EINVAL;
+
+		return ext4_xattr_set(inode, EXT4_XATTR_INDEX_TRUSTED,
+				      EXT4_DATA_CSUM_NAME, value, size, flags);
+	}
+
+	if (!S_ISREG(inode->i_mode))
+		return -ENODATA;
+
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+		if (flags & XATTR_CREATE)
+			return -EEXIST;
+	} else {
+		if (flags & XATTR_REPLACE)
+			return -ENODATA;
+	}
+
+	if (!value) {
+		ext4_truncate_data_csum(inode, 1);
+		return 0;
+	}
+
+	if (size == EXT4_DATA_CSUM_SIZE) {
+		memcpy(csum, value, EXT4_DATA_CSUM_SIZE);
+	} else if (size == EXT4_DATA_CSUM_SIZE * 2) {
+		for (i = 0; i < EXT4_DATA_CSUM_SIZE; i++) {
+			int hi = hex_to_bin(text[i*2]);
+			int lo = hex_to_bin(text[i*2+1]);
+			if ((hi < 0) || (lo < 0))
+				return -EINVAL;
+			csum[i] = (hi << 4) | lo;
+		}
+	} else
+		return -EINVAL;
+
+	if (mapping_writably_mapped(inode->i_mapping))
+		return -EBUSY;
+
+	return ext4_save_data_csum(inode, csum);
+}
+
+#define XATTR_TRUSTED_CSUM_PREFIX XATTR_TRUSTED_PREFIX EXT4_DATA_CSUM_NAME
+#define XATTR_TRUSTED_CSUM_PREFIX_LEN (sizeof(XATTR_TRUSTED_CSUM_PREFIX) - 1)
+
+static size_t
+ext4_xattr_trusted_csum_list(struct dentry *dentry, char *list, size_t list_size,
+			     const char *name, size_t name_len, int handler_flags)
+{
+	return 0;
+}
+
+struct xattr_handler ext4_xattr_trusted_csum_handler = {
+	.prefix = XATTR_TRUSTED_CSUM_PREFIX,
+	.list   = ext4_xattr_trusted_csum_list,
+	.get    = ext4_xattr_trusted_csum_get,
+	.set    = ext4_xattr_trusted_csum_set,
+};
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -314,6 +314,7 @@ next_group:
 
 		if (start_blk + itb > last_blk)
 			goto next_group;
+
 		group_data[it_index].inode_table = start_blk;
 		group = ext4_get_group_number(sb, start_blk);
 		next_group_start = ext4_group_first_block_no(sb, group + 1);
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -39,6 +39,7 @@
 #include <linux/log2.h>
 #include <linux/crc16.h>
 #include <linux/cleancache.h>
+#include <linux/ve.h>
 #include <asm/uaccess.h>
 
 #include <linux/kthread.h>
@@ -192,6 +193,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_block_bitmap);
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -200,6 +202,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_inode_bitmap);
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 			      struct ext4_group_desc *bg)
@@ -208,6 +211,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 		(EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
 		 (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
+EXPORT_SYMBOL(ext4_inode_table);
 
 __u32 ext4_free_group_clusters(struct super_block *sb,
 			       struct ext4_group_desc *bg)
@@ -818,6 +822,9 @@ static void ext4_put_super(struct super_block *sb)
 	percpu_counter_destroy(&sbi->s_dirs_counter);
 	percpu_counter_destroy(&sbi->s_dirtyclusters_counter);
 	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+	percpu_counter_destroy(&sbi->s_csum_partial);
+	percpu_counter_destroy(&sbi->s_csum_complete);
+	percpu_counter_destroy(&sbi->s_pfcache_peers);
 	brelse(sbi->s_sbh);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -946,7 +953,7 @@ static int __init init_inodecache(void)
 	ext4_inode_cachep = kmem_cache_create("ext4_inode_cache",
 					     sizeof(struct ext4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ext4_inode_cachep == NULL)
 		return -ENOMEM;
@@ -977,6 +984,10 @@ void ext4_clear_inode(struct inode *inode)
 		jbd2_free_inode(EXT4_I(inode)->jinode);
 		EXT4_I(inode)->jinode = NULL;
 	}
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM)) {
+		ext4_close_pfcache(inode);
+		ext4_clear_data_csum(inode);
+	}
 }
 
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
@@ -1159,11 +1170,14 @@ enum {
 	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
 	Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_dax,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
+	Opt_lazytime, Opt_nolazytime,
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
 	Opt_dioread_nolock, Opt_dioread_lock,
 	Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable,
-	Opt_max_dir_size_kb, Opt_nojournal_checksum,
+	Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_balloon_ino,
+	Opt_pfcache_csum, Opt_nopfcache_csum,
+	Opt_pfcache, Opt_nopfcache,
 };
 
 static const match_table_t tokens = {
@@ -1223,6 +1237,8 @@ static const match_table_t tokens = {
 	{Opt_dax, "dax"},
 	{Opt_stripe, "stripe=%u"},
 	{Opt_delalloc, "delalloc"},
+	{Opt_lazytime, "lazytime"},
+	{Opt_nolazytime, "nolazytime"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_removed, "mblk_io_submit"},
 	{Opt_removed, "nomblk_io_submit"},
@@ -1241,6 +1257,11 @@ static const match_table_t tokens = {
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
 	{Opt_max_dir_size_kb, "max_dir_size_kb=%u"},
+	{Opt_balloon_ino, "balloon_ino=%u"},
+	{Opt_pfcache_csum, "pfcache_csum"},
+	{Opt_nopfcache_csum, "nopfcache_csum"},
+	{Opt_pfcache, "pfcache=%s"},
+	{Opt_nopfcache, "nopfcache"},
 	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
 	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
 	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
@@ -1358,6 +1379,7 @@ static int clear_qf_name(struct super_block *sb, int qtype)
 #define MOPT_NO_EXT3	0x0200
 #define MOPT_EXT4_ONLY	(MOPT_NO_EXT2 | MOPT_NO_EXT3)
 #define MOPT_STRING	0x0400
+#define MOPT_WANT_SYS_ADMIN	0x0800
 
 static const struct mount_opts {
 	int	token;
@@ -1388,7 +1410,7 @@ static const struct mount_opts {
 				    EXT4_MOUNT_JOURNAL_CHECKSUM),
 	 MOPT_EXT4_ONLY | MOPT_SET},
 	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_NO_EXT2 | MOPT_SET},
-	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR | MOPT_WANT_SYS_ADMIN},
 	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
 	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
 	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT,
@@ -1442,12 +1464,16 @@ static const struct mount_opts {
 	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
 	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
 	{Opt_max_dir_size_kb, 0, MOPT_GTE0},
+	{Opt_balloon_ino, 0, 0},
+	{Opt_pfcache_csum, 0, 0},
+	{Opt_nopfcache_csum, 0, 0},
 	{Opt_err, 0, 0}
 };
 
 static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 			    substring_t *args, unsigned long *journal_devnum,
-			    unsigned int *journal_ioprio, int is_remount)
+			    unsigned int *journal_ioprio,
+			    unsigned long *balloon_ino, int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	const struct mount_opts *m;
@@ -1481,6 +1507,27 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 	case Opt_i_version:
 		sb->s_flags |= MS_I_VERSION;
 		return 1;
+	case Opt_pfcache:
+		if (capable(CAP_SYS_ADMIN)) {
+			char *path;
+			int err;
+
+			path = match_strdup(&args[0]);
+			if (!path)
+				return -1;
+			err = ext4_relink_pfcache(sb, path, !is_remount);
+			kfree(path);
+			return err ? -1 : 1;
+		}
+		return 1;
+	case Opt_nopfcache:
+		if (capable(CAP_SYS_ADMIN))
+			ext4_relink_pfcache(sb, NULL, !is_remount);
+		return 1;
+	case Opt_lazytime:
+		sb->s_flags |= MS_LAZYTIME;
+		return 1;
+	case Opt_nolazytime:
+		sb->s_flags &= ~MS_LAZYTIME;
+		return 1;
 	}
 
 	for (m = ext4_mount_opts; m->token != Opt_err; m++)
@@ -1512,6 +1559,9 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		set_opt2(sb, EXPLICIT_DELALLOC);
 	if (m->flags & MOPT_CLEAR_ERR)
 		clear_opt(sb, ERRORS_MASK);
+	if ((m->flags & MOPT_WANT_SYS_ADMIN) && !capable(CAP_SYS_ADMIN))
+		return 1;
+
 	if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
 		ext4_msg(sb, KERN_ERR, "Cannot change quota "
 			 "options when quota turned on");
@@ -1612,6 +1662,14 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 		}
 		*journal_ioprio =
 			IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+	} else if (token == Opt_balloon_ino) {
+		*balloon_ino = arg;
+	} else if (token == Opt_pfcache_csum) {
+		if (capable(CAP_SYS_ADMIN))
+			set_opt2(sb, PFCACHE_CSUM);
+	} else if (token == Opt_nopfcache_csum) {
+		if (capable(CAP_SYS_ADMIN))
+			clear_opt2(sb, PFCACHE_CSUM);
 	} else if (m->flags & MOPT_DATAJ) {
 		if (is_remount) {
 			if (!sbi->s_journal)
@@ -1677,6 +1735,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token,
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
 			 unsigned int *journal_ioprio,
+			 unsigned long *balloon_ino,
 			 int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1697,7 +1756,8 @@ static int parse_options(char *options, struct super_block *sb,
 		args[0].to = args[0].from = NULL;
 		token = match_token(p, tokens, args);
 		if (handle_mount_opt(sb, p, token, args, journal_devnum,
-				     journal_ioprio, is_remount) < 0)
+				     journal_ioprio, balloon_ino,
+				     is_remount) < 0)
 			return 0;
 	}
 #ifdef CONFIG_QUOTA
@@ -1867,6 +1927,24 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
 	if (test_opt(sb, DATA_ERR_ABORT))
 		SEQ_OPTS_PUTS("data_err=abort");
 
+	if (sbi->s_balloon_ino)
+		SEQ_OPTS_PRINT("balloon_ino=%lu", sbi->s_balloon_ino->i_ino);
+
+	if (ve_is_super(get_exec_env())) {
+		if (test_opt2(sb, PFCACHE_CSUM))
+			SEQ_OPTS_PUTS("pfcache_csum");
+		else if (nodefs)
+			SEQ_OPTS_PUTS("nopfcache_csum");
+		if (sbi->s_pfcache_root.mnt) {
+			spin_lock(&sbi->s_pfcache_lock);
+			if (sbi->s_pfcache_root.mnt) {
+				SEQ_OPTS_PUTS("pfcache=");
+				seq_path(seq, &sbi->s_pfcache_root, "\\ \t\n");
+			}
+			spin_unlock(&sbi->s_pfcache_lock);
+		}
+	}
+
 	ext4_show_quota_options(seq, sb);
 	return 0;
 }
@@ -2619,6 +2697,30 @@ static ssize_t sbi_deprecated_show(struct ext4_attr *a,
 	return snprintf(buf, PAGE_SIZE, "%d\n", a->u.deprecated_val);
 }
 
+static ssize_t csum_partial_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lld\n",
+			(s64)percpu_counter_sum(&sbi->s_csum_partial));
+}
+
+static ssize_t csum_complete_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lld\n",
+			(s64)percpu_counter_sum(&sbi->s_csum_complete));
+}
+
+static ssize_t pfcache_peers_show(struct ext4_attr *a,
+					      struct ext4_sb_info *sbi,
+					      char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%lld\n",
+			(s64)percpu_counter_sum(&sbi->s_pfcache_peers));
+}
+
 #define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
 static struct ext4_attr ext4_attr_##_name = {			\
 	.attr = {.name = __stringify(_name), .mode = _mode },	\
@@ -2665,6 +2767,9 @@ EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
 EXT4_RW_ATTR(reserved_clusters);
+EXT4_RO_ATTR(csum_partial);
+EXT4_RO_ATTR(csum_complete);
+EXT4_RO_ATTR(pfcache_peers);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2686,6 +2791,7 @@ EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
 EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
 EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
 EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
+EXT4_RW_ATTR_SBI_UI(bd_full_ratelimit, s_bd_full_ratelimit);
 
 static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(delayed_allocation_blocks),
@@ -2712,6 +2818,10 @@ static struct attribute *ext4_attrs[] = {
 	ATTR_LIST(errors_count),
 	ATTR_LIST(first_error_time),
 	ATTR_LIST(last_error_time),
+	ATTR_LIST(bd_full_ratelimit),
+	ATTR_LIST(csum_partial),
+	ATTR_LIST(csum_complete),
+	ATTR_LIST(pfcache_peers),
 	NULL,
 };
 
@@ -2766,6 +2876,54 @@ static struct kobj_type ext4_ktype = {
 	.release	= ext4_sb_release,
 };
 
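+/*
+ * Pin the "balloon" inode: a regular, extent-based file whose blocks are
+ * hidden from the total block count reported by ext4_statfs().  Passing
+ * ino == 0 drops the current balloon reference.
+ */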
+static void ext4_load_balloon(struct super_block *sb, unsigned long ino)
+{
+	struct inode *inode;
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+
+	if (!ino) {
+		/* FIXME locking */
+		if (sbi->s_balloon_ino) {
+			iput(sbi->s_balloon_ino);
+			sbi->s_balloon_ino = NULL;
+		}
+
+		return;
+	}
+
+	if (ino < EXT4_FIRST_INO(sb)) {
+		ext4_msg(sb, KERN_WARNING, "bad balloon inode specified");
+		return;
+	}
+
+	inode = ext4_iget(sb, ino);
+	if (IS_ERR(inode)) {
+		ext4_msg(sb, KERN_WARNING, "can't load balloon inode (%ld)",
+			 PTR_ERR(inode));
+		return;
+	}
+
+	if (!S_ISREG(inode->i_mode)) {
+		iput(inode);
+		ext4_msg(sb, KERN_WARNING, "balloon inode must be a regular file");
+		return;
+	}
+
+	if (!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)) {
+		iput(inode);
+		ext4_msg(sb, KERN_WARNING, "balloon inode must be extent-based");
+		return;
+	}
+
+	/* FIXME - locking */
+	if (sbi->s_balloon_ino)
+		iput(sbi->s_balloon_ino);
+	sbi->s_balloon_ino = inode;
+	ext4_msg(sb, KERN_INFO, "loaded balloon from %lu (%llu blocks)",
+			inode->i_ino, (unsigned long long)inode->i_blocks);
+}
+
 static void ext4_feat_release(struct kobject *kobj)
 {
 	complete(&ext4_feat->f_kobj_unregister);
@@ -2913,7 +3071,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 	sb = elr->lr_super;
 	ngroups = EXT4_SB(sb)->s_groups_count;
 
-	sb_start_write(sb);
 	for (group = elr->lr_next_group; group < ngroups; group++) {
 		gdp = ext4_get_group_desc(sb, group, NULL);
 		if (!gdp) {
@@ -2940,8 +3097,6 @@ static int ext4_run_li_request(struct ext4_li_request *elr)
 		elr->lr_next_sched = jiffies + elr->lr_timeout;
 		elr->lr_next_group = group + 1;
 	}
-	sb_end_write(sb);
-
 	return ret;
 }
 
@@ -2991,9 +3146,9 @@ static struct task_struct *ext4_lazyinit_task;
 static int ext4_lazyinit_thread(void *arg)
 {
 	struct ext4_lazy_init *eli = (struct ext4_lazy_init *)arg;
-	struct list_head *pos, *n;
 	struct ext4_li_request *elr;
 	unsigned long next_wakeup, cur;
+	LIST_HEAD(request_list);
 
 	BUG_ON(NULL == eli);
 
@@ -3006,21 +3161,43 @@ cont_thread:
 			mutex_unlock(&eli->li_list_mtx);
 			goto exit_thread;
 		}
-
-		list_for_each_safe(pos, n, &eli->li_request_list) {
-			elr = list_entry(pos, struct ext4_li_request,
-					 lr_request);
-
-			if (time_after_eq(jiffies, elr->lr_next_sched)) {
-				if (ext4_run_li_request(elr) != 0) {
-					/* error, remove the lazy_init job */
-					ext4_remove_li_request(elr);
-					continue;
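+		/*
+		 * Splice the requests onto a local list so each entry can
+		 * be moved back and processed with li_list_mtx dropped
+		 * around the actual zeroing work.
+		 */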
+		list_splice_init(&eli->li_request_list, &request_list);
+		while (!list_empty(&request_list)) {
+			int err = 0;
+			int progress = 0;
+
+			elr = list_entry(request_list.next,
+					 struct ext4_li_request, lr_request);
+			list_move(request_list.next, &eli->li_request_list);
+			if (time_before(jiffies, elr->lr_next_sched)) {
+				if (time_before(elr->lr_next_sched, next_wakeup))
+					next_wakeup = elr->lr_next_sched;
+				continue;
+			}
+			if (down_read_trylock(&elr->lr_super->s_umount)) {
+				if (sb_start_write_trylock(elr->lr_super)) {
+					progress = 1;
+					/*
+					 * We hold sb->s_umount, so sb cannot
+					 * be removed from the list; it is
+					 * now safe to drop li_list_mtx.
+					 */
+					mutex_unlock(&eli->li_list_mtx);
+					err = ext4_run_li_request(elr);
+					sb_end_write(elr->lr_super);
+					mutex_lock(&eli->li_list_mtx);
 				}
+				up_read(&elr->lr_super->s_umount);
+			}
+			/* error, remove the lazy_init job */
+			if (err) {
+				ext4_remove_li_request(elr);
+				continue;
+			}
+			if (!progress) {
+				elr->lr_next_sched = jiffies +
+					(prandom_u32()
+					 % (EXT4_DEF_LI_MAX_START_DELAY * HZ));
 			}
-
-			if (time_before(elr->lr_next_sched, next_wakeup))
-				next_wakeup = elr->lr_next_sched;
 		}
 		mutex_unlock(&eli->li_list_mtx);
 
@@ -3457,6 +3634,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	__u64 blocks_count;
 	int err = 0;
 	unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+	unsigned long balloon_ino = 0;
 	ext4_group_t first_not_zeroed;
 
 	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
@@ -3580,8 +3758,12 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	else if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_WBACK)
 		set_opt(sb, WRITEBACK_DATA);
 
-	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC)
-		set_opt(sb, ERRORS_PANIC);
+	if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_PANIC) {
+		if (capable(CAP_SYS_ADMIN))
+			set_opt(sb, ERRORS_PANIC);
+		else
+			set_opt(sb, ERRORS_RO);
-	else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
+	} else if (le16_to_cpu(sbi->s_es->s_errors) == EXT4_ERRORS_CONTINUE)
 		set_opt(sb, ERRORS_CONT);
 	else
@@ -3615,14 +3797,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
 	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
-			   &journal_devnum, &journal_ioprio, 0)) {
+			   &journal_devnum, &journal_ioprio, &balloon_ino,
+			   0)) {
 		ext4_msg(sb, KERN_WARNING,
 			 "failed to parse options in superblock: %s",
 			 sbi->s_es->s_mount_opts);
 	}
 	sbi->s_def_mount_opt = sbi->s_mount_opt;
 	if (!parse_options((char *) data, sb, &journal_devnum,
-			   &journal_ioprio, 0))
+			   &journal_ioprio, &balloon_ino, 0))
 		goto failed_mount;
 
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -3967,6 +4150,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	sbi->s_gdb_count = db_count;
 	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
 	spin_lock_init(&sbi->s_next_gen_lock);
+	spin_lock_init(&sbi->s_pfcache_lock);
 
 	init_timer(&sbi->s_err_report);
 	sbi->s_err_report.function = print_daily_error_info;
@@ -3976,13 +4160,23 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	ext4_es_register_shrinker(sbi);
 
 	err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0, GFP_KERNEL);
-	if (err) {
+	if (!err)
+		err = percpu_counter_init(&sbi->s_csum_partial, 0, GFP_KERNEL);
+	if (!err)
+		err = percpu_counter_init(&sbi->s_csum_complete, 0, GFP_KERNEL);
+	if (!err)
+		err = percpu_counter_init(&sbi->s_pfcache_peers, 0, GFP_KERNEL);
+	if (err) {
 		ext4_msg(sb, KERN_ERR, "insufficient memory");
 		goto failed_mount3;
 	}
 
 	sbi->s_stripe = ext4_get_stripe_size(sbi);
 	sbi->s_extent_max_zeroout_kb = 32;
+	sbi->s_bd_full_ratelimit = 1024;
 
 	/*
 	 * set up enough so that it can read an inode
@@ -4255,6 +4449,8 @@ no_journal:
 				 "the device does not support discard");
 	}
 
+	ext4_load_balloon(sb, balloon_ino);
+
 	ext4_msg(sb, KERN_INFO, "mounted filesystem with%s. "
 		 "Opts: %s%s%s", descr, sbi->s_es->s_mount_opts,
 		 *sbi->s_es->s_mount_opts ? "; " : "", orig_data);
@@ -4308,6 +4504,9 @@ failed_mount3:
 	ext4_es_unregister_shrinker(sbi);
 	del_timer_sync(&sbi->s_err_report);
 	percpu_counter_destroy(&sbi->s_extent_cache_cnt);
+	percpu_counter_destroy(&sbi->s_csum_partial);
+	percpu_counter_destroy(&sbi->s_csum_complete);
+	percpu_counter_destroy(&sbi->s_pfcache_peers);
 	if (sbi->s_mmp_tsk)
 		kthread_stop(sbi->s_mmp_tsk);
 failed_mount2:
@@ -4321,6 +4520,8 @@ failed_mount:
 		remove_proc_entry("options", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
+	if (sbi->s_pfcache_root.mnt)
+		ext4_relink_pfcache(sb, NULL, true);
 #ifdef CONFIG_QUOTA
 	for (i = 0; i < MAXQUOTAS; i++)
 		kfree(sbi->s_qf_names[i]);
@@ -4735,8 +4936,12 @@ int ext4_force_commit(struct super_block *sb)
 {
 	journal_t *journal;
 
-	if (sb->s_flags & MS_RDONLY)
+	if (sb->s_flags & MS_RDONLY) {
+		smp_rmb();
+		if (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED)
+			return -EROFS;
 		return 0;
+	}
 
 	journal = EXT4_SB(sb)->s_journal;
 	return ext4_journal_force_commit(journal);
@@ -4876,6 +5081,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	int i, j;
 #endif
 	char *orig_data = kstrdup(data, GFP_KERNEL);
+	unsigned long balloon_ino = -1;
 
 	/* Store the original options */
 	old_sb_flags = sb->s_flags;
@@ -4904,7 +5110,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (sbi->s_journal && sbi->s_journal->j_task->io_context)
 		journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
 
-	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
+	if (!parse_options(data, sb, NULL, &journal_ioprio, &balloon_ino, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -5060,6 +5266,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	if (sbi->s_journal == NULL && !(old_sb_flags & MS_RDONLY))
 		ext4_commit_super(sb, 1);
 
+	if (balloon_ino != -1)
+		ext4_load_balloon(sb, balloon_ino);
+
 #ifdef CONFIG_QUOTA
 	/* Release old quota file names */
 	for (i = 0; i < MAXQUOTAS; i++)
@@ -5076,6 +5285,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 	}
 #endif
 
+	*flags = (*flags & ~MS_LAZYTIME) | (sb->s_flags & MS_LAZYTIME);
 	ext4_msg(sb, KERN_INFO, "re-mounted. Opts: %s", orig_data);
 	kfree(orig_data);
 	return 0;
@@ -5132,6 +5342,20 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
 	buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
 	buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
 
+	if (sbi->s_balloon_ino) {
+		struct ext4_inode_info *ei;
+		blkcnt_t balloon_blocks;
+
+		balloon_blocks = sbi->s_balloon_ino->i_blocks;
+		ei = EXT4_I(sbi->s_balloon_ino);
+		spin_lock(&ei->i_block_reservation_lock);
+		balloon_blocks += ei->i_reserved_data_blocks;
+		spin_unlock(&ei->i_block_reservation_lock);
+
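+		/* i_blocks is in 512-byte sectors; convert to fs blocks */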
+		BUG_ON(sbi->s_balloon_ino->i_blkbits < 9);
+		buf->f_blocks -= balloon_blocks >> (sbi->s_balloon_ino->i_blkbits - 9);
+	}
+
 	return 0;
 }
 
@@ -5497,6 +5721,8 @@ out:
 static struct dentry *ext4_mount(struct file_system_type *fs_type, int flags,
 		       const char *dev_name, void *data)
 {
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
 	return mount_bdev(fs_type, flags, dev_name, data, ext4_fill_super);
 }
 
@@ -5562,13 +5788,29 @@ static inline void unregister_as_ext3(void) { }
 static inline int ext3_feature_set_ok(struct super_block *sb) { return 0; }
 #endif
 
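+/*
+ * Drop the balloon inode reference and detach the pfcache mount before
+ * the generic block superblock teardown.
+ */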
+static void ext4_kill_sb(struct super_block *sb)
+{
+	struct ext4_sb_info *sbi;
+
+	sbi = EXT4_SB(sb);
+	if (sbi && sbi->s_balloon_ino)
+		iput(sbi->s_balloon_ino);
+
+	if (sbi && sbi->s_pfcache_root.mnt)
+		ext4_relink_pfcache(sb, NULL, false);
+
+	kill_block_super(sb);
+}
+
 static struct file_system_type ext4_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "ext4",
 	.mount		= ext4_mount,
-	.kill_sb	= kill_block_super,
+	.kill_sb	= ext4_kill_sb,
 	.fs_flags	= FS_REQUIRES_DEV | FS_HAS_INVALIDATE_RANGE |
-			  FS_HAS_DIO_IODONE2,
+			  FS_HAS_DIO_IODONE2 | FS_VIRTUALIZED |
+			  FS_HAS_MMAP_PREP | FS_USERNS_MOUNT |
+			  FS_USERNS_DEV_MOUNT,
 };
 MODULE_ALIAS_FS("ext4");
 
--- a/fs/ext4/truncate.h
+++ b/fs/ext4/truncate.h
@@ -12,6 +12,8 @@ static inline void ext4_truncate_failed_write(struct inode *inode)
 {
 	down_write(&EXT4_I(inode)->i_mmap_sem);
 	truncate_inode_pages(inode->i_mapping, inode->i_size);
+	if (ext4_test_inode_state(inode, EXT4_STATE_PFCACHE_CSUM))
+		ext4_truncate_data_csum(inode, inode->i_size);
 	ext4_truncate(inode);
 	up_write(&EXT4_I(inode)->i_mmap_sem);
 }
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -106,6 +106,7 @@ static const struct xattr_handler *ext4_xattr_handler_map[] = {
 
 const struct xattr_handler *ext4_xattr_handlers[] = {
 	&ext4_xattr_user_handler,
+	&ext4_xattr_trusted_csum_handler,
 	&ext4_xattr_trusted_handler,
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
 	&ext4_xattr_acl_access_handler,
@@ -871,6 +872,10 @@ inserted:
 						EXT4_C2B(EXT4_SB(sb), 1));
 				if (error)
 					goto cleanup;
+				if (check_bd_full(inode, 1)) {
+					error = -ENOSPC;
+					goto cleanup_dquot;
+				}
 				BUFFER_TRACE(new_bh, "get_write_access");
 				error = ext4_journal_get_write_access(handle,
 								      new_bh);
--- a/fs/ext4/xattr_trusted.c
+++ b/fs/ext4/xattr_trusted.c
@@ -19,7 +19,12 @@ ext4_xattr_trusted_list(struct dentry *dentry, char *list, size_t list_size,
 	const size_t prefix_len = XATTR_TRUSTED_PREFIX_LEN;
 	const size_t total_len = prefix_len + name_len + 1;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!strcmp(name, EXT4_DATA_CSUM_NAME) &&
+	    (!capable(CAP_SYS_ADMIN) ||
+	     !test_opt2(dentry->d_inode->i_sb, PFCACHE_CSUM)))
+		return 0;
+
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return 0;
 
 	if (list && total_len <= list_size) {
--- a/fs/f2fs/acl.c
+++ b/fs/f2fs/acl.c
@@ -223,12 +223,10 @@ static int f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	case ACL_TYPE_ACCESS:
 		name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl) {
-			error = posix_acl_equiv_mode(acl, &inode->i_mode);
-			if (error < 0)
+			error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+			if (error)
 				return error;
 			set_acl_inode(fi, inode->i_mode);
-			if (error == 0)
-				acl = NULL;
 		}
 		break;
 
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -99,7 +99,6 @@ out:
 static const struct vm_operations_struct f2fs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= f2fs_vm_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -732,8 +732,9 @@ MODULE_ALIAS_FS("f2fs");
 
 static int __init init_inodecache(void)
 {
-	f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache",
-			sizeof(struct f2fs_inode_info), NULL);
+	f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache",
+			sizeof(struct f2fs_inode_info), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, NULL);
 	if (f2fs_inode_cachep == NULL)
 		return -ENOMEM;
 	return 0;
--- a/fs/fat/inode.c
+++ b/fs/fat/inode.c
@@ -609,7 +609,7 @@ static int __init fat_init_inodecache(void)
 	fat_inode_cachep = kmem_cache_create("fat_inode_cache",
 					     sizeof(struct msdos_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (fat_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -22,6 +22,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <linux/shmem_fs.h>
+#include <linux/ve.h>
 
 #include <asm/poll.h>
 #include <asm/siginfo.h>
@@ -29,11 +30,50 @@
 
 #define SETFL_MASK (O_APPEND | O_NONBLOCK | O_NDELAY | O_DIRECT | O_NOATIME)
 
+void generic_set_file_flags_unlocked(struct file *filp, unsigned int arg)
+{
+	filp->f_flags = (arg & SETFL_MASK) |
+			(filp->f_flags & ~SETFL_MASK);
+}
+EXPORT_SYMBOL(generic_set_file_flags_unlocked);
+
+int generic_set_file_flags(struct file *filp, unsigned int arg)
+{
+	spin_lock(&filp->f_lock);
+	generic_set_file_flags_unlocked(filp, arg);
+	spin_unlock(&filp->f_lock);
+	return 0;
+}
+EXPORT_SYMBOL(generic_set_file_flags);
+
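+/*
+ * O_DIRECT is always allowed on the host; inside a container it needs
+ * CAP_SYS_RAWIO or the per-VE odirect_enable knob (2 = follow ve0).
+ */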
+int may_use_odirect(void)
+{
+	int may;
+
+	if (ve_is_super(get_exec_env()))
+		return 1;
+
+	may = capable(CAP_SYS_RAWIO);
+	if (!may) {
+		may = get_exec_env()->odirect_enable;
+		if (may == 2)
+			may = get_ve0()->odirect_enable;
+	}
+
+	return may;
+}
+
 static int setfl(int fd, struct file * filp, unsigned long arg)
 {
 	struct inode * inode = file_inode(filp);
 	int error = 0;
 
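+	/* Filter flags the current VE is not allowed or configured to use */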
+	if (!may_use_odirect())
+		arg &= ~O_DIRECT;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		arg &= ~O_SYNC;
 	/*
 	 * O_APPEND cannot be cleared if the file is marked as append-only
 	 * and the file is open for write.
@@ -57,10 +97,6 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 				return -EINVAL;
 	}
 
-	if (filp->f_op && filp->f_op->check_flags)
-		error = filp->f_op->check_flags(arg);
-	if (error)
-		return error;
-
 	/*
 	 * ->fasync() is responsible for setting the FASYNC bit.
@@ -73,10 +109,11 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
 		if (error > 0)
 			error = 0;
 	}
-	spin_lock(&filp->f_lock);
-	filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
-	spin_unlock(&filp->f_lock);
 
+	if (filp->f_op && filp->f_op->set_flags)
+		error = filp->f_op->set_flags(filp, arg);
+	else
+		error = generic_set_file_flags(filp, arg);
  out:
 	return error;
 }
@@ -746,7 +783,7 @@ static int __init fcntl_init(void)
 		));
 
 	fasync_cache = kmem_cache_create("fasync_cache",
-		sizeof(struct fasync_struct), 0, SLAB_PANIC, NULL);
+		sizeof(struct fasync_struct), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	return 0;
 }
 
--- a/fs/file.c
+++ b/fs/file.c
@@ -37,11 +37,11 @@ static void *alloc_fdmem(size_t size)
 	 * vmalloc() if the allocation size will be considered "large" by the VM.
 	 */
 	if (size <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
-		void *data = kmalloc(size, GFP_KERNEL|__GFP_NOWARN);
+		void *data = kmalloc(size, GFP_KERNEL_ACCOUNT|__GFP_NOWARN|__GFP_NORETRY);
 		if (data != NULL)
 			return data;
 	}
-	return vmalloc(size);
+	return __vmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM, PAGE_KERNEL);
 }
 
 static void free_fdmem(void *ptr)
@@ -110,7 +110,7 @@ static struct fdtable * alloc_fdtable(unsigned int nr)
 	if (unlikely(nr > sysctl_nr_open))
 		nr = ((sysctl_nr_open - 1) | (BITS_PER_LONG - 1)) + 1;
 
-	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL);
+	fdt = kmalloc(sizeof(struct fdtable), GFP_KERNEL_ACCOUNT);
 	if (!fdt)
 		goto out;
 	fdt->max_fds = nr;
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -27,9 +27,13 @@
 #include <linux/task_work.h>
 #include <linux/ima.h>
 #include <linux/swap.h>
+#include <linux/ve.h>
 
 #include <linux/atomic.h>
 
+#include <bc/beancounter.h>
+#include <bc/misc.h>
+
 #include "internal.h"
 
 /* sysctl tunables... */
@@ -52,8 +56,10 @@ static void file_free_rcu(struct rcu_head *head)
 
 static inline void file_free(struct file *f)
 {
-	percpu_counter_dec(&nr_files);
 	file_check_state(f);
+	if (f->f_ub == get_ub0())
+		percpu_counter_dec(&nr_files);
+	ub_file_uncharge(f);
 	call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
 }
 
@@ -108,11 +114,14 @@ struct file *get_empty_filp(void)
 	static long old_max;
 	struct file *f;
 	int error;
+	int acct;
 
+	acct = (get_exec_ub() == get_ub0());
 	/*
 	 * Privileged users can go above max_files
 	 */
-	if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+	if (acct && get_nr_files() >= files_stat.max_files &&
+			!capable(CAP_SYS_ADMIN)) {
 		/*
 		 * percpu_counters are inaccurate.  Do an expensive check before
 		 * we go and fail.
@@ -125,7 +134,13 @@ struct file *get_empty_filp(void)
 	if (unlikely(!f))
 		return ERR_PTR(-ENOMEM);
 
-	percpu_counter_inc(&nr_files);
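+	/* Charge the new file against the current beancounter */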
+	if (ub_file_charge(f)) {
+		kmem_cache_free(filp_cachep, f);
+		return ERR_PTR(-ENOMEM);
+	}
+	if (acct)
+		percpu_counter_inc(&nr_files);
+
 	f->f_cred = get_cred(cred);
 	error = security_file_alloc(f);
 	if (unlikely(error)) {
@@ -227,6 +242,8 @@ static void __fput(struct file *file)
 	struct dentry *dentry = file->f_path.dentry;
 	struct vfsmount *mnt = file->f_path.mnt;
 	struct inode *inode = file->f_inode;
+	struct dentry *original_dentry = file->f_original_path.dentry;
+	struct vfsmount *original_mnt = file->f_original_path.mnt;
 
 	might_sleep();
 
@@ -258,10 +275,14 @@ static void __fput(struct file *file)
 		drop_file_write_access(file);
 	file->f_path.dentry = NULL;
 	file->f_path.mnt = NULL;
+	file->f_original_path.dentry = NULL;
+	file->f_original_path.mnt = NULL;
 	file->f_inode = NULL;
 	file_free(file);
 	dput(dentry);
 	mntput(mnt);
+	dput(original_dentry);
+	mntput(original_mnt);
 }
 
 static DEFINE_SPINLOCK(delayed_fput_lock);
--- a/fs/filesystems.c
+++ b/fs/filesystems.c
@@ -217,6 +217,11 @@ int __init get_filesystem_list(char *buf)
 	return len;
 }
 
+static inline bool filesystem_permitted(const struct file_system_type *fs)
+{
+	return ve_is_super(get_exec_env()) || (fs->fs_flags & FS_VIRTUALIZED);
+}
+
 #ifdef CONFIG_PROC_FS
 static int filesystems_proc_show(struct seq_file *m, void *v)
 {
@@ -225,9 +230,11 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
 	read_lock(&file_systems_lock);
 	tmp = file_systems;
 	while (tmp) {
-		seq_printf(m, "%s\t%s\n",
-			(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
-			tmp->name);
+		if (filesystem_permitted(tmp)) {
+			seq_printf(m, "%s\t%s\n",
+				(tmp->fs_flags & FS_REQUIRES_DEV) ? "" : "nodev",
+				tmp->name);
+		}
 		tmp = tmp->next;
 	}
 	read_unlock(&file_systems_lock);
@@ -236,7 +243,7 @@ static int filesystems_proc_show(struct seq_file *m, void *v)
 
 static int filesystems_proc_open(struct inode *inode, struct file *file)
 {
-	return single_open(file, filesystems_proc_show, NULL);
+	return single_open(file, filesystems_proc_show, inode->i_sb);
 }
 
 static const struct file_operations filesystems_proc_fops = {
@@ -248,7 +255,7 @@ static const struct file_operations filesystems_proc_fops = {
 
 static int __init proc_filesystems_init(void)
 {
-	proc_create("filesystems", 0, NULL, &filesystems_proc_fops);
+	proc_create("filesystems", S_ISVTX, NULL, &filesystems_proc_fops);
 	return 0;
 }
 module_init(proc_filesystems_init);
@@ -276,7 +283,8 @@ struct file_system_type *get_fs_type(const char *name)
 	if (!fs && (request_module("fs-%.*s", len, name) == 0))
 		fs = __get_fs_type(name, len);
 
-	if (dot && fs && !(fs->fs_flags & FS_HAS_SUBTYPE)) {
+	if (fs && (!filesystem_permitted(fs) ||
+		   (dot && !(fs->fs_flags & FS_HAS_SUBTYPE)))) {
 		put_filesystem(fs);
 		fs = NULL;
 	}
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -27,6 +27,7 @@
 #include <linux/backing-dev.h>
 #include <linux/tracepoint.h>
 #include "internal.h"
+#include <bc/io_acct.h>
 
 /*
  * 4MB minimal write chunk size
@@ -41,6 +42,7 @@ struct wb_writeback_work {
 	struct super_block *sb;
 	unsigned long *older_than_this;
 	enum writeback_sync_modes sync_mode;
+	unsigned int filter_ub:1;
 	unsigned int tagged_writepages:1;
 	unsigned int for_kupdate:1;
 	unsigned int range_cyclic:1;
@@ -50,8 +52,21 @@ struct wb_writeback_work {
 
 	struct list_head list;		/* pending work list */
 	struct completion *done;	/* set if the caller waits */
+	struct user_beancounter *ub;
 };
 
+/*
+ * If an inode is constantly having its pages dirtied, but then the
+ * updates stop dirtytime_expire_interval seconds in the past, it's
+ * possible for the worst case time between when an inode has its
+ * timestamps updated and when they finally get written out to be two
+ * dirtytime_expire_intervals.  We set the default to 12 hours (in
+ * seconds), which means most of the time inodes will have their
+ * timestamps written to disk after 12 hours, but in the worst case a
+ * few inodes might not have their timestamps updated for 24 hours.
+ */
+unsigned int dirtytime_expire_interval = 12 * 60 * 60;
+
 /**
  * writeback_in_progress - determine whether there is writeback in progress
  * @bdi: the device's backing_dev_info structure.
@@ -107,7 +122,8 @@ out_unlock:
 
 static void
 __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
-		      bool range_cyclic, enum wb_reason reason)
+			struct user_beancounter *ub, bool range_cyclic,
+			enum wb_reason reason)
 {
 	struct wb_writeback_work *work;
 
@@ -126,6 +142,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 	work->nr_pages	= nr_pages;
 	work->range_cyclic = range_cyclic;
 	work->reason	= reason;
+	work->ub	= ub;
 
 	bdi_queue_work(bdi, work);
 }
@@ -145,7 +162,7 @@ __bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason)
 {
-	__bdi_start_writeback(bdi, nr_pages, true, reason);
+	__bdi_start_writeback(bdi, nr_pages, NULL, true, reason);
 }
 
 /**
@@ -236,14 +253,19 @@ static bool inode_dirtied_after(struct inode *inode, unsigned long t)
 	return ret;
 }
 
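+/* move_expired_inodes() flag: expire inodes dirtied only by timestamp updates */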
+#define EXPIRE_DIRTY_ATIME 0x0001
+
 /*
  * Move expired (dirtied before work->older_than_this) dirty inodes from
  * @delaying_queue to @dispatch_queue.
  */
 static int move_expired_inodes(struct list_head *delaying_queue,
 			       struct list_head *dispatch_queue,
+			       int flags,
 			       struct wb_writeback_work *work)
 {
+	unsigned long *older_than_this = NULL;
+	unsigned long expire_time;
 	LIST_HEAD(tmp);
 	struct list_head *pos, *node;
 	struct super_block *sb = NULL;
@@ -251,13 +273,24 @@ static int move_expired_inodes(struct list_head *delaying_queue,
 	int do_sb_sort = 0;
 	int moved = 0;
 
+	if ((flags & EXPIRE_DIRTY_ATIME) == 0)
+		older_than_this = work->older_than_this;
+	else if (!work->for_sync) {
+		expire_time = jiffies - (dirtytime_expire_interval * HZ);
+		older_than_this = &expire_time;
+	}
+
 	while (!list_empty(delaying_queue)) {
 		inode = wb_inode(delaying_queue->prev);
-		if (work->older_than_this &&
-		    inode_dirtied_after(inode, *work->older_than_this))
+		if (older_than_this &&
+		    inode_dirtied_after(inode, *older_than_this))
 			break;
+
 		list_move(&inode->i_wb_list, &tmp);
 		moved++;
+		if (flags & EXPIRE_DIRTY_ATIME)
+			set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state);
+
 		if (sb_is_blkdev_sb(inode->i_sb))
 			continue;
 		if (sb && sb != inode->i_sb)
@@ -298,9 +331,12 @@ out:
 static void queue_io(struct bdi_writeback *wb, struct wb_writeback_work *work)
 {
 	int moved;
+
 	assert_spin_locked(&wb->list_lock);
 	list_splice_init(&wb->b_more_io, &wb->b_io);
-	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, work);
+	moved = move_expired_inodes(&wb->b_dirty, &wb->b_io, 0, work);
+	moved += move_expired_inodes(&wb->b_dirty_time, &wb->b_io,
+				     EXPIRE_DIRTY_ATIME, work);
 	trace_writeback_queue_io(wb, work, moved);
 }
 
@@ -424,6 +460,9 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
 		 * updates after data IO completion.
 		 */
 		redirty_tail(inode, wb);
+	} else if (inode->i_state & I_DIRTY_TIME) {
+		inode->dirtied_when = jiffies;
+		list_move(&inode->i_wb_list, &wb->b_dirty_time);
 	} else {
 		/* The inode is clean. Remove from writeback lists. */
 		list_del_init(&inode->i_wb_list);
@@ -436,7 +475,7 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb,
  * setting I_SYNC flag and calling inode_sync_complete() to clear it.
  */
 static int
-__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+__do_writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 {
 	struct address_space *mapping = inode->i_mapping;
 	long nr_to_write = wbc->nr_to_write;
@@ -471,11 +510,23 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	/* Clear I_DIRTY_PAGES if we've written out all dirty pages */
 	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		inode->i_state &= ~I_DIRTY_PAGES;
-	dirty = inode->i_state & I_DIRTY;
-	inode->i_state &= ~(I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+
+	dirty = inode->i_state & (I_DIRTY_SYNC | I_DIRTY_DATASYNC);
+	if (inode->i_state & I_DIRTY_TIME) {
+		if ((dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) ||
+		    unlikely(inode->i_state & I_DIRTY_TIME_EXPIRED) ||
+		    unlikely(time_after(jiffies,
+					(inode->dirtied_time_when +
+					 dirtytime_expire_interval * HZ)))) {
+			dirty |= I_DIRTY_TIME | I_DIRTY_TIME_EXPIRED;
+			trace_writeback_lazytime(inode);
+		}
+	} else
+		inode->i_state &= ~I_DIRTY_TIME_EXPIRED;
+	inode->i_state &= ~dirty;
 	spin_unlock(&inode->i_lock);
 	/* Don't write the inode if only I_DIRTY_PAGES was set */
-	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+	if (dirty & ~I_DIRTY_PAGES) {
 		int err = write_inode(inode, wbc);
 		if (ret == 0)
 			ret = err;
@@ -484,6 +535,25 @@ __writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
 	return ret;
 }
 
+static int
+__writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
+{
+	struct user_beancounter *ub;
+	int ret;
+
+	rcu_read_lock();
+	ub = rcu_dereference(inode->i_mapping->dirtied_ub);
+	if (!ub || !get_beancounter_rcu(ub))
+		ub = get_beancounter(get_ub0());
+	rcu_read_unlock();
+
+	ub = set_exec_ub(ub);
+	ret = __do_writeback_single_inode(inode, wbc);
+	put_beancounter(set_exec_ub(ub));
+
+	return ret;
+}
+
 /*
  * Write out an inode's dirty pages. Either the caller has an active reference
  * on the inode or the inode has I_WILL_FREE set.
@@ -523,7 +593,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * make sure inode is on some writeback list and leave it there unless
 	 * we have completely cleaned the inode.
 	 */
-	if (!(inode->i_state & I_DIRTY) &&
+	if (!(inode->i_state & I_DIRTY_ALL) &&
 	    (wbc->sync_mode != WB_SYNC_ALL ||
 	     !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
 		goto out;
@@ -538,7 +608,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb,
 	 * If inode is clean, remove it from writeback lists. Otherwise don't
 	 * touch it. See comment above for explanation.
 	 */
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY_ALL))
 		list_del_init(&inode->i_wb_list);
 	spin_unlock(&wb->list_lock);
 	inode_sync_complete(inode);
@@ -634,6 +704,14 @@ static long writeback_sb_inodes(struct super_block *sb,
 			redirty_tail(inode, wb);
 			continue;
 		}
+		if ((work->ub || work->filter_ub) &&
+		    ((inode->i_state & I_DIRTY) == I_DIRTY_PAGES) &&
+		    ub_should_skip_writeback(work->ub, inode)) {
+			spin_unlock(&inode->i_lock);
+			redirty_tail(inode, wb);
+			continue;
+		}
+
 		if ((inode->i_state & I_SYNC) && wbc.sync_mode != WB_SYNC_ALL) {
 			/*
 			 * If this inode is locked for writeback and we are not
@@ -649,6 +727,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 			trace_writeback_sb_inodes_requeue(inode);
 			continue;
 		}
+
 		spin_unlock(&wb->list_lock);
 
 		/*
@@ -680,7 +759,7 @@ static long writeback_sb_inodes(struct super_block *sb,
 		wrote += write_chunk - wbc.nr_to_write;
 		spin_lock(&wb->list_lock);
 		spin_lock(&inode->i_lock);
-		if (!(inode->i_state & I_DIRTY))
+		if (!(inode->i_state & I_DIRTY_ALL))
 			wrote++;
 		requeue_inode(inode, wb, &wbc);
 		inode_sync_complete(inode);
@@ -696,6 +775,9 @@ static long writeback_sb_inodes(struct super_block *sb,
 			if (work->nr_pages <= 0)
 				break;
 		}
+
+		WARN_ON(wbc.pages_skipped > write_chunk - wbc.nr_to_write);
+		wrote -= wbc.pages_skipped;
 	}
 	return wrote;
 }
@@ -734,14 +816,15 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
 	return wrote;
 }
 
-static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
-				enum wb_reason reason)
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+			enum wb_reason reason, struct user_beancounter *ub)
 {
 	struct wb_writeback_work work = {
 		.nr_pages	= nr_pages,
 		.sync_mode	= WB_SYNC_NONE,
 		.range_cyclic	= 1,
 		.reason		= reason,
+		.ub		= ub,
 	};
 
 	spin_lock(&wb->list_lock);
@@ -756,6 +839,7 @@ static long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
 static bool over_bground_thresh(struct backing_dev_info *bdi)
 {
 	unsigned long background_thresh, dirty_thresh;
+	unsigned long bdi_thresh, bdi_bg_thresh;
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
 
@@ -763,8 +847,11 @@ static bool over_bground_thresh(struct backing_dev_info *bdi)
 	    global_page_state(NR_UNSTABLE_NFS) > background_thresh)
 		return true;
 
-	if (bdi_stat(bdi, BDI_RECLAIMABLE) >
-				bdi_dirty_limit(bdi, background_thresh))
+	bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+	bdi_bg_thresh = div_u64((u64)bdi_thresh * background_thresh,
+				dirty_thresh + 1);
+
+	if (bdi_stat(bdi, BDI_RECLAIMABLE) > bdi_bg_thresh)
 		return true;
 
 	return false;
@@ -829,8 +916,14 @@ static long wb_writeback(struct bdi_writeback *wb,
 		 * For background writeout, stop when we are below the
 		 * background dirty threshold
 		 */
-		if (work->for_background && !over_bground_thresh(wb->bdi))
-			break;
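+		/*
+		 * If the bdi is below its background threshold but some
+		 * beancounter is still over its own, keep going with
+		 * per-ub filtering instead of stopping.
+		 */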
+		if (work->for_background) {
+			if (over_bground_thresh(wb->bdi))
+				work->filter_ub = 0;
+			else if (ub_over_bground_thresh())
+				work->filter_ub = 1;
+			else
+				break;
+		}
 
 		/*
 		 * Kupdate and background works are special and we want to
@@ -921,7 +1014,8 @@ static unsigned long get_nr_dirty_pages(void)
 
 static long wb_check_background_flush(struct bdi_writeback *wb)
 {
-	if (over_bground_thresh(wb->bdi)) {
+	if (over_bground_thresh(wb->bdi) ||
+	    ub_over_bground_thresh()) {
 
 		struct wb_writeback_work work = {
 			.nr_pages	= LONG_MAX,
@@ -1040,7 +1134,7 @@ void bdi_writeback_workfn(struct work_struct *work)
 		 * enough for efficient IO.
 		 */
 		pages_written = writeback_inodes_wb(&bdi->wb, 1024,
-						    WB_REASON_FORKER_THREAD);
+						WB_REASON_FORKER_THREAD, NULL);
 		trace_writeback_pages_written(pages_written);
 	}
 
@@ -1056,7 +1150,8 @@ void bdi_writeback_workfn(struct work_struct *work)
  * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
  * the whole world.
  */
-void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+void wakeup_flusher_threads_ub(long nr_pages, struct user_beancounter *ub,
+			enum wb_reason reason)
 {
 	struct backing_dev_info *bdi;
 
@@ -1067,9 +1162,64 @@ void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
 	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
 		if (!bdi_has_dirty_io(bdi))
 			continue;
-		__bdi_start_writeback(bdi, nr_pages, false, reason);
+		__bdi_start_writeback(bdi, nr_pages, ub, false, reason);
+	}
+	rcu_read_unlock();
+}
+
+void wakeup_flusher_threads(long nr_pages, enum wb_reason reason)
+{
+	wakeup_flusher_threads_ub(nr_pages, NULL, reason);
+}
+
+/*
+ * Wake up bdi's periodically to make sure dirtytime inodes get
+ * written back periodically.  We deliberately do *not* check the
+ * b_dirty_time list in wb_has_dirty_io(), since this would cause the
+ * kernel to be constantly waking up once there are any dirtytime
+ * inodes on the system.  So instead we define a separate delayed work
+ * function which gets called much more rarely.  (By default, only
+ * once every 12 hours.)
+ *
+ * If there is any other write activity going on in the file system,
+ * this function won't be necessary.  But if the only thing that has
+ * happened on the file system is a dirtytime inode caused by an atime
+ * update, we need this infrastructure below to make sure that inode
+ * eventually gets pushed out to disk.
+ */
+static void wakeup_dirtytime_writeback(struct work_struct *w);
+static DECLARE_DELAYED_WORK(dirtytime_work, wakeup_dirtytime_writeback);
+
+static void wakeup_dirtytime_writeback(struct work_struct *w)
+{
+	struct backing_dev_info *bdi;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
+		if (list_empty(&bdi->wb.b_dirty_time))
+			continue;
+		mod_delayed_work(bdi_wq, &bdi->wb.dwork, 0);
 	}
 	rcu_read_unlock();
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+}
+
+static int __init start_dirtytime_writeback(void)
+{
+	schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ);
+	return 0;
+}
+__initcall(start_dirtytime_writeback);
+
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret == 0 && write)
+		mod_delayed_work(system_wq, &dirtytime_work, 0);
+	return ret;
 }
 
 static noinline void block_dump___mark_inode_dirty(struct inode *inode)
@@ -1118,16 +1268,20 @@ static noinline void block_dump___mark_inode_dirty(struct inode *inode)
  * page->mapping->host, so the page-dirtying time is recorded in the internal
  * blockdev inode.
  */
+#define I_DIRTY_INODE (I_DIRTY_SYNC | I_DIRTY_DATASYNC)
 void __mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct super_block *sb = inode->i_sb;
 	struct backing_dev_info *bdi = NULL;
+	int dirtytime;
+
+	trace_writeback_mark_inode_dirty(inode, flags);
 
 	/*
 	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
 	 * dirty the inode itself
 	 */
-	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
+	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_TIME)) {
 		trace_writeback_dirty_inode_start(inode, flags);
 
 		if (sb->s_op->dirty_inode)
@@ -1135,6 +1289,9 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 
 		trace_writeback_dirty_inode(inode, flags);
 	}
+	if (flags & I_DIRTY_INODE)
+		flags &= ~I_DIRTY_TIME;
+	dirtytime = flags & I_DIRTY_TIME;
 
 	/*
 	 * make sure that changes are seen by all cpus before we test i_state
@@ -1143,16 +1300,21 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 	smp_mb();
 
 	/* avoid the locking if we can */
-	if ((inode->i_state & flags) == flags)
+	if (((inode->i_state & flags) == flags) ||
+	    (dirtytime && (inode->i_state & I_DIRTY_INODE)))
 		return;
 
 	if (unlikely(block_dump))
 		block_dump___mark_inode_dirty(inode);
 
 	spin_lock(&inode->i_lock);
+	if (dirtytime && (inode->i_state & I_DIRTY_INODE))
+		goto out_unlock_inode;
 	if ((inode->i_state & flags) != flags) {
 		const int was_dirty = inode->i_state & I_DIRTY;
 
+		if (flags & I_DIRTY_INODE)
+			inode->i_state &= ~I_DIRTY_TIME;
 		inode->i_state |= flags;
 
 		/*
@@ -1199,8 +1361,15 @@ void __mark_inode_dirty(struct inode *inode, int flags)
 			}
 
 			inode->dirtied_when = jiffies;
-			list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			if (dirtytime)
+				inode->dirtied_time_when = jiffies;
+			if (inode->i_state & (I_DIRTY_INODE | I_DIRTY_PAGES))
+				list_move(&inode->i_wb_list, &bdi->wb.b_dirty);
+			else
+				list_move(&inode->i_wb_list,
+					  &bdi->wb.b_dirty_time);
 			spin_unlock(&bdi->wb.list_lock);
+			trace_writeback_dirty_inode_enqueue(inode);
 
 			if (wakeup_bdi)
 				bdi_wakeup_thread_delayed(bdi);
@@ -1213,7 +1382,7 @@ out_unlock_inode:
 }
 EXPORT_SYMBOL(__mark_inode_dirty);
 
-static void wait_sb_inodes(struct super_block *sb)
+static void wait_sb_inodes(struct super_block *sb, struct user_beancounter *ub)
 {
 	struct inode *inode, *old_inode = NULL;
 
@@ -1241,6 +1410,12 @@ static void wait_sb_inodes(struct super_block *sb)
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
+		if (ub && (mapping->dirtied_ub != ub) &&
+		    ((inode->i_state & I_DIRTY) == I_DIRTY_PAGES)) {
+			spin_unlock(&inode->i_lock);
+			continue;
+		}
+
 		__iget(inode);
 		spin_unlock(&inode->i_lock);
 		spin_unlock(&inode_sb_list_lock);
@@ -1281,7 +1456,8 @@ static void wait_sb_inodes(struct super_block *sb)
  * on how many (if any) will be written, and this function does not wait
  * for IO completion of submitted IO.
  */
-void writeback_inodes_sb_nr(struct super_block *sb,
+static void writeback_inodes_sb_ub_nr(struct super_block *sb,
+			    struct user_beancounter *ub,
 			    unsigned long nr,
 			    enum wb_reason reason)
 {
@@ -1293,6 +1469,7 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 		.done			= &done,
 		.nr_pages		= nr,
 		.reason			= reason,
+		.ub			= ub,
 	};
 
 	if (sb->s_bdi == &noop_backing_dev_info)
@@ -1301,8 +1478,22 @@ void writeback_inodes_sb_nr(struct super_block *sb,
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 }
+
+void writeback_inodes_sb_nr(struct super_block *sb,
+			    unsigned long nr,
+			    enum wb_reason reason)
+{
+	writeback_inodes_sb_ub_nr(sb, NULL, nr, reason);
+}
 EXPORT_SYMBOL(writeback_inodes_sb_nr);
 
+void writeback_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub,
+			enum wb_reason reason)
+{
+	writeback_inodes_sb_ub_nr(sb, ub, get_nr_dirty_pages(), reason);
+}
+
 /**
  * writeback_inodes_sb	-	writeback dirty inodes from given super_block
  * @sb: the superblock
@@ -1364,7 +1555,7 @@ EXPORT_SYMBOL(try_to_writeback_inodes_sb);
  * This function writes and waits on any dirty inode belonging to this
  * super_block.
  */
-void sync_inodes_sb(struct super_block *sb)
+void sync_inodes_sb_ub(struct super_block *sb, struct user_beancounter *ub)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	struct wb_writeback_work work = {
@@ -1375,6 +1566,7 @@ void sync_inodes_sb(struct super_block *sb)
 		.done		= &done,
 		.reason		= WB_REASON_SYNC,
 		.for_sync	= 1,
+		.ub		= ub,
 	};
 
 	/* Nothing to do? */
@@ -1385,7 +1577,12 @@ void sync_inodes_sb(struct super_block *sb)
 	bdi_queue_work(sb->s_bdi, &work);
 	wait_for_completion(&done);
 
-	wait_sb_inodes(sb);
+	wait_sb_inodes(sb, ub);
+}
+
+void sync_inodes_sb(struct super_block *sb)
+{
+	sync_inodes_sb_ub(sb, NULL);
 }
 EXPORT_SYMBOL(sync_inodes_sb);
 
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -10,6 +10,7 @@
 
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/seq_file.h>
 
 #define FUSE_CTL_SUPER_MAGIC 0x65735543
 
@@ -196,6 +197,243 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = {
 	.llseek = no_llseek,
 };
 
+struct fuse_conn_priv {
+	struct fuse_conn *conn;
+	struct list_head *req_list;
+};
+
+enum {
+	FUSE_PENDING_REQ = 1,
+	FUSE_PROCESSING_REQ,
+	FUSE_IO_REQ,
+};
+
+static void *fuse_req_start(struct seq_file *m, loff_t *p)
+{
+	struct fuse_conn_priv *fcp = m->private;
+
+	spin_lock(&fcp->conn->lock);
+	return seq_list_start(fcp->req_list, *p);
+}
+
+static void *fuse_req_next(struct seq_file *m, void *v, loff_t *p)
+{
+	struct fuse_conn_priv *fcp = m->private;
+	return seq_list_next(v, fcp->req_list, p);
+}
+
+static void fuse_req_stop(struct seq_file *m, void *v)
+{
+	struct fuse_conn_priv *fcp = m->private;
+	spin_unlock(&fcp->conn->lock);
+}
+
+static int fuse_req_show(struct seq_file *f, void *v)
+{
+	struct fuse_req *req;
+
+	req = list_entry((struct list_head *)v, struct fuse_req, list);
+	seq_printf(f, "state: %-2d flags: %c%c%c%c%c%c%c "
+			"in: op %-4d uniq 0x%016Lx node 0x%016Lx "
+			"out: err %-6d uniq 0x%016Lx\n",
+			req->state,
+			req->isreply ? 'r' : '-',
+			req->force ? 'f' : '-',
+			req->aborted ? 'a' : '-',
+			req->background ? 'b' : '-',
+			req->interrupted ? 'i' : '-',
+			req->locked ? 'l' : '-',
+			req->waiting ? 'w': '-',
+			req->in.h.opcode,
+			req->in.h.unique,
+			req->in.h.nodeid,
+			req->out.h.error,
+			req->out.h.unique);
+
+	return 0;
+}
+
+static const struct seq_operations fuse_conn_req_ops = {
+	.start = fuse_req_start,
+	.next = fuse_req_next,
+	.stop = fuse_req_stop,
+	.show = fuse_req_show,
+};
+
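+/*
+ * Bind a seq_file iterator to one of the connection's request lists;
+ * fc->lock is held across each iteration (see fuse_req_start/stop).
+ */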
+static int fuse_conn_seq_open(struct file *filp, int list_id)
+{
+	struct fuse_conn *conn;
+	struct fuse_conn_priv *fcp;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	fcp = __seq_open_private(filp, &fuse_conn_req_ops,
+			sizeof(struct fuse_conn_priv));
+	if (fcp == NULL) {
+		fuse_conn_put(conn);
+		return -ENOMEM;
+	}
+
+	fcp->conn = conn;
+	switch (list_id) {
+	case FUSE_PROCESSING_REQ:
+		fcp->req_list = &conn->processing;
+		break;
+	case FUSE_PENDING_REQ:
+		fcp->req_list = &conn->pending;
+		break;
+	case FUSE_IO_REQ:
+		fcp->req_list = &conn->io;
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+}
+
+static int fuse_conn_release(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn_priv *fcp = ((struct seq_file *)filp->private_data)->private;
+
+	if (fcp)
+		fuse_conn_put(fcp->conn);
+
+	return seq_release_private(inode, filp);
+}
+
+static int fuse_conn_pending_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_PENDING_REQ);
+}
+
+static const struct file_operations fuse_conn_pending_req = {
+	.open = fuse_conn_pending_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_processing_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_PROCESSING_REQ);
+}
+
+static const struct file_operations fuse_conn_processing_req = {
+	.open = fuse_conn_processing_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_io_open(struct inode *inode, struct file *filp)
+{
+	return fuse_conn_seq_open(filp, FUSE_IO_REQ);
+}
+
+static const struct file_operations fuse_conn_io_req = {
+	.open = fuse_conn_io_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_files_show(struct seq_file *f, void *v)
+{
+	struct fuse_file *ff;
+
+	ff = list_entry(v, struct fuse_file, fl);
+	seq_printf(f, "kh 0x%016Lx fh 0x%016Lx node 0x%016Lx flags 0x%08x name ",
+			ff->kh, ff->fh, ff->nodeid, ff->open_flags);
+	if (ff->ff_dentry)
+		seq_dentry(f, ff->ff_dentry, "");
+	else
+		seq_putc(f, '-');
+	seq_putc(f, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations fuse_conn_files_seq_ops = {
+	.start = fuse_req_start,
+	.next = fuse_req_next,
+	.stop = fuse_req_stop,
+	.show = fuse_files_show,
+};
+
+static int fuse_conn_files_open(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn *conn;
+	struct fuse_conn_priv *fcp;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	fcp = __seq_open_private(filp, &fuse_conn_files_seq_ops,
+			sizeof(struct fuse_conn_priv));
+	if (fcp == NULL) {
+		fuse_conn_put(conn);
+		return -ENOMEM;
+	}
+
+	fcp->conn = conn;
+	fcp->req_list = &conn->conn_files;
+	return 0;
+}
+
+static const struct file_operations fuse_conn_files_ops = {
+	.open = fuse_conn_files_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_release,
+};
+
+static int fuse_conn_show(struct seq_file *sf, void *v)
+{
+	struct fuse_conn *fc = sf->private;
+	seq_printf(sf, "Connected: %d\n", fc->connected);
+	seq_printf(sf, "Initialized: %d\n", fc->initialized);
+	seq_printf(sf, "Blocked: %d\n", fc->blocked);
+	seq_printf(sf, "WQ active: %d\n", waitqueue_active(&fc->waitq));
+	seq_printf(sf, "Blocked_wq active: %d\n", waitqueue_active(&fc->blocked_waitq));
+	seq_printf(sf, "num_background: %d\n", fc->num_background);
+	seq_printf(sf, "num_waiting: %d\n", atomic_read(&fc->num_waiting));
+	return 0;
+}
+
+static int fuse_conn_info_open(struct inode *inode, struct file *filp)
+{
+	int ret;
+	struct fuse_conn *conn;
+
+	conn = fuse_ctl_file_conn_get(filp);
+	if (!conn)
+		return -ESTALE;
+
+	ret = single_open(filp, fuse_conn_show, conn);
+	if (ret)
+		fuse_conn_put(conn);
+
+	return ret;
+}
+
+static int fuse_conn_info_release(struct inode *inode, struct file *filp)
+{
+	struct fuse_conn *conn = ((struct seq_file *)filp->private_data)->private;
+	fuse_conn_put(conn);
+	return single_release(inode, filp);
+}
+
+static const struct file_operations fuse_conn_info_ops = {
+	.open = fuse_conn_info_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = fuse_conn_info_release,
+};
+
 static struct dentry *fuse_ctl_add_dentry(struct dentry *parent,
 					  struct fuse_conn *fc,
 					  const char *name,
@@ -260,7 +498,23 @@ int fuse_ctl_add_conn(struct fuse_conn *fc)
 				 1, NULL, &fuse_conn_max_background_ops) ||
 	    !fuse_ctl_add_dentry(parent, fc, "congestion_threshold",
 				 S_IFREG | 0600, 1, NULL,
-				 &fuse_conn_congestion_threshold_ops))
+				 &fuse_conn_congestion_threshold_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "pending_req",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_pending_req) ||
+	    !fuse_ctl_add_dentry(parent, fc, "processing_req",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_processing_req) ||
+	    !fuse_ctl_add_dentry(parent, fc, "io_req",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_io_req) ||
+	    !fuse_ctl_add_dentry(parent, fc, "open_files",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_files_ops) ||
+	    !fuse_ctl_add_dentry(parent, fc, "conn_info",
+				 S_IFREG | 0600, 1, NULL,
+				 &fuse_conn_info_ops))
 		goto err;
 
 	return 0;
--- a/fs/fuse/cuse.c
+++ b/fs/fuse/cuse.c
@@ -95,7 +95,7 @@ static ssize_t cuse_read(struct file *file, char __user *buf, size_t count,
 	struct iovec iov = { .iov_base = buf, .iov_len = count };
 	struct fuse_io_priv io = { .async = 0, .file = file };
 
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 0);
+	return fuse_direct_io(&io, &iov, 1, count, &pos, FUSE_DIO_CUSE);
 }
 
 static ssize_t cuse_write(struct file *file, const char __user *buf,
@@ -109,7 +109,8 @@ static ssize_t cuse_write(struct file *file, const char __user *buf,
 	 * No locking or generic_write_checks(), the server is
 	 * responsible for locking and sanity checks.
 	 */
-	return fuse_direct_io(&io, &iov, 1, count, &pos, 1);
+	return fuse_direct_io(&io, &iov, 1, count, &pos,
+			      FUSE_DIO_WRITE | FUSE_DIO_CUSE);
 }
 
 static int cuse_open(struct inode *inode, struct file *file)
--- a/fs/fuse/dev.c
+++ b/fs/fuse/dev.c
@@ -25,6 +25,7 @@ MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 MODULE_ALIAS("devname:fuse");
 
 static struct kmem_cache *fuse_req_cachep;
+extern struct workqueue_struct *fuse_fput_wq;
 
 static struct fuse_conn *fuse_get_conn(struct file *file)
 {
@@ -99,19 +100,6 @@ void fuse_request_free(struct fuse_req *req)
 	kmem_cache_free(fuse_req_cachep, req);
 }
 
-static void block_sigs(sigset_t *oldset)
-{
-	sigset_t mask;
-
-	siginitsetinv(&mask, sigmask(SIGKILL));
-	sigprocmask(SIG_BLOCK, &mask, oldset);
-}
-
-static void restore_sigs(sigset_t *oldset)
-{
-	sigprocmask(SIG_SETMASK, oldset, NULL);
-}
-
 void __fuse_get_request(struct fuse_req *req)
 {
 	atomic_inc(&req->count);
@@ -128,7 +116,7 @@ static void fuse_req_init_context(struct fuse_req *req)
 {
 	req->in.h.uid = from_kuid_munged(&init_user_ns, current_fsuid());
 	req->in.h.gid = from_kgid_munged(&init_user_ns, current_fsgid());
-	req->in.h.pid = current->pid;
+	req->in.h.pid = task_pid_vnr(current);
 }
 
 static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background)
@@ -144,15 +132,9 @@ static struct fuse_req *__fuse_get_req(struct fuse_conn *fc, unsigned npages,
 	atomic_inc(&fc->num_waiting);
 
 	if (fuse_block_alloc(fc, for_background)) {
-		sigset_t oldset;
-		int intr;
-
-		block_sigs(&oldset);
-		intr = wait_event_interruptible_exclusive(fc->blocked_waitq,
-				!fuse_block_alloc(fc, for_background));
-		restore_sigs(&oldset);
 		err = -EINTR;
-		if (intr)
+		if (wait_event_killable_exclusive(fc->blocked_waitq,
+				!fuse_block_alloc(fc, for_background)))
 			goto out;
 	}
 
@@ -412,6 +394,19 @@ __acquires(fc->lock)
 	spin_lock(&fc->lock);
 }
 
+static void wait_answer_killable(struct fuse_conn *fc,
+				 struct fuse_req *req)
+__releases(fc->lock)
+__acquires(fc->lock)
+{
+	if (fatal_signal_pending(current))
+		return;
+
+	spin_unlock(&fc->lock);
+	wait_event_killable(req->waitq, req->state == FUSE_REQ_FINISHED);
+	spin_lock(&fc->lock);
+}
+
 static void queue_interrupt(struct fuse_conn *fc, struct fuse_req *req)
 {
 	list_add_tail(&req->intr_entry, &fc->interrupts);
@@ -438,12 +433,8 @@ __acquires(fc->lock)
 	}
 
 	if (!req->force) {
-		sigset_t oldset;
-
 		/* Only fatal signals may interrupt this */
-		block_sigs(&oldset);
-		wait_answer_interruptible(fc, req);
-		restore_sigs(&oldset);
+		wait_answer_killable(fc, req);
 
 		if (req->aborted)
 			goto aborted;
@@ -484,7 +475,8 @@ __acquires(fc->lock)
 	}
 }
 
-static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req,
+				struct fuse_file *ff)
 {
 	BUG_ON(req->background);
 	spin_lock(&fc->lock);
@@ -492,6 +484,8 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 		req->out.h.error = -ENOTCONN;
 	else if (fc->conn_error)
 		req->out.h.error = -ECONNREFUSED;
+	else if (ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
+		req->out.h.error = -EIO;
 	else {
 		req->in.h.unique = fuse_get_unique(fc);
 		queue_request(fc, req);
@@ -504,10 +498,16 @@ static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
 	spin_unlock(&fc->lock);
 }
 
-void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+void fuse_request_check_and_send(struct fuse_conn *fc, struct fuse_req *req,
+				 struct fuse_file *ff)
 {
 	req->isreply = 1;
-	__fuse_request_send(fc, req);
+	__fuse_request_send(fc, req, ff);
+}
+
+void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
+{
+	fuse_request_check_and_send(fc, req, NULL);
 }
 EXPORT_SYMBOL_GPL(fuse_request_send);
 
@@ -530,7 +530,13 @@ static void fuse_request_send_nowait_locked(struct fuse_conn *fc,
 static void fuse_request_send_nowait(struct fuse_conn *fc, struct fuse_req *req)
 {
 	spin_lock(&fc->lock);
-	if (fc->connected) {
+	if (req->page_cache && req->ff &&
+	    test_bit(FUSE_S_FAIL_IMMEDIATELY, &req->ff->ff_state)) {
+		BUG_ON(req->in.h.opcode != FUSE_READ);
+		req->out.h.error = -EIO;
+		req->background = 0;
+		request_end(fc, req);
+	} else if (fc->connected) {
 		fuse_request_send_nowait_locked(fc, req);
 		spin_unlock(&fc->lock);
 	} else {
@@ -591,7 +597,7 @@ void fuse_force_forget(struct file *file, u64 nodeid)
 	req->in.args[0].size = sizeof(inarg);
 	req->in.args[0].value = &inarg;
 	req->isreply = 0;
-	__fuse_request_send(fc, req);
+	__fuse_request_send(fc, req, NULL);
 	/* ignore errors */
 	fuse_put_request(fc, req);
 }
@@ -913,7 +919,7 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page,
  * done atomically
  */
 static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
-			  unsigned offset, unsigned count, int zeroing)
+			  unsigned offset, unsigned count, int zeroing, int moving)
 {
 	int err;
 	struct page *page = *pagep;
@@ -925,7 +931,7 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep,
 		if (cs->write && cs->pipebufs && page) {
 			return fuse_ref_page(cs, page, offset, count);
 		} else if (!cs->len) {
-			if (cs->move_pages && page &&
+			if (cs->move_pages && page && moving &&
 			    offset == 0 && count == PAGE_SIZE) {
 				err = fuse_try_move_page(cs, pagep);
 				if (err <= 0)
@@ -962,7 +968,7 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 		unsigned count = min(nbytes, req->page_descs[i].length);
 
 		err = fuse_copy_page(cs, &req->pages[i], offset, count,
-				     zeroing);
+				     zeroing, 1);
 		if (err)
 			return err;
 
@@ -971,6 +977,24 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
 	return 0;
 }
 
+static int fuse_copy_bvec(struct fuse_copy_state *cs, unsigned nbytes,
+			   int zeroing)
+{
+	unsigned i;
+	struct fuse_req *req = cs->req;
+
+	for (i = 0; i < req->num_bvecs && (nbytes || zeroing); i++) {
+		struct bio_vec *bvec = &req->bvec[i];
+		int err;
+
+		err = fuse_copy_page(cs, &bvec->bv_page, bvec->bv_offset,
+				     bvec->bv_len, zeroing, 0);
+		if (err)
+			return err;
+
+		nbytes -= min(nbytes, bvec->bv_len);
+	}
+
+	return 0;
+}
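+
+/*
+ * fuse_copy_bvec() mirrors fuse_copy_pages() above but passes moving=0:
+ * bvec pages are owned by whoever built the request (the assumption is
+ * a block-layer user), so they may be copied but never stolen via
+ * fuse_try_move_page(), even if the reader enabled cs->move_pages.
+ */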
+
 /* Copy a single argument in the request to/from userspace buffer */
 static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 {
@@ -987,7 +1011,7 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
 
 /* Copy request arguments to/from userspace buffer */
 static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
-			  unsigned argpages, struct fuse_arg *args,
+			  unsigned argpages, unsigned argbvec, struct fuse_arg *args,
 			  int zeroing)
 {
 	int err = 0;
@@ -997,6 +1021,8 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
 		struct fuse_arg *arg = &args[i];
 		if (i == numargs - 1 && argpages)
 			err = fuse_copy_pages(cs, arg->size, zeroing);
+		else if (i == numargs - 1 && argbvec)
+			err = fuse_copy_bvec(cs, arg->size, zeroing);
 		else
 			err = fuse_copy_one(cs, arg->value, arg->size);
 	}
@@ -1252,7 +1278,7 @@ static ssize_t fuse_dev_do_read(struct fuse_conn *fc, struct file *file,
 	cs->req = req;
 	err = fuse_copy_one(cs, &in->h, sizeof(in->h));
 	if (!err)
-		err = fuse_copy_args(cs, in->numargs, in->argpages,
+		err = fuse_copy_args(cs, in->numargs, in->argpages, in->argbvec,
 				     (struct fuse_arg *) in->args, 0);
 	fuse_copy_finish(cs);
 	spin_lock(&fc->lock);
@@ -1598,7 +1624,7 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size,
 			goto out_iput;
 
 		this_num = min_t(unsigned, num, PAGE_CACHE_SIZE - offset);
-		err = fuse_copy_page(cs, &page, offset, this_num, 0);
+		err = fuse_copy_page(cs, &page, offset, this_num, 0, 1);
 		if (!err && offset == 0 && (num != 0 || file_size == end))
 			SetPageUptodate(page);
 		unlock_page(page);
@@ -1734,9 +1760,43 @@ copy_finish:
 	return err;
 }
 
+static int fuse_notify_inval_files(struct fuse_conn *fc, unsigned int size,
+				   struct fuse_copy_state *cs)
+{
+	struct fuse_notify_inval_files_out outarg;
+	int err = -EINVAL;
+
+	if (size != sizeof(outarg))
+		goto err;
+
+	err = fuse_copy_one(cs, &outarg, sizeof(outarg));
+	if (err)
+		goto err;
+	fuse_copy_finish(cs);
+
+	down_read(&fc->killsb);
+	err = -ENOENT;
+	if (!fc->sb)
+		goto err_unlock;
+
+	err = fuse_invalidate_files(fc, outarg.ino);
+
+err_unlock:
+	up_read(&fc->killsb);
+	return err;
+
+err:
+	fuse_copy_finish(cs);
+	return err;
+}
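+
+/*
+ * Sketch of the expected flow (assuming a pstorage-style daemon): on
+ * losing a lease the daemon sends a FUSE_NOTIFY_INVAL_FILES message
+ * whose payload, struct fuse_notify_inval_files_out, carries the inode
+ * number.  fuse_invalidate_files() then marks the inode's open files
+ * FUSE_S_FAIL_IMMEDIATELY, so queued and future requests fail with
+ * -EIO instead of blocking on a connection that can no longer make
+ * progress.
+ */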
+
 static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 		       unsigned int size, struct fuse_copy_state *cs)
 {
+	/*
+	 * ASSUMPTION: pstorage fused doesn't use FUSE_NOTIFY_STORE, so in
+	 * compat mode its code (4) can be reinterpreted as the old
+	 * FUSE_NOTIFY_INVAL_FILES.
+	 */
+	if (fc->compat_inval_files && code == FUSE_NOTIFY_STORE)
+		code = FUSE_NOTIFY_INVAL_FILES;
+
 	switch (code) {
 	case FUSE_NOTIFY_POLL:
 		return fuse_notify_poll(fc, size, cs);
@@ -1756,6 +1816,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code,
 	case FUSE_NOTIFY_DELETE:
 		return fuse_notify_delete(fc, size, cs);
 
+	case FUSE_NOTIFY_INVAL_FILES:
+		return fuse_notify_inval_files(fc, size, cs);
+
 	default:
 		fuse_copy_finish(cs);
 		return -EINVAL;
@@ -1795,8 +1858,8 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
 			return -EINVAL;
 		lastarg->size -= diffsize;
 	}
-	return fuse_copy_args(cs, out->numargs, out->argpages, out->args,
-			      out->page_zeroing);
+	return fuse_copy_args(cs, out->numargs, out->argpages, out->argbvec,
+			      out->args, out->page_zeroing);
 }
 
 /*
@@ -2181,11 +2244,16 @@ static struct miscdevice fuse_miscdevice = {
 int __init fuse_dev_init(void)
 {
 	int err = -ENOMEM;
+
+	fuse_fput_wq = create_workqueue("fuse_fput");
+	if (!fuse_fput_wq)
+		goto out;
+
 	fuse_req_cachep = kmem_cache_create("fuse_request",
 					    sizeof(struct fuse_req),
 					    0, 0, NULL);
 	if (!fuse_req_cachep)
-		goto out;
+		goto out_destroy_wq;
 
 	err = misc_register(&fuse_miscdevice);
 	if (err)
@@ -2195,6 +2263,8 @@ int __init fuse_dev_init(void)
 
  out_cache_clean:
 	kmem_cache_destroy(fuse_req_cachep);
+ out_destroy_wq:
+	destroy_workqueue(fuse_fput_wq);
  out:
 	return err;
 }
@@ -2203,4 +2273,5 @@ void fuse_dev_cleanup(void)
 {
 	misc_deregister(&fuse_miscdevice);
 	kmem_cache_destroy(fuse_req_cachep);
+	destroy_workqueue(fuse_fput_wq);
 }
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -187,8 +187,7 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 	inode = ACCESS_ONCE(entry->d_inode);
 	if (inode && is_bad_inode(inode))
 		goto invalid;
-	else if (time_before64(fuse_dentry_time(entry), get_jiffies_64()) ||
-		 (flags & LOOKUP_REVAL)) {
+	else if (1) { /* always revalidate */
 		int err;
 		struct fuse_entry_out outarg;
 		struct fuse_req *req;
@@ -246,13 +245,6 @@ static int fuse_dentry_revalidate(struct dentry *entry, unsigned int flags)
 				       entry_attr_timeout(&outarg),
 				       attr_version);
 		fuse_change_entry_timeout(entry, &outarg);
-	} else if (inode) {
-		fc = get_fuse_conn(inode);
-		if (fc->readdirplus_auto) {
-			parent = dget_parent(entry);
-			fuse_advise_use_readdirplus(parent->d_inode);
-			dput(parent);
-		}
 	}
 	ret = 1;
 out:
@@ -324,7 +316,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 
 	*inode = fuse_iget(sb, outarg->nodeid, outarg->generation,
 			   &outarg->attr, entry_attr_timeout(outarg),
-			   attr_version);
+			   attr_version, 0);
 	err = -ENOMEM;
 	if (!*inode) {
 		fuse_queue_forget(fc, forget, outarg->nodeid, 1);
@@ -421,6 +413,9 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	/* Userspace expects S_IFREG in create mode */
 	BUG_ON((mode & S_IFMT) != S_IFREG);
 
+	if ((flags & O_DIRECT) && !(fc->flags & FUSE_ODIRECT))
+		return -EINVAL;
+
 	forget = fuse_alloc_forget();
 	err = -ENOMEM;
 	if (!forget)
@@ -475,7 +470,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	ff->nodeid = outentry.nodeid;
 	ff->open_flags = outopen.open_flags;
 	inode = fuse_iget(dir->i_sb, outentry.nodeid, outentry.generation,
-			  &outentry.attr, entry_attr_timeout(&outentry), 0);
+			  &outentry.attr, entry_attr_timeout(&outentry), 0, 1);
 	if (!inode) {
 		flags &= ~(O_CREAT | O_EXCL | O_TRUNC);
 		fuse_sync_release(ff, flags);
@@ -489,6 +484,10 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
 	fuse_invalidate_attr(dir);
 	err = finish_open(file, entry, generic_file_open, opened);
 	if (err) {
+		if (fc->writeback_cache) {
+			struct fuse_inode *fi = get_fuse_inode(inode);
+			atomic_dec(&fi->num_openers);
+		}
 		fuse_sync_release(ff, flags);
 	} else {
 		file->private_data = fuse_file_get(ff);
@@ -590,7 +589,7 @@ static int create_new_entry(struct fuse_conn *fc, struct fuse_req *req,
 		goto out_put_forget_req;
 
 	inode = fuse_iget(dir->i_sb, outarg.nodeid, outarg.generation,
-			  &outarg.attr, entry_attr_timeout(&outarg), 0);
+			  &outarg.attr, entry_attr_timeout(&outarg), 0, 0);
 	if (!inode) {
 		fuse_queue_forget(fc, forget, outarg.nodeid, 1);
 		return -ENOMEM;
@@ -900,6 +899,14 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 			  struct kstat *stat)
 {
 	unsigned int blkbits;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	/* see the comment in fuse_change_attributes() */
+	if (fc->writeback_cache && S_ISREG(inode->i_mode)) {
+		attr->size = i_size_read(inode);
+		attr->mtime = inode->i_mtime.tv_sec;
+		attr->mtimensec = inode->i_mtime.tv_nsec;
+	}
 
 	stat->dev = inode->i_sb->s_dev;
 	stat->ino = attr->ino;
@@ -926,7 +933,7 @@ static void fuse_fillattr(struct inode *inode, struct fuse_attr *attr,
 }
 
 static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
-			   struct file *file)
+			   struct file *file, int get_size_from_attr)
 {
 	int err;
 	struct fuse_getattr_in inarg;
@@ -972,13 +979,32 @@ static int fuse_do_getattr(struct inode *inode, struct kstat *stat,
 			fuse_change_attributes(inode, &outarg.attr,
 					       attr_timeout(&outarg),
 					       attr_version);
-			if (stat)
+			if (get_size_from_attr)
+				stat->size = outarg.attr.size;
+			else if (stat) {
+				struct fuse_inode *fi = get_fuse_inode(inode);
 				fuse_fillattr(inode, &outarg.attr, stat);
+				if (!atomic_read(&fi->num_openers))
+					stat->size = outarg.attr.size;
+			}
 		}
 	}
 	return err;
 }
 
+int fuse_getattr_size(struct inode *inode, struct file *file, u64 *size)
+{
+	struct kstat stat;
+	int err;
+
+	err = fuse_do_getattr(inode, &stat, file, 1);
+	if (err)
+		return err;
+
+	*size = stat.size;
+	return 0;
+}
+
 int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 			   struct file *file, bool *refreshed)
 {
@@ -988,7 +1014,7 @@ int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 
 	if (time_before64(fi->i_time, get_jiffies_64())) {
 		r = true;
-		err = fuse_do_getattr(inode, stat, file);
+		err = fuse_do_getattr(inode, stat, file, 0);
 	} else {
 		r = false;
 		err = 0;
@@ -1138,7 +1164,7 @@ static int fuse_perm_getattr(struct inode *inode, int mask)
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
 
-	return fuse_do_getattr(inode, NULL, NULL);
+	return fuse_do_getattr(inode, NULL, NULL, 0);
 }
 
 /*
@@ -1324,7 +1350,7 @@ static int fuse_direntplus_link(struct file *file,
 		goto out;
 
 	inode = fuse_iget(dir->i_sb, o->nodeid, o->generation,
-			  &o->attr, entry_attr_timeout(o), attr_version);
+			  &o->attr, entry_attr_timeout(o), attr_version, 0);
 	if (!inode)
 		goto out;
 
@@ -1626,6 +1652,89 @@ void fuse_release_nowrite(struct inode *inode)
 	spin_unlock(&fc->lock);
 }
 
+static void fuse_setattr_fill(struct fuse_conn *fc, struct fuse_req *req,
+			      struct inode *inode,
+			      struct fuse_setattr_in *inarg_p,
+			      struct fuse_attr_out *outarg_p)
+{
+	req->in.h.opcode = FUSE_SETATTR;
+	req->in.h.nodeid = get_node_id(inode);
+	req->in.numargs = 1;
+	req->in.args[0].size = sizeof(*inarg_p);
+	req->in.args[0].value = inarg_p;
+	req->out.numargs = 1;
+	if (fc->minor < 9)
+		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
+	else
+		req->out.args[0].size = sizeof(*outarg_p);
+	req->out.args[0].value = outarg_p;
+}
+
+/*
+ * Flush inode->i_mtime to the server
+ */
+int fuse_flush_mtime(struct file *file, bool nofail)
+{
+	struct inode *inode = file->f_mapping->host;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = NULL;
+	struct fuse_setattr_in inarg;
+	struct fuse_attr_out outarg;
+	int err;
+
+	if (nofail) {
+		req = fuse_get_req_nofail_nopages(fc, file);
+	} else {
+		req = fuse_get_req_nopages(fc);
+		if (IS_ERR(req))
+			return PTR_ERR(req);
+	}
+
+	memset(&inarg, 0, sizeof(inarg));
+	memset(&outarg, 0, sizeof(outarg));
+
+	inarg.valid |= FATTR_MTIME;
+	inarg.mtime = inode->i_mtime.tv_sec;
+	inarg.mtimensec = inode->i_mtime.tv_nsec;
+
+	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
+	fuse_request_send(fc, req);
+	err = req->out.h.error;
+	fuse_put_request(fc, req);
+
+	if (!err)
+		clear_bit(FUSE_I_MTIME_UPDATED, &fi->state);
+
+	return err;
+}
+
+static inline void set_mtime_helper(struct inode *inode, struct timespec mtime)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	inode->i_mtime = mtime;
+	clear_bit(FUSE_I_MTIME_UPDATED, &fi->state);
+}
+
+/*
+ * S_NOCMTIME is clear, so we need to update inode->i_mtime manually. But
+ * we can also clear FUSE_I_MTIME_UPDATED if FUSE_SETATTR has just changed
+ * mtime on server.
+ */
+static void fuse_set_mtime_local(struct iattr *iattr, struct inode *inode)
+{
+	unsigned ivalid = iattr->ia_valid;
+
+	if ((ivalid & ATTR_MTIME) && update_mtime(ivalid)) {
+		if (ivalid & ATTR_MTIME_SET)
+			set_mtime_helper(inode, iattr->ia_mtime);
+		else
+			set_mtime_helper(inode, current_fs_time(inode->i_sb));
+	} else if (ivalid & ATTR_SIZE) {
+		set_mtime_helper(inode, current_fs_time(inode->i_sb));
+	}
+}
+
 /*
  * Set attributes, and at the same time refresh them.
  *
@@ -1643,6 +1752,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	struct fuse_setattr_in inarg;
 	struct fuse_attr_out outarg;
 	bool is_truncate = false;
+	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	int err;
 
@@ -1684,17 +1794,7 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		inarg.valid |= FATTR_LOCKOWNER;
 		inarg.lock_owner = fuse_lock_owner_id(fc, current->files);
 	}
-	req->in.h.opcode = FUSE_SETATTR;
-	req->in.h.nodeid = get_node_id(inode);
-	req->in.numargs = 1;
-	req->in.args[0].size = sizeof(inarg);
-	req->in.args[0].value = &inarg;
-	req->out.numargs = 1;
-	if (fc->minor < 9)
-		req->out.args[0].size = FUSE_COMPAT_ATTR_OUT_SIZE;
-	else
-		req->out.args[0].size = sizeof(outarg);
-	req->out.args[0].value = &outarg;
+	fuse_setattr_fill(fc, req, inode, &inarg, &outarg);
 	fuse_request_send(fc, req);
 	err = req->out.h.error;
 	fuse_put_request(fc, req);
@@ -1711,10 +1811,16 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	}
 
 	spin_lock(&fc->lock);
+	/* the kernel maintains i_mtime locally */
+	if (fc->writeback_cache && S_ISREG(inode->i_mode))
+		fuse_set_mtime_local(attr, inode);
+
 	fuse_change_attributes_common(inode, &outarg.attr,
 				      attr_timeout(&outarg));
 	oldsize = inode->i_size;
-	i_size_write(inode, outarg.attr.size);
+	/* see the comment in fuse_change_attributes() */
+	if (!is_wb || is_truncate || !S_ISREG(inode->i_mode))
+		i_size_write(inode, outarg.attr.size);
 
 	if (is_truncate) {
 		/* NOTE: this may release/reacquire fc->lock */
@@ -1726,7 +1832,8 @@ int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 	 * Only call invalidate_inode_pages2() after removing
 	 * FUSE_NOWRITE, otherwise fuse_launder_page() would deadlock.
 	 */
-	if (S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
+	if ((is_truncate || !is_wb) &&
+			S_ISREG(inode->i_mode) && oldsize != outarg.attr.size) {
 		truncate_pagecache(inode, outarg.attr.size);
 		invalidate_inode_pages2(inode->i_mapping);
 	}
@@ -1938,6 +2045,17 @@ static int fuse_removexattr(struct dentry *entry, const char *name)
 	return err;
 }
 
+static int fuse_update_time(struct inode *inode, struct timespec *now,
+			    int flags)
+{
+	if (flags & S_MTIME) {
+		inode->i_mtime = *now;
+		set_bit(FUSE_I_MTIME_UPDATED, &get_fuse_inode(inode)->state);
+		BUG_ON(!S_ISREG(inode->i_mode));
+	}
+	return 0;
+}
+
 static const struct inode_operations_wrapper fuse_dir_inode_operations = {
 	.ops = {
 	.lookup		= fuse_lookup,
@@ -1980,6 +2098,7 @@ static const struct inode_operations fuse_common_inode_operations = {
 	.getxattr	= fuse_getxattr,
 	.listxattr	= fuse_listxattr,
 	.removexattr	= fuse_removexattr,
+	.update_time	= fuse_update_time,
 };
 
 static const struct inode_operations fuse_symlink_inode_operations = {
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -17,8 +17,27 @@
 #include <linux/swap.h>
 #include <linux/aio.h>
 #include <linux/falloc.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/virtinfo.h>
+#include <linux/file.h>
+
+struct workqueue_struct *fuse_fput_wq;
+static DEFINE_SPINLOCK(fuse_fput_lock);
+static LIST_HEAD(fuse_fput_head);
+static void fuse_fput_routine(struct work_struct *);
+static DECLARE_WORK(fuse_fput_work, fuse_fput_routine);
 
 static const struct file_operations fuse_direct_io_file_operations;
+static void fuse_sync_writes(struct inode *inode);
+
+static void fuse_account_request(struct fuse_conn *fc, size_t count)
+{
+	struct user_beancounter *ub = get_exec_ub();
+
+	ub_percpu_inc(ub, fuse_requests);
+	ub_percpu_add(ub, fuse_bytes, count);
+	virtinfo_notifier_call_irq(VITYPE_IO, VIRTINFO_IO_FUSE_REQ, NULL);
+}
 
 static int fuse_send_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 			  int opcode, struct fuse_open_out *outargp)
@@ -58,6 +77,8 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 	if (unlikely(!ff))
 		return NULL;
 
+	ff->ff_state = 0;
+
 	ff->fc = fc;
 	ff->reserved_req = fuse_request_alloc(0);
 	if (unlikely(!ff->reserved_req)) {
@@ -66,19 +87,30 @@ struct fuse_file *fuse_file_alloc(struct fuse_conn *fc)
 	}
 
 	INIT_LIST_HEAD(&ff->write_entry);
+	INIT_LIST_HEAD(&ff->rw_entry);
 	atomic_set(&ff->count, 0);
 	RB_CLEAR_NODE(&ff->polled_node);
 	init_waitqueue_head(&ff->poll_wait);
 
 	spin_lock(&fc->lock);
 	ff->kh = ++fc->khctr;
+	ff->ff_dentry = NULL;
+	list_add_tail(&ff->fl, &fc->conn_files);
 	spin_unlock(&fc->lock);
 
 	return ff;
 }
 
+static void fuse_file_list_del(struct fuse_file *ff)
+{
+	spin_lock(&ff->fc->lock);
+	list_del_init(&ff->fl);
+	spin_unlock(&ff->fc->lock);
+}
+
 void fuse_file_free(struct fuse_file *ff)
 {
+	fuse_file_list_del(ff);
 	fuse_request_free(ff->reserved_req);
 	kfree(ff);
 }
@@ -130,9 +162,17 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 		if (sync) {
 			req->background = 0;
 			fuse_request_send(ff->fc, req);
+			if (req->out.h.error == -EINTR) {
+				req->state = FUSE_REQ_INIT;
+				req->out.h.error = 0;
+				goto async_fallback;
+			}
+			fuse_file_list_del(ff);
 			path_put(&req->misc.release.path);
 			fuse_put_request(ff->fc, req);
 		} else {
+async_fallback:
+			fuse_file_list_del(ff);
 			req->end = fuse_release_end;
 			req->background = 1;
 			fuse_request_send_background(ff->fc, req);
@@ -141,6 +181,12 @@ static void fuse_file_put(struct fuse_file *ff, bool sync)
 	}
 }
 
+/*
+ * Drop a reference that is known not to be the last one; the final
+ * reference must be put via fuse_file_put(), which sends RELEASE.
+ */
+static void __fuse_file_put(struct fuse_file *ff)
+{
+	if (atomic_dec_and_test(&ff->count))
+		BUG();
+}
+
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir)
 {
@@ -171,11 +217,43 @@ int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 }
 EXPORT_SYMBOL_GPL(fuse_do_open);
 
+static void fuse_link_file(struct file *file, bool write)
+{
+	struct inode *inode = file_inode(file);
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff = file->private_data;
+
+	struct list_head *entry = write ? &ff->write_entry : &ff->rw_entry;
+	struct list_head *list  = write ? &fi->write_files : &fi->rw_files;
+
+	/*
+	 * The file may be written through mmap, so chain it onto the
+	 * inode's write_files or rw_files list.
+	 */
+	spin_lock(&fc->lock);
+	if (list_empty(entry))
+		list_add(entry, list);
+	spin_unlock(&fc->lock);
+}
+
+static void fuse_link_write_file(struct file *file)
+{
+	fuse_link_file(file, true);
+}
+
+static void fuse_link_rw_file(struct file *file)
+{
+	fuse_link_file(file, false);
+}
+
 void fuse_finish_open(struct inode *inode, struct file *file)
 {
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
+	ff->ff_dentry = file->f_dentry;
+
 	if (ff->open_flags & FOPEN_DIRECT_IO)
 		file->f_op = &fuse_direct_io_file_operations;
 	if (!(ff->open_flags & FOPEN_KEEP_CACHE))
@@ -191,6 +269,10 @@ void fuse_finish_open(struct inode *inode, struct file *file)
 		spin_unlock(&fc->lock);
 		fuse_invalidate_attr(inode);
 	}
+	if ((file->f_mode & FMODE_WRITE) && fc->writeback_cache)
+		fuse_link_write_file(file);
+
+	fuse_link_rw_file(file);
 }
 
 int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
@@ -198,6 +280,9 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	int err;
 
+	if ((file->f_flags & O_DIRECT) && !(fc->flags & FUSE_ODIRECT))
+		return -EINVAL;
+
 	err = generic_file_open(inode, file);
 	if (err)
 		return err;
@@ -206,6 +291,40 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir)
 	if (err)
 		return err;
 
+	if (fc->writeback_cache && !isdir) {
+		struct fuse_inode *fi = get_fuse_inode(inode);
+		u64 size;
+
+		mutex_lock(&inode->i_mutex);
+
+		spin_lock(&fc->lock);
+		atomic_inc(&fi->num_openers);
+
+		if (atomic_read(&fi->num_openers) == 1) {
+			fi->i_size_unstable = 1;
+			spin_unlock(&fc->lock);
+			err = fuse_getattr_size(inode, file, &size);
+			if (err) {
+				spin_lock(&fc->lock);
+				atomic_dec(&fi->num_openers);
+				fi->i_size_unstable = 0;
+				spin_unlock(&fc->lock);
+
+				mutex_unlock(&inode->i_mutex);
+				fuse_release_common(file, FUSE_RELEASE);
+				return err;
+			}
+
+			spin_lock(&fc->lock);
+			i_size_write(inode, size);
+			fi->i_size_unstable = 0;
+			spin_unlock(&fc->lock);
+		} else
+			spin_unlock(&fc->lock);
+
+		mutex_unlock(&inode->i_mutex);
+	}
+
 	fuse_finish_open(inode, file);
 
 	return 0;
@@ -219,6 +338,7 @@ static void fuse_prepare_release(struct fuse_file *ff, int flags, int opcode)
 
 	spin_lock(&fc->lock);
 	list_del(&ff->write_entry);
+	list_del(&ff->rw_entry);
 	if (!RB_EMPTY_NODE(&ff->polled_node))
 		rb_erase(&ff->polled_node, &fc->polled_files);
 	spin_unlock(&fc->lock);
@@ -257,6 +377,14 @@ void fuse_release_common(struct file *file, int opcode)
 	req->misc.release.path = file->f_path;
 
 	/*
+	 * There must be no more in-flight asynchronous READ or WRITE
+	 * requests if the fuse file release is synchronous.
+	 */
+	if (ff->fc->close_wait)
+		BUG_ON(atomic_read(&ff->count) != 1);
+
+	/*
 	 * Normally this will send the RELEASE request, however if
 	 * some asynchronous READ or WRITE requests are outstanding,
 	 * the sending will be delayed.
@@ -265,7 +393,8 @@ void fuse_release_common(struct file *file, int opcode)
 	 * synchronous RELEASE is allowed (and desirable) in this case
 	 * because the server can be trusted not to screw up.
 	 */
-	fuse_file_put(ff, ff->fc->destroy_req != NULL);
+	fuse_file_put(ff, ff->fc->destroy_req != NULL ||
+			  ff->fc->close_wait);
 }
 
 static int fuse_open(struct inode *inode, struct file *file)
@@ -275,6 +404,57 @@ static int fuse_open(struct inode *inode, struct file *file)
 
 static int fuse_release(struct inode *inode, struct file *file)
 {
+	struct fuse_file *ff = file->private_data;
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	if (ff->fc->writeback_cache) {
+		if (file->f_mode & FMODE_WRITE) {
+			filemap_write_and_wait(file->f_mapping);
+
+			/*
+			 * Must remove the file from the write list first.
+			 * Otherwise it may receive further writeback from
+			 * other files re-routed via write_files.
+			 */
+			spin_lock(&ff->fc->lock);
+			list_del_init(&ff->write_entry);
+			spin_unlock(&ff->fc->lock);
+
+			/*
+			 * A writeback from another fuse file might have come
+			 * in after filemap_write_and_wait() above.
+			 */
+			if (!ff->fc->close_wait)
+				filemap_write_and_wait(file->f_mapping);
+		} else
+			BUG_ON(!list_empty(&ff->write_entry));
+
+		/*
+		 * This can livelock: the inode can be open via another file
+		 * which keeps generating continuous writeback. Taking
+		 * i_mutex around this would be one way out.
+		 *
+		 * For now we wait on ff->count instead. That is safe because
+		 * we then only wait for writeback (and readahead) already
+		 * enqueued on this file, and it will not get new requests:
+		 * it is closing.
+		 */
+		if (!ff->fc->close_wait)
+			wait_event(fi->page_waitq, list_empty_careful(&fi->writepages));
+		else
+			wait_event(fi->page_waitq, atomic_read(&ff->count) == 1);
+
+		/*
+		 * Wait for threads that have just released ff to leave
+		 * their critical sections. Strictly speaking this is
+		 * unnecessary, because taking the spinlock is the first
+		 * thing fuse_release_common() does, but it is worth
+		 * emphasizing here that we rely on it.
+		 */
+		spin_unlock_wait(&ff->fc->lock);
+
+		/* since now we can trust userspace attr.size */
+		atomic_dec(&fi->num_openers);
+	} else if (ff->fc->close_wait)
+		wait_event(fi->page_waitq, atomic_read(&ff->count) == 1);
+
+	if (test_bit(FUSE_I_MTIME_UPDATED,
+		     &get_fuse_inode(inode)->state))
+		fuse_flush_mtime(file, true);
+
 	fuse_release_common(file, FUSE_RELEASE);
 
 	/* return value is ignored by VFS */
@@ -284,6 +464,7 @@ static int fuse_release(struct inode *inode, struct file *file)
 void fuse_sync_release(struct fuse_file *ff, int flags)
 {
 	WARN_ON(atomic_read(&ff->count) > 1);
+	fuse_file_list_del(ff);
 	fuse_prepare_release(ff, flags, FUSE_RELEASE);
 	ff->reserved_req->force = 1;
 	ff->reserved_req->background = 0;
@@ -315,6 +496,31 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id)
 	return (u64) v0 + ((u64) v1 << 32);
 }
 
+static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from,
+				    pgoff_t idx_to)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_req *req;
+	bool found = false;
+
+	spin_lock(&fc->lock);
+	list_for_each_entry(req, &fi->writepages, writepages_entry) {
+		pgoff_t curr_index;
+
+		BUG_ON(req->inode != inode);
+		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
+		if (!(idx_from >= curr_index + req->num_pages ||
+		      idx_to < curr_index)) {
+			found = true;
+			break;
+		}
+	}
+	spin_unlock(&fc->lock);
+
+	return found;
+}
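+
+/*
+ * Each in-flight request above covers pages [curr_index,
+ * curr_index + req->num_pages - 1].  Example with curr_index = 10 and
+ * num_pages = 3 (pages 10..12):
+ *
+ *	idx_from = 13, idx_to = 20  ->  disjoint, not writeback
+ *	idx_from = 12, idx_to = 20  ->  overlaps page 12, writeback
+ *	idx_from =  0, idx_to =  9  ->  disjoint, not writeback
+ */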
+
 /*
  * Check if page is under writeback
  *
@@ -334,7 +540,8 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
 
 		BUG_ON(req->inode != inode);
 		curr_index = req->misc.write.in.offset >> PAGE_CACHE_SHIFT;
-		if (curr_index == index) {
+		if (curr_index <= index &&
+		    index < curr_index + req->num_pages) {
 			found = true;
 			break;
 		}
@@ -350,12 +557,45 @@ static bool fuse_page_is_writeback(struct inode *inode, pgoff_t index)
  * Since fuse doesn't rely on the VM writeback tracking, this has to
  * use some other means.
  */
-static int fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
+static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 
 	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index));
-	return 0;
+}
+
+/*
+ * Can be woken up by FUSE_NOTIFY_INVAL_FILES
+ */
+static void __fuse_wait_on_page_writeback_or_invalidate(struct inode *inode,
+							struct fuse_file *ff,
+							pgoff_t index)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+
+	wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index) ||
+		   test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state));
+}
+
+static void fuse_wait_on_page_writeback_or_invalidate(struct inode *inode,
+						      struct file *file,
+						      pgoff_t index)
+{
+	return __fuse_wait_on_page_writeback_or_invalidate(inode,
+				file->private_data, index);
+}
+
+static void fuse_wait_on_writeback(struct inode *inode, pgoff_t start,
+				   size_t bytes)
+{
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	pgoff_t idx_from, idx_to;
+
+	idx_from = start >> PAGE_CACHE_SHIFT;
+	idx_to = (start + bytes - 1) >> PAGE_CACHE_SHIFT;
+
+	wait_event(fi->page_waitq,
+		   !fuse_range_is_writeback(inode, idx_from, idx_to));
 }
 
 static int fuse_flush(struct file *file, fl_owner_t id)
@@ -370,9 +610,24 @@ static int fuse_flush(struct file *file, fl_owner_t id)
 	if (is_bad_inode(inode))
 		return -EIO;
 
-	if (fc->no_flush)
+	if (fc->no_flush || !(file->f_mode & FMODE_WRITE))
 		return 0;
 
+	err = filemap_write_and_wait(file->f_mapping);
+	if (err)
+		return err;
+
+	mutex_lock(&inode->i_mutex);
+	fuse_sync_writes(inode);
+	mutex_unlock(&inode->i_mutex);
+
+	if (test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags))
+		err = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &file->f_mapping->flags))
+		err = -EIO;
+	if (err)
+		return err;
+
 	req = fuse_get_req_nofail_nopages(fc, file);
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
@@ -441,12 +696,31 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 
 	fuse_sync_writes(inode);
 
+	/*
+	 * Due to the implementation of fuse writeback,
+	 * filemap_write_and_wait_range() does not catch errors.
+	 * We have to check them explicitly right after fuse_sync_writes().
+	 */
+	if (test_and_clear_bit(AS_ENOSPC, &file->f_mapping->flags))
+		err = -ENOSPC;
+	if (test_and_clear_bit(AS_EIO, &file->f_mapping->flags))
+		err = -EIO;
+	if (err)
+		goto out;
+
+	if (!datasync && test_bit(FUSE_I_MTIME_UPDATED,
+				  &get_fuse_inode(inode)->state)) {
+		err = fuse_flush_mtime(file, false);
+		if (err)
+			goto out;
+	}
+
 	req = fuse_get_req_nopages(fc);
 	if (IS_ERR(req)) {
 		err = PTR_ERR(req);
 		goto out;
 	}
 
+	mutex_unlock(&inode->i_mutex);
+
 	memset(&inarg, 0, sizeof(inarg));
 	inarg.fh = ff->fh;
 	inarg.fsync_flags = datasync ? 1 : 0;
@@ -465,6 +739,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end,
 			fc->no_fsync = 1;
 		err = 0;
 	}
+	return err;
 out:
 	mutex_unlock(&inode->i_mutex);
 	return err;
@@ -494,6 +769,9 @@ void fuse_read_fill(struct fuse_req *req, struct file *file, loff_t pos,
 	req->out.argvar = 1;
 	req->out.numargs = 1;
 	req->out.args[0].size = count;
+
+	if (opcode == FUSE_READ)
+		req->inode = file->f_dentry->d_inode;
 }
 
 static void fuse_release_user_pages(struct fuse_req *req, int write)
@@ -508,6 +786,29 @@ static void fuse_release_user_pages(struct fuse_req *req, int write)
 	}
 }
 
+static void fuse_fput_routine(struct work_struct *data)
+{
+	spin_lock(&fuse_fput_lock);
+	while (likely(!list_empty(&fuse_fput_head))) {
+		struct fuse_io_priv *io = list_entry(fuse_fput_head.next,
+						     struct fuse_io_priv,
+						     list);
+		struct file *file = io->file;
+
+		list_del(&io->list);
+		spin_unlock(&fuse_fput_lock);
+
+		/* hack: __fput() is not visible outside fs/file_table.c */
+		BUG_ON(atomic_long_read(&file->f_count));
+		atomic_long_inc(&file->f_count);
+		fput(file);
+
+		kfree(io);
+		spin_lock(&fuse_fput_lock);
+	}
+	spin_unlock(&fuse_fput_lock);
+}
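+
+/*
+ * fuse_aio_complete() below queues an io here only when its manual
+ * atomic_long_dec_and_test() dropped the last reference to the file,
+ * i.e. when the final fput() would otherwise run in the AIO completion
+ * path.  Doing that fput() from a workqueue keeps a single-threaded
+ * daemon from deadlocking on the release (or flush_mtime) request the
+ * final fput() generates; see the comment before get_file() there.
+ */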
+
 /**
  * In case of short read, the caller sets 'pos' to the position of
  * actual end of fuse request in IO request. Otherwise, if bytes_requested
@@ -539,6 +840,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 
 	if (!left) {
 		long res;
+		struct file *file = io->iocb->ki_filp;
 
 		if (io->err)
 			res = io->err;
@@ -548,7 +850,7 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 			res = io->bytes < 0 ? io->size : io->bytes;
 
 			if (!is_sync_kiocb(io->iocb)) {
-				struct path *path = &io->iocb->ki_filp->f_path;
+				struct path *path = &file->f_path;
 				struct inode *inode = path->dentry->d_inode;
 				struct fuse_conn *fc = get_fuse_conn(inode);
 				struct fuse_inode *fi = get_fuse_inode(inode);
@@ -559,8 +861,32 @@ static void fuse_aio_complete(struct fuse_io_priv *io, int err, ssize_t pos)
 			}
 		}
 
+		if (res < 0 && printk_ratelimit())
+			printk("fuse_aio_complete(io=%p, err=%d, pos=%ld"
+			       "): io->err=%d io->bytes=%ld io->size=%ld "
+			       "is_sync=%d res=%ld ki_opcode=%d ki_pos=%llu\n",
+			       io, err, pos, io->err, io->bytes,
+			       io->size, is_sync_kiocb(io->iocb), res,
+			       io->iocb->ki_opcode, io->iocb->ki_pos);
+
+		/*
+		 * We have to bump f_count here to avoid a deadlock with a
+		 * single-threaded fuse daemon: if the process that issued
+		 * the AIO has already close(2)-d the file, the fput() called
+		 * from aio_complete() would be the last one; it would then
+		 * send a flush_mtime (or release) request to userspace,
+		 * which is busy writing the ACK for this very AIO to the
+		 * kernel.
+		 */
+		get_file(file);
+		BUG_ON(io->file != io->iocb->ki_filp);
 		aio_complete(io->iocb, res, 0);
-		kfree(io);
+
+		if (unlikely(atomic_long_dec_and_test(&file->f_count))) {
+			spin_lock(&fuse_fput_lock);
+			list_add(&io->list, &fuse_fput_head);
+			spin_unlock(&fuse_fput_lock);
+			queue_work(fuse_fput_wq, &fuse_fput_work);
+		} else {
+			kfree(io);
+		}
 	}
 }
 
@@ -569,7 +895,8 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
 	struct fuse_io_priv *io = req->io;
 	ssize_t pos = -1;
 
-	fuse_release_user_pages(req, !io->write);
+	if (!req->bvec)
+		fuse_release_user_pages(req, !io->write);
 
 	if (io->write) {
 		if (req->misc.write.in.size != req->misc.write.out.size)
@@ -581,6 +908,15 @@ static void fuse_aio_complete_req(struct fuse_conn *fc, struct fuse_req *req)
 				req->out.args[0].size;
 	}
 
+	if (req->out.h.error)
+		printk("fuse_aio_complete_req: request (rw=%s fh=0x%llx "
+		       "pos=%lld size=%d) completed with err=%d\n",
+		       !io->write ? "READ"                   : "WRITE",
+		       !io->write ? req->misc.read.in.fh     : req->misc.write.in.fh,
+		       !io->write ? req->misc.read.in.offset : req->misc.write.in.offset,
+		       !io->write ? req->misc.read.in.size   : req->misc.write.in.size,
+		       req->out.h.error);
+
 	fuse_aio_complete(io, req->out.h.error, pos);
 }
 
@@ -609,6 +945,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
 	struct fuse_conn *fc = ff->fc;
 
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_account_request(fc, count);
 	if (owner != NULL) {
 		struct fuse_read_in *inarg = &req->misc.read.in;
 
@@ -619,7 +956,7 @@ static size_t fuse_send_read(struct fuse_req *req, struct fuse_io_priv *io,
 	if (io->async)
 		return fuse_async_req_send(fc, req, count, io);
 
-	fuse_request_send(fc, req);
+	fuse_request_check_and_send(fc, req, ff);
 	return req->out.args[0].size;
 }
 
@@ -638,86 +975,156 @@ static void fuse_read_update_size(struct inode *inode, loff_t size,
 	spin_unlock(&fc->lock);
 }
 
-static int fuse_readpage(struct file *file, struct page *page)
+static void fuse_short_read(struct fuse_req *req, struct inode *inode,
+			    u64 attr_ver)
+{
+	size_t num_read = req->out.args[0].size;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+
+	if (fc->writeback_cache) {
+		/*
+		 * A hole in the file. Some data after the hole is still in
+		 * the page cache but has not reached the filesystem daemon
+		 * yet, so the hole is not visible there. Zero the tail
+		 * locally instead of shrinking i_size.
+		 */
+		int i;
+		int start_idx = num_read >> PAGE_CACHE_SHIFT;
+		size_t off = num_read & (PAGE_CACHE_SIZE - 1);
+
+		for (i = start_idx; i < req->num_pages; i++) {
+			struct page *page = req->pages[i];
+			void *mapaddr = kmap_atomic(page);
+
+			memset(mapaddr + off, 0, PAGE_CACHE_SIZE - off);
+
+			kunmap_atomic(mapaddr);
+			off = 0;
+		}
+	} else {
+		loff_t pos = page_offset(req->pages[0]) + num_read;
+		fuse_read_update_size(inode, pos, attr_ver);
+	}
+}
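+
+/*
+ * Worked example for the writeback_cache branch, assuming 4K pages: a
+ * three-page read answered with num_read = 5120 gives start_idx = 1,
+ * off = 1024, so page 0 is left intact, page 1 is zeroed from offset
+ * 1024 on, and page 2 is zeroed entirely.  Without writeback cache a
+ * short read simply means EOF and i_size is clamped instead.
+ */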
+
+static int __fuse_readpage(struct file *file, struct page *page, size_t count,
+			   int *err, struct fuse_req **req_pp, u64 *attr_ver_p,
+			   bool page_needs_release, bool *killed_p)
 {
 	struct fuse_io_priv io = { .async = 0, .file = file };
 	struct inode *inode = page->mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_req *req;
-	size_t num_read;
-	loff_t pos = page_offset(page);
-	size_t count = PAGE_CACHE_SIZE;
-	u64 attr_ver;
-	int err;
-
-	err = -EIO;
-	if (is_bad_inode(inode))
-		goto out;
+	size_t num_read = 0;
+	bool killed = false;
 
 	/*
 	 * Page writeback can extend beyond the lifetime of the
 	 * page-cache page, so make sure we read a properly synced
 	 * page.
+	 *
+	 * But we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress.
 	 */
-	fuse_wait_on_page_writeback(inode, page->index);
+	fuse_wait_on_page_writeback_or_invalidate(inode, file, page->index);
 
 	req = fuse_get_req(fc, 1);
-	err = PTR_ERR(req);
+	*err = PTR_ERR(req);
 	if (IS_ERR(req))
-		goto out;
+		goto read_done;
 
-	attr_ver = fuse_get_attr_version(fc);
+	if (attr_ver_p)
+		*attr_ver_p = fuse_get_attr_version(fc);
 
 	req->out.page_zeroing = 1;
 	req->out.argpages = 1;
 	req->num_pages = 1;
 	req->pages[0] = page;
 	req->page_descs[0].length = count;
-	num_read = fuse_send_read(req, &io, pos, count, NULL);
-	err = req->out.h.error;
-	fuse_put_request(fc, req);
+	req->page_cache = 1;
+	req->page_needs_release = page_needs_release;
+
+	num_read = fuse_send_read(req, &io, page_offset(page), count, NULL);
+	killed = req->killed;
+	*err = killed ? -EIO : req->out.h.error;
+
+	if (*err)
+		fuse_put_request(fc, req);
+	else
+		*req_pp = req;
+read_done:
+	if (killed_p)
+		*killed_p = killed;
+	return num_read;
+}
+
+static int fuse_readpage(struct file *file, struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_req *req = NULL;
+	size_t num_read;
+	size_t count = PAGE_CACHE_SIZE;
+	u64 attr_ver;
+	int err;
+	bool killed = false;
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
 
+	num_read = __fuse_readpage(file, page, count, &err, &req, &attr_ver,
+				   false, &killed);
 	if (!err) {
 		/*
 		 * Short read means EOF.  If file size is larger, truncate it
 		 */
 		if (num_read < count)
-			fuse_read_update_size(inode, pos + num_read, attr_ver);
+			fuse_short_read(req, inode, attr_ver);
 
 		SetPageUptodate(page);
 	}
-
-	fuse_invalidate_attr(inode); /* atime changed */
- out:
-	unlock_page(page);
+	if (req) {
+		fuse_put_request(fc, req);
+		fuse_invalidate_attr(inode); /* atime changed */
+	}
+out:
+	if (!killed)
+		unlock_page(page);
 	return err;
 }
 
+void fuse_release_ff(struct inode *inode, struct fuse_file *ff)
+{
+	if (ff) {
+		if (ff->fc->close_wait) {
+			spin_lock(&ff->fc->lock);
+			__fuse_file_put(ff);
+			wake_up(&get_fuse_inode(inode)->page_waitq);
+			spin_unlock(&ff->fc->lock);
+		} else {
+			fuse_file_put(ff, false);
+		}
+	}
+}
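+
+/*
+ * Two put disciplines: with close_wait the reference is dropped under
+ * fc->lock via __fuse_file_put() (which must not be the last put) and
+ * page_waitq is woken so fuse_release() can observe ff->count == 1;
+ * otherwise the ordinary asynchronous fuse_file_put() path is taken.
+ */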
+
 static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 {
 	int i;
 	size_t count = req->misc.read.in.size;
 	size_t num_read = req->out.args[0].size;
-	struct address_space *mapping = NULL;
-
-	for (i = 0; mapping == NULL && i < req->num_pages; i++)
-		mapping = req->pages[i]->mapping;
+	struct inode *inode = req->inode;
 
-	if (mapping) {
-		struct inode *inode = mapping->host;
+	/* fused might have processed this request before the lease was lost */
+	if (req->killed && !req->out.h.error)
+		req->out.h.error = -EIO;
 
-		/*
-		 * Short read means EOF. If file size is larger, truncate it
-		 */
-		if (!req->out.h.error && num_read < count) {
-			loff_t pos;
+	if (req->killed)
+		goto killed;
 
-			pos = page_offset(req->pages[0]) + num_read;
-			fuse_read_update_size(inode, pos,
-					      req->misc.read.attr_ver);
-		}
-		fuse_invalidate_attr(inode); /* atime changed */
-	}
+	/*
+	 * Short read means EOF. If file size is larger, truncate it
+	 */
+	if (!req->out.h.error && num_read < count)
+		fuse_short_read(req, inode, req->misc.read.attr_ver);
 
 	for (i = 0; i < req->num_pages; i++) {
 		struct page *page = req->pages[i];
@@ -728,8 +1135,12 @@ static void fuse_readpages_end(struct fuse_conn *fc, struct fuse_req *req)
 		unlock_page(page);
 		page_cache_release(page);
 	}
+
+killed:
+	fuse_invalidate_attr(inode); /* atime changed */
+
 	if (req->ff)
-		fuse_file_put(req->ff, false);
+		fuse_release_ff(inode, req->ff);
 }
 
 static void fuse_send_readpages(struct fuse_req *req, struct file *file)
@@ -742,7 +1153,10 @@ static void fuse_send_readpages(struct fuse_req *req, struct file *file)
 	req->out.argpages = 1;
 	req->out.page_zeroing = 1;
 	req->out.page_replace = 1;
+	req->page_cache = 1;
+	req->page_needs_release = false;
 	fuse_read_fill(req, file, pos, count, FUSE_READ);
+	fuse_account_request(fc, count);
 	req->misc.read.attr_ver = fuse_get_attr_version(fc);
 	if (fc->async_read) {
 		req->ff = fuse_file_get(ff);
@@ -767,9 +1181,11 @@ static int fuse_readpages_fill(void *_data, struct page *page)
 	struct fuse_fill_data *data = _data;
 	struct fuse_req *req = data->req;
 	struct inode *inode = data->inode;
+	struct file *file = data->file;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 
-	fuse_wait_on_page_writeback(inode, page->index);
+	/* we can't wait if FUSE_NOTIFY_INVAL_FILES is in progress */
+	fuse_wait_on_page_writeback_or_invalidate(inode, file, page->index);
 
 	if (req->num_pages &&
 	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
@@ -892,6 +1308,7 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
 	struct fuse_write_in *inarg = &req->misc.write.in;
 
 	fuse_write_fill(req, ff, pos, count);
+	fuse_account_request(fc, count);
 	inarg->flags = file->f_flags;
 	if (owner != NULL) {
 		inarg->write_flags |= FUSE_WRITE_LOCKOWNER;
@@ -905,16 +1322,21 @@ static size_t fuse_send_write(struct fuse_req *req, struct fuse_io_priv *io,
 	return req->misc.write.out.size;
 }
 
-void fuse_write_update_size(struct inode *inode, loff_t pos)
+bool fuse_write_update_size(struct inode *inode, loff_t pos)
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool ret = false;
 
 	spin_lock(&fc->lock);
 	fi->attr_version = ++fc->attr_version;
-	if (pos > inode->i_size)
+	if (pos > inode->i_size) {
 		i_size_write(inode, pos);
+		ret = true;
+	}
 	spin_unlock(&fc->lock);
+
+	return ret;
 }
 
 static size_t fuse_send_write_pages(struct fuse_req *req, struct file *file,
@@ -993,6 +1415,7 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 
 		mark_page_accessed(page);
 
+		iov_iter_advance(ii, tmp);
 		if (!tmp) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -1005,7 +1428,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_req *req,
 		req->page_descs[req->num_pages].length = tmp;
 		req->num_pages++;
 
-		iov_iter_advance(ii, tmp);
 		count += tmp;
 		pos += tmp;
 		offset += tmp;
@@ -1099,6 +1521,9 @@ static ssize_t fuse_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 	struct iov_iter i;
 	loff_t endbyte = 0;
 
+	if (get_fuse_conn(inode)->writeback_cache)
+		return generic_file_aio_write(iocb, iov, nr_segs, pos);
+
 	WARN_ON(iocb->ki_pos != pos);
 
 	ocount = 0;
@@ -1181,7 +1606,12 @@ static inline void fuse_page_descs_length_init(struct fuse_req *req,
 
 static inline unsigned long fuse_get_user_addr(const struct iov_iter *ii)
 {
-	return (unsigned long)ii->iov->iov_base + ii->iov_offset;
+	struct iovec *iov;
+
+	BUG_ON(!iov_iter_has_iovec(ii));
+	iov = (struct iovec *)ii->data;
+
+	return (unsigned long)iov->iov_base + ii->iov_offset;
 }
 
 static inline size_t fuse_get_frag_size(const struct iov_iter *ii,
@@ -1272,8 +1702,10 @@ static inline int fuse_iter_npages(const struct iov_iter *ii_p)
 
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write)
+		       int flags)
 {
+	int write = flags & FUSE_DIO_WRITE;
+	int cuse = flags & FUSE_DIO_CUSE;
 	struct file *file = io->file;
 	struct fuse_file *ff = file->private_data;
 	struct fuse_conn *fc = ff->fc;
@@ -1283,6 +1715,8 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 	struct fuse_req *req;
 	struct iov_iter ii;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	iov_iter_init(&ii, iov, nr_segs, count, 0);
 
 	if (io->async)
@@ -1302,10 +1736,17 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 			break;
 		}
 
-		if (write)
+		if (!cuse)
+			fuse_wait_on_writeback(file->f_mapping->host, pos,
+					       nbytes);
+
+		if (write) {
 			nres = fuse_send_write(req, io, pos, nbytes, owner);
-		else
+			task_io_account_write(nbytes);
+		} else {
 			nres = fuse_send_read(req, io, pos, nbytes, owner);
+			task_io_account_read(nbytes);
+		}
 
 		if (!io->async)
 			fuse_release_user_pages(req, !write);
@@ -1380,7 +1821,8 @@ static ssize_t __fuse_direct_write(struct fuse_io_priv *io,
 
 	res = generic_write_checks(file, ppos, &count, 0);
 	if (!res)
-		res = fuse_direct_io(io, iov, nr_segs, count, ppos, 1);
+		res = fuse_direct_io(io, iov, nr_segs, count, ppos,
+				     FUSE_DIO_WRITE);
 
 	fuse_invalidate_attr(inode);
 
@@ -1410,8 +1852,13 @@ static ssize_t fuse_direct_write(struct file *file, const char __user *buf,
 
 static void fuse_writepage_free(struct fuse_conn *fc, struct fuse_req *req)
 {
-	__free_page(req->pages[0]);
-	fuse_file_put(req->ff, false);
+	int i;
+
+	for (i = 0; i < req->num_pages; i++)
+		__free_page(req->pages[i]);
+
+	if (!fc->writeback_cache && !fc->close_wait)
+		fuse_file_put(req->ff, false);
 }
 
 static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
@@ -1419,11 +1866,16 @@ static void fuse_writepage_finish(struct fuse_conn *fc, struct fuse_req *req)
 	struct inode *inode = req->inode;
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+	int i;
 
 	list_del(&req->writepages_entry);
-	dec_bdi_stat(bdi, BDI_WRITEBACK);
-	dec_zone_page_state(req->pages[0], NR_WRITEBACK_TEMP);
-	bdi_writeout_inc(bdi);
+	if (fc->writeback_cache || fc->close_wait)
+		__fuse_file_put(req->ff);
+	for (i = 0; i < req->num_pages; i++) {
+		dec_bdi_stat(bdi, BDI_WRITEBACK);
+		dec_zone_page_state(req->pages[i], NR_WRITEBACK_TEMP);
+		bdi_writeout_inc(bdi);
+	}
 	wake_up(&fi->page_waitq);
 }
 
@@ -1435,14 +1887,16 @@ __acquires(fc->lock)
 	struct fuse_inode *fi = get_fuse_inode(req->inode);
 	loff_t size = i_size_read(req->inode);
 	struct fuse_write_in *inarg = &req->misc.write.in;
+	__u64 data_size = req->num_pages * PAGE_CACHE_SIZE;
 
-	if (!fc->connected)
+	if (!fc->connected ||
+	    test_bit(FUSE_S_FAIL_IMMEDIATELY, &req->ff->ff_state))
 		goto out_free;
 
-	if (inarg->offset + PAGE_CACHE_SIZE <= size) {
-		inarg->size = PAGE_CACHE_SIZE;
+	if (inarg->offset + data_size <= size) {
+		inarg->size = data_size;
 	} else if (inarg->offset < size) {
-		inarg->size = size & (PAGE_CACHE_SIZE - 1);
+		inarg->size = size - inarg->offset;
 	} else {
 		/* Got truncated off completely */
 		goto out_free;
@@ -1495,20 +1949,62 @@ static void fuse_writepage_end(struct fuse_conn *fc, struct fuse_req *req)
 	fuse_writepage_free(fc, req);
 }
 
-static int fuse_writepage_locked(struct page *page)
+static struct fuse_file *fuse_write_file(struct fuse_conn *fc,
+					 struct fuse_inode *fi)
+{
+	struct fuse_file *ff = NULL;
+
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->write_files)) {
+		ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
+		fuse_file_get(ff);
+	}
+	spin_unlock(&fc->lock);
+
+	return ff;
+}
+
+static int fuse_writepage_locked(struct page *page,
+				 struct writeback_control *wbc,
+				 struct fuse_file **ff_pp)
 {
 	struct address_space *mapping = page->mapping;
 	struct inode *inode = mapping->host;
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	struct fuse_req *req;
-	struct fuse_file *ff;
 	struct page *tmp_page;
+	struct fuse_file *ff;
+	int err = 0;
 
-	set_page_writeback(page);
+	if (fuse_page_is_writeback(inode, page->index)) {
+		if (wbc->sync_mode != WB_SYNC_ALL) {
+			redirty_page_for_writepage(wbc, page);
+			return 0;
+		}
 
-	req = fuse_request_alloc_nofs(1);
-	if (!req)
+		/* safe to acquire ff here: we hold locked pages */
+		ff = fuse_write_file(fc, get_fuse_inode(inode));
+		if (!ff)
+			goto dummy_end_page_wb_err;
+
+		/* FUSE_NOTIFY_INVAL_FILES must be able to wake us up */
+		__fuse_wait_on_page_writeback_or_invalidate(inode, ff, page->index);
+
+		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+			if (ff_pp)
+				*ff_pp = ff;
+			goto dummy_end_page_wb;
+		}
+
+		fuse_release_ff(inode, ff);
+	}
+
+	if (test_set_page_writeback(page))
+		BUG();
+
+	req = fuse_request_alloc_nofs(1);
+	if (!req)
 		goto err;
 
 	req->background = 1; /* writeback always goes to bg_queue */
@@ -1516,13 +2012,13 @@ static int fuse_writepage_locked(struct page *page)
 	if (!tmp_page)
 		goto err_free;
 
-	spin_lock(&fc->lock);
-	BUG_ON(list_empty(&fi->write_files));
-	ff = list_entry(fi->write_files.next, struct fuse_file, write_entry);
-	req->ff = fuse_file_get(ff);
-	spin_unlock(&fc->lock);
-
-	fuse_write_fill(req, ff, page_offset(page), 0);
+	req->ff = fuse_write_file(fc, fi);
+	if (!req->ff)
+		goto err_nofile;
+	if (ff_pp)
+		*ff_pp = fuse_file_get(req->ff);
+	fuse_write_fill(req, req->ff, page_offset(page), 0);
+	fuse_account_request(fc, PAGE_CACHE_SIZE);
 
 	copy_highpage(tmp_page, page);
 	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
@@ -1547,31 +2043,433 @@ static int fuse_writepage_locked(struct page *page)
 
 	return 0;
 
+err_nofile:
+	printk("FUSE: page dirtied on dead file\n");
+	__free_page(tmp_page);
 err_free:
 	fuse_request_free(req);
 err:
 	end_page_writeback(page);
 	return -ENOMEM;
+
+dummy_end_page_wb_err:
+	printk("FUSE: page under fwb dirtied on dead file\n");
+	err = -EIO;
+	/* fall through ... */
+dummy_end_page_wb:
+	if (test_set_page_writeback(page))
+		BUG();
+	end_page_writeback(page);
+	return err;
 }
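+
+/*
+ * The dummy_end_page_wb path above cycles the page through
+ * set_page_writeback()/end_page_writeback() without sending anything:
+ * the caller (pageout or write_cache_pages) has already
+ * clear_page_dirty_for_io()'d it, so this effectively discards the
+ * data for a dead file while keeping the page state machine (and any
+ * writeback waiters) consistent.
+ */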
 
 static int fuse_writepage(struct page *page, struct writeback_control *wbc)
 {
 	int err;
 
-	err = fuse_writepage_locked(page);
+	err = fuse_writepage_locked(page, wbc, NULL);
+	unlock_page(page);
+
+	return err;
+}
+
+static int fuse_send_writepages(struct fuse_fill_data *data)
+{
+	int i, all_ok = 1;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct backing_dev_info *bdi = inode->i_mapping->backing_dev_info;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	struct fuse_file *ff;
+	loff_t off = -1;
+
+	/* safe to acquire ff here: we hold locked pages */
+	ff = fuse_write_file(fc, fi);
+
+	if (!ff) {
+		printk("FUSE: pages dirtied on dead file\n");
+		for (i = 0; i < req->num_pages; i++)
+			end_page_writeback(req->pages[i]);
+		return -EIO;
+	}
+
+	if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+		for (i = 0; i < req->num_pages; i++) {
+			struct page *page = req->pages[i];
+			req->pages[i] = NULL;
+			SetPageError(page);
+			end_page_writeback(page);
+		}
+		fuse_release_ff(inode, ff);
+		fuse_put_request(fc, req);
+		return 0;
+	}
+
+	req->inode = inode;
+	req->misc.write.in.offset = page_offset(req->pages[0]);
+
+	spin_lock(&fc->lock);
+	list_add(&req->writepages_entry, &fi->writepages);
+	spin_unlock(&fc->lock);
+
+	for (i = 0; i < req->num_pages; i++) {
+		struct page *page = req->pages[i];
+		struct page *tmp_page;
+
+		tmp_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
+		if (tmp_page) {
+			copy_highpage(tmp_page, page);
+			inc_bdi_stat(bdi, BDI_WRITEBACK);
+			inc_zone_page_state(tmp_page, NR_WRITEBACK_TEMP);
+		} else
+			all_ok = 0;
+		req->pages[i] = tmp_page;
+		if (i == 0)
+			off = page_offset(page);
+
+		end_page_writeback(page);
+	}
+
+	if (!all_ok) {
+		for (i = 0; i < req->num_pages; i++) {
+			struct page *page = req->pages[i];
+			if (page) {
+				dec_bdi_stat(bdi, BDI_WRITEBACK);
+				dec_zone_page_state(page, NR_WRITEBACK_TEMP);
+				__free_page(page);
+				req->pages[i] = NULL;
+			}
+		}
+
+		spin_lock(&fc->lock);
+		list_del(&req->writepages_entry);
+		wake_up(&fi->page_waitq);
+		spin_unlock(&fc->lock);
+
+		fuse_release_ff(inode, ff);
+		return -ENOMEM;
+	}
+
+	req->ff = fuse_file_get(ff);
+	fuse_write_fill(req, ff, off, 0);
+	fuse_account_request(fc, req->num_pages << PAGE_CACHE_SHIFT);
+
+	req->misc.write.in.write_flags |= FUSE_WRITE_CACHE;
+	req->in.argpages = 1;
+	req->background = 1;
+	fuse_page_descs_length_init(req, 0, req->num_pages);
+	req->end = fuse_writepage_end;
+
+	spin_lock(&fc->lock);
+	list_add_tail(&req->list, &fi->queued_writes);
+	fuse_flush_writepages(data->inode);
+	spin_unlock(&fc->lock);
+
+	fuse_release_ff(inode, ff);
+	return 0;
+}
+
+/*
+ * Returns true if and only if fuse connection is blocked and there is
+ * no file invalidation in progress.
+ */
+static inline bool fuse_blocked_for_wb(struct inode *inode)
+{
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool blocked = true;
+
+	if (!fc->blocked)
+		return false;
+
+	spin_lock(&fc->lock);
+	if (!list_empty(&fi->rw_files)) {
+		struct fuse_file *ff = list_entry(fi->rw_files.next,
+						  struct fuse_file, rw_entry);
+		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
+			blocked = false;
+	}
+	spin_unlock(&fc->lock);
+
+	return blocked;
+}
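+
+/*
+ * The rw_files peek explains the "no file invalidation in progress"
+ * half of the contract: if FUSE_NOTIFY_INVAL_FILES has marked the
+ * files FUSE_S_FAIL_IMMEDIATELY, waiting for fc->blocked to clear
+ * could deadlock against the daemon stuck in the notification, so
+ * writeback is allowed to proceed and fail fast instead.
+ */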
+
+static int fuse_writepages_fill(struct page *page,
+		struct writeback_control *wbc, void *_data)
+{
+	struct fuse_fill_data *data = _data;
+	struct fuse_req *req = data->req;
+	struct inode *inode = data->inode;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	int check_for_blocked = 0;
+
+	if (fuse_page_is_writeback(inode, page->index)) {
+		struct fuse_file *ff;
+
+		if (wbc->sync_mode != WB_SYNC_ALL) {
+			redirty_page_for_writepage(wbc, page);
+			unlock_page(page);
+			return 0;
+		}
+
+		/* safe to acquire ff here: we hold locked pages */
+		ff = fuse_write_file(fc, get_fuse_inode(inode));
+		if (!ff) {
+			printk("FUSE: dirty page on dead file\n");
+			unlock_page(page);
+			return -EIO;
+		}
+
+		/* FUSE_NOTIFY_INVAL_FILES must be able to wake us up */
+		__fuse_wait_on_page_writeback_or_invalidate(inode, ff, page->index);
+
+		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+			unlock_page(page);
+			fuse_release_ff(inode, ff);
+			return 0;
+		}
+
+		fuse_release_ff(inode, ff);
+	}
+
+	if (req->num_pages &&
+	    (req->num_pages == FUSE_MAX_PAGES_PER_REQ ||
+	     (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_write ||
+	     req->pages[req->num_pages - 1]->index + 1 != page->index)) {
+		int err;
+
+		err = fuse_send_writepages(data);
+		if (err) {
+			unlock_page(page);
+			return err;
+		}
+
+		data->req = req =
+			fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+		if (!req) {
+			unlock_page(page);
+			return -ENOMEM;
+		}
+
+		check_for_blocked = 1;
+	}
+
+	req->pages[req->num_pages] = page;
+	req->num_pages++;
+
+	if (test_set_page_writeback(page))
+		BUG();
+
 	unlock_page(page);
 
+	if (wbc->sync_mode != WB_SYNC_NONE && check_for_blocked)
+		wait_event(fc->blocked_waitq, !fuse_blocked_for_wb(inode));
+
+	return 0;
+}
+
+static int fuse_dummy_writepage(struct page *page,
+				struct writeback_control *wbc,
+				void *data)
+{
+	unlock_page(page);
+	return 0;
+}
+
+static int fuse_writepages(struct address_space *mapping,
+			   struct writeback_control *wbc)
+{
+	struct inode *inode = mapping->host;
+	struct fuse_conn *fc = get_fuse_conn(inode);
+	struct fuse_fill_data data;
+	struct fuse_file *ff;
+	int err;
+
+	if (!fc->writeback_cache)
+		return generic_writepages(mapping, wbc);
+
+	err = -EIO;
+	if (is_bad_inode(inode))
+		goto out;
+
+	/*
+	 * We use fuse_blocked_for_wb() instead of just fc->blocked to avoid
+	 * deadlock when we are called from fuse_invalidate_files() in case
+	 * of single-threaded fused.
+	 */
+	if (wbc->sync_mode != WB_SYNC_NONE)
+		wait_event(fc->blocked_waitq, !fuse_blocked_for_wb(inode));
+
+	/* This is more than an optimization: write the pages back to
+	 * /dev/null. fused would drop our FUSE_WRITE requests anyway, but it
+	 * would stay blocked sending NOTIFY_INVAL_FILES until we returned!
+	 *
+	 * NB: We can't wait till fuse_send_writepages() because
+	 * fuse_writepages_fill() would possibly deadlock on
+	 * fuse_page_is_writeback().
+	 */
+	ff = fuse_write_file(fc, get_fuse_inode(inode));
+	if (ff && test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state)) {
+		err = write_cache_pages(mapping, wbc, fuse_dummy_writepage,
+					mapping);
+		fuse_release_ff(inode, ff);
+		goto out;
+	}
+	if (ff)
+		fuse_release_ff(inode, ff);
+
+	data.inode = inode;
+	data.req = fuse_request_alloc_nofs(FUSE_MAX_PAGES_PER_REQ);
+	err = -ENOMEM;
+	if (!data.req)
+		goto out;
+
+	err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data);
+	if (data.req) {
+		if (!err && data.req->num_pages) {
+			err = fuse_send_writepages(&data);
+			if (err)
+				fuse_put_request(fc, data.req);
+		} else
+			fuse_put_request(fc, data.req);
+	}
+out:
 	return err;
 }
 
+/*
+ * Determine the number of bytes of data the page contains
+ */
+static inline unsigned fuse_page_length(struct page *page)
+{
+	loff_t i_size = i_size_read(page_file_mapping(page)->host);
+
+	if (i_size > 0) {
+		pgoff_t page_index = page_file_index(page);
+		pgoff_t end_index = (i_size - 1) >> PAGE_CACHE_SHIFT;
+		if (page_index < end_index)
+			return PAGE_CACHE_SIZE;
+		if (page_index == end_index)
+			return ((i_size - 1) & ~PAGE_CACHE_MASK) + 1;
+	}
+	return 0;
+}
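A worked example of the tail-page arithmetic, assuming 4K pages
(PAGE_CACHE_SHIFT == 12):

	/* i_size = 10000, PAGE_CACHE_SIZE = 4096:
	 *   end_index = (10000 - 1) >> 12 = 2
	 *   pages 0-1 -> full, return 4096
	 *   page 2    -> ((10000 - 1) & 4095) + 1 = 1808 bytes
	 *   page 3+   -> beyond EOF, return 0
	 */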
+
+static inline bool fuse_file_fail_immediately(struct file *file)
+{
+	struct fuse_file *ff = file->private_data;
+
+	return test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+}
+
+static int fuse_prepare_write(struct fuse_conn *fc, struct file *file,
+		struct page *page, loff_t pos, unsigned len)
+{
+	struct fuse_req *req = NULL;
+	unsigned num_read;
+	unsigned page_len;
+	int err;
+	bool killed = false;
+
+	if (fuse_file_fail_immediately(file)) {
+		unlock_page(page);
+		page_cache_release(page);
+		return -EIO;
+	}
+
+	if (PageUptodate(page) || (len == PAGE_CACHE_SIZE))
+		return 0;
+
+	page_len = fuse_page_length(page);
+	if (!page_len) {
+		zero_user(page, 0, PAGE_CACHE_SIZE);
+		return 0;
+	}
+
+	num_read = __fuse_readpage(file, page, page_len, &err, &req, NULL,
+				   true, &killed);
+	if (req)
+		fuse_put_request(fc, req);
+	if (err) {
+		if (!killed) {
+			unlock_page(page);
+			page_cache_release(page);
+		}
+	} else if (num_read != PAGE_CACHE_SIZE) {
+		zero_user_segment(page, num_read, PAGE_CACHE_SIZE);
+	}
+	return err;
+}
+
+/*
+ * It would be worthwhile to ensure that space is reserved on disk for the
+ * write, but how to implement that without killing performance needs more
+ * thought.
+ */
+static int fuse_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+	struct fuse_conn *fc = get_fuse_conn(file->f_dentry->d_inode);
+
+	BUG_ON(!fc->writeback_cache);
+
+	*pagep = grab_cache_page_write_begin(mapping, index, flags);
+	if (!*pagep)
+		return -ENOMEM;
+
+	return fuse_prepare_write(fc, file, *pagep, pos, len);
+}
+
+static int fuse_commit_write(struct file *file, struct page *page,
+		unsigned from, unsigned to)
+{
+	struct inode *inode = page->mapping->host;
+	loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	fuse_write_update_size(inode, pos);
+	set_page_dirty(page);
+	return 0;
+}
+
+static int fuse_write_end(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned copied,
+		struct page *page, void *fsdata)
+{
+	unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+
+	fuse_commit_write(file, page, from, from+copied);
+
+	unlock_page(page);
+	page_cache_release(page);
+
+	return copied;
+}
+
 static int fuse_launder_page(struct page *page)
 {
 	int err = 0;
 	if (clear_page_dirty_for_io(page)) {
 		struct inode *inode = page->mapping->host;
-		err = fuse_writepage_locked(page);
-		if (!err)
-			fuse_wait_on_page_writeback(inode, page->index);
+		struct writeback_control wbc = {
+			.sync_mode = WB_SYNC_ALL,
+		};
+		struct fuse_file *ff = NULL;
+		err = fuse_writepage_locked(page, &wbc, &ff);
+		if (!err) {
+			/*
+			 * We need to check FAIL_IMMEDIATELY because otherwise
+			 * fuse_do_setattr may get stuck in invalidate_inode_pages2
+			 * forever (if fuse_invalidate_files is in progress).
+			 */
+			__fuse_wait_on_page_writeback_or_invalidate(inode,
+								    ff, page->index);
+			fuse_release_ff(inode, ff);
+		}
 	}
 	return err;
 }
@@ -1582,7 +2480,11 @@ static int fuse_launder_page(struct page *page)
  */
 static void fuse_vma_close(struct vm_area_struct *vma)
 {
-	filemap_write_and_wait(vma->vm_file->f_mapping);
+	struct file *file = vma->vm_file;
+	struct fuse_file *ff = file->private_data;
+
+	if (!ff->fc->writeback_cache)
+		filemap_write_and_wait(file->f_mapping);
 }
 
 /*
@@ -1609,6 +2511,9 @@ static int fuse_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 */
 	struct inode *inode = vma->vm_file->f_mapping->host;
 
+	if (fuse_file_fail_immediately(vma->vm_file))
+		return -EIO;
+
 	fuse_wait_on_page_writeback(inode, page->index);
 	return 0;
 }
@@ -1617,25 +2522,13 @@ static const struct vm_operations_struct fuse_file_vm_ops = {
 	.close		= fuse_vma_close,
 	.fault		= filemap_fault,
 	.page_mkwrite	= fuse_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int fuse_file_mmap(struct file *file, struct vm_area_struct *vma)
 {
-	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE)) {
-		struct inode *inode = file_inode(file);
-		struct fuse_conn *fc = get_fuse_conn(inode);
-		struct fuse_inode *fi = get_fuse_inode(inode);
-		struct fuse_file *ff = file->private_data;
-		/*
-		 * file may be written through mmap, so chain it onto the
-		 * inodes's write_file list
-		 */
-		spin_lock(&fc->lock);
-		if (list_empty(&ff->write_entry))
-			list_add(&ff->write_entry, &fi->write_files);
-		spin_unlock(&fc->lock);
-	}
+	if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_MAYWRITE))
+		fuse_link_write_file(file);
+
 	file_accessed(file);
 	vma->vm_ops = &fuse_file_vm_ops;
 	return 0;
@@ -1874,8 +2767,9 @@ static int fuse_ioctl_copy_user(struct page **pages, struct iovec *iov,
 		kaddr = kmap(page);
 
 		while (todo) {
-			char __user *uaddr = ii.iov->iov_base + ii.iov_offset;
-			size_t iov_len = ii.iov->iov_len - ii.iov_offset;
+			struct iovec *iiov = (struct iovec *)ii.data;
+			char __user *uaddr = iiov->iov_base + ii.iov_offset;
+			size_t iov_len = iiov->iov_len - ii.iov_offset;
 			size_t copy = min(todo, iov_len);
 			size_t left;
 
@@ -2366,6 +3260,104 @@ int fuse_notify_poll_wakeup(struct fuse_conn *fc,
 	return 0;
 }
 
+static struct fuse_io_priv *fuse_io_priv_create(struct kiocb *iocb,
+		loff_t off, int rw, bool async)
+{
+	struct fuse_io_priv *io;
+
+	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
+	if (!io)
+		return NULL;
+
+	spin_lock_init(&io->lock);
+	io->reqs = 1;
+	io->bytes = -1;
+	io->size = 0;
+	io->offset = off;
+	io->write = (rw == WRITE);
+	io->err = 0;
+	io->file = iocb->ki_filp;
+	io->async = async;
+	io->iocb = iocb;
+
+	return io;
+}
+
+static ssize_t fuse_direct_IO_bvec(int rw, struct kiocb *iocb,
+		struct bio_vec *bvec, loff_t offset, unsigned long bvec_len)
+{
+	struct fuse_io_priv *io;
+	struct fuse_req *req;
+	struct file *file = iocb->ki_filp;
+	struct fuse_file *ff = file->private_data;
+	struct fuse_conn *fc = ff->fc;
+	size_t nmax = (rw == WRITE ? fc->max_write : fc->max_read);
+	size_t filled, nres;
+	loff_t pos = iocb->ki_pos;
+	int i;
+
+	if (nmax > FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT)
+		nmax = FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT;
+
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
+	io = fuse_io_priv_create(iocb, pos, rw, true);
+	if (!io)
+		return -ENOMEM;
+
+	req = NULL;
+	filled = 0;
+	i = 0;
+
+	while (1) {
+		if (!req) {
+			req = fuse_get_req_for_background(fc, 0);
+			if (IS_ERR(req))
+				break;
+
+			if (rw == WRITE)
+				req->in.argbvec = 1;
+			else
+				req->out.argbvec = 1;
+
+			filled = 0;
+			req->bvec = bvec;
+		}
+
+		if (filled + bvec->bv_len <= nmax) {
+			filled += bvec->bv_len;
+			req->num_bvecs++;
+			bvec++;
+			i++;
+
+			if (i < bvec_len)
+				continue;
+		}
+
+		BUG_ON(!filled);
+
+		if (rw == WRITE)
+			nres = fuse_send_write(req, io, pos,
+					filled, NULL);
+		else
+			nres = fuse_send_read(req, io, pos,
+					filled, NULL);
+
+		BUG_ON(nres != filled);
+		fuse_put_request(fc, req);
+
+		if (i == bvec_len)
+			break;
+
+		pos += filled;
+		req = NULL;
+		filled = 0;
+	}
+
+	fuse_aio_complete(io, !IS_ERR(req) ? 0 : PTR_ERR(req), -1);
+	return -EIOCBQUEUED;
+}
+
 static void fuse_do_truncate(struct file *file)
 {
 	struct inode *inode = file->f_mapping->host;
@@ -2392,7 +3384,7 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	ssize_t ret = 0;
 	struct file *file = iocb->ki_filp;
 	struct fuse_file *ff = file->private_data;
-	bool async_dio = ff->fc->async_dio;
+	bool async_dio = ff->fc->async_dio || ff->fc->writeback_cache;
 	loff_t pos = 0;
 	struct inode *inode;
 	loff_t i_size;
@@ -2403,37 +3395,40 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	inode = file->f_mapping->host;
 	i_size = i_size_read(inode);
 
+	if ((rw == READ) && (offset > i_size))
+		return 0;
+
 	/* optimization for short read */
 	if (async_dio && rw != WRITE && offset + count > i_size) {
+		loff_t new_count;
+
 		if (offset >= i_size)
 			return 0;
-		count = min_t(loff_t, count, fuse_round_up(i_size - offset));
+
+		new_count = i_size - offset;
+		if (!ff->fc->writeback_cache)
+			new_count = fuse_round_up(new_count);
+
+		count = min_t(loff_t, count, new_count);
 	}
 
-	io = kmalloc(sizeof(struct fuse_io_priv), GFP_KERNEL);
-	if (!io)
-		return -ENOMEM;
-	spin_lock_init(&io->lock);
-	io->reqs = 1;
-	io->bytes = -1;
-	io->size = 0;
-	io->offset = offset;
-	io->write = (rw == WRITE);
-	io->err = 0;
-	io->file = file;
 	/*
 	 * By default, we want to optimize all I/Os with async request
 	 * submission to the client filesystem if supported.
 	 */
-	io->async = async_dio;
-	io->iocb = iocb;
+	io = fuse_io_priv_create(iocb, offset, rw, async_dio);
+	if (!io)
+		return -ENOMEM;
 
 	/*
 	 * We cannot asynchronously extend the size of a file. We have no method
 	 * to wait on real async I/O requests, so we must submit this request
 	 * synchronously.
+	 * It is also useless to process small synchronous READs asynchronously.
 	 */
-	if (!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE)
+	if ((!is_sync_kiocb(iocb) && (offset + count > i_size) && rw == WRITE) ||
+	    (rw != WRITE && is_sync_kiocb(iocb) &&
+	     count <= (FUSE_MAX_PAGES_PER_REQ << PAGE_SHIFT)))
 		io->async = false;
 
 	if (rw == WRITE)
@@ -2442,6 +3437,14 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 		ret = __fuse_direct_read(io, iov, nr_segs, &pos, count);
 
 	if (io->async) {
+		if (ret != count && printk_ratelimit()) {
+			struct fuse_file *ff = file->private_data;
+			printk("fuse_direct_IO: failed to %s %ld bytes "
+			       "(offset=%llu ret=%ld i_size=%llu ino=%lu "
+			       "fh=%llu\n", rw == WRITE ? "write" : "read",
+			       count, offset, ret, i_size, inode->i_ino,
+			       ff->fh);
+		}
 		fuse_aio_complete(io, ret < 0 ? ret : 0, -1);
 
 		/* we have a non-extending, async request, so return */
@@ -2463,6 +3466,32 @@ fuse_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
 	return ret;
 }
 
+static ssize_t fuse_direct_IO_page(int rw, struct kiocb *iocb,
+	struct page *page, loff_t offset)
+{
+	struct iovec iov;
+	mm_segment_t oldfs;
+	ssize_t ret;
+
+	iov.iov_base = kmap(page);
+	iov.iov_len = PAGE_SIZE;
+
+	oldfs = get_fs();
+	set_fs(KERNEL_DS);
+
+	ret = fuse_direct_IO(rw, iocb, &iov, offset, 1);
+	if (ret != -EIOCBQUEUED && ret != PAGE_SIZE)
+		printk("fuse_direct_IO_page: io failed with err=%ld "
+		       "(rw=%s fh=0x%llx pos=%lld)\n",
+		       ret, rw == WRITE ? "WRITE" : "READ",
+		       ((struct fuse_file *)iocb->ki_filp->private_data)->fh,
+		       offset);
+
+	set_fs(oldfs);
+	kunmap(page);
+	return ret;
+}
+
 static long fuse_file_fallocate(struct file *file, int mode, loff_t offset,
 				loff_t length)
 {
@@ -2559,6 +3588,8 @@ static const struct file_operations fuse_file_operations = {
 	.compat_ioctl	= fuse_file_compat_ioctl,
 	.poll		= fuse_file_poll,
 	.fallocate	= fuse_file_fallocate,
+	.read_iter	= generic_file_read_iter,
+	.write_iter	= generic_file_write_iter,
 };
 
 static const struct file_operations fuse_direct_io_file_operations = {
@@ -2582,11 +3613,16 @@ static const struct file_operations fuse_direct_io_file_operations = {
 static const struct address_space_operations fuse_file_aops  = {
 	.readpage	= fuse_readpage,
 	.writepage	= fuse_writepage,
+	.writepages	= fuse_writepages,
 	.launder_page	= fuse_launder_page,
 	.readpages	= fuse_readpages,
 	.set_page_dirty	= __set_page_dirty_nobuffers,
 	.bmap		= fuse_bmap,
 	.direct_IO	= fuse_direct_IO,
+	.direct_IO_bvec	= fuse_direct_IO_bvec,
+	.direct_IO_page	= fuse_direct_IO_page,
+	.write_begin	= fuse_write_begin,
+	.write_end	= fuse_write_end,
 };
 
 void fuse_init_file_inode(struct inode *inode)
--- a/fs/fuse/fuse_i.h
+++ b/fs/fuse/fuse_i.h
@@ -33,7 +33,7 @@
 #define FUSE_NAME_MAX 1024
 
 /** Number of dentries for each connection in the control filesystem */
-#define FUSE_CTL_NUM_DENTRIES 5
+#define FUSE_CTL_NUM_DENTRIES 10
 
 /** If the FUSE_DEFAULT_PERMISSIONS flag is given, the filesystem
     module will check permissions based on the file mode.  Otherwise no
@@ -44,6 +44,15 @@
     doing the mount will be allowed to access the filesystem */
 #define FUSE_ALLOW_OTHER         (1 << 1)
 
+/* Enable direct access */
+#define FUSE_ODIRECT             (1 << 2)
+
+/* Enable synchronous umount */
+#define FUSE_UMOUNT_WAIT	(1 << 3)
+
+/* Disable synchronous close */
+#define FUSE_DISABLE_CLOSE_WAIT	(1 << 4)
+
 /** Number of page pointers embedded in fuse_req */
 #define FUSE_REQ_INLINE_PAGES 1
 
@@ -94,6 +103,9 @@ struct fuse_inode {
 	/** Files usable in writepage.  Protected by fc->lock */
 	struct list_head write_files;
 
+	/** List of all opened files.  Protected by fc->lock */
+	struct list_head rw_files;
+
 	/** Writepages pending on truncate or fsync */
 	struct list_head queued_writes;
 
@@ -109,6 +121,12 @@ struct fuse_inode {
 
 	/** Miscellaneous bits describing inode state */
 	unsigned long state;
+
+	/** Mostly to detect the very first open */
+	atomic_t num_openers;
+
+	/** Even though num_openers > 0, trust the server's i_size */
+	int i_size_unstable;
 };
 
 /** FUSE inode state bits */
@@ -117,6 +135,8 @@ enum {
 	FUSE_I_ADVISE_RDPLUS,
 	/** An operation changing file size is in progress  */
 	FUSE_I_SIZE_UNSTABLE,
+	/** i_mtime has been updated locally; a flush to userspace needed */
+	FUSE_I_MTIME_UPDATED,
 };
 
 struct fuse_conn;
@@ -147,14 +167,28 @@ struct fuse_file {
 	/** Entry on inode's write_files list */
 	struct list_head write_entry;
 
+	/** Entry on inode's rw_files list */
+	struct list_head rw_entry;
+
 	/** RB node to be linked on fuse_conn->polled_files */
 	struct rb_node polled_node;
 
 	/** Wait queue head for poll */
 	wait_queue_head_t poll_wait;
 
+	struct list_head fl;
+	struct dentry *ff_dentry;
+
 	/** Has flock been performed on this file? */
 	bool flock:1;
+
+	/** Miscellaneous FUSE_S_* state bits, see the enum below */
+	unsigned long ff_state;
+};
+
+/** FUSE file states (ff_state) */
+enum {
+	/** Any fops on given ff should fail immediately */
+	FUSE_S_FAIL_IMMEDIATELY,
 };
 
 /** One input argument of a request */
@@ -170,6 +204,8 @@ struct fuse_in {
 
 	/** True if the data for the last argument is in req->pages */
 	unsigned argpages:1;
+	/** True if the data for the last argument is in req->bvec */
+	unsigned argbvec:1;
 
 	/** Number of arguments */
 	unsigned numargs;
@@ -200,6 +236,8 @@ struct fuse_out {
 
 	/** Last argument is a list of pages to copy data to */
 	unsigned argpages:1;
+	/** Last argument is a list of bvecs to copy data to */
+	unsigned argbvec:1;
 
 	/** Zero partially or not copied pages */
 	unsigned page_zeroing:1;
@@ -242,6 +280,7 @@ struct fuse_io_priv {
 	int err;
 	struct kiocb *iocb;
 	struct file *file;
+	struct list_head list;
 };
 
 /**
@@ -288,6 +327,15 @@ struct fuse_req {
 	/** Request is counted as "waiting" */
 	unsigned waiting:1;
 
+	/** Request contains pages from page-cache */
+	unsigned page_cache:1;
+
+	/** Request pages need page_cache_release() */
+	unsigned page_needs_release:1;
+
+	/** Request was killed -- pages were released */
+	unsigned killed:1;
+
 	/** State of the request */
 	enum fuse_req_state state;
 
@@ -326,6 +374,7 @@ struct fuse_req {
 
 	/** page vector */
 	struct page **pages;
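+
+	/** bio_vec vector, used when the argbvec flag is set */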
+	struct bio_vec *bvec;
 
 	/** page-descriptor vector */
 	struct fuse_page_desc *page_descs;
@@ -339,8 +388,11 @@ struct fuse_req {
 	/** inline page-descriptor vector */
 	struct fuse_page_desc inline_page_descs[FUSE_REQ_INLINE_PAGES];
 
-	/** number of pages in vector */
-	unsigned num_pages;
+	/** number of pages/bvecs in vector */
+	union {
+		unsigned num_pages;
+		unsigned num_bvecs;
+	};
 
 	/** File used in the request (or NULL) */
 	struct fuse_file *ff;
@@ -478,6 +530,9 @@ struct fuse_conn {
 	/** Set if bdi is valid */
 	unsigned bdi_initialized:1;
 
+	/** write-back cache policy (default is write-through) */
+	unsigned writeback_cache:1;
+
 	/*
 	 * The following bitfields are only for optimization purposes
 	 * and hence races in setting them will not cause malfunction
@@ -540,6 +595,9 @@ struct fuse_conn {
 	/** Use enhanced/automatic page cache invalidation. */
 	unsigned auto_inval_data:1;
 
+	/** Wait for response from daemon on close */
+	unsigned close_wait:1;
+
 	/** Does the filesystem support readdirplus? */
 	unsigned do_readdirplus:1;
 
@@ -549,6 +607,9 @@ struct fuse_conn {
 	/** Does the filesystem support asynchronous direct-IO submission? */
 	unsigned async_dio:1;
 
+	/** Handle wrong FUSE_NOTIFY_INVAL_FILES from old fused */
+	unsigned compat_inval_files:1;
+
 	/** The number of requests waiting for completion */
 	atomic_t num_waiting;
 
@@ -590,6 +651,8 @@ struct fuse_conn {
 
 	/** Read/write semaphore to hold when accessing sb. */
 	struct rw_semaphore killsb;
+
+	struct list_head conn_files;
 };
 
 static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb)
@@ -627,7 +690,7 @@ int fuse_inode_eq(struct inode *inode, void *_nodeidp);
  */
 struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 			int generation, struct fuse_attr *attr,
-			u64 attr_valid, u64 attr_version);
+			u64 attr_valid, u64 attr_version, int creat);
 
 int fuse_lookup_name(struct super_block *sb, u64 nodeid, struct qstr *name,
 		     struct fuse_entry_out *outarg, struct inode **inode);
@@ -772,6 +835,12 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req);
 void fuse_request_send(struct fuse_conn *fc, struct fuse_req *req);
 
 /**
+ * Send a request (synchronous) if not FUSE_S_FAIL_IMMEDIATELY
+ */
+void fuse_request_check_and_send(struct fuse_conn *fc, struct fuse_req *req,
+				 struct fuse_file *ff);
+
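fuse_request_check_and_send() is only declared here; its body lands in
fs/fuse/dev.c elsewhere in the series. A sketch of what such a helper
plausibly does, given the FUSE_S_FAIL_IMMEDIATELY semantics above (this is
an assumption, not the patch's actual code):

	void fuse_request_check_and_send(struct fuse_conn *fc,
					 struct fuse_req *req,
					 struct fuse_file *ff)
	{
		/* Fail up front if fuse_invalidate_files() marked the file */
		if (test_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state))
			req->out.h.error = -EIO;
		else
			fuse_request_send(fc, req);
	}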
+/**
  * Send a request in the background
  */
 void fuse_request_send_background(struct fuse_conn *fc, struct fuse_req *req);
@@ -831,6 +900,8 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id);
 int fuse_update_attributes(struct inode *inode, struct kstat *stat,
 			   struct file *file, bool *refreshed);
 
+int fuse_getattr_size(struct inode *inode, struct file *file, u64 *size);
+
 void fuse_flush_writepages(struct inode *inode);
 
 void fuse_set_nowrite(struct inode *inode);
@@ -857,11 +928,28 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
 int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid,
 			     u64 child_nodeid, struct qstr *name);
 
+/**
+ * File-system tells the kernel to invalidate all fuse-files (and cache)
+ * for the given node id.
+ */
+int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid);
+
 int fuse_do_open(struct fuse_conn *fc, u64 nodeid, struct file *file,
 		 bool isdir);
+
+/**
+ * fuse_direct_io() flags
+ */
+
+/** If set, it is WRITE; otherwise - READ */
+#define FUSE_DIO_WRITE (1 << 0)
+
+/** CUSE passes fuse_direct_io() a file whose f_mapping->host is not a fuse inode */
+#define FUSE_DIO_CUSE  (1 << 1)
+
 ssize_t fuse_direct_io(struct fuse_io_priv *io, const struct iovec *iov,
 		       unsigned long nr_segs, size_t count, loff_t *ppos,
-		       int write);
+		       int flags);
 long fuse_do_ioctl(struct file *file, unsigned int cmd, unsigned long arg,
 		   unsigned int flags);
 long fuse_ioctl_common(struct file *file, unsigned int cmd,
@@ -869,7 +957,9 @@ long fuse_ioctl_common(struct file *file, unsigned int cmd,
 unsigned fuse_file_poll(struct file *file, poll_table *wait);
 int fuse_dev_release(struct inode *inode, struct file *file);
 
-void fuse_write_update_size(struct inode *inode, loff_t pos);
+bool fuse_write_update_size(struct inode *inode, loff_t pos);
+
+int fuse_flush_mtime(struct file *file, bool nofail);
 
 int fuse_do_setattr(struct inode *inode, struct iattr *attr,
 		    struct file *file);
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -20,6 +20,7 @@
 #include <linux/random.h>
 #include <linux/sched.h>
 #include <linux/exportfs.h>
+#include <linux/ve.h>
 
 MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>");
 MODULE_DESCRIPTION("Filesystem in Userspace");
@@ -29,6 +30,8 @@ static struct kmem_cache *fuse_inode_cachep;
 struct list_head fuse_conn_list;
 DEFINE_MUTEX(fuse_mutex);
 
+static int fuse_ve_odirect;
+
 static int set_global_limit(const char *val, struct kernel_param *kp);
 
 unsigned max_user_bgreq;
@@ -66,6 +69,7 @@ struct fuse_mount_data {
 	unsigned rootmode_present:1;
 	unsigned user_id_present:1;
 	unsigned group_id_present:1;
+	unsigned writeback_cache:1;
 	unsigned flags;
 	unsigned max_read;
 	unsigned blksize;
@@ -93,7 +97,9 @@ static struct inode *fuse_alloc_inode(struct super_block *sb)
 	fi->writectr = 0;
 	fi->orig_ino = 0;
 	fi->state = 0;
+	fi->i_size_unstable = 0;
 	INIT_LIST_HEAD(&fi->write_files);
+	INIT_LIST_HEAD(&fi->rw_files);
 	INIT_LIST_HEAD(&fi->queued_writes);
 	INIT_LIST_HEAD(&fi->writepages);
 	init_waitqueue_head(&fi->page_waitq);
@@ -116,6 +122,7 @@ static void fuse_destroy_inode(struct inode *inode)
 {
 	struct fuse_inode *fi = get_fuse_inode(inode);
 	BUG_ON(!list_empty(&fi->write_files));
+	BUG_ON(!list_empty(&fi->rw_files));
 	BUG_ON(!list_empty(&fi->queued_writes));
 	kfree(fi->forget);
 	call_rcu(&inode->i_rcu, fuse_i_callback);
@@ -170,8 +177,11 @@ void fuse_change_attributes_common(struct inode *inode, struct fuse_attr *attr,
 	inode->i_blocks  = attr->blocks;
 	inode->i_atime.tv_sec   = attr->atime;
 	inode->i_atime.tv_nsec  = attr->atimensec;
-	inode->i_mtime.tv_sec   = attr->mtime;
-	inode->i_mtime.tv_nsec  = attr->mtimensec;
+	/* mtime from the server may be stale due to local buffered writes */
+	if (!fc->writeback_cache || !S_ISREG(inode->i_mode)) {
+		inode->i_mtime.tv_sec   = attr->mtime;
+		inode->i_mtime.tv_nsec  = attr->mtimensec;
+	}
 	inode->i_ctime.tv_sec   = attr->ctime;
 	inode->i_ctime.tv_nsec  = attr->ctimensec;
 
@@ -197,6 +207,7 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 {
 	struct fuse_conn *fc = get_fuse_conn(inode);
 	struct fuse_inode *fi = get_fuse_inode(inode);
+	bool is_wb = fc->writeback_cache;
 	loff_t oldsize;
 	struct timespec old_mtime;
 
@@ -211,10 +222,17 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	fuse_change_attributes_common(inode, attr, attr_valid);
 
 	oldsize = inode->i_size;
-	i_size_write(inode, attr->size);
+	/*
+	 * When writeback_cache is enabled, cached writes beyond EOF extend
+	 * the local i_size without keeping the userspace server in sync, so
+	 * attr->size coming from the server can be stale and cannot be
+	 * trusted.
+	 */
+	if (!is_wb || !S_ISREG(inode->i_mode) ||
+	    !atomic_read(&fi->num_openers) || fi->i_size_unstable)
+		i_size_write(inode, attr->size);
 	spin_unlock(&fc->lock);
 
-	if (S_ISREG(inode->i_mode)) {
+	if (!is_wb && S_ISREG(inode->i_mode)) {
 		bool inval = false;
 
 		if (oldsize != attr->size) {
@@ -239,10 +257,16 @@ void fuse_change_attributes(struct inode *inode, struct fuse_attr *attr,
 	}
 }
 
-static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr)
+static void fuse_init_inode(struct inode *inode, struct fuse_attr *attr,
+			    int num_openers)
 {
+	struct fuse_inode *fi = get_fuse_inode(inode);
+	atomic_set(&fi->num_openers, num_openers);
+
 	inode->i_mode = attr->mode & S_IFMT;
 	inode->i_size = attr->size;
+	inode->i_mtime.tv_sec  = attr->mtime;
+	inode->i_mtime.tv_nsec = attr->mtimensec;
 	if (S_ISREG(inode->i_mode)) {
 		fuse_init_common(inode);
 		fuse_init_file_inode(inode);
@@ -277,7 +301,7 @@ static int fuse_inode_set(struct inode *inode, void *_nodeidp)
 
 struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 			int generation, struct fuse_attr *attr,
-			u64 attr_valid, u64 attr_version)
+			u64 attr_valid, u64 attr_version, int creat)
 {
 	struct inode *inode;
 	struct fuse_inode *fi;
@@ -289,10 +313,13 @@ struct inode *fuse_iget(struct super_block *sb, u64 nodeid,
 		return NULL;
 
 	if ((inode->i_state & I_NEW)) {
-		inode->i_flags |= S_NOATIME|S_NOCMTIME;
+		inode->i_flags |= S_NOATIME;
+		if (!fc->writeback_cache)
+			inode->i_flags |= S_NOCMTIME;
 		inode->i_generation = generation;
 		inode->i_data.backing_dev_info = &fc->bdi;
-		fuse_init_inode(inode, attr);
+		fuse_init_inode(inode, attr,
+				fc->writeback_cache ? creat : 0);
 		unlock_new_inode(inode);
 	} else if ((inode->i_mode ^ attr->mode) & S_IFMT) {
 		/* Inode has changed type, any I/O on the old should fail */
@@ -335,6 +362,82 @@ int fuse_reverse_inval_inode(struct super_block *sb, u64 nodeid,
 	return 0;
 }
 
+static void fuse_kill_requests(struct fuse_conn *fc, struct inode *inode,
+			       struct list_head *req_list)
+{
+	struct fuse_req *req;
+
+	list_for_each_entry(req, req_list, list) {
+		if (req->inode == inode && req->page_cache && !req->killed) {
+			int i;
+
+			BUG_ON(req->in.h.opcode != FUSE_READ);
+			req->killed = 1;
+
+			for (i = 0; i < req->num_pages; i++) {
+				struct page *page = req->pages[i];
+				SetPageError(page);
+				unlock_page(page);
+				if (req->page_needs_release)
+					page_cache_release(page);
+				req->pages[i] = NULL;
+			}
+
+			req->num_pages = 0;
+		}
+	}
+}
+
+int fuse_invalidate_files(struct fuse_conn *fc, u64 nodeid)
+{
+	struct super_block *sb = fc->sb;
+	struct inode *inode;
+	struct fuse_inode *fi;
+	struct fuse_file *ff;
+	int err;
+
+	if (!fc->async_read) {
+		printk(KERN_ERR "Turn async_read ON to use "
+				"FUSE_NOTIFY_INVAL_FILES!\n");
+		return -EOPNOTSUPP;
+	}
+
+	inode = ilookup5(sb, nodeid, fuse_inode_eq, &nodeid);
+	if (!inode)
+		return -ENOENT;
+
+	fi = get_fuse_inode(inode);
+	spin_lock(&fc->lock);
+	list_for_each_entry(ff, &fi->rw_files, rw_entry) {
+		set_bit(FUSE_S_FAIL_IMMEDIATELY, &ff->ff_state);
+	}
+	spin_unlock(&fc->lock);
+
+	/* let them see FUSE_S_FAIL_IMMEDIATELY */
+	wake_up_all(&fc->blocked_waitq);
+
+	/* see how fuse_writepages_fill() waits for fuse writeback */
+	wake_up(&fi->page_waitq);
+
+	err = filemap_write_and_wait(inode->i_mapping);
+	if (!err || err == -EIO) { /* AS_EIO might trigger -EIO */
+		spin_lock(&fc->lock);
+		fuse_kill_requests(fc, inode, &fc->processing);
+		fuse_kill_requests(fc, inode, &fc->pending);
+		fuse_kill_requests(fc, inode, &fc->bg_queue);
+		fuse_kill_requests(fc, inode, &fc->io);
+		wake_up(&fi->page_waitq); /* readpage[s] can wait on fuse wb */
+		spin_unlock(&fc->lock);
+
+		err = invalidate_inode_pages2(inode->i_mapping);
+	}
+
+	if (!err)
+		fuse_invalidate_attr(inode);
+
+	iput(inode);
+	return err;
+}
+
 static void fuse_umount_begin(struct super_block *sb)
 {
 	fuse_abort_conn(get_fuse_conn_super(sb));
@@ -446,6 +549,10 @@ enum {
 	OPT_ALLOW_OTHER,
 	OPT_MAX_READ,
 	OPT_BLKSIZE,
+	OPT_WBCACHE,
+	OPT_ODIRECT,
+	OPT_UMOUNT_WAIT,
+	OPT_DISABLE_CLOSE_WAIT,
 	OPT_ERR
 };
 
@@ -458,6 +565,10 @@ static const match_table_t tokens = {
 	{OPT_ALLOW_OTHER,		"allow_other"},
 	{OPT_MAX_READ,			"max_read=%u"},
 	{OPT_BLKSIZE,			"blksize=%u"},
+	{OPT_WBCACHE,			"writeback_enable"},
+	{OPT_ODIRECT,			"direct_enable"},
+	{OPT_UMOUNT_WAIT,		"umount_wait"},
+	{OPT_DISABLE_CLOSE_WAIT,	"disable_close_wait"},
 	{OPT_ERR,			NULL}
 };
 
@@ -531,6 +642,28 @@ static int parse_fuse_opt(char *opt, struct fuse_mount_data *d, int is_bdev)
 			d->blksize = value;
 			break;
 
+		case OPT_WBCACHE:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->writeback_cache = 1;
+			break;
+
+		case OPT_ODIRECT:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->flags |= FUSE_ODIRECT;
+			break;
+
+		case OPT_UMOUNT_WAIT:
+			if (!ve_is_super(get_exec_env()) && !fuse_ve_odirect)
+				return -EPERM;
+			d->flags |= FUSE_UMOUNT_WAIT;
+			break;
+
+		case OPT_DISABLE_CLOSE_WAIT:
+			d->flags |= FUSE_DISABLE_CLOSE_WAIT;
+			break;
+
 		default:
 			return 0;
 		}
@@ -554,10 +687,18 @@ static int fuse_show_options(struct seq_file *m, struct dentry *root)
 		seq_puts(m, ",default_permissions");
 	if (fc->flags & FUSE_ALLOW_OTHER)
 		seq_puts(m, ",allow_other");
+	if (fc->flags & FUSE_ODIRECT)
+		seq_puts(m, ",direct_enable");
+	if (fc->flags & FUSE_UMOUNT_WAIT)
+		seq_puts(m, ",umount_wait");
+	if (fc->flags & FUSE_DISABLE_CLOSE_WAIT)
+		seq_puts(m, ",disable_close_wait");
 	if (fc->max_read != ~0)
 		seq_printf(m, ",max_read=%u", fc->max_read);
 	if (sb->s_bdev && sb->s_blocksize != FUSE_DEFAULT_BLKSIZE)
 		seq_printf(m, ",blksize=%lu", sb->s_blocksize);
+	if (fc->writeback_cache)
+		seq_puts(m, ",writeback_enable");
 	return 0;
 }
 
@@ -577,6 +718,7 @@ void fuse_conn_init(struct fuse_conn *fc)
 	INIT_LIST_HEAD(&fc->interrupts);
 	INIT_LIST_HEAD(&fc->bg_queue);
 	INIT_LIST_HEAD(&fc->entry);
+	INIT_LIST_HEAD(&fc->conn_files);
 	fc->forget_list_tail = &fc->forget_list_head;
 	atomic_set(&fc->num_waiting, 0);
 	fc->max_background = FUSE_DEFAULT_MAX_BACKGROUND;
@@ -617,7 +759,7 @@ static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode)
 	attr.mode = mode;
 	attr.ino = FUSE_ROOT_ID;
 	attr.nlink = 1;
-	return fuse_iget(sb, 1, 0, &attr, 0, 0);
+	return fuse_iget(sb, 1, 0, &attr, 0, 0, 0);
 }
 
 struct fuse_inode_handle {
@@ -949,10 +1091,10 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 		return err;
 
 	/*
-	 * For a single fuse filesystem use max 1% of dirty +
+	 * For a single fuse filesystem use max 20% of dirty +
 	 * writeback threshold.
 	 *
-	 * This gives about 1M of write buffer for memory maps on a
+	 * This gives about 20M of write buffer for memory maps on a
 	 * machine with 1G and 10% dirty_ratio, which should be more
 	 * than enough.
 	 *
@@ -960,7 +1102,13 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb)
 	 *
 	 *    /sys/class/bdi/<bdi>/max_ratio
 	 */
-	bdi_set_max_ratio(&fc->bdi, 1);
+	bdi_set_max_ratio(&fc->bdi, 20);
+
+	/*
+	 * These values take precedence over max_ratio
+	 */
+	bdi_set_max_dirty(&fc->bdi, (512 * 1024 * 1024) / PAGE_SIZE);
+	bdi_set_min_dirty(&fc->bdi, (256 * 1024 * 1024) / PAGE_SIZE);
 
 	return 0;
 }
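To make the revised numbers concrete: with 1G of RAM and dirty_ratio = 10%,
the global dirty threshold is about 100M, and a 20% max_ratio yields the
quoted ~20M per-fuse write buffer:

	dirty threshold ~ 1G * 10% (dirty_ratio)  ~ 100M
	per-bdi share   ~ 20% of 100M             ~ 20M

The bdi_set_{max,min}_dirty() calls (apparently a Virtuozzo extension) then
pin the per-bdi dirty limit to the 256M-512M page-count range on larger
machines, overriding the ratio as the comment says.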
@@ -1007,7 +1155,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err;
 
 	if ((file->f_op != &fuse_dev_operations) ||
-	    (file->f_cred->user_ns != &init_user_ns))
+	    (file->f_cred->user_ns != ve_init_user_ns()))
 		goto err_fput;
 
 	fc = kmalloc(sizeof(*fc), GFP_KERNEL);
@@ -1035,6 +1183,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 	fc->user_id = d.user_id;
 	fc->group_id = d.group_id;
 	fc->max_read = max_t(unsigned, 4096, d.max_read);
+	fc->writeback_cache = d.writeback_cache;
 
 	/* Used by get_root_inode() */
 	sb->s_fs_info = fc;
@@ -1052,7 +1201,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent)
 		goto err_put_root;
 	init_req->background = 1;
 
-	if (is_bdev) {
+	if (is_bdev || (fc->flags & FUSE_UMOUNT_WAIT)) {
 		fc->destroy_req = fuse_request_alloc(0);
 		if (!fc->destroy_req)
 			goto err_free_init_req;
@@ -1102,7 +1251,25 @@ static struct dentry *fuse_mount(struct file_system_type *fs_type,
 		       int flags, const char *dev_name,
 		       void *raw_data)
 {
-	return mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+	struct dentry *dentry;
+
+	dentry = mount_nodev(fs_type, flags, raw_data, fuse_fill_super);
+
+	/* A hack to recognize the pcs fuse service and force synchronous close
+	 * for it.  This seems to be the only place where we have a variable
+	 * (dev_name) that is already defined and not confined by the fuse API.
+	 */
+	if (!IS_ERR(dentry) && dev_name &&
+	    (strncmp(dev_name, "pstorage://", 11) == 0 ||
+	     strncmp(dev_name, "vstorage://", 11) == 0)) {
+		struct fuse_conn *fc = dentry->d_sb->s_fs_info;
+
+		if (!(fc->flags & FUSE_DISABLE_CLOSE_WAIT))
+			fc->close_wait = 1;
+
+		fc->compat_inval_files = 1;
+	}
+	return dentry;
 }
 
 static void fuse_kill_sb_anon(struct super_block *sb)
@@ -1121,7 +1288,7 @@ static void fuse_kill_sb_anon(struct super_block *sb)
 static struct file_system_type fuse_fs_type = {
 	.owner		= THIS_MODULE,
 	.name		= "fuse",
-	.fs_flags	= FS_HAS_SUBTYPE,
+	.fs_flags	= FS_HAS_SUBTYPE | FS_VIRTUALIZED,
 	.mount		= fuse_mount,
 	.kill_sb	= fuse_kill_sb_anon,
 };
@@ -1189,8 +1356,8 @@ static int __init fuse_fs_init(void)
 	int err;
 
 	fuse_inode_cachep = kmem_cache_create("fuse_inode",
-					      sizeof(struct fuse_inode),
-					      0, SLAB_HWCACHE_ALIGN,
+					      sizeof(struct fuse_inode), 0,
+					      SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 					      fuse_inode_init_once);
 	err = -ENOMEM;
 	if (!fuse_inode_cachep)
@@ -1260,6 +1427,24 @@ static void fuse_sysfs_cleanup(void)
 	kobject_put(fuse_kobj);
 }
 
+static ctl_table fuse_table[] = {
+	{
+		.procname	= "fuse-ve-odirect",
+		.data		= &fuse_ve_odirect,
+		.maxlen		= sizeof(fuse_ve_odirect),
+		.mode		= 0600,
+		.proc_handler	= &proc_dointvec,
+	},
+	{}
+};
+
+static struct ctl_path fuse_path[] = {
+	{ .procname = "fs", },
+	{},
+};
+
+static struct ctl_table_header *fuse_sysctl_header;
+
 static int __init fuse_init(void)
 {
 	int res;
@@ -1287,6 +1472,8 @@ static int __init fuse_init(void)
 	sanitize_global_limit(&max_user_bgreq);
 	sanitize_global_limit(&max_user_congthresh);
 
+	fuse_sysctl_header = register_sysctl_paths(fuse_path, fuse_table);
+
 	return 0;
 
  err_sysfs_cleanup:
@@ -1307,6 +1494,7 @@ static void __exit fuse_exit(void)
 	fuse_sysfs_cleanup();
 	fuse_fs_cleanup();
 	fuse_dev_cleanup();
+	unregister_sysctl_table(fuse_sysctl_header);
 }
 
 module_init(fuse_init);
--- a/fs/gfs2/acl.c
+++ b/fs/gfs2/acl.c
@@ -272,8 +272,8 @@ static int gfs2_xattr_system_set(struct dentry *dentry, const char *name,
 
 	if (type == ACL_TYPE_ACCESS) {
 		umode_t mode = inode->i_mode;
-		error = posix_acl_equiv_mode(acl, &mode);
 
+		error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
 		if (error <= 0) {
 			posix_acl_release(acl);
 			acl = NULL;
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -498,7 +498,6 @@ out:
 static const struct vm_operations_struct gfs2_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = gfs2_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 /**
@@ -655,7 +654,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 {
 	struct address_space *mapping = file->f_mapping;
 	struct inode *inode = mapping->host;
-	int sync_state = inode->i_state & I_DIRTY;
+	int sync_state = inode->i_state & I_DIRTY_ALL;
 	struct gfs2_inode *ip = GFS2_I(inode);
 	int ret = 0, ret1 = 0;
 
@@ -668,7 +667,7 @@ static int gfs2_fsync(struct file *file, loff_t start, loff_t end,
 	if (!gfs2_is_jdata(ip))
 		sync_state &= ~I_DIRTY_PAGES;
 	if (datasync)
-		sync_state &= ~I_DIRTY_SYNC;
+		sync_state &= ~(I_DIRTY_SYNC | I_DIRTY_TIME);
 
 	if (sync_state) {
 		ret = sync_inode_metadata(inode, 1);
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -1373,21 +1373,22 @@ add_back_to_lru:
  * gfs2_dispose_glock_lru() above.
  */
 
-static void gfs2_scan_glock_lru(int nr)
+static long gfs2_scan_glock_lru(int nr)
 {
 	struct gfs2_glock *gl;
 	LIST_HEAD(skipped);
 	LIST_HEAD(dispose);
+	long freed = 0;
 
 	spin_lock(&lru_lock);
-	while(nr && !list_empty(&lru_list)) {
+	while ((nr-- >= 0) && !list_empty(&lru_list)) {
 		gl = list_entry(lru_list.next, struct gfs2_glock, gl_lru);
 
 		/* Test for being demotable */
 		if (!test_bit(GLF_LOCK, &gl->gl_flags)) {
 			list_move(&gl->gl_lru, &dispose);
 			atomic_dec(&lru_count);
-			nr--;
+			freed++;
 			continue;
 		}
 
@@ -1397,23 +1398,28 @@ static void gfs2_scan_glock_lru(int nr)
 	if (!list_empty(&dispose))
 		gfs2_dispose_glock_lru(&dispose);
 	spin_unlock(&lru_lock);
+
+	return freed;
 }
 
-static int gfs2_shrink_glock_memory(struct shrinker *shrink,
-				    struct shrink_control *sc)
+static unsigned long gfs2_glock_shrink_scan(struct shrinker *shrink,
+					    struct shrink_control *sc)
 {
-	if (sc->nr_to_scan) {
-		if (!(sc->gfp_mask & __GFP_FS))
-			return -1;
-		gfs2_scan_glock_lru(sc->nr_to_scan);
-	}
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+	return gfs2_scan_glock_lru(sc->nr_to_scan);
+}
 
-	return (atomic_read(&lru_count) / 100) * sysctl_vfs_cache_pressure;
+static unsigned long gfs2_glock_shrink_count(struct shrinker *shrink,
+					     struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_read(&lru_count));
 }
 
 static struct shrinker glock_shrinker = {
-	.shrink = gfs2_shrink_glock_memory,
 	.seeks = DEFAULT_SEEKS,
+	.count_objects = gfs2_glock_shrink_count,
+	.scan_objects = gfs2_glock_shrink_scan,
 };
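Both gfs2 shrinkers here get the same mechanical conversion: the old
->shrink callback, which conflated counting and scanning, becomes
->count_objects (a reclaim estimate) plus ->scan_objects (the number
actually freed, or SHRINK_STOP when reclaim is impossible in the current
context). vfs_pressure_ratio() replaces the open-coded
`count * sysctl_vfs_cache_pressure / 100`; in mainline it is approximately:

	static inline long vfs_pressure_ratio(long val)
	{
		return mult_frac(val, sysctl_vfs_cache_pressure, 100);
	}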
 
 /**
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -112,7 +112,8 @@ static int __init init_gfs2_fs(void)
 	gfs2_inode_cachep = kmem_cache_create("gfs2_inode",
 					      sizeof(struct gfs2_inode),
 					      0,  SLAB_RECLAIM_ACCOUNT|
-					          SLAB_MEM_SPREAD,
+						  SLAB_MEM_SPREAD|
+						  SLAB_ACCOUNT,
 					      gfs2_init_inode_once);
 	if (!gfs2_inode_cachep)
 		goto fail;
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -142,7 +142,8 @@ static void gfs2_qd_dispose(struct list_head *list)
 	}
 }
 
-static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock, void *arg)
+static enum lru_status gfs2_qd_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
 	struct list_head *dispose = arg;
 	struct gfs2_quota_data *qd = list_entry(item, struct gfs2_quota_data, qd_lru);
@@ -152,35 +153,41 @@ static enum lru_status gfs2_qd_isolate(struct list_head *item, spinlock_t *lock,
 
 	if (qd->qd_lockref.count == 0) {
 		lockref_mark_dead(&qd->qd_lockref);
-		list_move(&qd->qd_lru, dispose);
+		list_lru_isolate_move(lru, &qd->qd_lru, dispose);
 	}
 
 	spin_unlock(&qd->qd_lockref.lock);
 	return LRU_REMOVED;
 }
 
-static int gfs2_shrink_qd_memory(struct shrinker *shrink,
-				 struct shrink_control *sc)
+static unsigned long gfs2_qd_shrink_scan(struct shrinker *shrink,
+					 struct shrink_control *sc)
 {
 	LIST_HEAD(dispose);
-
-	if (sc->nr_to_scan == 0)
-		goto out;
+	unsigned long freed;
 
 	if (!(sc->gfp_mask & __GFP_FS))
-		return -1;
+		return SHRINK_STOP;
 
-	list_lru_walk(&gfs2_qd_lru, gfs2_qd_isolate, &dispose, sc->nr_to_scan);
+	freed = list_lru_shrink_walk(&gfs2_qd_lru, sc,
+				     gfs2_qd_isolate, &dispose);
 
 	gfs2_qd_dispose(&dispose);
 
-out:
-	return (list_lru_count(&gfs2_qd_lru) * sysctl_vfs_cache_pressure) / 100;
+	return freed;
+}
+
+static unsigned long gfs2_qd_shrink_count(struct shrinker *shrink,
+					  struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(list_lru_shrink_count(&gfs2_qd_lru, sc));
 }
 
 struct shrinker gfs2_qd_shrinker = {
-	.shrink = gfs2_shrink_qd_memory,
+	.count_objects = gfs2_qd_shrink_count,
+	.scan_objects = gfs2_qd_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
 };
 
 static u64 qd2index(struct gfs2_quota_data *qd)
--- a/fs/hfs/super.c
+++ b/fs/hfs/super.c
@@ -481,8 +481,8 @@ static int __init init_hfs_fs(void)
 	int err;
 
 	hfs_inode_cachep = kmem_cache_create("hfs_inode_cache",
-		sizeof(struct hfs_inode_info), 0, SLAB_HWCACHE_ALIGN,
-		hfs_init_once);
+		sizeof(struct hfs_inode_info), 0,
+		SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, hfs_init_once);
 	if (!hfs_inode_cachep)
 		return -ENOMEM;
 	err = register_filesystem(&hfs_fs_type);
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -656,7 +656,7 @@ static int __init init_hfsplus_fs(void)
 	int err;
 
 	hfsplus_inode_cachep = kmem_cache_create("hfsplus_icache",
-		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN,
+		HFSPLUS_INODE_SIZE, 0, SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT,
 		hfsplus_init_once);
 	if (!hfsplus_inode_cachep)
 		return -ENOMEM;
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -228,7 +228,7 @@ static struct inode *hostfs_alloc_inode(struct super_block *sb)
 {
 	struct hostfs_inode_info *hi;
 
-	hi = kmalloc(sizeof(*hi), GFP_KERNEL);
+	hi = kmalloc(sizeof(*hi), GFP_KERNEL_ACCOUNT);
 	if (hi == NULL)
 		return NULL;
 	hi->fd = -1;
--- a/fs/hpfs/super.c
+++ b/fs/hpfs/super.c
@@ -201,7 +201,7 @@ static int init_inodecache(void)
 	hpfs_inode_cachep = kmem_cache_create("hpfs_inode_cache",
 					     sizeof(struct hpfs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (hpfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -169,7 +169,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr,
 		addr = ALIGN(addr, huge_page_size(h));
 		vma = find_vma(mm, addr);
 		if (TASK_SIZE - len >= addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)))
 			return addr;
 	}
 
@@ -1042,7 +1042,7 @@ static int __init init_hugetlbfs_fs(void)
 	error = -ENOMEM;
 	hugetlbfs_inode_cachep = kmem_cache_create("hugetlbfs_inode_cache",
 					sizeof(struct hugetlbfs_inode_info),
-					0, 0, init_once);
+					0, SLAB_ACCOUNT, init_once);
 	if (hugetlbfs_inode_cachep == NULL)
 		goto out2;
 
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -17,6 +17,9 @@
 #include <linux/prefetch.h>
 #include <linux/buffer_head.h> /* for inode_has_buffers */
 #include <linux/ratelimit.h>
+#include <linux/list_lru.h>
+#include <linux/vzstat.h>
+#include <trace/events/writeback.h>
 #include "internal.h"
 
 /*
@@ -24,12 +27,12 @@
  *
  * inode->i_lock protects:
  *   inode->i_state, inode->i_hash, __iget()
- * inode->i_sb->s_inode_lru_lock protects:
+ * Inode LRU list locks protect:
  *   inode->i_sb->s_inode_lru, inode->i_lru
  * inode_sb_list_lock protects:
  *   sb->s_inodes, inode->i_sb_list
  * bdi->wb.list_lock protects:
- *   bdi->wb.b_{dirty,io,more_io}, inode->i_wb_list
+ *   bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list
  * inode_hash_lock protects:
  *   inode_hashtable, inode->i_hash
  *
@@ -37,7 +40,7 @@
  *
  * inode_sb_list_lock
  *   inode->i_lock
- *     inode->i_sb->s_inode_lru_lock
+ *     Inode LRU list locks
  *
  * bdi->wb.list_lock
  *   inode->i_lock
@@ -56,6 +59,7 @@ static struct hlist_head *inode_hashtable __read_mostly;
 static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock);
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock);
+EXPORT_SYMBOL_GPL(inode_sb_list_lock);
 
 /*
  * Empty aops. Can be used for the cases where the user does not
@@ -65,38 +69,42 @@ const struct address_space_operations empty_aops = {
 };
 EXPORT_SYMBOL(empty_aops);
 
+const struct inode_operations empty_iops = {
+};
+EXPORT_SYMBOL(empty_iops);
+
 /*
  * Statistics gathering..
  */
 struct inodes_stat_t inodes_stat;
 
-static DEFINE_PER_CPU(unsigned int, nr_inodes);
-static DEFINE_PER_CPU(unsigned int, nr_unused);
+static DEFINE_PER_CPU(unsigned long, nr_inodes);
+static DEFINE_PER_CPU(unsigned long, nr_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static int get_nr_inodes(void)
+static long get_nr_inodes(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_inodes, i);
 	return sum < 0 ? 0 : sum;
 }
 
-static inline int get_nr_inodes_unused(void)
+static inline long get_nr_inodes_unused(void)
 {
 	int i;
-	int sum = 0;
+	long sum = 0;
 	for_each_possible_cpu(i)
 		sum += per_cpu(nr_unused, i);
 	return sum < 0 ? 0 : sum;
 }
 
-int get_nr_dirty_inodes(void)
+long get_nr_dirty_inodes(void)
 {
 	/* not actually dirty inodes, but a wild approximation */
-	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
+	long nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
 }
 
@@ -109,7 +117,7 @@ int proc_nr_inodes(ctl_table *table, int write,
 {
 	inodes_stat.nr_inodes = get_nr_inodes();
 	inodes_stat.nr_unused = get_nr_inodes_unused();
-	return proc_dointvec(table, write, buffer, lenp, ppos);
+	return proc_doulongvec_minmax(table, write, buffer, lenp, ppos);
 }
 #endif
 
@@ -174,6 +182,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
 	mapping->private_data = NULL;
 	mapping->backing_dev_info = &default_backing_dev_info;
 	mapping->writeback_index = 0;
+	mapping->dirtied_ub = NULL;
 
 	/*
 	 * If the block_device provides a backing_dev_info for client
@@ -237,6 +246,7 @@ EXPORT_SYMBOL(free_inode_nonrcu);
 void __destroy_inode(struct inode *inode)
 {
 	BUG_ON(inode_has_buffers(inode));
+	BUG_ON(inode->i_data.dirtied_ub);
 	security_inode_free(inode);
 	fsnotify_inode_delete(inode);
 	if (!inode->i_nlink) {
@@ -357,7 +367,7 @@ void address_space_init_once(struct address_space *mapping)
 	INIT_LIST_HEAD(&mapping->private_list);
 	spin_lock_init(&mapping->private_lock);
 	mapping->i_mmap = RB_ROOT;
-	INIT_LIST_HEAD(&mapping->i_mmap_nonlinear);
+	INIT_LIST_HEAD(&mapping->i_peer_list);
 }
 EXPORT_SYMBOL(address_space_init_once);
 
@@ -395,6 +405,7 @@ void __iget(struct inode *inode)
 {
 	atomic_inc(&inode->i_count);
 }
+EXPORT_SYMBOL(__iget);
 
 /*
  * get additional reference to inode; caller must already hold one.
@@ -407,13 +418,8 @@ EXPORT_SYMBOL(ihold);
 
 static void inode_lru_list_add(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (list_empty(&inode->i_lru)) {
-		list_add(&inode->i_lru, &inode->i_sb->s_inode_lru);
-		inode->i_sb->s_nr_inodes_unused++;
+	if (list_lru_add(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_inc(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /*
@@ -423,7 +429,8 @@ static void inode_lru_list_add(struct inode *inode)
  */
 void inode_add_lru(struct inode *inode)
 {
-	if (!(inode->i_state & (I_DIRTY | I_SYNC | I_FREEING | I_WILL_FREE)) &&
+	if (!(inode->i_state & (I_DIRTY_ALL | I_SYNC |
+				I_FREEING | I_WILL_FREE)) &&
 	    !atomic_read(&inode->i_count) && inode->i_sb->s_flags & MS_ACTIVE)
 		inode_lru_list_add(inode);
 }
@@ -431,13 +438,9 @@ void inode_add_lru(struct inode *inode)
 
 static void inode_lru_list_del(struct inode *inode)
 {
-	spin_lock(&inode->i_sb->s_inode_lru_lock);
-	if (!list_empty(&inode->i_lru)) {
-		list_del_init(&inode->i_lru);
-		inode->i_sb->s_nr_inodes_unused--;
+	if (list_lru_del(&inode->i_sb->s_inode_lru, &inode->i_lru))
 		this_cpu_dec(nr_unused);
-	}
-	spin_unlock(&inode->i_sb->s_inode_lru_lock);
 }
 
 /**
@@ -658,7 +661,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 			spin_unlock(&inode->i_lock);
 			continue;
 		}
-		if (inode->i_state & I_DIRTY && !kill_dirty) {
+		if (inode->i_state & I_DIRTY_ALL && !kill_dirty) {
 			spin_unlock(&inode->i_lock);
 			busy = 1;
 			continue;
@@ -681,24 +684,8 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty)
 	return busy;
 }
 
-static int can_unuse(struct inode *inode)
-{
-	if (inode->i_state & ~I_REFERENCED)
-		return 0;
-	if (inode_has_buffers(inode))
-		return 0;
-	if (atomic_read(&inode->i_count))
-		return 0;
-	if (inode->i_data.nrpages)
-		return 0;
-	return 1;
-}
-
 /*
- * Walk the superblock inode LRU for freeable inodes and attempt to free them.
- * This is called from the superblock shrinker function with a number of inodes
- * to trim from the LRU. Inodes to be freed are moved to a temporary list and
- * then are freed outside inode_lock by dispose_list().
+ * Isolate the inode from the LRU in preparation for freeing it.
  *
  * Any inodes which are pinned purely because of attached pagecache have their
  * pagecache removed.  If the inode has metadata buffers attached to
@@ -712,89 +699,83 @@ static int can_unuse(struct inode *inode)
  * LRU does not have strict ordering. Hence we don't want to reclaim inodes
  * with this flag set because they are the inodes that are out of order.
  */
-void prune_icache_sb(struct super_block *sb, int nr_to_scan)
+static enum lru_status inode_lru_isolate(struct list_head *item,
+		struct list_lru_one *lru, spinlock_t *lru_lock, void *arg)
 {
-	LIST_HEAD(freeable);
-	int nr_scanned;
-	unsigned long reap = 0;
+	struct list_head *freeable = arg;
+	struct inode	*inode = container_of(item, struct inode, i_lru);
 
-	spin_lock(&sb->s_inode_lru_lock);
-	for (nr_scanned = nr_to_scan; nr_scanned >= 0; nr_scanned--) {
-		struct inode *inode;
+	/*
+	 * we are inverting the lru lock/inode->i_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&inode->i_lock))
+		return LRU_SKIP;
 
-		if (list_empty(&sb->s_inode_lru))
-			break;
+	/*
+	 * Referenced or dirty inodes are still in use. Give them another pass
+	 * through the LRU as we cannot reclaim them now.
+	 */
+	if (atomic_read(&inode->i_count) ||
+	    (inode->i_state & ~I_REFERENCED)) {
+		list_lru_isolate(lru, &inode->i_lru);
+		spin_unlock(&inode->i_lock);
+		this_cpu_dec(nr_unused);
+		return LRU_REMOVED;
+	}
 
-		inode = list_entry(sb->s_inode_lru.prev, struct inode, i_lru);
+	/* recently referenced inodes get one more pass */
+	if (inode->i_state & I_REFERENCED) {
+		inode->i_state &= ~I_REFERENCED;
+		spin_unlock(&inode->i_lock);
+		return LRU_ROTATE;
+	}
 
-		/*
-		 * we are inverting the sb->s_inode_lru_lock/inode->i_lock here,
-		 * so use a trylock. If we fail to get the lock, just move the
-		 * inode to the back of the list so we don't spin on it.
-		 */
-		if (!spin_trylock(&inode->i_lock)) {
-			list_move(&inode->i_lru, &sb->s_inode_lru);
-			continue;
+	if (inode_has_buffers(inode) || inode->i_data.nrpages) {
+		__iget(inode);
+		spin_unlock(&inode->i_lock);
+		spin_unlock(lru_lock);
+		if (remove_inode_buffers(inode)) {
+			unsigned long reap;
+			reap = invalidate_mapping_pages(&inode->i_data, 0, -1);
+			if (current_is_kswapd())
+				__count_vm_events(KSWAPD_INODESTEAL, reap);
+			else
+				__count_vm_events(PGINODESTEAL, reap);
+			if (current->reclaim_state)
+				current->reclaim_state->reclaimed_slab += reap;
 		}
+		iput(inode);
+		spin_lock(lru_lock);
+		return LRU_RETRY;
+	}
 
-		/*
-		 * Referenced or dirty inodes are still in use. Give them
-		 * another pass through the LRU as we canot reclaim them now.
-		 */
-		if (atomic_read(&inode->i_count) ||
-		    (inode->i_state & ~I_REFERENCED)) {
-			list_del_init(&inode->i_lru);
-			spin_unlock(&inode->i_lock);
-			sb->s_nr_inodes_unused--;
-			this_cpu_dec(nr_unused);
-			continue;
-		}
+	WARN_ON(inode->i_state & I_NEW);
+	inode->i_state |= I_FREEING;
+	list_lru_isolate_move(lru, &inode->i_lru, freeable);
+	spin_unlock(&inode->i_lock);
 
-		/* recently referenced inodes get one more pass */
-		if (inode->i_state & I_REFERENCED) {
-			inode->i_state &= ~I_REFERENCED;
-			list_move(&inode->i_lru, &sb->s_inode_lru);
-			spin_unlock(&inode->i_lock);
-			continue;
-		}
-		if (inode_has_buffers(inode) || inode->i_data.nrpages) {
-			__iget(inode);
-			spin_unlock(&inode->i_lock);
-			spin_unlock(&sb->s_inode_lru_lock);
-			if (remove_inode_buffers(inode))
-				reap += invalidate_mapping_pages(&inode->i_data,
-								0, -1);
-			iput(inode);
-			spin_lock(&sb->s_inode_lru_lock);
-
-			if (inode != list_entry(sb->s_inode_lru.next,
-						struct inode, i_lru))
-				continue;	/* wrong inode or list_empty */
-			/* avoid lock inversions with trylock */
-			if (!spin_trylock(&inode->i_lock))
-				continue;
-			if (!can_unuse(inode)) {
-				spin_unlock(&inode->i_lock);
-				continue;
-			}
-		}
-		WARN_ON(inode->i_state & I_NEW);
-		inode->i_state |= I_FREEING;
-		spin_unlock(&inode->i_lock);
+	this_cpu_dec(nr_unused);
+	return LRU_REMOVED;
+}
 
-		list_move(&inode->i_lru, &freeable);
-		sb->s_nr_inodes_unused--;
-		this_cpu_dec(nr_unused);
-	}
-	if (current_is_kswapd())
-		__count_vm_events(KSWAPD_INODESTEAL, reap);
-	else
-		__count_vm_events(PGINODESTEAL, reap);
-	spin_unlock(&sb->s_inode_lru_lock);
-	if (current->reclaim_state)
-		current->reclaim_state->reclaimed_slab += reap;
+/*
+ * Walk the superblock inode LRU for freeable inodes and attempt to free them.
+ * This is called from the superblock shrinker function with a number of inodes
+ * to trim from the LRU. Inodes to be freed are moved to a temporary list and
+ * then are freed outside inode_lock by dispose_list().
+ */
+long prune_icache_sb(struct super_block *sb, struct shrink_control *sc)
+{
+	LIST_HEAD(freeable);
+	long freed;
 
+	KSTAT_PERF_ENTER(shrink_icache);
+	freed = list_lru_shrink_walk(&sb->s_inode_lru, sc,
+				     inode_lru_isolate, &freeable);
 	dispose_list(&freeable);
+	KSTAT_PERF_LEAVE(shrink_icache);
+	return freed;
 }
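prune_icache_sb() keeps its contract of disposing inodes outside the LRU
lock, but the walk itself is now delegated to list_lru_shrink_walk(). For
orientation, a simplified sketch of the caller side, modeled on the mainline
per-superblock shrinker (the real super_cache_scan() also proportions
nr_to_scan between dentries, inodes and fs-private objects):

	static unsigned long super_cache_scan(struct shrinker *shrink,
					      struct shrink_control *sc)
	{
		struct super_block *sb = container_of(shrink,
					struct super_block, s_shrink);
		unsigned long freed;

		if (!(sc->gfp_mask & __GFP_FS))
			return SHRINK_STOP;

		freed  = prune_dcache_sb(sb, sc);	/* dentries first */
		freed += prune_icache_sb(sb, sc);	/* then inodes */
		return freed;
	}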
 
 static void __wait_on_freeing_inode(struct inode *inode);
@@ -1328,6 +1309,56 @@ struct inode *ilookup(struct super_block *sb, unsigned long ino)
 }
 EXPORT_SYMBOL(ilookup);
 
+/**
+ * find_inode_nowait - find an inode in the inode cache
+ * @sb:		super block of file system to search
+ * @hashval:	hash value (usually inode number) to search for
+ * @match:	callback used for comparisons between inodes
+ * @data:	opaque data pointer to pass to @match
+ *
+ * Search for the inode specified by @hashval and @data in the inode
+ * cache, where the helper function @match will return 0 if the inode
+ * does not match, 1 if the inode does match, and -1 if the search
+ * should be stopped.  The @match function must be responsible for
+ * taking the i_lock spin_lock and checking i_state for an inode being
+ * freed or being initialized, and incrementing the reference count
+ * before returning 1.  It also must not sleep, since it is called with
+ * the inode_hash_lock spinlock held.
+ *
+ * This is an even more generalized version of ilookup5() for when the
+ * function must never block --- find_inode() can block in
+ * __wait_on_freeing_inode() --- or when the caller cannot increment
+ * the reference count because the resulting iput() might cause an
+ * inode eviction.  The tradeoff is that the @match function must be
+ * very carefully implemented.
+ */
+struct inode *find_inode_nowait(struct super_block *sb,
+				unsigned long hashval,
+				int (*match)(struct inode *, unsigned long,
+					     void *),
+				void *data)
+{
+	struct hlist_head *head = inode_hashtable + hash(sb, hashval);
+	struct inode *inode, *ret_inode = NULL;
+	int mval;
+
+	spin_lock(&inode_hash_lock);
+	hlist_for_each_entry(inode, head, i_hash) {
+		if (inode->i_sb != sb)
+			continue;
+		mval = match(inode, hashval, data);
+		if (mval == 0)
+			continue;
+		if (mval == 1)
+			ret_inode = inode;
+		goto out;
+	}
+out:
+	spin_unlock(&inode_hash_lock);
+	return ret_inode;
+}
+EXPORT_SYMBOL(find_inode_nowait);
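A sketch of a @match callback honoring the contract above (hypothetical
code; only the locking and refcounting rules are the documented ones):

	/* Hypothetical @match callback: must not sleep, runs under
	 * inode_hash_lock. */
	static int my_match(struct inode *inode, unsigned long hashval,
			    void *data)
	{
		int ret = 0;

		spin_lock(&inode->i_lock);
		if (inode->i_ino == hashval &&
		    !(inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW))) {
			__iget(inode);	/* grab the ref before returning 1 */
			ret = 1;
		}
		spin_unlock(&inode->i_lock);
		return ret;
	}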
+
 int insert_inode_locked(struct inode *inode)
 {
 	struct super_block *sb = inode->i_sb;
@@ -1478,11 +1509,20 @@ static void iput_final(struct inode *inode)
  */
 void iput(struct inode *inode)
 {
-	if (inode) {
-		BUG_ON(inode->i_state & I_CLEAR);
-
-		if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock))
-			iput_final(inode);
+	if (!inode)
+		return;
+	BUG_ON(inode->i_state & I_CLEAR);
+retry:
+	if (atomic_dec_and_lock(&inode->i_count, &inode->i_lock)) {
+		if (inode->i_nlink && (inode->i_state & I_DIRTY_TIME)) {
+			atomic_inc(&inode->i_count);
+			inode->i_state &= ~I_DIRTY_TIME;
+			spin_unlock(&inode->i_lock);
+			trace_writeback_lazytime_iput(inode);
+			mark_inode_dirty_sync(inode);
+			goto retry;
+		}
+		iput_final(inode);
 	}
 }
 EXPORT_SYMBOL(iput);
@@ -1507,6 +1547,8 @@ sector_t bmap(struct inode *inode, sector_t block)
 }
 EXPORT_SYMBOL(bmap);
 
+unsigned __read_mostly relatime_interval = 24*60*60; /* one day */
+
 /*
  * With relative atime, only update atime if the previous atime is
  * earlier than either the ctime or mtime or if at least a day has
@@ -1530,10 +1572,10 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
 		return 1;
 
 	/*
-	 * Is the previous atime value older than a day? If yes,
-	 * update atime:
+	 * Is the previous atime value older than the update interval?
+	 * If yes, update atime:
 	 */
-	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= 24*60*60)
+	if ((long)(now.tv_sec - inode->i_atime.tv_sec) >= relatime_interval)
 		return 1;
 	/*
 	 * Good, we can skip the atime update:
@@ -1541,14 +1583,9 @@ static int relatime_need_update(struct vfsmount *mnt, struct inode *inode,
 	return 0;
 }
 
-/*
- * This does the actual work of updating an inodes time or version.  Must have
- * had called mnt_want_write() before calling this.
- */
-static int update_time(struct inode *inode, struct timespec *time, int flags)
+int generic_update_time(struct inode *inode, struct timespec *time, int flags)
 {
-	if (inode->i_op->update_time)
-		return inode->i_op->update_time(inode, time, flags);
+	int iflags = I_DIRTY_TIME;
 
 	if (flags & S_ATIME)
 		inode->i_atime = *time;
@@ -1558,9 +1595,27 @@ static int update_time(struct inode *inode, struct timespec *time, int flags)
 		inode->i_ctime = *time;
 	if (flags & S_MTIME)
 		inode->i_mtime = *time;
-	mark_inode_dirty_sync(inode);
+
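+	/*
+	 * On a lazytime mount, a pure timestamp update is recorded with
+	 * I_DIRTY_TIME only and written back lazily; without lazytime,
+	 * or when i_version was bumped, the inode must additionally be
+	 * dirtied for real with I_DIRTY_SYNC.
+	 */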
+	if (!(inode->i_sb->s_flags & MS_LAZYTIME) || (flags & S_VERSION))
+		iflags |= I_DIRTY_SYNC;
+	__mark_inode_dirty(inode, iflags);
 	return 0;
 }
+EXPORT_SYMBOL(generic_update_time);
+
+/*
+ * This does the actual work of updating an inode's time or version.  The
+ * caller must have called mnt_want_write() before calling this.
+ */
+static int update_time(struct inode *inode, struct timespec *time, int flags)
+{
+	int (*update_time)(struct inode *, struct timespec *, int);
+
+	update_time = inode->i_op->update_time ? inode->i_op->update_time :
+		generic_update_time;
+
+	return update_time(inode, time, flags);
+}
 
 /**
  *	touch_atime	-	update the access time
@@ -1844,7 +1899,7 @@ void __init inode_init(void)
 					 sizeof(struct inode),
 					 0,
 					 (SLAB_RECLAIM_ACCOUNT|SLAB_PANIC|
-					 SLAB_MEM_SPREAD),
+					 SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					 init_once);
 
 	/* Hash may have been set up in inode_init_early */
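For reference, a minimal sketch of a match callback satisfying the
find_inode_nowait() contract above; the callback and caller names are
hypothetical, while the locking rules (take i_lock, check i_state, bump the
refcount before returning 1, never sleep) follow the kernel-doc in the hunk:

	static int sample_match(struct inode *inode, unsigned long ino,
				void *data)
	{
		if (inode->i_ino != ino)
			return 0;		/* no match, keep scanning */
		spin_lock(&inode->i_lock);
		if (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) {
			spin_unlock(&inode->i_lock);
			return -1;		/* unusable inode, stop */
		}
		__iget(inode);			/* refcount++, under i_lock */
		spin_unlock(&inode->i_lock);
		return 1;			/* matched, reference taken */
	}

	inode = find_inode_nowait(sb, ino, sample_match, NULL);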
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -16,6 +16,7 @@ struct file_system_type;
 struct linux_binprm;
 struct path;
 struct mount;
+struct shrink_control;
 
 /*
  * block_dev.c
@@ -37,6 +38,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait)
 #endif
 
 /*
+ * buffer.c
+ */
+extern void guard_bio_eod(int rw, struct bio *bio);
+
+/*
  * char_dev.c
  */
 extern void __init chrdev_init(void);
@@ -45,8 +51,6 @@ extern void __init chrdev_init(void);
  * namei.c
  */
 extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *);
-extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
-			   const char *, unsigned int, struct path *);
 
 /*
  * namespace.c
@@ -109,6 +113,7 @@ extern int open_check_o_direct(struct file *f);
  * inode.c
  */
 extern spinlock_t inode_sb_list_lock;
+extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc);
 extern void inode_add_lru(struct inode *inode);
 
 /*
@@ -116,7 +121,7 @@ extern void inode_add_lru(struct inode *inode);
  */
 extern void inode_wb_list_del(struct inode *inode);
 
-extern int get_nr_dirty_inodes(void);
+extern long get_nr_dirty_inodes(void);
 extern void evict_inodes(struct super_block *);
 extern int invalidate_inodes(struct super_block *, bool);
 
@@ -125,6 +130,7 @@ extern int invalidate_inodes(struct super_block *, bool);
  */
 extern struct dentry *__d_alloc(struct super_block *, const struct qstr *);
 extern int d_set_mounted(struct dentry *dentry);
+extern long prune_dcache_sb(struct super_block *sb, struct shrink_control *sc);
 
 /*
  * read_write.c
--- a/fs/ioprio.c
+++ b/fs/ioprio.c
@@ -28,6 +28,7 @@
 #include <linux/syscalls.h>
 #include <linux/security.h>
 #include <linux/pid_namespace.h>
+#include <bc/beancounter.h>
 
 int set_task_ioprio(struct task_struct *task, int ioprio)
 {
@@ -68,6 +69,25 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 	kuid_t uid;
 	int ret;
 
+	if (!ve_is_super(get_exec_env())) {
+		if (which == IOPRIO_WHO_UBC)
+			return -EPERM;
+
+		switch (class) {
+			case IOPRIO_CLASS_RT:
+				if (!ve_capable(CAP_SYS_ADMIN))
+					return -EPERM;
+				class = IOPRIO_CLASS_BE;
+				data = 0;
+				break;
+			case IOPRIO_CLASS_IDLE:
+				class = IOPRIO_CLASS_BE;
+				data = IOPRIO_BE_NR - 1;
+				break;
+		}
+		ioprio = IOPRIO_PRIO_VALUE(class, data);
+	}
+
 	switch (class) {
 		case IOPRIO_CLASS_RT:
 			if (!capable(CAP_SYS_ADMIN))
@@ -88,6 +108,12 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 			return -EINVAL;
 	}
 
+	if (which == IOPRIO_WHO_UBC) {
+		if (class != IOPRIO_CLASS_BE)
+			return -ERANGE;
+		return ub_set_ioprio(who, data);
+	}
+
 	ret = -ESRCH;
 	rcu_read_lock();
 	switch (which) {
@@ -123,6 +149,10 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
 				break;
 
 			do_each_thread(g, p) {
+#ifdef CONFIG_VE
+				if (p->task_ve != get_exec_env())
+					continue;
+#endif
 				if (!uid_eq(task_uid(p), uid))
 					continue;
 				ret = set_task_ioprio(p, ioprio);
@@ -149,8 +179,10 @@ static int get_task_ioprio(struct task_struct *p)
 	if (ret)
 		goto out;
 	ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+	task_lock(p);
 	if (p->io_context)
 		ret = p->io_context->ioprio;
+	task_unlock(p);
 out:
 	return ret;
 }
@@ -220,6 +252,10 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
 				break;
 
 			do_each_thread(g, p) {
+#ifdef CONFIG_VE
+				if (p->task_ve != get_exec_env())
+					continue;
+#endif
 				if (!uid_eq(task_uid(p), user->uid))
 					continue;
 				tmpio = get_task_ioprio(p);
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -98,7 +98,7 @@ static int init_inodecache(void)
 	isofs_inode_cachep = kmem_cache_create("isofs_inode_cache",
 					sizeof(struct iso_inode_info),
 					0, (SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD),
+					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					init_once);
 	if (isofs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/jbd/transaction.c
+++ b/fs/jbd/transaction.c
@@ -27,6 +27,7 @@
 #include <linux/highmem.h>
 #include <linux/hrtimer.h>
 #include <linux/backing-dev.h>
+#include <linux/virtinfo.h>
 
 static void __journal_temp_unlink_buffer(struct journal_head *jh);
 
@@ -98,6 +99,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle)
 		goto out;
 	}
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_JOURNAL, NULL);
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kzalloc(sizeof(*new_transaction), GFP_NOFS);
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -465,7 +465,7 @@ int jbd2_cleanup_journal_tail(journal_t *journal)
 	 * jbd2_cleanup_journal_tail() doesn't get called all that often.
 	 */
 	if (journal->j_flags & JBD2_BARRIER)
-		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
+		blkdev_issue_flush(journal->j_fs_dev, GFP_NOFS, NULL);
 
 	__jbd2_update_log_tail(journal, first_tid, blocknr);
 	return 0;
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -716,10 +716,9 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
 				!tid_gt(tid, journal->j_commit_sequence));
 		read_lock(&journal->j_state_lock);
 	}
-	read_unlock(&journal->j_state_lock);
-
 	if (unlikely(is_journal_aborted(journal)))
 		err = -EIO;
+	read_unlock(&journal->j_state_lock);
 	return err;
 }
 
@@ -1407,11 +1406,12 @@ void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
 /**
  * jbd2_mark_journal_empty() - Mark on disk journal as empty.
  * @journal: The journal to update.
+ * @write_op: block I/O operation to use when writing the journal superblock
  *
  * Update a journal's dynamic superblock fields to show that journal is empty.
  * Write updated superblock to disk waiting for IO to complete.
  */
-static void jbd2_mark_journal_empty(journal_t *journal)
+static void jbd2_mark_journal_empty(journal_t *journal, int write_op)
 {
 	journal_superblock_t *sb = journal->j_superblock;
 
@@ -1429,7 +1429,7 @@ static void jbd2_mark_journal_empty(journal_t *journal)
 	sb->s_start    = cpu_to_be32(0);
 	read_unlock(&journal->j_state_lock);
 
-	jbd2_write_superblock(journal, WRITE_FUA);
+	jbd2_write_superblock(journal, write_op);
 
 	/* Log is no longer empty */
 	write_lock(&journal->j_state_lock);
@@ -1705,7 +1705,13 @@ int jbd2_journal_destroy(journal_t *journal)
 	if (journal->j_sb_buffer) {
 		if (!is_journal_aborted(journal)) {
 			mutex_lock(&journal->j_checkpoint_mutex);
-			jbd2_mark_journal_empty(journal);
+
+			write_lock(&journal->j_state_lock);
+			journal->j_tail_sequence =
+				++journal->j_transaction_sequence;
+			write_unlock(&journal->j_state_lock);
+
+			jbd2_mark_journal_empty(journal, WRITE_FLUSH_FUA);
 			mutex_unlock(&journal->j_checkpoint_mutex);
 		} else
 			err = -EIO;
@@ -1957,7 +1963,7 @@ int jbd2_journal_flush(journal_t *journal)
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
-	jbd2_mark_journal_empty(journal);
+	jbd2_mark_journal_empty(journal, WRITE_FUA);
 	mutex_unlock(&journal->j_checkpoint_mutex);
 	write_lock(&journal->j_state_lock);
 	J_ASSERT(!journal->j_running_transaction);
@@ -2002,7 +2008,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
 	if (write) {
 		/* Lock to make assertions happy... */
 		mutex_lock(&journal->j_checkpoint_mutex);
-		jbd2_mark_journal_empty(journal);
+		jbd2_mark_journal_empty(journal, WRITE_FUA);
 		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
 
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -36,6 +36,9 @@ struct recovery_info
 	int		nr_replays;
 	int		nr_revokes;
 	int		nr_revoke_hits;
+
+	unsigned int		last_log_block;
+	struct buffer_head	*last_commit_bh;
 };
 
 enum passtype {PASS_SCAN, PASS_REVOKE, PASS_REPLAY};
@@ -233,6 +236,71 @@ do {									\
 		var -= ((journal)->j_last - (journal)->j_first);	\
 } while (0)
 
+/*
+ * The 'Raid amnesia' effect protection: https://jira.sw.ru/browse/PSBM-15484
+ *
+ * Some block devices can return different data on read requests from the same
+ * block after a power failure (for example, a mirrored raid that is out of
+ * sync while a resync is in progress). In that case the following situation
+ * is possible:
+ *
+ * Power failure happens after the commit block for transaction 'D' was
+ * issued; on the next boot the first disk will have the commit block, but
+ * the second one will not.
+ * mirror1: journal={Ac-Bc-Cc-Dc }
+ * mirror2: journal={Ac-Bc-Cc-D  }
+ * Now assume that we read from mirror1 and find that 'D' has a valid commit
+ * block, so journal_replay will replay that transaction; but a second power
+ * failure may happen before journal_reset(), so the next journal_replay()
+ * may read from mirror2 and find that 'C' is the last valid transaction.
+ * This results in corruption because we already replayed transaction 'D'.
+ * In order to avoid such ambiguity we perform a 'stabilize write':
+ * 1) Read and rewrite the latest commit block.
+ * 2) Invalidate the next block, in order to guarantee that the journal
+ *    head becomes stable.
+ * Yes, I know the 'stabilize write' approach is ugly, but it is the only
+ * way to run a filesystem on block devices with the 'raid amnesia' effect.
+ */
+static int stabilize_journal_head(journal_t *journal, struct recovery_info *info)
+{
+	struct buffer_head *bh[2] = {NULL, NULL};
+	int err, err2, i;
+
+	if (!info->last_commit_bh)
+		return 0;
+
+	bh[0] = info->last_commit_bh;
+	info->last_commit_bh = NULL;
+
+	err = jread(&bh[1], journal, info->last_log_block);
+	if (err)
+		goto out;
+
+	for (i = 0; i < 2; i++) {
+		lock_buffer(bh[i]);
+		/* Explicitly invalidate block beyond last commit block */
+		if (i == 1)
+			memset(bh[i]->b_data, 0, journal->j_blocksize);
+
+		BUFFER_TRACE(bh[i], "marking dirty");
+		set_buffer_uptodate(bh[i]);
+		mark_buffer_dirty(bh[i]);
+		BUFFER_TRACE(bh[i], "marking uptodate");
+		unlock_buffer(bh[i]);
+	}
+	err = sync_blockdev(journal->j_dev);
+	/* Make sure data is on permanent storage */
+	if (journal->j_flags & JBD2_BARRIER) {
+		err2 = blkdev_issue_flush(journal->j_dev, GFP_KERNEL, NULL);
+		if (!err)
+			err = err2;
+	}
+out:
+	brelse(bh[0]);
+	brelse(bh[1]);
+	return err;
+}
+
 /**
  * jbd2_journal_recover - recovers a on-disk journal
  * @journal: the journal to recover
@@ -270,6 +338,8 @@ int jbd2_journal_recover(journal_t *journal)
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
 	if (!err)
+		err = stabilize_journal_head(journal, &info);
+	if (!err)
 		err = do_one_pass(journal, &info, PASS_REVOKE);
 	if (!err)
 		err = do_one_pass(journal, &info, PASS_REPLAY);
@@ -319,6 +389,7 @@ int jbd2_journal_skip_recovery(journal_t *journal)
 	memset (&info, 0, sizeof(info));
 
 	err = do_one_pass(journal, &info, PASS_SCAN);
+	brelse(info.last_commit_bh);
 
 	if (err) {
 		printk(KERN_ERR "JBD2: error %d scanning journal\n", err);
@@ -422,6 +493,7 @@ static int do_one_pass(journal_t *journal,
 {
 	unsigned int		first_commit_ID, next_commit_ID;
 	unsigned long		next_log_block;
+	unsigned long		last_commit_block;
 	int			err, success = 0;
 	journal_superblock_t *	sb;
 	journal_header_t *	tmp;
@@ -442,6 +514,7 @@ static int do_one_pass(journal_t *journal,
 	sb = journal->j_superblock;
 	next_commit_ID = be32_to_cpu(sb->s_sequence);
 	next_log_block = be32_to_cpu(sb->s_start);
+	last_commit_block = 0;
 
 	first_commit_ID = next_commit_ID;
 	if (pass == PASS_SCAN)
@@ -758,7 +831,9 @@ static int do_one_pass(journal_t *journal,
 					break;
 				}
 			}
-			brelse(bh);
+			brelse(info->last_commit_bh);
+			info->last_commit_bh = bh;
+			info->last_log_block = next_log_block;
 			next_commit_ID++;
 			continue;
 
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -29,6 +29,7 @@
 #include <linux/backing-dev.h>
 #include <linux/bug.h>
 #include <linux/module.h>
+#include <linux/virtinfo.h>
 
 #include <trace/events/jbd2.h>
 
@@ -294,6 +295,8 @@ static int start_this_handle(journal_t *journal, handle_t *handle,
 		return -ENOSPC;
 	}
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_JOURNAL, NULL);
+
 alloc_transaction:
 	if (!journal->j_running_transaction) {
 		new_transaction = kmem_cache_zalloc(transaction_cache,
@@ -1103,6 +1106,7 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 		JBUFFER_TRACE(jh, "file as BJ_Reserved");
 		spin_lock(&journal->j_list_lock);
 		__jbd2_journal_file_buffer(jh, transaction, BJ_Reserved);
+		spin_unlock(&journal->j_list_lock);
 	} else if (jh->b_transaction == journal->j_committing_transaction) {
 		/* first access by this transaction */
 		jh->b_modified = 0;
@@ -1110,8 +1114,8 @@ int jbd2_journal_get_create_access(handle_t *handle, struct buffer_head *bh)
 		JBUFFER_TRACE(jh, "set next transaction");
 		spin_lock(&journal->j_list_lock);
 		jh->b_next_transaction = transaction;
+		spin_unlock(&journal->j_list_lock);
 	}
-	spin_unlock(&journal->j_list_lock);
 	jbd_unlock_bh_state(bh);
 
 	/*
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -243,9 +243,10 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 	case ACL_TYPE_ACCESS:
 		xprefix = JFFS2_XPREFIX_ACL_ACCESS;
 		if (acl) {
-			umode_t mode = inode->i_mode;
-			rc = posix_acl_equiv_mode(acl, &mode);
-			if (rc < 0)
+			umode_t mode;
+
+			rc = posix_acl_update_mode(inode, &mode, &acl);
+			if (rc)
 				return rc;
 			if (inode->i_mode != mode) {
 				struct iattr attr;
@@ -257,8 +258,6 @@ static int jffs2_set_acl(struct inode *inode, int type, struct posix_acl *acl)
 				if (rc < 0)
 					return rc;
 			}
-			if (rc == 0)
-				acl = NULL;
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
--- a/fs/jffs2/super.c
+++ b/fs/jffs2/super.c
@@ -386,7 +386,7 @@ static int __init init_jffs2_fs(void)
 	jffs2_inode_cachep = kmem_cache_create("jffs2_i",
 					     sizeof(struct jffs2_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     jffs2_i_init_once);
 	if (!jffs2_inode_cachep) {
 		pr_err("error: Failed to initialise inode cache\n");
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -38,7 +38,7 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
 		return rc;
 
 	mutex_lock(&inode->i_mutex);
-	if (!(inode->i_state & I_DIRTY) ||
+	if (!(inode->i_state & I_DIRTY_ALL) ||
 	    (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) {
 		/* Make sure committed changes hit the disk */
 		jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1);
--- a/fs/jfs/super.c
+++ b/fs/jfs/super.c
@@ -878,7 +878,7 @@ static int __init init_jfs_fs(void)
 
 	jfs_inode_cachep =
 	    kmem_cache_create("jfs_ip", sizeof(struct jfs_inode_info), 0,
-			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			    SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 			    init_once);
 	if (jfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -962,7 +962,7 @@ int generic_file_fsync(struct file *file, loff_t start, loff_t end,
 
 	mutex_lock(&inode->i_mutex);
 	ret = sync_mapping_buffers(inode->i_mapping);
-	if (!(inode->i_state & I_DIRTY))
+	if (!(inode->i_state & I_DIRTY_ALL))
 		goto out;
 	if (datasync && !(inode->i_state & I_DIRTY_DATASYNC))
 		goto out;
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -135,6 +135,9 @@
 
 #include <asm/uaccess.h>
 
+#include <bc/beancounter.h>
+#include <bc/misc.h>
+
 #define IS_POSIX(fl)	(fl->fl_flags & FL_POSIX)
 #define IS_FLOCK(fl)	(fl->fl_flags & FL_FLOCK)
 #define IS_LEASE(fl)	(fl->fl_flags & (FL_LEASE|FL_DELEG|FL_LAYOUT))
@@ -211,10 +214,26 @@ static void locks_init_lock_heads(struct file_lock *fl)
 }
 
 /* Allocate an empty lock structure. */
-struct file_lock *locks_alloc_lock(void)
+struct file_lock *locks_alloc_lock(int charge)
 {
-	struct file_lock *fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
+	struct file_lock *fl;
+
+	fl = kmem_cache_zalloc(filelock_cache, GFP_KERNEL);
+#ifdef CONFIG_BEANCOUNTERS
+	if (fl == NULL)
+		goto out;
+	fl->fl_ub = get_beancounter(get_exec_ub());
+	fl->fl_charged = 0;
+	if (!charge)
+		goto out;
+	if (!ub_flock_charge(fl, 1))
+		goto out;
 
+	put_beancounter(fl->fl_ub);
+	kmem_cache_free(filelock_cache, fl);
+	fl = NULL;
+out:
+#endif
 	if (fl)
 		locks_init_lock_heads(fl);
 
@@ -241,7 +260,11 @@ void locks_free_lock(struct file_lock *fl)
 	BUG_ON(!list_empty(&fl->fl_block));
 	BUG_ON(!hlist_unhashed(&fl->fl_link));
 
+	ub_flock_uncharge(fl);
 	locks_release_private(fl);
+#ifdef CONFIG_BEANCOUNTERS
+	put_beancounter(fl->fl_ub);
+#endif
 	kmem_cache_free(filelock_cache, fl);
 }
 EXPORT_SYMBOL(locks_free_lock);
@@ -319,7 +342,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock,
 	if (type < 0)
 		return type;
 	
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lock(type != F_UNLCK);
 	if (fl == NULL)
 		return -ENOMEM;
 
@@ -480,7 +503,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl)
 /* Allocate a file_lock initialised to this type of lease */
 static struct file_lock *lease_alloc(struct file *filp, long type)
 {
-	struct file_lock *fl = locks_alloc_lock();
+	struct file_lock *fl = locks_alloc_lock(1);
 	int error = -ENOMEM;
 
 	if (fl == NULL)
@@ -835,7 +858,12 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 	int found = 0;
 
 	if (!(request->fl_flags & FL_ACCESS) && (request->fl_type != F_UNLCK)) {
-		new_fl = locks_alloc_lock();
+		/*
+		 * A non-F_UNLCK request must already have been charged in
+		 * flock_make_lock(). Strictly speaking new_fl must be charged,
+		 * not the request, but we try to fail earlier.
+		 */
+		new_fl = locks_alloc_lock(0);
 		if (!new_fl)
 			return -ENOMEM;
 	}
@@ -865,16 +893,6 @@ static int flock_lock_inode(struct inode *inode, struct file_lock *request)
 		goto out;
 	}
 
-	/*
-	 * If a higher-priority process was blocked on the old file lock,
-	 * give it the opportunity to lock the file.
-	 */
-	if (found) {
-		spin_unlock(&inode->i_lock);
-		cond_resched();
-		spin_lock(&inode->i_lock);
-	}
-
 find_conflict:
 	for_each_lock(inode, before) {
 		struct file_lock *fl = *before;
@@ -893,6 +911,10 @@ find_conflict:
 	}
 	if (request->fl_flags & FL_ACCESS)
 		goto out;
+
+	set_flock_charged(new_fl);
+	unset_flock_charged(request);
+
 	locks_copy_lock(new_fl, request);
 	locks_insert_lock(before, new_fl);
 	new_fl = NULL;
@@ -925,8 +947,11 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	if (!(request->fl_flags & FL_ACCESS) &&
 	    (request->fl_type != F_UNLCK ||
 	     request->fl_start != 0 || request->fl_end != OFFSET_MAX)) {
-		new_fl = locks_alloc_lock();
-		new_fl2 = locks_alloc_lock();
+		if (request->fl_type != F_UNLCK)
+			new_fl = locks_alloc_lock(1);
+		else
+			new_fl = NULL;
+		new_fl2 = locks_alloc_lock(0);
 	}
 
 	spin_lock(&inode->i_lock);
@@ -1070,7 +1095,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	 * done below this, so it's safe yet to bail out.
 	 */
 	error = -ENOLCK; /* "no luck" */
-	if (right && left == right && !new_fl2)
+	if (right && left == right && !(request->fl_type == F_UNLCK || new_fl2))
 		goto out;
 
 	error = 0;
@@ -1081,23 +1106,32 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 			goto out;
 		}
 
-		if (!new_fl) {
-			error = -ENOLCK;
+		error = -ENOLCK;
+		if (!new_fl)
+			goto out;
+		if (right && (left == right) && ub_flock_charge(new_fl, 1))
 			goto out;
-		}
 		locks_copy_lock(new_fl, request);
 		locks_insert_lock(before, new_fl);
 		new_fl = NULL;
+		error = 0;
 	}
 	if (right) {
 		if (left == right) {
 			/* The new lock breaks the old one in two pieces,
 			 * so we have to use the second new lock.
 			 */
+			error = -ENOLCK;
+			if (added && ub_flock_charge(new_fl2,
+						request->fl_type != F_UNLCK))
+				goto out;
+			/* FIXME move all fl_charged manipulations in ub code */
+			set_flock_charged(new_fl2);
 			left = new_fl2;
 			new_fl2 = NULL;
 			locks_copy_lock(left, right);
 			locks_insert_lock(before, left);
+			error = 0;
 		}
 		right->fl_start = request->fl_end + 1;
 		locks_wake_up_blocks(right);
@@ -1524,8 +1558,9 @@ static int
 generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
-	struct dentry *dentry = filp->f_path.dentry;
-	struct inode *inode = file_inode(filp);
+	struct dentry *dentry = filp->f_original_path.mnt ?
+		filp->f_original_path.dentry: filp->f_path.dentry;
+	struct inode *inode = filp->f_path.dentry->d_inode;
 	bool is_deleg = (*flp)->fl_flags & FL_DELEG;
 	int error;
 
@@ -1655,7 +1690,7 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp,
 	struct inode *inode = file_inode(filp);
 	int error;
 
-	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !capable(CAP_LEASE))
+	if ((!uid_eq(current_fsuid(), inode->i_uid)) && !ve_capable(CAP_LEASE))
 		return -EACCES;
 	if (!S_ISREG(inode->i_mode))
 		return -EINVAL;
@@ -2029,7 +2064,7 @@ static int do_lock_file_wait(struct file *filp, unsigned int cmd,
 int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 		struct flock __user *l)
 {
-	struct file_lock *file_lock = locks_alloc_lock();
+	struct file_lock *file_lock = locks_alloc_lock(0);
 	struct flock flock;
 	struct inode *inode;
 	struct file *f;
@@ -2147,7 +2182,7 @@ out:
 int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 		struct flock64 __user *l)
 {
-	struct file_lock *file_lock = locks_alloc_lock();
+	struct file_lock *file_lock = locks_alloc_lock(0);
 	struct flock64 flock;
 	struct inode *inode;
 	struct file *f;
@@ -2464,6 +2499,7 @@ void show_fd_locks(struct seq_file *f,
 		 * matches ->fl_file.
 		 */
 		if (fl->fl_owner != files &&
+		    fl->fl_owner != (fl_owner_t)filp &&
 		    fl->fl_owner != NULL)
 			continue;
 
@@ -2520,7 +2556,7 @@ static const struct file_operations proc_locks_operations = {
 
 static int __init proc_locks_init(void)
 {
-	proc_create("locks", 0, NULL, &proc_locks_operations);
+	proc_create("locks", S_ISVTX, NULL, &proc_locks_operations);
 	return 0;
 }
 module_init(proc_locks_init);
@@ -2611,7 +2647,7 @@ static int __init filelock_init(void)
 	int i;
 
 	filelock_cache = kmem_cache_create("file_lock_cache",
-			sizeof(struct file_lock), 0, SLAB_PANIC, NULL);
+			sizeof(struct file_lock), 0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
 	lg_lock_init(&file_lock_lglock, "file_lock_lglock");
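
The charge argument threads per-beancounter accounting through lock
allocation. A sketch of the calling convention, following the call sites
above (the request pointer is hypothetical): charge when the request will
actually insert a lock, not for unlock or query-only allocations;
locks_free_lock() uncharges and drops the beancounter reference either way.

	fl = locks_alloc_lock(request->fl_type != F_UNLCK);
	if (fl == NULL)
		return -ENOMEM;	/* allocation or beancounter charge failed */
	/* ... use the lock ... */
	locks_free_lock(fl);	/* ub_flock_uncharge() + put_beancounter() */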
 
--- a/fs/logfs/inode.c
+++ b/fs/logfs/inode.c
@@ -408,7 +408,8 @@ const struct super_operations logfs_super_operations = {
 int logfs_init_inode_cache(void)
 {
 	logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
-			sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
+			sizeof(struct logfs_inode), 0,
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
 			logfs_init_once);
 	if (!logfs_inode_cache)
 		return -ENOMEM;
--- a/fs/mbcache.c
+++ b/fs/mbcache.c
@@ -86,18 +86,6 @@ static LIST_HEAD(mb_cache_list);
 static LIST_HEAD(mb_cache_lru_list);
 static DEFINE_SPINLOCK(mb_cache_spinlock);
 
-/*
- * What the mbcache registers as to get shrunk dynamically.
- */
-
-static int mb_cache_shrink_fn(struct shrinker *shrink,
-			      struct shrink_control *sc);
-
-static struct shrinker mb_cache_shrinker = {
-	.shrink = mb_cache_shrink_fn,
-	.seeks = DEFAULT_SEEKS,
-};
-
 static inline int
 __mb_cache_entry_is_hashed(struct mb_cache_entry *ce)
 {
@@ -151,7 +139,7 @@ forget:
 
 
 /*
- * mb_cache_shrink_fn()  memory pressure callback
+ * mb_cache_shrink_scan()  memory pressure callback
  *
  * This function is called by the kernel memory management when memory
  * gets low.
@@ -159,17 +147,16 @@ forget:
  * @shrink: (ignored)
  * @sc: shrink_control passed from reclaim
  *
- * Returns the number of objects which are present in the cache.
+ * Returns the number of objects freed.
  */
-static int
-mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+mb_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	LIST_HEAD(free_list);
-	struct mb_cache *cache;
 	struct mb_cache_entry *entry, *tmp;
-	int count = 0;
 	int nr_to_scan = sc->nr_to_scan;
 	gfp_t gfp_mask = sc->gfp_mask;
+	unsigned long freed = 0;
 
 	mb_debug("trying to free %d entries", nr_to_scan);
 	spin_lock(&mb_cache_spinlock);
@@ -179,19 +166,37 @@ mb_cache_shrink_fn(struct shrinker *shrink, struct shrink_control *sc)
 				   struct mb_cache_entry, e_lru_list);
 		list_move_tail(&ce->e_lru_list, &free_list);
 		__mb_cache_entry_unhash(ce);
+		freed++;
+	}
+	spin_unlock(&mb_cache_spinlock);
+	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
+		__mb_cache_entry_forget(entry, gfp_mask);
 	}
+	return freed;
+}
+
+static unsigned long
+mb_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	struct mb_cache *cache;
+	unsigned long count = 0;
+
+	spin_lock(&mb_cache_spinlock);
 	list_for_each_entry(cache, &mb_cache_list, c_cache_list) {
 		mb_debug("cache %s (%d)", cache->c_name,
 			  atomic_read(&cache->c_entry_count));
 		count += atomic_read(&cache->c_entry_count);
 	}
 	spin_unlock(&mb_cache_spinlock);
-	list_for_each_entry_safe(entry, tmp, &free_list, e_lru_list) {
-		__mb_cache_entry_forget(entry, gfp_mask);
-	}
-	return (count / 100) * sysctl_vfs_cache_pressure;
+
+	return vfs_pressure_ratio(count);
 }
 
+static struct shrinker mb_cache_shrinker = {
+	.count_objects = mb_cache_shrink_count,
+	.scan_objects = mb_cache_shrink_scan,
+	.seeks = DEFAULT_SEEKS,
+};
 
 /*
  * mb_cache_create()  create a new cache
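
The conversion above follows the split shrinker API: ->count_objects() only
reports how many entries could be freed, while ->scan_objects() frees up to
sc->nr_to_scan of them and returns the number freed, or SHRINK_STOP when it
cannot make progress. A generic sketch of the pattern (the cache name,
counter, and prune helper are hypothetical):

	static unsigned long my_cache_count(struct shrinker *shrink,
					    struct shrink_control *sc)
	{
		/* report freeable objects; must not free anything here */
		return vfs_pressure_ratio(atomic_read(&my_cache_nr_entries));
	}

	static unsigned long my_cache_scan(struct shrinker *shrink,
					   struct shrink_control *sc)
	{
		if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
			return SHRINK_STOP;	/* wrong reclaim context */
		return my_cache_prune(sc->nr_to_scan);	/* objects freed */
	}

	static struct shrinker my_shrinker = {
		.count_objects	= my_cache_count,
		.scan_objects	= my_cache_scan,
		.seeks		= DEFAULT_SEEKS,
	};
	/* register_shrinker(&my_shrinker) once at init time */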
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -91,7 +91,7 @@ static int init_inodecache(void)
 	minix_inode_cachep = kmem_cache_create("minix_inode_cache",
 					     sizeof(struct minix_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (minix_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/mount.h
+++ b/fs/mount.h
@@ -10,7 +10,7 @@ struct mnt_namespace {
 	struct user_namespace	*user_ns;
 	u64			seq;	/* Sequence number to prevent loops */
 	wait_queue_head_t poll;
-	u64 event;
+	int event;
 };
 
 struct mnt_pcp {
@@ -18,12 +18,6 @@ struct mnt_pcp {
 	int mnt_writers;
 };
 
-struct mountpoint {
-	struct list_head m_hash;
-	struct dentry *m_dentry;
-	int m_count;
-};
-
 struct mount {
 	struct list_head mnt_hash;
 	struct mount *mnt_parent;
@@ -46,7 +40,7 @@ struct mount {
 	struct list_head mnt_slave;	/* slave list entry */
 	struct mount *mnt_master;	/* slave is on master->mnt_slave_list */
 	struct mnt_namespace *mnt_ns;	/* containing namespace */
-	struct mountpoint *mnt_mp;	/* where is it mounted */
+	struct mountpoint *mnt_mp;      /* where is it mounted */
 #ifdef CONFIG_FSNOTIFY
 	struct hlist_head mnt_fsnotify_marks;
 	__u32 mnt_fsnotify_mask;
@@ -76,6 +70,8 @@ static inline int is_mounted(struct vfsmount *mnt)
 	return !IS_ERR_OR_NULL(real_mount(mnt));
 }
 
+extern struct rw_semaphore namespace_sem;
+
 extern struct mount *__lookup_mnt(struct vfsmount *, struct dentry *, int);
 
 static inline void get_mnt_ns(struct mnt_namespace *ns)
--- a/fs/mpage.c
+++ b/fs/mpage.c
@@ -28,6 +28,7 @@
 #include <linux/backing-dev.h>
 #include <linux/pagevec.h>
 #include <linux/cleancache.h>
+#include "internal.h"
 
 /*
  * I/O completion handler for multipage BIOs.
@@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err)
 static struct bio *mpage_bio_submit(int rw, struct bio *bio)
 {
 	bio->bi_end_io = mpage_end_io;
+	guard_bio_eod(rw, bio);
 	submit_bio(rw, bio);
 	return NULL;
 }
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -34,6 +34,7 @@
 #include <linux/device_cgroup.h>
 #include <linux/fs_struct.h>
 #include <linux/posix_acl.h>
+#include <linux/ve.h>
 #include <asm/uaccess.h>
 
 #include "internal.h"
@@ -888,7 +889,7 @@ static int may_linkat(struct path *link)
 	 * otherwise, it must be a safe source.
 	 */
 	if (uid_eq(cred->fsuid, inode->i_uid) || safe_hardlink_source(inode) ||
-	    capable(CAP_FOWNER))
+	    ve_capable(CAP_FOWNER))
 		return 0;
 
 	audit_log_link_denied("linkat", link);
@@ -1322,6 +1323,12 @@ static int follow_dotdot(struct nameidata *nd)
 		    nd->path.mnt == nd->root.mnt) {
 			break;
 		}
+#ifdef CONFIG_VE
+		if (nd->path.dentry == get_exec_env()->root_path.dentry &&
+			nd->path.mnt == get_exec_env()->root_path.mnt) {
+			break;
+		}
+#endif
 		if (nd->path.dentry != nd->path.mnt->mnt_root) {
 			/* rare case of legitimate dget_parent()... */
 			nd->path.dentry = dget_parent(nd->path.dentry);
@@ -2596,7 +2603,8 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
 		return -EPERM;
 
 	if (check_sticky(dir, inode) || IS_APPEND(inode) ||
-	    IS_IMMUTABLE(inode) || IS_SWAPFILE(inode))
+	    IS_IMMUTABLE(inode) ||
+	    (IS_SWAPFILE(inode) && inode->i_nlink == 1))
 		return -EPERM;
 	if (isdir) {
 		if (!d_is_dir(victim))
@@ -3514,7 +3522,7 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 	if (error)
 		return error;
 
-	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !capable(CAP_MKNOD))
+	if ((S_ISCHR(mode) || S_ISBLK(mode)) && !ve_capable(CAP_MKNOD))
 		return -EPERM;
 
 	if (!dir->i_op->mknod)
@@ -4078,7 +4086,7 @@ SYSCALL_DEFINE5(linkat, int, olddfd, const char __user *, oldname,
 	 * handlink using the passed filedescriptor.
 	 */
 	if (flags & AT_EMPTY_PATH) {
-		if (!capable(CAP_DAC_READ_SEARCH))
+		if (!ve_capable(CAP_DAC_READ_SEARCH))
 			return -ENOENT;
 		how = LOOKUP_EMPTY;
 	}
@@ -4339,7 +4347,7 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
 	    (flags & RENAME_EXCHANGE))
 		return -EINVAL;
 
-	if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD))
+	if ((flags & RENAME_WHITEOUT) && !ve_capable(CAP_MKNOD))
 		return -EPERM;
 
 retry:
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -23,6 +23,7 @@
 #include <linux/uaccess.h>
 #include <linux/proc_ns.h>
 #include <linux/magic.h>
+#include <linux/ve.h>
 #include "pnode.h"
 #include "internal.h"
 
@@ -39,7 +40,7 @@ static int mnt_group_start = 1;
 static struct list_head mount_hashtable[HASH_SIZE];
 static struct list_head mountpoint_hashtable[HASH_SIZE];
 static struct kmem_cache *mnt_cache __read_mostly;
-static struct rw_semaphore namespace_sem;
+struct rw_semaphore namespace_sem;
 
 /* /sys/fs */
 struct kobject *fs_kobj;
@@ -164,7 +165,16 @@ unsigned int mnt_get_count(struct mount *mnt)
 
 static struct mount *alloc_vfsmnt(const char *name)
 {
-	struct mount *mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
+	struct mount *mnt;
+
+	if (!ve_mount_allowed()) {
+		pr_warn_ratelimited(
+			"CT#%s reached the limit on mounts.\n",
+			ve_name(get_exec_env()));
+		return NULL;
+	}
+
+	mnt = kmem_cache_zalloc(mnt_cache, GFP_KERNEL);
 	if (mnt) {
 		int err;
 
@@ -173,7 +183,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 			goto out_free_cache;
 
 		if (name) {
-			mnt->mnt_devname = kstrdup(name, GFP_KERNEL);
+			mnt->mnt_devname = kstrdup(name, GFP_KERNEL_ACCOUNT);
 			if (!mnt->mnt_devname)
 				goto out_free_id;
 		}
@@ -201,6 +211,7 @@ static struct mount *alloc_vfsmnt(const char *name)
 		INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks);
 #endif
 	}
+	ve_mount_nr_inc();
 	return mnt;
 
 #ifdef CONFIG_SMP
@@ -541,6 +552,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+	ve_mount_nr_dec();
 	kfree(mnt->mnt_devname);
 	mnt_free_id(mnt);
 #ifdef CONFIG_SMP
@@ -1279,6 +1291,8 @@ static int do_umount(struct mount *mnt, int flags)
 		 * Special case for "unmounting" root ...
 		 * we just try to remount it readonly.
 		 */
+		if (!ve_capable(CAP_SYS_ADMIN))
+			return -EPERM;
 		down_write(&sb->s_umount);
 		if (!(sb->s_flags & MS_RDONLY))
 			retval = do_remount_sb(sb, MS_RDONLY, NULL, 0);
@@ -1859,6 +1873,154 @@ static int change_mount_flags(struct vfsmount *mnt, int ms_flags)
 	return error;
 }
 
+#ifdef CONFIG_VE
+/*
+ * Returns the first occurrence of needle in haystack, where entries are
+ * separated by sep, or NULL if not found.
+ */
+static char *strstr_separated(char *haystack, char *needle, char sep)
+{
+	int needle_len = strlen(needle);
+
+	while (haystack) {
+		if (!strncmp(haystack, needle, needle_len) &&
+		    (haystack[needle_len] == 0 || /* end-of-line or */
+		     haystack[needle_len] == sep)) /* separator */
+			return haystack;
+
+		haystack = strchr(haystack, sep);
+		if (haystack)
+			haystack++;
+	}
+
+	return NULL;
+}
+
+static int ve_devmnt_check(char *options, char *allowed)
+{
+	char *p;
+
+	if (!options || !*options)
+		return 0;
+
+	if (!allowed)
+		return -EPERM;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		if (!*p)
+			continue;
+
+		if (!strstr_separated(allowed, p, ','))
+			return -EPERM;
+	}
+
+	return 0;
+}
+
+static int ve_devmnt_insert(char *options, char *hidden)
+{
+	int options_len;
+	int hidden_len;
+
+	if (!hidden)
+		return 0;
+
+	if (!options)
+		return -EAGAIN;
+
+	options_len = strlen(options);
+	hidden_len = strlen(hidden);
+
+	if (hidden_len + options_len + 2 > PAGE_SIZE)
+		return -EPERM;
+
+	memmove(options + hidden_len + 1, options, options_len);
+	memcpy(options, hidden, hidden_len);
+
+	options[hidden_len] = ',';
+	options[hidden_len + options_len + 1] = 0;
+
+	return 0;
+}
+
+int ve_devmnt_process(struct ve_struct *ve, dev_t dev, void **data_pp, int remount)
+{
+	void *data = *data_pp;
+	struct ve_devmnt *devmnt;
+	int err;
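+	/* err == 1 is a sentinel: no devmnt rule matched this device */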
+again:
+	err = 1;
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(devmnt, &ve->devmnt_list, link) {
+		if (devmnt->dev == dev) {
+			err = ve_devmnt_check(data, devmnt->allowed_options);
+
+			/*
+			 * If @is_pseudosuper is set, i.e. during the restore
+			 * procedure, we skip the allowed-options filtering,
+			 * since restore mode is special.
+			 */
+			if ((ve->is_pseudosuper || !err) && !remount)
+				err = ve_devmnt_insert(data, devmnt->hidden_options);
+
+			break;
+		}
+	}
+	mutex_unlock(&ve->devmnt_mutex);
+
+	switch (err) {
+	case -EAGAIN:
+		if (!(data = (void *)__get_free_page(GFP_KERNEL)))
+			return -ENOMEM;
+		*(char *)data = 0; /* the string must be zero-terminated */
+		goto again;
+	case 1:
+		if (*data_pp) {
+			/*
+			 * Same as in chunk above but for case where
+			 * ve->devmnt_list is empty. Depending on
+			 * the way userspace tool restore container
+			 * it might be nonempty as well.
+			 */
+			if (ve->is_pseudosuper) {
+				err = 0;
+			} else {
+				ve_pr_warn_ratelimited(VE_LOG_BOTH, "VE%s: no allowed "
+					  "mount options found for device %u:%u\n",
+					  ve->ve_name, MAJOR(dev), MINOR(dev));
+				err = -EPERM;
+			}
+		} else
+			err = 0;
+		break;
+	case 0:
+		*data_pp = data;
+		break;
+	}
+
+	if (data && data != *data_pp)
+		free_page((unsigned long)data);
+
+	return err;
+}
+#endif
+
+static int do_check_and_remount_sb(struct super_block *sb, int flags, void *data)
+{
+#ifdef CONFIG_VE
+	struct ve_struct *ve = get_exec_env();
+
+	if (sb->s_bdev && data && !ve_is_super(ve)) {
+		int err;
+
+		err = ve_devmnt_process(ve, sb->s_bdev->bd_dev, &data, 1);
+		if (err)
+			return err;
+	}
+#endif
+	return do_remount_sb(sb, flags, data, 0);
+}
+
 /*
  * change filesystem flags. dir should be a physical root of filesystem.
  * If you've mounted a non-root directory somewhere and want to do remount
@@ -1884,13 +2046,13 @@ static int do_remount(struct path *path, int flags, int mnt_flags,
 	down_write(&sb->s_umount);
 	if (flags & MS_BIND)
 		err = change_mount_flags(path->mnt, flags);
-	else if (!capable(CAP_SYS_ADMIN))
+	else if (!ve_capable(CAP_SYS_ADMIN))
 		err = -EPERM;
 	else
-		err = do_remount_sb(sb, flags, data, 0);
+		err = do_check_and_remount_sb(sb, flags, data);
 	if (!err) {
 		br_write_lock(&vfsmount_lock);
-		mnt_flags |= mnt->mnt.mnt_flags & MNT_PROPAGATION_MASK;
+		mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
 		mnt->mnt.mnt_flags = mnt_flags;
 		br_write_unlock(&vfsmount_lock);
 	}
@@ -2069,7 +2231,7 @@ static int do_new_mount(struct path *path, const char *fstype, int flags,
 	if (!type)
 		return -ENODEV;
 
-	if (user_ns != &init_user_ns) {
+	if (user_ns != ve_init_user_ns()) {
 		if (!(type->fs_flags & FS_USERNS_MOUNT)) {
 			put_filesystem(type);
 			return -EPERM;
@@ -2538,12 +2700,6 @@ struct mnt_namespace *copy_mnt_ns(unsigned long flags, struct mnt_namespace *ns,
 	if (!(flags & CLONE_NEWNS))
 		return ns;
 
-	/* Unprivileged creation currently disabled in RHEL7  */
-	if (!capable(CAP_SYS_ADMIN)) {
-		put_mnt_ns(ns);
-		return ERR_PTR(-EPERM);
-	}
-
 	new_ns = dup_mnt_ns(ns, user_ns, new_fs);
 
 	put_mnt_ns(ns);
@@ -2820,7 +2976,7 @@ void __init mnt_init(void)
 	init_rwsem(&namespace_sem);
 
 	mnt_cache = kmem_cache_create("mnt_cache", sizeof(struct mount),
-			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+			0, SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
 	printk(KERN_INFO "Mount-cache hash table entries: %lu\n", HASH_SIZE);
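
To illustrate the filter semantics implemented above: each comma-separated
token of the requested mount options must occur as a whole token in the
per-device allowed list; empty options always pass, and non-empty options
with no allowed list are rejected. The option strings below are hypothetical
(strsep() modifies its argument, hence the writable arrays):

	char ok[]  = "noatime,barrier=1";
	char bad[] = "noatime,data=writeback";

	err = ve_devmnt_check(ok,  "noatime,barrier=1,nosuid");  /* 0 */
	err = ve_devmnt_check(bad, "noatime,barrier=1,nosuid");  /* -EPERM */
	err = ve_devmnt_check(NULL, NULL);                       /* 0 */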
 
--- a/fs/ncpfs/inode.c
+++ b/fs/ncpfs/inode.c
@@ -80,7 +80,7 @@ static int init_inodecache(void)
 	ncp_inode_cachep = kmem_cache_create("ncp_inode_cache",
 					     sizeof(struct ncp_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ncp_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1503,7 +1503,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry,
 	dfprintk(VFS, "NFS: atomic_open(%s/%lu), %pd\n",
 			dir->i_sb->s_id, dir->i_ino, dentry);
 
-	err = nfs_check_flags(open_flags);
+	err = nfs_set_flags(file, open_flags);
 	if (err)
 		return err;
 
@@ -2112,11 +2112,13 @@ static void nfs_access_free_list(struct list_head *head)
 	}
 }
 
-int nfs_do_access_cache_shrinker(int nr_to_scan)
+static unsigned long
+nfs_do_access_cache_scan(unsigned int nr_to_scan)
 {
 	LIST_HEAD(head);
 	struct nfs_inode *nfsi, *next;
 	struct nfs_access_entry *cache;
+	long freed = 0;
 
 	spin_lock(&nfs_access_lru_lock);
 	list_for_each_entry_safe(nfsi, next, &nfs_access_lru_list, access_cache_inode_lru) {
@@ -2132,6 +2134,7 @@ int nfs_do_access_cache_shrinker(int nr_to_scan)
 				struct nfs_access_entry, lru);
 		list_move(&cache->lru, &head);
 		rb_erase(&cache->rb_node, &nfsi->access_cache);
+		freed++;
 		if (!list_empty(&nfsi->access_cache_entry_lru))
 			list_move_tail(&nfsi->access_cache_inode_lru,
 					&nfs_access_lru_list);
@@ -2146,18 +2149,24 @@ remove_lru_entry:
 	}
 	spin_unlock(&nfs_access_lru_lock);
 	nfs_access_free_list(&head);
-	return (atomic_long_read(&nfs_access_nr_entries) / 100) * sysctl_vfs_cache_pressure;
+	return freed;
 }
 
-int nfs_access_cache_shrinker(struct shrinker *shrink,
-			      struct shrink_control *sc)
+unsigned long
+nfs_access_cache_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return vfs_pressure_ratio(atomic_long_read(&nfs_access_nr_entries));
+}
+
+unsigned long
+nfs_access_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	int nr_to_scan = sc->nr_to_scan;
 	gfp_t gfp_mask = sc->gfp_mask;
 
 	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-		return (nr_to_scan == 0) ? 0 : -1;
-	return nfs_do_access_cache_shrinker(nr_to_scan);
+		return SHRINK_STOP;
+	return nfs_do_access_cache_scan(nr_to_scan);
 }
 
 static void
@@ -2173,7 +2182,7 @@ nfs_access_cache_enforce_limit(void)
 	diff = nr_entries - nfs_access_max_cachesize;
 	if (diff < nr_to_scan)
 		nr_to_scan = diff;
-	nfs_do_access_cache_shrinker(nr_to_scan);
+	nfs_do_access_cache_scan(nr_to_scan);
 }
 
 static void __nfs_access_zap_cache(struct nfs_inode *nfsi, struct list_head *head)
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -51,6 +51,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/sunrpc/clnt.h>
+#include <linux/virtinfo.h>
 
 #include <asm/uaccess.h>
 #include <linux/atomic.h>
@@ -641,6 +642,8 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, const struct iovec *iov,
 	ssize_t result = -EINVAL;
 	size_t count;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	count = iov_length(iov, nr_segs);
 	nfs_add_stats(mapping->host, NFSIOS_DIRECTREADBYTES, count);
 
@@ -1102,6 +1105,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	loff_t end;
 	size_t count;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	count = iov_length(iov, nr_segs);
 	end = (pos + count - 1) >> PAGE_CACHE_SHIFT;
 
@@ -1127,12 +1132,10 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	if (result)
 		goto out_unlock;
 
-	if (mapping->nrpages) {
-		result = invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_CACHE_SHIFT, end);
-		if (result)
-			goto out_unlock;
-	}
+	result = invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_CACHE_SHIFT, end);
+	if (result)
+		goto out_unlock;
 
 	task_io_account_write(count);
 
@@ -1156,10 +1159,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 
 	result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
 
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_CACHE_SHIFT, end);
-	}
+	invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_CACHE_SHIFT, end);
 
 	mutex_unlock(&inode->i_mutex);
 
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -49,14 +49,14 @@ static const struct vm_operations_struct nfs_file_vm_ops;
 # define IS_SWAPFILE(inode)	(0)
 #endif
 
-int nfs_check_flags(int flags)
+int nfs_set_flags(struct file * filp, int flags)
 {
 	if ((flags & (O_APPEND | O_DIRECT)) == (O_APPEND | O_DIRECT))
 		return -EINVAL;
 
-	return 0;
+	return generic_set_file_flags(filp, flags);
 }
-EXPORT_SYMBOL_GPL(nfs_check_flags);
+EXPORT_SYMBOL_GPL(nfs_set_flags);
 
 /*
  * Open file
@@ -69,7 +69,7 @@ nfs_file_open(struct inode *inode, struct file *filp)
 	dprintk("NFS: open file(%pD2)\n", filp);
 
 	nfs_inc_stats(inode, NFSIOS_VFSOPEN);
-	res = nfs_check_flags(filp->f_flags);
+	res = nfs_set_flags(filp, filp->f_flags);
 	if (res)
 		return res;
 
@@ -636,7 +636,6 @@ out:
 static const struct vm_operations_struct nfs_file_vm_ops = {
 	.fault = filemap_fault,
 	.page_mkwrite = nfs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static int nfs_need_check_write(struct file *filp, struct inode *inode)
@@ -967,7 +966,7 @@ const struct file_operations nfs_file_operations = {
 	.flock		= nfs_flock,
 	.splice_read	= nfs_file_splice_read,
 	.splice_write	= nfs_file_splice_write,
-	.check_flags	= nfs_check_flags,
+	.set_flags	= nfs_set_flags,
 	.setlease	= nfs_setlease,
 };
 EXPORT_SYMBOL_GPL(nfs_file_operations);
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -37,7 +37,6 @@
 #include <linux/nfs_xdr.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
-#include <linux/freezer.h>
 
 #include <asm/uaccess.h>
 
@@ -77,7 +76,7 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
  */
 int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
@@ -1066,10 +1065,11 @@ static int nfs_invalidate_mapping(struct inode *inode, struct address_space *map
 			if (ret < 0)
 				return ret;
 		}
-		ret = invalidate_inode_pages2(mapping);
-		if (ret < 0)
-			return ret;
 	}
+	ret = invalidate_inode_pages2(mapping);
+	if (ret < 0)
+		return ret;
+
 	if (S_ISDIR(inode->i_mode)) {
 		spin_lock(&inode->i_lock);
 		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
@@ -1946,7 +1946,7 @@ static int __init nfs_init_inodecache(void)
 	nfs_inode_cachep = kmem_cache_create("nfs_inode_cache",
 					     sizeof(struct nfs_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (nfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -342,8 +342,10 @@ extern struct nfs_client *nfs_init_client(struct nfs_client *clp,
 			   const char *ip_addr);
 
 /* dir.c */
-extern int nfs_access_cache_shrinker(struct shrinker *shrink,
-					struct shrink_control *sc);
+extern unsigned long nfs_access_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+extern unsigned long nfs_access_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
 extern void nfs_force_use_readdirplus(struct inode *dir);
 struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
 int nfs_create(struct inode *, struct dentry *, umode_t, bool);
@@ -368,7 +370,7 @@ int nfs_lock(struct file *, int, struct file_lock *);
 int nfs_flock(struct file *, int, struct file_lock *);
 ssize_t nfs_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *,
 			      size_t, unsigned int);
-int nfs_check_flags(int);
+int nfs_set_flags(struct file * filp, int flags);
 int nfs_setlease(struct file *, long, struct file_lock **, void **priv);
 
 /* inode.c */
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -17,7 +17,6 @@
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
 #include <linux/nfs_mount.h>
-#include <linux/freezer.h>
 
 #include "iostat.h"
 #include "internal.h"
@@ -34,7 +33,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		res = rpc_call_sync(clnt, msg, flags);
 		if (res != -EJUKEBOX)
 			break;
-		freezable_schedule_timeout_killable_unsafe(NFS_JUKEBOX_RETRY_TIME);
+		schedule_timeout_killable(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
 	} while (!fatal_signal_pending(current));
 	return res;
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -44,7 +44,7 @@ nfs4_file_open(struct inode *inode, struct file *filp)
 
 	dprintk("NFS: open file(%pd2)\n", dentry);
 
-	err = nfs_check_flags(openflags);
+	err = nfs_set_flags(filp, openflags);
 	if (err)
 		return err;
 
@@ -339,7 +339,7 @@ const struct file_operations nfs4_file_operations = {
 	.flock		= nfs_flock,
 	.splice_read	= nfs_file_splice_read,
 	.splice_write	= nfs_file_splice_write,
-	.check_flags	= nfs_check_flags,
+	.set_flags	= nfs_set_flags,
 	.setlease	= nfs_setlease,
 #ifdef CONFIG_NFS_V4_2
 	.llseek		= nfs4_file_llseek,
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -54,7 +54,6 @@
 #include <linux/module.h>
 #include <linux/xattr.h>
 #include <linux/utsname.h>
-#include <linux/freezer.h>
 
 #include "nfs4_fs.h"
 #include "delegation.h"
@@ -334,7 +333,7 @@ static int nfs4_delay(struct rpc_clnt *clnt, long *timeout)
 
 	might_sleep();
 
-	freezable_schedule_timeout_killable_unsafe(
+	schedule_timeout_killable(
 		nfs4_update_delay(timeout));
 	if (fatal_signal_pending(current))
 		res = -ERESTARTSYS;
@@ -5472,7 +5471,7 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 static unsigned long
 nfs4_set_lock_task_retry(unsigned long timeout)
 {
-	freezable_schedule_timeout_killable_unsafe(timeout);
+	schedule_timeout_killable(timeout);
 	timeout <<= 1;
 	if (timeout > NFS4_LOCK_MAXTIMEOUT)
 		return NFS4_LOCK_MAXTIMEOUT;
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -41,7 +41,6 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/lockd/bind.h>
-#include <linux/freezer.h>
 #include "internal.h"
 
 #define NFSDBG_FACILITY		NFSDBG_PROC
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -55,6 +55,9 @@
 #include <linux/nsproxy.h>
 #include <linux/rcupdate.h>
 
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
+
 #include <asm/uaccess.h>
 
 #include "nfs4_fs.h"
@@ -292,7 +295,8 @@ struct file_system_type nfs_fs_type = {
 	.name		= "nfs",
 	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|FS_HAS_INVALIDATE_RANGE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|
+			  FS_HAS_INVALIDATE_RANGE|FS_VIRTUALIZED|FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("nfs");
 EXPORT_SYMBOL_GPL(nfs_fs_type);
@@ -332,7 +336,8 @@ struct file_system_type nfs4_fs_type = {
 	.name		= "nfs4",
 	.mount		= nfs_fs_mount,
 	.kill_sb	= nfs_kill_super,
-	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|FS_HAS_INVALIDATE_RANGE,
+	.fs_flags	= FS_RENAME_DOES_D_MOVE|FS_BINARY_MOUNTDATA|
+			  FS_HAS_INVALIDATE_RANGE|FS_VIRTUALIZED|FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("nfs4");
 MODULE_ALIAS("nfs4");
@@ -359,7 +364,8 @@ static void unregister_nfs4_fs(void)
 #endif
 
 static struct shrinker acl_shrinker = {
-	.shrink		= nfs_access_cache_shrinker,
+	.count_objects	= nfs_access_cache_count,
+	.scan_objects	= nfs_access_cache_scan,
 	.seeks		= DEFAULT_SEEKS,
 };
 
@@ -2618,6 +2624,11 @@ struct dentry *nfs_fs_mount(struct file_system_type *fs_type,
 	struct nfs_subversion *nfs_mod;
 	int error;
 
+	if (!(get_exec_env()->features & VE_FEATURE_NFS))
+		return ERR_PTR(-ENODEV);
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
+
 	mount_info.parsed = nfs_alloc_parsed_mount_data();
 	mount_info.mntfh = nfs_alloc_fhandle();
 	if (mount_info.parsed == NULL || mount_info.mntfh == NULL)
--- a/fs/nfsd/nfs3xdr.c
+++ b/fs/nfsd/nfs3xdr.c
@@ -358,6 +358,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 {
 	unsigned int len, v, hdr, dlen;
 	u32 max_blocksize = svc_max_payload(rqstp);
+	struct kvec *head = rqstp->rq_arg.head;
+	struct kvec *tail = rqstp->rq_arg.tail;
 
 	p = decode_fh(p, &args->fh);
 	if (!p)
@@ -367,6 +369,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 	args->count = ntohl(*p++);
 	args->stable = ntohl(*p++);
 	len = args->len = ntohl(*p++);
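+	/* Reject requests too short to hold the fixed-size write header. */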
+	if ((void *)p > head->iov_base + head->iov_len)
+		return 0;
 	/*
 	 * The count must equal the amount of data passed.
 	 */
@@ -377,9 +381,8 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 	 * Check to make sure that we got the right number of
 	 * bytes.
 	 */
-	hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
-	dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
-		+ rqstp->rq_arg.tail[0].iov_len - hdr;
+	hdr = (void*)p - head->iov_base;
+	dlen = head->iov_len + rqstp->rq_arg.page_len + tail->iov_len - hdr;
 	/*
 	 * Round the length of the data which was specified up to
 	 * the next multiple of XDR units and then compare that
@@ -396,7 +399,7 @@ nfs3svc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 		len = args->len = max_blocksize;
 	}
 	rqstp->rq_vec[0].iov_base = (void*)p;
-	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+	rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
 	v = 0;
 	while (len > rqstp->rq_vec[v].iov_len) {
 		len -= rqstp->rq_vec[v].iov_len;
@@ -471,6 +474,8 @@ nfs3svc_decode_symlinkargs(struct svc_rqst *rqstp, __be32 *p,
 	/* first copy and check from the first page */
 	old = (char*)p;
 	vec = &rqstp->rq_arg.head[0];
+	if ((void *)old > vec->iov_base + vec->iov_len)
+		return 0;
 	avail = vec->iov_len - (old - (char*)vec->iov_base);
 	while (len && avail && *old) {
 		*new++ = *old++;
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -179,7 +179,7 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
 	struct file_lock *fl;
 	int status;
 
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lock(1);
 	if (!fl)
 		return -ENOMEM;
 	locks_init_lock(fl);
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -36,7 +36,6 @@
 #include <linux/slab.h>
 #include <linux/namei.h>
 #include <linux/crypto.h>
-#include <linux/sched.h>
 #include <linux/fs.h>
 #include <linux/module.h>
 #include <net/net_namespace.h>
@@ -44,6 +43,8 @@
 #include <linux/sunrpc/clnt.h>
 #include <linux/nfsd/cld.h>
 
+#include <linux/ve.h>
+
 #include "nfsd.h"
 #include "state.h"
 #include "vfs.h"
@@ -701,7 +702,7 @@ cld_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
 	struct cld_upcall *tmp, *cup;
 	struct cld_msg __user *cmsg = (struct cld_msg __user *)src;
 	uint32_t xid;
-	struct nfsd_net *nn = net_generic(file_inode(filp)->i_sb->s_fs_info,
+	struct nfsd_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info,
 						nfsd_net_id);
 	struct cld_net *cn = nn->cld_net;
 
@@ -1205,7 +1206,7 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1)
 	argv[2] = arg;
 	argv[3] = NULL;
 
-	ret = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
+	ret = call_usermodehelper_ve(get_exec_env(), argv[0], argv, envp, UMH_WAIT_PROC);
 	/*
 	 * Disable the upcall mechanism if we're getting an ENOENT or EACCES
 	 * error. The admin can re-enable it on the fly by using sysfs
@@ -1248,13 +1249,6 @@ nfsd4_umh_cltrack_init(struct net *net)
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 	char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time);
 
-	/* XXX: The usermode helper s not working in container yet. */
-	if (net != &init_net) {
-		pr_warn("NFSD: attempt to initialize umh client tracking in a container ignored.\n");
-		kfree(grace_start);
-		return -EINVAL;
-	}
-
 	ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL);
 	kfree(grace_start);
 	return ret;
@@ -1415,24 +1409,29 @@ nfsd4_client_tracking_init(struct net *net)
 	if (!status)
 		return status;
 
-	/*
-	 * See if the recoverydir exists and is a directory. If it is,
-	 * then use the legacy ops.
-	 */
-	nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
-	status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
-	if (!status) {
-		status = S_ISDIR(path.dentry->d_inode->i_mode);
-		path_put(&path);
-		if (status)
-			goto do_init;
+	if (net_eq(net, &init_net)) {
+		/*
+		 * See if the recoverydir exists and is a directory. If it is,
+		 * then use the legacy ops.
+		 */
+		nn->client_tracking_ops = &nfsd4_legacy_tracking_ops;
+		status = kern_path(nfs4_recoverydir(), LOOKUP_FOLLOW, &path);
+		if (!status) {
+			status = S_ISDIR(path.dentry->d_inode->i_mode);
+			path_put(&path);
+			if (status)
+				goto do_init;
+		}
 	}
 
 	/* Finally, try to use nfsdcld */
 	nn->client_tracking_ops = &nfsd4_cld_tracking_ops;
-	printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
-			"removed in 3.10. Please transition to using "
-			"nfsdcltrack.\n");
+
+	if (net_eq(net, &init_net)) {
+		printk(KERN_WARNING "NFSD: the nfsdcld client tracking upcall will be "
+				"removed in 3.10. Please transition to using "
+				"nfsdcltrack.\n");
+	}
 do_init:
 	status = nn->client_tracking_ops->init(net);
 	if (status) {
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -4037,7 +4037,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag)
 {
 	struct file_lock *fl;
 
-	fl = locks_alloc_lock();
+	fl = locks_alloc_lock(1);
 	if (!fl)
 		return NULL;
 	fl->fl_lmops = &nfsd_lease_mng_ops;
@@ -5628,7 +5628,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!locks_in_grace(net) && lock->lk_reclaim)
 		goto out;
 
-	file_lock = locks_alloc_lock();
+	file_lock = locks_alloc_lock(1);
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
@@ -5672,7 +5672,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
 	nfs4_transform_lock_offset(file_lock);
 
-	conflock = locks_alloc_lock();
+	conflock = locks_alloc_lock(1);
 	if (!conflock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
@@ -5773,7 +5773,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
 		goto out;
 
-	file_lock = locks_alloc_lock();
+	file_lock = locks_alloc_lock(1);
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
@@ -5850,7 +5850,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		status = nfserr_lock_range;
 		goto put_stateid;
 	}
-	file_lock = locks_alloc_lock();
+	file_lock = locks_alloc_lock(1);
 	if (!file_lock) {
 		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
 		status = nfserr_jukebox;
--- a/fs/nfsd/nfscache.c
+++ b/fs/nfsd/nfscache.c
@@ -64,11 +64,14 @@ static unsigned int		longest_chain_cachesize;
 
 static int	nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *vec);
 static void	cache_cleaner_func(struct work_struct *unused);
-static int 	nfsd_reply_cache_shrink(struct shrinker *shrink,
-					struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_count(struct shrinker *shrink,
+					    struct shrink_control *sc);
+static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink,
+					   struct shrink_control *sc);
 
 static struct shrinker nfsd_reply_cache_shrinker = {
-	.shrink	= nfsd_reply_cache_shrink,
+	.scan_objects = nfsd_reply_cache_scan,
+	.count_objects = nfsd_reply_cache_count,
 	.seeks	= 1,
 };
 
@@ -243,6 +246,7 @@ prune_bucket(struct nfsd_drc_bucket *b)
 		    time_before(jiffies, rp->c_timestamp + RC_EXPIRE))
 			break;
 		nfsd_reply_cache_free_locked(rp);
+		freed++;
 	}
 	return freed;
 }
@@ -285,12 +289,18 @@ cache_cleaner_func(struct work_struct *unused)
 	prune_cache_entries();
 }
 
-static int
-nfsd_reply_cache_shrink(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc)
 {
 	return atomic_read(&num_drc_entries);
 }
 
+static unsigned long
+nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return prune_cache_entries();
+}
+
 /*
  * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes
  */
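
Two notes on the reply-cache conversion above. First, the freed++ fix in prune_bucket() is what makes the pruning path report a real count; previously it always returned 0. Second, the shrinker interface split replaces the single ->shrink callback with ->count_objects (a cheap estimate of reclaimable entries) and ->scan_objects (does the freeing and reports how many objects went away). A minimal hedged sketch of the new pattern, with hypothetical example_* names; note this patch's scan callback prunes expired entries wholesale and ignores sc->nr_to_scan:

	static atomic_t example_nr_objects;

	static unsigned long example_count(struct shrinker *s,
					   struct shrink_control *sc)
	{
		/* cheap, approximate count of reclaimable objects */
		return atomic_read(&example_nr_objects);
	}

	static unsigned long example_scan(struct shrinker *s,
					  struct shrink_control *sc)
	{
		/* free up to sc->nr_to_scan objects, return number freed */
		return example_prune(sc->nr_to_scan);
	}

	static struct shrinker example_shrinker = {
		.count_objects	= example_count,
		.scan_objects	= example_scan,
		.seeks		= DEFAULT_SEEKS,
	};

Registration via register_shrinker()/unregister_shrinker() is unchanged; only the callback contract differs.
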
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -15,6 +15,7 @@
 #include <linux/sunrpc/gss_krb5_enctypes.h>
 #include <linux/sunrpc/rpc_pipe_fs.h>
 #include <linux/module.h>
+#include <uapi/linux/vzcalluser.h>
 
 #include "idmap.h"
 #include "nfsd.h"
@@ -1167,6 +1168,10 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
 static struct dentry *nfsd_mount(struct file_system_type *fs_type,
 	int flags, const char *dev_name, void *data)
 {
+	if (!(get_exec_env()->features & VE_FEATURE_NFSD))
+		return ERR_PTR(-ENODEV);
+	if (!current_user_ns_initial())
+		return ERR_PTR(-EPERM);
 	return mount_ns(fs_type, flags, current->nsproxy->net_ns, nfsd_fill_super);
 }
 
@@ -1183,6 +1188,7 @@ static struct file_system_type nfsd_fs_type = {
 	.name		= "nfsd",
 	.mount		= nfsd_mount,
 	.kill_sb	= nfsd_umount,
+	.fs_flags	= FS_VIRTUALIZED|FS_USERNS_MOUNT,
 };
 MODULE_ALIAS_FS("nfsd");
 
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -733,6 +733,37 @@ static __be32 map_new_errors(u32 vers, __be32 nfserr)
 	return nfserr;
 }
 
+/*
+ * A write procedure can have a large argument, and a read procedure can
+ * have a large reply, but no NFSv2 or NFSv3 procedure has an argument
+ * and a reply that can both be larger than a page.  The xdr code has
+ * taken advantage of this assumption to be sloppy about bounds checking in
+ * some cases.  Pending a rewrite of the NFSv2/v3 xdr code to fix that
+ * problem, we enforce these assumptions here:
+ */
+static bool nfs_request_too_big(struct svc_rqst *rqstp,
+				struct svc_procedure *proc)
+{
+	/*
+	 * The ACL code has more careful bounds-checking and is not
+	 * susceptible to this problem:
+	 */
+	if (rqstp->rq_prog != NFS_PROGRAM)
+		return false;
+	/*
+	 * Ditto NFSv4 (which can in theory have argument and reply both
+	 * more than a page):
+	 */
+	if (rqstp->rq_vers >= 4)
+		return false;
+	/* The reply will be small, we're OK: */
+	if (proc->pc_xdrressize > 0 &&
+	    proc->pc_xdrressize < XDR_QUADLEN(PAGE_SIZE))
+		return false;
+
+	return rqstp->rq_arg.len > PAGE_SIZE;
+}
+
 int
 nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 {
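
To make the reply-size bound in nfs_request_too_big() concrete: XDR_QUADLEN() converts a byte count into 4-byte XDR words. A hedged worked example, assuming PAGE_SIZE is 4096:

	XDR_QUADLEN(PAGE_SIZE) == (4096 + 3) / 4 == 1024

so a procedure whose declared worst-case reply size (pc_xdrressize) is below 1024 words is page-bounded on the reply side and may legitimately carry a multi-page argument (v2/v3 WRITE); only when the reply is not known to fit in a page is an argument over PAGE_SIZE rejected with rpc_garbage_args.
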
@@ -745,6 +776,11 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
 				rqstp->rq_vers, rqstp->rq_proc);
 	proc = rqstp->rq_procinfo;
 
+	if (nfs_request_too_big(rqstp, proc)) {
+		dprintk("nfsd: NFSv%d argument too large\n", rqstp->rq_vers);
+		*statp = rpc_garbage_args;
+		return 1;
+	}
 	/*
 	 * Give the xdr decoder a chance to change this if it wants
 	 * (necessary in the NFSv4.0 compound case)
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -280,6 +280,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 					struct nfsd_writeargs *args)
 {
 	unsigned int len, hdr, dlen;
+	struct kvec *head = rqstp->rq_arg.head;
 	int v;
 
 	p = decode_fh(p, &args->fh);
@@ -300,9 +301,10 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 	 * Check to make sure that we got the right number of
 	 * bytes.
 	 */
-	hdr = (void*)p - rqstp->rq_arg.head[0].iov_base;
-	dlen = rqstp->rq_arg.head[0].iov_len + rqstp->rq_arg.page_len
-		- hdr;
+	hdr = (void*)p - head->iov_base;
+	if (hdr > head->iov_len)
+		return 0;
+	dlen = head->iov_len + rqstp->rq_arg.page_len - hdr;
 
 	/*
 	 * Round the length of the data which was specified up to
@@ -316,7 +318,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, __be32 *p,
 		return 0;
 
 	rqstp->rq_vec[0].iov_base = (void*)p;
-	rqstp->rq_vec[0].iov_len = rqstp->rq_arg.head[0].iov_len - hdr;
+	rqstp->rq_vec[0].iov_len = head->iov_len - hdr;
 	v = 0;
 	while (len > rqstp->rq_vec[v].iov_len) {
 		len -= rqstp->rq_vec[v].iov_len;
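
The new hdr > head->iov_len check above guards the dlen computation against unsigned wraparound when a crafted WRITE makes the decoder advance p past the end of the head kvec. A hedged illustration with made-up sizes:

	/*
	 * Suppose head->iov_len == 512 but decoding walked p far enough
	 * that hdr == 520.  Then
	 *
	 *	dlen = 512 + rq_arg.page_len - 520
	 *
	 * wraps to a huge unsigned value, so the dlen-based sanity check
	 * below it can pass when it should not, and rq_vec[0].iov_len
	 * would underflow as well.  Failing early keeps both meaningful.
	 */
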
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -2216,7 +2216,7 @@ nfsd_racache_init(int cache_size)
 
 		raparm = &raparm_hash[i].pb_head;
 		for (j = 0; j < nperbucket; j++) {
-			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL);
+			*raparm = kzalloc(sizeof(struct raparms), GFP_KERNEL_ACCOUNT);
 			if (!*raparm)
 				goto out_nomem;
 			raparm = &(*raparm)->p_next;
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -135,7 +135,6 @@ static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 static const struct vm_operations_struct nilfs_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= nilfs_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -1403,7 +1403,8 @@ static int __init nilfs_init_cachep(void)
 {
 	nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
 			sizeof(struct nilfs_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT, nilfs_inode_init_once);
+			SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+			nilfs_inode_init_once);
 	if (!nilfs_inode_cachep)
 		goto fail;
 
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -654,7 +654,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 	pr_debug("%s: flags=%d event_f_flags=%d\n",
 		__func__, flags, event_f_flags);
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (flags & ~FAN_ALL_INIT_FLAGS)
@@ -715,7 +715,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 
 	if (flags & FAN_UNLIMITED_QUEUE) {
 		fd = -EPERM;
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			goto out_destroy_group;
 		group->max_events = UINT_MAX;
 	} else {
@@ -724,7 +724,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
 
 	if (flags & FAN_UNLIMITED_MARKS) {
 		fd = -EPERM;
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			goto out_destroy_group;
 		group->fanotify_data.max_marks = UINT_MAX;
 	} else {
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -25,16 +25,15 @@ static int show_fdinfo(struct seq_file *m, struct file *f,
 {
 	struct fsnotify_group *group = f->private_data;
 	struct fsnotify_mark *mark;
-	int ret = 0;
 
 	mutex_lock(&group->mark_mutex);
 	list_for_each_entry(mark, &group->marks_list, g_list) {
-		ret = show(m, mark);
-		if (ret)
+		show(m, mark);
+		if (m->count == m->size)
 			break;
 	}
 	mutex_unlock(&group->mark_mutex);
-	return ret;
+	return 0;
 }
 
 #if defined(CONFIG_EXPORTFS)
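
These fdinfo helpers stop propagating seq_printf() return values and instead use the seq_file overflow convention: once output no longer fits, m->count reaches m->size; seq_read() then notices the full buffer, enlarges it and replays ->show(). A hedged sketch of the idiom the loop above relies on:

	seq_printf(m, "...");		/* may silently truncate on overflow */
	if (m->count == m->size)	/* buffer is full */
		break;			/* seq machinery retries with more room */
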
@@ -42,7 +41,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 {
 	struct {
 		struct file_handle handle;
-		u8 pad[64];
+		u8 pad[MAX_HANDLE_SZ];
 	} f;
 	int size, ret, i;
 
@@ -50,7 +49,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	size = f.handle.handle_bytes >> 2;
 
 	ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0);
-	if ((ret == 255) || (ret == -ENOSPC)) {
+	if ((ret == FILEID_INVALID) || (ret < 0)) {
 		WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret);
 		return 0;
 	}
@@ -58,13 +57,13 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
 	f.handle.handle_type = ret;
 	f.handle.handle_bytes = size * sizeof(u32);
 
-	ret = seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
-			 f.handle.handle_bytes, f.handle.handle_type);
+	seq_printf(m, "fhandle-bytes:%x fhandle-type:%x f_handle:",
+		   f.handle.handle_bytes, f.handle.handle_type);
 
 	for (i = 0; i < f.handle.handle_bytes; i++)
-		ret |= seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
+		seq_printf(m, "%02x", (int)f.handle.f_handle[i]);
 
-	return ret;
+	return 0;
 }
 #else
 static int show_mark_fhandle(struct seq_file *m, struct inode *inode)
@@ -79,25 +78,23 @@ static int inotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 {
 	struct inotify_inode_mark *inode_mark;
 	struct inode *inode;
-	int ret = 0;
 
-	if (!(mark->flags & (FSNOTIFY_MARK_FLAG_ALIVE | FSNOTIFY_MARK_FLAG_INODE)))
+	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE) ||
+	    !(mark->flags & FSNOTIFY_MARK_FLAG_INODE))
 		return 0;
 
 	inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
 	inode = igrab(mark->i.inode);
 	if (inode) {
-		ret = seq_printf(m, "inotify wd:%x ino:%lx sdev:%x "
-				 "mask:%x ignored_mask:%x ",
-				 inode_mark->wd, inode->i_ino,
-				 inode->i_sb->s_dev,
-				 mark->mask, mark->ignored_mask);
-		ret |= show_mark_fhandle(m, inode);
-		ret |= seq_putc(m, '\n');
+		seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+			   inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
+			   mark->mask, mark->ignored_mask);
+		show_mark_fhandle(m, inode);
+		seq_putc(m, '\n');
 		iput(inode);
 	}
 
-	return ret;
+	return 0;
 }
 
 int inotify_show_fdinfo(struct seq_file *m, struct file *f)
@@ -113,7 +110,6 @@ static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 {
 	unsigned int mflags = 0;
 	struct inode *inode;
-	int ret = 0;
 
 	if (!(mark->flags & FSNOTIFY_MARK_FLAG_ALIVE))
 		return 0;
@@ -124,23 +120,20 @@ static int fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark)
 	if (mark->flags & FSNOTIFY_MARK_FLAG_INODE) {
 		inode = igrab(mark->i.inode);
 		if (!inode)
-			goto out;
-		ret = seq_printf(m, "fanotify ino:%lx sdev:%x "
-				 "mflags:%x mask:%x ignored_mask:%x ",
-				 inode->i_ino, inode->i_sb->s_dev,
-				 mflags, mark->mask, mark->ignored_mask);
-		ret |= show_mark_fhandle(m, inode);
-		ret |= seq_putc(m, '\n');
+			return 0;
+		seq_printf(m, "fanotify ino:%lx sdev:%x mflags:%x mask:%x ignored_mask:%x ",
+			   inode->i_ino, inode->i_sb->s_dev,
+			   mflags, mark->mask, mark->ignored_mask);
+		show_mark_fhandle(m, inode);
+		seq_putc(m, '\n');
 		iput(inode);
 	} else if (mark->flags & FSNOTIFY_MARK_FLAG_VFSMOUNT) {
 		struct mount *mnt = real_mount(mark->m.mnt);
 
-		ret = seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x "
-				 "ignored_mask:%x\n", mnt->mnt_id, mflags,
-				 mark->mask, mark->ignored_mask);
+		seq_printf(m, "fanotify mnt_id:%x mflags:%x mask:%x ignored_mask:%x\n",
+			   mnt->mnt_id, mflags, mark->mask, mark->ignored_mask);
 	}
-out:
-	return ret;
+	return 0;
 }
 
 int fanotify_show_fdinfo(struct seq_file *m, struct file *f)
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -813,8 +813,8 @@ static int __init inotify_user_setup(void)
 	inotify_inode_mark_cachep = KMEM_CACHE(inotify_inode_mark, SLAB_PANIC);
 
 	inotify_max_queued_events = 16384;
-	inotify_max_user_instances = 128;
-	inotify_max_user_watches = 8192;
+	inotify_max_user_instances = INT_MAX;
+	inotify_max_user_watches = INT_MAX;
 
 	return 0;
 }
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -3146,8 +3146,8 @@ static int __init init_ntfs_fs(void)
 
 	ntfs_big_inode_cache = kmem_cache_create(ntfs_big_inode_cache_name,
 			sizeof(big_ntfs_inode), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
-			ntfs_big_inode_init_once);
+			SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|
+			SLAB_ACCOUNT, ntfs_big_inode_init_once);
 	if (!ntfs_big_inode_cache) {
 		printk(KERN_CRIT "NTFS: Failed to create %s!\n",
 				ntfs_big_inode_cache_name);
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -274,8 +274,9 @@ static int ocfs2_set_acl(handle_t *handle,
 	case ACL_TYPE_ACCESS:
 		name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
 		if (acl) {
-			umode_t mode = inode->i_mode;
-			ret = posix_acl_equiv_mode(acl, &mode);
+			umode_t mode;
+
+			ret = posix_acl_update_mode(inode, &mode, &acl);
 			if (ret < 0)
 				return ret;
 			else {
@@ -286,7 +287,6 @@ static int ocfs2_set_acl(handle_t *handle,
 							 handle, mode);
 				if (ret)
 					return ret;
-
 			}
 		}
 		break;
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -656,7 +656,7 @@ static int __init init_dlmfs_fs(void)
 	dlmfs_inode_cache = kmem_cache_create("dlmfs_inode_cache",
 				sizeof(struct dlmfs_inode_private),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-					SLAB_MEM_SPREAD),
+					SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				dlmfs_init_once);
 	if (!dlmfs_inode_cache) {
 		status = -ENOMEM;
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -173,7 +173,6 @@ out:
 static const struct vm_operations_struct ocfs2_file_vm_ops = {
 	.fault		= ocfs2_fault,
 	.page_mkwrite	= ocfs2_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 int ocfs2_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1790,7 +1790,7 @@ static int ocfs2_initialize_mem_caches(void)
 				       sizeof(struct ocfs2_inode_info),
 				       0,
 				       (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				       ocfs2_inode_init_once);
 	ocfs2_dquot_cachep = kmem_cache_create("ocfs2_dquot_cache",
 					sizeof(struct ocfs2_dquot),
@@ -2100,6 +2100,8 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	cbits = le32_to_cpu(di->id2.i_super.s_clustersize_bits);
 	bbits = le32_to_cpu(di->id2.i_super.s_blocksize_bits);
 	sb->s_maxbytes = ocfs2_max_file_offset(bbits, cbits);
+	memcpy(sb->s_uuid, di->id2.i_super.s_uuid,
+	       sizeof(di->id2.i_super.s_uuid));
 
 	osb->osb_dx_mask = (1 << (cbits - bbits)) - 1;
 
@@ -2363,7 +2365,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 		mlog_errno(status);
 		goto bail;
 	}
-	cleancache_init_shared_fs((char *)&di->id2.i_super.s_uuid, sb);
+	cleancache_init_shared_fs(sb);
 
 bail:
 	return status;
--- a/fs/open.c
+++ b/fs/open.c
@@ -234,7 +234,8 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 
 	/* Return error if mode is not supported */
 	if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
-		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE))
+		     FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |
+		     FALLOC_FL_CONVERT_UNWRITTEN))
 		return -EOPNOTSUPP;
 
 	/* Punch hole and zero range are mutually exclusive */
@@ -252,6 +253,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	    (mode & ~FALLOC_FL_COLLAPSE_RANGE))
 		return -EINVAL;
 
+	/* Convert-and-extend should only be used exclusively. */
+	if ((mode & FALLOC_FL_CONVERT_UNWRITTEN) &&
+	    (mode & ~FALLOC_FL_CONVERT_UNWRITTEN))
+		return -EINVAL;
+
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
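
The exclusivity test above uses the standard two-mask idiom: the new flag is set and any other flag is also set. A hedged worked example (FALLOC_FL_KEEP_SIZE is 0x01; the FALLOC_FL_CONVERT_UNWRITTEN value is assumed to be 0x100 purely for illustration):

	mode = 0x101;	/* CONVERT_UNWRITTEN | KEEP_SIZE */
	mode & FALLOC_FL_CONVERT_UNWRITTEN	/* 0x100, true */
	mode & ~FALLOC_FL_CONVERT_UNWRITTEN	/* 0x001, true -> -EINVAL */

	mode = 0x100;	/* the flag alone */
	mode & ~FALLOC_FL_CONVERT_UNWRITTEN	/* 0, allowed */
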
 
@@ -705,6 +711,11 @@ static int do_dentry_open(struct file *f,
 	static const struct file_operations empty_fops = {};
 	int error;
 
+	if (!may_use_odirect())
+		f->f_flags &= ~O_DIRECT;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		f->f_flags &= ~O_SYNC;
+
 	f->f_mode = OPEN_FMODE(f->f_flags) | FMODE_LSEEK |
 				FMODE_PREAD | FMODE_PWRITE;
 
@@ -882,9 +893,17 @@ int vfs_open(const struct path *path, struct file *filp,
 {
 	struct inode *inode = path->dentry->d_inode;
 	iop_dentry_open_t dentry_open = get_dentry_open_iop(inode);
+	int do_cleanup = 0;
+	int ret;
+
+	if (!filp->f_original_path.mnt && dentry_open) {
+		filp->f_original_path = *path;
+		path_get(&filp->f_original_path);
+		do_cleanup = 1;
+	}
 
 	if (dentry_open)
-		return dentry_open(path->dentry, filp, cred);
+		ret = dentry_open(path->dentry, filp, cred);
 	else {
 		struct dentry *dentry = d_real(path->dentry, NULL, filp->f_flags);
 
@@ -892,8 +911,15 @@ int vfs_open(const struct path *path, struct file *filp,
 			return PTR_ERR(dentry);
 
 		filp->f_path = *path;
-		return do_dentry_open(filp, dentry->d_inode, NULL, cred);
+		ret = do_dentry_open(filp, dentry->d_inode, NULL, cred);
+	}
+
+	if (ret && do_cleanup) {
+		path_put(&filp->f_original_path);
+		filp->f_original_path.mnt = NULL;
+		filp->f_original_path.dentry = NULL;
 	}
+	return ret;
 }
 EXPORT_SYMBOL(vfs_open);
 
@@ -1120,7 +1146,7 @@ EXPORT_SYMBOL(sys_close);
  */
 SYSCALL_DEFINE0(vhangup)
 {
-	if (capable(CAP_SYS_TTY_CONFIG)) {
+	if (ve_capable(CAP_SYS_TTY_CONFIG)) {
 		tty_vhangup_self();
 		return 0;
 	}
--- a/fs/openpromfs/inode.c
+++ b/fs/openpromfs/inode.c
@@ -449,7 +449,7 @@ static int __init init_openprom_fs(void)
 					    sizeof(struct op_inode_info),
 					    0,
 					    (SLAB_RECLAIM_ACCOUNT |
-					     SLAB_MEM_SPREAD),
+					     SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					    op_inode_init_once);
 	if (!op_inode_cachep)
 		return -ENOMEM;
--- a/fs/overlayfs/dir.c
+++ b/fs/overlayfs/dir.c
@@ -1067,6 +1067,7 @@ const struct inode_operations_wrapper ovl_dir_inode_operations = {
 	.listxattr	= ovl_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ovl_get_acl,
+	.update_time	= ovl_update_time,
 	},
 	.rename2	= ovl_rename2,
 };
--- a/fs/overlayfs/inode.c
+++ b/fs/overlayfs/inode.c
@@ -216,8 +216,6 @@ static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
 	if (!realinode->i_op->readlink)
 		return -EINVAL;
 
-	touch_atime(&realpath);
-
 	old_cred = ovl_override_creds(dentry->d_sb);
 	err = realinode->i_op->readlink(realpath.dentry, buf, bufsiz);
 	revert_creds(old_cred);
@@ -373,6 +371,29 @@ int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags)
 	return err;
 }
 
+int ovl_update_time(struct inode *inode, struct timespec *ts, int flags)
+{
+	struct dentry *alias;
+	struct path upperpath;
+
+	if (!(flags & S_ATIME))
+		return 0;
+
+	alias = d_find_any_alias(inode);
+	if (!alias)
+		return 0;
+
+	ovl_path_upper(alias, &upperpath);
+	if (upperpath.dentry) {
+		touch_atime(&upperpath);
+		inode->i_atime = d_inode(upperpath.dentry)->i_atime;
+	}
+
+	dput(alias);
+
+	return 0;
+}
+
 static const struct inode_operations_wrapper ovl_file_inode_operations = {
 	.ops = {
 	.setattr	= ovl_setattr,
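
ovl_update_time() hooks the generic VFS time-update path. A hedged sketch of the call chain it intercepts (core VFS behavior, not introduced by this patch):

	touch_atime(&path)
	  -> update_time(inode, &now, S_ATIME)
	       -> inode->i_op->update_time()	/* if the fs provides one */
		  == ovl_update_time(): touch atime on the upper dentry
		     and mirror the upper inode's i_atime back.

Dropping S_NOATIME in ovl_fill_inode() further down is what lets this path trigger for overlay inodes at all.
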
@@ -383,6 +404,7 @@ static const struct inode_operations_wrapper ovl_file_inode_operations = {
 	.listxattr	= ovl_listxattr,
 	.removexattr	= generic_removexattr,
 	.get_acl	= ovl_get_acl,
+	.update_time	= ovl_update_time,
 	},
 };
 
@@ -396,13 +418,14 @@ static const struct inode_operations ovl_symlink_inode_operations = {
 	.getxattr	= ovl_getxattr,
 	.listxattr	= ovl_listxattr,
 	.removexattr	= generic_removexattr,
+	.update_time	= ovl_update_time,
 };
 
 static void ovl_fill_inode(struct inode *inode, umode_t mode)
 {
 	inode->i_ino = get_next_ino();
 	inode->i_mode = mode;
-	inode->i_flags |= S_NOATIME | S_NOCMTIME;
+	inode->i_flags |= S_NOCMTIME;
 
 	mode &= S_IFMT;
 	switch (mode) {
--- a/fs/overlayfs/overlayfs.h
+++ b/fs/overlayfs/overlayfs.h
@@ -190,6 +190,7 @@ ssize_t ovl_getxattr(struct dentry *dentry, const char *name,
 ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size);
 struct posix_acl *ovl_get_acl(struct inode *inode, int type);
 int ovl_open_maybe_copy_up(struct dentry *dentry, unsigned int file_flags);
+int ovl_update_time(struct inode *inode, struct timespec *ts, int flags);
 
 struct inode *ovl_new_inode(struct super_block *sb, umode_t mode);
 struct inode *ovl_get_inode(struct super_block *sb, struct inode *realinode);
@@ -198,6 +199,9 @@ static inline void ovl_copyattr(struct inode *from, struct inode *to)
 	to->i_uid = from->i_uid;
 	to->i_gid = from->i_gid;
 	to->i_mode = from->i_mode;
+	to->i_atime = from->i_atime;
+	to->i_mtime = from->i_mtime;
+	to->i_ctime = from->i_ctime;
 }
 
 /* dir.c */
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -330,12 +330,11 @@ static struct dentry *ovl_d_real(struct dentry *dentry,
 	if (!real)
 		goto bug;
 
-	if (!inode || inode == d_inode(real))
-		return real;
-
 	/* Handle recursion */
-	return d_real(real, inode, open_flags);
+	real = d_real(real, inode, open_flags);
 
+	if (!inode || inode == d_inode(real))
+		return real;
 bug:
 	WARN(1, "ovl_d_real(%pd4, %s:%lu\n): real dentry not found\n", dentry,
 	     inode ? inode->i_sb->s_id : "NULL", inode ? inode->i_ino : 0);
@@ -618,7 +617,7 @@ out:
 
 struct file *ovl_path_open(struct path *path, int flags)
 {
-	return dentry_open(path, flags, current_cred());
+	return dentry_open(path, flags | O_NOATIME, current_cred());
 }
 
 static void ovl_put_super(struct super_block *sb)
@@ -1200,6 +1199,10 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 			pr_err("overlayfs: failed to clone upperpath\n");
 			goto out_put_lowerpath;
 		}
+		/* Don't inherit atime flags */
+		ufs->upper_mnt->mnt_flags &= ~(MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME);
+
+		sb->s_time_gran = ufs->upper_mnt->mnt_sb->s_time_gran;
 
 		ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry);
 		err = PTR_ERR(ufs->workdir);
@@ -1248,7 +1251,7 @@ static int ovl_fill_super(struct super_block *sb, void *data, int silent)
 		 * Make lower_mnt R/O.  That way fchmod/fchown on lower file
 		 * will fail instead of modifying lower fs.
 		 */
-		mnt->mnt_flags |= MNT_READONLY;
+		mnt->mnt_flags |= MNT_READONLY | MNT_NOATIME;
 
 		ufs->lower_mnt[ufs->numlower] = mnt;
 		ufs->numlower++;
@@ -1337,13 +1340,6 @@ out:
 static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags,
 				const char *dev_name, void *raw_data)
 {
-	static bool seen = false;
-
-	if (!seen) {
-		mark_tech_preview("Overlay filesystem", THIS_MODULE);
-		seen = true;
-	}
-
 	return mount_nodev(fs_type, flags, raw_data, ovl_fill_super);
 }
 
@@ -1352,7 +1348,7 @@ static struct file_system_type ovl_fs_type = {
 	.name		= "overlay",
 	.mount		= ovl_mount,
 	.kill_sb	= kill_anon_super,
-	.fs_flags	= FS_HAS_DOPS_WRAPPER,
+	.fs_flags	= FS_HAS_DOPS_WRAPPER | FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("overlay");
 
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -22,6 +22,7 @@
 #include <linux/syscalls.h>
 #include <linux/fcntl.h>
 #include <linux/aio.h>
+#include <linux/memcontrol.h>
 
 #include <asm/uaccess.h>
 #include <asm/ioctls.h>
@@ -229,12 +230,26 @@ static void anon_pipe_buf_release(struct pipe_inode_info *pipe,
 	 * temporary page, let's keep track of it as a one-deep
 	 * allocation cache. (Otherwise just release our reference to it)
 	 */
-	if (page_count(page) == 1 && !pipe->tmp_page)
+	if (page_count(page) == 1 && !pipe->tmp_page) {
 		pipe->tmp_page = page;
-	else
+	} else {
 		page_cache_release(page);
+	}
 }
 
+static int anon_pipe_buf_steal(struct pipe_inode_info *pipe,
+			       struct pipe_buffer *buf)
+{
+	struct page *page = buf->page;
+
+	if (page_count(page) == 1) {
+		memcg_kmem_uncharge_pages(page, 0);
+		lock_page(page);
+		return 0;
+	}
+	return 1;
+}
+
 /**
  * generic_pipe_buf_map - virtually map a pipe buffer
  * @pipe:	the pipe that the buffer belongs to
@@ -365,7 +378,7 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = {
 	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
+	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
 
@@ -375,7 +388,7 @@ static const struct pipe_buf_operations packet_pipe_buf_ops = {
 	.unmap = generic_pipe_buf_unmap,
 	.confirm = generic_pipe_buf_confirm,
 	.release = anon_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
+	.steal = anon_pipe_buf_steal,
 	.get = generic_pipe_buf_get,
 };
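
A hedged note on the ->steal contract that anon_pipe_buf_steal() now implements for both ops tables: return 0 with the page locked when the caller may take ownership (only when the pipe holds the sole reference), or 1 to force a copy. Since anonymous pipe pages are now charged as kernel memory (__GFP_ACCOUNT above), the memcg charge must be dropped before the page changes owner, or it would outlive the pipe and leak. An annotated restatement of the callback:

	if (page_count(page) == 1) {
		/* sole user: safe to hand the page over */
		memcg_kmem_uncharge_pages(page, 0);	/* drop charge first */
		lock_page(page);	/* contract: stolen pages are locked */
		return 0;
	}
	return 1;	/* shared: refuse, caller copies */
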
 
@@ -593,7 +606,7 @@ redo1:
 			size_t remaining;
 
 			if (!page) {
-				page = alloc_page(GFP_HIGHUSER);
+				page = alloc_pages(GFP_HIGHUSER | __GFP_ACCOUNT, 0);
 				if (unlikely(!page)) {
 					ret = ret ? : -ENOMEM;
 					break;
@@ -822,7 +835,7 @@ struct pipe_inode_info *alloc_pipe_info(void)
 {
 	struct pipe_inode_info *pipe;
 
-	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL);
+	pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL_ACCOUNT);
 	if (pipe) {
 		unsigned long pipe_bufs = PIPE_DEF_BUFFERS;
 		struct user_struct *user = get_current_user();
--- a/fs/posix_acl.c
+++ b/fs/posix_acl.c
@@ -418,3 +418,34 @@ posix_acl_chmod(struct posix_acl **acl, gfp_t gfp, umode_t mode)
 	return err;
 }
 EXPORT_SYMBOL(posix_acl_chmod);
+
+/**
+ * posix_acl_update_mode  -  update mode in set_acl
+ *
+ * Update the file mode when setting an ACL: compute the new file permission
+ * bits based on the ACL.  In addition, if the ACL is equivalent to the new
+ * file mode, set *acl to NULL to indicate that no ACL should be set.
+ *
+ * As with chmod, clear the setgid bit if the caller is not in the owning group
+ * or capable of CAP_FSETID (see inode_change_ok).
+ *
+ * Called from set_acl inode operations.
+ */
+int posix_acl_update_mode(struct inode *inode, umode_t *mode_p,
+			  struct posix_acl **acl)
+{
+	umode_t mode = inode->i_mode;
+	int error;
+
+	error = posix_acl_equiv_mode(*acl, &mode);
+	if (error < 0)
+		return error;
+	if (error == 0)
+		*acl = NULL;
+	if (!in_group_p(inode->i_gid) &&
+	    !capable_wrt_inode_uidgid(inode, CAP_FSETID))
+		mode &= ~S_ISGID;
+	*mode_p = mode;
+	return 0;
+}
+EXPORT_SYMBOL(posix_acl_update_mode);
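
Callers are expected to honour both outputs, as the ocfs2 hunk earlier does: apply the returned mode, and write the ACL only if it was not collapsed into the mode bits. A minimal hedged sketch of a set_acl implementation; the example_* helpers are hypothetical:

	umode_t mode;
	int error;

	error = posix_acl_update_mode(inode, &mode, &acl);
	if (error)
		return error;
	error = example_set_inode_mode(inode, mode);	/* hypothetical */
	if (error || !acl)	/* acl == NULL: ACL equals the mode bits */
		return error;
	return example_write_acl_xattr(inode, acl);	/* hypothetical */
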
--- a/fs/proc/array.c
+++ b/fs/proc/array.c
@@ -82,6 +82,9 @@
 #include <linux/ptrace.h>
 #include <linux/tracehook.h>
 #include <linux/user_namespace.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
 
 #include <asm/pgtable.h>
 #include <asm/processor.h>
@@ -160,6 +163,18 @@ static inline const char *get_task_state(struct task_struct *tsk)
 	return *p;
 }
 
+static int task_virtual_pid(struct task_struct *t)
+{
+	struct pid *pid;
+
+	pid = task_pid(t);
+	/*
+	 * this will give a wrong result for tasks
+	 * that failed to enter VE, but that's OK
+	 */
+	return pid ? pid->numbers[pid->level].nr : 0;
+}
+
 static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 				struct pid *pid, struct task_struct *p)
 {
@@ -168,17 +183,17 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	int g;
 	struct fdtable *fdt = NULL;
 	const struct cred *cred;
-	pid_t ppid, tpid;
+	pid_t ppid, tpid, vpid;
 
 	rcu_read_lock();
-	ppid = pid_alive(p) ?
-		task_tgid_nr_ns(rcu_dereference(p->real_parent), ns) : 0;
+	ppid = pid_alive(p) ? ve_task_ppid_nr_ns(p, ns) : 0;
 	tpid = 0;
 	if (pid_alive(p)) {
 		struct task_struct *tracer = ptrace_parent(p);
 		if (tracer)
 			tpid = task_pid_nr_ns(tracer, ns);
 	}
+	vpid = task_virtual_pid(p);
 	cred = get_task_cred(p);
 	seq_printf(m,
 		"State:\t%s\n"
@@ -221,6 +236,13 @@ static inline void task_state(struct seq_file *m, struct pid_namespace *ns,
 	put_cred(cred);
 
 	seq_putc(m, '\n');
+
+#ifdef CONFIG_VE
+	rcu_read_lock();
+	seq_printf(m, "envID:\t%s\nVPid:\t%d\n",
+			task_ve_name(p), vpid);
+	rcu_read_unlock();
+#endif
 }
 
 void render_sigset_t(struct seq_file *m, const char *header,
@@ -260,10 +282,10 @@ static void collect_sigign_sigcatch(struct task_struct *p, sigset_t *ign,
 	}
 }
 
-static inline void task_sig(struct seq_file *m, struct task_struct *p)
+void task_sig(struct seq_file *m, struct task_struct *p)
 {
 	unsigned long flags;
-	sigset_t pending, shpending, blocked, ignored, caught;
+	sigset_t pending, shpending, blocked, ignored, caught, saved;
 	int num_threads = 0;
 	unsigned long qsize = 0;
 	unsigned long qlim = 0;
@@ -273,11 +295,13 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	sigemptyset(&blocked);
 	sigemptyset(&ignored);
 	sigemptyset(&caught);
+	sigemptyset(&saved);
 
 	if (lock_task_sighand(p, &flags)) {
 		pending = p->pending.signal;
 		shpending = p->signal->shared_pending.signal;
 		blocked = p->blocked;
+		saved = p->saved_sigmask;
 		collect_sigign_sigcatch(p, &ignored, &caught);
 		num_threads = get_nr_threads(p);
 		rcu_read_lock();  /* FIXME: is this correct? */
@@ -296,6 +320,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
 	render_sigset_t(m, "SigBlk:\t", &blocked);
 	render_sigset_t(m, "SigIgn:\t", &ignored);
 	render_sigset_t(m, "SigCgt:\t", &caught);
+	render_sigset_t(m, "SigSvd:\t", &saved);
 }
 
 static void render_cap_t(struct seq_file *m, const char *header,
@@ -340,6 +365,20 @@ static inline void task_seccomp(struct seq_file *m, struct task_struct *p)
 #endif
 }
 
+#ifdef CONFIG_BEANCOUNTERS
+static inline void ub_dump_task_info(struct task_struct *tsk,
+		char *stsk, int ltsk, char *smm, int lmm)
+{
+	snprintf(stsk, ltsk, "%s", tsk->task_bc.task_ub->ub_name);
+	task_lock(tsk);
+	if (tsk->mm)
+		snprintf(smm, lmm, "%s", tsk->mm->mm_ub->ub_name);
+	else
+		strncpy(smm, "N/A", lmm);
+	task_unlock(tsk);
+}
+#endif
+
 static inline void task_context_switch_counts(struct seq_file *m,
 						struct task_struct *p)
 {
@@ -363,6 +402,9 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 			struct pid *pid, struct task_struct *task)
 {
 	struct mm_struct *mm = get_task_mm(task);
+#ifdef CONFIG_BEANCOUNTERS
+	char tsk_ub_info[64], mm_ub_info[64];
+#endif
 
 	task_name(m, task);
 	task_state(m, ns, pid, task);
@@ -377,6 +419,14 @@ int proc_pid_status(struct seq_file *m, struct pid_namespace *ns,
 	task_cpus_allowed(m, task);
 	cpuset_task_status_allowed(m, task);
 	task_context_switch_counts(m, task);
+#ifdef CONFIG_BEANCOUNTERS
+	ub_dump_task_info(task,
+			tsk_ub_info, sizeof(tsk_ub_info),
+			mm_ub_info, sizeof(mm_ub_info));
+
+	seq_printf(m, "TaskUB:\t%s\n", tsk_ub_info);
+	seq_printf(m, "MMUB:\t%s\n", mm_ub_info);
+#endif
 	return 0;
 }
 
@@ -400,6 +450,11 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	unsigned long rsslim = 0;
 	char tcomm[sizeof(task->comm)];
 	unsigned long flags;
+	int is_super = ve_is_super(get_exec_env());
+#ifdef CONFIG_BEANCOUNTERS
+	char ub_task_info[64];
+	char ub_mm_info[64];
+#endif
 
 	state = *get_task_state(task);
 	vsize = eip = esp = 0;
@@ -457,7 +512,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 		}
 
 		sid = task_session_nr_ns(task, ns);
-		ppid = task_tgid_nr_ns(task->real_parent, ns);
+		ppid = ve_task_ppid_nr_ns(task, ns);
 		pgid = task_pgrp_nr_ns(task, ns);
 
 		unlock_task_sighand(task, &flags);
@@ -482,9 +537,28 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	start_time =
 		(unsigned long long)task->real_start_time.tv_sec * NSEC_PER_SEC
 				+ task->real_start_time.tv_nsec;
+#ifdef CONFIG_VE
+	if (!is_super) {
+		struct timespec *ve_start_ts =
+				&get_exec_env()->real_start_timespec;
+		start_time -=
+			(unsigned long long)ve_start_ts->tv_sec * NSEC_PER_SEC
+				+ ve_start_ts->tv_nsec;
+	}
+	/* tasks inside a CT can have negative start time e.g. if the CT was
+	 * migrated from another hw node, in which case we will report 0 in
+	 * order not to confuse userspace */
+	if ((s64)start_time < 0)
+		start_time = 0;
+#endif
 	/* convert nsec -> ticks */
 	start_time = nsec_to_clock_t(start_time);
 
+#ifdef CONFIG_BEANCOUNTERS
+	ub_dump_task_info(task, ub_task_info, sizeof(ub_task_info),
+			ub_mm_info, sizeof(ub_mm_info));
+#endif
+
 	seq_printf(m, "%d (%s) %c", pid_nr_ns(pid, ns), tcomm, state);
 	seq_put_decimal_ll(m, ' ', ppid);
 	seq_put_decimal_ll(m, ' ', pgid);
@@ -525,7 +599,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	seq_put_decimal_ull(m, ' ', 0);
 	seq_put_decimal_ull(m, ' ', 0);
 	seq_put_decimal_ll(m, ' ', task->exit_signal);
-	seq_put_decimal_ll(m, ' ', task_cpu(task));
+	seq_put_decimal_ll(m, ' ', is_super ? task_cpu(task) : task_vcpu_id(task));
 	seq_put_decimal_ull(m, ' ', task->rt_priority);
 	seq_put_decimal_ull(m, ' ', task->policy);
 	seq_put_decimal_ull(m, ' ', delayacct_blkio_ticks(task));
@@ -548,6 +622,18 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns,
 	else
 		seq_put_decimal_ll(m, ' ', 0);
 
+#ifdef CONFIG_VE
+	seq_printf(m, " %s", " 0 0 0 0 0");
+	seq_put_decimal_ll(m, ' ', task_pid_nr_ns(task, task_active_pid_ns(task)));
+	rcu_read_lock();
+	seq_printf(m, " %s", task_ve_name(task));
+	rcu_read_unlock();
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+	seq_printf(m, " %s", ub_task_info);
+	seq_printf(m, " %s", ub_mm_info);
+#endif
+
 	seq_putc(m, '\n');
 	if (mm)
 		mmput(mm);
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -87,6 +87,7 @@
 #include <linux/slab.h>
 #include <linux/flex_array.h>
 #include <linux/posix-timers.h>
+#include <linux/aio.h>
 #ifdef CONFIG_HARDWALL
 #include <asm/hardwall.h>
 #endif
@@ -608,10 +609,14 @@ static int proc_oom_score(struct task_struct *task, char *buffer)
 {
 	unsigned long totalpages = totalram_pages + total_swap_pages;
 	unsigned long points = 0;
+	struct user_beancounter *ub = get_exec_ub();
+
+	if (ub != get_ub0())
+		totalpages = min(totalpages, ub_total_pages(ub, true));
 
 	tasklist_read_lock();
 	if (pid_alive(task))
-		points = oom_badness(task, NULL, NULL, totalpages) *
+		points = oom_badness(task, NULL, NULL, totalpages, NULL) *
 						1000 / totalpages;
 	read_unlock(&tasklist_lock);
 	return sprintf(buffer, "%lu\n", points);
@@ -718,17 +723,36 @@ static int proc_pid_syscall(struct task_struct *task, char *buffer)
 static int proc_fd_access_allowed(struct inode *inode)
 {
 	struct task_struct *task;
-	int allowed = 0;
+	int err;
+
 	/* Allow access to a task's file descriptors if it is us or we
 	 * may use ptrace attach to the process and find out that
 	 * information.
 	 */
+	err = -ENOENT;
 	task = get_proc_task(inode);
 	if (task) {
-		allowed = ptrace_may_access(task, PTRACE_MODE_READ);
+		if (task->flags & PF_KTHREAD)
+			/*
+			 * Always allow access to kernel threads' /proc entries.
+			 */
+			err = 0;
+		else if (ptrace_may_access(task, PTRACE_MODE_READ))
+			err = 0;
+		else
+			/*
+			 * This clever ptrace_may_access() may play a trick
+			 * on us. If the task is a zombie it will consider
+			 * the task to be not dumpable at all and will deny
+			 * any ptracing in VE. Not a big deal for ptrace(),
+			 * but following the link will then fail with
+			 * -EACCES. Some software cannot stand such a
+			 * swindle and refuses to work :(
+			 */
+			err = (task->mm ? -EACCES : -ENOENT);
 		put_task_struct(task);
 	}
-	return allowed;
+	return err;
 }
 
 int proc_setattr(struct dentry *dentry, struct iattr *attr)
@@ -763,6 +787,14 @@ static bool has_pid_permissions(struct pid_namespace *pid,
 	return ptrace_may_access(task, PTRACE_MODE_READ);
 }
 
+static bool is_visible_task(struct pid_namespace *ns, struct task_struct *tsk)
+{
+	if (ns->hide_pidns == 1 && task_active_pid_ns(tsk) != ns)
+		return false;
+	if (!has_pid_permissions(ns, tsk, 2))
+		return false;
+	return true;
+}
 
 static int proc_pid_permission(struct inode *inode, int mask)
 {
@@ -1186,7 +1218,7 @@ static ssize_t oom_score_adj_read(struct file *file, char __user *buf,
 	if (!task)
 		return -ESRCH;
 	if (lock_task_sighand(task, &flags)) {
-		oom_score_adj = task->signal->oom_score_adj;
+		oom_score_adj = get_task_oom_score_adj(task);
 		unlock_task_sighand(task, &flags);
 	}
 	put_task_struct(task);
@@ -1238,7 +1270,7 @@ static ssize_t oom_score_adj_write(struct file *file, const char __user *buf,
 	}
 
 	if ((short)oom_score_adj < task->signal->oom_score_adj_min &&
-			!capable(CAP_SYS_RESOURCE)) {
+			!ve_capable(CAP_SYS_RESOURCE)) {
 		err = -EACCES;
 		goto err_sighand;
 	}
@@ -1627,10 +1659,11 @@ static void *proc_pid_follow_link(struct dentry *dentry, struct nameidata *nd)
 {
 	struct inode *inode = dentry->d_inode;
 	struct path path;
-	int error = -EACCES;
+	int error;
 
 	/* Are we allowed to snoop on the tasks file descriptors? */
-	if (!proc_fd_access_allowed(inode))
+	error = proc_fd_access_allowed(inode);
+	if (error < 0)
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
@@ -1669,12 +1702,13 @@ static int do_proc_readlink(struct path *path, char __user *buffer, int buflen)
 
 static int proc_pid_readlink(struct dentry * dentry, char __user * buffer, int buflen)
 {
-	int error = -EACCES;
+	int error;
 	struct inode *inode = dentry->d_inode;
 	struct path path;
 
 	/* Are we allowed to snoop on the tasks file descriptors? */
-	if (!proc_fd_access_allowed(inode))
+	error = proc_fd_access_allowed(inode);
+	if (error < 0)
 		goto out;
 
 	error = PROC_I(inode)->op.proc_get_link(dentry, &path);
@@ -2319,8 +2353,31 @@ static const struct file_operations proc_timers_operations = {
 	.llseek		= seq_lseek,
 	.release	= seq_release_private,
 };
 #endif /* CONFIG_CHECKPOINT_RESTORE */
 
+#ifdef CONFIG_VE
+static long proc_aio_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct inode *inode = file_inode(file);
+	struct task_struct *task;
+	int ret;
+
+	task = get_proc_task(inode);
+	if (!task)
+		return -ESRCH;
+
+	ret = ve_aio_ioctl(task, cmd, arg);
+
+	put_task_struct(task);
+
+	return ret;
+}
+
+static const struct file_operations proc_aio_operations = {
+	.unlocked_ioctl		= proc_aio_ioctl,
+};
+#endif /* CONFIG_VE */
+
 static struct dentry *proc_pident_instantiate(struct inode *dir,
 	struct dentry *dentry, struct task_struct *task, const void *ptr)
 {
@@ -2943,6 +3002,7 @@ static const struct pid_entry tgid_base_stuff[] = {
 #endif
 #ifdef CONFIG_CHECKPOINT_RESTORE
 	REG("timers",	  S_IRUGO, proc_timers_operations),
+	REG("aio",	  S_IRUGO|S_IWUSR, proc_aio_operations),
 #endif
 };
 
@@ -3196,7 +3256,7 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	for (iter = next_tgid(ns, iter);
 	     iter.task;
 	     iter.tgid += 1, iter = next_tgid(ns, iter)) {
-		if (has_pid_permissions(ns, iter.task, 2))
+		if (is_visible_task(ns, iter.task))
 			__filldir = filldir;
 		else
 			__filldir = fake_filldir;
--- a/fs/proc/cmdline.c
+++ b/fs/proc/cmdline.c
@@ -2,10 +2,12 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 
 static int cmdline_proc_show(struct seq_file *m, void *v)
 {
-	seq_printf(m, "%s\n", saved_command_line);
+	seq_printf(m, "%s\n",
+		ve_is_super(get_exec_env()) ? saved_command_line : "quiet");
 	return 0;
 }
 
@@ -23,7 +25,7 @@ static const struct file_operations cmdline_proc_fops = {
 
 static int __init proc_cmdline_init(void)
 {
-	proc_create("cmdline", 0, NULL, &cmdline_proc_fops);
+	proc_create("cmdline", S_ISVTX, NULL, &cmdline_proc_fops);
 	return 0;
 }
 module_init(proc_cmdline_init);
--- a/fs/proc/cpuinfo.c
+++ b/fs/proc/cpuinfo.c
@@ -18,7 +18,7 @@ static const struct file_operations proc_cpuinfo_operations = {
 
 static int __init proc_cpuinfo_init(void)
 {
-	proc_create("cpuinfo", 0, NULL, &proc_cpuinfo_operations);
+	proc_create("cpuinfo", S_ISVTX, NULL, &proc_cpuinfo_operations);
 	return 0;
 }
 module_init(proc_cpuinfo_init);
--- a/fs/proc/devices.c
+++ b/fs/proc/devices.c
@@ -2,6 +2,7 @@
 #include <linux/init.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
+#include <linux/sched.h>
 
 static int devinfo_show(struct seq_file *f, void *v)
 {
@@ -64,7 +65,7 @@ static const struct file_operations proc_devinfo_operations = {
 
 static int __init proc_devices_init(void)
 {
-	proc_create("devices", 0, NULL, &proc_devinfo_operations);
+	proc_create("devices", S_ISVTX, NULL, &proc_devinfo_operations);
 	return 0;
 }
 module_init(proc_devices_init);
--- a/fs/proc/fd.c
+++ b/fs/proc/fd.c
@@ -162,6 +162,7 @@ static int proc_fd_link(struct dentry *dentry, struct path *path)
 
 		spin_lock(&files->file_lock);
 		fd_file = fcheck_files(files, fd);
+		ret = -EACCES;
 		if (fd_file) {
 			*path = fd_file->f_path;
 			path_get(&fd_file->f_path);
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -100,12 +100,21 @@ bool pde_subdir_insert(struct proc_dir_entry *dir,
 	return true;
 }
 
+bool proc_in_container(struct super_block *sb)
+{
+	return !ve_is_super(get_exec_env());
+}
+
 static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 {
 	struct inode *inode = dentry->d_inode;
 	struct proc_dir_entry *de = PDE(inode);
 	int error;
 
+	if (proc_in_container(dentry->d_sb) &&
+	    (iattr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)))
+		return -EPERM;
+
 	error = inode_change_ok(inode, iattr);
 	if (error)
 		return error;
@@ -113,9 +122,14 @@ static int proc_notify_change(struct dentry *dentry, struct iattr *iattr)
 	setattr_copy(inode, iattr);
 	mark_inode_dirty(inode);
 
-	de->uid = inode->i_uid;
-	de->gid = inode->i_gid;
-	de->mode = inode->i_mode;
+	if (iattr->ia_valid & ATTR_UID)
+		de->uid = inode->i_uid;
+	if (iattr->ia_valid & ATTR_GID)
+		de->gid = inode->i_gid;
+	if (iattr->ia_valid & ATTR_MODE)
+		de->mode = (de->mode & ~S_IRWXUGO) |
+			   (inode->i_mode & S_IRWXUGO);
+
 	return 0;
 }
 
@@ -259,10 +273,15 @@ struct dentry *proc_lookup_de(struct proc_dir_entry *de, struct inode *dir,
 		struct dentry *dentry)
 {
 	struct inode *inode;
+	bool in_container = proc_in_container(dentry->d_sb);
 
 	spin_lock(&proc_subdir_lock);
 	de = pde_subdir_find(de, dentry->d_name.name, dentry->d_name.len);
 	if (de) {
+		if (in_container && !(de->mode & S_ISVTX)) {
+			spin_unlock(&proc_subdir_lock);
+			return ERR_PTR(-ENOENT);
+		}
 		pde_get(de);
 		spin_unlock(&proc_subdir_lock);
 		inode = proc_get_inode(dir->i_sb, de);
@@ -298,6 +317,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 	int i;
 	struct inode *inode = file_inode(filp);
 	int ret = 0;
+	bool in_container = proc_in_container(filp->f_path.dentry->d_sb);
 
 	ino = inode->i_ino;
 	i = filp->f_pos;
@@ -326,15 +346,22 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 					spin_unlock(&proc_subdir_lock);
 					goto out;
 				}
-				if (!i)
-					break;
+				if (!in_container || (de->mode & S_ISVTX)) {
+					if (!i)
+						break;
+					i--;
+				}
 				de = pde_subdir_next(de);
-				i--;
 			}
 
 			do {
 				struct proc_dir_entry *next;
 
+				if (in_container && !(de->mode & S_ISVTX)) {
+					de = pde_subdir_next(de);
+					continue;
+				}
+
 				/* filldir passes info to user space */
 				pde_get(de);
 				spin_unlock(&proc_subdir_lock);
@@ -353,7 +380,7 @@ int proc_readdir_de(struct proc_dir_entry *de, struct file *filp, void *dirent,
 	}
 	ret = 1;
 out:
-	return ret;	
+	return ret;
 }
 
 int proc_readdir(struct file *filp, void *dirent, filldir_t filldir)
@@ -406,6 +433,9 @@ static int proc_register(struct proc_dir_entry * dir, struct proc_dir_entry * dp
 		return -EINVAL;
 	}
 
+	if (dir->mode & S_ISGID)
+		dp->mode |= S_ISVTX;
+
 	spin_lock(&proc_subdir_lock);
 
 	dp->parent = dir;
@@ -461,13 +491,12 @@ out:
 	return ent;
 }
 
-struct proc_dir_entry *proc_symlink(const char *name,
+struct proc_dir_entry *proc_symlink_mode(const char *name, umode_t mode,
 		struct proc_dir_entry *parent, const char *dest)
 {
 	struct proc_dir_entry *ent;
 
-	ent = __proc_create(&parent, name,
-			  (S_IFLNK | S_IRUGO | S_IWUGO | S_IXUGO),1);
+	ent = __proc_create(&parent, name, S_IFLNK | mode, 1);
 
 	if (ent) {
 		ent->data = kmalloc((ent->size=strlen(dest))+1, GFP_KERNEL);
@@ -485,7 +514,7 @@ struct proc_dir_entry *proc_symlink(const char *name,
 	}
 	return ent;
 }
-EXPORT_SYMBOL(proc_symlink);
+EXPORT_SYMBOL(proc_symlink_mode);
 
 struct proc_dir_entry *proc_mkdir_data(const char *name, umode_t mode,
 		struct proc_dir_entry *parent, void *data)
@@ -535,7 +564,7 @@ struct proc_dir_entry *proc_create_data(const char *name, umode_t mode,
 		return NULL;
 	}
 
-	if ((mode & S_IALLUGO) == 0)
+	if ((mode & S_IRWXUGO) == 0)
 		mode |= S_IRUGO;
 	pde = __proc_create(&parent, name, mode, 1);
 	if (!pde)
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -103,7 +103,8 @@ void __init proc_init_inodecache(void)
 	proc_inode_cachep = kmem_cache_create("proc_inode_cache",
 					     sizeof(struct proc_inode),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD|SLAB_PANIC),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT|
+						SLAB_PANIC),
 					     init_once);
 }
 
@@ -116,6 +117,8 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
 		seq_printf(seq, ",gid=%u", from_kgid_munged(&init_user_ns, pid->pid_gid));
 	if (pid->hide_pid != 0)
 		seq_printf(seq, ",hidepid=%u", pid->hide_pid);
+	if (pid->hide_pidns)
+		seq_printf(seq, ",hidepidns=%u", pid->hide_pidns);
 
 	return 0;
 }
@@ -382,8 +385,8 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de)
 		inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
 		PROC_I(inode)->pde = de;
 
-		if (de->mode) {
-			inode->i_mode = de->mode;
+		if (de->mode & (S_IFMT | S_IRWXUGO)) {
+			inode->i_mode = de->mode & (S_IFMT | S_IRWXUGO);
 			inode->i_uid = de->uid;
 			inode->i_gid = de->gid;
 		}
@@ -421,7 +424,7 @@ int proc_fill_super(struct super_block *s)
 	s->s_magic = PROC_SUPER_MAGIC;
 	s->s_op = &proc_sops;
 	s->s_time_gran = 1;
-	
+
 	pde_get(&proc_root);
 	root_inode = proc_get_inode(s, &proc_root);
 	if (!root_inode) {
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -35,33 +35,6 @@ struct mempolicy;
  * only, so, changes in this structure should be bypassed by kABI checker, and
  * such changes should not impact of procfs users.
  */
-struct proc_dir_entry {
-	unsigned int low_ino;
-	umode_t mode;
-	nlink_t nlink;
-	kuid_t uid;
-	kgid_t gid;
-	loff_t size;
-	const struct inode_operations *proc_iops;
-	const struct file_operations *proc_fops;
-#ifdef __GENKSYMS__
-	struct proc_dir_entry *next, *parent, *subdir;
-#else
-	struct proc_dir_entry *parent;
-	struct rb_root subdir;
-	struct rb_node subdir_node;
-#endif
-	void *data;
-	atomic_t count;		/* use count */
-	atomic_t in_use;	/* number of callers into module in progress; */
-			/* negative -> it's going away RSN */
-	struct completion *pde_unload_completion;
-	struct list_head pde_openers;	/* who did ->open, but not ->release */
-	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
-	u8 namelen;
-	char name[];
-};
-
 union proc_op {
 	int (*proc_get_link)(struct dentry *, struct path *);
 	int (*proc_read)(struct task_struct *task, char *page);
--- a/fs/proc/kmsg.c
+++ b/fs/proc/kmsg.c
@@ -13,11 +13,13 @@
 #include <linux/proc_fs.h>
 #include <linux/fs.h>
 #include <linux/syslog.h>
+#include <linux/module.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
 
-extern wait_queue_head_t log_wait;
+extern void log_poll_wait(struct file *filp, poll_table *p);
 
 static int kmsg_open(struct inode * inode, struct file * file)
 {
@@ -41,7 +43,7 @@ static ssize_t kmsg_read(struct file *file, char __user *buf,
 
 static unsigned int kmsg_poll(struct file *file, poll_table *wait)
 {
-	poll_wait(file, &log_wait, wait);
+	log_poll_wait(file, wait);
 	if (do_syslog(SYSLOG_ACTION_SIZE_UNREAD, NULL, 0, SYSLOG_FROM_PROC))
 		return POLLIN | POLLRDNORM;
 	return 0;
@@ -58,7 +60,7 @@ static const struct file_operations proc_kmsg_operations = {
 
 static int __init proc_kmsg_init(void)
 {
-	proc_create("kmsg", S_IRUSR, NULL, &proc_kmsg_operations);
+	proc_create("kmsg", S_IRUSR|S_ISVTX, NULL, &proc_kmsg_operations);
 	return 0;
 }
 module_init(proc_kmsg_init);
--- a/fs/proc/loadavg.c
+++ b/fs/proc/loadavg.c
@@ -6,6 +6,7 @@
 #include <linux/seq_file.h>
 #include <linux/seqlock.h>
 #include <linux/time.h>
+#include <linux/ve.h>
 
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
@@ -13,6 +14,15 @@
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
 	unsigned long avnrun[3];
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		int ret;
+		ret = ve_show_loadavg(ve, m);
+		if (ret != -ENOSYS)
+			return ret;
+	}
 
 	get_avenrun(avnrun, FIXED_1/200, 0);
 
@@ -39,7 +49,7 @@ static const struct file_operations loadavg_proc_fops = {
 
 static int __init proc_loadavg_init(void)
 {
-	proc_create("loadavg", 0, NULL, &loadavg_proc_fops);
+	proc_create("loadavg", S_ISVTX, NULL, &loadavg_proc_fops);
 	return 0;
 }
 module_init(proc_loadavg_init);
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -10,19 +10,84 @@
 #include <linux/seq_file.h>
 #include <linux/swap.h>
 #include <linux/vmstat.h>
+#include <linux/virtinfo.h>
+#include <linux/ve.h>
 #include <linux/atomic.h>
 #include <linux/vmalloc.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
+#include <bc/beancounter.h>
 #include "internal.h"
 
 void __attribute__((weak)) arch_report_meminfo(struct seq_file *m)
 {
 }
 
-static int meminfo_proc_show(struct seq_file *m, void *v)
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+
+static int meminfo_proc_show_mi(struct seq_file *m, struct meminfo *mi)
+{
+	seq_printf(m,
+		"MemTotal:       %8lu kB\n"
+		"MemFree:        %8lu kB\n"
+		"Cached:         %8lu kB\n"
+		"Buffers:        %8lu kB\n"
+		"Active:         %8lu kB\n"
+		"Inactive:       %8lu kB\n"
+		"Active(anon):   %8lu kB\n"
+		"Inactive(anon): %8lu kB\n"
+		"Active(file):   %8lu kB\n"
+		"Inactive(file): %8lu kB\n"
+		"Unevictable:    %8lu kB\n"
+		"Mlocked:        %8lu kB\n"
+		"SwapTotal:      %8lu kB\n"
+		"SwapFree:       %8lu kB\n"
+		"Dirty:          %8lu kB\n"
+		"Writeback:      %8lu kB\n"
+		"AnonPages:      %8lu kB\n"
+		"Shmem:          %8lu kB\n"
+		"Slab:           %8lu kB\n"
+		"SReclaimable:   %8lu kB\n"
+		"SUnreclaim:     %8lu kB\n"
+		,
+		K(mi->si->totalram),
+		K(mi->si->freeram),
+		K(mi->cached),
+		K(0L),
+		K(mi->pages[LRU_ACTIVE_ANON]   + mi->pages[LRU_ACTIVE_FILE]),
+		K(mi->pages[LRU_INACTIVE_ANON] + mi->pages[LRU_INACTIVE_FILE]),
+		K(mi->pages[LRU_ACTIVE_ANON]),
+		K(mi->pages[LRU_INACTIVE_ANON]),
+		K(mi->pages[LRU_ACTIVE_FILE]),
+		K(mi->pages[LRU_INACTIVE_FILE]),
+		K(mi->pages[LRU_UNEVICTABLE]),
+		K(mi->locked),
+		K(mi->si->totalswap),
+		K(mi->si->freeswap),
+		K(mi->dirty_pages),
+		K(mi->writeback_pages),
+		K(mi->pages[LRU_ACTIVE_ANON] + mi->pages[LRU_INACTIVE_ANON]),
+		K(mi->shmem),
+		K(mi->slab_reclaimable + mi->slab_unreclaimable),
+		K(mi->slab_reclaimable),
+		K(mi->slab_unreclaimable));
+
+	return 0;
+}
+
+#ifdef CONFIG_TCACHE
+extern unsigned long get_nr_tcache_pages(void);
+#endif
+#ifdef CONFIG_TSWAP
+extern unsigned long get_nr_tswap_pages(void);
+#endif
+
+int meminfo_proc_show_ub(struct seq_file *m, void *v,
+		struct user_beancounter *ub, unsigned long meminfo_val)
 {
+	int ret;
 	struct sysinfo i;
+	struct meminfo mi;
 	unsigned long committed;
 	struct vmalloc_info vmi;
 	long cached;
@@ -30,12 +95,23 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 	unsigned long pages[NR_LRU_LISTS];
 	int lru;
 
+	si_meminfo(&i);
+	si_swapinfo(&i);
+
+	memset(&mi, 0, sizeof(mi));
+	mi.si = &i;
+	mi.ub = ub;
+	mi.meminfo_val = meminfo_val;
+
+	ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_MEMINFO, &mi);
+	if (ret & NOTIFY_FAIL)
+		return 0;
+	if (ret & NOTIFY_OK)
+		return meminfo_proc_show_mi(m, &mi);
+
 /*
  * display in kilobytes.
  */
-#define K(x) ((x) << (PAGE_SHIFT - 10))
-	si_meminfo(&i);
-	si_swapinfo(&i);
 	committed = percpu_counter_read_positive(&vm_committed_as);
 
 	cached = global_page_state(NR_FILE_PAGES) -
@@ -106,6 +182,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 		"AnonHugePages:  %8lu kB\n"
 #endif
+#ifdef CONFIG_TCACHE
+		"Tcache:         %8lu kB\n"
+#endif
+#ifdef CONFIG_TSWAP
+		"Tswap:          %8lu kB\n"
+#endif
 		,
 		K(i.totalram),
 		K(i.freeram),
@@ -167,6 +249,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		,K(global_page_state(NR_ANON_TRANSPARENT_HUGEPAGES) *
 		   HPAGE_PMD_NR)
 #endif
+#ifdef CONFIG_TCACHE
+		,K(get_nr_tcache_pages())
+#endif
+#ifdef CONFIG_TSWAP
+		,K(get_nr_tswap_pages())
+#endif
 		);
 
 	hugetlb_report_meminfo(m);
@@ -177,6 +265,12 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 #undef K
 }
 
+static int meminfo_proc_show(struct seq_file *m, void *v)
+{
+	return meminfo_proc_show_ub(m, v, mm_ub(current->mm),
+			get_exec_env()->meminfo_val);
+}
+
 static int meminfo_proc_open(struct inode *inode, struct file *file)
 {
 	return single_open(file, meminfo_proc_show, NULL);
@@ -191,7 +285,7 @@ static const struct file_operations meminfo_proc_fops = {
 
 static int __init proc_meminfo_init(void)
 {
-	proc_create("meminfo", 0, NULL, &meminfo_proc_fops);
+	proc_create("meminfo", S_ISVTX, NULL, &meminfo_proc_fops);
 	return 0;
 }
 module_init(proc_meminfo_init);
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -8,6 +8,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
+#include <linux/memcontrol.h>
 #include <linux/kernel-page-flags.h>
 #include <asm/uaccess.h>
 #include "internal.h"
@@ -215,10 +216,62 @@ static const struct file_operations proc_kpageflags_operations = {
 	.read = kpageflags_read,
 };
 
+#ifdef CONFIG_MEMCG
+static ssize_t kpagecgroup_read(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	u64 __user *out = (u64 __user *)buf;
+	struct page *ppage;
+	unsigned long src = *ppos;
+	unsigned long pfn;
+	ssize_t ret = 0;
+	u64 ino;
+
+	pfn = src / KPMSIZE;
+	count = min_t(unsigned long, count, (max_pfn * KPMSIZE) - src);
+	if (src & KPMMASK || count & KPMMASK)
+		return -EINVAL;
+
+	while (count > 0) {
+		if (pfn_valid(pfn))
+			ppage = pfn_to_page(pfn);
+		else
+			ppage = NULL;
+
+		if (ppage)
+			ino = page_cgroup_ino(ppage);
+		else
+			ino = 0;
+
+		if (put_user(ino, out)) {
+			ret = -EFAULT;
+			break;
+		}
+
+		pfn++;
+		out++;
+		count -= KPMSIZE;
+	}
+
+	*ppos += (char __user *)out - buf;
+	if (!ret)
+		ret = (char __user *)out - buf;
+	return ret;
+}
+
+static const struct file_operations proc_kpagecgroup_operations = {
+	.llseek = mem_lseek,
+	.read = kpagecgroup_read,
+};
+#endif /* CONFIG_MEMCG */
+
 static int __init proc_page_init(void)
 {
 	proc_create("kpagecount", S_IRUSR, NULL, &proc_kpagecount_operations);
 	proc_create("kpageflags", S_IRUSR, NULL, &proc_kpageflags_operations);
+#ifdef CONFIG_MEMCG
+	proc_create("kpagecgroup", S_IRUSR, NULL, &proc_kpagecgroup_operations);
+#endif
 	return 0;
 }
 module_init(proc_page_init);
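
Like kpagecount and kpageflags, the new /proc/kpagecgroup is a root-only (S_IRUSR) array of u64 values indexed by page frame number; each entry holds the inode number of the memory cgroup a page is charged to (0 for invalid pfns). A hedged userspace sketch:

	/* print the owning memcg inode for one pfn; error handling trimmed */
	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		unsigned long pfn = strtoul(argv[1], NULL, 0);
		uint64_t ino = 0;
		int fd = open("/proc/kpagecgroup", O_RDONLY);

		if (fd < 0)
			return 1;
		pread(fd, &ino, sizeof(ino), pfn * sizeof(ino));
		printf("pfn %lu -> memcg inode %llu\n",
		       pfn, (unsigned long long)ino);
		close(fd);
		return 0;
	}

The inode number can then be matched against the memory cgroup directories in cgroupfs.
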
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -183,6 +183,15 @@ const struct file_operations proc_net_operations = {
 	.readdir	= proc_tgid_net_readdir,
 };
 
+struct proc_dir_entry *proc_net_create_data(const char *name, umode_t mode,
+					    struct proc_dir_entry *parent,
+					    const struct file_operations *fops,
+					    void *data)
+{
+	return proc_create_data(name, S_ISVTX | mode, parent, fops, data);
+}
+EXPORT_SYMBOL_GPL(proc_net_create_data);
+
 static __net_init int proc_net_ns_init(struct net *net)
 {
 	struct proc_dir_entry *netd, *net_statd;
@@ -198,6 +207,7 @@ static __net_init int proc_net_ns_init(struct net *net)
 	netd->nlink = 2;
 	netd->namelen = 3;
 	netd->parent = &proc_root;
+	netd->mode = S_ISGID;
 	memcpy(netd->name, "net", 4);
 
 	err = -EEXIST;
@@ -228,7 +238,7 @@ static struct pernet_operations __net_initdata proc_net_ns_ops = {
 
 int __init proc_net_init(void)
 {
-	proc_symlink("net", NULL, "self/net");
+	proc_symlink_mode("net", S_ISVTX | S_IRWXUGO, NULL, "self/net");
 
 	return register_pernet_subsys(&proc_net_ns_ops);
 }
--- a/fs/proc/proc_sysctl.c
+++ b/fs/proc/proc_sysctl.c
@@ -9,6 +9,7 @@
 #include <linux/security.h>
 #include <linux/sched.h>
 #include <linux/namei.h>
+#include <linux/nsproxy.h>
 #include <linux/mm.h>
 #include <linux/module.h>
 #include "internal.h"
@@ -35,6 +36,16 @@ static struct ctl_table root_table[] = {
 	},
 	{ }
 };
+
+static int sysctl_root_permissions(struct ctl_table_header *head,
+		struct ctl_table *table)
+{
+	if (ve_is_super(get_exec_env()) || (table->mode & S_ISVTX))
+		return table->mode;
+
+	return table->mode & ~S_IWUGO;
+}
+
 static struct ctl_table_root sysctl_table_root = {
 	.default_set.dir.header = {
 		{{.count = 1,
@@ -44,6 +55,7 @@ static struct ctl_table_root sysctl_table_root = {
 		.root = &sysctl_table_root,
 		.set = &sysctl_table_root.default_set,
 	},
+	.permissions = sysctl_root_permissions,
 };
 
 static DEFINE_SPINLOCK(sysctl_lock);
@@ -410,7 +422,7 @@ static struct inode *proc_sys_make_inode(struct super_block *sb,
 	ei->sysctl_entry = table;
 
 	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-	inode->i_mode = table->mode;
+	inode->i_mode = table->mode & S_IRWXUGO;
 	if (!S_ISDIR(table->mode)) {
 		inode->i_mode |= S_IFREG;
 		inode->i_op = &proc_sys_inode_operations;
@@ -747,13 +759,23 @@ static int proc_sys_getattr(struct vfsmount *mnt, struct dentry *dentry, struct
 	struct inode *inode = dentry->d_inode;
 	struct ctl_table_header *head = grab_header(inode);
 	struct ctl_table *table = PROC_I(inode)->sysctl_entry;
+	struct ctl_table_root *root;
 
 	if (IS_ERR(head))
 		return PTR_ERR(head);
 
+	root = head->root;
+
 	generic_fillattr(inode, stat);
-	if (table)
-		stat->mode = (stat->mode & S_IFMT) | table->mode;
+
+	if (table) {
+		umode_t mode = table->mode;
+
+		if (root->permissions)
+			mode = root->permissions(head, table);
+
+		stat->mode = (stat->mode & S_IFMT) | (mode & S_IRWXUGO);
+	}
 
 	sysctl_head_finish(head);
 	return 0;
@@ -1023,11 +1045,13 @@ static int sysctl_check_table(const char *path, struct ctl_table *table)
 				err = sysctl_err(path, table, "No data");
 			if (!table->maxlen)
 				err = sysctl_err(path, table, "No maxlen");
+			if (table->mode & S_ISVTX)
+				err = sysctl_err(path, table, "Unsafe v12n");
 		}
 		if (!table->proc_handler)
 			err = sysctl_err(path, table, "No proc_handler");
 
-		if ((table->mode & (S_IRUGO|S_IWUGO)) != table->mode)
+		if ((table->mode & (S_IRUGO|S_IWUGO|S_ISVTX)) != table->mode)
 			err = sysctl_err(path, table, "bogus .mode 0%o",
 				table->mode);
 	}
@@ -1590,7 +1614,7 @@ int __init proc_sys_init(void)
 {
 	struct proc_dir_entry *proc_sys_root;
 
-	proc_sys_root = proc_mkdir("sys", NULL);
+	proc_sys_root = proc_mkdir_mode("sys", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 	proc_sys_root->proc_iops = &proc_sys_dir_operations;
 	proc_sys_root->proc_fops = &proc_sys_dir_file_operations;
 	proc_sys_root->nlink = 0;
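
With sysctl_root_permissions() above, a table entry keeps its write bits inside a container only when its .mode carries S_ISVTX, and sysctl_check_table() rejects that bit on entries served by the generic data/maxlen handlers, so only handlers that touch per-VE state may be marked. A sketch of a VE-writable entry under those rules; the handler name and table path are hypothetical:

    static int my_ve_handler(struct ctl_table *table, int write,
                 void __user *buffer, size_t *lenp, loff_t *ppos)
    {
        /* must operate on get_exec_env()-local state, never on globals */
        return 0;
    }

    static struct ctl_table my_table[] = {
        {
            .procname     = "my_ve_value",
            .mode         = 0644 | S_ISVTX,  /* stays writable inside a VE */
            .proc_handler = my_ve_handler,
        },
        { }
    };
    /* register_sysctl("kernel", my_table); at init */
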
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -39,11 +39,12 @@ static int proc_set_super(struct super_block *sb, void *data)
 }
 
 enum {
-	Opt_gid, Opt_hidepid, Opt_err,
+	Opt_gid, Opt_hidepid, Opt_hidepidns, Opt_err,
 };
 
 static const match_table_t tokens = {
 	{Opt_hidepid, "hidepid=%u"},
+	{Opt_hidepidns, "hidepidns=%u"},
 	{Opt_gid, "gid=%u"},
 	{Opt_err, NULL},
 };
@@ -79,6 +80,15 @@ static int proc_parse_options(char *options, struct pid_namespace *pid)
 			}
 			pid->hide_pid = option;
 			break;
+		case Opt_hidepidns:
+			if (match_int(&args[0], &option))
+				return 0;
+			if (option < 0 || option > 1) {
+				pr_err("proc: hidepidns value must be between 0 and 1.\n");
+				return 0;
+			}
+			pid->hide_pidns = option;
+			break;
 		default:
 			pr_err("proc: unrecognized mount option \"%s\" "
 			       "or missing value\n", p);
@@ -125,6 +135,8 @@ static struct dentry *proc_mount(struct file_system_type *fs_type,
 	}
 
 	if (!sb->s_root) {
+		sb->s_flags &= ~MS_RDONLY;
+
 		err = proc_fill_super(sb);
 		if (err) {
 			deactivate_locked_super(sb);
@@ -152,7 +164,7 @@ static struct file_system_type proc_fs_type = {
 	.name		= "proc",
 	.mount		= proc_mount,
 	.kill_sb	= proc_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 void __init proc_root_init(void)
@@ -165,16 +177,17 @@ void __init proc_root_init(void)
 		return;
 
 	proc_self_init();
-	proc_symlink("mounts", NULL, "self/mounts");
+	proc_symlink_mode("mounts", S_ISVTX | S_IRWXUGO, NULL, "self/mounts");
 
 	proc_net_init();
 
 #ifdef CONFIG_SYSVIPC
-	proc_mkdir("sysvipc", NULL);
+	proc_mkdir_mode("sysvipc", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 #endif
-	proc_mkdir("fs", NULL);
+	proc_mkdir_mode("fs", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 	proc_mkdir("driver", NULL);
-	proc_mkdir("fs/nfsd", NULL); /* somewhere for the nfsd filesystem to be mounted */
+	/* somewhere for the nfsd filesystem to be mounted */
+	proc_mkdir_mode("fs/nfsd", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
 #if defined(CONFIG_SUN_OPENPROMFS) || defined(CONFIG_SUN_OPENPROMFS_MODULE)
 	/* just give it a mountpoint */
 	proc_mkdir("openprom", NULL);
--- a/fs/proc/self.c
+++ b/fs/proc/self.c
@@ -57,7 +57,7 @@ int proc_setup_self(struct super_block *s)
 		if (inode) {
 			inode->i_ino = self_inum;
 			inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
-			inode->i_mode = S_IFLNK | S_IRWXUGO;
+			inode->i_mode = S_IFLNK | S_IRWXUGO | S_ISVTX;
 			inode->i_uid = GLOBAL_ROOT_UID;
 			inode->i_gid = GLOBAL_ROOT_GID;
 			inode->i_op = &proc_self_inode_operations;
--- a/fs/proc/stat.c
+++ b/fs/proc/stat.c
@@ -4,13 +4,15 @@
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/proc_fs.h>
-#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
 #include <linux/time.h>
 #include <linux/irqnr.h>
 #include <linux/cputime.h>
 #include <linux/tick.h>
+#include <linux/mm.h>
+#include <linux/vmstat.h>
+#include <linux/ve.h>
 
 #ifndef arch_irq_stat_cpu
 #define arch_irq_stat_cpu(cpu) 0
@@ -87,12 +89,22 @@ static int show_stat(struct seq_file *p, void *v)
 	u64 sum_softirq = 0;
 	unsigned int per_softirq_sums[NR_SOFTIRQS] = {0};
 	struct timespec boottime;
+	struct ve_struct *ve;
+
+	getboottime(&boottime);
+	jif = boottime.tv_sec;
+
+	ve = get_exec_env();
+	if (!ve_is_super(ve)) {
+		int ret;
+		ret = ve_show_cpu_stat(ve, p);
+		if (ret != -ENOSYS)
+			return ret;
+	}
 
 	user = nice = system = idle = iowait =
 		irq = softirq = steal = 0;
 	guest = guest_nice = 0;
-	getboottime(&boottime);
-	jif = boottime.tv_sec;
 
 	for_each_possible_cpu(i) {
 		user += kcpustat_cpu(i).cpustat[CPUTIME_USER];
@@ -200,7 +212,7 @@ static const struct file_operations proc_stat_operations = {
 
 static int __init proc_stat_init(void)
 {
-	proc_create("stat", 0, NULL, &proc_stat_operations);
+	proc_create("stat", S_ISVTX, NULL, &proc_stat_operations);
 	return 0;
 }
 module_init(proc_stat_init);
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -13,6 +13,7 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/elf.h>
 #include <asm/uaccess.h>
@@ -293,11 +294,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
 
 	/* We don't show the stack guard page in /proc/maps */
 	start = vma->vm_start;
-	if (stack_guard_page_start(vma, start))
-		start += PAGE_SIZE;
 	end = vma->vm_end;
-	if (stack_guard_page_end(vma, end))
-		end -= PAGE_SIZE;
 
 	seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
 			start,
@@ -453,7 +450,6 @@ struct mem_size_stats {
 	unsigned long anonymous;
 	unsigned long anonymous_thp;
 	unsigned long swap;
-	unsigned long nonlinear;
 	u64 pss;
 	bool check_shmem_swap;
 };
@@ -476,7 +472,6 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 {
 	struct mem_size_stats *mss = walk->private;
 	struct vm_area_struct *vma = mss->vma;
-	pgoff_t pgoff = linear_page_index(vma, addr);
 	struct page *page = NULL;
 	int mapcount;
 
@@ -489,9 +484,6 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 			mss->swap += ptent_size;
 		else if (is_migration_entry(swpent))
 			page = migration_entry_to_page(swpent);
-	} else if (pte_file(ptent)) {
-		if (pte_to_pgoff(ptent) != pgoff)
-			mss->nonlinear += ptent_size;
 	} else if (unlikely(IS_ENABLED(CONFIG_SHMEM) && mss->check_shmem_swap
 			    && pte_none(ptent))) {
 		/* We shouldn't encounter huge pages here */
@@ -515,12 +507,9 @@ static void smaps_pte_entry(pte_t ptent, unsigned long addr,
 	if (PageAnon(page))
 		mss->anonymous += ptent_size;
 
-	if (page->index != pgoff)
-		mss->nonlinear += ptent_size;
-
 	mss->resident += ptent_size;
 	/* Accumulate the size in pages that have been accessed. */
-	if (pte_young(ptent) || PageReferenced(page))
+	if (pte_young(ptent) || page_is_young(page) || PageReferenced(page))
 		mss->referenced += ptent_size;
 	mapcount = page_mapcount(page);
 	if (mapcount >= 2) {
@@ -602,7 +591,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
 		[ilog2(VM_ACCOUNT)]	= "ac",
 		[ilog2(VM_NORESERVE)]	= "nr",
 		[ilog2(VM_HUGETLB)]	= "ht",
-		[ilog2(VM_NONLINEAR)]	= "nl",
 		[ilog2(VM_ARCH_1)]	= "ar",
 		[ilog2(VM_DONTDUMP)]	= "dd",
 #ifdef CONFIG_MEM_SOFT_DIRTY
@@ -703,10 +691,6 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 		   (vma->vm_flags & VM_LOCKED) ?
 			(unsigned long)(mss.pss >> (10 + PSS_SHIFT)) : 0);
 
-	if (vma->vm_flags & VM_NONLINEAR)
-		seq_printf(m, "Nonlinear:      %8lu kB\n",
-				mss.nonlinear >> 10);
-
 	show_smap_vma_flags(m, vma);
 
 	if (m->count < m->size)  /* vma is copied successfully */
@@ -810,8 +794,6 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 		ptent = pte_clear_flags(ptent, _PAGE_SOFT_DIRTY);
 	} else if (is_swap_pte(ptent)) {
 		ptent = pte_swp_clear_soft_dirty(ptent);
-	} else if (pte_file(ptent)) {
-		ptent = pte_file_clear_soft_dirty(ptent);
 	}
 
 	set_pte_at(vma->vm_mm, addr, pte, ptent);
@@ -849,6 +831,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 
 		/* Clear accessed and referenced bits. */
 		ptep_test_and_clear_young(vma, addr, pte);
+		test_and_clear_page_young(page);
 		ClearPageReferenced(page);
 	}
 	pte_unmap_unlock(pte - 1, ptl);
@@ -934,6 +917,10 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				continue;
 			if (type == CLEAR_REFS_MAPPED && !vma->vm_file)
 				continue;
+			if (type == CLEAR_REFS_SOFT_DIRTY) {
+				if (vma->vm_flags & VM_SOFTDIRTY)
+					vma->vm_flags &= ~VM_SOFTDIRTY;
+			}
 			walk_page_range(vma->vm_start, vma->vm_end,
 					&clear_refs_walk);
 		}
@@ -1317,6 +1304,10 @@ out:
 
 static int pagemap_open(struct inode *inode, struct file *file)
 {
+	/* do not disclose physical addresses: attack vector */
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
 	pr_warn_once("Bits 55-60 of /proc/PID/pagemap entries are about "
 			"to stop being page-shift some time soon. See the "
 			"linux/Documentation/vm/pagemap.txt for details.\n");
--- a/fs/proc/uptime.c
+++ b/fs/proc/uptime.c
@@ -1,16 +1,15 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/proc_fs.h>
-#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/time.h>
 #include <linux/kernel_stat.h>
 #include <linux/cputime.h>
+#include <linux/ve.h>
+#include <linux/cgroup.h>
 
-static int uptime_proc_show(struct seq_file *m, void *v)
+static inline void get_ve0_idle(struct timespec *idle)
 {
-	struct timespec uptime;
-	struct timespec idle;
 	u64 idletime;
 	u64 nsec;
 	u32 rem;
@@ -20,10 +19,38 @@ static int uptime_proc_show(struct seq_file *m, void *v)
 	for_each_possible_cpu(i)
 		idletime += (__force u64) kcpustat_cpu(i).cpustat[CPUTIME_IDLE];
 
-	get_monotonic_boottime(&uptime);
 	nsec = cputime64_to_jiffies64(idletime) * TICK_NSEC;
-	idle.tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
-	idle.tv_nsec = rem;
+	idle->tv_sec = div_u64_rem(nsec, NSEC_PER_SEC, &rem);
+	idle->tv_nsec = rem;
+}
+
+static inline void get_veX_idle(struct ve_struct *ve, struct timespec *idle)
+{
+	struct kernel_cpustat kstat;
+
+	ve_get_cpu_stat(ve, &kstat);
+	cputime_to_timespec(kstat.cpustat[CPUTIME_IDLE], idle);
+}
+
+static int uptime_proc_show(struct seq_file *m, void *v)
+{
+	struct timespec uptime;
+	struct timespec idle;
+	struct ve_struct *ve = get_exec_env();
+
+	if (ve_is_super(ve))
+		get_ve0_idle(&idle);
+	else
+		get_veX_idle(ve, &idle);
+
+	get_monotonic_boottime(&uptime);
+#ifdef CONFIG_VE
+	if (!ve_is_super(ve)) {
+		set_normalized_timespec(&uptime,
+			uptime.tv_sec - ve->real_start_timespec.tv_sec,
+			uptime.tv_nsec - ve->real_start_timespec.tv_nsec);
+	}
+#endif
 	seq_printf(m, "%lu.%02lu %lu.%02lu\n",
 			(unsigned long) uptime.tv_sec,
 			(uptime.tv_nsec / (NSEC_PER_SEC / 100)),
@@ -46,7 +73,7 @@ static const struct file_operations uptime_proc_fops = {
 
 static int __init proc_uptime_init(void)
 {
-	proc_create("uptime", 0, NULL, &uptime_proc_fops);
+	proc_create("uptime", S_ISVTX, NULL, &uptime_proc_fops);
 	return 0;
 }
 module_init(proc_uptime_init);
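
get_ve0_idle() above turns an aggregate idle time in nanoseconds into a timespec with div_u64_rem(), and the seq_printf() output prints it as seconds plus centiseconds. The arithmetic, demonstrated standalone with a sample value:

    #include <stdio.h>
    #include <stdint.h>

    #define NSEC_PER_SEC 1000000000ULL

    int main(void)
    {
        uint64_t nsec = 12345678901234ULL;   /* sample idle time, ns */
        uint64_t sec  = nsec / NSEC_PER_SEC; /* div_u64_rem() quotient */
        uint64_t rem  = nsec % NSEC_PER_SEC; /* ...and remainder */

        /* the "%lu.%02lu" layout /proc/uptime uses */
        printf("%llu.%02llu\n", (unsigned long long)sec,
               (unsigned long long)(rem / (NSEC_PER_SEC / 100)));
        return 0;
    }
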
--- a/fs/proc/version.c
+++ b/fs/proc/version.c
@@ -28,7 +28,7 @@ static const struct file_operations version_proc_fops = {
 
 static int __init proc_version_init(void)
 {
-	proc_create("version", 0, NULL, &version_proc_fops);
+	proc_create("version", S_ISVTX, NULL, &version_proc_fops);
 	return 0;
 }
 module_init(proc_version_init);
--- a/fs/proc_namespace.c
+++ b/fs/proc_namespace.c
@@ -44,6 +44,7 @@ static int show_sb_opts(struct seq_file *m, struct super_block *sb)
 		{ MS_SYNCHRONOUS, ",sync" },
 		{ MS_DIRSYNC, ",dirsync" },
 		{ MS_MANDLOCK, ",mand" },
+		{ MS_LAZYTIME, ",lazytime" },
 		{ 0, NULL }
 	};
 	const struct proc_fs_info *fs_infop;
--- a/fs/qnx4/inode.c
+++ b/fs/qnx4/inode.c
@@ -382,7 +382,7 @@ static int init_inodecache(void)
 	qnx4_inode_cachep = kmem_cache_create("qnx4_inode_cache",
 					     sizeof(struct qnx4_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (qnx4_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/qnx6/inode.c
+++ b/fs/qnx6/inode.c
@@ -642,7 +642,7 @@ static int init_inodecache(void)
 	qnx6_inode_cachep = kmem_cache_create("qnx6_inode_cache",
 					     sizeof(struct qnx6_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (!qnx6_inode_cachep)
 		return -ENOMEM;
--- a/fs/quota/Kconfig
+++ b/fs/quota/Kconfig
@@ -27,7 +27,7 @@ config QUOTA_NETLINK_INTERFACE
 config PRINT_QUOTA_WARNING
 	bool "Print quota warnings to console (OBSOLETE)"
 	depends on QUOTA
-	default y
+	default n
 	help
 	  If you say Y here, quota warnings (about exceeding softlimit, reaching
 	  hardlimit, etc.) will be printed to the process' controlling terminal.
@@ -42,6 +42,15 @@ config QUOTA_DEBUG
 	  If you say Y here, quota subsystem will perform some additional
 	  sanity checks of quota internal structures. If unsure, say N.
 
+config QUOTA_COMPAT
+	bool "Compatibility with older quotactl interface"
+	depends on QUOTA
+	help
+	  This option enables a compatibility layer for an older version
+	  of the quotactl interface with byte granularity (QUOTAON at 0x0100,
+	  GETQUOTA at 0x0D00).  Interface versions older than that one, which
+	  used block granularity, are still not supported.
+
 # Generic support for tree structured quota files. Selected when needed.
 config QUOTA_TREE
 	 tristate
--- a/fs/quota/compat.c
+++ b/fs/quota/compat.c
@@ -51,6 +51,11 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 	compat_uint_t data;
 	u16 xdata;
 	long ret;
+#ifdef CONFIG_QUOTA_COMPAT
+	struct compat_dqblk __user *cdq;
+	struct compat_compat_dqblk __user *compat_cdq;
+	compat_time_t time;
+#endif
 
 	cmds = cmd >> SUBCMDSHIFT;
 
@@ -111,6 +116,43 @@ asmlinkage long sys32_quotactl(unsigned int cmd, const char __user *special,
 			break;
 		ret = 0;
 		break;
+#ifdef CONFIG_QUOTA_COMPAT
+	case QC_GETQUOTA:
+		cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+		compat_cdq = addr;
+		ret = sys_quotactl(cmd, special, id, cdq);
+		if (ret)
+			break;
+		ret = -EFAULT;
+		if (copy_in_user(compat_cdq, cdq, sizeof(struct compat_compat_dqblk) -
+				offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+			copy_in_user(&compat_cdq->dqb_curspace, &cdq->dqb_curspace,
+				sizeof(cdq->dqb_curspace)) ||
+			get_user(time, &cdq->dqb_btime) ||
+			put_user(time, &compat_cdq->dqb_btime) ||
+			get_user(time, &cdq->dqb_itime) ||
+			put_user(time, &compat_cdq->dqb_itime))
+			break;
+		ret = 0;
+		break;
+	case QC_SETQUOTA:
+	case QC_SETUSE:
+	case QC_SETQLIM:
+		cdq = compat_alloc_user_space(sizeof(struct compat_dqblk));
+		compat_cdq = addr;
+		ret = -EFAULT;
+		if (copy_in_user(cdq, compat_cdq, sizeof(struct compat_compat_dqblk) -
+				offsetof(struct compat_compat_dqblk, dqb_curspace)) ||
+			copy_in_user(&cdq->dqb_curspace, &compat_cdq->dqb_curspace,
+				sizeof(cdq->dqb_curspace)) ||
+			get_user(time, &compat_cdq->dqb_btime) ||
+			put_user(time, &cdq->dqb_btime) ||
+			get_user(time, &compat_cdq->dqb_itime) ||
+			put_user(time, &cdq->dqb_itime))
+			break;
+		ret = sys_quotactl(cmd, special, id, cdq);
+		break;
+#endif
 	default:
 		ret = sys_quotactl(cmd, special, id, addr);
 	}
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -246,7 +246,6 @@ static struct hlist_head *dquot_hash;
 struct dqstats dqstats;
 EXPORT_SYMBOL(dqstats);
 
-static qsize_t inode_get_rsv_space(struct inode *inode);
 static void __dquot_initialize(struct inode *inode, int type);
 
 static inline unsigned int
@@ -684,45 +683,39 @@ int dquot_quota_sync(struct super_block *sb, int type)
 }
 EXPORT_SYMBOL(dquot_quota_sync);
 
-/* Free unused dquots from cache */
-static void prune_dqcache(int count)
+static unsigned long
+dqcache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
 	struct list_head *head;
 	struct dquot *dquot;
+	unsigned long freed = 0;
 
+	spin_lock(&dq_list_lock);
 	head = free_dquots.prev;
-	while (head != &free_dquots && count) {
+	while (head != &free_dquots && sc->nr_to_scan) {
 		dquot = list_entry(head, struct dquot, dq_free);
 		remove_dquot_hash(dquot);
 		remove_free_dquot(dquot);
 		remove_inuse(dquot);
 		do_destroy_dquot(dquot);
-		count--;
+		sc->nr_to_scan--;
+		freed++;
 		head = free_dquots.prev;
 	}
+	spin_unlock(&dq_list_lock);
+	return freed;
 }
 
-/*
- * This is called from kswapd when we think we need some
- * more memory
- */
-static int shrink_dqcache_memory(struct shrinker *shrink,
-				 struct shrink_control *sc)
+static unsigned long
+dqcache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
 {
-	int nr = sc->nr_to_scan;
-
-	if (nr) {
-		spin_lock(&dq_list_lock);
-		prune_dqcache(nr);
-		spin_unlock(&dq_list_lock);
-	}
-	return ((unsigned)
-		percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS])
-		/100) * sysctl_vfs_cache_pressure;
+	return vfs_pressure_ratio(
+	percpu_counter_read_positive(&dqstats.counter[DQST_FREE_DQUOTS]));
 }
 
 static struct shrinker dqcache_shrinker = {
-	.shrink = shrink_dqcache_memory,
+	.count_objects = dqcache_shrink_count,
+	.scan_objects = dqcache_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
@@ -1505,13 +1498,14 @@ EXPORT_SYMBOL(dquot_drop);
  * inode_reserved_space is managed internally by quota, and protected by
  * i_lock similar to i_blocks+i_bytes.
  */
-static qsize_t *inode_reserved_space(struct inode * inode)
+qsize_t *inode_reserved_space(struct inode * inode)
 {
 	/* Filesystem must explicitly define its own method in order to use
 	 * quota reservation interface */
 	BUG_ON(!inode->i_sb->dq_op->get_reserved_space);
 	return inode->i_sb->dq_op->get_reserved_space(inode);
 }
+EXPORT_SYMBOL(inode_reserved_space);
 
 void inode_add_rsv_space(struct inode *inode, qsize_t number)
 {
@@ -1547,7 +1541,7 @@ void inode_sub_rsv_space(struct inode *inode, qsize_t number)
 }
 EXPORT_SYMBOL(inode_sub_rsv_space);
 
-static qsize_t inode_get_rsv_space(struct inode *inode)
+qsize_t inode_get_rsv_space(struct inode *inode)
 {
 	qsize_t ret;
 
@@ -1558,8 +1552,9 @@ static qsize_t inode_get_rsv_space(struct inode *inode)
 	spin_unlock(&inode->i_lock);
 	return ret;
 }
+EXPORT_SYMBOL(inode_get_rsv_space);
 
-static void inode_incr_space(struct inode *inode, qsize_t number,
+void inode_incr_space(struct inode *inode, qsize_t number,
 				int reserve)
 {
 	if (reserve)
@@ -1567,14 +1562,16 @@ static void inode_incr_space(struct inode *inode, qsize_t number,
 	else
 		inode_add_bytes(inode, number);
 }
+EXPORT_SYMBOL(inode_incr_space);
 
-static void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
+void inode_decr_space(struct inode *inode, qsize_t number, int reserve)
 {
 	if (reserve)
 		inode_sub_rsv_space(inode, number);
 	else
 		inode_sub_bytes(inode, number);
 }
+EXPORT_SYMBOL(inode_decr_space);
 
 /*
 * This function updates i_blocks+i_bytes fields and quota information
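
The dqcache conversion above follows the generic split of the old ->shrink callback into a count_objects/scan_objects pair: count reports a cheap, pressure-scaled estimate, scan frees up to sc->nr_to_scan objects and returns how many it reclaimed. A minimal sketch of the same pattern for a hypothetical cache; my_freeable_objects() and my_free_one() are assumed helpers:

    static unsigned long my_count(struct shrinker *s, struct shrink_control *sc)
    {
        /* cheap estimate, scaled by vfs_cache_pressure */
        return vfs_pressure_ratio(my_freeable_objects());
    }

    static unsigned long my_scan(struct shrinker *s, struct shrink_control *sc)
    {
        unsigned long freed = 0;

        while (freed < sc->nr_to_scan && my_free_one())
            freed++;
        return freed;   /* or SHRINK_STOP if nothing can be freed now */
    }

    static struct shrinker my_shrinker = {
        .count_objects = my_count,
        .scan_objects  = my_scan,
        .seeks         = DEFAULT_SEEKS,
    };
    /* register_shrinker(&my_shrinker); at init time */
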
--- a/fs/quota/quota.c
+++ b/fs/quota/quota.c
@@ -17,6 +17,7 @@
 #include <linux/quotaops.h>
 #include <linux/types.h>
 #include <linux/writeback.h>
+#include <linux/compat.h>
 
 static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
 				     qid_t id)
@@ -38,7 +39,7 @@ static int check_quotactl_permission(struct super_block *sb, int type, int cmd,
 			break;
 		/*FALLTHROUGH*/
 	default:
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			return -EPERM;
 	}
 
@@ -459,6 +460,181 @@ static struct super_block *quotactl_block(const char __user *special, int cmd)
 #endif
 }
 
+#ifdef CONFIG_QUOTA_COMPAT
+
+asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special, qid_t id, void __user *addr);
+
+static long compat_quotactl(unsigned int cmds, unsigned int type,
+		const char __user *special, qid_t id,
+		void __user *addr)
+{
+	struct super_block *sb;
+	long ret;
+
+	sb = NULL;
+	switch (cmds) {
+		case QC_QUOTAON:
+			return sys_quotactl(QCMD(Q_QUOTAON, type),
+					special, id, addr);
+
+		case QC_QUOTAOFF:
+			return sys_quotactl(QCMD(Q_QUOTAOFF, type),
+					special, id, addr);
+
+		case QC_SYNC:
+			return sys_quotactl(QCMD(Q_SYNC, type),
+					special, id, addr);
+
+		case QC_GETQUOTA: {
+			struct if_dqblk idq;
+			struct fs_disk_quota fdq;
+			struct compat_dqblk cdq;
+			struct kqid qid;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_GETQUOTA, id);
+			if (ret)
+				break;
+			qid = make_kqid(current_user_ns(), type, id);
+			ret = -EINVAL;
+			if (!qid_valid(qid))
+				break;
+			ret = sb->s_qcop->get_dqblk(sb, qid, &fdq);
+			if (ret)
+				break;
+			copy_to_if_dqblk(&idq, &fdq);
+			memset(&cdq, 0, sizeof(cdq));
+			cdq.dqb_ihardlimit = fdq.d_ino_hardlimit;
+			cdq.dqb_isoftlimit = fdq.d_ino_softlimit;
+			cdq.dqb_curinodes = fdq.d_icount;
+			cdq.dqb_bhardlimit = fdq.d_blk_hardlimit;
+			cdq.dqb_bsoftlimit = fdq.d_blk_softlimit;
+			cdq.dqb_curspace = fdq.d_bcount;
+			cdq.dqb_btime = fdq.d_btimer;
+			cdq.dqb_itime = fdq.d_itimer;
+			ret = 0;
+			if (copy_to_user(addr, &cdq, sizeof(cdq)))
+				ret = -EFAULT;
+			break;
+		}
+
+		case QC_SETQUOTA:
+		case QC_SETUSE:
+		case QC_SETQLIM: {
+			struct if_dqblk idq;
+			struct fs_disk_quota fdq;
+			struct compat_dqblk cdq;
+			struct kqid qid;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_SETQUOTA, id);
+			if (ret)
+				break;
+			ret = -EFAULT;
+			if (copy_from_user(&cdq, addr, sizeof(cdq)))
+				break;
+			qid = make_kqid(current_user_ns(), type, id);
+			ret = -EINVAL;
+			if (!qid_valid(qid))
+				break;
+			idq.dqb_ihardlimit = cdq.dqb_ihardlimit;
+			idq.dqb_isoftlimit = cdq.dqb_isoftlimit;
+			idq.dqb_curinodes = cdq.dqb_curinodes;
+			idq.dqb_bhardlimit = cdq.dqb_bhardlimit;
+			idq.dqb_bsoftlimit = cdq.dqb_bsoftlimit;
+			idq.dqb_curspace = cdq.dqb_curspace;
+			idq.dqb_valid = 0;
+			if (cmds == QC_SETQUOTA || cmds == QC_SETQLIM)
+				idq.dqb_valid |= QIF_LIMITS;
+			if (cmds == QC_SETQUOTA || cmds == QC_SETUSE)
+				idq.dqb_valid |= QIF_USAGE;
+			copy_from_if_dqblk(&fdq, &idq);
+			ret = sb->s_qcop->set_dqblk(sb, qid, &fdq);
+			break;
+		}
+
+		case QC_GETINFO: {
+			struct if_dqinfo iinf;
+			struct compat_dqinfo cinf;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_GETINFO, id);
+			if (ret)
+				break;
+			ret = sb->s_qcop->get_info(sb, type, &iinf);
+			if (ret)
+				break;
+
+			memset(&cinf, 0, sizeof(cinf));
+			cinf.dqi_bgrace = iinf.dqi_bgrace;
+			cinf.dqi_igrace = iinf.dqi_igrace;
+			if (iinf.dqi_flags & DQF_INFO_DIRTY)
+				cinf.dqi_flags |= 0x0010;
+			ret = 0;
+			if (copy_to_user(addr, &cinf, sizeof(cinf)))
+				ret = -EFAULT;
+			break;
+		}
+
+		case QC_SETINFO:
+		case QC_SETGRACE:
+		case QC_SETFLAGS: {
+			struct if_dqinfo iinf;
+			struct compat_dqinfo cinf;
+
+			sb = quotactl_block(special, cmds);
+			ret = PTR_ERR(sb);
+			if (IS_ERR(sb))
+				break;
+			ret = check_quotactl_permission(sb, type, Q_SETINFO, id);
+			if (ret)
+				break;
+			ret = -EFAULT;
+			if (copy_from_user(&cinf, addr, sizeof(cinf)))
+				break;
+			iinf.dqi_bgrace = cinf.dqi_bgrace;
+			iinf.dqi_igrace = cinf.dqi_igrace;
+			iinf.dqi_flags = cinf.dqi_flags;
+			iinf.dqi_valid = 0;
+			if (cmds == QC_SETINFO || cmds == QC_SETGRACE)
+				iinf.dqi_valid |= IIF_BGRACE | IIF_IGRACE;
+			if (cmds == QC_SETINFO || cmds == QC_SETFLAGS)
+				iinf.dqi_valid |= IIF_FLAGS;
+			ret = sb->s_qcop->set_info(sb, type, &iinf);
+			break;
+		}
+
+		case QC_GETSTATS: {
+			struct compat_dqstats stat;
+
+			memset(&stat, 0, sizeof(stat));
+			stat.version = 6*10000+5*100+0;
+			ret = 0;
+			if (copy_to_user(addr, &stat, sizeof(stat)))
+				ret = -EFAULT;
+			break;
+		}
+
+		default:
+			ret = -ENOSYS;
+			break;
+	}
+	if (sb && !IS_ERR(sb))
+		drop_super(sb);
+	return ret;
+}
+
+#endif
+
 /*
  * This is the system call interface. This communicates with
  * the user-level programs. Currently this only supports diskquota
@@ -476,6 +652,11 @@ SYSCALL_DEFINE4(quotactl, unsigned int, cmd, const char __user *, special,
 	cmds = cmd >> SUBCMDSHIFT;
 	type = cmd & SUBCMDMASK;
 
+#ifdef CONFIG_QUOTA_COMPAT
+	if (cmds >= 0x0100 && cmds < 0x3000)
+		return compat_quotactl(cmds, type, special, id, addr);
+#endif
+
 	/*
 	 * As a special case Q_SYNC can be called without a specific device.
 	 * It will iterate all superblocks that have quota enabled and call
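
The compat window test splits the command word the same way the quotactl syscall itself does: the high bits select the operation, the low byte the quota type. A standalone demonstration, with SUBCMDSHIFT and SUBCMDMASK copied from the uapi definitions:

    #include <stdio.h>

    #define SUBCMDSHIFT 8
    #define SUBCMDMASK  0x00ff
    #define QCMD(cmd, type) (((cmd) << SUBCMDSHIFT) | ((type) & SUBCMDMASK))

    int main(void)
    {
        unsigned int cmd  = QCMD(0x0100 /* old QC_QUOTAON */, 0 /* USRQUOTA */);
        unsigned int cmds = cmd >> SUBCMDSHIFT;
        unsigned int type = cmd & SUBCMDMASK;

        /* 0x0100 <= cmds < 0x3000 routes to compat_quotactl() */
        printf("cmds=%#x type=%u compat=%d\n",
               cmds, type, cmds >= 0x0100 && cmds < 0x3000);
        return 0;
    }
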
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -23,6 +23,8 @@
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 
+#include <bc/beancounter.h>
+
 typedef ssize_t (*io_fn_t)(struct file *, char __user *, size_t, loff_t *);
 typedef ssize_t (*iov_fn_t)(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t);
--- a/fs/reiserfs/super.c
+++ b/fs/reiserfs/super.c
@@ -600,7 +600,8 @@ static int init_inodecache(void)
 						  sizeof(struct
 							 reiserfs_inode_info),
 						  0, (SLAB_RECLAIM_ACCOUNT|
-							SLAB_MEM_SPREAD),
+						      SLAB_MEM_SPREAD|
+						      SLAB_ACCOUNT),
 						  init_once);
 	if (reiserfs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/reiserfs/xattr_acl.c
+++ b/fs/reiserfs/xattr_acl.c
@@ -286,13 +286,9 @@ reiserfs_set_acl(struct reiserfs_transaction_handle *th, struct inode *inode,
 	case ACL_TYPE_ACCESS:
 		name = POSIX_ACL_XATTR_ACCESS;
 		if (acl) {
-			error = posix_acl_equiv_mode(acl, &inode->i_mode);
-			if (error < 0)
+			error = posix_acl_update_mode(inode, &inode->i_mode, &acl);
+			if (error)
 				return error;
-			else {
-				if (error == 0)
-					acl = NULL;
-			}
 		}
 		break;
 	case ACL_TYPE_DEFAULT:
--- a/fs/romfs/super.c
+++ b/fs/romfs/super.c
@@ -623,8 +623,8 @@ static int __init init_romfs_fs(void)
 	romfs_inode_cachep =
 		kmem_cache_create("romfs_i",
 				  sizeof(struct romfs_inode_info), 0,
-				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
-				  romfs_i_init_once);
+				  SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD |
+				  SLAB_ACCOUNT, romfs_i_init_once);
 
 	if (!romfs_inode_cachep) {
 		printk(KERN_ERR
--- a/fs/select.c
+++ b/fs/select.c
@@ -32,7 +32,6 @@
 
 #include <asm/uaccess.h>
 
-
 /*
  * Estimate expected accuracy in ns from a timeval.
  *
@@ -578,7 +577,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp,
 	if (size > sizeof(stack_fds) / 6) {
 		/* Not enough space in on-stack array; must use kmalloc */
 		ret = -ENOMEM;
-		bits = kmalloc(6 * size, GFP_KERNEL);
+		bits = kmalloc(6 * size, GFP_KERNEL_ACCOUNT);
 		if (!bits)
 			goto out_nofds;
 	}
@@ -901,7 +900,7 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds,
 
 		len = min(todo, POLLFD_PER_PAGE);
 		size = sizeof(struct poll_list) + sizeof(struct pollfd) * len;
-		walk = walk->next = kmalloc(size, GFP_KERNEL);
+		walk = walk->next = kmalloc(size, GFP_KERNEL_ACCOUNT);
 		if (!walk) {
 			err = -ENOMEM;
 			goto out_fds;
--- a/fs/seq_file.c
+++ b/fs/seq_file.c
@@ -28,9 +28,9 @@ static void *seq_buf_alloc(unsigned long size)
 {
 	void *buf;
 
-	buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
+	buf = kmalloc(size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 	if (!buf && size > PAGE_SIZE)
-		buf = vmalloc(size);
+		buf = vmalloc_account(size);
 	return buf;
 }
 
@@ -61,7 +61,7 @@ int seq_open(struct file *file, const struct seq_operations *op)
 	struct seq_file *p = file->private_data;
 
 	if (!p) {
-		p = kmalloc(sizeof(*p), GFP_KERNEL);
+		p = kmalloc(sizeof(*p), GFP_KERNEL_ACCOUNT);
 		if (!p)
 			return -ENOMEM;
 		file->private_data = p;
@@ -604,7 +604,7 @@ static void single_stop(struct seq_file *p, void *v)
 int single_open(struct file *file, int (*show)(struct seq_file *, void *),
 		void *data)
 {
-	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL);
+	struct seq_operations *op = kmalloc(sizeof(*op), GFP_KERNEL_ACCOUNT);
 	int res = -ENOMEM;
 
 	if (op) {
@@ -666,7 +666,7 @@ void *__seq_open_private(struct file *f, const struct seq_operations *ops,
 	void *private;
 	struct seq_file *seq;
 
-	private = kzalloc(psize, GFP_KERNEL);
+	private = kzalloc(psize, GFP_KERNEL_ACCOUNT);
 	if (private == NULL)
 		goto out;
 
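
seq_buf_alloc() above is the usual two-step allocation: try kmalloc with __GFP_NOWARN, since large contiguous buffers are expected to fail sometimes, then fall back to vmalloc past a page. The generic shape of the pattern, with the vz-specific vmalloc_account() replaced by plain vmalloc() for illustration:

    static void *buf_alloc(unsigned long size)
    {
        void *buf;

        /* suppress the allocation-failure splat; falling back is normal */
        buf = kmalloc(size, GFP_KERNEL | __GFP_NOWARN);
        if (!buf && size > PAGE_SIZE)
            buf = vmalloc(size);
        return buf;   /* free with kvfree() or matching kfree()/vfree() */
    }
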
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -33,6 +33,7 @@
 #include <linux/socket.h>
 #include <linux/compat.h>
 #include "internal.h"
+#include <linux/virtinfo.h>
 
 /*
  * Attempt to steal a page from a pipe buffer. This should perhaps go into
@@ -103,6 +104,7 @@ static int page_cache_pipe_buf_confirm(struct pipe_inode_info *pipe,
 	int err;
 
 	if (!PageUptodate(page)) {
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
 		lock_page(page);
 
 		/*
@@ -340,6 +342,15 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 	spd.nr_pages = find_get_pages_contig(mapping, index, nr_pages, spd.pages);
 	index += spd.nr_pages;
 
+	while (spd.nr_pages < nr_pages && mapping->i_peer_file) {
+		page = pick_peer_page(mapping, index, &in->f_ra,
+				      req_pages - spd.nr_pages);
+		if (!page)
+			break;
+		spd.pages[spd.nr_pages++] = page;
+		index++;
+	}
+
 	/*
 	 * If find_get_pages_contig() returned fewer pages than we needed,
 	 * readahead/allocate the rest and fill in the holes.
@@ -409,6 +420,7 @@ __generic_file_splice_read(struct file *in, loff_t *ppos,
 		 * If the page isn't uptodate, we may need to start io on it
 		 */
 		if (!PageUptodate(page)) {
+			virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
 			lock_page(page);
 
 			/*
--- a/fs/squashfs/super.c
+++ b/fs/squashfs/super.c
@@ -417,7 +417,8 @@ static int __init init_inodecache(void)
 {
 	squashfs_inode_cachep = kmem_cache_create("squashfs_inode_cache",
 		sizeof(struct squashfs_inode_info), 0,
-		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT, init_once);
+		SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
+		init_once);
 
 	return squashfs_inode_cachep ? 0 : -ENOMEM;
 }
--- a/fs/statfs.c
+++ b/fs/statfs.c
@@ -7,6 +7,7 @@
 #include <linux/statfs.h>
 #include <linux/security.h>
 #include <linux/uaccess.h>
+#include <linux/device_cgroup.h>
 #include "internal.h"
 
 static int flags_by_mnt(int mnt_flags)
@@ -46,28 +47,37 @@ static int calculate_f_flags(struct vfsmount *mnt)
 		flags_by_sb(mnt->mnt_sb->s_flags);
 }
 
-static int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+static int statfs_by_sb(struct super_block *sb, struct dentry *dentry, struct kstatfs *buf)
 {
 	int retval;
 
-	if (!dentry->d_sb->s_op->statfs)
+	if (!sb->s_op->statfs)
 		return -ENOSYS;
 
 	memset(buf, 0, sizeof(*buf));
-	retval = security_sb_statfs(dentry);
-	if (retval)
-		return retval;
-	retval = dentry->d_sb->s_op->statfs(dentry, buf);
+	retval = sb->s_op->statfs(dentry, buf);
 	if (retval == 0 && buf->f_frsize == 0)
 		buf->f_frsize = buf->f_bsize;
 	return retval;
 }
 
+int statfs_by_dentry(struct dentry *dentry, struct kstatfs *buf)
+{
+	int retval;
+
+	retval = security_sb_statfs(dentry);
+	if (!retval)
+		retval = statfs_by_sb(dentry->d_sb, dentry, buf);
+	return retval;
+}
+
 int vfs_statfs(struct path *path, struct kstatfs *buf)
 {
 	int error;
 
-	error = statfs_by_dentry(path->dentry, buf);
+	error = security_sb_statfs(path->dentry);
+	if (!error)
+		error = statfs_by_sb(path->mnt->mnt_sb, path->dentry, buf);
 	if (!error)
 		buf->f_flags = calculate_f_flags(path->mnt);
 	return error;
@@ -227,9 +237,16 @@ int vfs_ustat(dev_t dev, struct kstatfs *sbuf)
 
 SYSCALL_DEFINE2(ustat, unsigned, dev, struct ustat __user *, ubuf)
 {
+	dev_t kdev = new_decode_dev(dev);
 	struct ustat tmp;
 	struct kstatfs sbuf;
-	int err = vfs_ustat(new_decode_dev(dev), &sbuf);
+	int err;
+
+	err = devcgroup_device_permission(S_IFBLK, kdev, MAY_READ);
+	if (err)
+		return err;
+
+	err = vfs_ustat(kdev, &sbuf);
 	if (err)
 		return err;
 
--- a/fs/super.c
+++ b/fs/super.c
@@ -34,6 +34,7 @@
 #include <linux/cleancache.h>
 #include <linux/fsnotify.h>
 #include <linux/lockdep.h>
+#include <linux/memcontrol.h>
 #include "internal.h"
 
 const unsigned super_block_wrapper_version = 0;
@@ -47,6 +48,24 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
 	"sb_internal",
 };
 
+static bool dcache_is_low(struct mem_cgroup *memcg)
+{
+	unsigned long anon, file, dcache;
+
+	if (sysctl_vfs_cache_min_ratio <= 0)
+		return false;
+
+	if (memcg)
+		return mem_cgroup_dcache_is_low(memcg);
+
+	anon = global_page_state(NR_ANON_PAGES);
+	file = global_page_state(NR_FILE_PAGES);
+	dcache = global_page_state(NR_SLAB_RECLAIMABLE);
+
+	return dcache / sysctl_vfs_cache_min_ratio <
+			(anon + file + dcache) / 100;
+}
+
 /*
  * One thing we have to be careful of with a per-sb shrinker is that we don't
  * drop the last active reference to the superblock from within the shrinker.
@@ -54,11 +73,15 @@ static char *sb_writers_name[SB_FREEZE_LEVELS] = {
  * shrinker path and that leads to deadlock on the shrinker_rwsem. Hence we
  * take a passive reference to the superblock to avoid this from occurring.
  */
-static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long super_cache_scan(struct shrinker *shrink,
+				      struct shrink_control *sc)
 {
 	struct super_block *sb;
-	int	fs_objects = 0;
-	int	total_objects;
+	long	fs_objects = 0;
+	long	total_objects;
+	long	freed = 0;
+	long	dentries;
+	long	inodes;
 
 	sb = container_of(shrink, struct super_block, s_shrink);
 
@@ -66,43 +89,68 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 	 * Deadlock avoidance.  We may hold various FS locks, and we don't want
 	 * to recurse into the FS that called us in clear_inode() and friends..
 	 */
-	if (sc->nr_to_scan && !(sc->gfp_mask & __GFP_FS))
-		return -1;
+	if (!(sc->gfp_mask & __GFP_FS))
+		return SHRINK_STOP;
+
+	if (!grab_super_passive(sb))
+		return SHRINK_STOP;
 
 	if (sb->s_op && sb->s_op->nr_cached_objects)
-		fs_objects = sb->s_op->nr_cached_objects(sb);
-
-	total_objects = sb->s_nr_dentry_unused +
-			sb->s_nr_inodes_unused + fs_objects + 1;
-
-	if (sc->nr_to_scan) {
-		int	dentries;
-		int	inodes;
-
-		/* proportion the scan between the caches */
-		dentries = (sc->nr_to_scan * sb->s_nr_dentry_unused) /
-							total_objects;
-		inodes = (sc->nr_to_scan * sb->s_nr_inodes_unused) /
-							total_objects;
-		if (fs_objects)
-			fs_objects = (sc->nr_to_scan * fs_objects) /
-							total_objects;
-		/*
-		 * prune the dcache first as the icache is pinned by it, then
-		 * prune the icache, followed by the filesystem specific caches
-		 */
-		prune_dcache_sb(sb, dentries);
-		prune_icache_sb(sb, inodes);
+		fs_objects = sb->s_op->nr_cached_objects(sb, sc);
 
-		if (fs_objects && sb->s_op->free_cached_objects) {
-			sb->s_op->free_cached_objects(sb, fs_objects);
-			fs_objects = sb->s_op->nr_cached_objects(sb);
-		}
-		total_objects = sb->s_nr_dentry_unused +
-				sb->s_nr_inodes_unused + fs_objects;
+	inodes = list_lru_shrink_count(&sb->s_inode_lru, sc);
+	dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects = dentries + inodes + fs_objects + 1;
+
+	/* proportion the scan between the caches */
+	dentries = mult_frac(sc->nr_to_scan, dentries, total_objects);
+	inodes = mult_frac(sc->nr_to_scan, inodes, total_objects);
+	fs_objects = mult_frac(sc->nr_to_scan, fs_objects, total_objects);
+
+	/*
+	 * prune the dcache first as the icache is pinned by it, then
+	 * prune the icache, followed by the filesystem specific caches
+	 */
+	sc->nr_to_scan = dentries;
+	freed = prune_dcache_sb(sb, sc);
+	sc->nr_to_scan = inodes;
+	freed += prune_icache_sb(sb, sc);
+
+	if (fs_objects) {
+		sc->nr_to_scan = fs_objects;
+		freed += sb->s_op->free_cached_objects(sb, sc);
 	}
 
-	total_objects = (total_objects / 100) * sysctl_vfs_cache_pressure;
+	drop_super(sb);
+	return freed;
+}
+
+static unsigned long super_cache_count(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct super_block *sb;
+	long	total_objects = 0;
+
+	if (!sc->for_drop_caches && dcache_is_low(sc->memcg))
+		return 0;
+
+	sb = container_of(shrink, struct super_block, s_shrink);
+
+	/*
+	 * Don't call grab_super_passive as it is a potential
+	 * scalability bottleneck. The counts could get updated
+	 * between super_cache_count and super_cache_scan anyway.
+	 * Call to super_cache_count with shrinker_rwsem held
+	 * ensures the safety of the call to list_lru_shrink_count() and
+	 * s_op->nr_cached_objects().
+	 */
+	if (sb->s_op && sb->s_op->nr_cached_objects)
+		total_objects = sb->s_op->nr_cached_objects(sb, sc);
+
+	total_objects += list_lru_shrink_count(&sb->s_dentry_lru, sc);
+	total_objects += list_lru_shrink_count(&sb->s_inode_lru, sc);
+
+	total_objects = vfs_pressure_ratio(total_objects);
 	return total_objects;
 }
 
@@ -115,6 +163,10 @@ static int prune_super(struct shrinker *shrink, struct shrink_control *sc)
 static void destroy_super(struct super_block *s)
 {
 	int i;
+
+	list_lru_destroy(&s->s_dentry_lru);
+	list_lru_destroy(&s->s_inode_lru);
+
 	for (i = 0; i < SB_FREEZE_LEVELS; i++)
 		percpu_counter_destroy(&s->s_writers.counter[i]);
 	security_sb_free(s);
@@ -141,6 +193,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	if (!s)
 		return NULL;
 
+	INIT_LIST_HEAD(&s->s_mounts);
+
 	if (security_sb_alloc(s))
 		goto fail;
 	for (i = 0; i < SB_FREEZE_LEVELS; i++) {
@@ -157,10 +211,12 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	INIT_HLIST_NODE(&s->s_instances);
 	INIT_HLIST_BL_HEAD(&s->s_anon);
 	INIT_LIST_HEAD(&s->s_inodes);
-	INIT_LIST_HEAD(&s->s_dentry_lru);
-	INIT_LIST_HEAD(&s->s_inode_lru);
-	spin_lock_init(&s->s_inode_lru_lock);
-	INIT_LIST_HEAD(&s->s_mounts);
+
+	if (list_lru_init_memcg(&s->s_dentry_lru))
+		goto fail;
+	if (list_lru_init_memcg(&s->s_inode_lru))
+		goto err_out_dentry_lru;
+
 	init_rwsem(&s->s_umount);
 	lockdep_set_class(&s->s_umount, &type->s_umount_key);
 	/*
@@ -189,12 +245,17 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags)
 	s->s_maxbytes = MAX_NON_LFS;
 	s->s_op = &default_op;
 	s->s_time_gran = 1000000000;
-	s->cleancache_poolid = -1;
+	s->cleancache_poolid = CLEANCACHE_NO_POOL;
 
 	s->s_shrink.seeks = DEFAULT_SEEKS;
-	s->s_shrink.shrink = prune_super;
+	s->s_shrink.scan_objects = super_cache_scan;
+	s->s_shrink.count_objects = super_cache_count;
 	s->s_shrink.batch = 1024;
+	s->s_shrink.flags = SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE;
 	return s;
+
+err_out_dentry_lru:
+	list_lru_destroy(&s->s_dentry_lru);
 fail:
 	destroy_super(s);
 	return NULL;
@@ -220,7 +281,7 @@ static void __put_super(struct super_block *sb)
  *	Drops a temporary reference, frees superblock if there's no
  *	references left.
  */
-static void put_super(struct super_block *sb)
+void put_super(struct super_block *sb)
 {
 	spin_lock(&sb_lock);
 	__put_super(sb);
@@ -247,6 +308,14 @@ void deactivate_locked_super(struct super_block *s)
 		unregister_shrinker(&s->s_shrink);
 		fs->kill_sb(s);
 
+		/*
+		 * Since list_lru_destroy() may sleep, we cannot call it from
+		 * put_super(), where we hold the sb_lock. Therefore we destroy
+		 * the lru lists right now.
+		 */
+		list_lru_destroy(&s->s_dentry_lru);
+		list_lru_destroy(&s->s_inode_lru);
+
 		put_filesystem(fs);
 		put_super(s);
 	} else {
@@ -907,7 +976,7 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 	if (!(flags & MS_RDONLY))
 		mode |= FMODE_WRITE;
 
-	bdev = blkdev_get_by_path(dev_name, mode, fs_type);
+	bdev = blkdev_get_by_path(dev_name, mode | FMODE_MOUNT, fs_type);
 	if (IS_ERR(bdev))
 		return ERR_CAST(bdev);
 
@@ -947,11 +1016,26 @@ struct dentry *mount_bdev(struct file_system_type *fs_type,
 		down_write(&s->s_umount);
 	} else {
 		char b[BDEVNAME_SIZE];
-
+#ifdef CONFIG_VE
+		void *data_orig = data;
+		struct ve_struct *ve = get_exec_env();
+
+		if (!ve_is_super(ve)) {
+			error = ve_devmnt_process(ve, bdev->bd_dev, &data, 0);
+			if (error) {
+				deactivate_locked_super(s);
+				goto error;
+			}
+		}
+#endif
 		s->s_mode = mode;
 		strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id));
 		sb_set_blocksize(s, block_size(bdev));
 		error = fill_super(s, data, flags & MS_SILENT ? 1 : 0);
+#ifdef CONFIG_VE
+		if (data_orig != data)
+			free_page((unsigned long)data);
+#endif
 		if (error) {
 			deactivate_locked_super(s);
 			goto error;
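
super_cache_scan() above splits sc->nr_to_scan between the dentry, inode and filesystem-private caches in proportion to their object counts; mult_frac() computes x * numer / denom without overflowing the intermediate product. The arithmetic in isolation, with the macro body mirroring the kernel's:

    #include <stdio.h>

    /* kernel's mult_frac(): avoids overflow in x * numer */
    #define mult_frac(x, numer, denom) ({                   \
        typeof(x) quot = (x) / (denom);                     \
        typeof(x) rem  = (x) % (denom);                     \
        (quot * (numer)) + ((rem * (numer)) / (denom));     \
    })

    int main(void)
    {
        long nr_to_scan = 128, dentries = 600, inodes = 300, fs_objects = 100;
        long total = dentries + inodes + fs_objects + 1;

        printf("dentries=%ld inodes=%ld fs=%ld\n",
               mult_frac(nr_to_scan, dentries, total),
               mult_frac(nr_to_scan, inodes, total),
               mult_frac(nr_to_scan, fs_objects, total));
        return 0;
    }
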
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -7,15 +7,21 @@
 #include <linux/fs.h>
 #include <linux/slab.h>
 #include <linux/export.h>
+#include <linux/mount.h>
 #include <linux/namei.h>
-#include <linux/sched.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
 #include <linux/linkage.h>
+#include <linux/pid_namespace.h>
 #include <linux/pagemap.h>
 #include <linux/quotaops.h>
 #include <linux/backing-dev.h>
+#include <linux/ve.h>
 #include "internal.h"
+#include "mount.h"
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
 
 #define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \
 			SYNC_FILE_RANGE_WAIT_AFTER)
@@ -27,12 +33,13 @@
  * wait == 1 case since in that case write_inode() functions do
  * sync_dirty_buffer() and thus effectively write one block at a time.
  */
-static int __sync_filesystem(struct super_block *sb, int wait)
+static int __sync_filesystem(struct super_block *sb,
+			     struct user_beancounter *ub, int wait)
 {
 	if (wait)
-		sync_inodes_sb(sb);
+		sync_inodes_sb_ub(sb, ub);
 	else
-		writeback_inodes_sb(sb, WB_REASON_SYNC);
+		writeback_inodes_sb_ub(sb, ub, WB_REASON_SYNC);
 
 	if (sb->s_op->sync_fs)
 		sb->s_op->sync_fs(sb, wait);
@@ -44,7 +51,7 @@ static int __sync_filesystem(struct super_block *sb, int wait)
  * superblock.  Filesystem data as well as the underlying block
  * device.  Takes the superblock lock.
  */
-int sync_filesystem(struct super_block *sb)
+static int sync_filesystem_ub(struct super_block *sb, struct user_beancounter *ub)
 {
 	int ret;
 
@@ -60,10 +67,15 @@ int sync_filesystem(struct super_block *sb)
 	if (sb->s_flags & MS_RDONLY)
 		return 0;
 
-	ret = __sync_filesystem(sb, 0);
+	ret = __sync_filesystem(sb, ub, 0);
 	if (ret < 0)
 		return ret;
-	return __sync_filesystem(sb, 1);
+	return __sync_filesystem(sb, ub, 1);
+}
+
+int sync_filesystem(struct super_block *sb)
+{
+	return sync_filesystem_ub(sb, NULL);
 }
 EXPORT_SYMBOL_GPL(sync_filesystem);
 
@@ -94,6 +106,111 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
 	filemap_fdatawait_keep_errors(bdev->bd_inode->i_mapping);
 }
 
+struct sync_sb {
+	struct list_head list;
+	struct super_block *sb;
+};
+
+static void sync_release_filesystems(struct list_head *sync_list)
+{
+	struct sync_sb *ss, *tmp;
+
+	list_for_each_entry_safe(ss, tmp, sync_list, list) {
+		list_del(&ss->list);
+		put_super(ss->sb);
+		kfree(ss);
+	}
+}
+
+static int sync_filesystem_collected(struct list_head *sync_list, struct super_block *sb)
+{
+	struct sync_sb *ss;
+
+	list_for_each_entry(ss, sync_list, list)
+		if (ss->sb == sb)
+			return 1;
+	return 0;
+}
+
+static int sync_collect_filesystems(struct ve_struct *ve, struct list_head *sync_list)
+{
+	struct mount *mnt;
+	struct mnt_namespace *mnt_ns = ve->ve_ns->mnt_ns;
+	struct sync_sb *ss;
+	int ret = 0;
+
+	BUG_ON(!list_empty(sync_list));
+
+	down_read(&namespace_sem);
+	list_for_each_entry(mnt, &mnt_ns->list, mnt_list) {
+		if (sync_filesystem_collected(sync_list, mnt->mnt.mnt_sb))
+			continue;
+
+		ss = kmalloc(sizeof(*ss), GFP_KERNEL);
+		if (ss == NULL) {
+			ret = -ENOMEM;
+			break;
+		}
+		ss->sb = mnt->mnt.mnt_sb;
+		/*
+		 * We hold the mount point and thus can be sure that the
+		 * superblock is alive, which means we can safely increase
+		 * its usage counter.
+		 */
+		spin_lock(&sb_lock);
+		ss->sb->s_count++;
+		spin_unlock(&sb_lock);
+		list_add_tail(&ss->list, sync_list);
+	}
+	up_read(&namespace_sem);
+	return ret;
+}
+
+static void sync_filesystems_ve(struct ve_struct *ve, struct user_beancounter *ub, int wait)
+{
+	struct super_block *sb;
+	LIST_HEAD(sync_list);
+	struct sync_sb *ss;
+
+	/*
+	 * We don't need to care about allocating failure here. At least we
+	 * don't need to skip sync on such error.
+	 * Let's sync what we collected already instead.
+	 */
+	sync_collect_filesystems(ve, &sync_list);
+
+	list_for_each_entry(ss, &sync_list, list) {
+		sb = ss->sb;
+		down_read(&sb->s_umount);
+		if (!(sb->s_flags & MS_RDONLY) && sb->s_root && sb->s_bdi)
+			__sync_filesystem(sb, ub, wait);
+		up_read(&sb->s_umount);
+	}
+
+	sync_release_filesystems(&sync_list);
+}
+
+static int __ve_fsync_behavior(struct ve_struct *ve)
+{
+	if (ve->fsync_enable == 2)
+		return get_ve0()->fsync_enable;
+	else if (ve->fsync_enable)
+		return FSYNC_FILTERED; /* sync forced by ve is always filtered */
+	else
+		return 0;
+}
+
+int ve_fsync_behavior(void)
+{
+	struct ve_struct *ve;
+
+	ve = get_exec_env();
+	if (ve_is_super(ve))
+		return FSYNC_ALWAYS;
+	else
+		return __ve_fsync_behavior(ve);
+}
+
 /*
  * Sync everything. We start by waking flusher threads so that most of
  * writeback runs on all devices in parallel. Then we sync all inodes reliably
@@ -106,8 +223,40 @@ static void fdatawait_one_bdev(struct block_device *bdev, void *arg)
  */
 SYSCALL_DEFINE0(sync)
 {
+	struct ve_struct *ve = get_exec_env();
+	struct user_beancounter *ub, *sync_ub = NULL;
 	int nowait = 0, wait = 1;
 
+	ub = get_exec_ub();
+	ub_percpu_inc(ub, sync);
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto skip;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto skip;
+
+		if (fsb == FSYNC_FILTERED)
+			sync_ub = get_io_ub();
+
+		if (sync_ub && (sync_ub != get_ub0())) {
+			wakeup_flusher_threads_ub(0, sync_ub, WB_REASON_SYNC);
+			sync_filesystems_ve(get_exec_env(), sync_ub, nowait);
+			sync_filesystems_ve(get_exec_env(), sync_ub, wait);
+			goto skip;
+		}
+	}
+
 	wakeup_flusher_threads(0, WB_REASON_SYNC);
 	iterate_supers(sync_inodes_one_sb, NULL);
 	iterate_supers(sync_fs_one_sb, &nowait);
@@ -116,6 +265,8 @@ SYSCALL_DEFINE0(sync)
 	iterate_bdevs(fdatawait_one_bdev, NULL);
 	if (unlikely(laptop_mode))
 		laptop_sync_completion();
+skip:
+	ub_percpu_inc(ub, sync_done);
 	return 0;
 }
 
@@ -155,17 +306,49 @@ SYSCALL_DEFINE1(syncfs, int, fd)
 {
 	struct fd f = fdget(fd);
 	struct super_block *sb;
-	int ret;
+	int ret = 0;
+	struct user_beancounter *ub, *sync_ub = NULL;
+	struct ve_struct *ve;
+
+	ub = get_exec_ub();
+	ve = get_exec_env();
+	ub_percpu_inc(ub, sync);
+
+	if (!f.file) {
+		ret = -EBADF;
+		goto skip;
+	}
+
+	if (!ve_is_super(ve)) {
+		int fsb;
+		/*
+		 * init can't sync during VE stop. Rationale:
+		 *  - NFS with -o hard will block forever as network is down
+		 *  - no useful job is performed as VE0 will call umount/sync
+		 *    on its own later
+		 *  Den
+		 */
+		if (is_child_reaper(task_pid(current)))
+			goto fdput;
+
+		fsb = __ve_fsync_behavior(ve);
+		if (fsb == FSYNC_NEVER)
+			goto fdput;
+
+		if (fsb == FSYNC_FILTERED)
+			sync_ub = get_io_ub();
+	}
 
-	if (!f.file)
-		return -EBADF;
 	sb = f.file->f_dentry->d_sb;
 
 	down_read(&sb->s_umount);
-	ret = sync_filesystem(sb);
+	if (sb->s_root)
+		ret = sync_filesystem_ub(sb, sync_ub);
 	up_read(&sb->s_umount);
-
+fdput:
 	fdput(f);
+skip:
+	ub_percpu_inc(ub, sync_done);
 	return ret;
 }
 
@@ -182,9 +365,34 @@ SYSCALL_DEFINE1(syncfs, int, fd)
  */
 int vfs_fsync_range(struct file *file, loff_t start, loff_t end, int datasync)
 {
+	struct user_beancounter *ub;
+	int ret;
+	struct inode *inode = file->f_mapping->host;
+
 	if (!file->f_op || !file->f_op->fsync)
 		return -EINVAL;
-	return file->f_op->fsync(file, start, end, datasync);
+
+	if (!datasync && (inode->i_state & I_DIRTY_TIME)) {
+		spin_lock(&inode->i_lock);
+		inode->i_state &= ~I_DIRTY_TIME;
+		spin_unlock(&inode->i_lock);
+		mark_inode_dirty_sync(inode);
+	}
+
+	ub = get_exec_ub();
+	if (datasync)
+		ub_percpu_inc(ub, fdsync);
+	else
+		ub_percpu_inc(ub, fsync);
+
+	ret = file->f_op->fsync(file, start, end, datasync);
+
+	if (datasync)
+		ub_percpu_inc(ub, fdsync_done);
+	else
+		ub_percpu_inc(ub, fsync_done);
+
+	return ret;
 }
 EXPORT_SYMBOL(vfs_fsync_range);
 
@@ -204,9 +412,13 @@ EXPORT_SYMBOL(vfs_fsync);
 
 static int do_fsync(unsigned int fd, int datasync)
 {
-	struct fd f = fdget(fd);
+	struct fd f;
 	int ret = -EBADF;
 
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		return 0;
+
+	f = fdget(fd);
 	if (f.file) {
 		ret = vfs_fsync(f.file, datasync);
 		fdput(f);
@@ -291,6 +503,7 @@ EXPORT_SYMBOL(generic_write_sync);
 SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 				unsigned int, flags)
 {
+	struct user_beancounter *ub;
 	int ret;
 	struct fd f;
 	struct address_space *mapping;
@@ -349,22 +562,27 @@ SYSCALL_DEFINE4(sync_file_range, int, fd, loff_t, offset, loff_t, nbytes,
 		goto out_put;
 	}
 
+	ub = get_exec_ub();
+	ub_percpu_inc(ub, frsync);
+
 	ret = 0;
 	if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) {
 		ret = filemap_fdatawait_range(mapping, offset, endbyte);
 		if (ret < 0)
-			goto out_put;
+			goto out_acct;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WRITE) {
 		ret = filemap_fdatawrite_range(mapping, offset, endbyte);
 		if (ret < 0)
-			goto out_put;
+			goto out_acct;
 	}
 
 	if (flags & SYNC_FILE_RANGE_WAIT_AFTER)
 		ret = filemap_fdatawait_range(mapping, offset, endbyte);
 
+out_acct:
+	ub_percpu_inc(ub, frsync_done);
 out_put:
 	fdput(f);
 out:
--- a/fs/sysfs/Makefile
+++ b/fs/sysfs/Makefile
@@ -4,3 +4,4 @@
 
 obj-y		:= inode.o file.o dir.o symlink.o mount.o bin.o \
 		   group.o
+obj-$(CONFIG_VE) += ve.o
--- a/fs/sysfs/dir.c
+++ b/fs/sysfs/dir.c
@@ -23,12 +23,13 @@
 #include <linux/slab.h>
 #include <linux/security.h>
 #include <linux/hash.h>
+#include <linux/ve.h>
 #include "sysfs.h"
 
 DEFINE_MUTEX(sysfs_mutex);
 DEFINE_SPINLOCK(sysfs_assoc_lock);
 
-#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb);
+#define to_sysfs_dirent(X) rb_entry((X), struct sysfs_dirent, s_rb)
 
 static DEFINE_SPINLOCK(sysfs_ino_lock);
 static DEFINE_IDA(sysfs_ino_ida);
@@ -73,6 +74,32 @@ static int sysfs_sd_compare(const struct sysfs_dirent *left,
 				  right);
 }
 
+static bool sysfs_sd_visible(struct sysfs_dirent *sd, struct super_block *sb)
+{
+	struct ve_struct *ve = sysfs_info(sb)->ve;
+	struct sysfs_dirent *tmp_sd = sd;
+
+	/* The host sees everything */
+	if (ve_is_super(ve))
+		return true;
+
+	/* Entries with a namespace tag and their sub-entries are always visible */
+	while (tmp_sd) {
+		if (tmp_sd->s_ns)
+			return true;
+		tmp_sd = tmp_sd->s_parent;
+	}
+
+	/* Symlinks are visible if target sd is visible */
+	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
+		sd = sd->s_symlink.target_sd;
+
+	if (kmapset_get_value(sd->s_ve_perms, &ve->ve_sysfs_perms))
+		return true;
+
+	return false;
+}
+
 /**
 *	sysfs_link_sibling - link sysfs_dirent into sibling rbtree
  *	@sd: sysfs_dirent of interest
@@ -279,6 +306,8 @@ void release_sysfs_dirent(struct sysfs_dirent * sd)
 	if (sd->s_iattr && sd->s_iattr->ia_secdata)
 		security_release_secctx(sd->s_iattr->ia_secdata,
 					sd->s_iattr->ia_secdata_len);
+	if (sd->s_ve_perms)
+		kmapset_put(sd->s_ve_perms);
 	kfree(sd->s_iattr);
 	sysfs_free_ino(sd->s_ino);
 	kmem_cache_free(sysfs_dir_cachep, sd);
@@ -326,6 +355,9 @@ static int sysfs_dentry_revalidate(struct dentry *dentry, unsigned int flags)
 			goto out_bad;
 	}
 
+	if (!sysfs_sd_visible(sd, dentry->d_sb))
+		goto out_bad;
+
 	mutex_unlock(&sysfs_mutex);
 out_valid:
 	return 1;
@@ -444,17 +476,19 @@ int __sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd)
 {
 	struct sysfs_inode_attrs *ps_iattr;
 	int ret;
-
+#ifndef CONFIG_VE
 	if (!!sysfs_ns_type(acxt->parent_sd) != !!sd->s_ns) {
 		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
 			sysfs_ns_type(acxt->parent_sd)? "required": "invalid",
 			acxt->parent_sd->s_name, sd->s_name);
 		return -EINVAL;
 	}
-
+#endif
 	sd->s_hash = sysfs_name_hash(sd->s_ns, sd->s_name);
 	sd->s_parent = sysfs_get(acxt->parent_sd);
 
+	sd->s_ve_perms = kmapset_commit(kmapset_new(&ve_sysfs_perms));
+
 	ret = sysfs_link_sibling(sd);
 	if (ret)
 		return ret;
@@ -612,14 +646,14 @@ struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 {
 	struct rb_node *node = parent_sd->s_dir.children.rb_node;
 	unsigned int hash;
-
+#ifndef CONFIG_VE
 	if (!!sysfs_ns_type(parent_sd) != !!ns) {
 		WARN(1, KERN_WARNING "sysfs: ns %s in '%s' for '%s'\n",
 			sysfs_ns_type(parent_sd)? "required": "invalid",
 			parent_sd->s_name, name);
 		return NULL;
 	}
-
+#endif
 	hash = sysfs_name_hash(ns, name);
 	while (node) {
 		struct sysfs_dirent *sd;
@@ -779,7 +813,7 @@ static struct dentry * sysfs_lookup(struct inode *dir, struct dentry *dentry,
 	sd = sysfs_find_dirent(parent_sd, ns, dentry->d_name.name);
 
 	/* no such entry */
-	if (!sd) {
+	if (!sd || !sysfs_sd_visible(sd, dentry->d_sb)) {
 		ret = ERR_PTR(-ENOENT);
 		goto out_unlock;
 	}
@@ -945,8 +979,8 @@ static int sysfs_dir_release(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	loff_t hash, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_dir_pos(struct sysfs_dirent *parent_sd,
+					  loff_t hash, struct sysfs_dirent *pos)
 {
 	if (pos) {
 		int valid = !(pos->s_flags & SYSFS_FLAG_REMOVED) &&
@@ -969,29 +1003,32 @@ static struct sysfs_dirent *sysfs_dir_pos(const void *ns,
 				break;
 		}
 	}
-	/* Skip over entries in the wrong namespace */
-	while (pos && pos->s_ns != ns) {
-		struct rb_node *node = rb_next(&pos->s_rb);
-		if (!node)
-			pos = NULL;
-		else
-			pos = to_sysfs_dirent(node);
-	}
 	return pos;
 }
 
-static struct sysfs_dirent *sysfs_dir_next_pos(const void *ns,
-	struct sysfs_dirent *parent_sd,	ino_t ino, struct sysfs_dirent *pos)
+static struct sysfs_dirent *sysfs_next_entry(struct sysfs_dirent *cur)
 {
-	pos = sysfs_dir_pos(ns, parent_sd, ino, pos);
-	if (pos) do {
-		struct rb_node *node = rb_next(&pos->s_rb);
-		if (!node)
-			pos = NULL;
-		else
-			pos = to_sysfs_dirent(node);
-	} while (pos && pos->s_ns != ns);
-	return pos;
+	struct rb_node *node = rb_next(&cur->s_rb);
+
+	return node ? to_sysfs_dirent(node) : NULL;
+}
+
+struct sysfs_dirent *sysfs_next_recursive(struct sysfs_dirent *sd)
+{
+	struct rb_node *node;
+
+	if (sysfs_type(sd) == SYSFS_DIR &&
+	    !RB_EMPTY_ROOT(&sd->s_dir.children))
+		return to_sysfs_dirent(rb_first(&sd->s_dir.children));
+
+	do {
+		node = rb_next(&sd->s_rb);
+		if (node)
+			return to_sysfs_dirent(node);
+		sd = sd->s_parent;
+	} while (sd);
+
+	return NULL;
 }
 
 static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
@@ -1026,13 +1063,16 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 	}
 	mutex_lock(&sysfs_mutex);
 	off = filp->f_pos;
-	for (pos = sysfs_dir_pos(ns, parent_sd, filp->f_pos, pos);
-	     pos;
-	     pos = sysfs_dir_next_pos(ns, parent_sd, filp->f_pos, pos)) {
+	pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
+	for (; pos; pos = sysfs_next_entry(pos)) {
 		const char * name;
 		unsigned int type;
 		int len, ret;
 
+		/* Skip invisible entries and entries from the wrong namespace */
+		if (pos->s_ns != ns || !sysfs_sd_visible(pos, dentry->d_sb))
+			continue;
+
 		name = pos->s_name;
 		len = strlen(name);
 		ino = pos->s_ino;
@@ -1045,6 +1085,9 @@ static int sysfs_readdir(struct file * filp, void * dirent, filldir_t filldir)
 		mutex_lock(&sysfs_mutex);
 		if (ret < 0)
 			break;
+
+		/* Revalidate position pointer after reacquiring sysfs_mutex */
+		pos = sysfs_dir_pos(parent_sd, filp->f_pos, pos);
 	}
 	mutex_unlock(&sysfs_mutex);
 
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -29,7 +29,8 @@ static void remove_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 			sysfs_hash_and_remove(dir_sd, NULL, (*attr)->name);
 	if (grp->bin_attrs)
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++)
-			sysfs_remove_bin_file(kobj, *bin_attr);
+			sysfs_hash_and_remove(dir_sd, NULL,
+					      (*bin_attr)->attr.name);
 }
 
 static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
@@ -71,8 +72,10 @@ static int create_files(struct sysfs_dirent *dir_sd, struct kobject *kobj,
 	if (grp->bin_attrs) {
 		for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
 			if (update)
-				sysfs_remove_bin_file(kobj, *bin_attr);
-			error = sysfs_create_bin_file(kobj, *bin_attr);
+				sysfs_hash_and_remove(dir_sd, NULL,
+						      (*bin_attr)->attr.name);
+			error = sysfs_add_file(dir_sd, &(*bin_attr)->attr,
+					       SYSFS_KOBJ_BIN_ATTR);
 			if (error)
 				break;
 		}
--- a/fs/sysfs/inode.c
+++ b/fs/sysfs/inode.c
@@ -17,11 +17,11 @@
 #include <linux/backing-dev.h>
 #include <linux/capability.h>
 #include <linux/errno.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/sysfs.h>
 #include <linux/xattr.h>
 #include <linux/security.h>
+#include <linux/ve.h>
 #include "sysfs.h"
 
 extern struct super_block * sysfs_sb;
@@ -113,6 +113,9 @@ int sysfs_setattr(struct dentry *dentry, struct iattr *iattr)
 	if (!sd)
 		return -EINVAL;
 
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	mutex_lock(&sysfs_mutex);
 	error = inode_change_ok(inode, iattr);
 	if (error)
@@ -339,9 +342,35 @@ int sysfs_hash_and_remove(struct sysfs_dirent *dir_sd, const void *ns, const cha
 		return -ENOENT;
 }
 
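+/*
+ * Per-container access control: a sysfs entry is accessible from inside
+ * a VE only if some component of its path carries a namespace tag, or if
+ * the entry was granted a matching mask in the VE's kmapset (for
+ * symlinks, the mask of the link target applies).  The host (ve0) is
+ * never restricted.
+ */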
+static int sysfs_sd_permission(struct sysfs_dirent *sd, int mask)
+{
+	struct ve_struct *ve = get_exec_env();
+	struct sysfs_dirent *tmp_sd = sd;
+	int perm;
+
+	if (ve_is_super(ve))
+		return 0;
+
+	while (tmp_sd) {
+		if (tmp_sd->s_ns)
+			return 0;
+		tmp_sd = tmp_sd->s_parent;
+	}
+
+	if (sysfs_type(sd) == SYSFS_KOBJ_LINK)
+		sd = sd->s_symlink.target_sd;
+
+	perm = kmapset_get_value(sd->s_ve_perms, &ve->ve_sysfs_perms);
+	if ((mask & ~perm & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0)
+		return 0;
+
+	return -EACCES;
+}
+
 int sysfs_permission(struct inode *inode, int mask)
 {
 	struct sysfs_dirent *sd;
+	int ret;
 
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
@@ -349,8 +378,12 @@ int sysfs_permission(struct inode *inode, int mask)
 	sd = inode->i_private;
 
 	mutex_lock(&sysfs_mutex);
+	ret = sysfs_sd_permission(sd, mask);
 	sysfs_refresh_inode(sd, inode);
 	mutex_unlock(&sysfs_mutex);
 
+	if (ret)
+		return ret;
+
 	return generic_permission(inode, mask);
 }
--- a/fs/sysfs/mount.c
+++ b/fs/sysfs/mount.c
@@ -20,6 +20,9 @@
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/user_namespace.h>
+#include <linux/seq_file.h>
+#include <linux/xattr.h>
+#include <linux/ve.h>
 
 #include "sysfs.h"
 
@@ -27,10 +30,21 @@
 static struct vfsmount *sysfs_mnt;
 struct kmem_cache *sysfs_dir_cachep;
 
+static int sysfs_show_options(struct seq_file *m, struct dentry *root)
+{
+	struct ve_struct *ve = sysfs_info(root->d_sb)->ve;
+
+	if (!ve_is_super(ve))
+		seq_printf(m, ",ve=%s", ve_name(ve));
+
+	return 0;
+}
+
 static const struct super_operations sysfs_ops = {
 	.statfs		= simple_statfs,
 	.drop_inode	= generic_delete_inode,
 	.evict_inode	= sysfs_evict_inode,
+	.show_options	= sysfs_show_options,
 };
 
 struct sysfs_dirent sysfs_root = {
@@ -84,6 +98,8 @@ static int sysfs_test_super(struct super_block *sb, void *data)
 		if (sb_info->ns[type] != info->ns[type])
 			found = 0;
 	}
+	if (sb_info->ve != info->ve)
+		found = 0;
 	return found;
 }
 
@@ -101,6 +117,7 @@ static void free_sysfs_super_info(struct sysfs_super_info *info)
 	int type;
 	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
 		kobj_ns_drop(type, info->ns[type]);
+	put_ve(info->ve);
 	kfree(info);
 }
 
@@ -121,6 +138,7 @@ static struct dentry *sysfs_mount(struct file_system_type *fs_type,
 
 	for (type = KOBJ_NS_TYPE_NONE; type < KOBJ_NS_TYPES; type++)
 		info->ns[type] = kobj_ns_grab_current(type);
+	info->ve = get_ve(get_exec_env());
 
 	sb = sget(fs_type, sysfs_test_super, sysfs_set_super, flags, info);
 	if (IS_ERR(sb) || sb->s_fs_info != info)
@@ -153,13 +171,21 @@ static struct file_system_type sysfs_fs_type = {
 	.name		= "sysfs",
 	.mount		= sysfs_mount,
 	.kill_sb	= sysfs_kill_sb,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 int __init sysfs_init(void)
 {
 	int err = -ENOMEM;
 
+	kmapset_init_set(&ve_sysfs_perms);
+
+	sysfs_root.s_ve_perms = kmapset_new(&ve_sysfs_perms);
+	if (!sysfs_root.s_ve_perms)
+		goto out;
+
+	kmapset_commit(sysfs_root.s_ve_perms);
+
 	sysfs_dir_cachep = kmem_cache_create("sysfs_dir_cache",
 					      sizeof(struct sysfs_dirent),
 					      0, 0, NULL);
--- a/fs/sysfs/symlink.c
+++ b/fs/sysfs/symlink.c
@@ -59,6 +59,10 @@ static int sysfs_do_create_link_sd(struct sysfs_dirent *parent_sd,
 	sysfs_addrm_start(&acxt, parent_sd);
 	/* Symlinks must be between directories with the same ns_type */
 	if (!ns_type ||
+#ifdef CONFIG_VE
+	    /* or if target doesn't have ns_type */
+	    !sysfs_ns_type(sd->s_symlink.target_sd->s_parent) ||
+#endif
 	    (ns_type == sysfs_ns_type(sd->s_symlink.target_sd->s_parent))) {
 		if (warn)
 			error = sysfs_add_one(&acxt, sd);
--- a/fs/sysfs/sysfs.h
+++ b/fs/sysfs/sysfs.h
@@ -81,6 +81,8 @@ struct sysfs_dirent {
 	umode_t 		s_mode;
 	unsigned int		s_ino;
 	struct sysfs_inode_attrs *s_iattr;
+
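+	/* per-VE access masks, maintained by fs/sysfs/ve.c */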
+	struct kmapset_map	*s_ve_perms;
 };
 
 #define SD_DEACTIVATED_BIAS		INT_MIN
@@ -147,6 +149,7 @@ struct sysfs_addrm_cxt {
  */
 struct sysfs_super_info {
 	void *ns[KOBJ_NS_TYPES];
+	struct ve_struct *ve;
 };
 #define sysfs_info(SB) ((struct sysfs_super_info *)(SB->s_fs_info))
 extern struct sysfs_dirent sysfs_root;
@@ -172,6 +175,7 @@ int sysfs_add_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_remove_one(struct sysfs_addrm_cxt *acxt, struct sysfs_dirent *sd);
 void sysfs_addrm_finish(struct sysfs_addrm_cxt *acxt);
 
+struct sysfs_dirent *sysfs_next_recursive(struct sysfs_dirent *sd);
 struct sysfs_dirent *sysfs_find_dirent(struct sysfs_dirent *parent_sd,
 				       const void *ns,
 				       const unsigned char *name);
--- /dev/null
+++ b/fs/sysfs/ve.c
@@ -0,0 +1,313 @@
+/*
+ *  fs/sysfs/ve.c - sysfs permissions for containers
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/seq_file.h>
+#include <linux/kmapset.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+#include <linux/ve.h>
+#include <net/sock.h>
+#include "sysfs.h"
+
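+/*
+ * KOBJ_NS_TYPE_VE: tags sysfs entries with the owning container (VE),
+ * by analogy with the network-namespace tagging of KOBJ_NS_TYPE_NET.
+ */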
+static void *ve_grab_current_ns(void)
+{
+	return get_ve(get_exec_env());
+}
+
+static const void *ve_initial_ns(void)
+{
+	return get_ve0();
+}
+
+static void ve_drop_ns(void *p)
+{
+	put_ve(p);
+}
+
+const void *ve_netlink_ns(struct sock *sk)
+{
+	return sock_net(sk)->owner_ve;
+}
+
+struct kobj_ns_type_operations ve_ns_type_operations = {
+	.type = KOBJ_NS_TYPE_VE,
+	.grab_current_ns = ve_grab_current_ns,
+	.netlink_ns = ve_netlink_ns,
+	.initial_ns = ve_initial_ns,
+	.drop_ns = ve_drop_ns,
+};
+
+static bool sysfs_perms_shown(struct ve_struct *ve, struct sysfs_dirent *sd)
+{
+	if (!ve) /* default_sysfs_permissions */
+		return sd->s_ve_perms->default_value != 0;
+	return kmapset_lookup(sd->s_ve_perms, &ve->ve_sysfs_perms) != NULL;
+}
+
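+/*
+ * seq_file iterator for the (default_)sysfs_permissions cgroup files:
+ * one line is emitted per dirent that has an explicit mask in this VE's
+ * kmapset (or a non-zero default for the host file), walking the whole
+ * tree under sysfs_mutex via sysfs_next_recursive().
+ */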
+static void *sysfs_perms_start(struct seq_file *m, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct sysfs_dirent *sd;
+	loff_t pos = *ppos;
+
+	mutex_lock(&sysfs_mutex);
+	for (sd = &sysfs_root; sd; sd = sysfs_next_recursive(sd)) {
+		if (sysfs_perms_shown(ve, sd) && !pos--)
+			break;
+	}
+	return sd;
+}
+
+static void *sysfs_perms_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct sysfs_dirent *sd = v;
+
+	(*ppos)++;
+	while ((sd = sysfs_next_recursive(sd))) {
+		if (sysfs_perms_shown(ve, sd))
+			break;
+	}
+	return sd;
+}
+
+static void sysfs_perms_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&sysfs_mutex);
+}
+
+static int sysfs_perms_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = m->private;
+	struct sysfs_dirent *sd = v;
+	char *buf;
+	size_t size, len, off;
+	int mask;
+
+	if (!ve)
+		mask = sd->s_ve_perms->default_value;
+	else
+		mask = kmapset_get_value(sd->s_ve_perms, &ve->ve_sysfs_perms);
+
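+	/*
+	 * Build the path right to left into the tail of the seq buffer and
+	 * then slide it to the front; if the buffer is too small, signal
+	 * overflow via seq_commit(m, -1) so seq_read retries with a bigger
+	 * buffer.
+	 */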
+	size = seq_get_buf(m, &buf);
+	if (size) {
+		off = size;
+		do {
+			len = strlen(sd->s_name);
+			if (len >= off) {
+				seq_commit(m, -1);
+				return 0;
+			}
+			if (sysfs_type(sd) == SYSFS_DIR)
+				buf[--off] = '/';
+			off -= len;
+			memcpy(buf + off, sd->s_name, len);
+			sd = sd->s_parent;
+		} while (sd && sd != &sysfs_root);
+		memmove(buf, buf + off, size - off);
+		seq_commit(m, size - off);
+	}
+
+	seq_putc(m, ' ');
+
+	if (!mask)
+		seq_putc(m, '-');
+	if (mask & MAY_READ)
+		seq_putc(m, 'r');
+	if (mask & MAY_WRITE)
+		seq_putc(m, 'w');
+	if (mask & MAY_EXEC)
+		seq_putc(m, 'x');
+
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+static int sysfs_perms_set(char *path, struct ve_struct *ve, int mask)
+{
+	struct sysfs_dirent *sd = &sysfs_root;
+	struct kmapset_map *map = NULL;
+	char *name = path, *sep;
+	int ret;
+
+	mutex_lock(&sysfs_mutex);
+	do {
+		sep = strchr(name, '/');
+		if (sep)
+			*sep = 0;
+		if (*name)
+			sd = sysfs_find_dirent(sd, NULL, name);
+		if (sep)
+			*sep = '/';
+		name = sep + 1;
+	} while (sd && sep);
+
+	ret = -ENOENT;
+	if (!sd)
+		goto out;
+
+	ret = -ENOMEM;
+	map = kmapset_dup(sd->s_ve_perms);
+	if (!map)
+		goto out;
+
+	ret = 0;
+	if (!ve) {
+		kmapset_set_default(map, mask > 0 ? mask : 0);
+	} else if (mask < 0) {
+		kmapset_del_value(map, &ve->ve_sysfs_perms);
+	} else {
+		ret = kmapset_set_value(map, &ve->ve_sysfs_perms, mask);
+	}
+
+	if (!ret) {
+		map = kmapset_commit(map);
+		swap(map, sd->s_ve_perms);
+	}
+out:
+	mutex_unlock(&sysfs_mutex);
+	kmapset_put(map);
+	return ret;
+}
+
+static int sysfs_perms_line(struct ve_struct *ve, char *line)
+{
+	int mask = 0;
+	char *p;
+
+	p = strpbrk(line, " \t");
+	if (!p)
+		return -EINVAL;
+	*p++ = 0;
+	p = skip_spaces(p);
+	while (1) {
+		switch (*p++) {
+		case 'r':
+			mask |= MAY_READ;
+			break;
+		case 'w':
+			mask |= MAY_WRITE;
+			break;
+		case 'x':
+			mask |= MAY_EXEC;
+			break;
+		case '-':
+			mask = -1;
+			break;
+		case 0:
+			return sysfs_perms_set(line, ve, mask);
+		default:
+			return -EINVAL;
+		}
+	}
+}
+
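+/*
+ * Writes are parsed line by line as "<path> <mask>", for example
+ * (hypothetical paths):
+ *
+ *	devices/system/cpu rx
+ *	class/net -
+ *
+ * where the mask is any combination of 'r', 'w' and 'x', and '-' deletes
+ * the entry from the VE's map (or clears the default for the host-level
+ * default_sysfs_permissions file).  Lines starting with '#' are ignored.
+ */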
+static ssize_t sysfs_perms_write(struct cgroup *cgrp,
+		struct cftype *cftype, struct file * file,
+		const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct ve_struct *ve = cgroup_ve(file->f_dentry->d_parent->d_fsdata);
+	char *line, *next, *page;
+	int ret, len;
+
+	ve = ve_is_super(ve) ? NULL : ve;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	len = min(count, PAGE_SIZE - 1);
+	ret = copy_from_user(page, buf, len);
+	if (ret)
+		goto err;
+
+	page[len] = '\0';
+
+	next = page;
+	while (1) {
+		line = skip_spaces(next);
+		next = strchr(line, '\n');
+		if (next) {
+			*next++ = '\0';
+		} else if (len < count) {
+			ret = line != page ? line - page : -EINVAL;
+			break;
+		}
+		if (*line && *line != '#') {
+			ret = sysfs_perms_line(ve, line);
+			if (ret)
+				break;
+		}
+		if (!next) {
+			ret = len;
+			break;
+		}
+	}
+err:
+	free_page((unsigned long)page);
+	return ret;
+}
+
+static const struct seq_operations sysfs_perms_sops = {
+	.start = sysfs_perms_start,
+	.stop = sysfs_perms_stop,
+	.next = sysfs_perms_next,
+	.show = sysfs_perms_show,
+};
+
+static int sysfs_perms_open(struct inode *inode, struct file *file)
+{
+	struct ve_struct *ve = cgroup_ve(file->f_dentry->d_parent->d_fsdata);
+	struct seq_file *m;
+	int ret;
+
+	ret = seq_open(file, &sysfs_perms_sops);
+	if (!ret) {
+		m = file->private_data;
+		m->private = ve_is_super(ve) ? NULL : ve;
+	}
+	return ret;
+}
+
+static ssize_t sysfs_perms_read(struct cgroup *cgrp, struct cftype *cft,
+	struct file *file, char __user *buf, size_t nbytes, loff_t *ppos)
+{
+	return seq_read(file, buf, nbytes, ppos);
+}
+
+static int sysfs_perms_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+
+static struct cftype sysfs_ve_cftypes[] = {
+	{
+		.name = "default_sysfs_permissions",
+		.flags = CFTYPE_ONLY_ON_ROOT,
+		.open = sysfs_perms_open,
+		.read = sysfs_perms_read,
+		.write = sysfs_perms_write,
+		.release = sysfs_perms_release,
+		.mode = S_IRUGO | S_IWUSR,
+	},
+	{
+		.name = "sysfs_permissions",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.open = sysfs_perms_open,
+		.read = sysfs_perms_read,
+		.write = sysfs_perms_write,
+		.release = sysfs_perms_release,
+		.mode = S_IRUGO | S_IWUSR,
+	},
+	{ },
+};
+
+static int init_sysfs_ve_perms(void)
+{
+	return cgroup_add_cftypes(&ve_subsys, sysfs_ve_cftypes);
+}
+module_init(init_sysfs_ve_perms);
--- a/fs/sysv/inode.c
+++ b/fs/sysv/inode.c
@@ -351,7 +351,7 @@ int __init sysv_init_icache(void)
 {
 	sysv_inode_cachep = kmem_cache_create("sysv_inode_cache",
 			sizeof(struct sysv_inode_info), 0,
-			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD,
+			SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD|SLAB_ACCOUNT,
 			init_once);
 	if (!sysv_inode_cachep)
 		return -ENOMEM;
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -24,6 +24,7 @@
 #include <linux/syscalls.h>
 #include <linux/compat.h>
 #include <linux/rcupdate.h>
+#include <linux/ve.h>
 
 struct timerfd_ctx {
 	struct hrtimer tmr;
@@ -349,7 +350,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
 }
 
 static int do_timerfd_settime(int ufd, int flags, 
-		const struct itimerspec *new,
+		struct itimerspec *new,
 		struct itimerspec *old)
 {
 	struct fd f;
@@ -395,6 +396,9 @@ static int do_timerfd_settime(int ufd, int flags,
 	/*
 	 * Re-program the timer to the new value ...
 	 */
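+	/*
+	 * Absolute CLOCK_MONOTONIC timeouts are given against the
+	 * container's (VE) uptime; translate them to the host clock
+	 * before arming the timer.
+	 */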
+	if ((flags & TFD_TIMER_ABSTIME) &&
+	    (new->it_value.tv_sec || new->it_value.tv_nsec))
+		monotonic_ve_to_abs(ctx->clockid, &new->it_value);
 	ret = timerfd_setup(ctx, flags, new);
 
 	spin_unlock_irq(&ctx->wqh.lock);
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1538,7 +1538,6 @@ out_unlock:
 static const struct vm_operations_struct ubifs_file_vm_ops = {
 	.fault        = filemap_fault,
 	.page_mkwrite = ubifs_vm_page_mkwrite,
-	.remap_pages = generic_file_remap_pages,
 };
 
 static int ubifs_file_mmap(struct file *file, struct vm_area_struct *vma)
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -277,18 +277,25 @@ static int kick_a_thread(void)
 	return 0;
 }
 
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc)
 {
-	int nr = sc->nr_to_scan;
-	int freed, contention = 0;
 	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
-	if (nr == 0)
-		/*
-		 * Due to the way UBIFS updates the clean znode counter it may
-		 * temporarily be negative.
-		 */
-		return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+	/*
+	 * Due to the way UBIFS updates the clean znode counter it may
+	 * temporarily be negative.
+	 */
+	return clean_zn_cnt >= 0 ? clean_zn_cnt : 1;
+}
+
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc)
+{
+	unsigned long nr = sc->nr_to_scan;
+	int contention = 0;
+	unsigned long freed;
+	long clean_zn_cnt = atomic_long_read(&ubifs_clean_zn_cnt);
 
 	if (!clean_zn_cnt) {
 		/*
@@ -316,10 +323,10 @@ int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc)
 
 	if (!freed && contention) {
 		dbg_tnc("freed nothing, but contention");
-		return -1;
+		return SHRINK_STOP;
 	}
 
 out:
-	dbg_tnc("%d znodes were freed, requested %d", freed, nr);
+	dbg_tnc("%lu znodes were freed, requested %lu", freed, nr);
 	return freed;
 }
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -49,7 +49,8 @@ struct kmem_cache *ubifs_inode_slab;
 
 /* UBIFS TNC shrinker description */
 static struct shrinker ubifs_shrinker_info = {
-	.shrink = ubifs_shrinker,
+	.scan_objects = ubifs_shrink_scan,
+	.count_objects = ubifs_shrink_count,
 	.seeks = DEFAULT_SEEKS,
 };
 
@@ -2248,8 +2249,8 @@ static int __init ubifs_init(void)
 
 	ubifs_inode_slab = kmem_cache_create("ubifs_inode_slab",
 				sizeof(struct ubifs_inode), 0,
-				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT,
-				&inode_slab_ctor);
+				SLAB_MEM_SPREAD | SLAB_RECLAIM_ACCOUNT |
+				SLAB_ACCOUNT, &inode_slab_ctor);
 	if (!ubifs_inode_slab)
 		return -ENOMEM;
 
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -1624,7 +1624,10 @@ int ubifs_tnc_start_commit(struct ubifs_info *c, struct ubifs_zbranch *zroot);
 int ubifs_tnc_end_commit(struct ubifs_info *c);
 
 /* shrinker.c */
-int ubifs_shrinker(struct shrinker *shrink, struct shrink_control *sc);
+unsigned long ubifs_shrink_scan(struct shrinker *shrink,
+				struct shrink_control *sc);
+unsigned long ubifs_shrink_count(struct shrinker *shrink,
+				 struct shrink_control *sc);
 
 /* commit.c */
 int ubifs_bg_thread(void *info);
--- a/fs/udf/super.c
+++ b/fs/udf/super.c
@@ -165,7 +165,8 @@ static int init_inodecache(void)
 	udf_inode_cachep = kmem_cache_create("udf_inode_cache",
 					     sizeof(struct udf_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT |
-						 SLAB_MEM_SPREAD),
+						 SLAB_MEM_SPREAD |
+						 SLAB_ACCOUNT),
 					     init_once);
 	if (!udf_inode_cachep)
 		return -ENOMEM;
--- a/fs/ufs/super.c
+++ b/fs/ufs/super.c
@@ -1458,7 +1458,7 @@ static int init_inodecache(void)
 	ufs_inode_cachep = kmem_cache_create("ufs_inode_cache",
 					     sizeof(struct ufs_inode_info),
 					     0, (SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 					     init_once);
 	if (ufs_inode_cachep == NULL)
 		return -ENOMEM;
--- a/fs/xattr.c
+++ b/fs/xattr.c
@@ -52,7 +52,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
 	 * The trusted.* namespace can only be accessed by privileged users.
 	 */
 	if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN)) {
-		if (!capable(CAP_SYS_ADMIN))
+		if (!ve_capable(CAP_SYS_ADMIN))
 			return (mask & MAY_WRITE) ? -EPERM : -ENODATA;
 		return 0;
 	}
--- a/fs/xfs/kmem.h
+++ b/fs/xfs/kmem.h
@@ -84,6 +84,7 @@ kmem_zalloc(size_t size, xfs_km_flags_t flags)
 #define KM_ZONE_HWALIGN	SLAB_HWCACHE_ALIGN
 #define KM_ZONE_RECLAIM	SLAB_RECLAIM_ACCOUNT
 #define KM_ZONE_SPREAD	SLAB_MEM_SPREAD
+#define KM_ZONE_ACCOUNT	SLAB_ACCOUNT
 
 #define kmem_zone	kmem_cache
 #define kmem_zone_t	struct kmem_cache
--- a/fs/xfs/xfs_acl.c
+++ b/fs/xfs/xfs_acl.c
@@ -404,9 +404,9 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 		goto out_release;
 
 	if (type == ACL_TYPE_ACCESS) {
-		umode_t mode = inode->i_mode;
-		error = posix_acl_equiv_mode(acl, &mode);
+		umode_t mode;
 
+		error = posix_acl_update_mode(inode, &mode, &acl);
 		if (error <= 0) {
 			posix_acl_release(acl);
 			acl = NULL;
@@ -414,7 +414,6 @@ xfs_xattr_acl_set(struct dentry *dentry, const char *name,
 			if (error < 0)
 				return error;
 		}
-
 		error = xfs_set_mode(inode, mode);
 		if (error)
 			goto out_release;
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,7 @@
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
+#include <bc/io_acct.h>
 
 /*
  * structure owned by writepages passed to individual writepage calls
@@ -1817,6 +1818,11 @@ xfs_vm_set_page_dirty(
 			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 					page_index(page), PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					!radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_dirty(mapping);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 		__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -80,54 +80,6 @@ xfs_buf_vmap_len(
 }
 
 /*
- * xfs_buf_lru_add - add a buffer to the LRU.
- *
- * The LRU takes a new reference to the buffer so that it will only be freed
- * once the shrinker takes the buffer off the LRU.
- */
-STATIC void
-xfs_buf_lru_add(
-	struct xfs_buf	*bp)
-{
-	struct xfs_buftarg *btp = bp->b_target;
-
-	spin_lock(&btp->bt_lru_lock);
-	if (list_empty(&bp->b_lru)) {
-		atomic_inc(&bp->b_hold);
-		list_add_tail(&bp->b_lru, &btp->bt_lru);
-		btp->bt_lru_nr++;
-		bp->b_state &= ~XFS_BSTATE_DISPOSE;
-	}
-	spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
- * xfs_buf_lru_del - remove a buffer from the LRU
- *
- * The unlocked check is safe here because it only occurs when there are not
- * b_lru_ref counts left on the inode under the pag->pag_buf_lock. it is there
- * to optimise the shrinker removing the buffer from the LRU and calling
- * xfs_buf_free(). i.e. it removes an unnecessary round trip on the
- * bt_lru_lock.
- */
-STATIC void
-xfs_buf_lru_del(
-	struct xfs_buf	*bp)
-{
-	struct xfs_buftarg *btp = bp->b_target;
-
-	if (list_empty(&bp->b_lru))
-		return;
-
-	spin_lock(&btp->bt_lru_lock);
-	if (!list_empty(&bp->b_lru)) {
-		list_del_init(&bp->b_lru);
-		btp->bt_lru_nr--;
-	}
-	spin_unlock(&btp->bt_lru_lock);
-}
-
-/*
  * Bump the I/O in flight count on the buftarg if we haven't yet done so for
  * this buffer. The count is incremented once per buffer (per hold cycle)
  * because the corresponding decrement is deferred to buffer release. Buffers
@@ -200,20 +152,12 @@ xfs_buf_stale(
 
 	spin_lock(&bp->b_lock);
 	atomic_set(&bp->b_lru_ref, 0);
-	if (!list_empty(&bp->b_lru)) {
-		struct xfs_buftarg *btp = bp->b_target;
+	if (!(bp->b_state & XFS_BSTATE_DISPOSE) &&
+	    (list_lru_del(&bp->b_target->bt_lru, &bp->b_lru)))
+		atomic_dec(&bp->b_hold);
 
-		spin_lock(&btp->bt_lru_lock);
-		if (!list_empty(&bp->b_lru) &&
-		    !(bp->b_state & XFS_BSTATE_DISPOSE)) {
-			list_del_init(&bp->b_lru);
-			btp->bt_lru_nr--;
-			atomic_dec(&bp->b_hold);
-		}
-		spin_unlock(&btp->bt_lru_lock);
-	}
-	spin_unlock(&bp->b_lock);
 	ASSERT(atomic_read(&bp->b_hold) >= 1);
+	spin_unlock(&bp->b_lock);
 }
 
 static int
@@ -1019,7 +963,10 @@ xfs_buf_rele(
 		 * reference to the buffer for the LRU and clear the
 		 * (now stale) dispose list state flag
 		 */
-		xfs_buf_lru_add(bp);
+		if (list_lru_add(&bp->b_target->bt_lru, &bp->b_lru)) {
+			bp->b_state &= ~XFS_BSTATE_DISPOSE;
+			atomic_inc(&bp->b_hold);
+		}
 		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		/*
@@ -1029,11 +976,10 @@ xfs_buf_rele(
 		 * buffer was on was the disposal list
 		 */
 		if (!(bp->b_state & XFS_BSTATE_DISPOSE)) {
-			xfs_buf_lru_del(bp);
+			list_lru_del(&bp->b_target->bt_lru, &bp->b_lru);
 		} else {
 			ASSERT(list_empty(&bp->b_lru));
 		}
-
 		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 		rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 		spin_unlock(&pag->pag_buf_lock);
@@ -1615,129 +1561,128 @@ xfs_buf_iomove(
  * returned. These buffers will have an elevated hold count, so wait on those
  * while freeing all the buffers only held by the LRU.
  */
+static enum lru_status
+xfs_buftarg_wait_rele(
+	struct list_head	*item,
+	struct list_lru_one     *lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head        *dispose = arg;
+
+	if (atomic_read(&bp->b_hold) > 1) {
+		/* need to wait */
+		trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
+		return LRU_SKIP;
+	}
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	/*
+	 * clear the LRU reference count so the buffer doesn't get
+	 * ignored in xfs_buf_rele().
+	 */
+	atomic_set(&bp->b_lru_ref, 0);
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_lru_isolate_move(lru, item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
+}
+
 void
 xfs_wait_buftarg(
 	struct xfs_buftarg	*btp)
 {
-	struct xfs_buf		*bp;
 	LIST_HEAD(dispose);
+	int loop = 0;
 
-	/*
-	 * First wait on the buftarg I/O count for all in-flight buffers to be
-	 * released. This is critical as new buffers do not make the LRU until
-	 * they are released.
-	 *
-	 * Next, flush the buffer workqueue to ensure all completion processing
-	 * has finished. Just waiting on buffer locks is not sufficient for
-	 * async IO as the reference count held over IO is not released until
-	 * after the buffer lock is dropped. Hence we need to ensure here that
-	 * all reference counts have been dropped before we start walking the
-	 * LRU list.
-	 */
-	while (percpu_counter_sum(&btp->bt_io_count))
-		delay(100);
-	flush_workqueue(btp->bt_mount->m_buf_workqueue);
-
-restart:
-	spin_lock(&btp->bt_lru_lock);
-	while (!list_empty(&btp->bt_lru)) {
-		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-		if (atomic_read(&bp->b_hold) > 1) {
-			/* need to wait, so skip it this pass */
-			trace_xfs_buf_wait_buftarg(bp, _RET_IP_);
-skip:
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			spin_unlock(&btp->bt_lru_lock);
-			delay(100);
-			goto restart;
-		}
-		if (!spin_trylock(&bp->b_lock))
-			goto skip;
+	/* loop until there is nothing left on the lru list. */
+	while (list_lru_count(&btp->bt_lru)) {
+		list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele,
+			      &dispose, LONG_MAX);
 
-		/*
-		 * clear the LRU reference count so the buffer doesn't get
-		 * ignored in xfs_buf_rele().
-		 */
-		atomic_set(&bp->b_lru_ref, 0);
-		if (bp->b_flags & XBF_WRITE_FAIL) {
-			xfs_alert(btp->bt_mount,
-"Corruption Alert: Buffer at block 0x%llx had permanent write failures!",
-				(long long)bp->b_bn);
-			xfs_alert(btp->bt_mount,
-"Please run xfs_repair to determine the extent of the problem.");
+		while (!list_empty(&dispose)) {
+			struct xfs_buf *bp;
+			bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
+			list_del_init(&bp->b_lru);
+			if (bp->b_flags & XBF_WRITE_FAIL) {
+				xfs_alert(btp->bt_mount,
+"Corruption Alert: Buffer at block 0x%llx had permanent write failures!\n"
+"Please run xfs_repair to determine the extent of the problem.",
+					(long long)bp->b_bn);
+			}
+			xfs_buf_rele(bp);
 		}
-		bp->b_state |= XFS_BSTATE_DISPOSE;
-		list_move_tail(&bp->b_lru, &dispose);
-		spin_unlock(&bp->b_lock);
+		if (loop++ != 0)
+			delay(100);
 	}
-	spin_unlock(&btp->bt_lru_lock);
+}
 
-	while (!list_empty(&dispose)) {
-		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
-		list_del_init(&bp->b_lru);
-		xfs_buf_rele(bp);
+static enum lru_status
+xfs_buftarg_isolate(
+	struct list_head	*item,
+	struct list_lru_one     *lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+{
+	struct xfs_buf		*bp = container_of(item, struct xfs_buf, b_lru);
+	struct list_head	*dispose = arg;
+
+	/*
+	 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
+	 * If we fail to get the lock, just skip it.
+	 */
+	if (!spin_trylock(&bp->b_lock))
+		return LRU_SKIP;
+	/*
+	 * Decrement the b_lru_ref count unless the value is already
+	 * zero. If the value is already zero, we need to reclaim the
+	 * buffer, otherwise it gets another trip through the LRU.
+	 */
+	if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
+		spin_unlock(&bp->b_lock);
+		return LRU_ROTATE;
 	}
+
+	bp->b_state |= XFS_BSTATE_DISPOSE;
+	list_lru_isolate_move(lru, item, dispose);
+	spin_unlock(&bp->b_lock);
+	return LRU_REMOVED;
 }
 
-int
-xfs_buftarg_shrink(
+static unsigned long
+xfs_buftarg_shrink_scan(
 	struct shrinker		*shrink,
 	struct shrink_control	*sc)
 {
 	struct xfs_buftarg	*btp = container_of(shrink,
 					struct xfs_buftarg, bt_shrinker);
-	struct xfs_buf		*bp;
-	int nr_to_scan = sc->nr_to_scan;
 	LIST_HEAD(dispose);
+	unsigned long		freed;
+	unsigned long		nr_to_scan = sc->nr_to_scan;
 
-	if (!nr_to_scan)
-		return btp->bt_lru_nr;
-
-	spin_lock(&btp->bt_lru_lock);
-	while (!list_empty(&btp->bt_lru)) {
-		if (nr_to_scan-- <= 0)
-			break;
-
-		bp = list_first_entry(&btp->bt_lru, struct xfs_buf, b_lru);
-
-		/*
-		 * we are inverting the lru lock/bp->b_lock here, so use a trylock.
-		 * If we fail to get the lock, just skip it.
-		 */
-		if (!spin_trylock(&bp->b_lock)) {
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			continue;
-		}
-
-		/*
-		 * Decrement the b_lru_ref count unless the value is already
-		 * zero. If the value is already zero, we need to reclaim the
-		 * buffer, otherwise it gets another trip through the LRU.
-		 */
-		if (!atomic_add_unless(&bp->b_lru_ref, -1, 0)) {
-			spin_unlock(&bp->b_lock);
-			list_move_tail(&bp->b_lru, &btp->bt_lru);
-			continue;
-		}
-
-		/*
-		 * remove the buffer from the LRU now to avoid needing another
-		 * lock round trip inside xfs_buf_rele().
-		 */
-		list_move(&bp->b_lru, &dispose);
-		btp->bt_lru_nr--;
-		bp->b_state |= XFS_BSTATE_DISPOSE;
-		spin_unlock(&bp->b_lock);
-	}
-	spin_unlock(&btp->bt_lru_lock);
+	freed = list_lru_walk_node(&btp->bt_lru, sc->nid, xfs_buftarg_isolate,
+				       &dispose, &nr_to_scan);
 
 	while (!list_empty(&dispose)) {
+		struct xfs_buf *bp;
 		bp = list_first_entry(&dispose, struct xfs_buf, b_lru);
 		list_del_init(&bp->b_lru);
 		xfs_buf_rele(bp);
 	}
 
-	return btp->bt_lru_nr;
+	return freed;
+}
+
+static unsigned long
+xfs_buftarg_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_buftarg	*btp = container_of(shrink,
+					struct xfs_buftarg, bt_shrinker);
+	return list_lru_count_node(&btp->bt_lru, sc->nid);
 }
 
 void
@@ -1748,6 +1693,7 @@ xfs_free_buftarg(
 	unregister_shrinker(&btp->bt_shrinker);
 	ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0);
 	percpu_counter_destroy(&btp->bt_io_count);
+	list_lru_destroy(&btp->bt_lru);
 
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
@@ -1809,16 +1755,18 @@ xfs_alloc_buftarg(
 	btp->bt_bdev = bdev;
 	btp->bt_bdi = blk_get_backing_dev_info(bdev);
 
-	INIT_LIST_HEAD(&btp->bt_lru);
-	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
 
-	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+	if (list_lru_init(&btp->bt_lru))
 		goto error;
 
-	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
+	if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL))
+		goto error;
+	btp->bt_shrinker.count_objects = xfs_buftarg_shrink_count;
+	btp->bt_shrinker.scan_objects = xfs_buftarg_shrink_scan;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
+	btp->bt_shrinker.flags = SHRINKER_NUMA_AWARE;
 	register_shrinker(&btp->bt_shrinker);
 	return btp;
 
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -26,6 +26,7 @@
 #include <linux/dax.h>
 #include <linux/buffer_head.h>
 #include <linux/uio.h>
+#include <linux/list_lru.h>
 
 /*
  *	Base types
@@ -88,7 +89,8 @@ typedef unsigned int xfs_buf_flags_t;
 /*
  * Internal state flags.
  */
-#define XFS_BSTATE_DISPOSE	(1 << 0)	/* buffer being discarded */
+#define XFS_BSTATE_DISPOSE	 (1 << 0)	/* buffer being discarded */
+
 
 /*
  * The xfs_buftarg contains 2 notions of "sector size" -
@@ -115,9 +117,7 @@ typedef struct xfs_buftarg {
 
 	/* LRU control structures */
 	struct shrinker		bt_shrinker;
-	struct list_head	bt_lru;
-	spinlock_t		bt_lru_lock;
-	unsigned int		bt_lru_nr;
+	struct list_lru		bt_lru;
 
 	struct percpu_counter	bt_io_count;
 } xfs_buftarg_t;
@@ -163,6 +163,7 @@ typedef struct xfs_buf {
 	 * bt_lru_lock and not by b_sema
 	 */
 	struct list_head	b_lru;		/* lru list */
+	xfs_buf_flags_t		b_lru_flags;	/* internal lru status flags */
 	spinlock_t		b_lock;		/* internal state lock */
 	unsigned int		b_state;	/* internal state flags */
 	int			b_io_error;	/* internal IO error state */
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -945,13 +945,8 @@ xfs_qm_dqput(
 		struct xfs_quotainfo	*qi = dqp->q_mount->m_quotainfo;
 		trace_xfs_dqput_free(dqp);
 
-		mutex_lock(&qi->qi_lru_lock);
-		if (list_empty(&dqp->q_lru)) {
-			list_add_tail(&dqp->q_lru, &qi->qi_lru_list);
-			qi->qi_lru_count++;
+		if (list_lru_add(&qi->qi_lru, &dqp->q_lru))
 			XFS_STATS_INC(dqp->q_mount, xs_qm_dquot_unused);
-		}
-		mutex_unlock(&qi->qi_lru_lock);
 
 	}
 	xfs_dqunlock(dqp);
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -346,7 +346,7 @@ xfs_file_aio_read(
 	 * serialisation.
 	 */
 	xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-	if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+	if (ioflags & XFS_IO_ISDIRECT) {
 		xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
 		xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -361,22 +361,20 @@ xfs_file_aio_read(
 		 * flush and reduce the chances of repeated iolock cycles going
 		 * forward.
 		 */
-		if (inode->i_mapping->nrpages) {
-			ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
-			if (ret) {
-				xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
-				return ret;
-			}
-
-			/*
-			 * Invalidate whole pages. This can return an error if
-			 * we fail to invalidate a page, but this should never
-			 * happen on XFS. Warn if it does fail.
-			 */
-			ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
-			WARN_ON_ONCE(ret);
-			ret = 0;
+		ret = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+		if (ret) {
+			xfs_rw_iunlock(ip, XFS_IOLOCK_EXCL);
+			return ret;
 		}
+
+		/*
+		 * Invalidate whole pages. This can return an error if
+		 * we fail to invalidate a page, but this should never
+		 * happen on XFS. Warn if it does fail.
+		 */
+		ret = invalidate_inode_pages2(VFS_I(ip)->i_mapping);
+		WARN_ON_ONCE(ret);
+		ret = 0;
 		xfs_rw_ilock_demote(ip, XFS_IOLOCK_EXCL);
 	}
 
@@ -1715,7 +1713,6 @@ static const struct vm_operations_struct xfs_file_vm_ops = {
 	.fault		= xfs_filemap_fault,
 	.pmd_fault	= xfs_filemap_pmd_fault,
 	.page_mkwrite	= xfs_filemap_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 	.pfn_mkwrite	= xfs_filemap_pfn_mkwrite,
 };
 
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -1155,7 +1155,7 @@ xfs_reclaim_inodes(
  * them to be cleaned, which we hope will not be very long due to the
  * background walker having already kicked the IO off on those dirty inodes.
  */
-void
+long
 xfs_reclaim_inodes_nr(
 	struct xfs_mount	*mp,
 	int			nr_to_scan)
@@ -1164,7 +1164,7 @@ xfs_reclaim_inodes_nr(
 	xfs_reclaim_work_queue(mp);
 	xfs_ail_push_all(mp->m_ail);
 
-	xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
+	return xfs_reclaim_inodes_ag(mp, SYNC_TRYLOCK | SYNC_WAIT, &nr_to_scan);
 }
 
 /*
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -59,7 +59,7 @@ void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
-void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
+long xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
 
 void xfs_inode_set_reclaim_tag(struct xfs_inode *ip);
 
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -46,8 +46,9 @@
  */
 STATIC int	xfs_qm_init_quotainos(xfs_mount_t *);
 STATIC int	xfs_qm_init_quotainfo(xfs_mount_t *);
-STATIC int	xfs_qm_shake(struct shrinker *, struct shrink_control *);
 
+
+STATIC void	xfs_qm_dqfree_one(struct xfs_dquot *dqp);
 /*
  * We use the batch lookup interface to iterate over the dquots as it
  * currently is the only interface into the radix tree code that allows
@@ -181,12 +182,9 @@ xfs_qm_dqpurge(
 	 * We move dquots to the freelist as soon as their reference count
 	 * hits zero, so it really should be on the freelist here.
 	 */
-	mutex_lock(&qi->qi_lru_lock);
 	ASSERT(!list_empty(&dqp->q_lru));
-	list_del_init(&dqp->q_lru);
-	qi->qi_lru_count--;
+	list_lru_del(&qi->qi_lru, &dqp->q_lru);
 	XFS_STATS_DEC(mp, xs_qm_dquot_unused);
-	mutex_unlock(&qi->qi_lru_lock);
 
 	xfs_qm_dqdestroy(dqp);
 	return 0;
@@ -455,6 +453,143 @@ xfs_qm_set_defquota(
 	}
 }
 
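+/*
+ * Per-scan state for the dquot LRU walk: buffers of dirty dquots are
+ * queued on ->buffers for a single delwri submission after the walk,
+ * while reclaimable dquots are moved to ->dispose and freed afterwards.
+ */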
+struct xfs_qm_isolate {
+	struct list_head	buffers;
+	struct list_head	dispose;
+};
+
+static enum lru_status
+xfs_qm_dquot_isolate(
+	struct list_head	*item,
+	struct list_lru_one	*lru,
+	spinlock_t		*lru_lock,
+	void			*arg)
+{
+	struct xfs_dquot	*dqp = container_of(item,
+						struct xfs_dquot, q_lru);
+	struct xfs_qm_isolate	*isol = arg;
+
+	if (!xfs_dqlock_nowait(dqp))
+		goto out_miss_busy;
+
+	/*
+	 * This dquot has acquired a reference in the meantime; remove it
+	 * from the freelist and try again.
+	 */
+	if (dqp->q_nrefs) {
+		xfs_dqunlock(dqp);
+		XFS_STATS_INC(dqp->q_mount, xs_qm_dqwants);
+
+		trace_xfs_dqreclaim_want(dqp);
+		list_lru_isolate(lru, &dqp->q_lru);
+		XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
+		return LRU_REMOVED;
+	}
+
+	/*
+	 * If the dquot is dirty, flush it. If it's already being flushed, just
+	 * skip it so there is time for the IO to complete before we try to
+	 * reclaim it again on the next LRU pass.
+	 */
+	if (!xfs_dqflock_nowait(dqp)) {
+		xfs_dqunlock(dqp);
+		goto out_miss_busy;
+	}
+
+	if (XFS_DQ_IS_DIRTY(dqp)) {
+		struct xfs_buf	*bp = NULL;
+		int		error;
+
+		trace_xfs_dqreclaim_dirty(dqp);
+
+		/* we have to drop the LRU lock to flush the dquot */
+		spin_unlock(lru_lock);
+
+		error = xfs_qm_dqflush(dqp, &bp);
+		if (error) {
+			xfs_warn(dqp->q_mount, "%s: dquot %p flush failed",
+				 __func__, dqp);
+			goto out_unlock_dirty;
+		}
+
+		xfs_buf_delwri_queue(bp, &isol->buffers);
+		xfs_buf_relse(bp);
+		goto out_unlock_dirty;
+	}
+	xfs_dqfunlock(dqp);
+
+	/*
+	 * Prevent lookups now that we are past the point of no return.
+	 */
+	dqp->dq_flags |= XFS_DQ_FREEING;
+	xfs_dqunlock(dqp);
+
+	ASSERT(dqp->q_nrefs == 0);
+	list_lru_isolate_move(lru, &dqp->q_lru, &isol->dispose);
+	XFS_STATS_DEC(dqp->q_mount, xs_qm_dquot_unused);
+	trace_xfs_dqreclaim_done(dqp);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaims);
+	return LRU_REMOVED;
+
+out_miss_busy:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
+	return LRU_SKIP;
+
+out_unlock_dirty:
+	trace_xfs_dqreclaim_busy(dqp);
+	XFS_STATS_INC(dqp->q_mount, xs_qm_dqreclaim_misses);
+	xfs_dqunlock(dqp);
+	spin_lock(lru_lock);
+	return LRU_RETRY;
+}
+
+static unsigned long
+xfs_qm_shrink_scan(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+	struct xfs_qm_isolate	isol;
+	unsigned long		freed;
+	int			error;
+
+	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
+		return 0;
+
+	INIT_LIST_HEAD(&isol.buffers);
+	INIT_LIST_HEAD(&isol.dispose);
+
+	freed = list_lru_shrink_walk(&qi->qi_lru, sc,
+				     xfs_qm_dquot_isolate, &isol);
+
+	error = xfs_buf_delwri_submit(&isol.buffers);
+	if (error)
+		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
+
+	while (!list_empty(&isol.dispose)) {
+		struct xfs_dquot	*dqp;
+
+		dqp = list_first_entry(&isol.dispose, struct xfs_dquot, q_lru);
+		list_del_init(&dqp->q_lru);
+		xfs_qm_dqfree_one(dqp);
+	}
+
+	return freed;
+}
+
+static unsigned long
+xfs_qm_shrink_count(
+	struct shrinker		*shrink,
+	struct shrink_control	*sc)
+{
+	struct xfs_quotainfo	*qi = container_of(shrink,
+					struct xfs_quotainfo, qi_shrinker);
+
+	return list_lru_shrink_count(&qi->qi_lru, sc);
+}
+
 /*
  * This initializes all the quota information that's kept in the
  * mount structure
@@ -471,11 +606,18 @@ xfs_qm_init_quotainfo(
 
 	qinf = mp->m_quotainfo = kmem_zalloc(sizeof(xfs_quotainfo_t), KM_SLEEP);
 
+	if ((error = list_lru_init(&qinf->qi_lru))) {
+		kmem_free(qinf);
+		mp->m_quotainfo = NULL;
+		return error;
+	}
+
 	/*
 	 * See if quotainodes are setup, and if not, allocate them,
 	 * and change the superblock accordingly.
 	 */
 	if ((error = xfs_qm_init_quotainos(mp))) {
+		list_lru_destroy(&qinf->qi_lru);
 		kmem_free(qinf);
 		mp->m_quotainfo = NULL;
 		return error;
@@ -486,10 +628,6 @@ xfs_qm_init_quotainfo(
 	INIT_RADIX_TREE(&qinf->qi_pquota_tree, GFP_NOFS);
 	mutex_init(&qinf->qi_tree_lock);
 
-	INIT_LIST_HEAD(&qinf->qi_lru_list);
-	qinf->qi_lru_count = 0;
-	mutex_init(&qinf->qi_lru_lock);
-
 	/* mutex used to serialize quotaoffs */
 	mutex_init(&qinf->qi_quotaofflock);
 
@@ -554,8 +692,10 @@ xfs_qm_init_quotainfo(
 	if (XFS_IS_PQUOTA_RUNNING(mp))
 		xfs_qm_set_defquota(mp, XFS_DQ_PROJ, qinf);
 
-	qinf->qi_shrinker.shrink = xfs_qm_shake;
+	qinf->qi_shrinker.count_objects = xfs_qm_shrink_count;
+	qinf->qi_shrinker.scan_objects = xfs_qm_shrink_scan;
 	qinf->qi_shrinker.seeks = DEFAULT_SEEKS;
+	qinf->qi_shrinker.flags = SHRINKER_NUMA_AWARE;
 	register_shrinker(&qinf->qi_shrinker);
 	return 0;
 }
@@ -576,6 +716,7 @@ xfs_qm_destroy_quotainfo(
 	ASSERT(qi != NULL);
 
 	unregister_shrinker(&qi->qi_shrinker);
+	list_lru_destroy(&qi->qi_lru);
 
 	if (qi->qi_uquotaip) {
 		IRELE(qi->qi_uquotaip);
@@ -1471,132 +1612,6 @@ xfs_qm_dqfree_one(
 	xfs_qm_dqdestroy(dqp);
 }
 
-STATIC void
-xfs_qm_dqreclaim_one(
-	struct xfs_dquot	*dqp,
-	struct list_head	*buffer_list,
-	struct list_head	*dispose_list)
-{
-	struct xfs_mount	*mp = dqp->q_mount;
-	struct xfs_quotainfo	*qi = mp->m_quotainfo;
-	int			error;
-
-	if (!xfs_dqlock_nowait(dqp))
-		goto out_move_tail;
-
-	/*
-	 * This dquot has acquired a reference in the meantime remove it from
-	 * the freelist and try again.
-	 */
-	if (dqp->q_nrefs) {
-		xfs_dqunlock(dqp);
-
-		trace_xfs_dqreclaim_want(dqp);
-		XFS_STATS_INC(mp, xs_qm_dqwants);
-
-		list_del_init(&dqp->q_lru);
-		qi->qi_lru_count--;
-		XFS_STATS_DEC(mp, xs_qm_dquot_unused);
-		return;
-	}
-
-	/*
-	 * Try to grab the flush lock. If this dquot is in the process of
-	 * getting flushed to disk, we don't want to reclaim it.
-	 */
-	if (!xfs_dqflock_nowait(dqp))
-		goto out_unlock_move_tail;
-
-	if (XFS_DQ_IS_DIRTY(dqp)) {
-		struct xfs_buf	*bp = NULL;
-
-		trace_xfs_dqreclaim_dirty(dqp);
-
-		error = xfs_qm_dqflush(dqp, &bp);
-		if (error) {
-			xfs_warn(mp, "%s: dquot %p flush failed",
-				 __func__, dqp);
-			goto out_unlock_move_tail;
-		}
-
-		xfs_buf_delwri_queue(bp, buffer_list);
-		xfs_buf_relse(bp);
-		/*
-		 * Give the dquot another try on the freelist, as the
-		 * flushing will take some time.
-		 */
-		goto out_unlock_move_tail;
-	}
-	xfs_dqfunlock(dqp);
-
-	/*
-	 * Prevent lookups now that we are past the point of no return.
-	 */
-	dqp->dq_flags |= XFS_DQ_FREEING;
-	xfs_dqunlock(dqp);
-
-	ASSERT(dqp->q_nrefs == 0);
-	list_move_tail(&dqp->q_lru, dispose_list);
-	qi->qi_lru_count--;
-	XFS_STATS_DEC(mp, xs_qm_dquot_unused);
-
-	trace_xfs_dqreclaim_done(dqp);
-	XFS_STATS_INC(mp, xs_qm_dqreclaims);
-	return;
-
-	/*
-	 * Move the dquot to the tail of the list so that we don't spin on it.
-	 */
-out_unlock_move_tail:
-	xfs_dqunlock(dqp);
-out_move_tail:
-	list_move_tail(&dqp->q_lru, &qi->qi_lru_list);
-	trace_xfs_dqreclaim_busy(dqp);
-	XFS_STATS_INC(mp, xs_qm_dqreclaim_misses);
-}
-
-STATIC int
-xfs_qm_shake(
-	struct shrinker		*shrink,
-	struct shrink_control	*sc)
-{
-	struct xfs_quotainfo	*qi =
-		container_of(shrink, struct xfs_quotainfo, qi_shrinker);
-	int			nr_to_scan = sc->nr_to_scan;
-	LIST_HEAD		(buffer_list);
-	LIST_HEAD		(dispose_list);
-	struct xfs_dquot	*dqp;
-	int			error;
-
-	if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
-		return 0;
-	if (!nr_to_scan)
-		goto out;
-
-	mutex_lock(&qi->qi_lru_lock);
-	while (!list_empty(&qi->qi_lru_list)) {
-		if (nr_to_scan-- <= 0)
-			break;
-		dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
-				       q_lru);
-		xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
-	}
-	mutex_unlock(&qi->qi_lru_lock);
-
-	error = xfs_buf_delwri_submit(&buffer_list);
-	if (error)
-		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
-
-	while (!list_empty(&dispose_list)) {
-		dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
-		list_del_init(&dqp->q_lru);
-		xfs_qm_dqfree_one(dqp);
-	}
-
-out:
-	return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
-}
-
 /* --------------- utility functions for vnodeops ---------------- */
 
 
--- a/fs/xfs/xfs_qm.h
+++ b/fs/xfs/xfs_qm.h
@@ -74,9 +74,7 @@ typedef struct xfs_quotainfo {
 	struct xfs_inode	*qi_uquotaip;	/* user quota inode */
 	struct xfs_inode	*qi_gquotaip;	/* group quota inode */
 	struct xfs_inode	*qi_pquotaip;	/* project quota inode */
-	struct list_head qi_lru_list;
-	struct mutex	 qi_lru_lock;
-	int		 qi_lru_count;
+	struct list_lru	 qi_lru;
 	int		 qi_dquots;
 	time_t		 qi_btimelimit;	 /* limit for blks timer */
 	time_t		 qi_itimelimit;	 /* limit for inodes timer */
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -1646,19 +1646,20 @@ xfs_fs_mount(
 	return mount_bdev(fs_type, flags, dev_name, data, xfs_fs_fill_super);
 }
 
-static int
+static long
 xfs_fs_nr_cached_objects(
-	struct super_block	*sb)
+	struct super_block	*sb,
+	struct shrink_control	*sc)
 {
 	return xfs_reclaim_inodes_count(XFS_M(sb));
 }
 
-static void
+static long
 xfs_fs_free_cached_objects(
 	struct super_block	*sb,
-	int			nr_to_scan)
+	struct shrink_control	*sc)
 {
-	xfs_reclaim_inodes_nr(XFS_M(sb), nr_to_scan);
+	return xfs_reclaim_inodes_nr(XFS_M(sb), sc->nr_to_scan);
 }
 
 static const struct super_operations xfs_super_operations = {
@@ -1759,8 +1760,8 @@ xfs_init_zones(void)
 
 	xfs_inode_zone =
 		kmem_zone_init_flags(sizeof(xfs_inode_t), "xfs_inode",
-			KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD,
-			xfs_fs_inode_init_once);
+			KM_ZONE_HWALIGN | KM_ZONE_RECLAIM | KM_ZONE_SPREAD |
+			KM_ZONE_ACCOUNT, xfs_fs_inode_init_once);
 	if (!xfs_inode_zone)
 		goto out_destroy_efi_zone;
 
--- a/include/asm-generic/kexec.h
+++ b/include/asm-generic/kexec.h
@@ -4,7 +4,11 @@
 #ifdef CONFIG_KEXEC_AUTO_RESERVE
 
 #ifndef KEXEC_AUTO_RESERVED_SIZE
+#ifndef CONFIG_KASAN
 #define KEXEC_AUTO_RESERVED_SIZE ((1ULL<<27) + (1ULL<<25)) /* 160M */
+#else
+#define KEXEC_AUTO_RESERVED_SIZE ((1ULL<<28) + (1ULL<<26)) /* 320M */
+#endif
 #endif
 #ifndef KEXEC_AUTO_THRESHOLD
 #define KEXEC_AUTO_THRESHOLD (1ULL<<31) /* 2G */
--- a/include/asm-generic/pgtable.h
+++ b/include/asm-generic/pgtable.h
@@ -449,21 +449,6 @@ static inline pte_t pte_swp_clear_soft_dirty(pte_t pte)
 {
 	return pte;
 }
-
-static inline pte_t pte_file_clear_soft_dirty(pte_t pte)
-{
-       return pte;
-}
-
-static inline pte_t pte_file_mksoft_dirty(pte_t pte)
-{
-       return pte;
-}
-
-static inline int pte_file_soft_dirty(pte_t pte)
-{
-       return 0;
-}
 #endif
 
 #ifndef __HAVE_PFNMAP_TRACKING
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -490,6 +490,7 @@
 #define KERNEL_CTORS()	. = ALIGN(8);			   \
 			VMLINUX_SYMBOL(__ctors_start) = .; \
 			*(.ctors)			   \
+			*(SORT(.init_array.*))		   \
 			*(.init_array)			   \
 			VMLINUX_SYMBOL(__ctors_end) = .;
 #else
--- /dev/null
+++ b/include/bc/beancounter.h
@@ -0,0 +1,493 @@
+/*
+ *  include/bc/beancounter.h
+ *
+ *  Copyright (c) 1999-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ *  Andrey Savochkin	saw@sw-soft.com
+ *
+ */
+
+#ifndef _LINUX_BEANCOUNTER_H
+#define _LINUX_BEANCOUNTER_H
+
+#include <linux/interrupt.h>
+#include <linux/spinlock.h>
+#include <linux/cache.h>
+#include <linux/threads.h>
+#include <linux/percpu.h>
+#include <linux/percpu_counter.h>
+#include <linux/ratelimit.h>
+#include <linux/cgroup.h>
+#include <bc/decl.h>
+#include <asm/atomic.h>
+
+#include <uapi/linux/beancounter.h>
+
+/*
+ * This magic is used to distinguish the user beancounter from the pages
+ * beancounter in struct page. page_ub and page_bc are placed in a union,
+ * and the MAGIC ensures that we don't use a pbc as a ubc in
+ * ub_page_uncharge().
+ */
+#define UB_MAGIC		0x62756275
+
+/*
+ * UB_MAXVALUE is essentially LONG_MAX declared in a cross-compiling safe form.
+ */
+#define UB_MAXVALUE	( (1UL << (sizeof(unsigned long)*8-1)) - 1)
+
+
+/*
+ *	Resource management structures
+ * Serialization issues:
+ *   beancounter list management is protected via ub_hash_lock
+ *   task pointers are set only for current task and only once
+ *   refcount is managed atomically
+ *   value and limit comparison and change are protected by per-ub spinlock
+ */
+
+struct task_beancounter;
+
+struct ub_percpu_struct {
+	int dirty_pages;
+	int writeback_pages;
+	int wb_requests;
+	int wb_sectors;
+
+	unsigned long fuse_requests;
+	unsigned long fuse_bytes;
+
+#ifdef CONFIG_BC_IO_ACCOUNTING
+	unsigned long async_write_complete;
+	unsigned long async_write_canceled;
+	unsigned long long sync_write_bytes;
+	unsigned long long sync_read_bytes;
+#endif
+	unsigned long	sync;
+	unsigned long	sync_done;
+
+	unsigned long	fsync;
+	unsigned long	fsync_done;
+
+	unsigned long	fdsync;
+	unsigned long	fdsync_done;
+
+	unsigned long	frsync;
+	unsigned long	frsync_done;
+
+	/* percpu resource precharge */
+	int	precharge[UB_RESOURCES];
+};
+
+enum {
+	UB_MEM_CGROUP,
+	UB_BLKIO_CGROUP,
+	NR_UB_BOUND_CGROUPS,
+};
+
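+/*
+ * A beancounter is implemented as a cgroup subsystem state; the memory
+ * and blkio cgroups bound to the same container are kept reachable
+ * through ub_bound_css[].
+ */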
+struct user_beancounter {
+	struct cgroup_subsys_state css;
+
+	struct cgroup_subsys_state *ub_bound_css[NR_UB_BOUND_CGROUPS];
+
+	unsigned long		ub_magic;
+	struct list_head	ub_list;
+
+	spinlock_t		ub_lock;
+	const char		*ub_name;
+
+	struct ratelimit_state	ub_ratelimit;
+
+	atomic_long_t		dirty_pages;
+	atomic_long_t		writeback_pages;
+	atomic_long_t		wb_requests;
+	atomic_long_t		wb_sectors;
+
+	unsigned long		swapin;
+	unsigned long		swapout;
+
+	void			*iolimit;
+
+	/* resources statistic and settings */
+	struct ubparm		ub_parms[UB_RESOURCES];
+	/* resources statistic for last interval */
+	struct ubparm		*ub_store;
+
+	struct ub_percpu_struct	*ub_percpu;
+};
+
+extern int ub_count;
+
+enum ub_severity { UB_HARD, UB_SOFT, UB_FORCE };
+
+#define UB_TEST	0x100
+#define UB_SEV_FLAGS	UB_TEST
+
+extern struct cgroup_subsys ub_subsys;
+static inline struct user_beancounter *cgroup_ub(struct cgroup *cg)
+{
+	return container_of(cgroup_subsys_state(cg, ub_subsys_id),
+			    struct user_beancounter, css);
+}
+
+extern struct cgroup_subsys_state *
+__ub_get_css(struct user_beancounter *ub, int idx);
+
+static inline struct cgroup_subsys_state *
+ub_get_mem_css(struct user_beancounter *ub)
+{
+	return __ub_get_css(ub, UB_MEM_CGROUP);
+}
+
+static inline struct cgroup_subsys_state *
+ub_get_blkio_css(struct user_beancounter *ub)
+{
+	return __ub_get_css(ub, UB_BLKIO_CGROUP);
+}
+
+static inline int ub_barrier_hit(struct user_beancounter *ub, int resource)
+{
+	return ub->ub_parms[resource].held > ub->ub_parms[resource].barrier;
+}
+
+static inline int ub_hfbarrier_hit(struct user_beancounter *ub, int resource)
+{
+	return (ub->ub_parms[resource].held >
+		(ub->ub_parms[resource].barrier >> 1));
+}
+
+static inline int ub_barrier_farnr(struct user_beancounter *ub, int resource)
+{
+	struct ubparm *p;
+	p = ub->ub_parms + resource;
+	return p->held <= (p->barrier >> 3);
+}
+
+static inline int ub_barrier_farsz(struct user_beancounter *ub, int resource)
+{
+	struct ubparm *p;
+	p = ub->ub_parms + resource;
+	return p->held <= (p->barrier >> 3) && p->barrier >= 1024 * 1024;
+}
+
+static inline unsigned long ub_resource_bound(struct user_beancounter *ub,
+		int resource, enum ub_severity strict)
+{
+	switch (strict) {
+	case UB_HARD:
+		return ub->ub_parms[resource].barrier;
+	case UB_SOFT:
+		return ub->ub_parms[resource].limit;
+	case UB_FORCE:
+		return UB_MAXVALUE;
+	default: {
+		extern int no_such_severity(void);
+		return no_such_severity();
+	}
+	}
+}
+
+static inline unsigned long ub_resource_excess(struct user_beancounter *ub,
+		int resource, enum ub_severity strict)
+{
+	unsigned long held, bound;
+
+	held = ub->ub_parms[resource].held;
+	bound = ub_resource_bound(ub, resource, strict);
+	if (likely(held < bound))
+		return bound - held;
+	return 0;
+}
+
+#ifndef CONFIG_BEANCOUNTERS
+
+#define ub_percpu(ub, cpu)		(NULL)
+#define __ub_percpu_sum(ub, field)	(0)
+#define ub_percpu_sum(ub, field)	(0)
+#define ub_percpu_add(ub, f, v)	do { } while (0)
+#define ub_percpu_sub(ub, f, v)	do { } while (0)
+#define ub_percpu_inc(ub, f)	do { } while (0)
+#define ub_percpu_dec(ub, f)	do { } while (0)
+
+#define mm_ub(mm)	(NULL)
+
+#define for_each_beancounter(__ubp)	while (0)
+
+static inline struct user_beancounter *get_beancounter_by_name
+		(const char *name, int create) { return NULL; }
+static inline struct user_beancounter *get_beancounter_byuid
+		(uid_t uid, int create) { return NULL; }
+static inline struct user_beancounter *get_beancounter
+		(struct user_beancounter *ub) { return NULL; }
+static inline void put_beancounter(struct user_beancounter *ub) { }
+
+static inline uid_t ub_legacy_id(struct user_beancounter *ub) { return -1; }
+
+static inline void ub_init_late(void) { }
+static inline void ub_init_early(void) { }
+
+static inline int charge_beancounter(struct user_beancounter *ub,
+			int resource, unsigned long val,
+			enum ub_severity strict) { return 0; }
+#define charge_beancounter_fast charge_beancounter
+static inline void uncharge_beancounter(struct user_beancounter *ub,
+			int resource, unsigned long val) { }
+#define uncharge_beancounter_fast uncharge_beancounter
+
+#else /* CONFIG_BEANCOUNTERS */
+
+extern struct list_head ub_list_head;
+
+#define for_each_beancounter(__ubp) \
+	list_for_each_entry_rcu(__ubp, &ub_list_head, ub_list)
+
+#define ub_percpu(ub, cpu) (per_cpu_ptr((ub)->ub_percpu, (cpu)))
+
+#define __ub_percpu_sum(ub, field)	({			\
+		struct user_beancounter *__ub = (ub);		\
+		typeof(ub_percpu(__ub, 0)->field) __sum = 0;	\
+		int __cpu;					\
+		for_each_possible_cpu(__cpu)			\
+			__sum += ub_percpu(__ub, __cpu)->field;	\
+		__sum;						\
+	})
+
+#define ub_percpu_sum(ub, field)	({			\
+		long __sum = __ub_percpu_sum(ub, field);	\
+		(__sum < 0) ? 0 : __sum;			\
+	})
+
+#define ub_percpu_add(ub, field, v)		do {			\
+		per_cpu_ptr(ub->ub_percpu, get_cpu())->field += (v);	\
+		put_cpu();						\
+	} while (0)
+#define ub_percpu_inc(ub, field) ub_percpu_add(ub, field, 1)
+
+#define ub_percpu_sub(ub, field, v)		do {			\
+		per_cpu_ptr(ub->ub_percpu, get_cpu())->field -= (v);	\
+		put_cpu();						\
+	} while (0)
+#define ub_percpu_dec(ub, field) ub_percpu_sub(ub, field, 1)
+
+#define mm_ub(mm)	((mm)->mm_ub)
+/*
+ *  Charge/uncharge operations
+ */
+
+extern int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict);
+
+extern void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val);
+
+extern void uncharge_warn(struct user_beancounter *ub, const char *resource,
+		unsigned long val, unsigned long held);
+
+extern int ub_update_memcg(struct user_beancounter *ub);
+extern void ub_sync_memcg(struct user_beancounter *ub);
+extern unsigned long ub_total_pages(struct user_beancounter *ub, bool swap);
+
+extern const char *ub_rnames[];
+/*
+ *	Put a beancounter reference
+ */
+
+static inline void put_beancounter(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return;
+
+	css_put(&ub->css);
+}
+
+/*
+ *	Create a new beancounter reference
+ */
+extern struct user_beancounter *get_beancounter_byuid(uid_t uid, int create);
+extern struct user_beancounter *get_beancounter_by_name(const char *name,
+							int create);
+
+static inline
+struct user_beancounter *get_beancounter(struct user_beancounter *ub)
+{
+	if (unlikely(ub == NULL))
+		return NULL;
+
+	css_get(&ub->css);
+	return ub;
+}
+
+static inline
+struct user_beancounter *get_beancounter_rcu(struct user_beancounter *ub)
+{
+	return css_tryget(&ub->css) ? ub : NULL;
+}
+
+extern uid_t ub_legacy_id(struct user_beancounter *ub);
+
+extern void ub_init_late(void);
+extern void ub_init_early(void);
+
+#define UB_STAT_BATCH	64
+
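+/*
+ * Batched statistic counters: deltas accumulate in a per-cpu int and are
+ * folded into the shared atomic_long_t only once they exceed
+ * UB_STAT_BATCH, keeping hot paths off the shared cacheline.  Plain
+ * ub_stat_get() may therefore be off by up to NR_CPUS * UB_STAT_BATCH;
+ * ub_stat_get_exact() adds the per-cpu remainders back in.
+ */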
+static inline void __ub_stat_add(atomic_long_t *stat, int *pcpu, long val)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	if (*pcpu + val <= UB_STAT_BATCH)
+		*pcpu += val;
+	else {
+		atomic_long_add(*pcpu + val, stat);
+		*pcpu = 0;
+	}
+	local_irq_restore(flags);
+}
+
+static inline void __ub_stat_sub(atomic_long_t *stat, int *pcpu, long val)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	if (*pcpu - val >= -UB_STAT_BATCH)
+		*pcpu -= val;
+	else {
+		atomic_long_add(*pcpu - val, stat);
+		*pcpu = 0;
+	}
+	local_irq_restore(flags);
+}
+
+static inline void __ub_stat_flush_pcpu(atomic_long_t *stat, int *pcpu)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+	pcpu = per_cpu_ptr(pcpu, smp_processor_id());
+	atomic_long_add(*pcpu, stat);
+	*pcpu = 0;
+	local_irq_restore(flags);
+}
+
+#define ub_stat_add(ub, name, val)	__ub_stat_add(&(ub)->name, &(ub)->ub_percpu->name, val)
+#define ub_stat_sub(ub, name, val)	__ub_stat_sub(&(ub)->name, &(ub)->ub_percpu->name, val)
+#define ub_stat_inc(ub, name)		ub_stat_add(ub, name, 1)
+#define ub_stat_dec(ub, name)		ub_stat_sub(ub, name, 1)
+#define ub_stat_mod(ub, name, val)	atomic_long_add(val, &(ub)->name)
+#define __ub_stat_get(ub, name)		atomic_long_read(&(ub)->name)
+#define ub_stat_get(ub, name)		max(0l, atomic_long_read(&(ub)->name))
+#define ub_stat_get_exact(ub, name)	max(0l, __ub_stat_get(ub, name) + __ub_percpu_sum(ub, name))
+#define ub_stat_flush_pcpu(ub, name)	__ub_stat_flush_pcpu(&(ub)->name, &(ub)->ub_percpu->name)
+
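The helpers above implement batched statistics: each CPU accumulates
small deltas in a local int and folds them into the shared atomic only
once the local value would exceed UB_STAT_BATCH, so the hot path never
bounces the shared cache line. The price is precision: ub_stat_get()
can lag the truth by up to NR_CPUS * UB_STAT_BATCH, while
ub_stat_get_exact() also adds in the per-CPU residue. A userspace
sketch of the same scheme (one thread, no IRQ masking, purely
illustrative):

#include <stdio.h>

#define BATCH	64
#define NCPU	2

static long shared;		/* stands in for the atomic_long_t */
static int pcpu[NCPU];		/* per-CPU deltas */

static void stat_add(int cpu, int val)
{
	if (pcpu[cpu] + val <= BATCH) {
		pcpu[cpu] += val;		/* cheap, CPU-local */
	} else {
		shared += pcpu[cpu] + val;	/* rare, shared update */
		pcpu[cpu] = 0;
	}
}

static long stat_get_exact(void)
{
	long sum = shared;
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)
		sum += pcpu[cpu];
	return sum;
}

int main(void)
{
	int i;

	for (i = 0; i < 100; i++)
		stat_add(i % NCPU, 1);
	/* approx=0 here: nothing crossed the batch threshold yet */
	printf("approx=%ld exact=%ld\n", shared, stat_get_exact());
	return 0;
}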
+int ubstat_alloc_store(struct user_beancounter *ub);
+
+/*
+ *	Resource charging
+ *	Change the user's account and compare against limits
+ */
+
+static inline void ub_adjust_maxheld(struct user_beancounter *ub, int resource)
+{
+	if (ub->ub_parms[resource].maxheld < ub->ub_parms[resource].held)
+		ub->ub_parms[resource].maxheld = ub->ub_parms[resource].held;
+	if (ub->ub_parms[resource].minheld > ub->ub_parms[resource].held)
+		ub->ub_parms[resource].minheld = ub->ub_parms[resource].held;
+}
+
+int charge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val, enum ub_severity strict);
+void uncharge_beancounter(struct user_beancounter *ub, int resource,
+		unsigned long val);
+
+extern int ub_resource_precharge[UB_RESOURCES];
+void init_beancounter_precharge(struct user_beancounter *ub, int resource);
+
+static inline int __try_charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu, int resource, unsigned long val)
+{
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] >= val)) {
+		ub_pcpu->precharge[resource] -= val;
+		return 0;
+	}
+	return -ENOMEM;
+}
+
+static inline int __try_uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu, int resource, unsigned long val)
+{
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] + val <=
+				ub->ub_parms[resource].max_precharge)) {
+		ub_pcpu->precharge[resource] += val;
+		return 0;
+	}
+
+	return -E2BIG;
+}
+
+int __charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val, enum ub_severity strict);
+
+void __uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val);
+
+static inline int charge_beancounter_fast(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+	int retval = 0;
+
+	if (val > UB_MAXVALUE)
+		return -EINVAL;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_charge_beancounter_percpu(ub, ub_pcpu, resource, val))
+		retval = __charge_beancounter_percpu(ub, ub_pcpu, resource,
+							val, strict);
+	local_irq_restore(flags);
+
+	return retval;
+}
+
+static inline void uncharge_beancounter_fast(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long flags;
+
+	local_irq_save(flags);
+	ub_pcpu = ub_percpu(ub, smp_processor_id());
+	if (__try_uncharge_beancounter_percpu(ub, ub_pcpu, resource, val))
+		__uncharge_beancounter_percpu(ub, ub_pcpu, resource, val);
+	local_irq_restore(flags);
+}
+
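charge_beancounter_fast() is a two-level scheme: with IRQs off, it
first tries to consume from the CPU's precharge pool, and only when the
pool is exhausted does it take the slow path that updates (and
limit-checks) the shared counters, typically refilling the pool at the
same time. A userspace sketch of the idea, with an invented refill
policy and a single CPU (the real slow path is
__charge_beancounter_percpu()):

#include <stdio.h>

#define MAX_PRECHARGE	16

static long held, limit = 100;	/* shared "slow path" state */
static long precharge;		/* this CPU's pool */

static int charge_slow(long val)
{
	/* charge val and try to refill the local pool in one update */
	long want = val + MAX_PRECHARGE / 2;

	if (held + want > limit) {
		want = val;		/* no room for the refill */
		if (held + want > limit)
			return -1;	/* over limit, charge fails */
	}
	held += want;
	precharge += want - val;
	return 0;
}

static int charge_fast(long val)
{
	if (precharge >= val) {		/* common case: CPU-local only */
		precharge -= val;
		return 0;
	}
	return charge_slow(val);
}

int main(void)
{
	int i, ok = 0;

	for (i = 0; i < 50; i++)
		ok += !charge_fast(1);
	printf("charged %d, held %ld, pooled %ld\n", ok, held, precharge);
	return 0;
}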
+unsigned long __get_beancounter_usage_percpu(struct user_beancounter *ub,
+		int resource);
+
+int precharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val);
+void ub_precharge_snapshot(struct user_beancounter *ub, int *precharge);
+
+#define UB_IOPRIO_MIN 0
+#define UB_IOPRIO_MAX 8
+
+#endif /* CONFIG_BEANCOUNTERS */
+
+#ifdef CONFIG_BC_IO_PRIORITY
+extern int ub_set_ioprio(int id, int ioprio);
+#else
+static inline int ub_set_ioprio(int id, int ioprio) { return -EINVAL; }
+#endif
+
+#endif /* _LINUX_BEANCOUNTER_H */
--- /dev/null
+++ b/include/bc/decl.h
@@ -0,0 +1,39 @@
+/*
+ *  include/bc/decl.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_DECL_H_
+#define __BC_DECL_H_
+
+#ifdef __KERNEL__
+
+/*
+ * Naming convention:
+ * ub_<section|object>_<operation>
+ */
+
+#ifdef CONFIG_BEANCOUNTERS
+
+#define UB_DECLARE_FUNC(ret_type, decl)	extern ret_type decl;
+#define UB_DECLARE_VOID_FUNC(decl)	extern void decl;
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define UB_DECLARE_FUNC(ret_type, decl)		\
+	static inline ret_type decl		\
+	{					\
+		return (ret_type)0;		\
+	}
+#define UB_DECLARE_VOID_FUNC(decl)		\
+	static inline void decl			\
+	{					\
+	}
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif /* __KERNEL__ */
+
+#endif /* __BC_DECL_H_ */
--- /dev/null
+++ b/include/bc/io_acct.h
@@ -0,0 +1,124 @@
+/*
+ *  include/bc/io_acct.h
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ *  Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#ifndef __UB_IO_ACCT_H_
+#define __UB_IO_ACCT_H_
+
+#ifdef CONFIG_BC_IO_ACCOUNTING
+#include <bc/beancounter.h>
+#include <linux/virtinfo.h>
+
+extern int ub_dirty_ratio;
+extern int ub_dirty_background_ratio;
+
+/*
+ * The IO ub is required in task context only, so if exec_ub is set
+ * to NULL, the caller doesn't need to charge any resources.
+ * Nevertheless, IO activity must still be accounted, so we account
+ * it to the current task's beancounter.
+ */
+
+static inline struct user_beancounter *get_io_ub(void)
+{
+	struct user_beancounter *ub;
+
+	ub = get_exec_ub();
+	if (unlikely(ub == NULL))
+		ub = get_task_ub(current);
+
+	return ub;
+}
+
+static inline void ub_io_account_read(size_t bytes)
+{
+	ub_percpu_add(get_io_ub(), sync_read_bytes, bytes);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+	ub_percpu_add(get_io_ub(), sync_write_bytes, bytes);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+}
+
+extern void ub_io_account_dirty(struct address_space *mapping);
+extern void ub_io_account_clean(struct address_space *mapping);
+extern void ub_io_account_cancel(struct address_space *mapping);
+extern void ub_io_writeback_inc(struct address_space *mapping);
+extern void ub_io_writeback_dec(struct address_space *mapping);
+
+extern int ub_dirty_limits(unsigned long *pbackground,
+			   long *pdirty, struct user_beancounter *ub);
+extern bool ub_over_bground_thresh(void);
+extern bool ub_should_skip_writeback(struct user_beancounter *ub,
+				     struct inode *inode);
+
+static inline void ub_writeback_io(unsigned long requests, unsigned long sectors)
+{
+	struct user_beancounter *ub = get_exec_ub();
+
+	ub_stat_add(ub, wb_requests, requests);
+	ub_stat_add(ub, wb_sectors, sectors);
+}
+
+#else /* CONFIG_BC_IO_ACCOUNTING */
+
+static inline void ub_io_account_read(size_t bytes)
+{
+}
+
+static inline void ub_io_account_write(size_t bytes)
+{
+}
+
+static inline void ub_io_account_dirty(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_account_clean(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_account_cancel(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_writeback_inc(struct address_space *mapping)
+{
+}
+
+static inline void ub_io_writeback_dec(struct address_space *mapping)
+{
+}
+
+static inline int ub_dirty_limits(unsigned long *pbackground,
+				  long *pdirty, struct user_beancounter *ub)
+{
+	return 0;
+}
+
+static inline bool ub_should_skip_writeback(struct user_beancounter *ub,
+				     struct inode *inode)
+{
+	return false;
+}
+
+static inline struct user_beancounter *get_io_ub(void)
+{
+	return NULL;
+}
+
+static inline bool ub_over_bground_thresh(void)
+{
+	return false;
+}
+
+#endif /* CONFIG_BC_IO_ACCOUNTING */
+
+#endif /* __UB_IO_ACCT_H_ */
--- /dev/null
+++ b/include/bc/misc.h
@@ -0,0 +1,44 @@
+/*
+ *  include/bc/misc.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_MISC_H_
+#define __BC_MISC_H_
+
+#include <bc/decl.h>
+
+struct tty_struct;
+struct file;
+struct file_lock;
+struct sigqueue;
+
+UB_DECLARE_FUNC(int, ub_file_charge(struct file *f))
+UB_DECLARE_VOID_FUNC(ub_file_uncharge(struct file *f))
+UB_DECLARE_FUNC(int, ub_flock_charge(struct file_lock *fl, int hard))
+UB_DECLARE_VOID_FUNC(ub_flock_uncharge(struct file_lock *fl))
+UB_DECLARE_FUNC(int, ub_siginfo_charge(struct sigqueue *q,
+			struct user_beancounter *ub, gfp_t gfp_mask))
+UB_DECLARE_VOID_FUNC(ub_siginfo_uncharge(struct sigqueue *q))
+UB_DECLARE_FUNC(int, ub_task_charge(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_task_uncharge(struct user_beancounter *ub))
+UB_DECLARE_VOID_FUNC(ub_task_get(struct user_beancounter *ub,
+			struct task_struct *task))
+UB_DECLARE_VOID_FUNC(ub_task_put(struct task_struct *task))
+UB_DECLARE_FUNC(int, ub_pty_charge(struct tty_struct *tty))
+UB_DECLARE_VOID_FUNC(ub_pty_uncharge(struct tty_struct *tty))
+
+#ifdef CONFIG_BEANCOUNTERS
+#define set_flock_charged(fl)	do { (fl)->fl_charged = 1; } while (0)
+#define unset_flock_charged(fl)	do {		\
+		WARN_ON((fl)->fl_charged == 0);	\
+		(fl)->fl_charged = 0;		\
+	} while (0)
+#else
+#define set_flock_charged(fl)	do { } while (0)
+#define unset_flock_charged(fl)	do { } while (0)
+#endif
+#endif /* __BC_MISC_H_ */
--- /dev/null
+++ b/include/bc/proc.h
@@ -0,0 +1,38 @@
+/*
+ *  include/bc/proc.h
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __UB_PROC_H_
+#define __UB_PROC_H_
+
+#include <linux/seq_file.h>
+
+struct bc_proc_entry {
+	char *name;
+	union {
+		int (*show)(struct seq_file *, void *);
+		struct file_operations *fops;
+	} u;
+	struct bc_proc_entry *next;
+	int cookie;
+};
+
+struct user_beancounter;
+
+void bc_register_proc_entry(struct bc_proc_entry *);
+void bc_register_proc_root_entry(struct bc_proc_entry *);
+
+static inline struct user_beancounter *seq_beancounter(struct seq_file *f)
+{
+	return (struct user_beancounter *)(f->private);
+}
+
+extern const char *bc_proc_lu_fmt;
+extern const char *bc_proc_lu_lfmt;
+extern const char *bc_proc_llu_fmt;
+extern const char *bc_proc_lu_lu_fmt;
+#endif
--- /dev/null
+++ b/include/bc/task.h
@@ -0,0 +1,49 @@
+/*
+ *  include/bc/task.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_TASK_H_
+#define __BC_TASK_H_
+
+struct user_beancounter;
+struct callback_head;
+
+#ifdef CONFIG_BEANCOUNTERS
+struct task_beancounter {
+	struct user_beancounter	*exec_ub;
+	struct user_beancounter	*task_ub;
+	struct callback_head cgroup_attach_work;
+};
+
+extern int ub_attach_task(struct user_beancounter *, struct task_struct *);
+
+#define get_task_ub(__task)	((__task)->task_bc.task_ub)
+
+extern struct user_beancounter ub0;
+#define get_ub0()	(&ub0)
+
+#define get_exec_ub()		(current->task_bc.exec_ub)
+#define set_exec_ub(__newub)		\
+({					\
+	struct user_beancounter *old;	\
+	struct task_beancounter *tbc;	\
+					\
+	tbc = &current->task_bc;	\
+	old = tbc->exec_ub;		\
+	tbc->exec_ub = __newub;		\
+	old;				\
+})
+
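A likely usage pattern for set_exec_ub() (an assumption, not taken from
this patch): temporarily execute a stretch of kernel code on behalf of
another beancounter, then restore the previous one on every exit path.

static void do_work_as(struct user_beancounter *ub)
{
	struct user_beancounter *old;

	old = set_exec_ub(ub);		/* returns the previous exec_ub */
	/* ... charges made here are accounted to 'ub' ... */
	set_exec_ub(old);		/* always restore before returning */
}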
+#else /* CONFIG_BEANCOUNTERS */
+
+#define get_ub0()		(NULL)
+#define get_exec_ub()		(NULL)
+#define get_task_ub(task)	(NULL)
+#define set_exec_ub(__ub)	(NULL)
+
+#endif /* CONFIG_BEANCOUNTERS */
+#endif /* __BC_TASK_H_ */
--- /dev/null
+++ b/include/bc/vmpages.h
@@ -0,0 +1,52 @@
+/*
+ *  include/bc/vmpages.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __UB_PAGES_H_
+#define __UB_PAGES_H_
+
+#include <linux/linkage.h>
+#include <linux/sched.h>	/* for get_exec_ub() */
+#include <linux/mm.h>
+#include <bc/beancounter.h>
+#include <bc/decl.h>
+
+extern int ub_overcommit_memory;
+
+/*
+ * Check whether vma has private or copy-on-write mapping.
+ */
+#define VM_UB_PRIVATE(__flags, __file)					\
+		( ((__flags) & VM_WRITE) ?				\
+			(__file) == NULL || !((__flags) & VM_SHARED) :	\
+			0						\
+		)
+
+UB_DECLARE_FUNC(int, ub_memory_charge(struct mm_struct *mm,
+			unsigned long size,
+			unsigned vm_flags,
+			struct file *vm_file,
+			int strict))
+UB_DECLARE_VOID_FUNC(ub_memory_uncharge(struct mm_struct *mm,
+			unsigned long size,
+			unsigned vm_flags,
+			struct file *vm_file))
+
+struct shmem_inode_info;
+
+UB_DECLARE_FUNC(int, ub_locked_charge(struct mm_struct *mm,
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_locked_uncharge(struct mm_struct *mm,
+			unsigned long size))
+UB_DECLARE_FUNC(int, ub_lockedshm_charge(struct shmem_inode_info *shi,
+			unsigned long size))
+UB_DECLARE_VOID_FUNC(ub_lockedshm_uncharge(struct shmem_inode_info *shi,
+			unsigned long size))
+
+UB_DECLARE_FUNC(int, ub_enough_memory(struct mm_struct *mm, long pages))
+
+#endif /* __UB_PAGES_H_ */
--- a/include/crypto/algapi.h
+++ b/include/crypto/algapi.h
@@ -390,5 +390,21 @@ static inline int crypto_requires_sync(u32 type, u32 mask)
 	return (type ^ CRYPTO_ALG_ASYNC) & mask & CRYPTO_ALG_ASYNC;
 }
 
-#endif	/* _CRYPTO_ALGAPI_H */
+noinline unsigned long __crypto_memneq(const void *a, const void *b, size_t size);
+
+/**
+ * crypto_memneq - Compare two areas of memory without leaking
+ *		   timing information.
+ *
+ * @a: One area of memory
+ * @b: Another area of memory
+ * @size: The size of the area.
+ *
+ * Returns 0 when data is equal, 1 otherwise.
+ */
+static inline int crypto_memneq(const void *a, const void *b, size_t size)
+{
+	return __crypto_memneq(a, b, size) != 0UL ? 1 : 0;
+}
 
+#endif	/* _CRYPTO_ALGAPI_H */
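For reference, the point of crypto_memneq() is that memcmp() may return
at the first differing byte, leaking the position of the mismatch
through timing. Below is a byte-at-a-time sketch in the spirit of the
generic __crypto_memneq() fallback (not necessarily this kernel's exact
code): the loop always covers the whole buffer and only OR-accumulates
the differences.

#include <stddef.h>

unsigned long memneq_sketch(const void *a, const void *b, size_t size)
{
	const unsigned char *pa = a, *pb = b;
	unsigned long neq = 0;

	while (size--)
		neq |= *pa++ ^ *pb++;	/* never breaks out early */
	return neq;			/* 0 iff all bytes match */
}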
--- a/include/crypto/hash.h
+++ b/include/crypto/hash.h
@@ -99,6 +99,7 @@ struct crypto_ahash {
 		      unsigned int keylen);
 
 	unsigned int reqsize;
+	bool has_setkey;
 	struct crypto_tfm base;
 };
 
@@ -186,6 +187,11 @@ static inline void *ahash_request_ctx(struct ahash_request *req)
 
 int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key,
 			unsigned int keylen);
+static inline bool crypto_ahash_has_setkey(struct crypto_ahash *tfm)
+{
+	return tfm->has_setkey;
+}
+
 int crypto_ahash_finup(struct ahash_request *req);
 int crypto_ahash_final(struct ahash_request *req);
 int crypto_ahash_digest(struct ahash_request *req);
--- a/include/crypto/if_alg.h
+++ b/include/crypto/if_alg.h
@@ -30,6 +30,9 @@ struct alg_sock {
 
 	struct sock *parent;
 
+	unsigned int refcnt;
+	unsigned int nokey_refcnt;
+
 	const struct af_alg_type *type;
 	void *private;
 };
@@ -49,8 +52,10 @@ struct af_alg_type {
 	void (*release)(void *private);
 	int (*setkey)(void *private, const u8 *key, unsigned int keylen);
 	int (*accept)(void *private, struct sock *sk);
+	int (*accept_nokey)(void *private, struct sock *sk);
 
 	struct proto_ops *ops;
+	struct proto_ops *ops_nokey;
 	struct module *owner;
 	char name[14];
 };
@@ -64,6 +69,7 @@ int af_alg_register_type(const struct af_alg_type *type);
 int af_alg_unregister_type(const struct af_alg_type *type);
 
 int af_alg_release(struct socket *sock);
+void af_alg_release_parent(struct sock *sk);
 int af_alg_accept(struct sock *sk, struct socket *newsock);
 
 int af_alg_make_sg(struct af_alg_sgl *sgl, void __user *addr, int len,
@@ -80,11 +86,6 @@ static inline struct alg_sock *alg_sk(struct sock *sk)
 	return (struct alg_sock *)sk;
 }
 
-static inline void af_alg_release_parent(struct sock *sk)
-{
-	sock_put(alg_sk(sk)->parent);
-}
-
 static inline void af_alg_init_completion(struct af_alg_completion *completion)
 {
 	init_completion(&completion->completion);
--- a/include/drm/drm_backport.h
+++ b/include/drm/drm_backport.h
@@ -51,58 +51,6 @@ static inline void get_monotonic_boottime64(struct timespec64 *ts)
  *
  */
 
-#include <linux/mm.h>
-
-#define SHRINK_STOP (~0UL)
-/*
- * A callback you can register to apply pressure to ageable caches.
- *
- * @count_objects should return the number of freeable items in the cache. If
- * there are no objects to free or the number of freeable items cannot be
- * determined, it should return 0. No deadlock checks should be done during the
- * count callback - the shrinker relies on aggregating scan counts that couldn't
- * be executed due to potential deadlocks to be run at a later call when the
- * deadlock condition is no longer pending.
- *
- * @scan_objects will only be called if @count_objects returned a non-zero
- * value for the number of freeable objects. The callout should scan the cache
- * and attempt to free items from the cache. It should then return the number
- * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
- * due to potential deadlocks. If SHRINK_STOP is returned, then no further
- * attempts to call the @scan_objects will be made from the current reclaim
- * context.
- *
- * @flags determine the shrinker abilities, like numa awareness
- */
-struct shrinker2 {
-	unsigned long (*count_objects)(struct shrinker2 *,
-				       struct shrink_control *sc);
-	unsigned long (*scan_objects)(struct shrinker2 *,
-				      struct shrink_control *sc);
-
-	int seeks;	/* seeks to recreate an obj */
-	long batch;	/* reclaim batch size, 0 = default */
-	unsigned long flags;
-
-	/* These are for internal use */
-	struct list_head list;
-	/* objs pending delete, per node */
-	atomic_long_t *nr_deferred;
-
-	/* compat: */
-	struct shrinker compat;
-};
-int register_shrinker2(struct shrinker2 *shrinker);
-void unregister_shrinker2(struct shrinker2 *shrinker);
-
-#define shrinker            shrinker2
-#define register_shrinker   register_shrinker2
-#define unregister_shrinker unregister_shrinker2
-
-/*
- *
- */
-
 extern struct workqueue_struct *system_power_efficient_wq;
 
 
--- a/include/keys/user-type.h
+++ b/include/keys/user-type.h
@@ -46,5 +46,9 @@ extern void user_describe(const struct key *user, struct seq_file *m);
 extern long user_read(const struct key *key,
 		      char __user *buffer, size_t buflen);
 
+static inline const struct user_key_payload *user_key_payload(const struct key *key)
+{
+	return (struct user_key_payload *)rcu_dereference_key(key);
+}
 
 #endif /* _KEYS_USER_TYPE_H */
--- a/include/linux/aio.h
+++ b/include/linux/aio.h
@@ -14,6 +14,16 @@ struct kiocb;
 
 #define KIOCB_KEY		0
 
+#define AIO_MAX_NR_DEFAULT	0x10000
+
+struct ve_ioc_arg {
+	aio_context_t	ctx_id;
+	unsigned	val;
+};
+
+#define VE_AIO_IOC_WAIT_ACTIVE	_IOW('a',  1, struct ve_ioc_arg)
+
 /*
  * We use ki_cancel == KIOCB_CANCELLED to indicate that a kiocb has been either
  * cancelled or completed (this makes a certain amount of sense because
@@ -40,6 +50,7 @@ struct kiocb {
 	union {
 		void __user		*user;
 		struct task_struct	*tsk;
+		void			(*complete)(u64 user_data, long res);
 	} ki_obj;
 
 	__u64			ki_user_data;	/* user's data for completion */
@@ -64,6 +75,7 @@ struct kiocb {
 	 * this is the underlying eventfd context to deliver events to.
 	 */
 	struct eventfd_ctx	*ki_eventfd;
+	struct iov_iter		*ki_iter;
 };
 
 static inline bool is_sync_kiocb(struct kiocb *kiocb)
@@ -71,6 +83,11 @@ static inline bool is_sync_kiocb(struct kiocb *kiocb)
 	return kiocb->ki_ctx == NULL;
 }
 
+static inline bool is_kernel_kiocb(struct kiocb *kiocb)
+{
+	return kiocb->ki_ctx == (void *)-1;
+}
+
 static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
 {
 	*kiocb = (struct kiocb) {
@@ -91,6 +108,18 @@ extern void exit_aio(struct mm_struct *mm);
 extern long do_io_submit(aio_context_t ctx_id, long nr,
 			 struct iocb __user *__user *iocbpp, bool compat);
 void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel);
+struct kiocb *aio_kernel_alloc(gfp_t gfp);
+void aio_kernel_free(struct kiocb *iocb);
+void aio_kernel_init_iter(struct kiocb *iocb, struct file *filp,
+			  unsigned short op, struct iov_iter *iter, loff_t off);
+void aio_kernel_init_callback(struct kiocb *iocb,
+			      void (*complete)(u64 user_data, long res),
+			      u64 user_data);
+int aio_kernel_submit(struct kiocb *iocb);
+#ifdef CONFIG_VE
+int ve_aio_ioctl(struct task_struct *, unsigned int, unsigned long);
+#endif
+
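A hedged sketch of how a caller might drive the in-kernel AIO interface
declared above; the op constant (IOCB_CMD_PREAD is assumed here) and
who frees the kiocb when submission fails are assumptions, not taken
from this patch.

static void my_aio_done(u64 user_data, long res)
{
	/* completion callback: user_data is the cookie passed below */
}

static int submit_kernel_read(struct file *filp, struct iov_iter *iter,
			      loff_t off, u64 cookie)
{
	struct kiocb *iocb;
	int ret;

	iocb = aio_kernel_alloc(GFP_KERNEL);
	if (!iocb)
		return -ENOMEM;

	aio_kernel_init_iter(iocb, filp, IOCB_CMD_PREAD, iter, off);
	aio_kernel_init_callback(iocb, my_aio_done, cookie);

	ret = aio_kernel_submit(iocb);
	if (ret)			/* assumed: caller cleans up */
		aio_kernel_free(iocb);
	return ret;
}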
 #else
 static inline ssize_t wait_on_sync_kiocb(struct kiocb *iocb) { return 0; }
 static inline void aio_put_req(struct kiocb *iocb) { }
@@ -102,6 +131,8 @@ static inline long do_io_submit(aio_context_t ctx_id, long nr,
 				bool compat) { return 0; }
 static inline void kiocb_set_cancel_fn(struct kiocb *req,
 				       kiocb_cancel_fn *cancel) { }
+static inline int ve_aio_ioctl(struct task_struct *task, unsigned int cmd,
+			unsigned long arg) { return 0; }
 #endif /* CONFIG_AIO */
 
 static inline struct kiocb *list_kiocb(struct list_head *h)
@@ -109,8 +140,4 @@ static inline struct kiocb *list_kiocb(struct list_head *h)
 	return list_entry(h, struct kiocb, ki_list);
 }
 
-/* for sysctl: */
-extern unsigned long aio_nr;
-extern unsigned long aio_max_nr;
-
 #endif /* __LINUX__AIO_H */
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -58,6 +58,7 @@ struct bdi_writeback {
 	struct list_head b_dirty;	/* dirty inodes */
 	struct list_head b_io;		/* parked for writeback */
 	struct list_head b_more_io;	/* parked for more writeback */
+	struct list_head b_dirty_time;	/* time stamps are dirty */
 	spinlock_t list_lock;		/* protects the b_* lists */
 };
 
@@ -67,7 +68,10 @@ struct backing_dev_info {
 	unsigned long state;	/* Always use atomic bitops on this */
 	unsigned int capabilities; /* Device capabilities */
 	congested_fn *congested_fn; /* Function pointer if device is md/dm */
+	congested_fn *congested_fn2; /* use per-bdi waitq */
 	void *congested_data;	/* Pointer to aux data for congested func */
+	int (*bd_full_fn)(struct backing_dev_info *, long long, int);
+	int bd_full; /* backing dev is full */
 
 	char *name;
 
@@ -94,6 +98,9 @@ struct backing_dev_info {
 	unsigned int min_ratio;
 	unsigned int max_ratio, max_prop_frac;
 
+	unsigned int min_dirty_pages;
+	unsigned int max_dirty_pages;
+
 	struct bdi_writeback wb;  /* default writeback info for this bdi */
 	spinlock_t wb_lock;	  /* protects work_list & wb.dwork scheduling */
 
@@ -103,6 +110,8 @@ struct backing_dev_info {
 
 	struct timer_list laptop_mode_wb_timer;
 
+	wait_queue_head_t cong_waitq;	/* to wait on congestion */
+
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *debug_dir;
 	struct dentry *debug_stats;
@@ -121,6 +130,8 @@ int bdi_setup_and_register(struct backing_dev_info *, char *, unsigned int);
 void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages,
 			enum wb_reason reason);
 void bdi_start_background_writeback(struct backing_dev_info *bdi);
+long writeback_inodes_wb(struct bdi_writeback *wb, long nr_pages,
+			enum wb_reason reason, struct user_beancounter *ub);
 void bdi_writeback_workfn(struct work_struct *work);
 int bdi_has_dirty_io(struct backing_dev_info *bdi);
 void bdi_wakeup_thread_delayed(struct backing_dev_info *bdi);
@@ -216,6 +227,8 @@ static inline unsigned long bdi_stat_error(struct backing_dev_info *bdi)
 
 int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio);
 int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio);
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned int min_dirty);
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned int max_dirty);
 
 /*
  * Flags in backing_dev_info::capability
@@ -308,6 +321,30 @@ static inline int bdi_rw_congested(struct backing_dev_info *bdi)
 				  (1 << BDI_async_congested));
 }
 
+/* congestion helpers for block-devices supporting per-bdi waitq */
+static inline int bdi_congested2(struct backing_dev_info *bdi, int bdi_bits)
+{
+	if (bdi->congested_fn2)
+		return bdi->congested_fn2(bdi->congested_data, bdi_bits);
+	return 0;
+}
+
+static inline int bdi_read_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, 1 << BDI_sync_congested);
+}
+
+static inline int bdi_write_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, 1 << BDI_async_congested);
+}
+
+static inline int bdi_rw_congested2(struct backing_dev_info *bdi)
+{
+	return bdi_congested2(bdi, (1 << BDI_sync_congested) |
+				  (1 << BDI_async_congested));
+}
+
 enum {
 	BLK_RW_ASYNC	= 0,
 	BLK_RW_SYNC	= 1,
@@ -357,6 +394,11 @@ static inline bool mapping_cap_account_dirty(struct address_space *mapping)
 	return bdi_cap_account_dirty(mapping->backing_dev_info);
 }
 
+static inline bool mapping_cap_account_writeback(struct address_space *mapping)
+{
+	return bdi_cap_account_writeback(mapping->backing_dev_info);
+}
+
 static inline bool mapping_cap_swap_backed(struct address_space *mapping)
 {
 	return bdi_cap_swap_backed(mapping->backing_dev_info);
--- a/include/linux/blk_types.h
+++ b/include/linux/blk_types.h
@@ -28,6 +28,14 @@ struct bio_vec {
 	unsigned int	bv_offset;
 };
 
+static inline ssize_t bvec_length(const struct bio_vec *bvec, unsigned long nr)
+{
+	ssize_t bytes = 0;
+	while (nr--)
+		bytes += (bvec++)->bv_len;
+	return bytes;
+}
+
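bvec_length() simply sums bv_len over an array of bio_vecs; for
example, on a two-segment array it returns the total payload:

static ssize_t two_segment_total(void)
{
	struct bio_vec vecs[2] = {
		{ .bv_len = 512 },
		{ .bv_len = 1024 },
	};

	return bvec_length(vecs, 2);	/* 1536 bytes */
}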
 /*
  * RHEL7 auxillary shadow structure used to extend 'struct bio' without
  * breaking RHEL kABI -- bio_init_aux() must be used to set bio->bio_aux
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -513,6 +513,9 @@ struct request_queue {
 	/* Throttle data */
 	struct throtl_data *td;
 #endif
+#ifdef CONFIG_BLK_DEV_CBT
+	struct cbt_info	*cbt;
+#endif
 	struct rcu_head		rcu_head;
 	wait_queue_head_t	mq_freeze_wq;
 	RH_KABI_DEPRECATE(struct percpu_counter, mq_usage_counter)
@@ -647,6 +650,7 @@ static inline void queue_flag_clear(unsigned int flag, struct request_queue *q)
 #define blk_queue_discard(q)	test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags)
 #define blk_queue_secdiscard(q)	(blk_queue_discard(q) && \
 	test_bit(QUEUE_FLAG_SECDISCARD, &(q)->queue_flags))
+#define blk_queue_sg_gaps(q)	test_bit(QUEUE_FLAG_SG_GAPS, &(q)->queue_flags)
 
 #define blk_noretry_request(rq) \
 	((rq)->cmd_flags & (REQ_FAILFAST_DEV|REQ_FAILFAST_TRANSPORT| \
@@ -1702,6 +1706,34 @@ struct blk_dax_ctl {
 	pfn_t pfn;
 };
 
+#if defined(CONFIG_BLK_DEV_CBT)
+extern void blk_cbt_update_size(struct block_device *bdev);
+extern void blk_cbt_release(struct request_queue *q);
+extern void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio);
+extern int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg);
+extern int blk_cbt_map_copy_once(struct request_queue *q, __u8 *uuid,
+				 struct page ***map_ptr, blkcnt_t *block_max,
+				 blkcnt_t *block_bits);
+extern int blk_cbt_map_merge(struct request_queue *q, __u8 *uuid,
+			     struct page **map, blkcnt_t block_max,
+			     blkcnt_t block_bits);
+#else /* CONFIG_BLK_DEV_CBT */
+static inline void blk_cbt_update_size(struct block_device *bdev)
+{
+}
+static inline void blk_cbt_release(struct request_queue *q)
+{
+}
+static inline void blk_cbt_bio_queue(struct request_queue *q, struct bio *bio)
+{
+}
+static inline int blk_cbt_ioctl(struct block_device *bdev, unsigned cmd,
+				 char __user *arg)
+{
+	return 0;
+}
+#endif /* CONFIG_BLK_DEV_CBT */
+
 struct block_device_operations {
 	int (*open) (struct block_device *, fmode_t);
 	void (*release) (struct gendisk *, fmode_t);
--- a/include/linux/capability.h
+++ b/include/linux/capability.h
@@ -34,7 +34,6 @@ struct cpu_vfs_cap_data {
 #define _USER_CAP_HEADER_SIZE  (sizeof(struct __user_cap_header_struct))
 #define _KERNEL_CAP_T_SIZE     (sizeof(kernel_cap_t))
 
-
 struct file;
 struct inode;
 struct dentry;
@@ -45,6 +44,10 @@ struct user_namespace *current_user_ns(void);
 extern const kernel_cap_t __cap_empty_set;
 extern const kernel_cap_t __cap_init_eff_set;
 
+#include <linux/spinlock_types.h>
+
+extern spinlock_t task_capability_lock;
+
 /*
  * Internal kernel functions only
  */
@@ -213,6 +216,7 @@ extern bool has_ns_capability_noaudit(struct task_struct *t,
 				      struct user_namespace *ns, int cap);
 extern bool capable(int cap);
 extern bool ns_capable(struct user_namespace *ns, int cap);
+extern bool ve_capable(int cap);
 extern bool capable_wrt_inode_uidgid(const struct inode *inode, int cap);
 extern bool file_ns_capable(const struct file *file, struct user_namespace *ns, int cap);
 
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -20,6 +20,7 @@
 #include <linux/workqueue.h>
 #include <linux/xattr.h>
 #include <linux/fs.h>
+#include <linux/percpu-refcount.h>
 
 #ifdef CONFIG_CGROUP_PIDS
 void cgroup_pids_release(struct task_struct *task);
@@ -36,6 +37,33 @@ struct cgroup;
 struct css_id;
 struct eventfd_ctx;
 
+struct cgroup_sb_opts {
+	unsigned long subsys_mask;
+	unsigned long flags;
+	char *release_agent;
+	bool cpuset_clone_children;
+	char *name;
+	/* User explicitly requested empty subsystem */
+	bool none;
+
+	struct cgroupfs_root *new_root;
+
+};
+
+enum cgroup_open_flags {
+	CGRP_CREAT	= 0x0001,	/* create if not found */
+	CGRP_EXCL	= 0x0002,	/* fail if already exist */
+};
+
+struct vfsmount *cgroup_kernel_mount(struct cgroup_sb_opts *opts);
+struct cgroup *cgroup_get_root(struct vfsmount *mnt);
+struct cgroup *cgroup_kernel_lookup(struct vfsmount *mnt,
+				    const char *pathname);
+struct cgroup *cgroup_kernel_open(struct cgroup *parent,
+		enum cgroup_open_flags flags, const char *name);
+int cgroup_kernel_attach(struct cgroup *cgrp, struct task_struct *tsk);
+void cgroup_kernel_close(struct cgroup *cgrp);
+
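A hedged sketch of how the kernel-side cgroup API declared above might
be used end to end; the error-return conventions (ERR_PTR vs. NULL),
the required cgroup_sb_opts fields, and the mount cleanup are all
assumptions, not taken from this patch.

static int attach_to_kernel_cgroup(struct task_struct *tsk)
{
	struct cgroup_sb_opts opts = { };	/* subsys_mask etc. elided */
	struct vfsmount *mnt;
	struct cgroup *root, *cgrp;
	int err;

	mnt = cgroup_kernel_mount(&opts);
	if (IS_ERR(mnt))
		return PTR_ERR(mnt);

	root = cgroup_get_root(mnt);
	cgrp = cgroup_kernel_open(root, CGRP_CREAT, "my_group");
	if (IS_ERR_OR_NULL(cgrp))	/* mntput() on error elided */
		return cgrp ? PTR_ERR(cgrp) : -ENOENT;

	err = cgroup_kernel_attach(cgrp, tsk);
	cgroup_kernel_close(cgrp);
	return err;
}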
 /*
  * Define the enumeration of all cgroup subsystems.
  *
@@ -98,13 +126,8 @@ struct cgroup_subsys_state {
 	 */
 	struct cgroup *cgroup;
 
-	/*
-	 * State maintained by the cgroup system to allow subsystems
-	 * to be "busy". Should be accessed via css_get(),
-	 * css_tryget() and css_put().
-	 */
-
-	atomic_t refcnt;
+	/* reference count - access via css_[try]get() and css_put() */
+	struct percpu_ref refcnt;
 
 	unsigned long flags;
 	/* ID for this css, if possible */
@@ -120,12 +143,6 @@ enum {
 	CSS_ONLINE	= (1 << 1), /* between ->css_online() and ->css_offline() */
 };
 
-/* Caller must verify that the css is not for root cgroup */
-static inline void __css_get(struct cgroup_subsys_state *css, int count)
-{
-	atomic_add(count, &css->refcnt);
-}
-
 /*
  * Call css_get() to hold a reference on the css; it can be used
  * for a reference obtained via:
@@ -137,7 +154,7 @@ static inline void css_get(struct cgroup_subsys_state *css)
 {
 	/* We don't need to reference count the root state */
 	if (!(css->flags & CSS_ROOT))
-		__css_get(css, 1);
+		percpu_ref_get(&css->refcnt);
 }
 
 /*
@@ -146,12 +163,11 @@ static inline void css_get(struct cgroup_subsys_state *css)
  * the css has been destroyed.
  */
 
-extern bool __css_tryget(struct cgroup_subsys_state *css);
 static inline bool css_tryget(struct cgroup_subsys_state *css)
 {
 	if (css->flags & CSS_ROOT)
 		return true;
-	return __css_tryget(css);
+	return percpu_ref_tryget(&css->refcnt);
 }
 
 /*
@@ -159,11 +175,10 @@ static inline bool css_tryget(struct cgroup_subsys_state *css)
  * css_get() or css_tryget()
  */
 
-extern void __css_put(struct cgroup_subsys_state *css);
 static inline void css_put(struct cgroup_subsys_state *css)
 {
 	if (!(css->flags & CSS_ROOT))
-		__css_put(css);
+		percpu_ref_put(&css->refcnt);
 }
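The switch to percpu_ref preserves the contract the callers rely on: a
tryget succeeds only while the css is live, and the final put triggers
release. The real percpu_ref spreads the count across CPUs to make
get/put contention-free; the sketch below models only the semantics,
with a single C11 atomic standing in for the whole structure.

#include <stdatomic.h>
#include <stdbool.h>

struct ref {
	atomic_long count;	/* > 0 while live */
};

static bool ref_tryget(struct ref *r)
{
	long c = atomic_load(&r->count);

	while (c > 0)
		if (atomic_compare_exchange_weak(&r->count, &c, c + 1))
			return true;	/* got a reference */
	return false;			/* already on its way out */
}

static void ref_put(struct ref *r, void (*release)(struct ref *))
{
	if (atomic_fetch_sub(&r->count, 1) == 1)
		release(r);		/* last reference dropped */
}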
 
 /* bits in struct cgroup flags field */
@@ -185,6 +200,9 @@ enum {
 	CGRP_CPUSET_CLONE_CHILDREN,
 	/* see the comment above CGRP_ROOT_SANE_BEHAVIOR for details */
 	CGRP_SANE_BEHAVIOR,
+
+	/* The cgroup is root in a VE */
+	CGRP_VE_ROOT,
 };
 
 struct cgroup_name {
@@ -255,9 +273,10 @@ struct cgroup {
 	struct list_head pidlists;
 	struct mutex pidlist_mutex;
 
-	/* For RCU-protected deletion */
+	/* For css percpu_ref killing and RCU-protected deletion */
 	struct rcu_head rcu_head;
-	struct work_struct free_work;
+	struct work_struct destroy_work;
+	atomic_t css_kill_cnt;
 
 	/* List of events which userspace want to receive */
 	struct list_head event_list;
@@ -265,6 +284,7 @@ struct cgroup {
 
 	/* directory xattrs */
 	struct simple_xattrs xattrs;
+	u64 subgroups_limit;
 };
 
 #define MAX_CGROUP_ROOT_NAMELEN 64
@@ -425,6 +445,7 @@ struct cgroup_map_cb {
 #define CFTYPE_ONLY_ON_ROOT	(1U << 0)	/* only create on root cg */
 #define CFTYPE_NOT_ON_ROOT	(1U << 1)	/* don't create on root cg */
 #define CFTYPE_INSANE		(1U << 2)	/* don't create if sane_behavior */
+#define CFTYPE_VE_WRITABLE	(1U << 15)	/* allow write from CT */
 
 #define MAX_CFTYPE_NAME		64
 
@@ -570,6 +591,7 @@ int cgroup_is_removed(const struct cgroup *cgrp);
 bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor);
 
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
+int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
@@ -924,7 +946,6 @@ bool css_is_ancestor(struct cgroup_subsys_state *cg,
 
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
-unsigned short css_depth(struct cgroup_subsys_state *css);
 struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id);
 
 #else /* !CONFIG_CGROUPS */
@@ -943,8 +964,6 @@ static inline void cgroup_post_fork(struct task_struct *p,
 				    void *ss_priv[CGROUP_CANFORK_COUNT]) {}
 static inline void cgroup_exit(struct task_struct *p, int callbacks) {}
 
-static inline void cgroup_lock(void) {}
-static inline void cgroup_unlock(void) {}
 static inline int cgroupstats_build(struct cgroupstats *stats,
 					struct dentry *dentry)
 {
--- a/include/linux/cgroup_subsys.h
+++ b/include/linux/cgroup_subsys.h
@@ -89,7 +89,7 @@ SUBSYS(hugetlb)
 
 /* */
 
-#ifdef CONFIG_CGROUP_BCACHE
+#if IS_SUBSYS_ENABLED(CONFIG_CGROUP_BCACHE)
 SUBSYS(bcache)
 #endif
 
@@ -104,6 +104,17 @@ SUBSYS_TAG(CANFORK_END)
 #endif
 /* */
 
+#if IS_SUBSYS_ENABLED(CONFIG_VE)
+SUBSYS(ve)
+#endif
+
+/* */
+
+#if IS_SUBSYS_ENABLED(CONFIG_BEANCOUNTERS)
+SUBSYS(ub)
+#endif
+
+/* */
 #ifdef __TMP_SUBSYS_TAG
 #undef __TMP_SUBSYS_TAG
 #undef SUBSYS_TAG
--- a/include/linux/cleancache.h
+++ b/include/linux/cleancache.h
@@ -5,6 +5,10 @@
 #include <linux/exportfs.h>
 #include <linux/mm.h>
 
+#define CLEANCACHE_NO_POOL		(-1)
+#define CLEANCACHE_NO_BACKEND		(-2)
+#define CLEANCACHE_NO_BACKEND_SHARED	(-3)
+
 #define CLEANCACHE_KEY_MAX 6
 
 /*
@@ -33,10 +37,9 @@ struct cleancache_ops {
 	void (*invalidate_fs)(int);
 };
 
-extern struct cleancache_ops *
-	cleancache_register_ops(struct cleancache_ops *ops);
+extern int cleancache_register_ops(struct cleancache_ops *ops);
 extern void __cleancache_init_fs(struct super_block *);
-extern void __cleancache_init_shared_fs(char *, struct super_block *);
+extern void __cleancache_init_shared_fs(struct super_block *);
 extern int  __cleancache_get_page(struct page *);
 extern void __cleancache_put_page(struct page *);
 extern void __cleancache_invalidate_page(struct address_space *, struct page *);
@@ -78,10 +81,10 @@ static inline void cleancache_init_fs(struct super_block *sb)
 		__cleancache_init_fs(sb);
 }
 
-static inline void cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+static inline void cleancache_init_shared_fs(struct super_block *sb)
 {
 	if (cleancache_enabled)
-		__cleancache_init_shared_fs(uuid, sb);
+		__cleancache_init_shared_fs(sb);
 }
 
 static inline int cleancache_get_page(struct page *page)
--- a/include/linux/compat.h
+++ b/include/linux/compat.h
@@ -17,9 +17,11 @@
 #include <linux/fs.h>
 #include <linux/aio_abi.h>	/* for aio_context_t */
 
+#ifdef __KERNEL__
 #include <asm/compat.h>
 #include <asm/siginfo.h>
 #include <asm/signal.h>
+#endif
 
 #ifndef COMPAT_USE_64BIT_TIME
 #define COMPAT_USE_64BIT_TIME 0
@@ -141,6 +143,7 @@ struct compat_sigaction {
 	compat_sigset_t			sa_mask __packed;
 };
 
+#ifdef __KERNEL__
 /*
  * These functions operate strictly on struct compat_time*
  */
@@ -161,6 +164,7 @@ extern int compat_get_timespec(struct timespec *, const void __user *);
 extern int compat_put_timespec(const struct timespec *, void __user *);
 extern int compat_get_timeval(struct timeval *, const void __user *);
 extern int compat_put_timeval(const struct timeval *, void __user *);
+#endif
 
 struct compat_iovec {
 	compat_uptr_t	iov_base;
@@ -191,14 +195,18 @@ struct compat_rusage {
 	compat_long_t	ru_nivcsw;
 };
 
+#ifdef __KERNEL__
 extern int put_compat_rusage(const struct rusage *,
 			     struct compat_rusage __user *);
+#endif
 
 struct compat_siginfo;
 
+#ifdef __KERNEL__
 extern asmlinkage long compat_sys_waitid(int, compat_pid_t,
 		struct compat_siginfo __user *, int,
 		struct compat_rusage __user *);
+#endif
 
 struct compat_dirent {
 	u32		d_ino;
@@ -304,6 +312,7 @@ struct compat_kexec_segment;
 struct compat_mq_attr;
 struct compat_msgbuf;
 
+#ifdef __KERNEL__
 extern void compat_exit_robust_list(struct task_struct *curr);
 
 asmlinkage long
@@ -407,6 +416,7 @@ asmlinkage long compat_sys_settimeofday(struct compat_timeval __user *tv,
 asmlinkage long compat_sys_adjtimex(struct compat_timex __user *utp);
 
 extern int compat_printk(const char *fmt, ...);
+extern int ve_compat_printk(int dst, const char *fmt, ...);
 extern void sigset_from_compat(sigset_t *set, const compat_sigset_t *compat);
 extern void sigset_to_compat(compat_sigset_t *compat, const sigset_t *set);
 
@@ -661,6 +671,7 @@ asmlinkage ssize_t compat_sys_process_vm_writev(compat_pid_t pid,
 		const struct compat_iovec __user *lvec,
 		unsigned long liovcnt, const struct compat_iovec __user *rvec,
 		unsigned long riovcnt, unsigned long flags);
+#endif
 
 asmlinkage long compat_sys_sendfile(int out_fd, int in_fd,
 				    compat_off_t __user *offset, compat_size_t count);
@@ -695,6 +706,72 @@ asmlinkage long compat_sys_fanotify_mark(int, unsigned int, __u32, __u32,
 static inline bool in_compat_syscall(void) { return is_compat_task(); }
 #endif
 
+#ifdef CONFIG_QUOTA_COMPAT
+
+#define QC_QUOTAON  0x0100	/* enable quotas */
+#define QC_QUOTAOFF 0x0200	/* disable quotas */
+/* GETQUOTA, SETQUOTA and SETUSE, which were at 0x0300-0x0500, now have other parameters */
+#define QC_SYNC     0x0600	/* sync disk copy of a filesystems quotas */
+#define QC_SETQLIM  0x0700	/* set limits */
+/* GETSTATS at 0x0800 is now longer... */
+#define QC_GETINFO  0x0900	/* get info about quotas - graces, flags... */
+#define QC_SETINFO  0x0A00	/* set info about quotas */
+#define QC_SETGRACE 0x0B00	/* set inode and block grace */
+#define QC_SETFLAGS 0x0C00	/* set flags for quota */
+#define QC_GETQUOTA 0x0D00	/* get limits and usage */
+#define QC_SETQUOTA 0x0E00	/* set limits and usage */
+#define QC_SETUSE   0x0F00	/* set usage */
+/* 0x1000 used by old RSQUASH */
+#define QC_GETSTATS 0x1100	/* get collected stats */
+
+struct compat_dqblk {
+	unsigned int dqb_ihardlimit;
+	unsigned int dqb_isoftlimit;
+	unsigned int dqb_curinodes;
+	unsigned int dqb_bhardlimit;
+	unsigned int dqb_bsoftlimit;
+	qsize_t dqb_curspace;
+	__kernel_time_t dqb_btime;
+	__kernel_time_t dqb_itime;
+};
+
+#ifdef CONFIG_COMPAT
+
+struct compat_compat_dqblk {
+	compat_uint_t	dqb_ihardlimit;
+	compat_uint_t	dqb_isoftlimit;
+	compat_uint_t	dqb_curinodes;
+	compat_uint_t	dqb_bhardlimit;
+	compat_uint_t	dqb_bsoftlimit;
+	compat_u64	dqb_curspace;
+	compat_time_t	dqb_btime;
+	compat_time_t	dqb_itime;
+};
+
+#endif
+
+struct compat_dqinfo {
+	unsigned int dqi_bgrace;
+	unsigned int dqi_igrace;
+	unsigned int dqi_flags;
+	unsigned int dqi_blocks;
+	unsigned int dqi_free_blk;
+	unsigned int dqi_free_entry;
+};
+
+struct compat_dqstats {
+	__u32 lookups;
+	__u32 drops;
+	__u32 reads;
+	__u32 writes;
+	__u32 cache_hits;
+	__u32 allocated_dquots;
+	__u32 free_dquots;
+	__u32 syncs;
+	__u32 version;
+};
+#endif /* CONFIG_QUOTA_COMPAT */
+
 #else
 
 #define is_compat_task() (0)
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -66,6 +66,7 @@
 #define __deprecated			__attribute__((deprecated))
 #define __packed			__attribute__((packed))
 #define __weak				__attribute__((weak))
+#define __alias(symbol)		__attribute__((alias(#symbol)))
 
 /*
  * it doesn't make sense on ARM (currently the only user of __naked) to trace
@@ -100,15 +101,140 @@
 #define __maybe_unused			__attribute__((unused))
 #define __always_unused			__attribute__((unused))
 
-#define __gcc_header(x) #x
-#define _gcc_header(x) __gcc_header(linux/compiler-gcc##x.h)
-#define gcc_header(x) _gcc_header(x)
-#include gcc_header(__GNUC__)
+/* gcc version specific checks */
+
+#if GCC_VERSION < 30200
+# error Sorry, your compiler is too old - please upgrade it.
+#endif
+
+#if GCC_VERSION < 30300
+# define __used			__attribute__((__unused__))
+#else
+# define __used			__attribute__((__used__))
+#endif
+
+#ifdef CONFIG_GCOV_KERNEL
+# if GCC_VERSION < 30400
+#   error "GCOV profiling support for gcc versions below 3.4 not included"
+# endif /* __GNUC_MINOR__ */
+#endif /* CONFIG_GCOV_KERNEL */
+
+#if GCC_VERSION >= 30400
+#define __must_check		__attribute__((warn_unused_result))
+#endif
+
+#if GCC_VERSION >= 40000
+
+/* GCC 4.1.[01] miscompiles __weak */
+#ifdef __KERNEL__
+# if GCC_VERSION >= 40100 &&  GCC_VERSION <= 40101
+#  error Your version of gcc miscompiles the __weak directive
+# endif
+#endif
+
+#define __used			__attribute__((__used__))
+#define __compiler_offsetof(a, b)					\
+	__builtin_offsetof(a, b)
+
+#if GCC_VERSION >= 40100 && GCC_VERSION < 40600
+# define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
+#endif
+
+#if GCC_VERSION >= 40300
+/* Mark functions as cold. gcc will assume any path leading to a call
+ * to them will be unlikely.  This means a lot of manual unlikely()s
+ * are unnecessary now for any paths leading to the usual suspects
+ * like BUG(), printk(), panic() etc. [but let's keep them for now for
+ * older compilers]
+ *
+ * Early snapshots of gcc 4.3 don't support this and we can't detect this
+ * in the preprocessor, but we can live with this because they're unreleased.
+ * Maketime probing would be overkill here.
+ *
+ * gcc also has a __attribute__((__hot__)) to move hot functions into
+ * a special section, but I don't see any sense in this right now in
+ * the kernel context
+ */
+#define __cold			__attribute__((__cold__))
+
+#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
+
+#ifndef __CHECKER__
+# define __compiletime_warning(message) __attribute__((warning(message)))
+# define __compiletime_error(message) __attribute__((error(message)))
+#endif /* __CHECKER__ */
+#endif /* GCC_VERSION >= 40300 */
+
+#if GCC_VERSION >= 40500
+/*
+ * Mark a position in code as unreachable.  This can be used to
+ * suppress control flow warnings after asm blocks that transfer
+ * control elsewhere.
+ *
+ * Early snapshots of gcc 4.5 don't support this and we can't detect
+ * this in the preprocessor, but we can live with this because they're
+ * unreleased.  Really, we need to have autoconf for the kernel.
+ */
+#define unreachable() __builtin_unreachable()
+
+/* Mark a function definition as prohibited from being cloned. */
+#define __noclone	__attribute__((__noclone__))
+
+#endif /* GCC_VERSION >= 40500 */
+
+#if GCC_VERSION >= 40600
+/*
+ * Tell the optimizer that something else uses this function or variable.
+ */
+#define __visible	__attribute__((externally_visible))
+#endif
+
+/*
+ * GCC 'asm goto' miscompiles certain code sequences:
+ *
+ *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
+ *
+ * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
+ *
+ * (asm goto is automatically volatile - the naming reflects this.)
+ */
+#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
+
+#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
+#if GCC_VERSION >= 40400
+#define __HAVE_BUILTIN_BSWAP32__
+#define __HAVE_BUILTIN_BSWAP64__
+#endif
+#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600)
+#define __HAVE_BUILTIN_BSWAP16__
+#endif
+#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
+
+#if GCC_VERSION >= 50000
+#define KASAN_ABI_VERSION 4
+#elif GCC_VERSION >= 40902
+#define KASAN_ABI_VERSION 3
+#endif
+
+#if GCC_VERSION >= 40902
+/*
+ * Tell the compiler that address safety instrumentation (KASAN)
+ * should not be applied to that function.
+ * Conflicts with inlining: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+ */
+#define __no_sanitize_address __attribute__((no_sanitize_address))
+#endif
+
+#endif	/* gcc version >= 40000 specific checks */
 
 #if !defined(__noclone)
 #define __noclone	/* not needed */
 #endif
 
+#if !defined(__no_sanitize_address)
+#define __no_sanitize_address
+#endif
+
 /*
  * A trick to suppress uninitialized variable warning without generating any
  * code
--- a/include/linux/compiler-gcc3.h
+++ /dev/null
@@ -1,23 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc3.h> directly, include <linux/compiler.h> instead."
-#endif
-
-#if GCC_VERSION < 30200
-# error Sorry, your compiler is too old - please upgrade it.
-#endif
-
-#if GCC_VERSION >= 30300
-# define __used			__attribute__((__used__))
-#else
-# define __used			__attribute__((__unused__))
-#endif
-
-#if GCC_VERSION >= 30400
-#define __must_check		__attribute__((warn_unused_result))
-#endif
-
-#ifdef CONFIG_GCOV_KERNEL
-# if GCC_VERSION < 30400
-#   error "GCOV profiling support for gcc versions below 3.4 not included"
-# endif /* __GNUC_MINOR__ */
-#endif /* CONFIG_GCOV_KERNEL */
--- a/include/linux/compiler-gcc4.h
+++ /dev/null
@@ -1,88 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc4.h> directly, include <linux/compiler.h> instead."
-#endif
-
-/* GCC 4.1.[01] miscompiles __weak */
-#ifdef __KERNEL__
-# if GCC_VERSION >= 40100 &&  GCC_VERSION <= 40101
-#  error Your version of gcc miscompiles the __weak directive
-# endif
-#endif
-
-#define __used			__attribute__((__used__))
-#define __must_check 		__attribute__((warn_unused_result))
-#define __compiler_offsetof(a,b) __builtin_offsetof(a,b)
-
-#if GCC_VERSION >= 40100 && GCC_VERSION < 40600
-# define __compiletime_object_size(obj) __builtin_object_size(obj, 0)
-#endif
-
-#if GCC_VERSION >= 40300
-/* Mark functions as cold. gcc will assume any path leading to a call
-   to them will be unlikely.  This means a lot of manual unlikely()s
-   are unnecessary now for any paths leading to the usual suspects
-   like BUG(), printk(), panic() etc. [but let's keep them for now for
-   older compilers]
-
-   Early snapshots of gcc 4.3 don't support this and we can't detect this
-   in the preprocessor, but we can live with this because they're unreleased.
-   Maketime probing would be overkill here.
-
-   gcc also has a __attribute__((__hot__)) to move hot functions into
-   a special section, but I don't see any sense in this right now in
-   the kernel context */
-#define __cold			__attribute__((__cold__))
-
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#ifndef __CHECKER__
-# define __compiletime_warning(message) __attribute__((warning(message)))
-# define __compiletime_error(message) __attribute__((error(message)))
-#endif /* __CHECKER__ */
-#endif /* GCC_VERSION >= 40300 */
-
-#if GCC_VERSION >= 40500
-/*
- * Mark a position in code as unreachable.  This can be used to
- * suppress control flow warnings after asm blocks that transfer
- * control elsewhere.
- *
- * Early snapshots of gcc 4.5 don't support this and we can't detect
- * this in the preprocessor, but we can live with this because they're
- * unreleased.  Really, we need to have autoconf for the kernel.
- */
-#define unreachable() __builtin_unreachable()
-
-/* Mark a function definition as prohibited from being cloned. */
-#define __noclone	__attribute__((__noclone__))
-
-#endif /* GCC_VERSION >= 40500 */
-
-#if GCC_VERSION >= 40600
-/*
- * Tell the optimizer that something else uses this function or variable.
- */
-#define __visible __attribute__((externally_visible))
-#endif
-
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- * Fixed in GCC 4.8.2 and later versions.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
-
-#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
-#if GCC_VERSION >= 40400
-#define __HAVE_BUILTIN_BSWAP32__
-#define __HAVE_BUILTIN_BSWAP64__
-#endif
-#if GCC_VERSION >= 40800 || (defined(__powerpc__) && GCC_VERSION >= 40600)
-#define __HAVE_BUILTIN_BSWAP16__
-#endif
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
--- a/include/linux/compiler-gcc5.h
+++ /dev/null
@@ -1,66 +0,0 @@
-#ifndef __LINUX_COMPILER_H
-#error "Please don't include <linux/compiler-gcc5.h> directly, include <linux/compiler.h> instead."
-#endif
-
-#define __used				__attribute__((__used__))
-#define __must_check			__attribute__((warn_unused_result))
-#define __compiler_offsetof(a, b)	__builtin_offsetof(a, b)
-
-/* Mark functions as cold. gcc will assume any path leading to a call
-   to them will be unlikely.  This means a lot of manual unlikely()s
-   are unnecessary now for any paths leading to the usual suspects
-   like BUG(), printk(), panic() etc. [but let's keep them for now for
-   older compilers]
-
-   Early snapshots of gcc 4.3 don't support this and we can't detect this
-   in the preprocessor, but we can live with this because they're unreleased.
-   Maketime probing would be overkill here.
-
-   gcc also has a __attribute__((__hot__)) to move hot functions into
-   a special section, but I don't see any sense in this right now in
-   the kernel context */
-#define __cold			__attribute__((__cold__))
-
-#define __UNIQUE_ID(prefix) __PASTE(__PASTE(__UNIQUE_ID_, prefix), __COUNTER__)
-
-#ifndef __CHECKER__
-# define __compiletime_warning(message) __attribute__((warning(message)))
-# define __compiletime_error(message) __attribute__((error(message)))
-#endif /* __CHECKER__ */
-
-/*
- * Mark a position in code as unreachable.  This can be used to
- * suppress control flow warnings after asm blocks that transfer
- * control elsewhere.
- *
- * Early snapshots of gcc 4.5 don't support this and we can't detect
- * this in the preprocessor, but we can live with this because they're
- * unreleased.  Really, we need to have autoconf for the kernel.
- */
-#define unreachable() __builtin_unreachable()
-
-/* Mark a function definition as prohibited from being cloned. */
-#define __noclone	__attribute__((__noclone__))
-
-/*
- * Tell the optimizer that something else uses this function or variable.
- */
-#define __visible __attribute__((externally_visible))
-
-/*
- * GCC 'asm goto' miscompiles certain code sequences:
- *
- *   http://gcc.gnu.org/bugzilla/show_bug.cgi?id=58670
- *
- * Work it around via a compiler barrier quirk suggested by Jakub Jelinek.
- * Fixed in GCC 4.8.2 and later versions.
- *
- * (asm goto is automatically volatile - the naming reflects this.)
- */
-#define asm_volatile_goto(x...)	do { asm goto(x); asm (""); } while (0)
-
-#ifdef CONFIG_ARCH_USE_BUILTIN_BSWAP
-#define __HAVE_BUILTIN_BSWAP32__
-#define __HAVE_BUILTIN_BSWAP64__
-#define __HAVE_BUILTIN_BSWAP16__
-#endif /* CONFIG_ARCH_USE_BUILTIN_BSWAP */
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -181,20 +181,46 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect);
 
 #include <uapi/linux/types.h>
 
-static __always_inline void __read_once_size(const volatile void *p, void *res, int size)
+#define __READ_ONCE_SIZE						\
+({									\
+	switch (size) {							\
+	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;		\
+	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;		\
+	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;		\
+	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;		\
+	default:							\
+		barrier();						\
+		__builtin_memcpy((void *)res, (const void *)p, size);	\
+		barrier();						\
+	}								\
+})
+
+static __always_inline
+void __read_once_size(const volatile void *p, void *res, int size)
 {
-	switch (size) {
-	case 1: *(__u8 *)res = *(volatile __u8 *)p; break;
-	case 2: *(__u16 *)res = *(volatile __u16 *)p; break;
-	case 4: *(__u32 *)res = *(volatile __u32 *)p; break;
-	case 8: *(__u64 *)res = *(volatile __u64 *)p; break;
-	default:
-		barrier();
-		__builtin_memcpy((void *)res, (const void *)p, size);
-		barrier();
-	}
+	__READ_ONCE_SIZE;
 }
 
+#ifdef CONFIG_KASAN
+/*
+ * This function is not 'inline' because __no_sanitize_address conflicts
+ * with inlining. An attempt to inline it may cause a build failure.
+ *	https://gcc.gnu.org/bugzilla/show_bug.cgi?id=67368
+ * '__maybe_unused' allows us to avoid defined-but-not-used warnings.
+ */
+static __no_sanitize_address __maybe_unused
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+#else
+static __always_inline
+void __read_once_size_nocheck(const volatile void *p, void *res, int size)
+{
+	__READ_ONCE_SIZE;
+}
+#endif
+
 static __always_inline void __write_once_size(volatile void *p, void *res, int size)
 {
 	switch (size) {
@@ -231,8 +257,22 @@ static __always_inline void __write_once_size(volatile void *p, void *res, int s
  * required ordering.
  */
 
-#define READ_ONCE(x) \
-	({ union { typeof(x) __val; char __c[1]; } __u; __read_once_size(&(x), __u.__c, sizeof(x)); __u.__val; })
+#define __READ_ONCE(x, check)						\
+({									\
+	union { typeof(x) __val; char __c[1]; } __u;			\
+	if (check)							\
+		__read_once_size(&(x), __u.__c, sizeof(x));		\
+	else								\
+		__read_once_size_nocheck(&(x), __u.__c, sizeof(x));	\
+	__u.__val;							\
+})
+#define READ_ONCE(x) __READ_ONCE(x, 1)
+
+/*
+ * Use READ_ONCE_NOCHECK() instead of READ_ONCE() if you need
+ * to hide memory access from KASAN.
+ */
+#define READ_ONCE_NOCHECK(x) __READ_ONCE(x, 0)
 
 #define WRITE_ONCE(x, val) \
 	({ typeof(x) __val = (val); __write_once_size(&(x), &__val, sizeof(__val)); __val; })
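A userspace illustration of what READ_ONCE() buys (a simplified
re-creation, not the kernel macro): the volatile cast forces a fresh
load on every access, so the compiler cannot hoist the read out of a
loop and spin on a stale register copy.

#define MY_READ_ONCE(x)	(*(volatile __typeof__(x) *)&(x))

int flag;	/* set asynchronously, e.g. from a signal handler */

void wait_for_flag(void)
{
	while (!MY_READ_ONCE(flag))
		;	/* without the volatile load, the compiler could
			 * legally test flag once and spin forever */
}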
--- a/include/linux/console.h
+++ b/include/linux/console.h
@@ -162,9 +162,6 @@ extern int braille_register_console(struct console *, int index,
 extern int braille_unregister_console(struct console *);
 #ifdef CONFIG_TTY
 extern void console_sysfs_notify(void);
-#else
-static inline void console_sysfs_notify(void)
-{ }
 #endif
 extern bool console_suspend_enabled;
 
--- a/include/linux/crypto.h
+++ b/include/linux/crypto.h
@@ -361,6 +361,7 @@ struct ablkcipher_tfm {
 
 	unsigned int ivsize;
 	unsigned int reqsize;
+	bool has_setkey;
 };
 
 struct aead_tfm {
@@ -671,6 +672,13 @@ static inline int crypto_ablkcipher_setkey(struct crypto_ablkcipher *tfm,
 	return crt->setkey(crt->base, key, keylen);
 }
 
+static inline bool crypto_ablkcipher_has_setkey(struct crypto_ablkcipher *tfm)
+{
+	struct ablkcipher_tfm *crt = crypto_ablkcipher_crt(tfm);
+
+	return crt->has_setkey;
+}
+
 static inline struct crypto_ablkcipher *crypto_ablkcipher_reqtfm(
 	struct ablkcipher_request *req)
 {
--- a/include/linux/dcache.h
+++ b/include/linux/dcache.h
@@ -55,11 +55,11 @@ struct qstr {
 #define hashlen_len(hashlen)  ((u32)((hashlen) >> 32))
 
 struct dentry_stat_t {
-	int nr_dentry;
-	int nr_unused;
-	int age_limit;          /* age in seconds */
-	int want_pages;         /* pages requested by system */
-	int dummy[2];
+	long nr_dentry;
+	long nr_unused;
+	long age_limit;          /* age in seconds */
+	long want_pages;         /* pages requested by system */
+	long dummy[2];
 };
 extern struct dentry_stat_t dentry_stat;
 
@@ -133,6 +133,8 @@ struct dentry {
 	struct hlist_node d_alias;	/* inode alias list */
 };
 
+extern struct kmem_cache *dentry_cache;
+
 /*
  * dentry->d_lock spinlock nesting subclasses:
  *
@@ -229,6 +231,8 @@ struct dentry_operations_wrapper {
 #define DCACHE_FILE_TYPE		0x04000000 /* Other file type */
 #define DCACHE_OP_REAL			0x08000000
 
+#define DCACHE_MAY_FREE			0x00800000
+
 extern seqlock_t rename_lock;
 
 static inline int dname_external(struct dentry *dentry)
@@ -509,7 +513,7 @@ static inline bool d_really_is_positive(const struct dentry *dentry)
 }
 
 extern int sysctl_vfs_cache_pressure;
-
+extern int sysctl_vfs_cache_min_ratio;
 
 /**
  * d_inode - Get the actual inode of this dentry
@@ -567,4 +571,8 @@ static inline struct dentry *d_backing_dentry(struct dentry *upper)
 	return upper;
 }
 
+static inline unsigned long vfs_pressure_ratio(unsigned long val)
+{
+	return mult_frac(val, sysctl_vfs_cache_pressure, 100);
+}
 #endif	/* __LINUX_DCACHE_H */
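vfs_pressure_ratio() leans on mult_frac(), whose point is to compute
val * num / den without overflowing the intermediate product that the
naive (val * num) / den risks for large val. A sketch of the usual
definition and of the call above:

#include <stdio.h>

#define mult_frac(x, numer, denom) ({			\
	__typeof__(x) quot = (x) / (denom);		\
	__typeof__(x) rem  = (x) % (denom);		\
	(quot * (numer)) + ((rem * (numer)) / (denom));	\
})

int main(void)
{
	unsigned long freeable = 10000;
	int pressure = 150;	/* say, sysctl_vfs_cache_pressure */

	/* scan target: 150% of the freeable objects */
	printf("%lu\n", mult_frac(freeable, pressure, 100)); /* 15000 */
	return 0;
}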
--- a/include/linux/delayacct.h
+++ b/include/linux/delayacct.h
@@ -102,6 +102,25 @@ static inline int delayacct_add_tsk(struct taskstats *d,
 	return __delayacct_add_tsk(d, tsk);
 }
 
+static inline void delayacct_add_stats(struct taskstats *d,
+					struct taskstats *s)
+{
+	if (!delayacct_on)
+		return;
+
+	d->cpu_count			+= s->cpu_count;
+	d->cpu_delay_total		+= s->cpu_delay_total;
+	d->cpu_run_real_total		+= s->cpu_run_real_total;
+	d->cpu_run_virtual_total	+= s->cpu_run_virtual_total;
+	d->cpu_scaled_run_real_total	+= s->cpu_scaled_run_real_total;
+	d->blkio_count			+= s->blkio_count;
+	d->blkio_delay_total		+= s->blkio_delay_total;
+	d->swapin_count			+= s->swapin_count;
+	d->swapin_delay_total		+= s->swapin_delay_total;
+	d->freepages_count		+= s->freepages_count;
+	d->freepages_delay_total	+= s->freepages_delay_total;
+}
+
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 {
 	if (tsk->delays)
@@ -139,6 +158,9 @@ static inline void delayacct_blkio_end(void)
 static inline int delayacct_add_tsk(struct taskstats *d,
 					struct task_struct *tsk)
 { return 0; }
+static inline void delayacct_add_stats(struct taskstats *d,
+					struct taskstats *s)
+{}
 static inline __u64 delayacct_blkio_ticks(struct task_struct *tsk)
 { return 0; }
 static inline int delayacct_is_task_waiting_on_io(struct task_struct *p)
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -112,6 +112,14 @@ typedef int (*dm_iterate_devices_fn) (struct dm_target *ti,
 typedef void (*dm_io_hints_fn) (struct dm_target *ti,
 				struct queue_limits *limits);
 
+typedef void (*dm_ploop_modify_fn) (struct dm_target *ti, int action);
+
+/* "action" arg of dm_ploop_modify_fn */
+enum {
+	DM_PLOOP_ATTACH,
+	DM_PLOOP_DETACH,
+};
+
 /*
  * Returns:
  *    0: The target can handle the next I/O immediately.
@@ -166,6 +174,7 @@ struct target_type {
 	dm_busy_fn busy;
 	dm_iterate_devices_fn iterate_devices;
 	dm_io_hints_fn io_hints;
+	dm_ploop_modify_fn ploop_modify;
 
 	/* For internal device-mapper use. */
 	struct list_head list;
--- a/include/linux/device_cgroup.h
+++ b/include/linux/device_cgroup.h
@@ -11,9 +11,23 @@ static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 		return 0;
 	return __devcgroup_inode_permission(inode, mask);
 }
+
+extern int devcgroup_device_permission(umode_t mode, dev_t dev, int mask);
+extern int devcgroup_device_visible(umode_t mode, int major,
+		int start_minor, int nr_minors);
+
+struct ve_struct;
+int devcgroup_set_perms_ve(struct ve_struct *, unsigned, dev_t, unsigned);
+int devcgroup_seq_show_ve(struct ve_struct *, struct seq_file *);
+
 #else
 static inline int devcgroup_inode_permission(struct inode *inode, int mask)
 { return 0; }
 static inline int devcgroup_inode_mknod(int mode, dev_t dev)
 { return 0; }
+static inline int devcgroup_device_permission(umode_t mode, dev_t dev, int mask)
+{ return 0; }
+static inline int devcgroup_device_visible(umode_t mode, int major,
+		int start_minor, int nr_minors)
+{ return 0; }
 #endif
--- /dev/null
+++ b/include/linux/fence-watchdog.h
@@ -0,0 +1,14 @@
+/*
+ *  include/linux/fence-watchdog.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_FENCE_WATCHDOG_H_
+#define _LINUX_FENCE_WATCHDOG_H_
+
+int fence_wdog_check_timer(void);
+bool fence_wdog_tmo_match(void);
+
+#endif
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -36,7 +36,11 @@ static inline unsigned int sk_filter_len(const struct sk_filter *fp)
 	return fp->len * sizeof(struct sock_filter) + sizeof(*fp);
 }
 
-extern int sk_filter(struct sock *sk, struct sk_buff *skb);
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap);
+static inline int sk_filter(struct sock *sk, struct sk_buff *skb)
+{
+	return sk_filter_trim_cap(sk, skb, 1);
+}
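+
+/*
+ * The cap acts as a floor on the trimmed length: the skb is cut to
+ * the filter's verdict but never below @cap, so sk_filter() always
+ * keeps at least one byte.  Protocols that must preserve their own
+ * header can pass a larger cap, e.g.
+ *
+ *	sk_filter_trim_cap(sk, skb, ROSE_MIN_LEN);
+ */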
 extern unsigned int sk_run_filter(const struct sk_buff *skb,
 				  const struct sock_filter *filter);
 extern int sk_unattached_filter_create(struct sk_filter **pfp,
--- a/include/linux/freezer.h
+++ b/include/linux/freezer.h
@@ -172,14 +172,6 @@ static inline void freezable_schedule(void)
 	freezer_count();
 }
 
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline void freezable_schedule_unsafe(void)
-{
-	freezer_do_not_count();
-	schedule();
-	freezer_count_unsafe();
-}
-
 /*
  * Like freezable_schedule_timeout(), but should not block the freezer.  Do not
  * call this with locks held.
@@ -216,16 +208,6 @@ static inline long freezable_schedule_timeout_killable(long timeout)
 	return __retval;
 }
 
-/* DO NOT ADD ANY NEW CALLERS OF THIS FUNCTION */
-static inline long freezable_schedule_timeout_killable_unsafe(long timeout)
-{
-	long __retval;
-	freezer_do_not_count();
-	__retval = schedule_timeout_killable(timeout);
-	freezer_count_unsafe();
-	return __retval;
-}
-
 /*
  * Like schedule_hrtimeout_range(), but should not block the freezer.  Do not
  * call this with locks held.
@@ -315,8 +297,6 @@ static inline void set_freezable(void) {}
 
 #define freezable_schedule()  schedule()
 
-#define freezable_schedule_unsafe()  schedule()
-
 #define freezable_schedule_timeout(timeout)  schedule_timeout(timeout)
 
 #define freezable_schedule_timeout_interruptible(timeout)		\
@@ -325,9 +305,6 @@ static inline void set_freezable(void) {}
 #define freezable_schedule_timeout_killable(timeout)			\
 	schedule_timeout_killable(timeout)
 
-#define freezable_schedule_timeout_killable_unsafe(timeout)		\
-	schedule_timeout_killable(timeout)
-
 #define freezable_schedule_hrtimeout_range(expires, delta, mode)	\
 	schedule_hrtimeout_range(expires, delta, mode)
 
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -10,6 +10,7 @@
 #include <linux/stat.h>
 #include <linux/cache.h>
 #include <linux/list.h>
+#include <linux/list_lru.h>
 #include <linux/radix-tree.h>
 #include <linux/rbtree.h>
 #include <linux/init.h>
@@ -78,6 +79,9 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 #define MAY_CHDIR		0x00000040
 /* called from RCU mode, don't block */
 #define MAY_NOT_BLOCK		0x00000080
+/* for devcgroup-vs-openvz only */
+#define MAY_QUOTACTL		0x00010000	/* deprecated */
+#define MAY_MOUNT		0x00020000
 
 /*
  * flags in file.f_mode.  Note that FMODE_READ and FMODE_WRITE must correspond
@@ -128,6 +132,12 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 /* File needs atomic accesses to f_pos */
 #define FMODE_ATOMIC_POS	((__force fmode_t)0x8000)
 
+/* Can do sys_quotactl (for devperms) */
+#define FMODE_QUOTACTL		((__force fmode_t)0x8000)
+
+/* File is a block device opened by mount(2) */
+#define FMODE_MOUNT		((__force fmode_t)0x10000)
+
 /* File was opened by fanotify and shouldn't generate fanotify events */
 #define FMODE_NONOTIFY		((__force fmode_t)0x1000000)
 
@@ -197,6 +207,8 @@ typedef void (dax_iodone_t)(struct buffer_head *bh_map, int uptodate);
 #define WRITE_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FUA)
 #define WRITE_FLUSH_FUA		(WRITE | REQ_SYNC | REQ_NOIDLE | REQ_FLUSH | REQ_FUA)
 
+extern int may_use_odirect(void);
+
 /*
  * Attribute flags.  These should be or-ed together to figure out what
  * has been changed!
@@ -309,37 +321,140 @@ struct address_space;
 struct writeback_control;
 
 struct iov_iter {
-	const struct iovec *iov;
+	struct iov_iter_ops *ops;
+	unsigned long data;
 	unsigned long nr_segs;
 	size_t iov_offset;
 	size_t count;
 };
 
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes);
-void iov_iter_advance(struct iov_iter *i, size_t bytes);
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes);
-size_t iov_iter_single_seg_count(const struct iov_iter *i);
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-			 struct iov_iter *i);
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
-			struct iov_iter *i);
+struct iov_iter_ops {
+	size_t (*ii_copy_to_user_atomic)(struct page *, struct iov_iter *,
+					 unsigned long, size_t);
+	size_t (*ii_copy_to_user)(struct page *, struct iov_iter *,
+				  unsigned long, size_t);
+	size_t (*ii_copy_from_user_atomic)(struct page *, struct iov_iter *,
+					   unsigned long, size_t);
+	size_t (*ii_copy_from_user)(struct page *, struct iov_iter *,
+					  unsigned long, size_t);
+	void (*ii_advance)(struct iov_iter *, size_t);
+	int (*ii_fault_in_readable)(struct iov_iter *, size_t);
+	size_t (*ii_single_seg_count)(const struct iov_iter *);
+	int (*ii_shorten)(struct iov_iter *, size_t);
+};
+
+static inline size_t iov_iter_copy_to_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_to_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_to_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_to_user(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_from_user_atomic(page, i, offset, bytes);
+}
+static inline size_t iov_iter_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	return i->ops->ii_copy_from_user(page, i, offset, bytes);
+}
+static inline void iov_iter_advance(struct iov_iter *i, size_t bytes)
+{
+	return i->ops->ii_advance(i, bytes);
+}
+static inline int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return i->ops->ii_fault_in_readable(i, bytes);
+}
+static inline size_t iov_iter_single_seg_count(const struct iov_iter *i)
+{
+	return i->ops->ii_single_seg_count(i);
+}
+static inline int iov_iter_shorten(struct iov_iter *i, size_t count)
+{
+	return i->ops->ii_shorten(i, count);
+}
+
+extern struct iov_iter_ops ii_bvec_ops;
+
+struct bio_vec;
+static inline void iov_iter_init_bvec(struct iov_iter *i,
+				      struct bio_vec *bvec,
+				      unsigned long nr_segs,
+				      size_t count, size_t written)
+{
+	i->ops = &ii_bvec_ops;
+	i->data = (unsigned long)bvec;
+	i->nr_segs = nr_segs;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_bvec(const struct iov_iter *i)
+{
+	return i->ops == &ii_bvec_ops;
+}
+static inline struct bio_vec *iov_iter_bvec(struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_bvec(i));
+	return (struct bio_vec *)i->data;
+}
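+
+/*
+ * Usage sketch (illustrative): feed kernel pages through a file's
+ * ->write_iter() without building user iovecs:
+ *
+ *	struct iov_iter iter;
+ *
+ *	iov_iter_init_bvec(&iter, bvec, nr_segs, len, 0);
+ *	ret = file->f_op->write_iter(iocb, &iter, pos);
+ *
+ * The ops-table dispatch keeps the generic code oblivious to whether
+ * the payload is an iovec array, a bio_vec array or a single page.
+ */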
+
+extern struct iov_iter_ops ii_page_ops;
+
+static inline void iov_iter_init_page(struct iov_iter *i,
+				      struct page *page,
+				      size_t count, size_t written)
+{
+	i->ops = &ii_page_ops;
+	i->data = (unsigned long)page;
+	i->nr_segs = 1;
+	i->iov_offset = 0;
+	i->count = count + written;
+
+	iov_iter_advance(i, written);
+}
+static inline int iov_iter_has_page(const struct iov_iter *i)
+{
+	return i->ops == &ii_page_ops;
+}
+static inline struct page *iov_iter_page(struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_page(i));
+	return (struct page *)i->data;
+}
+
+extern struct iov_iter_ops ii_iovec_ops;
 
 static inline void iov_iter_init(struct iov_iter *i,
 			const struct iovec *iov, unsigned long nr_segs,
 			size_t count, size_t written)
 {
-	i->iov = iov;
+	i->ops = &ii_iovec_ops;
+	i->data = (unsigned long)iov;
 	i->nr_segs = nr_segs;
 	i->iov_offset = 0;
 	i->count = count + written;
 
 	iov_iter_advance(i, written);
 }
+static inline int iov_iter_has_iovec(const struct iov_iter *i)
+{
+	return i->ops == &ii_iovec_ops;
+}
+static inline struct iovec *iov_iter_iovec(const struct iov_iter *i)
+{
+	BUG_ON(!iov_iter_has_iovec(i));
+	return (struct iovec *)i->data;
+}
 
-static inline size_t iov_iter_count(struct iov_iter *i)
+static inline size_t iov_iter_count(const struct iov_iter *i)
 {
 	return i->count;
 }
@@ -399,6 +514,12 @@ struct address_space_operations {
 	void (*freepage)(struct page *);
 	ssize_t (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
 			loff_t offset, unsigned long nr_segs);
+	ssize_t (*direct_IO_bvec)(int, struct kiocb *, struct bio_vec *bvec,
+			loff_t offset, unsigned long bvec_len);
+	ssize_t (*direct_IO_page)(int, struct kiocb *, struct page *page,
+			loff_t offset);
+	int (*get_xip_mem)(struct address_space *, pgoff_t, int,
+						void **, unsigned long *);
 	RH_KABI_DEPRECATE_FN(int, get_xip_mem, struct address_space *, pgoff_t,
 			int, void **, unsigned long *)
 	/*
@@ -444,7 +565,6 @@ struct address_space {
 	RH_KABI_REPLACE(unsigned int i_mmap_writable,
 			 atomic_t i_mmap_writable) /* count VM_SHARED mappings */
 	struct rb_root		i_mmap;		/* tree of private and shared mappings */
-	struct list_head	i_mmap_nonlinear;/*list VM_NONLINEAR mappings */
 	struct mutex		i_mmap_mutex;	/* protect tree, count, list */
 	/* Protected by tree_lock together with the radix tree */
 	unsigned long		nrpages;	/* number of total pages */
@@ -458,6 +578,9 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	void			*private_data;	/* ditto */
+	struct list_head	i_peer_list;
+	struct file		*i_peer_file;
+	struct user_beancounter *dirtied_ub;
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
@@ -518,8 +641,7 @@ int mapping_tagged(struct address_space *mapping, int tag);
  */
 static inline int mapping_mapped(struct address_space *mapping)
 {
-	return	!RB_EMPTY_ROOT(&mapping->i_mmap) ||
-		!list_empty(&mapping->i_mmap_nonlinear);
+	return	!RB_EMPTY_ROOT(&mapping->i_mmap);
 }
 
 /*
@@ -633,6 +755,7 @@ struct inode {
 	struct mutex		i_mutex;
 
 	unsigned long		dirtied_when;	/* jiffies of first dirtying */
+	unsigned long		dirtied_time_when; /* jiffies of first timestamp dirtying */
 
 	struct hlist_node	i_hash;
 	struct list_head	i_wb_list;	/* backing dev IO list */
@@ -842,6 +965,7 @@ struct file {
 		struct rcu_head 	fu_rcuhead;
 	} f_u;
 	struct path		f_path;
+	struct path		f_original_path;
 #define f_dentry	f_path.dentry
 	struct inode		*f_inode;	/* cached value */
 	const struct file_operations	*f_op;
@@ -867,6 +991,7 @@ struct file {
 	struct fown_struct	f_owner;
 	const struct cred	*f_cred;
 	struct file_ra_state	f_ra;
+	struct user_beancounter	*f_ub;
 
 	u64			f_version;
 #ifdef CONFIG_SECURITY
@@ -1042,6 +1167,10 @@ struct file_lock {
 	fl_owner_t fl_owner;
 	unsigned int fl_flags;
 	unsigned char fl_type;
+#ifdef CONFIG_BEANCOUNTERS
+	unsigned char fl_charged;
+	struct user_beancounter *fl_ub;
+#endif
 	unsigned int fl_pid;
 	int fl_link_cpu;		/* what cpu's list is this on? */
 	struct pid *fl_nspid;
@@ -1078,6 +1207,8 @@ struct file_lock {
 
 extern void send_sigio(struct fown_struct *fown, int fd, int band);
 
+extern void generic_set_file_flags_unlocked(struct file *, unsigned int arg);
+extern int generic_set_file_flags(struct file *, unsigned int arg);
 #ifdef CONFIG_FILE_LOCKING
 extern int fcntl_getlk(struct file *, struct flock __user *);
 extern int fcntl_setlk(unsigned int, struct file *, unsigned int,
@@ -1095,7 +1226,7 @@ extern int fcntl_getlease(struct file *filp);
 /* fs/locks.c */
 void locks_free_lock(struct file_lock *fl);
 extern void locks_init_lock(struct file_lock *);
-extern struct file_lock * locks_alloc_lock(void);
+extern struct file_lock * locks_alloc_lock(int charge);
 extern void locks_copy_lock(struct file_lock *, struct file_lock *);
 extern void __locks_copy_lock(struct file_lock *, const struct file_lock *);
 extern void locks_remove_posix(struct file *, fl_owner_t);
@@ -1386,15 +1517,6 @@ struct super_block {
 #endif
 #endif
 	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
-	/* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */
-	struct list_head	s_dentry_lru;	/* unused dentry lru */
-	int			s_nr_dentry_unused;	/* # of dentry on lru */
-
-	/* s_inode_lru_lock protects s_inode_lru and s_nr_inodes_unused */
-	spinlock_t		s_inode_lru_lock ____cacheline_aligned_in_smp;
-	struct list_head	s_inode_lru;		/* unused inode lru */
-	int			s_nr_inodes_unused;	/* # of inodes on lru */
-
 	struct block_device	*s_bdev;
 	struct backing_dev_info *s_bdi;
 	struct mtd_info		*s_mtd;
@@ -1448,6 +1570,13 @@ struct super_block {
 
 	/* AIO completions deferred from interrupt context */
 	RH_KABI_EXTEND(struct workqueue_struct *s_dio_done_wq)
+
+	/*
+	 * Keep the lru lists last in the structure so they always sit on their
+	 * own individual cachelines.
+	 */
+	struct list_lru		s_dentry_lru ____cacheline_aligned_in_smp;
+	struct list_lru		s_inode_lru ____cacheline_aligned_in_smp;
 };
 
 extern const unsigned super_block_wrapper_version;
@@ -1480,10 +1609,6 @@ static inline int *get_s_stack_depth(struct super_block *sb)
 	return wrapper ? &wrapper->s_stack_depth : NULL;
 }
 
-/* superblock cache pruning functions */
-extern void prune_icache_sb(struct super_block *sb, int nr_to_scan);
-extern void prune_dcache_sb(struct super_block *sb, int nr_to_scan);
-
 extern struct timespec current_fs_time(struct super_block *sb);
 
 /*
@@ -1615,6 +1740,8 @@ extern int vfs_rmdir(struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
 extern int vfs_whiteout(struct inode *, struct dentry *);
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+			   const char *, unsigned int, struct path *);
 
 /*
  * VFS dentry helper functions.
@@ -1677,7 +1804,9 @@ struct file_operations {
 	ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
 	ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
 	ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*read_iter) (struct kiocb *, struct iov_iter *, loff_t);
 	ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
+	ssize_t (*write_iter) (struct kiocb *, struct iov_iter *, loff_t);
 	int (*readdir) (struct file *, void *, filldir_t);
 	unsigned int (*poll) (struct file *, struct poll_table_struct *);
 	long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
@@ -1692,10 +1821,11 @@ struct file_operations {
 	int (*lock) (struct file *, int, struct file_lock *);
 	ssize_t (*sendpage) (struct file *, struct page *, int, size_t, loff_t *, int);
 	unsigned long (*get_unmapped_area)(struct file *, unsigned long, unsigned long, unsigned long, unsigned long);
-	int (*check_flags)(int);
+	int (*set_flags)(struct file *, int);
 	int (*flock) (struct file *, int, struct file_lock *);
 	ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int);
 	ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int);
+	int (*fadvise)(struct file *file, loff_t offset, loff_t len, int advice);
 	RH_KABI_REPLACE(int (*setlease)(struct file *, long, struct file_lock **), int (*setlease)(struct file *, long, struct file_lock **, void **))
 	long (*fallocate)(struct file *file, int mode, loff_t offset,
 			  loff_t len);
@@ -1802,8 +1932,10 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
-	int (*nr_cached_objects)(struct super_block *);
-	void (*free_cached_objects)(struct super_block *, int);
+	long (*nr_cached_objects)(struct super_block *,
+				  struct shrink_control *);
+	long (*free_cached_objects)(struct super_block *,
+				    struct shrink_control *);
 };
 
 /*
@@ -1939,8 +2071,12 @@ struct super_operations {
 #define __I_DIO_WAKEUP		9
 #define I_DIO_WAKEUP		(1 << I_DIO_WAKEUP)
 #define I_LINKABLE		(1 << 10)
+#define I_DIRTY_TIME		(1 << 11)
+#define __I_DIRTY_TIME_EXPIRED	12
+#define I_DIRTY_TIME_EXPIRED	(1 << __I_DIRTY_TIME_EXPIRED)
 
 #define I_DIRTY (I_DIRTY_SYNC | I_DIRTY_DATASYNC | I_DIRTY_PAGES)
+#define I_DIRTY_ALL (I_DIRTY | I_DIRTY_TIME)
 
 extern void __mark_inode_dirty(struct inode *, int);
 static inline void mark_inode_dirty(struct inode *inode)
@@ -1996,7 +2132,8 @@ extern void touch_atime(struct path *);
 static inline void file_accessed(struct file *file)
 {
 	if (!(file->f_flags & O_NOATIME))
-		touch_atime(&file->f_path);
+		touch_atime(file->f_original_path.mnt ?
+			    &file->f_original_path : &file->f_path);
 }
 
 int sync_inode(struct inode *inode, struct writeback_control *wbc);
@@ -2010,6 +2147,7 @@ struct file_system_type {
 #define FS_HAS_SUBTYPE		4
 #define FS_USERNS_MOUNT		8	/* Can be mounted by userns root */
 #define FS_USERNS_DEV_MOUNT	16 /* A userns mount does not imply MNT_NODEV */
+#define FS_VIRTUALIZED		64	/* Can mount this fstype inside ve */
 #define FS_HAS_RM_XQUOTA	256	/* KABI: fs has the rm_xquota quota op */
 #define FS_HAS_INVALIDATE_RANGE	512	/* FS has new ->invalidatepage with length arg */
 #define FS_HAS_DIO_IODONE2	1024	/* KABI: fs supports new iodone */
@@ -2017,6 +2155,13 @@ struct file_system_type {
 #define FS_HAS_DOPS_WRAPPER	4096	/* kabi: fs is using dentry_operations_wrapper. sb->s_d_op points to
 dentry_operations_wrapper */
 #define FS_RENAME_DOES_D_MOVE	32768	/* FS will handle d_move() during rename() internally. */
+/*
+ * f_op->mmap must be called with vma=NULL before taking mmap_sem;
+ * workaround for wrong i_mutex vs mmap_sem lock ordering in pfcache
+ * (PSBM-23133) - vdavydov@
+ */
+#define FS_HAS_MMAP_PREP	(1<<18)
+
 	struct dentry *(*mount) (struct file_system_type *, int,
 		       const char *, void *);
 	void (*kill_sb) (struct super_block *);
@@ -2129,6 +2274,7 @@ void kill_anon_super(struct super_block *sb);
 void kill_litter_super(struct super_block *sb);
 void deactivate_super(struct super_block *sb);
 void deactivate_locked_super(struct super_block *sb);
+void put_super(struct super_block *sb);
 int set_anon_super(struct super_block *s, void *data);
 int get_anon_bdev(dev_t *);
 void free_anon_bdev(dev_t);
@@ -2180,8 +2326,11 @@ extern bool our_mnt(struct vfsmount *mnt);
 
 extern int current_umask(void);
 
+extern int ve_devmnt_process(struct ve_struct *, dev_t, void **, int);
+
 extern void ihold(struct inode * inode);
 extern void iput(struct inode *);
+extern int generic_update_time(struct inode *, struct timespec *, int);
 
 /* /sys/fs */
 extern struct kobject *fs_kobj;
@@ -2409,6 +2558,7 @@ extern int register_blkdev(unsigned int, const char *);
 extern void unregister_blkdev(unsigned int, const char *);
 extern struct block_device *bdget(dev_t);
 extern struct block_device *bdgrab(struct block_device *bdev);
+extern void bd_write_size(struct block_device *, loff_t size);
 extern void bd_set_size(struct block_device *, loff_t size);
 extern void bd_forget(struct inode *inode);
 extern void bdput(struct block_device *);
@@ -2679,6 +2829,12 @@ extern int is_subdir(struct dentry *, struct dentry *);
 extern int path_is_under(struct path *, struct path *);
 extern ino_t find_inode_number(struct dentry *, struct qstr *);
 
+int ve_fsync_behavior(void);
+
+#define FSYNC_NEVER	0	/* ve syncs are ignored    */
+#define FSYNC_ALWAYS	1	/* ve syncs work as usual   */
+#define FSYNC_FILTERED	2	/* ve syncs only its files */
+
 #include <linux/err.h>
 
 /* needed for stackable file system support */
@@ -2707,6 +2863,11 @@ extern struct inode *ilookup(struct super_block *sb, unsigned long ino);
 
 extern struct inode * iget5_locked(struct super_block *, unsigned long, int (*test)(struct inode *, void *), int (*set)(struct inode *, void *), void *);
 extern struct inode * iget_locked(struct super_block *, unsigned long);
+extern struct inode *find_inode_nowait(struct super_block *,
+				       unsigned long,
+				       int (*match)(struct inode *,
+						    unsigned long, void *),
+				       void *data);
 extern int insert_inode_locked4(struct inode *, unsigned long, int (*test)(struct inode *, void *), void *);
 extern int insert_inode_locked(struct inode *);
 #ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -2757,18 +2918,23 @@ extern int sb_min_blocksize(struct super_block *, int);
 
 extern int generic_file_mmap(struct file *, struct vm_area_struct *);
 extern int generic_file_readonly_mmap(struct file *, struct vm_area_struct *);
-extern int generic_file_remap_pages(struct vm_area_struct *, unsigned long addr,
-		unsigned long size, pgoff_t pgoff);
 extern int file_read_actor(read_descriptor_t * desc, struct page *page, unsigned long offset, unsigned long size);
 int generic_write_checks(struct file *file, loff_t *pos, size_t *count, int isblk);
 extern ssize_t generic_file_aio_read(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_read_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t __generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long,
 		loff_t *);
+extern ssize_t __generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t *);
 extern ssize_t generic_file_aio_write(struct kiocb *, const struct iovec *, unsigned long, loff_t);
+extern ssize_t generic_file_write_iter(struct kiocb *, struct iov_iter *, loff_t);
 extern ssize_t generic_file_direct_write(struct kiocb *, const struct iovec *,
 		unsigned long *, loff_t, loff_t *, size_t, size_t);
+extern ssize_t generic_file_direct_write_iter(struct kiocb *, struct iov_iter *,
+		loff_t, loff_t *, size_t);
 extern ssize_t generic_file_buffered_write(struct kiocb *, const struct iovec *,
 		unsigned long, loff_t, loff_t *, size_t, ssize_t);
+extern ssize_t generic_file_buffered_write_iter(struct kiocb *, struct iov_iter *,
+		loff_t, loff_t *, ssize_t);
 extern ssize_t do_sync_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos);
 extern ssize_t do_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos);
 extern int generic_segment_checks(const struct iovec *iov,
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -22,6 +22,7 @@
 #define part_to_dev(part)	(&((part)->__dev))
 
 extern struct device_type part_type;
+extern struct device_type disk_type;
 extern struct kobject *block_depr;
 extern struct class block_class;
 
--- a/include/linux/gfp.h
+++ b/include/linux/gfp.h
@@ -36,7 +36,7 @@ struct vm_area_struct;
 #define ___GFP_HARDWALL		0x20000u
 #define ___GFP_THISNODE		0x40000u
 #define ___GFP_RECLAIMABLE	0x80000u
-#define ___GFP_KMEMCG		0x100000u
+#define ___GFP_ACCOUNT		0x100000u
 #define ___GFP_NOTRACK		0x200000u
 #define ___GFP_NO_KSWAPD	0x400000u
 #define ___GFP_OTHER_NODE	0x800000u
@@ -92,11 +92,11 @@ struct vm_area_struct;
 #define __GFP_HARDWALL   ((__force gfp_t)___GFP_HARDWALL) /* Enforce hardwall cpuset memory allocs */
 #define __GFP_THISNODE	((__force gfp_t)___GFP_THISNODE)/* No fallback, no policies */
 #define __GFP_RECLAIMABLE ((__force gfp_t)___GFP_RECLAIMABLE) /* Page is reclaimable */
+#define __GFP_ACCOUNT	((__force gfp_t)___GFP_ACCOUNT)	/* Account to kmemcg */
 #define __GFP_NOTRACK	((__force gfp_t)___GFP_NOTRACK)  /* Don't track with kmemcheck */
 
 #define __GFP_NO_KSWAPD	((__force gfp_t)___GFP_NO_KSWAPD)
 #define __GFP_OTHER_NODE ((__force gfp_t)___GFP_OTHER_NODE) /* On behalf of other node */
-#define __GFP_KMEMCG	((__force gfp_t)___GFP_KMEMCG) /* Allocation comes from a memcg-accounted resource */
 #define __GFP_WRITE	((__force gfp_t)___GFP_WRITE)	/* Allocator intends to dirty page */
 
 /*
@@ -105,7 +105,7 @@ struct vm_area_struct;
  */
 #define __GFP_NOTRACK_FALSE_POSITIVE (__GFP_NOTRACK)
 
-#define __GFP_BITS_SHIFT 25	/* Room for N __GFP_FOO bits */
+#define __GFP_BITS_SHIFT 26	/* Room for N __GFP_FOO bits */
 #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1))
 
 /* This equals 0, but use constants in case they ever change */
@@ -115,6 +115,7 @@ struct vm_area_struct;
 #define GFP_NOIO	(__GFP_WAIT)
 #define GFP_NOFS	(__GFP_WAIT | __GFP_IO)
 #define GFP_KERNEL	(__GFP_WAIT | __GFP_IO | __GFP_FS)
+#define GFP_KERNEL_ACCOUNT (GFP_KERNEL | __GFP_ACCOUNT)
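+/* e.g. kmalloc(size, GFP_KERNEL_ACCOUNT) charges the current memcg */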
 #define GFP_TEMPORARY	(__GFP_WAIT | __GFP_IO | __GFP_FS | \
 			 __GFP_RECLAIMABLE)
 #define GFP_USER	(__GFP_WAIT | __GFP_IO | __GFP_FS | __GFP_HARDWALL)
@@ -389,9 +390,6 @@ extern void *__alloc_page_frag(struct page_frag_cache *nc,
 			       unsigned int fragsz, gfp_t gfp_mask);
 extern void __free_page_frag(void *addr);
 
-extern void __free_memcg_kmem_pages(struct page *page, unsigned int order);
-extern void free_memcg_kmem_pages(unsigned long addr, unsigned int order);
-
 #define __free_page(page) __free_pages((page), 0)
 #define free_page(addr) free_pages((addr), 0)
 
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -6,6 +6,7 @@
 #include <linux/ftrace_irq.h>
 #include <linux/vtime.h>
 
+#include <bc/task.h>
 
 #if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS)
 extern void synchronize_irq(unsigned int irq);
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -136,6 +136,12 @@ extern struct task_group root_task_group;
 # define INIT_CGROUP_SCHED(tsk)
 #endif
 
+#ifdef CONFIG_VE
+#define	INIT_TASK_VE(tsk) .task_ve = &ve0,
+#else
+#define	INIT_TASK_VE(tsk)
+#endif
+
 #ifdef CONFIG_PERF_EVENTS
 # define INIT_PERF_EVENTS(tsk)						\
 	.perf_event_mutex = 						\
@@ -164,6 +170,13 @@ extern struct task_group root_task_group;
 # define INIT_RT_MUTEXES(tsk)
 #endif
 
+#ifdef CONFIG_KASAN
+# define INIT_KASAN(tsk)						\
+	.kasan_depth = 1,
+#else
+# define INIT_KASAN(tsk)
+#endif
+
 /*
  *  INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -192,6 +205,7 @@ extern struct task_group root_task_group;
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
 	INIT_PUSHABLE_TASKS(tsk)					\
 	INIT_CGROUP_SCHED(tsk)						\
+	INIT_TASK_VE(tsk)						\
 	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
 	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
 	.real_parent	= &tsk,						\
@@ -234,6 +248,7 @@ extern struct task_group root_task_group;
 	INIT_CPUSET_SEQ							\
 	INIT_RT_MUTEXES(tsk)					\
 	INIT_VTIME(tsk)							\
+	INIT_KASAN(tsk)							\
 }
 
 
--- a/include/linux/iocontext.h
+++ b/include/linux/iocontext.h
@@ -115,6 +115,9 @@ struct io_context {
 	struct hlist_head	icq_list;
 
 	struct work_struct release_work;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ioc_ub;
+#endif
 };
 
 /**
--- a/include/linux/ioprio.h
+++ b/include/linux/ioprio.h
@@ -39,6 +39,7 @@ enum {
 	IOPRIO_WHO_PROCESS = 1,
 	IOPRIO_WHO_PGRP,
 	IOPRIO_WHO_USER,
+	IOPRIO_WHO_UBC = 1000,
 };
 
 /*
--- /dev/null
+++ b/include/linux/kasan-checks.h
@@ -0,0 +1,12 @@
+#ifndef _LINUX_KASAN_CHECKS_H
+#define _LINUX_KASAN_CHECKS_H
+
+#ifdef CONFIG_KASAN
+void kasan_check_read(const void *p, unsigned int size);
+void kasan_check_write(const void *p, unsigned int size);
+#else
+static inline void kasan_check_read(const void *p, unsigned int size) { }
+static inline void kasan_check_write(const void *p, unsigned int size) { }
+#endif
+
+#endif
--- /dev/null
+++ b/include/linux/kasan.h
@@ -0,0 +1,88 @@
+#ifndef _LINUX_KASAN_H
+#define _LINUX_KASAN_H
+
+#include <linux/types.h>
+
+struct kmem_cache;
+struct page;
+struct vm_struct;
+
+#ifdef CONFIG_KASAN
+
+#define KASAN_SHADOW_SCALE_SHIFT 3
+#define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL)
+
+#include <asm/kasan.h>
+#include <linux/sched.h>
+
+static inline void *kasan_mem_to_shadow(const void *addr)
+{
+	return (void *)((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
+		+ KASAN_SHADOW_OFFSET;
+}
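+
+/*
+ * With KASAN_SHADOW_SCALE_SHIFT == 3 every shadow byte tracks an
+ * 8-byte granule of real memory: the shadow for address A lives at
+ * (A >> 3) + KASAN_SHADOW_OFFSET, and a shadow value of 0 means the
+ * whole granule is addressable.
+ */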
+
+/* Enable reporting bugs after kasan_disable_current() */
+static inline void kasan_enable_current(void)
+{
+	current->kasan_depth++;
+}
+
+/* Disable reporting bugs for current task */
+static inline void kasan_disable_current(void)
+{
+	current->kasan_depth--;
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size);
+
+void kasan_alloc_pages(struct page *page, unsigned int order);
+void kasan_free_pages(struct page *page, unsigned int order);
+
+void kasan_poison_slab(struct page *page);
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object);
+void kasan_poison_object_data(struct kmem_cache *cache, void *object);
+
+void kasan_kmalloc_large(const void *ptr, size_t size);
+void kasan_kfree_large(const void *ptr);
+void kasan_kfree(void *ptr);
+void kasan_kmalloc(struct kmem_cache *s, const void *object, size_t size);
+void kasan_krealloc(const void *object, size_t new_size);
+
+void kasan_slab_alloc(struct kmem_cache *s, void *object);
+void kasan_slab_free(struct kmem_cache *s, void *object);
+
+int kasan_module_alloc(void *addr, size_t size);
+void kasan_free_shadow(const struct vm_struct *vm);
+
+#else /* CONFIG_KASAN */
+
+static inline void kasan_unpoison_shadow(const void *address, size_t size) {}
+
+static inline void kasan_enable_current(void) {}
+static inline void kasan_disable_current(void) {}
+
+static inline void kasan_alloc_pages(struct page *page, unsigned int order) {}
+static inline void kasan_free_pages(struct page *page, unsigned int order) {}
+
+static inline void kasan_poison_slab(struct page *page) {}
+static inline void kasan_unpoison_object_data(struct kmem_cache *cache,
+					void *object) {}
+static inline void kasan_poison_object_data(struct kmem_cache *cache,
+					void *object) {}
+
+static inline void kasan_kmalloc_large(const void *ptr, size_t size) {}
+static inline void kasan_kfree_large(const void *ptr) {}
+static inline void kasan_kfree(void *ptr) {}
+static inline void kasan_kmalloc(struct kmem_cache *s, const void *object,
+				size_t size) {}
+static inline void kasan_krealloc(const void *object, size_t new_size) {}
+
+static inline void kasan_slab_alloc(struct kmem_cache *s, void *object) {}
+static inline void kasan_slab_free(struct kmem_cache *s, void *object) {}
+
+static inline int kasan_module_alloc(void *addr, size_t size) { return 0; }
+static inline void kasan_free_shadow(const struct vm_struct *vm) {}
+
+#endif /* CONFIG_KASAN */
+
+#endif /* LINUX_KASAN_H */
--- /dev/null
+++ b/include/linux/kcov.h
@@ -0,0 +1,29 @@
+#ifndef _LINUX_KCOV_H
+#define _LINUX_KCOV_H
+
+#include <uapi/linux/kcov.h>
+
+struct task_struct;
+
+#ifdef CONFIG_KCOV
+
+void kcov_task_init(struct task_struct *t);
+void kcov_task_exit(struct task_struct *t);
+
+enum kcov_mode {
+	/* Coverage collection is not enabled yet. */
+	KCOV_MODE_DISABLED = 0,
+	/*
+	 * Tracing coverage collection mode.
+	 * Covered PCs are collected in a per-task buffer.
+	 */
+	KCOV_MODE_TRACE = 1,
+};
+
+#else
+
+static inline void kcov_task_init(struct task_struct *t) {}
+static inline void kcov_task_exit(struct task_struct *t) {}
+
+#endif /* CONFIG_KCOV */
+#endif /* _LINUX_KCOV_H */
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -25,6 +25,7 @@ enum cpu_usage_stat {
 	CPUTIME_IRQ,
 	CPUTIME_IDLE,
 	CPUTIME_IOWAIT,
+	CPUTIME_USED,
 	CPUTIME_STEAL,
 	CPUTIME_GUEST,
 	CPUTIME_GUEST_NICE,
@@ -35,6 +36,42 @@ struct kernel_cpustat {
 	u64 cpustat[NR_STATS];
 };
 
+static inline u64 kernel_cpustat_total_usage(const struct kernel_cpustat *p)
+{
+	return p->cpustat[CPUTIME_USER] + p->cpustat[CPUTIME_NICE] +
+		p->cpustat[CPUTIME_SYSTEM];
+}
+
+static inline u64 kernel_cpustat_total_idle(const struct kernel_cpustat *p)
+{
+	return p->cpustat[CPUTIME_IDLE] + p->cpustat[CPUTIME_IOWAIT];
+}
+
+static inline void kernel_cpustat_zero(struct kernel_cpustat *p)
+{
+	memset(p, 0, sizeof(*p));
+}
+
+static inline void kernel_cpustat_add(const struct kernel_cpustat *lhs,
+				      const struct kernel_cpustat *rhs,
+				      struct kernel_cpustat *res)
+{
+	int i;
+
+	for (i = 0; i < NR_STATS; i++)
+		res->cpustat[i] = lhs->cpustat[i] + rhs->cpustat[i];
+}
+
+static inline void kernel_cpustat_sub(const struct kernel_cpustat *lhs,
+				      const struct kernel_cpustat *rhs,
+				      struct kernel_cpustat *res)
+{
+	int i;
+
+	for (i = 0; i < NR_STATS; i++)
+		res->cpustat[i] = lhs->cpustat[i] - rhs->cpustat[i];
+}
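+
+/*
+ * Usage sketch (illustrative): usage over an interval falls out of
+ * two snapshots and a subtraction:
+ *
+ *	struct kernel_cpustat prev, now, delta;
+ *
+ *	kernel_cpustat_sub(&now, &prev, &delta);
+ *	busy = kernel_cpustat_total_usage(&delta);
+ */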
+
 struct kernel_stat {
 #ifndef CONFIG_GENERIC_HARDIRQS
        unsigned int irqs[NR_IRQS];
--- /dev/null
+++ b/include/linux/kmapset.h
@@ -0,0 +1,104 @@
+/*
+ *  include/linux/kmapset.h
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_KMAPSET_H
+#define _LINUX_KMAPSET_H
+
+#include <linux/kernel.h>
+#include <linux/rbtree.h>
+#include <linux/rculist.h>
+#include <linux/kref.h>
+
+struct kmapset_map;
+
+struct kmapset_set {
+	struct mutex		mutex;
+	struct rb_root		tree;
+	unsigned long		default_value;
+};
+
+struct kmapset_map {
+	struct kref		kref;
+	unsigned		size;
+	struct kmapset_set	*set;
+	unsigned long		default_value;
+	unsigned long		hash;
+	struct hlist_head	links;
+	union {
+		struct rb_node		node;
+		struct rcu_head		rcu_head;
+	};
+};
+
+struct kmapset_key {
+	struct hlist_head	links;
+};
+
+struct kmapset_link {
+	struct kmapset_map	*map;
+	struct kmapset_key	*key;
+	unsigned long		value;
+	struct hlist_node	map_link;
+	union {
+		struct hlist_node	key_link;
+		struct rcu_head		rcu_head;
+	};
+};
+
+static inline void kmapset_lock(struct kmapset_set *set)
+{
+	mutex_lock(&set->mutex);
+}
+
+static inline void kmapset_unlock(struct kmapset_set *set)
+{
+	mutex_unlock(&set->mutex);
+}
+
+struct kmapset_map *kmapset_new(struct kmapset_set *set);
+
+static inline void kmapset_init_set(struct kmapset_set *set)
+{
+	mutex_init(&set->mutex);
+	set->tree = RB_ROOT;
+	set->default_value = 0;
+}
+
+static inline void kmapset_init_map(struct kmapset_map *map,
+		struct kmapset_set *set)
+{
+	kref_init(&map->kref);
+	map->size = 0;
+	map->set = set;
+	map->default_value = set->default_value;
+	INIT_HLIST_HEAD(&map->links);
+	RB_CLEAR_NODE(&map->node);
+}
+
+static inline void kmapset_init_key(struct kmapset_key *key)
+{
+	INIT_HLIST_HEAD(&key->links);
+}
+
+struct kmapset_map *kmapset_get(struct kmapset_map *map);
+void kmapset_put(struct kmapset_map *map);
+
+struct kmapset_map *kmapset_dup(struct kmapset_map *old);
+struct kmapset_map *kmapset_commit(struct kmapset_map *map);
+
+struct kmapset_link *kmapset_lookup(struct kmapset_map *map,
+		struct kmapset_key *key);
+unsigned long kmapset_get_value(struct kmapset_map *map,
+		struct kmapset_key *key);
+int kmapset_set_value(struct kmapset_map *map,
+		struct kmapset_key *key, unsigned long value);
+bool kmapset_del_value(struct kmapset_map *map, struct kmapset_key *key);
+void kmapset_set_default(struct kmapset_map *map, unsigned long value);
+
+void kmapset_unlink(struct kmapset_key *key, struct kmapset_set *set);
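+
+/*
+ * Typical lifecycle, inferred from the API above (a sketch, not
+ * normative): dup the old map to get a private copy, modify it,
+ * commit it back so identical maps are shared, and drop the
+ * reference when done:
+ *
+ *	map = kmapset_dup(old);
+ *	kmapset_set_value(map, key, val);
+ *	map = kmapset_commit(map);
+ *	kmapset_put(map);
+ */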
+
+#endif /* _LINUX_KMAPSET_H */
--- a/include/linux/kmemleak.h
+++ b/include/linux/kmemleak.h
@@ -21,6 +21,8 @@
 #ifndef __KMEMLEAK_H
 #define __KMEMLEAK_H
 
+#include <linux/slab.h>
+
 #ifdef CONFIG_DEBUG_KMEMLEAK
 
 extern void kmemleak_init(void) __ref;
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -23,7 +23,7 @@
 #include <linux/stddef.h>
 #include <linux/errno.h>
 #include <linux/compiler.h>
-#include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/sysctl.h>
 
 #define KMOD_PATH_LEN 256
@@ -44,6 +44,11 @@ static inline int request_module_nowait(const char *name, ...) { return -ENOSYS;
 #define try_then_request_module(x, mod...) (x)
 #endif
 
+#ifdef CONFIG_VE_IPTABLES
+extern bool module_payload_allowed(const char *module);
+#else
+static inline bool module_payload_allowed(const char *module) { return true; }
+#endif
 
 struct cred;
 struct file;
@@ -54,7 +59,7 @@ struct file;
 #define UMH_KILLABLE	4	/* wait for EXEC/PROC killable */
 
 struct subprocess_info {
-	struct work_struct work;
+	struct kthread_work work;
 	struct completion *complete;
 	char *path;
 	char **argv;
@@ -67,6 +72,11 @@ struct subprocess_info {
 };
 
 extern int
+call_usermodehelper_by(struct kthread_worker *worker,
+			char *path, char **argv, char **envp, int wait,
+			int (*init)(struct subprocess_info *info, struct cred *new),
+			void (*cleanup)(struct subprocess_info *), void *data);
+extern int
 call_usermodehelper(char *path, char **argv, char **envp, int wait);
 
 extern struct subprocess_info *
@@ -85,6 +95,8 @@ enum umh_disable_depth {
 	UMH_DISABLED,
 };
 
+extern void usermodehelper_init(void);
+
 extern int __usermodehelper_disable(enum umh_disable_depth depth);
 extern void __usermodehelper_set_disable_depth(enum umh_disable_depth depth);
 
--- a/include/linux/kobject.h
+++ b/include/linux/kobject.h
@@ -206,6 +206,8 @@ extern struct kobject *firmware_kobj;
 int kobject_uevent(struct kobject *kobj, enum kobject_action action);
 int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 			char *envp[]);
+int kobject_uevent_env_one(struct kobject *kobj, enum kobject_action action,
+			char *envp[]);
 
 __printf(2, 3)
 int add_uevent_var(struct kobj_uevent_env *env, const char *format, ...);
--- a/include/linux/kobject_ns.h
+++ b/include/linux/kobject_ns.h
@@ -27,6 +27,7 @@ struct kobject;
 enum kobj_ns_type {
 	KOBJ_NS_TYPE_NONE = 0,
 	KOBJ_NS_TYPE_NET,
+	KOBJ_NS_TYPE_VE,
 	KOBJ_NS_TYPES
 };
 
--- a/include/linux/ksm.h
+++ b/include/linux/ksm.h
@@ -76,8 +76,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 int page_referenced_ksm(struct page *page,
 			struct mem_cgroup *memcg, unsigned long *vm_flags);
 int try_to_unmap_ksm(struct page *page, enum ttu_flags flags);
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
-		  struct vm_area_struct *, unsigned long, void *), void *arg);
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc);
 void ksm_migrate_page(struct page *newpage, struct page *oldpage);
 
 #else  /* !CONFIG_KSM */
@@ -120,8 +119,8 @@ static inline int try_to_unmap_ksm(struct page *page, enum ttu_flags flags)
 	return 0;
 }
 
-static inline int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page*,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+static inline int rmap_walk_ksm(struct page *page,
+			struct rmap_walk_control *rwc)
 {
 	return 0;
 }
--- a/include/linux/kthread.h
+++ b/include/linux/kthread.h
@@ -5,15 +5,34 @@
 #include <linux/sched.h>
 
 __printf(4, 5)
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data,
-					   int node,
-					   const char namefmt[], ...);
+struct kthread_create_info
+{
+	/* Information passed to kthread() from kthreadd. */
+	int (*threadfn)(void *data);
+	void *data;
+	int node;
+
+	/* Result passed back to kthread_create() from kthreadd. */
+	struct task_struct *result;
+	struct completion done;
+
+	struct list_head list;
+};
+
+struct task_struct *__kthread_create_on_node(
+		void (*addfn)(void *data, struct kthread_create_info *create),
+		void *add_data,
+		int (*threadfn)(void *data),
+		void *data, int node,
+		const char namefmt[],
+		...);
+
+#define kthread_create_on_node(threadfn, data, node, namefmt, arg...)	\
+	__kthread_create_on_node(NULL, NULL, threadfn, data, node, namefmt, ##arg)
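+
+/*
+ * The addfn/add_data hook lets a caller route the create request to
+ * something other than the global kthreadd (sketch: a per-container
+ * kthreadd would pass its own enqueue function here); passing NULL,
+ * as the wrapper above does, keeps the stock behaviour.
+ */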
 
 #define kthread_create(threadfn, data, namefmt, arg...) \
 	kthread_create_on_node(threadfn, data, -1, namefmt, ##arg)
 
-
 struct task_struct *kthread_create_on_cpu(int (*threadfn)(void *data),
 					  void *data,
 					  unsigned int cpu,
--- a/include/linux/kvm_host.h
+++ b/include/linux/kvm_host.h
@@ -142,7 +142,11 @@ static inline bool is_error_page(struct page *page)
 #define KVM_REQ_GLOBAL_CLOCK_UPDATE 22
 #define KVM_REQ_APIC_PAGE_RELOAD  25
 #define KVM_REQ_SMI               26
+#define KVM_REQ_HV_CRASH          27
 #define KVM_REQ_IOAPIC_EOI_EXIT   28
+#define KVM_REQ_HV_RESET          29
+#define KVM_REQ_HV_EXIT           30
+#define KVM_REQ_HV_STIMER         31
 
 #define KVM_USERSPACE_IRQ_SOURCE_ID		0
 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID	1
@@ -310,6 +314,11 @@ static inline unsigned long kvm_dirty_bitmap_bytes(struct kvm_memory_slot *memsl
 	return ALIGN(memslot->npages, BITS_PER_LONG) / 8;
 }
 
+struct kvm_hv_sint {
+	u32 vcpu;
+	u32 sint;
+};
+
 struct kvm_kernel_irq_routing_entry {
 	u32 gsi;
 	u32 type;
@@ -322,6 +331,7 @@ struct kvm_kernel_irq_routing_entry {
 			unsigned pin;
 		} irqchip;
 		struct msi_msg msi;
+		struct kvm_hv_sint hv_sint;
 	};
 	struct hlist_node link;
 };
@@ -329,6 +339,7 @@ struct kvm_kernel_irq_routing_entry {
 #ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
 struct kvm_irq_routing_table {
 	int chip[KVM_NR_IRQCHIPS][KVM_IRQCHIP_NUM_PINS];
+	struct kvm_kernel_irq_routing_entry *rt_entries;
 	u32 nr_rt_entries;
 	/*
 	 * Array indexed by gsi. Each entry contains list of irq chips
@@ -434,6 +445,8 @@ struct kvm {
 
 #define vcpu_debug(vcpu, fmt, ...)					\
 	kvm_debug("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
+#define vcpu_err(vcpu, fmt, ...)					\
+	kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__)
 
 static inline struct kvm_vcpu *kvm_get_vcpu(struct kvm *kvm, int i)
 {
@@ -1046,6 +1059,8 @@ static inline void kvm_irq_routing_update(struct kvm *kvm)
 {
 }
 #endif
+void kvm_arch_irq_routing_update(struct kvm *kvm);
+void kvm_arch_post_irq_routing_update(struct kvm *kvm);
 
 static inline int kvm_ioeventfd(struct kvm *kvm, struct kvm_ioeventfd *args)
 {
--- a/include/linux/list_lru.h
+++ b/include/linux/list_lru.h
@@ -9,6 +9,9 @@
 
 #include <linux/list.h>
 #include <linux/nodemask.h>
+#include <linux/shrinker.h>
+
+struct mem_cgroup;
 
 /* list_lru_walk_cb has to always return one of those */
 enum lru_status {
@@ -21,24 +24,45 @@ enum lru_status {
 				   internally, but has to return locked. */
 };
 
-struct list_lru_node {
-	spinlock_t		lock;
+struct list_lru_one {
 	struct list_head	list;
-	/* kept as signed so we can catch imbalance bugs */
+	/* may become negative during memcg reparenting */
 	long			nr_items;
+};
+
+struct list_lru_memcg {
+	/* array of per cgroup lists, indexed by memcg_cache_id */
+	struct list_lru_one	*lru[0];
+};
+
+struct list_lru_node {
+	/* protects all lists on the node, including per cgroup */
+	spinlock_t		lock;
+	/* global list, used for the root cgroup in cgroup aware lrus */
+	struct list_lru_one	lru;
+#ifdef CONFIG_MEMCG_KMEM
+	/* for cgroup aware lrus points to per cgroup lists, otherwise NULL */
+	struct list_lru_memcg	*memcg_lrus;
+#endif
 } ____cacheline_aligned_in_smp;
 
 struct list_lru {
 	struct list_lru_node	*node;
-	nodemask_t		active_nodes;
+#ifdef CONFIG_MEMCG_KMEM
+	struct list_head	list;
+#endif
 };
 
 void list_lru_destroy(struct list_lru *lru);
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key);
-static inline int list_lru_init(struct list_lru *lru)
-{
-	return list_lru_init_key(lru, NULL);
-}
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+		    struct lock_class_key *key);
+
+#define list_lru_init(lru)		__list_lru_init((lru), false, NULL)
+#define list_lru_init_key(lru, key)	__list_lru_init((lru), false, (key))
+#define list_lru_init_memcg(lru)	__list_lru_init((lru), true, NULL)
+
+int memcg_update_all_list_lrus(int num_memcgs);
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx);
 
 /**
  * list_lru_add: add an element to the lru list's tail
@@ -72,32 +96,48 @@ bool list_lru_add(struct list_lru *lru, struct list_head *item);
 bool list_lru_del(struct list_lru *lru, struct list_head *item);
 
 /**
- * list_lru_count_node: return the number of objects currently held by @lru
+ * list_lru_count_one: return the number of objects currently held by @lru
  * @lru: the lru pointer.
  * @nid: the node id to count from.
+ * @memcg: the cgroup to count from.
  *
  * Always return a non-negative number, 0 for empty lists. There is no
  * guarantee that the list is not updated while the count is being computed.
  * Callers that want such a guarantee need to provide an outer lock.
  */
+unsigned long list_lru_count_one(struct list_lru *lru,
+				 int nid, struct mem_cgroup *memcg);
 unsigned long list_lru_count_node(struct list_lru *lru, int nid);
+
+static inline unsigned long list_lru_shrink_count(struct list_lru *lru,
+						  struct shrink_control *sc)
+{
+	return list_lru_count_one(lru, sc->nid, sc->memcg);
+}
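+
+/*
+ * Usage sketch (illustrative): a memcg-aware shrinker's count side
+ * typically reduces to
+ *
+ *	return list_lru_shrink_count(&sb->s_dentry_lru, sc);
+ *
+ * since the shrink_control already carries the node and memcg to
+ * count from.
+ */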
+
 static inline unsigned long list_lru_count(struct list_lru *lru)
 {
 	long count = 0;
 	int nid;
 
-	for_each_node_mask(nid, lru->active_nodes)
+	for_each_node_state(nid, N_NORMAL_MEMORY)
 		count += list_lru_count_node(lru, nid);
 
 	return count;
 }
 
-typedef enum lru_status
-(*list_lru_walk_cb)(struct list_head *item, spinlock_t *lock, void *cb_arg);
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item);
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+			   struct list_head *head);
+
+typedef enum lru_status (*list_lru_walk_cb)(struct list_head *item,
+		struct list_lru_one *list, spinlock_t *lock, void *cb_arg);
+
 /**
- * list_lru_walk_node: walk a list_lru, isolating and disposing freeable items.
+ * list_lru_walk_one: walk a list_lru, isolating and disposing freeable items.
  * @lru: the lru pointer.
  * @nid: the node id to scan from.
+ * @memcg: the cgroup to scan from.
  * @isolate: callback function that is responsible for deciding what to do with
  *  the item currently being scanned
  * @cb_arg: opaque type that will be passed to @isolate
@@ -115,18 +155,30 @@ typedef enum lru_status
  *
  * Return value: the number of objects effectively removed from the LRU.
  */
+unsigned long list_lru_walk_one(struct list_lru *lru,
+				int nid, struct mem_cgroup *memcg,
+				list_lru_walk_cb isolate, void *cb_arg,
+				unsigned long *nr_to_walk);
 unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
 				 list_lru_walk_cb isolate, void *cb_arg,
 				 unsigned long *nr_to_walk);
 
 static inline unsigned long
+list_lru_shrink_walk(struct list_lru *lru, struct shrink_control *sc,
+		     list_lru_walk_cb isolate, void *cb_arg)
+{
+	return list_lru_walk_one(lru, sc->nid, sc->memcg, isolate, cb_arg,
+				 &sc->nr_to_scan);
+}
+
+static inline unsigned long
 list_lru_walk(struct list_lru *lru, list_lru_walk_cb isolate,
 	      void *cb_arg, unsigned long nr_to_walk)
 {
 	long isolated = 0;
 	int nid;
 
-	for_each_node_mask(nid, lru->active_nodes) {
+	for_each_node_state(nid, N_NORMAL_MEMORY) {
 		isolated += list_lru_walk_node(lru, nid, isolate,
 					       cb_arg, &nr_to_walk);
 		if (nr_to_walk <= 0)
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -23,12 +23,16 @@
 #include <linux/vm_event_item.h>
 #include <linux/hardirq.h>
 #include <linux/jump_label.h>
+#include <linux/page-flags.h>
 
 struct mem_cgroup;
 struct page_cgroup;
 struct page;
 struct mm_struct;
 struct kmem_cache;
+struct oom_context;
+
+extern struct oom_context global_oom_ctx;
 
 /* Stats that can be updated by kernel. */
 enum mem_cgroup_page_stat_item {
@@ -42,46 +46,27 @@ struct mem_cgroup_reclaim_cookie {
 };
 
 #ifdef CONFIG_MEMCG
-/*
- * All "charge" functions with gfp_mask should use GFP_KERNEL or
- * (gfp_mask & GFP_RECLAIM_MASK). In current implementatin, memcg doesn't
- * alloc memory but reclaims memory from all available zones. So, "where I want
- * memory from" bits of gfp_mask has no meaning. So any bits of that field is
- * available but adding a rule is better. charge functions' gfp_mask should
- * be set to GFP_KERNEL or gfp_mask & GFP_RECLAIM_MASK for avoiding ambiguous
- * codes.
- * (Of course, if memcg does memory allocation in future, GFP_KERNEL is sane.)
- */
-
-extern int mem_cgroup_newpage_charge(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask);
-/* for swap handling */
-extern int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-		struct page *page, gfp_t mask, struct mem_cgroup **memcgp);
-extern void mem_cgroup_commit_charge_swapin(struct page *page,
-					struct mem_cgroup *memcg);
-extern void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg);
+int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp);
+void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+			      bool lrucare);
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
+void mem_cgroup_uncharge(struct page *page);
+void mem_cgroup_uncharge_list(struct list_head *page_list);
 
-extern int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
-					gfp_t gfp_mask);
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
+			bool lrucare);
 
 struct lruvec *mem_cgroup_zone_lruvec(struct zone *, struct mem_cgroup *);
 struct lruvec *mem_cgroup_page_lruvec(struct page *, struct zone *);
 
-/* For coalescing uncharge for reducing memcg' overhead*/
-extern void mem_cgroup_uncharge_start(void);
-extern void mem_cgroup_uncharge_end(void);
-
-extern void mem_cgroup_uncharge_page(struct page *page);
-extern void mem_cgroup_uncharge_cache_page(struct page *page);
-
 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg,
 				  struct mem_cgroup *memcg);
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg);
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
+extern struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont);
@@ -101,11 +86,7 @@ bool mm_match_cgroup(const struct mm_struct *mm, const struct mem_cgroup *memcg)
 
 extern struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg);
 
-extern void
-mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
-			     struct mem_cgroup **memcgp);
-extern void mem_cgroup_end_migration(struct mem_cgroup *memcg,
-	struct page *oldpage, struct page *newpage, bool migration_ok);
+unsigned long page_cgroup_ino(struct page *page);
 
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *,
 				   struct mem_cgroup *,
@@ -116,13 +97,18 @@ void mem_cgroup_iter_break(struct mem_cgroup *, struct mem_cgroup *);
  * For memory reclaim.
  */
 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec);
+bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg);
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg);
+bool mem_cgroup_cleancache_disabled(struct page *page);
 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg);
 unsigned long mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list);
 void mem_cgroup_update_lru_size(struct lruvec *, enum lru_list, int);
+extern struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg);
+extern unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg);
+extern void mem_cgroup_note_oom_kill(struct mem_cgroup *memcg,
+				     struct task_struct *task);
 extern void mem_cgroup_print_oom_info(struct mem_cgroup *memcg,
 					struct task_struct *p);
-extern void mem_cgroup_replace_page_cache(struct page *oldpage,
-					struct page *newpage);
 
 static inline void mem_cgroup_oom_enable(void)
 {
@@ -154,6 +140,16 @@ static inline bool mem_cgroup_disabled(void)
 	return false;
 }
 
+static inline void mem_cgroup_get(struct mem_cgroup *memcg)
+{
+	css_get(mem_cgroup_css(memcg));
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+	css_put(mem_cgroup_css(memcg));
+}
+
 void __mem_cgroup_begin_update_page_stat(struct page *page, bool *locked,
 					 unsigned long *flags);
 
@@ -221,46 +217,36 @@ void mem_cgroup_print_bad_page(struct page *page);
 #else /* CONFIG_MEMCG */
 struct mem_cgroup;
 
-static inline int mem_cgroup_newpage_charge(struct page *page,
-					struct mm_struct *mm, gfp_t gfp_mask)
-{
-	return 0;
-}
-
-static inline int mem_cgroup_cache_charge(struct page *page,
-					struct mm_struct *mm, gfp_t gfp_mask)
+static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+					gfp_t gfp_mask,
+					struct mem_cgroup **memcgp)
 {
+	*memcgp = NULL;
 	return 0;
 }
 
-static inline int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-		struct page *page, gfp_t gfp_mask, struct mem_cgroup **memcgp)
-{
-	return 0;
-}
-
-static inline void mem_cgroup_commit_charge_swapin(struct page *page,
-					  struct mem_cgroup *memcg)
-{
-}
-
-static inline void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
+static inline void mem_cgroup_commit_charge(struct page *page,
+					    struct mem_cgroup *memcg,
+					    bool lrucare)
 {
 }
 
-static inline void mem_cgroup_uncharge_start(void)
+static inline void mem_cgroup_cancel_charge(struct page *page,
+					    struct mem_cgroup *memcg)
 {
 }
 
-static inline void mem_cgroup_uncharge_end(void)
+static inline void mem_cgroup_uncharge(struct page *page)
 {
 }
 
-static inline void mem_cgroup_uncharge_page(struct page *page)
+static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
-static inline void mem_cgroup_uncharge_cache_page(struct page *page)
+static inline void mem_cgroup_migrate(struct page *oldpage,
+				      struct page *newpage,
+				      bool lrucare)
 {
 }
 
@@ -281,17 +267,17 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	return NULL;
 }
 
-static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
-{
-	return NULL;
-}
-
 static inline bool mm_match_cgroup(struct mm_struct *mm,
 		struct mem_cgroup *memcg)
 {
 	return true;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+	return NULL;
+}
+
 static inline int task_in_mem_cgroup(struct task_struct *task,
 				     const struct mem_cgroup *memcg)
 {
@@ -304,17 +290,6 @@ static inline struct cgroup_subsys_state
 	return NULL;
 }
 
-static inline void
-mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
-			     struct mem_cgroup **memcgp)
-{
-}
-
-static inline void mem_cgroup_end_migration(struct mem_cgroup *memcg,
-		struct page *oldpage, struct page *newpage, bool migration_ok)
-{
-}
-
 static inline struct mem_cgroup *
 mem_cgroup_iter(struct mem_cgroup *root,
 		struct mem_cgroup *prev,
@@ -333,12 +308,36 @@ static inline bool mem_cgroup_disabled(void)
 	return true;
 }
 
+static inline void mem_cgroup_get(struct mem_cgroup *memcg)
+{
+}
+
+static inline void mem_cgroup_put(struct mem_cgroup *memcg)
+{
+}
+
 static inline int
 mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 {
 	return 1;
 }
 
+static inline bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_low(struct mem_cgroup *root,
+				  struct mem_cgroup *memcg)
+{
+	return false;
+}
+
+static inline bool mem_cgroup_cleancache_disabled(struct page *page)
+{
+	return false;
+}
+
 static inline unsigned long
 mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 {
@@ -351,6 +350,22 @@ mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
 {
 }
 
+static inline struct oom_context *
+mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	return &global_oom_ctx;
+}
+
+static inline unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
+{
+	return 0;
+}
+
+static inline void
+mem_cgroup_note_oom_kill(struct mem_cgroup *memcg, struct task_struct *task)
+{
+}
+
 static inline void
 mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
@@ -410,10 +425,6 @@ static inline
 void mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 }
-static inline void mem_cgroup_replace_page_cache(struct page *oldpage,
-				struct page *newpage)
-{
-}
 #endif /* CONFIG_MEMCG */
 
 #if !defined(CONFIG_MEMCG) || !defined(CONFIG_DEBUG_VM)
@@ -451,7 +462,19 @@ static inline void sock_release_memcg(struct sock *sk)
 #ifdef CONFIG_MEMCG_KMEM
 extern struct static_key memcg_kmem_enabled_key;
 
-extern int memcg_limited_groups_array_size;
+extern int memcg_nr_cache_ids;
+extern void memcg_get_cache_ids(void);
+extern void memcg_put_cache_ids(void);
+
+static inline void memcg_stop_kmem_account(void)
+{
+	current->memcg_kmem_skip_account++;
+}
+
+static inline void memcg_resume_kmem_account(void)
+{
+	current->memcg_kmem_skip_account--;
+}
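Because memcg_kmem_skip_account is a counter rather than a flag, the stop/resume pair nests safely. A sketch (not a call site from this patch) of the usage pattern it enables, bracketing an internal allocation that should bypass per-memcg kmem accounting for the current task:

	static void *alloc_internal_metadata(size_t size)
	{
		void *p;

		memcg_stop_kmem_account();	/* skip_account++ (nests) */
		p = kmalloc(size, GFP_KERNEL);
		memcg_resume_kmem_account();	/* skip_account-- */
		return p;
	}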
 
 /*
  * Helper macro to loop through all memcg-specific caches. Callers must still
@@ -459,13 +482,15 @@ extern int memcg_limited_groups_array_size;
  * the slab_mutex must be held when looping through those caches
  */
 #define for_each_memcg_cache_index(_idx)	\
-	for ((_idx) = 0; (_idx) < memcg_limited_groups_array_size; (_idx)++)
+	for ((_idx) = 0; (_idx) < memcg_nr_cache_ids; (_idx)++)
 
 static inline bool memcg_kmem_enabled(void)
 {
 	return static_key_false(&memcg_kmem_enabled_key);
 }
 
+bool memcg_kmem_is_active(struct mem_cgroup *memcg);
+
 /*
  * In general, we'll do everything in our power to not incur any overhead
  * for non-memcg users for the kmem functions. Not even a function call, if we
@@ -477,31 +502,25 @@ static inline bool memcg_kmem_enabled(void)
  * conditions, but because they are pretty simple, they are expected to be
  * fast.
  */
-bool __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg,
-					int order);
-void __memcg_kmem_commit_charge(struct page *page,
-				       struct mem_cgroup *memcg, int order);
+bool __memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order);
 void __memcg_kmem_uncharge_pages(struct page *page, int order);
 
 int memcg_cache_id(struct mem_cgroup *memcg);
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-			 struct kmem_cache *root_cache);
-void memcg_release_cache(struct kmem_cache *cachep);
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep);
-
-int memcg_update_cache_size(struct kmem_cache *s, int num_groups);
-void memcg_update_array_size(int num_groups);
 
 struct kmem_cache *
 __memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp);
+void __memcg_kmem_put_cache(struct kmem_cache *cachep);
+
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr);
 
-void mem_cgroup_destroy_cache(struct kmem_cache *cachep);
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, unsigned long nr_pages);
+void memcg_charge_kmem_nofail(struct mem_cgroup *memcg, unsigned long nr_pages);
+void memcg_uncharge_kmem(struct mem_cgroup *memcg, unsigned long nr_pages);
 
 /**
  * memcg_kmem_newpage_charge: verify if a new kmem allocation is allowed.
+ * @page: page to charge.
  * @gfp: the gfp allocation flags.
- * @memcg: a pointer to the memcg this was charged against.
  * @order: allocation order.
  *
  * returns true if the memcg where the current task belongs can hold this
@@ -511,10 +530,12 @@ void kmem_cache_destroy_memcg_children(struct kmem_cache *s);
  * any memcg.
  */
 static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order)
 {
 	if (!memcg_kmem_enabled())
 		return true;
+	if (!(gfp & __GFP_ACCOUNT))
+		return true;
 
 	/*
 	 * __GFP_NOFAIL allocations will move on even if charging is not
@@ -522,7 +543,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 	 * unaccounted. We could in theory charge it forcibly, but we hope
 	 * those allocations are rare, and won't be worth the trouble.
 	 */
-	if (!(gfp & __GFP_KMEMCG) || (gfp & __GFP_NOFAIL))
+	if (gfp & __GFP_NOFAIL)
 		return true;
 	if (in_interrupt() || (!current->mm) || (current->flags & PF_KTHREAD))
 		return true;
@@ -531,7 +552,7 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 	if (unlikely(fatal_signal_pending(current)))
 		return true;
 
-	return __memcg_kmem_newpage_charge(gfp, memcg, order);
+	return __memcg_kmem_newpage_charge(page, gfp, order);
 }
 
 /**
@@ -544,44 +565,16 @@ memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
 static inline void
 memcg_kmem_uncharge_pages(struct page *page, int order)
 {
-	if (memcg_kmem_enabled())
+	if (memcg_kmem_enabled() && PageKmemcg(page))
 		__memcg_kmem_uncharge_pages(page, order);
 }
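With the new signature the charge is bound to the page itself (marked via PageKmemcg()) instead of being returned through a memcg pointer, so allocation and free paths pair up as in the sketch below. The function names are illustrative, not the patch's actual call sites:

	static struct page *kmem_alloc_pages_sketch(gfp_t gfp, int order)
	{
		struct page *page = alloc_pages(gfp, order);

		if (page && !memcg_kmem_newpage_charge(page, gfp, order)) {
			__free_pages(page, order);	/* over the memcg limit */
			return NULL;
		}
		return page;	/* PageKmemcg() expected set if charged */
	}

	static void kmem_free_pages_sketch(struct page *page, int order)
	{
		memcg_kmem_uncharge_pages(page, order);	/* no-op without PageKmemcg */
		__free_pages(page, order);
	}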
 
 /**
- * memcg_kmem_commit_charge: embeds correct memcg in a page
- * @page: pointer to struct page recently allocated
- * @memcg: the memcg structure we charged against
- * @order: allocation order.
- *
- * Needs to be called after memcg_kmem_newpage_charge, regardless of success or
- * failure of the allocation. if @page is NULL, this function will revert the
- * charges. Otherwise, it will commit the memcg given by @memcg to the
- * corresponding page_cgroup.
- */
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
-{
-	if (memcg_kmem_enabled() && memcg)
-		__memcg_kmem_commit_charge(page, memcg, order);
-}
-
-/**
  * memcg_kmem_get_cache: selects the correct per-memcg cache for allocation
  * @cachep: the original global kmem cache
  * @gfp: allocation flags.
  *
- * This function assumes that the task allocating, which determines the memcg
- * in the page allocator, belongs to the same cgroup throughout the whole
- * process.  Misacounting can happen if the task calls memcg_kmem_get_cache()
- * while belonging to a cgroup, and later on changes. This is considered
- * acceptable, and should only happen upon task migration.
- *
- * Before the cache is created by the memcg core, there is also a possible
- * imbalance: the task belongs to a memcg, but the cache being allocated from
- * is the global cache, since the child cache is not yet guaranteed to be
- * ready. This case is also fine, since in this case the GFP_KMEMCG will not be
- * passed and the page allocator will not attempt any cgroup accounting.
+ * All memory allocated from a per-memcg cache is charged to the owner memcg.
  */
 static __always_inline struct kmem_cache *
 memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
@@ -597,6 +590,19 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 
 	return __memcg_kmem_get_cache(cachep, gfp);
 }
+
+static __always_inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (memcg_kmem_enabled())
+		__memcg_kmem_put_cache(cachep);
+}
+
+static __always_inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+	if (!memcg_kmem_enabled())
+		return NULL;
+	return __mem_cgroup_from_kmem(ptr);
+}
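The get/put pair brackets an allocation so the per-memcg cache cannot be destroyed while an object is being allocated from it. Roughly, simplified from what a slab hot path would do and eliding the recursion guard a real implementation needs:

	static void *memcg_slab_alloc_sketch(struct kmem_cache *cachep, gfp_t gfp)
	{
		void *obj;

		cachep = memcg_kmem_get_cache(cachep, gfp);	/* maybe per-memcg copy */
		obj = kmem_cache_alloc(cachep, gfp);
		memcg_kmem_put_cache(cachep);			/* release the pin */
		return obj;
	}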
 #else
 #define for_each_memcg_cache_index(_idx)	\
 	for (; NULL; )
@@ -606,8 +612,13 @@ static inline bool memcg_kmem_enabled(void)
 	return false;
 }
 
+static inline bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+{
+	return false;
+}
+
 static inline bool
-memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **memcg, int order)
+memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order)
 {
 	return true;
 }
@@ -616,29 +627,24 @@ static inline void memcg_kmem_uncharge_pages(struct page *page, int order)
 {
 }
 
-static inline void
-memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, int order)
+static inline int memcg_cache_id(struct mem_cgroup *memcg)
 {
+	return -1;
 }
 
-static inline int memcg_cache_id(struct mem_cgroup *memcg)
+static inline void memcg_get_cache_ids(void)
 {
-	return -1;
 }
 
-static inline int
-memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-		     struct kmem_cache *root_cache)
+static inline void memcg_put_cache_ids(void)
 {
-	return 0;
 }
 
-static inline void memcg_release_cache(struct kmem_cache *cachep)
+static inline void memcg_stop_kmem_account(void)
 {
 }
 
-static inline void memcg_cache_list_add(struct mem_cgroup *memcg,
-					struct kmem_cache *s)
+static inline void memcg_resume_kmem_account(void)
 {
 }
 
@@ -648,9 +654,14 @@ memcg_kmem_get_cache(struct kmem_cache *cachep, gfp_t gfp)
 	return cachep;
 }
 
-static inline void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
+static inline void memcg_kmem_put_cache(struct kmem_cache *cachep)
 {
 }
+
+static inline struct mem_cgroup *mem_cgroup_from_kmem(void *ptr)
+{
+	return NULL;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 #endif /* _LINUX_MEMCONTROL_H */
 
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -82,7 +82,9 @@ extern int overcommit_kbytes_handler(struct ctl_table *, int, void __user *,
  * mmap() functions).
  */
 
-extern struct kmem_cache *vm_area_cachep;
+extern struct kmem_cache *__vm_area_cachep;
+#define allocate_vma(mm, gfp_flags)	kmem_cache_alloc(__vm_area_cachep, gfp_flags)
+#define free_vma(mm, vma)		kmem_cache_free(__vm_area_cachep, vma)
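Both macros take the mm even though these definitions ignore it, presumably so a beancounter-aware implementation can charge the allocation to the owning mm without touching every caller. A sketch of the symmetrical usage the interface expects:

	static struct vm_area_struct *vma_dup_sketch(struct mm_struct *mm,
						     struct vm_area_struct *old)
	{
		struct vm_area_struct *new = allocate_vma(mm, GFP_KERNEL);

		if (new)
			*new = *old;	/* caller still fixes up links, anon_vma, ... */
		return new;		/* released later with free_vma(mm, new) */
	}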
 
 #ifndef CONFIG_MMU
 extern struct rb_root nommu_region_tree;
@@ -127,7 +129,6 @@ extern unsigned int kobjsize(const void *objp);
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
 #define VM_NORESERVE	0x00200000	/* should the VM suppress accounting */
 #define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
-#define VM_NONLINEAR	0x00800000	/* Is non-linear (remap_file_pages) */
 #define VM_ARCH_1	0x01000000	/* Architecture-specific flag */
 #define VM_ARCH_2	0x02000000
 #define VM_DONTDUMP	0x04000000	/* Do not include in the core dump */
@@ -207,21 +208,19 @@ extern unsigned int kobjsize(const void *objp);
 extern pgprot_t protection_map[16];
 
 #define FAULT_FLAG_WRITE	0x01	/* Fault was a write access */
-#define FAULT_FLAG_NONLINEAR	0x02	/* Fault was via a nonlinear mapping */
-#define FAULT_FLAG_MKWRITE	0x04	/* Fault was mkwrite of existing pte */
-#define FAULT_FLAG_ALLOW_RETRY	0x08	/* Retry fault if blocking */
-#define FAULT_FLAG_RETRY_NOWAIT	0x10	/* Don't drop mmap_sem and wait when retrying */
-#define FAULT_FLAG_KILLABLE	0x20	/* The fault task is in SIGKILL killable region */
-#define FAULT_FLAG_TRIED	0x40	/* second try */
-#define FAULT_FLAG_USER		0x80	/* The fault originated in userspace */
+#define FAULT_FLAG_MKWRITE	0x02	/* Fault was mkwrite of existing pte */
+#define FAULT_FLAG_ALLOW_RETRY	0x04	/* Retry fault if blocking */
+#define FAULT_FLAG_RETRY_NOWAIT	0x08	/* Don't drop mmap_sem and wait when retrying */
+#define FAULT_FLAG_KILLABLE	0x10	/* The fault task is in SIGKILL killable region */
+#define FAULT_FLAG_TRIED	0x20	/* Second try */
+#define FAULT_FLAG_USER		0x40	/* The fault originated in userspace */
 
 /*
  * vm_fault is filled by the pagefault handler and passed to the vma's
  * ->fault function. The vma's ->fault is responsible for returning a bitmask
  * of VM_FAULT_xxx flags that give details about how the fault was handled.
  *
- * pgoff should be used in favour of virtual_address, if possible. If pgoff
- * is used, one may implement ->remap_pages to get nonlinear mapping support.
+ * pgoff should be used in favour of virtual_address, if possible.
  */
 struct vm_fault {
 	unsigned int flags;		/* FAULT_FLAG_xxx flags */
@@ -282,10 +281,6 @@ struct vm_operations_struct {
 	int (*migrate)(struct vm_area_struct *vma, const nodemask_t *from,
 		const nodemask_t *to, unsigned long flags);
 #endif
-	/* called by sys_remap_file_pages() to populate non-linear mapping */
-	int (*remap_pages)(struct vm_area_struct *vma, unsigned long addr,
-			   unsigned long size, pgoff_t pgoff);
-
 	/* same as page_mkwrite when using VM_PFNMAP|VM_MIXEDMAP */
 	RH_KABI_EXTEND(int (*pfn_mkwrite)(struct vm_area_struct *vma, struct vm_fault *vmf))
 	RH_KABI_EXTEND(int (*pmd_fault)(struct vm_area_struct *,
@@ -489,53 +484,6 @@ static inline struct page *virt_to_head_page(const void *x)
 	return compound_head(page);
 }
 
-/*
- * PageBuddy() indicate that the page is free and in the buddy system
- * (see mm/page_alloc.c).
- *
- * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
- * -2 so that an underflow of the page_mapcount() won't be mistaken
- * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
- * efficiently by most CPU architectures.
- */
-#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
-
-static inline int PageBuddy(struct page *page)
-{
-	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBuddy(struct page *page)
-{
-	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
-	atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBuddy(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageBuddy(page), page);
-	atomic_set(&page->_mapcount, -1);
-}
-
-#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
-
-static inline int PageBalloon(struct page *page)
-{
-	return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
-}
-
-static inline void __SetPageBalloon(struct page *page)
-{
-	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
-	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
-}
-
-static inline void __ClearPageBalloon(struct page *page)
-{
-	VM_BUG_ON_PAGE(!PageBalloon(page), page);
-	atomic_set(&page->_mapcount, -1);
-}
-
 void put_pages_list(struct list_head *pages);
 
 void split_page(struct page *page, unsigned int order);
@@ -1146,7 +1094,6 @@ extern void user_shm_unlock(size_t, struct user_struct *);
  * Parameter block passed down to zap_pte_range in exceptional cases.
  */
 struct zap_details {
-	struct vm_area_struct *nonlinear_vma;	/* Check page->index if set */
 	struct address_space *check_mapping;	/* Check page->mapping if set */
 	pgoff_t	first_index;			/* Lowest page->index to unmap */
 	pgoff_t last_index;			/* Highest page->index to unmap */
@@ -1249,6 +1196,7 @@ static inline int fixup_user_fault(struct task_struct *tsk,
 }
 #endif
 
+extern unsigned long vma_address(struct page *page, struct vm_area_struct *vma);
 extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write);
 extern int access_remote_vm(struct mm_struct *mm, unsigned long addr,
 		void *buf, int len, int write);
@@ -1292,37 +1240,10 @@ int redirty_page_for_writepage(struct writeback_control *wbc,
 void account_page_dirtied(struct page *page, struct address_space *mapping);
 void account_page_writeback(struct page *page);
 int set_page_dirty(struct page *page);
+int set_page_dirty_mm(struct page *page, struct mm_struct *mm);
 int set_page_dirty_lock(struct page *page);
 int clear_page_dirty_for_io(struct page *page);
 
-/* Is the vma a continuation of the stack vma above it? */
-static inline int vma_growsdown(struct vm_area_struct *vma, unsigned long addr)
-{
-	return vma && (vma->vm_end == addr) && (vma->vm_flags & VM_GROWSDOWN);
-}
-
-static inline int stack_guard_page_start(struct vm_area_struct *vma,
-					     unsigned long addr)
-{
-	return (vma->vm_flags & VM_GROWSDOWN) &&
-		(vma->vm_start == addr) &&
-		!vma_growsdown(vma->vm_prev, addr);
-}
-
-/* Is the vma a continuation of the stack vma below it? */
-static inline int vma_growsup(struct vm_area_struct *vma, unsigned long addr)
-{
-	return vma && (vma->vm_start == addr) && (vma->vm_flags & VM_GROWSUP);
-}
-
-static inline int stack_guard_page_end(struct vm_area_struct *vma,
-					   unsigned long addr)
-{
-	return (vma->vm_flags & VM_GROWSUP) &&
-		(vma->vm_end == addr) &&
-		!vma_growsup(vma->vm_next, addr);
-}
-
 extern pid_t
 vm_is_stack(struct task_struct *task, struct vm_area_struct *vma, int in_group);
 
@@ -1841,12 +1762,6 @@ struct vm_area_struct *vma_interval_tree_iter_next(struct vm_area_struct *node,
 	for (vma = vma_interval_tree_iter_first(root, start, last);	\
 	     vma; vma = vma_interval_tree_iter_next(vma, start, last))
 
-static inline void vma_nonlinear_insert(struct vm_area_struct *vma,
-					struct list_head *list)
-{
-	list_add_tail(&vma->shared.nonlinear, list);
-}
-
 void anon_vma_interval_tree_insert(struct anon_vma_chain *node,
 				   struct rb_root *root);
 void anon_vma_interval_tree_remove(struct anon_vma_chain *node,
@@ -1978,6 +1893,14 @@ extern void truncate_inode_pages_final(struct address_space *);
 extern int filemap_fault(struct vm_area_struct *, struct vm_fault *);
 extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf);
 
+struct path;
+struct cred;
+int open_mapping_peer(struct address_space *mapping,
+		struct path *path, const struct cred *cred);
+void close_mapping_peer(struct address_space *mapping);
+struct page *pick_peer_page(struct address_space *mapping, pgoff_t index,
+		struct file_ra_state *ra, unsigned ra_size);
+
 /* mm/page-writeback.c */
 int write_one_page(struct page *page, int wait);
 void task_dirty_inc(struct task_struct *tsk);
@@ -2006,6 +1929,7 @@ unsigned long ra_submit(struct file_ra_state *ra,
 			struct address_space *mapping,
 			struct file *filp);
 
+extern unsigned long stack_guard_gap;
 /* Generic expand stack which grows the stack according to GROWS{UP,DOWN} */
 extern int expand_stack(struct vm_area_struct *vma, unsigned long address);
 
@@ -2034,6 +1958,30 @@ static inline struct vm_area_struct * find_vma_intersection(struct mm_struct * m
 	return vma;
 }
 
+static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
+{
+	unsigned long vm_start = vma->vm_start;
+
+	if (vma->vm_flags & VM_GROWSDOWN) {
+		vm_start -= stack_guard_gap;
+		if (vm_start > vma->vm_start)
+			vm_start = 0;
+	}
+	return vm_start;
+}
+
+static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
+{
+	unsigned long vm_end = vma->vm_end;
+
+	if (vma->vm_flags & VM_GROWSUP) {
+		vm_end += stack_guard_gap;
+		if (vm_end < vma->vm_end)
+			vm_end = -PAGE_SIZE;
+	}
+	return vm_end;
+}
+
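Note the saturation in both helpers: an underflowing vm_start is clamped to 0 and an overflowing vm_end to -PAGE_SIZE, so callers can compare plain addresses without worrying about wraparound. A sketch of the kind of check the unmapped-area search performs with them:

	/* A candidate range [addr, addr + len) is only usable if it ends
	 * before the gap-extended start of the following vma. */
	static bool fits_before_next_vma(struct vm_area_struct *next,
					 unsigned long addr, unsigned long len)
	{
		return next == NULL || addr + len <= vm_start_gap(next);
	}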
 static inline unsigned long vma_pages(struct vm_area_struct *vma)
 {
 	return (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
@@ -2150,18 +2098,24 @@ int drop_caches_sysctl_handler(struct ctl_table *, int,
 					void __user *, size_t *, loff_t *);
 #endif
 
-unsigned long shrink_slab(struct shrink_control *shrink,
-			  unsigned long nr_pages_scanned,
-			  unsigned long lru_pages);
+void drop_slab(void);
+void drop_slab_node(int nid);
 
 #ifndef CONFIG_MMU
 #define randomize_va_space 0
 #else
-extern int randomize_va_space;
+extern int _randomize_va_space;
+#ifndef CONFIG_VE
+#define randomize_va_space _randomize_va_space
+#else
+#define randomize_va_space (get_exec_env()->_randomize_va_space)
+#endif
 #endif
 
 const char * arch_vma_name(struct vm_area_struct *vma);
-void print_vma_addr(char *prefix, unsigned long rip);
+void ve_print_vma_addr(int dst, char *prefix, unsigned long rip);
+#define print_vma_addr(prefix, rip) \
+	ve_print_vma_addr(VE0_LOG, (prefix), (rip))
 
 void sparse_mem_maps_populate_node(struct page **map_map,
 				   unsigned long pnum_begin,
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -25,6 +25,7 @@
 
 struct address_space;
 struct hmm;
+struct gang;
 
 #define USE_SPLIT_PTE_PTLOCKS	(NR_CPUS >= CONFIG_SPLIT_PTLOCK_CPUS)
 #define USE_SPLIT_PMD_PTLOCKS	(USE_SPLIT_PTE_PTLOCKS && \
@@ -306,15 +307,11 @@ struct vm_area_struct {
 
 	/*
 	 * For areas with an address space and backing store,
-	 * linkage into the address_space->i_mmap interval tree, or
-	 * linkage of vma in the address_space->i_mmap_nonlinear list.
+	 * linkage into the address_space->i_mmap interval tree.
 	 */
-	union {
-		struct {
-			struct rb_node rb;
-			unsigned long rb_subtree_last;
-		} linear;
-		struct list_head nonlinear;
+	struct {
+		struct rb_node rb;
+		unsigned long rb_subtree_last;
 	} shared;
 
 	/*
@@ -335,6 +332,15 @@ struct vm_area_struct {
 					   units, *not* PAGE_CACHE_SIZE */
 	struct file * vm_file;		/* File we map to (can be NULL). */
 	void * vm_private_data;		/* was vm_pte (shared mem) */
+	/*
+	 * Special for pfcache - we can't reuse vm_private_data to save
+	 * memory: the field is inherited on fork, and to distinguish
+	 * whether it is ours (and needs clearing) we would still need
+	 * some kind of flag on the address_space of these vmas.  Too
+	 * few free flags are left, so that would imply adding yet
+	 * another variable anyway.
+	 */
+	void * vm_private_data2;
 
 #ifndef CONFIG_MMU
 	struct vm_region *vm_region;	/* NOMMU mapping region */
@@ -457,6 +463,11 @@ struct mm_struct {
 
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
+	unsigned int vps_dumpable:2;
+
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *mm_ub;
+#endif
 	struct core_state *core_state; /* coredumping support */
 #ifdef CONFIG_AIO
 	spinlock_t		ioctx_lock;
@@ -543,6 +554,12 @@ struct mm_struct {
 	RH_KABI_RESERVE(8)
 };
 
+#define VD_VE_ENTER_TASK	0	/* tasks that entered the VE from the
+					 * host; no ptrace, coredump or licdata
+					 * access allowed */
+#define VD_PTRACE_COREDUMP	1	/* tasks with ptrace and coredump allowed */
+#define VD_LICDATA_ACCESS	2	/* tasks that accessed the container's
+					 * license data; no ptrace and no
+					 * coredump allowed */
+
 static inline void mm_init_cpumask(struct mm_struct *mm)
 {
 #ifdef CONFIG_CPUMASK_OFFSTACK
--- a/include/linux/mmu_notifier.h
+++ b/include/linux/mmu_notifier.h
@@ -65,6 +65,15 @@ struct mmu_notifier_ops {
 				 unsigned long address);
 
 	/*
+	 * clear_young is a lightweight version of clear_flush_young. Like the
+	 * latter, it is supposed to test-and-clear the young/accessed bitflag
+	 * in the secondary pte, but it may omit flushing the secondary tlb.
+	 */
+	int (*clear_young)(struct mmu_notifier *mn,
+			   struct mm_struct *mm,
+			   unsigned long address);
+
+	/*
 	 * test_young is called to check the young/accessed bitflag in
 	 * the secondary pte. This is used to know if the page is
 	 * frequently used without actually clearing the flag or tearing
@@ -240,6 +249,8 @@ extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
 extern void __mmu_notifier_release(struct mm_struct *mm);
 extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 					  unsigned long address);
+extern int __mmu_notifier_clear_young(struct mm_struct *mm,
+				      unsigned long address);
 extern int __mmu_notifier_test_young(struct mm_struct *mm,
 				     unsigned long address);
 extern void __mmu_notifier_change_pte(struct mm_struct *mm,
@@ -268,6 +279,14 @@ static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return 0;
 }
 
+static inline int mmu_notifier_clear_young(struct mm_struct *mm,
+					   unsigned long address)
+{
+	if (mm_has_notifiers(mm))
+		return __mmu_notifier_clear_young(mm, address);
+	return 0;
+}
+
 static inline int mmu_notifier_test_young(struct mm_struct *mm,
 					  unsigned long address)
 {
@@ -382,6 +401,26 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 	___pmd;								\
 })
 
+#define ptep_clear_young_notify(__vma, __address, __ptep)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address);	\
+	__young;							\
+})
+
+#define pmdp_clear_young_notify(__vma, __address, __pmdp)		\
+({									\
+	int __young;							\
+	struct vm_area_struct *___vma = __vma;				\
+	unsigned long ___address = __address;				\
+	__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
+	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address);	\
+	__young;							\
+})
+
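A sketch of how an rmap walker would use these. Unlike the _flush_ variants, a stale TLB entry may keep reporting the mapping as young for a while after the clear; that is acceptable for access sampling, but not for correctness-critical paths:

	static bool mapping_referenced_sketch(struct vm_area_struct *vma,
					      unsigned long addr,
					      pte_t *ptep, pmd_t *pmdp)
	{
		if (pmdp)	/* transparent-huge-page mapping */
			return pmdp_clear_young_notify(vma, addr, pmdp);
		return ptep_clear_young_notify(vma, addr, ptep);
	}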
 /*
  * set_pte_at_notify() sets the pte _after_ running the notifier.
  * This is safe to start by updating the secondary MMUs, because the primary MMU
@@ -462,6 +501,8 @@ static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
 #define	ptep_clear_flush_notify ptep_clear_flush
 #define pmdp_clear_flush_notify pmdp_clear_flush
 #define pmdp_get_and_clear_notify pmdp_get_and_clear
+#define ptep_clear_young_notify ptep_test_and_clear_young
+#define pmdp_clear_young_notify pmdp_test_and_clear_young
 #define set_pte_at_notify set_pte_at
 
 #endif /* CONFIG_MMU_NOTIFIER */
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -16,6 +16,9 @@
 #include <linux/nodemask.h>
 #include <linux/pageblock-flags.h>
 #include <linux/page-flags-layout.h>
+#include <linux/workqueue.h>
+#include <linux/mutex.h>
+#include <generated/bounds.h>
 #include <linux/atomic.h>
 #include <asm/page.h>
 
@@ -373,7 +376,6 @@ struct zone {
 	 * free areas of different sizes
 	 */
 	spinlock_t		lock;
-	int                     all_unreclaimable; /* All pages pinned */
 #if defined CONFIG_COMPACTION || defined CONFIG_CMA
 	/* Set to true when the PG_migrate_skip bits should be cleared */
 	bool			compact_blockskip_flush;
@@ -428,6 +430,10 @@ struct zone {
 	 */
 	unsigned int inactive_ratio;
 
+#ifdef CONFIG_MEMCG
+	bool force_scan;
+#endif
+
 
 	ZONE_PADDING(_pad2_)
 	/* Rarely used or read-mostly fields */
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -84,7 +84,7 @@ void trim_init_extable(struct module *m);
 
 #ifdef MODULE
 #define MODULE_GENERIC_TABLE(gtype,name)			\
-extern const struct gtype##_id __mod_##gtype##_table		\
+extern const typeof(name) __mod_##gtype##_table			\
   __attribute__ ((unused, alias(__stringify(name))))
 
 #else  /* !MODULE */
@@ -377,9 +377,6 @@ struct module
 	/* What modules do I depend on? */
 	struct list_head target_list;
 
-	/* Who is waiting for us to be unloaded */
-	struct task_struct *waiter;
-
 	/* Destruction function. */
 	void (*exit)(void);
 
--- a/include/linux/moduleloader.h
+++ b/include/linux/moduleloader.h
@@ -80,4 +80,11 @@ int module_finalize(const Elf_Ehdr *hdr,
 /* Any cleanup needed when module leaves. */
 void module_arch_cleanup(struct module *mod);
 
+#ifdef CONFIG_KASAN
+#include <linux/kasan.h>
+#define MODULE_ALIGN (PAGE_SIZE << KASAN_SHADOW_SCALE_SHIFT)
+#else
+#define MODULE_ALIGN PAGE_SIZE
+#endif
+
 #endif
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -42,7 +42,9 @@ struct mnt_namespace;
  * flag, consider how it interacts with shared mounts.
  */
 #define MNT_SHARED_MASK	(MNT_UNBINDABLE)
-#define MNT_PROPAGATION_MASK	(MNT_SHARED | MNT_UNBINDABLE)
+#define MNT_USER_SETTABLE_MASK  (MNT_NOSUID | MNT_NODEV | MNT_NOEXEC \
+				 | MNT_NOATIME | MNT_NODIRATIME | MNT_RELATIME \
+				 | MNT_READONLY)
 
 #define MNT_INTERNAL_FLAGS (MNT_SHARED | MNT_WRITE_HOLD | MNT_INTERNAL | \
 			    MNT_MARKED)
@@ -60,6 +62,12 @@ struct vfsmount {
 	int mnt_flags;
 };
 
+struct mountpoint {
+	struct list_head m_hash;
+	struct dentry *m_dentry;
+	int m_count;
+};
+
 struct file; /* forward dec */
 
 extern int mnt_want_write(struct vfsmount *mnt);
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -247,6 +247,11 @@ do {								\
 	net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__)
 #define net_dbg_ratelimited(fmt, ...)				\
 	net_ratelimited_function(pr_debug, fmt, ##__VA_ARGS__)
+#define net_velog_ratelimited(fmt, ...)				\
+	net_ratelimited_function(ve_printk, VE_LOG, fmt, ##__VA_ARGS__)
+#define net_veboth_ratelimited(fmt, ...)				\
+	net_ratelimited_function(ve_printk, VE_LOG_BOTH, fmt, ##__VA_ARGS__)
+
 
 #define net_random()		prandom_u32()
 #define net_srandom(seed)	prandom_seed((__force u32)(seed))
--- a/include/linux/netdev_features.h
+++ b/include/linux/netdev_features.h
@@ -92,6 +92,9 @@ enum {
 	NETIF_F_HW_L2FW_DOFFLOAD_BIT,	/* Allow L2 Forwarding in Hardware */
 
 	NETIF_F_HW_TC_BIT,		/* Offload TC infrastructure */
+	NETIF_F_VENET_BIT,		/* device is venet device */
+	NETIF_F_VIRTUAL_BIT,		/* can be registered inside VE */
+	NETIF_F_FIXED_ADDR_BIT,
 
 	/*
 	 * Add your fresh new feature above and remember to update
@@ -153,6 +156,9 @@ enum {
 #define NETIF_F_HW_L2FW_DOFFLOAD	__NETIF_F(HW_L2FW_DOFFLOAD)
 #define NETIF_F_BUSY_POLL	__NETIF_F(BUSY_POLL)
 #define NETIF_F_HW_TC		__NETIF_F(HW_TC)
+#define NETIF_F_VENET		__NETIF_F(VENET)
+#define NETIF_F_VIRTUAL		__NETIF_F(VIRTUAL)
+#define NETIF_F_FIXED_ADDR	__NETIF_F(FIXED_ADDR)
 
 #define for_each_netdev_feature(mask_addr, bit)	\
 	for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -294,7 +294,6 @@ enum netdev_state_t {
 	__LINK_STATE_DORMANT,
 };
 
-
 /*
  * This structure holds at boot time configured netdevice settings. They
  * are then used in the device probing.
@@ -760,6 +759,11 @@ struct netdev_tc_txq {
 	u16 offset;
 };
 
+struct cpt_context;
+struct cpt_ops;
+struct rst_ops;
+struct cpt_netdev_image;
+
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 /*
  * This structure is to hold information about the device
@@ -1612,6 +1616,7 @@ struct net_device {
 						   because most packets are
 						   unicast) */
 
+	unsigned char		is_leaked;
 
 #ifdef CONFIG_RPS
 	struct netdev_rx_queue	*_rx;
@@ -1821,6 +1826,20 @@ struct net_device {
 };
 #define to_net_dev(d) container_of(d, struct net_device, dev)
 
+#define NETDEV_HASHBITS	8
+#define NETDEV_HASHENTRIES (1 << NETDEV_HASHBITS)
+
+static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
+{
+	unsigned hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
+	return &net->dev_name_head[hash & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
+static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
+{
+	return &net->dev_index_head[ifindex & ((1 << NETDEV_HASHBITS) - 1)];
+}
+
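Moving these hash helpers into the header exposes the same buckets the core networking code uses. A lookup over the name hash would look roughly like the sketch below, mirroring __dev_get_by_name(); the caller is assumed to hold RCU or the RTNL lock:

	static struct net_device *dev_find_by_name_sketch(struct net *net,
							  const char *name)
	{
		struct net_device *dev;

		hlist_for_each_entry(dev, dev_name_hash(net, name), name_hlist)
			if (!strncmp(dev->name, name, IFNAMSIZ))
				return dev;
		return NULL;
	}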
 #define	NETDEV_ALIGN		32
 
 static inline
@@ -3130,7 +3149,6 @@ static inline void dev_consume_skb_any(struct sk_buff *skb)
 
 int netif_rx(struct sk_buff *skb);
 int netif_rx_ni(struct sk_buff *skb);
-int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb);
 int netif_receive_skb(struct sk_buff *skb);
 gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb);
 void napi_gro_flush(struct napi_struct *napi, bool flush_old);
@@ -3865,6 +3883,18 @@ netdev_features_t passthru_features_check(struct sk_buff *skb,
 					  netdev_features_t features);
 netdev_features_t netif_skb_features(struct sk_buff *skb);
 
+#if defined(CONFIG_VE) && defined(CONFIG_NET)
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+	return !(dev->features & (NETIF_F_VIRTUAL | NETIF_F_NETNS_LOCAL));
+}
+#else
+static inline int ve_is_dev_movable(struct net_device *dev)
+{
+	return 0;
+}
+#endif
+
 static inline bool net_gso_ok(netdev_features_t features, int gso_type)
 {
 	netdev_features_t feature = gso_type & SKB_GSO1_MASK;
--- a/include/linux/netfilter.h
+++ b/include/linux/netfilter.h
@@ -407,4 +407,32 @@ static inline void nf_ct_attach(struct sk_buff *new, struct sk_buff *skb) {}
  */
 DECLARE_PER_CPU(bool, nf_skb_duplicated);
 
+#ifdef CONFIG_VE_IPTABLES
+#include <linux/vziptable_defs.h>
+
+#define net_ipt_permitted(netns, ipt)					\
+	(mask_ipt_allow((netns)->owner_ve->ipt_mask, ipt))
+
+#define net_ipt_module_set(netns, ipt)					\
+	({								\
+		(netns)->_iptables_modules |= ipt##_MOD;	\
+	})
+
+#define net_ipt_module_clear(netns, ipt)				\
+	({								\
+		(netns)->_iptables_modules &= ~ipt##_MOD;	\
+	})
+
+#define net_is_ipt_module_set(netns, ipt)				\
+	((netns)->_iptables_modules & (ipt##_MOD))
+
+#else /* CONFIG_VE_IPTABLES */
+
+#define net_ipt_permitted(netns, ipt)		(1)
+#define net_is_ipt_module_set(netns, ipt)	(1)
+#define net_ipt_module_set(netns, ipt)
+#define net_ipt_module_clear(netns, ipt)
+
+#endif /* CONFIG_VE_IPTABLES */
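A sketch of the intended flow in a table module's per-netns init: consult the VE permission mask first, then record the module as loaded for this netns. VE_IP_FILTER is a placeholder for whichever identifier vziptable_defs.h actually defines (the matching _MOD define is assumed by the set/clear macros):

	static int xt_table_net_init_sketch(struct net *net)
	{
		if (!net_ipt_permitted(net, VE_IP_FILTER))
			return 0;	/* table silently absent inside this VE */

		/* ... register the table ... */
		net_ipt_module_set(net, VE_IP_FILTER);
		return 0;
	}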
+
 #endif /*__LINUX_NETFILTER_H*/
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -147,7 +147,8 @@ struct ip_set_type {
 	u8 revision_min, revision_max;
 
 	/* Create set */
-	int (*create)(struct ip_set *set, struct nlattr *tb[], u32 flags);
+	int (*create)(struct net *net, struct ip_set *set,
+		      struct nlattr *tb[], u32 flags);
 
 	/* Attribute policies */
 	const struct nla_policy create_policy[IPSET_ATTR_CREATE_MAX + 1];
@@ -248,11 +249,12 @@ ip_set_init_counter(struct ip_set_counter *counter,
 }
 
 /* register and unregister set references */
-extern ip_set_id_t ip_set_get_byname(const char *name, struct ip_set **set);
-extern void ip_set_put_byindex(ip_set_id_t index);
-extern const char *ip_set_name_byindex(ip_set_id_t index);
-extern ip_set_id_t ip_set_nfnl_get_byindex(ip_set_id_t index);
-extern void ip_set_nfnl_put(ip_set_id_t index);
+extern ip_set_id_t ip_set_get_byname(struct net *net,
+				     const char *name, struct ip_set **set);
+extern void ip_set_put_byindex(struct net *net, ip_set_id_t index);
+extern const char *ip_set_name_byindex(struct net *net, ip_set_id_t index);
+extern ip_set_id_t ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index);
+extern void ip_set_nfnl_put(struct net *net, ip_set_id_t index);
 
 /* API for iptables set match, and SET target */
 
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -224,6 +224,10 @@ struct xt_table_info {
 	unsigned int hook_entry[NF_INET_NUMHOOKS];
 	unsigned int underflow[NF_INET_NUMHOOKS];
 
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ub;
+#endif
+
 	/*
 	 * Number of user chains. Since tables cannot have loops, at most
 	 * @stacksize jumps (number of user chains) can possibly be made.
@@ -504,4 +508,21 @@ int xt_compat_check_entry_offsets(const void *base, const char *elems,
 				  unsigned int next_offset);
 
 #endif /* CONFIG_COMPAT */
+
+#ifdef CONFIG_VE
+static inline bool ve_xt_table_forbidden(struct xt_table *xt)
+{
+	/*
+	 * The only purpose of having this check as a separate
+	 * helper is "grep"-ability.
+	 *
+	 * If this helper fires, it means that a VE has been
+	 * configured without support for the particular xt_table.
+	 */
+	return xt == NULL;
+}
+#else
+static inline bool ve_xt_table_forbidden(struct xt_table *xt) { return true; }
+#endif
+
 #endif /* _X_TABLES_H */
--- a/include/linux/notifier.h
+++ b/include/linux/notifier.h
@@ -156,8 +156,9 @@ extern int __srcu_notifier_call_chain(struct srcu_notifier_head *nh,
 
 #define NOTIFY_DONE		0x0000		/* Don't care */
 #define NOTIFY_OK		0x0001		/* Suits me */
+#define NOTIFY_FAIL		0x0002		/* Reject */
 #define NOTIFY_STOP_MASK	0x8000		/* Don't call further */
-#define NOTIFY_BAD		(NOTIFY_STOP_MASK|0x0002)
+#define NOTIFY_BAD		(NOTIFY_STOP_MASK|NOTIFY_FAIL)
 						/* Bad/Veto action */
 /*
  * Clean way to return from the notifier and stop further calls.
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -73,9 +73,10 @@ static inline void put_nsproxy(struct nsproxy *ns)
 	}
 }
 
-static inline void get_nsproxy(struct nsproxy *ns)
+static inline struct nsproxy *get_nsproxy(struct nsproxy *ns)
 {
 	atomic_inc(&ns->count);
+	return ns;
 }
 
 #endif
--- a/include/linux/oom.h
+++ b/include/linux/oom.h
@@ -6,6 +6,8 @@
 #include <linux/types.h>
 #include <linux/nodemask.h>
 #include <uapi/linux/oom.h>
+#include <linux/spinlock_types.h>
+#include <linux/wait.h>
 
 struct zonelist;
 struct notifier_block;
@@ -25,10 +27,23 @@ enum oom_constraint {
 enum oom_scan_t {
 	OOM_SCAN_OK,		/* scan thread and find its badness */
 	OOM_SCAN_CONTINUE,	/* do not consider thread for oom kill */
-	OOM_SCAN_ABORT,		/* abort the iteration and return */
 	OOM_SCAN_SELECT,	/* always select this thread first */
 };
 
+struct oom_context {
+	struct task_struct *owner;
+	struct task_struct *victim;
+	bool marked;
+	unsigned long oom_start;
+	unsigned long oom_end;
+	unsigned long overdraft;
+	int rage;
+	wait_queue_head_t waitq;
+};
+
+extern void init_oom_context(struct oom_context *ctx);
+extern void release_oom_context(struct oom_context *ctx);
+
 /* Thread is the potential origin of an oom condition; kill first on oom */
 #define OOM_FLAG_ORIGIN		((__force oom_flags_t)0x1)
 
@@ -47,26 +62,50 @@ static inline bool oom_task_origin(const struct task_struct *p)
 	return !!(p->signal->oom_flags & OOM_FLAG_ORIGIN);
 }
 
+/* linux/mm/oom_group.c */
+extern int get_task_oom_score_adj(struct task_struct *t);
+
+extern void mark_oom_victim(struct task_struct *tsk);
+
 extern unsigned long oom_badness(struct task_struct *p,
 		struct mem_cgroup *memcg, const nodemask_t *nodemask,
-		unsigned long totalpages);
+		unsigned long totalpages, unsigned long *overdraft);
+
+static inline bool oom_worse(unsigned long points, unsigned long overdraft,
+		unsigned long *chosen_points, unsigned long *max_overdraft)
+{
+	if (overdraft > *max_overdraft) {
+		*max_overdraft = overdraft;
+		*chosen_points = points;
+		return true;
+	}
+	if (overdraft == *max_overdraft && points > *chosen_points) {
+		*chosen_points = points;
+		return true;
+	}
+	return false;
+}
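oom_worse() encodes the new selection order: overdraft dominates, and badness points only break ties between equal overdrafts. The loop it is designed for looks roughly like this sketch (task iteration details, locking and OOM_SCAN_* handling elided):

	static struct task_struct *pick_oom_victim_sketch(struct mem_cgroup *memcg,
							  unsigned long totalpages)
	{
		struct task_struct *p, *chosen = NULL;
		unsigned long points, overdraft;
		unsigned long chosen_points = 0, max_overdraft = 0;

		for_each_process(p) {
			points = oom_badness(p, memcg, NULL, totalpages,
					     &overdraft);
			if (oom_worse(points, overdraft,
				      &chosen_points, &max_overdraft))
				chosen = p;
		}
		return chosen;
	}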
+
 extern void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-			     unsigned int points, unsigned long totalpages,
+			     unsigned long points, unsigned long overdraft,
+			     unsigned long totalpages,
 			     struct mem_cgroup *memcg, nodemask_t *nodemask,
 			     const char *message);
 
-extern int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
-extern void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_flags);
+extern bool oom_trylock(struct mem_cgroup *memcg);
+extern void oom_unlock(struct mem_cgroup *memcg);
 
 extern void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
 			       int order, const nodemask_t *nodemask);
 
 extern enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill);
+					       const nodemask_t *nodemask);
 
 extern void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		int order, nodemask_t *mask, bool force_kill);
+			  int order, nodemask_t *mask);
+
+extern void exit_oom_victim(void);
+
 extern int register_oom_notifier(struct notifier_block *nb);
 extern int unregister_oom_notifier(struct notifier_block *nb);
 
@@ -88,4 +127,5 @@ extern struct task_struct *find_lock_task_mm(struct task_struct *p);
 extern int sysctl_oom_dump_tasks;
 extern int sysctl_oom_kill_allocating_task;
 extern int sysctl_panic_on_oom;
+extern int sysctl_oom_relaxation;
 #endif /* _INCLUDE_LINUX_OOM_H */
--- a/include/linux/page-flags.h
+++ b/include/linux/page-flags.h
@@ -109,6 +109,10 @@ enum pageflags {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	PG_compound_lock,
 #endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+	PG_young,
+	PG_idle,
+#endif
 	__NR_PAGEFLAGS,
 
 	/* Filesystems */
@@ -275,6 +279,13 @@ PAGEFLAG_FALSE(HWPoison)
 #define __PG_HWPOISON 0
 #endif
 
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+TESTPAGEFLAG(Young, young)
+SETPAGEFLAG(Young, young)
+TESTCLEARFLAG(Young, young)
+PAGEFLAG(Idle, idle)
+#endif
+
 u64 stable_page_flags(struct page *page);
 
 static inline int PageUptodate(struct page *page)
@@ -466,6 +477,72 @@ static inline int PageTransTail(struct page *page)
 #endif
 
 /*
+ * PageBuddy() indicate that the page is free and in the buddy system
+ * (see mm/page_alloc.c).
+ *
+ * PAGE_BUDDY_MAPCOUNT_VALUE must be <= -2 but better not too close to
+ * -2 so that an underflow of the page_mapcount() won't be mistaken
+ * for a genuine PAGE_BUDDY_MAPCOUNT_VALUE. -128 can be created very
+ * efficiently by most CPU architectures.
+ */
+#define PAGE_BUDDY_MAPCOUNT_VALUE (-128)
+
+static inline int PageBuddy(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_BUDDY_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageBuddy(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_BUDDY_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageBuddy(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageBuddy(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
+#define PAGE_BALLOON_MAPCOUNT_VALUE (-256)
+
+static inline int PageBalloon(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_BALLOON_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageBalloon(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_BALLOON_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageBalloon(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageBalloon(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
+#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512)
+
+static inline int PageKmemcg(struct page *page)
+{
+	return atomic_read(&page->_mapcount) == PAGE_KMEMCG_MAPCOUNT_VALUE;
+}
+
+static inline void __SetPageKmemcg(struct page *page)
+{
+	VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page);
+	atomic_set(&page->_mapcount, PAGE_KMEMCG_MAPCOUNT_VALUE);
+}
+
+static inline void __ClearPageKmemcg(struct page *page)
+{
+	VM_BUG_ON_PAGE(!PageKmemcg(page), page);
+	atomic_set(&page->_mapcount, -1);
+}
+
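PAGE_KMEMCG_MAPCOUNT_VALUE follows the PageBuddy/PageBalloon convention of encoding a marker in _mapcount, which is only safe for pages that never get real userspace mappings; the three magic values must stay distinct so the checks can dispatch cheaply:

	/* Sketch: classify an unmapped special page by its _mapcount marker. */
	static const char *special_page_kind(struct page *page)
	{
		if (PageBuddy(page))
			return "buddy";		/* _mapcount == -128 */
		if (PageBalloon(page))
			return "balloon";	/* _mapcount == -256 */
		if (PageKmemcg(page))
			return "kmemcg";	/* _mapcount == -512 */
		return "none";
	}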
+/*
  * If network-based swap is enabled, sl*b must keep track of whether pages
  * were allocated from pfmemalloc reserves.
  */
--- a/include/linux/page_cgroup.h
+++ b/include/linux/page_cgroup.h
@@ -3,9 +3,9 @@
 
 enum {
 	/* flags for mem_cgroup */
-	PCG_LOCK,  /* Lock for pc->mem_cgroup and following bits. */
-	PCG_USED, /* this object is in use. */
-	PCG_MIGRATION, /* under page migration */
+	PCG_USED = 0x01,	/* This page is charged to a memcg */
+	PCG_MEM = 0x02,		/* This page holds a memory charge */
+	PCG_MEMSW = 0x04,	/* This page holds a memory+swap charge */
 	__NR_PCG_FLAGS,
 };
 
@@ -44,42 +44,9 @@ static inline void __init page_cgroup_init(void)
 struct page_cgroup *lookup_page_cgroup(struct page *page);
 struct page *lookup_cgroup_page(struct page_cgroup *pc);
 
-#define TESTPCGFLAG(uname, lname)			\
-static inline int PageCgroup##uname(struct page_cgroup *pc)	\
-	{ return test_bit(PCG_##lname, &pc->flags); }
-
-#define SETPCGFLAG(uname, lname)			\
-static inline void SetPageCgroup##uname(struct page_cgroup *pc)\
-	{ set_bit(PCG_##lname, &pc->flags);  }
-
-#define CLEARPCGFLAG(uname, lname)			\
-static inline void ClearPageCgroup##uname(struct page_cgroup *pc)	\
-	{ clear_bit(PCG_##lname, &pc->flags);  }
-
-#define TESTCLEARPCGFLAG(uname, lname)			\
-static inline int TestClearPageCgroup##uname(struct page_cgroup *pc)	\
-	{ return test_and_clear_bit(PCG_##lname, &pc->flags);  }
-
-TESTPCGFLAG(Used, USED)
-CLEARPCGFLAG(Used, USED)
-SETPCGFLAG(Used, USED)
-
-SETPCGFLAG(Migration, MIGRATION)
-CLEARPCGFLAG(Migration, MIGRATION)
-TESTPCGFLAG(Migration, MIGRATION)
-
-static inline void lock_page_cgroup(struct page_cgroup *pc)
-{
-	/*
-	 * Don't take this lock in IRQ context.
-	 * This lock is for pc->mem_cgroup, USED, MIGRATION
-	 */
-	bit_spin_lock(PCG_LOCK, &pc->flags);
-}
-
-static inline void unlock_page_cgroup(struct page_cgroup *pc)
+static inline int PageCgroupUsed(struct page_cgroup *pc)
 {
-	bit_spin_unlock(PCG_LOCK, &pc->flags);
+	return !!(pc->flags & PCG_USED);
 }
 
 #else /* CONFIG_MEMCG */
--- /dev/null
+++ b/include/linux/page_idle.h
@@ -0,0 +1,69 @@
+#ifndef _LINUX_MM_PAGE_IDLE_H
+#define _LINUX_MM_PAGE_IDLE_H
+
+#include <linux/page-flags.h>
+
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+
+static inline bool page_is_young(struct page *page)
+{
+	return PageYoung(page);
+}
+
+static inline void set_page_young(struct page *page)
+{
+	SetPageYoung(page);
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return TestClearPageYoung(page);
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return PageIdle(page);
+}
+
+static inline void set_page_idle(struct page *page)
+{
+	SetPageIdle(page);
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+	ClearPageIdle(page);
+}
+
+#else /* !CONFIG_IDLE_PAGE_TRACKING */
+
+static inline bool page_is_young(struct page *page)
+{
+	return false;
+}
+
+static inline void set_page_young(struct page *page)
+{
+}
+
+static inline bool test_and_clear_page_young(struct page *page)
+{
+	return false;
+}
+
+static inline bool page_is_idle(struct page *page)
+{
+	return false;
+}
+
+static inline void set_page_idle(struct page *page)
+{
+}
+
+static inline void clear_page_idle(struct page *page)
+{
+}
+
+#endif /* CONFIG_IDLE_PAGE_TRACKING */
+
+#endif /* _LINUX_MM_PAGE_IDLE_H */
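One tracking cycle with this API, sketched: mark the page idle, let the workload run, and treat the page as idle only if nothing cleared the flag in the meantime. In a real implementation the accessed bits are harvested through an rmap walk using the clear_young notifiers above (which also sets PG_young so reclaim does not lose the referenced state); that walk is elided here:

	static bool page_still_idle_sketch(struct page *page)
	{
		set_page_idle(page);
		/* ... workload runs; any access is expected to clear PG_idle ... */
		return page_is_idle(page);
	}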
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -338,7 +338,7 @@ do {									\
 #endif
 
 #ifndef this_cpu_sub
-# define this_cpu_sub(pcp, val)		this_cpu_add((pcp), -(val))
+# define this_cpu_sub(pcp, val)		this_cpu_add((pcp), -(typeof(pcp))(val))
 #endif
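The typeof cast here is a sign-extension fix, not a cleanup: negating an unsigned val yields a huge positive value of val's own type, which then zero-extends when the percpu variable is wider. A sketch of the failure mode the cast prevents:

	DEFINE_PER_CPU(long, bytes_used);

	static void sub_bytes(unsigned int delta)	/* e.g. delta == 200 */
	{
		/* Without the cast: -(delta) == 0xffffff38 as unsigned int,
		 * which zero-extends to long and *adds* ~4G.  With the cast,
		 * delta is first converted to long, so this subtracts 200. */
		this_cpu_sub(bytes_used, delta);
	}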
 
 #ifndef this_cpu_inc
@@ -424,7 +424,7 @@ do {									\
 # define this_cpu_add_return(pcp, val)	__pcpu_size_call_return2(this_cpu_add_return_, pcp, val)
 #endif
 
-#define this_cpu_sub_return(pcp, val)	this_cpu_add_return(pcp, -(val))
+#define this_cpu_sub_return(pcp, val)	this_cpu_add_return(pcp, -(typeof(pcp))(val))
 #define this_cpu_inc_return(pcp)	this_cpu_add_return(pcp, 1)
 #define this_cpu_dec_return(pcp)	this_cpu_add_return(pcp, -1)
 
@@ -592,7 +592,7 @@ do {									\
 #endif
 
 #ifndef __this_cpu_sub
-# define __this_cpu_sub(pcp, val)	__this_cpu_add((pcp), -(val))
+# define __this_cpu_sub(pcp, val)	__this_cpu_add((pcp), -(typeof(pcp))(val))
 #endif
 
 #ifndef __this_cpu_inc
@@ -674,7 +674,7 @@ do {									\
 	__pcpu_size_call_return2(__this_cpu_add_return_, pcp, val)
 #endif
 
-#define __this_cpu_sub_return(pcp, val)	__this_cpu_add_return(pcp, -(val))
+#define __this_cpu_sub_return(pcp, val)	__this_cpu_add_return(pcp, -(typeof(pcp))(val))
 #define __this_cpu_inc_return(pcp)	__this_cpu_add_return(pcp, 1)
 #define __this_cpu_dec_return(pcp)	__this_cpu_add_return(pcp, -1)
 
--- /dev/null
+++ b/include/linux/pfcache.h
@@ -0,0 +1,70 @@
+/*
+ *  include/linux/pfcache.h
+ *
+ *  Parallels File Cache
+ *
+ *  Copyright (c) 2012-2015 Parallels IP Holdings GmbH
+ *
+ *  Author: Konstantin Khlebnikov
+ *
+ */
+
+#ifndef LINUX_PFCACHE_H
+#define LINUX_PFCACHE_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define FS_IOC_PFCACHE_OPEN	_IO('f', 50)
+#define FS_IOC_PFCACHE_CLOSE	_IO('f', 51)
+#define FS_IOC_PFCACHE_DUMP	_IO('f', 52)
+
+#define PFCACHE_CSUM_SIZE	20	/* SHA-1 (FIPS 180-1) */
+
+#define PFCACHE_XATTR_NAME	"trusted.pfcache"
+
+/* extendable FS_IOC_PFCACHE_DUMP argument, must be 32/64-bit compatible */
+struct pfcache_dump_request {
+	__u32	header_size;		/* this struct size */
+	__u32	buffer_size;		/* tail buffer size */
+	__u64	filter;			/* filter flags */
+	__u64	payload;		/* payload flags */
+	__u32	offset;			/* skip inodes, after filtering */
+	__u8	csum_filter[PFCACHE_CSUM_SIZE];
+	/* -- add fields above this line -- */
+	__u8	buffer[0];
+};
+
+/* to check new fields presence */
+#define PFCACHE_DUMP_HAS(req, field)	((req)->header_size >= \
+		offsetof(typeof(*(req)), field) + sizeof((req)->field))
+
+/* filter bits, what to skip */
+#define PFCACHE_FILTER_WITH_CSUM	0x0001ll
+#define PFCACHE_FILTER_WITHOUT_CSUM	0x0002ll
+#define PFCACHE_FILTER_WITH_PEER	0x0004ll
+#define PFCACHE_FILTER_WITHOUT_PEER	0x0008ll
+#define PFCACHE_FILTER_COMPARE_CSUM	0x0010ll /* check csum_filter */
+#define PFCACHE_FILTER_MASK		0x001Fll /* all known filters */
+
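PFCACHE_DUMP_HAS() is what keeps this header extendable: the ioctl handler can accept requests from older userspace by testing header_size before reading a newer field. A sketch of such a check (the helper name is illustrative):

	/* Only honour the csum filter if the (possibly older) userspace
	 * header is long enough to actually contain the field. */
	static bool want_csum_compare(struct pfcache_dump_request *req)
	{
		return PFCACHE_DUMP_HAS(req, csum_filter) &&
		       (req->filter & PFCACHE_FILTER_COMPARE_CSUM);
	}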
+/* payload bits, what to dump */
+#define PFCACHE_PAYLOAD_CSUM		0x0001ll /* u8[EXT4_DATA_CSUM_SIZE] */
+#define PFCACHE_PAYLOAD_FHANDLE		0x0002ll /* struct file_handle */
+#define PFCACHE_PAYLOAD_STATE		0x0004ll /* u64 filter-state */
+#define PFCACHE_PAYLOAD_FSIZE		0x0008ll /* u64 file size */
+#define PFCACHE_PAYLOAD_PAGES		0x0010ll /* u64 page-cache size */
+#define PFCACHE_PAYLOAD_MASK		0x001Fll /* all known payloads */
+
+/* MAX_HANDLE_SZ */
+#define PFCACHE_FHANDLE_MAX		256
+
+/* see fs/fhandle.c */
+#define PFCACHE_FHANDLE_SIZE(ptr)	(*(__u32*)(ptr) + sizeof(__u32) * 2)
+
+/* all payload fields aligned to 8 bytes boundary */
+#define PFCACHE_PAYLOAD_MAX_SIZE			\
+	(ALIGN(PFCACHE_CSUM_SIZE, sizeof(__u64)) +	\
+	 PFCACHE_FHANDLE_MAX +				\
+	 sizeof(__u64) * 3)
+
+#endif /* LINUX_PFCACHE_H */
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -61,7 +61,7 @@ struct pid
 	/* lists of tasks that use this pid */
 	struct hlist_head tasks[PIDTYPE_MAX];
 	struct rcu_head rcu;
-	struct upid numbers[1];
+	struct upid numbers[2];
 };
 
 extern struct pid init_struct_pid;
@@ -171,6 +171,7 @@ static inline pid_t pid_nr(struct pid *pid)
 
 pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns);
 pid_t pid_vnr(struct pid *pid);
+pid_t ve_task_ppid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
 
 #define do_each_pid_task(pid, type, task)				\
 	do {								\
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -24,6 +24,7 @@ struct pid_namespace {
 	struct kref kref;
 	struct pidmap pidmap[PIDMAP_ENTRIES];
 	int last_pid;
+	int pid_max;
 	unsigned int nr_hashed;
 	struct task_struct *child_reaper;
 	struct kmem_cache *pid_cachep;
@@ -40,6 +41,7 @@ struct pid_namespace {
 	struct work_struct proc_work;
 	kgid_t pid_gid;
 	int hide_pid;
+	int hide_pidns;
 	int reboot;	/* group exit code if this pidns was rebooted */
 	unsigned int proc_inum;
 };
@@ -60,6 +62,7 @@ extern struct pid_namespace *copy_pid_ns(unsigned long flags,
 	struct user_namespace *user_ns, struct pid_namespace *ns);
 extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
 extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
+extern int change_active_pid_ns(struct task_struct *, struct pid_namespace *);
 extern void put_pid_ns(struct pid_namespace *ns);
 
 #else /* !CONFIG_PID_NS */
--- /dev/null
+++ b/include/linux/ploop/compat.h
@@ -0,0 +1,33 @@
+/*
+ *  include/linux/ploop/compat.h
+ *
+ *  This file contains macros providing a compatibility layer for 2.6.18,
+ *  where the bio layer was different.
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_PLOOP_COMPAT_H_
+#define _LINUX_PLOOP_COMPAT_H_
+
+#include <linux/version.h>
+
+#define DEFINE_BIO_CB(func) \
+static void func(struct bio *bio, int err) {
+
+#define END_BIO_CB(func)  }
+
+#define BIO_ENDIO(_queue, _bio, _err)					\
+	do {								\
+		trace_block_bio_complete((_queue), (_bio), (_err));	\
+		bio_endio((_bio), (_err));				\
+	} while (0);
+
+#define F_DENTRY(file)	(file)->f_path.dentry
+#define F_MNT(file)	(file)->f_path.mnt
+
+#define KOBJECT_INIT(kobj, ktype) kobject_init(kobj, ktype)
+#define KOBJECT_ADD(kobj, parent, fmt, arg...) kobject_add(kobj, parent, fmt, arg)
+
+#endif
--- /dev/null
+++ b/include/linux/ploop/ploop.h
@@ -0,0 +1,915 @@
+/*
+ *  include/linux/ploop/ploop.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_PLOOP_H_
+#define _LINUX_PLOOP_H_
+
+#include <linux/rbtree.h>
+#include <linux/timer.h>
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+#include "ploop_if.h"
+#include "compat.h"
+
+#define PLOOP_NAME_SIZE		64
+#define PLOOP_MAX_FORMATS	32
+#define PLOOP_DEVICE_MAJOR	182
+#define PLOOP_DEVICE_RANGE	(1UL << MINORBITS)
+#define PLOOP_PART_SHIFT	4
+#define PLOOP_PART_MAX		(1UL << PLOOP_PART_SHIFT)
+
+/* 1. fastpath_reqs is subtracted because fastpath requests don't consume preqs
+ * 2. typically, entry_qlen and bio_qlen are close to zero */
+#define PLOOP_CONGESTED(plo)    (plo->entry_qlen + plo->active_reqs - \
+				 plo->fastpath_reqs + plo->bio_qlen)
+/* 32 bits for virtual block. Enough. */
+typedef u32	cluster_t;
+typedef u32	iblock_t;
+
+struct ploop_request;
+struct ploop_delta;
+
+enum {
+	PLOOP_S_RUNNING,	/* Device is active */
+	PLOOP_S_ATTENTION,	/* Device is processing a barrier, everything
+				 * is queued to be totally serialized */
+	PLOOP_S_WAIT_PROCESS,	/* Main thread is waiting for requests */
+	PLOOP_S_EXITING,	/* Exiting */
+	PLOOP_S_ABORT,		/* Device is aborted due to unrecoverable
+				 * error. Reads are still allowed. */
+	PLOOP_S_SYNC,		/* Unplug was requested */
+	PLOOP_S_CHANGED,	/* Media changed */
+	PLOOP_S_WRITE_CONG,	/* Write direction was congested */
+	PLOOP_S_READ_CONG,	/* Read direction was congested */
+	PLOOP_S_TRACK,		/* Write tracker is ON */
+	PLOOP_S_TRACK_ABORT,	/* Write tracker is aborted */
+	PLOOP_S_ENOSPC_EVENT,	/* ENOSPC event happened but was not
+				 * consumed by userspace yet */
+	PLOOP_S_CONGESTED,	/* Too many bios submitted to us */
+	PLOOP_S_DISCARD,	/* ploop is ready to handle discard request */
+	PLOOP_S_DISCARD_LOADED,	/* A discard request was handled and
+				   free blocks loaded */
+	PLOOP_S_LOCKED,	        /* ploop is locked by userspace
+				   (for minor mgmt only) */
+	PLOOP_S_ONCE,	        /* An event (e.g. printk once) happened */
+	PLOOP_S_PUSH_BACKUP,	/* Push_backup is in progress */
+	PLOOP_S_NULLIFY,	/* Nullifying BAT is in progress */
+};
+
+enum {
+	PLOOP_F_NORMAL,		/* Default: not yet frozen, or already unfrozen */
+	PLOOP_F_FROZEN,		/* Frozen PLOOP_IOC_FREEZE */
+	PLOOP_F_THAWING,	/* thaw_bdev is in progress */
+};
+
+struct ploop_snapdata
+{
+	/* top_delta file reopened read-only. */
+	struct file		*file;
+};
+
+struct ploop_file
+{
+	struct list_head	list;
+
+	loff_t		vpos;	/* Position of this chunk in virtual map */
+	loff_t		start;	/* Start of data in this file, usually 0 */
+	loff_t		length;	/* Length of data in this file */
+	loff_t		limit;	/* Maximal size of this file. If it is
+				 * exceeded we must switch to the next chunk
+				 */
+	struct file		*file;	/* File */
+	struct address_space	*mapping;
+	struct inode		*inode;
+	struct extent_map_tree	*em_tree;
+	struct block_device	*bdev;
+	int flags; /* file flags */
+};
+
+/* Real functions are hidden deeply. :-)
+ *
+ * This struct describes how we do real IO on a particular backing file.
+ */
+
+enum {
+	PLOOP_IO_FSYNC_DELAYED,  /* Must f_op->fsync before FLUSH|FUA */
+};
+
+struct ploop_io
+{
+	struct ploop_device	*plo;
+
+	loff_t		       *size_ptr; /* NULL or points to ploop_mapping */
+	loff_t			prealloced_size;
+	struct ploop_request   *prealloc_preq;  /* preq who does prealloc */
+	loff_t			max_size;	/* Infinity */
+	int			n_chunks;	/* 1. */
+	struct ploop_file	files;		/* Only 1 file is supported */
+
+	iblock_t		alloc_head;
+
+	struct list_head	fsync_queue;
+	struct task_struct	*fsync_thread;
+	int			fsync_qlen;
+	wait_queue_head_t	fsync_waitq;
+	struct timer_list	fsync_timer;
+
+	struct ploop_io_ops	*ops;
+	unsigned long		io_state;
+	u64                     io_count;
+};
+
+struct ploop_io_ops
+{
+	struct list_head	list;
+	unsigned int		id;
+	char			*name;
+	struct module		*owner;
+
+	void		(*unplug)(struct ploop_io *);
+	int		(*congested)(struct ploop_io *, int bits);
+
+	/* Allocate new block, return its index in image.
+	 * Data must be initialized to zeros and committed to disk.
+	 *
+	 * This function is slow and it is used only to allocate
+	 * index tables.
+	 */
+	int	(*alloc)(struct ploop_io *, loff_t pos, loff_t len);
+
+	/* These functions must schedule IO from/to disk.
+	 * If it returns 1, the write is not complete and the preq has
+	 * been added to some internal queue.
+	 *
+	 * submit() makes IO to already allocated space (preq->iblock)
+	 * and must fail when writing to unallocated area.
+	 *
+	 * submit_alloc() assumes that storage is not allocated and allocates
+	 * new area in image.
+	 */
+	void	(*submit)(struct ploop_io *, struct ploop_request *,
+			  unsigned long rw,
+			  struct bio_list *sbl, iblock_t iblk, unsigned int size);
+	void	(*submit_alloc)(struct ploop_io *, struct ploop_request *,
+				struct bio_list *sbl, unsigned int size);
+	void	(*post_submit)(struct ploop_io *, struct ploop_request *);
+
+	int	(*disable_merge)(struct ploop_io * io, sector_t isector, unsigned int len);
+	int	(*fastmap)(struct ploop_io * io, struct bio *orig_bio,
+			   struct bio * bio, sector_t isec);
+
+	void	(*read_page)(struct ploop_io * io, struct ploop_request * preq,
+			     struct page * page, sector_t sec);
+	void	(*write_page)(struct ploop_io * io, struct ploop_request * preq,
+			      struct page * page, sector_t sec, unsigned long rw);
+
+
+	int	(*sync_read)(struct ploop_io * io, struct page * page,
+			     unsigned int len, unsigned int off, sector_t sec);
+	int	(*sync_write)(struct ploop_io * io, struct page * page,
+			      unsigned int len, unsigned int off, sector_t sec);
+
+
+	int	(*sync_readvec)(struct ploop_io * io, struct page ** pvec,
+				unsigned int nr, sector_t sec);
+	int	(*sync_writevec)(struct ploop_io * io, struct page ** pvec,
+				unsigned int nr, sector_t sec);
+
+	int	(*init)(struct ploop_io * io);
+	void	(*destroy)(struct ploop_io * io);
+	int	(*open)(struct ploop_io * io);
+	int	(*sync)(struct ploop_io * io);
+	int	(*stop)(struct ploop_io * io);
+	int	(*prepare_snapshot)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*complete_snapshot)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*prepare_merge)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*start_merge)(struct ploop_io *, struct ploop_snapdata *);
+	int	(*truncate)(struct ploop_io *, struct file *, __u32 alloc_head);
+	void	(*queue_settings)(struct ploop_io *, struct request_queue *q);
+
+	void	(*issue_flush)(struct ploop_io*, struct ploop_request * preq);
+
+	int	(*dump)(struct ploop_io*);
+
+	loff_t  (*i_size_read)(struct ploop_io*);
+	fmode_t (*f_mode)(struct ploop_io*);
+
+	int     (*autodetect)(struct ploop_io * io);
+};
+
+static inline loff_t generic_i_size_read(struct ploop_io *io)
+{
+	BUG_ON(!io->files.file);
+	BUG_ON(!io->files.inode);
+
+	return i_size_read(io->files.inode);
+}
+static inline fmode_t generic_f_mode(struct ploop_io *io)
+{
+	BUG_ON(!io->files.file);
+
+	return io->files.file->f_mode;
+}
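+
+/* Illustrative sketch (not part of the patch itself): a backing-io module
+ * would fill a ploop_io_ops table and register it via ploop_register_io(),
+ * declared below; the generic helpers above slot straight in.  Callback
+ * and module names here are hypothetical.
+ *
+ *	static struct ploop_io_ops my_io_ops = {
+ *		.id		= PLOOP_IO_KAIO,
+ *		.name		= "my_io",
+ *		.owner		= THIS_MODULE,
+ *		.i_size_read	= generic_i_size_read,
+ *		.f_mode		= generic_f_mode,
+ *		// .submit, .submit_alloc, ... backend-specific callbacks
+ *	};
+ *
+ *	static int __init my_io_init(void)
+ *	{
+ *		return ploop_register_io(&my_io_ops);
+ *	}
+ */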
+
+enum {
+	PLOOP_MAP_IDENTICAL,
+	PLOOP_MAP_DEAD,
+};
+
+#define PLOOP_LRU_BUFFER	8
+
+struct ploop_map
+{
+	struct ploop_device	*plo;
+	struct list_head	delta_list;
+
+	struct rb_root		rb_root;
+	unsigned long		flags;
+	unsigned long		last_activity;
+
+	unsigned int		pages;
+	unsigned int		max_index;
+
+	struct map_node		*lru_buffer[PLOOP_LRU_BUFFER];
+	unsigned int		lru_buffer_ptr;
+
+	wait_queue_head_t	destroy_waitq;
+};
+
+#define PLOOP_FMT_CAP_DELTA	1
+#define PLOOP_FMT_CAP_WRITABLE	2
+#define PLOOP_FMT_CAP_IDENTICAL	4
+
+struct ploop_delta_ops
+{
+	struct list_head	list;
+	unsigned int		id;
+	char			*name;
+	struct module		*owner;
+
+	unsigned int		capability;
+
+	/* Return location of index page */
+	int		(*map_index)(struct ploop_delta *, unsigned long index,
+				     sector_t *sec);
+	void		(*read_index)(struct ploop_delta *, struct ploop_request * preq,
+				      struct page * page, sector_t sec);
+
+	/* Allocate a new block in the delta and write the request there.
+	 * If the request does not cover the whole block, this function
+	 * must pad with zeros.
+	 */
+	void		(*allocate)(struct ploop_delta *, struct ploop_request *,
+				    struct bio_list *sbl, unsigned int size);
+	void		(*allocate_complete)(struct ploop_delta *, struct ploop_request *);
+
+	int		(*compose)(struct ploop_delta *, int, struct ploop_ctl_chunk *);
+	int		(*open)(struct ploop_delta *);
+	void		(*destroy)(struct ploop_delta *);
+	int		(*start)(struct ploop_delta *);
+	int		(*stop)(struct ploop_delta *);
+	int		(*refresh)(struct ploop_delta *);
+	int		(*sync)(struct ploop_delta *);
+	int		(*prepare_snapshot)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*complete_snapshot)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*prepare_merge)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*start_merge)(struct ploop_delta *, struct ploop_snapdata *);
+	int		(*truncate)(struct ploop_delta *, struct file *, __u32 alloc_head);
+	int		(*prepare_grow)(struct ploop_delta *, u64 *new_size, int *reloc);
+	int		(*complete_grow)(struct ploop_delta *, u64 new_size);
+};
+
+/* Virtual image. */
+struct ploop_delta
+{
+	struct list_head	list;
+
+	int			level;		/* Level of delta. 0 is base image */
+	unsigned int		cluster_log;	/* In 512=1<<9 byte sectors */
+	unsigned int		flags;
+
+	struct ploop_device	*plo;
+
+	struct ploop_io		io;
+
+	void			*priv;
+
+	struct ploop_delta_ops	*ops;
+
+	struct kobject		kobj;
+
+	u64			max_delta_size; /* in sectors */
+};
+
+struct ploop_tunable
+{
+	int	max_requests;
+	int	batch_entry_qlen;
+	int	batch_entry_delay;
+	int	fsync_max;
+	int	fsync_delay;
+	int	min_map_pages;
+	int	max_map_inactivity;
+	int	congestion_high_watermark;
+	int	congestion_low_watermark;
+	int	max_active_requests;
+	int	push_backup_timeout; /* in seconds */
+	unsigned int pass_flushes : 1, pass_fuas : 1,
+		     congestion_detection : 1,
+		     check_zeros : 1,
+		     disable_root_threshold : 1,
+		     disable_user_threshold : 1;
+};
+
+#define DEFAULT_PLOOP_MAXRQ 256
+#define DEFAULT_PLOOP_BATCH_ENTRY_QLEN 32
+
+#define DEFAULT_PLOOP_TUNE \
+(struct ploop_tunable) { \
+.max_requests = DEFAULT_PLOOP_MAXRQ, \
+.batch_entry_qlen = 32, \
+.batch_entry_delay = HZ/20, \
+.fsync_max = DEFAULT_PLOOP_BATCH_ENTRY_QLEN, \
+.fsync_delay = HZ/10, \
+.min_map_pages = 32, \
+.max_map_inactivity = 10*HZ, \
+.congestion_high_watermark = 3*DEFAULT_PLOOP_MAXRQ/4, \
+.congestion_low_watermark = DEFAULT_PLOOP_MAXRQ/2, \
+.pass_flushes = 1, \
+.pass_fuas = 1, \
+.check_zeros = 1, \
+.max_active_requests = DEFAULT_PLOOP_BATCH_ENTRY_QLEN / 2, \
+.push_backup_timeout = 42, }
+
+struct ploop_stats
+{
+#define __DO(_at)	__u32	_at;
+#include "ploop_stat.h"
+#undef __DO
+};
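+
+/* The X-macro above pulls every counter name from ploop_stat.h: with
+ * __DO(_at) defined as "__u32 _at;", the include expands to
+ *
+ *	struct ploop_stats {
+ *		__u32	bio_in;
+ *		__u32	bio_fast;
+ *		...
+ *	};
+ *
+ * so the counter list and this struct cannot go out of sync; other code
+ * can reuse the same header with a different __DO() definition.
+ */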
+
+struct ploop_freeblks_desc;
+struct ploop_pushbackup_desc;
+
+struct ploop_device
+{
+	unsigned long		state;
+	spinlock_t		lock;
+
+	struct list_head	free_list;
+	struct list_head	entry_queue;
+	int			entry_qlen;
+	int			read_sync_reqs;
+	int			free_qlen; /* len of free_list */
+	int			free_qmax; /* max len of free_list */
+	int			blockable_reqs; /* depends on userspace tool */
+	int			blocked_bios; /* depends on userspace tool */
+
+	struct bio		*bio_head;
+	struct bio		*bio_tail;
+	struct bio		*bio_sync;
+	struct bio_list		bio_discard_list;
+	int			bio_discard_qlen;
+	int			bio_qlen;
+	int			bio_total;
+
+	struct rb_root		entry_tree[2];
+
+	struct list_head	ready_queue;
+
+	struct rb_root		lockout_tree;
+	struct rb_root		lockout_pb_tree;
+
+	int			cluster_log;
+	int			fmt_version;
+
+	int			active_reqs;
+	int			fastpath_reqs;
+	int			barrier_reqs;
+
+	struct bio		*cached_bio;
+
+	struct timer_list	mitigation_timer;
+	struct timer_list	freeze_timer;
+
+	wait_queue_head_t	waitq;
+	wait_queue_head_t	req_waitq;
+	wait_queue_head_t	freeze_waitq;
+	wait_queue_head_t	event_waitq;
+
+	struct ploop_map	map;
+	struct ploop_map	*trans_map;
+
+	struct ploop_tunable	tune;
+
+	int			index;
+	struct mutex		ctl_mutex;
+	atomic_t		open_count;
+	u64			bd_size;
+	struct gendisk		*disk;
+	struct block_device	*bdev;
+	struct request_queue	*queue;
+	struct task_struct	*thread;
+	struct block_device	*frozen_bdev;
+	int			freeze_state;
+	struct rb_node		link;
+
+	/* someone who wants to quiesce the state machine waits
+	 * here for a signal from the state machine saying that
+	 * processing has reached a PLOOP_REQ_BARRIER request */
+	struct completion	*quiesce_comp;
+
+	/* the state machine in 'quiesce' state waits here until
+	 * someone calls ploop_relax() */
+	struct completion	relax_comp;
+
+	/* someone who calls ploop_relax() waits here to learn
+	 * that 'relax' really happened and the state machine is
+	 * ready for the next ploop_quiesce(). This is important
+	 * because someone might call ploop_quiesce() immediately
+	 * after ploop_relax() succeeded */
+	struct completion	relaxed_comp;
+
+	spinlock_t		track_lock;
+	struct rb_root		track_tree;
+	sector_t		track_end;
+	u32			track_cluster;
+	u32			track_ptr;
+
+	u32			merge_ptr;
+
+	atomic_t		maintenance_cnt;
+	struct completion	maintenance_comp;
+	int			maintenance_type;
+
+	u32			grow_start;
+	u32			grow_end;
+	u32			grow_relocated;
+	u64			grow_new_size;
+
+	spinlock_t		dummy_lock;
+	struct mutex		sysfs_mutex;
+	struct kobject		kobj;
+	struct kobject		*pstat_dir;
+	struct kobject		*pstate_dir;
+	struct kobject		*ptune_dir;
+
+	struct ploop_stats	st;
+	char                    cookie[PLOOP_COOKIE_SIZE];
+
+	struct ploop_freeblks_desc *fbd;
+	struct ploop_pushbackup_desc *pbd;
+	struct block_device *dm_crypt_bdev;
+
+	unsigned long		locking_state; /* plo locked by userspace */
+};
+
+enum
+{
+	PLOOP_REQ_LOCKOUT,	/* This preq is locking overlapping requests */
+	PLOOP_REQ_PB_LOCKOUT,	/* This preq is locking overlapping WRITEs */
+	PLOOP_REQ_SYNC,
+	PLOOP_REQ_BARRIER,
+	PLOOP_REQ_UNSTABLE,
+	PLOOP_REQ_TRACK,
+	PLOOP_REQ_SORTED,
+	PLOOP_REQ_TRANS,
+	PLOOP_REQ_MERGE,
+	PLOOP_REQ_RELOC_A,	/* 'A' stands for allocate() */
+	PLOOP_REQ_RELOC_S,	/* 'S' stands for submit() */
+	PLOOP_REQ_RELOC_N,	/* 'N' stands for "nullify" */
+	PLOOP_REQ_ZERO,
+	PLOOP_REQ_DISCARD,
+	PLOOP_REQ_RSYNC,
+	PLOOP_REQ_KAIO_FSYNC,	/* force image fsync by KAIO module */
+	PLOOP_REQ_POST_SUBMIT, /* preq needs post_submit processing */
+	PLOOP_REQ_PUSH_BACKUP, /* preq was ACKed by userspace push_backup */
+	PLOOP_REQ_FSYNC_DONE,  /* fsync_thread() performed f_op->fsync() */
+	PLOOP_REQ_ISSUE_FLUSH, /* preq needs ->issue_flush before completing */
+	PLOOP_REQ_BLOCKABLE,  /* preq was accounted in plo->blockable_reqs */
+};
+
+#define PLOOP_REQ_MERGE_FL (1 << PLOOP_REQ_MERGE)
+#define PLOOP_REQ_RELOC_A_FL (1 << PLOOP_REQ_RELOC_A)
+#define PLOOP_REQ_RELOC_S_FL (1 << PLOOP_REQ_RELOC_S)
+#define PLOOP_REQ_RELOC_N_FL (1 << PLOOP_REQ_RELOC_N)
+#define PLOOP_REQ_DISCARD_FL (1 << PLOOP_REQ_DISCARD)
+#define PLOOP_REQ_ZERO_FL (1 << PLOOP_REQ_ZERO)
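+
+/* Illustrative sketch: the *_FL masks let several state bits be tested
+ * in one go, e.g. "is this preq part of any relocation?":
+ *
+ *	if (preq->state & (PLOOP_REQ_RELOC_A_FL |
+ *			   PLOOP_REQ_RELOC_S_FL |
+ *			   PLOOP_REQ_RELOC_N_FL))
+ *		...
+ */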
+
+enum
+{
+	PLOOP_E_ENTRY,		/* Not yet processed */
+	PLOOP_E_COMPLETE,	/* Complete. Maybe, with an error */
+	PLOOP_E_RELOC_COMPLETE,	/* Reloc complete. Maybe, with an error */
+	PLOOP_E_INDEX_READ,	/* Reading an index page */
+	PLOOP_E_TRANS_INDEX_READ,/* Reading a trans index page */
+	PLOOP_E_DELTA_READ,	/* Write request reads data from previous delta */
+	PLOOP_E_DELTA_COPIED,	/* Data from previous delta has been copied */
+	PLOOP_E_TRANS_DELTA_READ,/* Write request reads data from trans delta */
+	PLOOP_E_RELOC_DATA_READ,/* Read user data to relocate */
+	PLOOP_E_RELOC_NULLIFY,  /* Zeroing given iblock is in progress */
+	PLOOP_E_INDEX_DELAY,	/* Index update is blocked by already queued
+				 * index update.
+				 */
+	PLOOP_E_INDEX_WB,	/* Index writeback is in progress */
+	PLOOP_E_DATA_WBI,	/* Data writeback is in progress and index
+				 * is not updated.
+				 */
+	PLOOP_E_ZERO_INDEX,	/* Zeroing index of free block; original request
+				   can use .submit on completion */
+	PLOOP_E_DELTA_ZERO_INDEX,/* the same but for PLOOP_E_DELTA_READ */
+	PLOOP_E_FSYNC_PENDED,   /* INDEX_WB needs io->ops->sync() to proceed */
+};
+
+#define BIO_BDEV_REUSED	14	/* io_context is stored in bi_bdev */
+
+struct ploop_request
+{
+	struct list_head	list;	/* List link.
+					 * Req can be on
+					 * - free list
+					 * - entry queue
+					 * - ready queue
+					 * - delay_list of another request
+					 * or nowhere
+					 */
+
+	struct ploop_device	*plo;
+
+	cluster_t		req_cluster;
+	sector_t		req_sector;
+	unsigned int		req_size;
+	unsigned int		req_rw;
+	unsigned int		req_index_update_rw;
+	unsigned long		tstamp;
+	struct io_context	*ioc;
+
+	struct bio_list		bl;
+
+	struct bio		*aux_bio;
+
+	atomic_t		io_count;
+
+	unsigned long		state;
+	unsigned long		eng_state;
+	int			error;
+
+	struct map_node		*map;
+	struct map_node		*trans_map;
+
+	iblock_t		iblock;
+
+	/* relocation info */
+	union {
+		struct {
+			iblock_t      src_iblock;
+			iblock_t      dst_iblock;
+		};
+		unsigned long	      ppb_state;
+	};
+	cluster_t		dst_cluster;
+	struct rb_node		reloc_link;
+
+	/* State specific information */
+	union {
+		/* E_INDEX_READ */
+		struct {
+			struct page	* tpage;
+			int		level;
+		} ri;
+
+		/* E_INDEX_WB */
+		struct {
+			struct page	* tpage;
+		} wi;
+	} sinfo;
+
+	u64			verf;
+
+	/* List of requests blocked until completion of this request. */
+	struct list_head	delay_list;
+
+	/* Link to tree of "blocking requests". A blocking request
+	 * is a request which triggers a change in image format
+	 * that does not allow requests to the same area to proceed.
+	 * E.g. when we have no mapping in the delta and the request
+	 * requires a copy of a data block from the previous delta,
+	 * this request locks all subsequent requests to the same virtual
+	 * block until we allocate and initialize the block in the delta.
+	 */
+	struct rb_node		lockout_link;
+	struct rb_node		lockout_pb_link;
+
+	u32			track_cluster;
+
+	/* # bytes in tail of image file to prealloc on behalf of this preq */
+	loff_t			prealloc_size;
+
+	/* if the engine starts an operation on a particular io, it must
+	 * finish the operation on the same io (see io.ops->post_submit) */
+	struct ploop_io	       *eng_io;
+};
+
+static inline struct ploop_delta * ploop_top_delta(struct ploop_device * plo)
+{
+	return list_empty(&plo->map.delta_list) ? NULL :
+		list_first_entry(&plo->map.delta_list,
+				 struct ploop_delta, list);
+}
+
+static inline struct ploop_delta * map_top_delta(struct ploop_map * map)
+{
+	return list_first_entry(&map->delta_list, struct ploop_delta, list);
+}
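+
+/* Note: plo->map.delta_list appears to be ordered from the top delta
+ * (most recent, writable) down to the base image (level 0), so walking
+ * the whole stack is a plain list iteration, e.g.:
+ *
+ *	struct ploop_delta *delta;
+ *	list_for_each_entry(delta, &plo->map.delta_list, list)
+ *		...;	// top delta first, base image last
+ */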
+
+void ploop_complete_io_state(struct ploop_request * preq);
+void ploop_fail_request(struct ploop_request * preq, int err);
+void ploop_preq_drop(struct ploop_device * plo, struct list_head *drop_list,
+		      int keep_locked);
+
+
+static inline int ploop_req_delay_fua_possible(struct ploop_request *preq)
+{
+	return preq->eng_state == PLOOP_E_DATA_WBI;
+}
+
+static inline void ploop_set_dm_crypt_bdev(struct block_device *ploop_bdev,
+				struct block_device *bdev)
+{
+	if (MAJOR(ploop_bdev->bd_dev) == PLOOP_DEVICE_MAJOR) {
+		struct ploop_device *plo = ploop_bdev->bd_disk->private_data;
+		mutex_lock(&plo->ctl_mutex);
+		plo->dm_crypt_bdev = bdev;
+		mutex_unlock(&plo->ctl_mutex);
+	}
+}
+
+static inline struct block_device *__ploop_get_dm_crypt_bdev(
+	struct ploop_device *plo)
+{
+	if (plo->dm_crypt_bdev)
+		bdgrab(plo->dm_crypt_bdev);
+
+	return plo->dm_crypt_bdev;
+}
+
+static inline struct block_device *ploop_get_dm_crypt_bdev(
+				struct ploop_device *plo)
+{
+	struct block_device *ret;
+
+	mutex_lock(&plo->ctl_mutex);
+	ret = __ploop_get_dm_crypt_bdev(plo);
+	mutex_unlock(&plo->ctl_mutex);
+	return ret;
+}
+
+static inline void ploop_req_set_error(struct ploop_request * preq, int err)
+{
+	if (!preq->error) {
+		preq->error = err;
+		if (!test_bit(PLOOP_S_ABORT, &preq->plo->state)) {
+			if (err != -ENOSPC) {
+				printk("ploop_set_error=%d on ploop%d\n",
+				       err, preq->plo->index);
+				return;
+			}
+			printk("No space left on device! Either free some "
+			       "space on disk or abort ploop%d manually.\n",
+				preq->plo->index);
+		}
+	}
+}
+
+#define PLOOP_TRACE_ERROR 1
+#define PLOOP_TRACE_ERROR_DUMP_STACK_ON 1
+
+#if PLOOP_TRACE_ERROR_DUMP_STACK_ON
+#define PLOOP_TRACE_ERROR_DUMP_STACK()	dump_stack()
+#else
+#define PLOOP_TRACE_ERROR_DUMP_STACK()
+#endif
+
+#if PLOOP_TRACE_ERROR
+#define PLOOP_REQ_TRACE_ERROR(preq, err)					\
+	do {									\
+		if ((err)) {							\
+			printk("%s() %d ploop%d set error %d\n",		\
+			__FUNCTION__, __LINE__, (preq)->plo->index, (int)(err));\
+			PLOOP_TRACE_ERROR_DUMP_STACK();				\
+		}								\
+	} while (0)
+#else
+#define PLOOP_REQ_TRACE_ERROR(preq, err)
+#endif
+
+#define PLOOP_REQ_SET_ERROR(preq, err)			\
+	do {						\
+		PLOOP_REQ_TRACE_ERROR(preq, err);	\
+		ploop_req_set_error(preq, err);		\
+	} while (0)
+
+#define PLOOP_FAIL_REQUEST(preq, err)			\
+	do {						\
+		PLOOP_REQ_TRACE_ERROR(preq, err);	\
+		ploop_fail_request(preq, err);		\
+	} while (0)
+
+static inline void ploop_prepare_io_request(struct ploop_request * preq)
+{
+	atomic_set(&preq->io_count, 1);
+}
+
+static inline void ploop_complete_io_request(struct ploop_request * preq)
+{
+	if (atomic_dec_and_test(&preq->io_count))
+		ploop_complete_io_state(preq);
+}
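+
+/* Illustrative sketch of the io_count pattern: prepare() sets the count
+ * to 1 as a "still submitting" reference, each sub-io takes its own
+ * reference, and the submitter drops the initial one when it has issued
+ * everything, so the state machine advances exactly once, whichever
+ * side finishes last:
+ *
+ *	ploop_prepare_io_request(preq);
+ *	while (more sub-ios to issue) {
+ *		atomic_inc(&preq->io_count);
+ *		submit();	// completion calls ploop_complete_io_request()
+ *	}
+ *	ploop_complete_io_request(preq);	// drop the initial reference
+ */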
+
+static inline void ploop_prepare_tracker(struct ploop_request * preq,
+					 sector_t sec)
+{
+	if (unlikely(test_bit(PLOOP_S_TRACK, &preq->plo->state))) {
+		BUG_ON(test_bit(PLOOP_REQ_TRACK, &preq->state));
+		set_bit(PLOOP_REQ_TRACK, &preq->state);
+		preq->track_cluster = sec >> preq->plo->cluster_log;
+	}
+}
+
+void ploop_tracker_notify(struct ploop_device *, sector_t sec);
+
+static inline void ploop_acc_ff_in_locked(struct ploop_device *plo,
+					  unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH))
+		plo->st.bio_flush_in++;
+	if (unlikely(rw & REQ_FUA))
+		plo->st.bio_fua_in++;
+}
+static inline void ploop_acc_ff_in(struct ploop_device *plo,
+				   unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_flush_in++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+	if (unlikely(rw & REQ_FUA)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_fua_in++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+}
+static inline void ploop_acc_ff_out_locked(struct ploop_device *plo,
+					   unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH))
+		plo->st.bio_flush_out++;
+	if (unlikely(rw & REQ_FUA))
+		plo->st.bio_fua_out++;
+}
+static inline void ploop_acc_ff_out(struct ploop_device *plo,
+				    unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_flush_out++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+	if (unlikely(rw & REQ_FUA)) {
+		unsigned long flags;
+		spin_lock_irqsave(&plo->lock, flags);
+		plo->st.bio_fua_out++;
+		spin_unlock_irqrestore(&plo->lock, flags);
+	}
+}
+static inline void ploop_acc_flush_skip_locked(struct ploop_device *plo,
+					       unsigned long rw)
+{
+	if (unlikely(rw & REQ_FLUSH))
+		plo->st.bio_flush_skip++;
+}
+
+static inline void ploop_entry_add(struct ploop_device * plo, struct ploop_request * preq)
+{
+	list_add_tail(&preq->list, &plo->entry_queue);
+	plo->entry_qlen++;
+	if (test_bit(PLOOP_REQ_SYNC, &preq->state) && (!(preq->req_rw & WRITE) || (preq->req_rw & (REQ_FLUSH|REQ_FUA)))) {
+		__set_bit(PLOOP_REQ_RSYNC, &preq->state);
+		plo->read_sync_reqs++;
+	}
+}
+
+static inline void ploop_entry_qlen_dec(struct ploop_request * preq)
+{
+	preq->plo->entry_qlen--;
+	if (test_bit(PLOOP_REQ_RSYNC, &preq->state)) {
+		__clear_bit(PLOOP_REQ_RSYNC, &preq->state);
+		preq->plo->read_sync_reqs--;
+	}
+}
+
+static inline int ploop_map_log(struct ploop_device *plo)
+{
+	switch (plo->fmt_version) {
+	case PLOOP_FMT_V1:
+		return plo->cluster_log;
+	case PLOOP_FMT_V2:
+		return 0;
+	default:
+		BUG();
+	}
+
+	return -1;
+}
+
+struct map_node;
+
+int ploop_fastmap(struct ploop_map * map, cluster_t block, iblock_t *result);
+void ploop_update_map(struct ploop_map * map, int level, cluster_t block, iblock_t iblk);
+void ploop_update_map_hdr(struct ploop_map * map, u8 *hdr, int hdr_size);
+void map_release(struct map_node * m);
+int ploop_find_map(struct ploop_map * map, struct ploop_request * preq);
+int ploop_find_trans_map(struct ploop_map * map, struct ploop_request * preq);
+int ploop_check_map(struct ploop_map * map, struct ploop_request * preq);
+cluster_t map_get_mn_end(struct map_node *m);
+int map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result);
+int trans_map_get_index(struct ploop_request * preq, cluster_t block, iblock_t *result);
+int map_index_fault(struct ploop_request * preq);
+void map_read_complete(struct ploop_request * preq);
+int map_index(struct ploop_delta * delta, struct ploop_request * preq, unsigned long *sec);
+struct ploop_delta * map_writable_delta(struct ploop_request * preq);
+void map_init(struct ploop_device *, struct ploop_map * map);
+void ploop_map_start(struct ploop_map * map, u64 bd_size);
+void ploop_map_destroy(struct ploop_map * map);
+void ploop_map_remove_delta(struct ploop_map * map, int level);
+void ploop_index_wb_proceed(struct ploop_request * preq);
+void ploop_index_update(struct ploop_request * preq);
+void ploop_index_wb_complete(struct ploop_request * preq);
+int __init ploop_map_init(void);
+void ploop_map_exit(void);
+void ploop_add_req_to_fsync_queue(struct ploop_request * preq);
+
+
+void ploop_quiesce(struct ploop_device * plo);
+void ploop_relax(struct ploop_device * plo);
+
+void track_init(struct ploop_device * plo);
+int ploop_tracker_destroy(struct ploop_device *plo, int force);
+int ploop_tracker_stop(struct ploop_device * plo, int force);
+int ploop_tracker_read(struct ploop_device * plo, unsigned long arg);
+int ploop_tracker_setpos(struct ploop_device * plo, unsigned long arg);
+int ploop_tracker_init(struct ploop_device * plo, unsigned long arg);
+
+
+int ploop_add_lockout(struct ploop_request *preq, int try);
+void del_lockout(struct ploop_request *preq);
+
+int ploop_io_init(struct ploop_delta * delta, int nchunks, struct ploop_ctl_chunk * pc);
+int ploop_io_open(struct ploop_io *);
+void ploop_io_destroy(struct ploop_io * io);
+void ploop_io_report_fn(struct file * file, char * msg);
+
+int ploop_register_format(struct ploop_delta_ops * ops);
+int ploop_register_io(struct ploop_io_ops * ops);
+void ploop_unregister_format(struct ploop_delta_ops * ops);
+void ploop_unregister_io(struct ploop_io_ops * ops);
+void ploop_format_put(struct ploop_delta_ops * ops);
+
+extern struct kobj_type ploop_delta_ktype;
+void ploop_sysfs_init(struct ploop_device * plo);
+void ploop_sysfs_uninit(struct ploop_device * plo);
+
+void ploop_queue_zero_request(struct ploop_device *plo, struct ploop_request *orig_preq, cluster_t clu);
+
+int ploop_maintenance_wait(struct ploop_device * plo);
+
+extern int max_map_pages;
+
+extern void ploop_msg_once(struct ploop_device *plo, const char *, ...)
+	__attribute__ ((format (printf, 2, 3)));
+
+/* Define PLOOP_TRACE to get full trace of ploop state machine.
+ */
+#undef PLOOP_TRACE
+
+
+#ifdef PLOOP_TRACE
+#define __TRACE(a...)  do { printk(a); } while (0)
+#else
+#define __TRACE(a...)  do { } while (0)
+#endif
+
+#endif /* _LINUX_PLOOP_H_ */
--- /dev/null
+++ b/include/linux/ploop/ploop_if.h
@@ -0,0 +1,387 @@
+/*
+ *  include/linux/ploop/ploop_if.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __PLOOP_IF_H__
+#define __PLOOP_IF_H__ 1
+
+#include <linux/ioctl.h>
+
+/* This interface mixes data relevant to the delta layer and the io
+ * layer in one request. It is too simplistic.
+ *
+ * But it allows the whole delta to be created atomically and does not
+ * require maintaining incomplete composition state inside the device.
+ */
+
+/* Formats of deltas. */
+
+#define PLOOP_FMT_RAW		1
+#define PLOOP_FMT_PLOOP1	2
+
+/* PLOOP_FMT_PLOOP1 subversions */
+enum {
+	PLOOP_FMT_UNDEFINED = 0,
+	PLOOP_FMT_V1,
+	PLOOP_FMT_V2,
+};
+
+/* Delta flags. */
+#define PLOOP_FMT_RDONLY	1
+#define PLOOP_FMT_FLAGS		1
+
+#define PLOOP_FLAG_FS_SYNC	0x10000000
+
+#define PLOOP_FMT_PREALLOCATED	2
+
+#define PLOOP_FLAG_COOKIE	4
+#define PLOOP_COOKIE_SIZE	64
+
+#define PLOOP_FLAG_CLUBLKS	8
+
+/* IO types. */
+
+#define PLOOP_IO_AUTO		0
+#define PLOOP_IO_DIRECT		1
+#define PLOOP_IO_NFS		2
+#define PLOOP_IO_RESERVED	3	/* reserved, do not use */
+#define PLOOP_IO_KAIO		4
+
+/*
+ * # slots to skip in the very first page of L2 table
+ * (they are reserved for format-specific header)
+ * Assumptions:
+ * 1) sizeof(map_index_t) == sizeof(u32)
+ * 2) PLOOP_MAP_OFFSET == sizeof(struct ploop_pvd_header) / sizeof(u32)
+ */
+#define PLOOP_MAP_OFFSET	16
+
+/*
+ * The in-kernel ploop implementation assumes that L2[index] can never
+ * equal this value (this is guaranteed by the limitation on bdsize).
+ * So in-kernel ploop may encode L2[index] == 0 by this value and keep
+ * the zero value as a special one meaning "iblock is not allocated yet
+ * for the given index". User-space may use this value to denote
+ * uninitialized slots of the L2[] table.
+ */
+#define PLOOP_ZERO_INDEX	0xFFFFFFFFU
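+
+/* Illustrative user-space sketch (helper names hypothetical): scanning
+ * an L2 page while honouring both special values and the header slots:
+ *
+ *	__u32 *l2 = map_page;
+ *	for (i = first_page ? PLOOP_MAP_OFFSET : 0; i < slots; i++) {
+ *		if (l2[i] == 0)
+ *			continue;		// iblock not allocated yet
+ *		if (l2[i] == PLOOP_ZERO_INDEX)
+ *			continue;		// uninitialized/zero slot
+ *		use_iblock(l2[i]);		// hypothetical helper
+ *	}
+ */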
+
+struct ploop_ctl_chunk
+{
+	__s32	pctl_fd;	/* FD of backing file */
+	__u32	pctl_type;	/* IO engine */
+	__u32	pctl_flags;	/* Some modifiers, undefined now */
+	__u32	pctl_offset;	/* Starting cluster of this chunk in image */
+
+	__u64	pctl_start;	/* Position of data in file.  */
+	__u64	pctl_len;	/* Length of data area in file. */
+} __attribute__ ((aligned (8)));
+
+struct ploop_ctl
+{
+	/* Description of delta format */
+	__u32	pctl_format;
+	__u32	pctl_flags;
+	__u32	pctl_cluster_log;
+	__u32	pctl_size;
+
+	/* Description of backing files. */
+	__u16	pctl_chunks;
+	__u8	pctl_level;
+	__u8	__mbz1;
+	__u32	__mbz2;
+	struct ploop_ctl_chunk chunks[0];
+} __attribute__ ((aligned (8)));
+
+/* helper for ADD_DELTA */
+struct ploop_ctl_delta {
+	struct ploop_ctl c;
+	struct ploop_ctl_chunk f;
+};
+
+struct ploop_truncate_ctl
+{
+	int	fd;
+	__u32	alloc_head;
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+} __attribute__ ((aligned (8)));
+
+
+/*
+ * Before relocation l2[req_cluster] == old_iblk.
+ * Then user-space decides to relocate old_iblk to new_iblk.
+ * After the relocation is done, we need the kernel's help to update the
+ * map_node structure for req_cluster (if present). Once the kernel has
+ * accomplished this, user-space may safely nullify old_iblk.
+ */
+struct reloc_map
+{
+	__u32 req_cluster;
+	__u32 iblk;
+} __attribute__ ((aligned (8)));
+
+struct ploop_index_update_ctl
+{
+	__u32	n_maps;
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct reloc_map rmap[0];
+} __attribute__ ((aligned (8)));
+
+/*
+ * User-space found out that some blocks are not used
+ * and reports the list of them to the kernel. From then
+ * on, the kernel will use them as free blocks instead of
+ * the alloc_head++ technique.
+ */
+struct ploop_freeblks_ctl_extent
+{
+	__u32 clu;
+	__u32 iblk;
+	__u32 len;
+
+} __attribute__ ((aligned (8)));
+
+struct ploop_freeblks_ctl
+{
+	__u32	n_extents;
+	__u32	alloc_head; /* out */
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct ploop_freeblks_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
+
+struct ploop_relocblks_ctl_extent
+{
+	__u32 clu;
+	__u32 iblk;
+	__u32 len;
+	__u32 free; /* this extent is also present in freemap */
+} __attribute__ ((aligned (8)));
+
+struct ploop_relocblks_ctl
+{
+	__u32	n_extents;
+	__u32	n_scanned;  /* # bytes scanned */
+	__u32	alloc_head; /* in, for sanity check */
+	__u8	level;
+	__u8	__mbz1;
+	__u16	__mbz2;
+	struct ploop_relocblks_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
+
+struct ploop_balloon_ctl
+{
+	__u32	mntn_type;     /* see maintenance types enum below */
+	__u32	alloc_head;    /* frozen alloc_head */
+	__u8	level;	       /* top-level of ploop device */
+	__u8	inflate;       /* inflate/truncate flag */
+	__u8	keep_intact;   /* keep mntn state intact */
+	__u8	__mbz;
+} __attribute__ ((aligned (8)));
+
+struct ploop_getdevice_ctl
+{
+	__u32	minor;
+	__u32	__mbz1;
+} __attribute__ ((aligned (8)));
+
+struct ploop_push_backup_init_ctl
+{
+	__u8    cbt_uuid[16];
+	__u64	cbt_mask_addr; /* page-aligned space for CBT mask */
+} __attribute__ ((aligned (8)));
+
+struct ploop_push_backup_ctl_extent
+{
+	__u32 clu;
+	__u32 len;
+} __attribute__ ((aligned (8)));
+
+/* ploop_push_backup_io_ctl.direction */
+enum {
+	PLOOP_READ = 0, /* wait for requests */
+	PLOOP_WRITE,    /* ACK requests */
+	PLOOP_PEEK,     /* peek at what is to be backed up */
+};
+
+struct ploop_push_backup_io_ctl
+{
+	__u8    cbt_uuid[16];
+	__u32	direction;
+	__u32	n_extents;
+	struct ploop_push_backup_ctl_extent extents[0];
+} __attribute__ ((aligned (8)));
+
+struct ploop_push_backup_stop_ctl
+{
+	__u8    cbt_uuid[16];
+	__u32	status; /* for sanity: non-zero if pending or active queue is not empty */
+} __attribute__ ((aligned (8)));
+
+/* maintenance types */
+enum {
+	PLOOP_MNTN_OFF = 0,  /* no maintenance is in progress */
+	PLOOP_MNTN_BALLOON,  /* user-space started ballooning */
+	PLOOP_MNTN_FBLOADED, /* list of free-blocks loaded */
+	PLOOP_MNTN_SNAPSHOT, /* bdev is frozen due to snapshot */
+
+	PLOOP_MNTN_TRACK,    /* tracking is in progress */
+	PLOOP_MNTN_DISCARD,  /* ready to handle discard requests */
+
+	PLOOP_MNTN_NOFAST = 256,
+	/* all types below require the fast path to be disabled! */
+
+	PLOOP_MNTN_MERGE,    /* merge is in progress */
+	PLOOP_MNTN_GROW,     /* grow is in progress */
+	PLOOP_MNTN_RELOC,    /* relocation is in progress */
+	PLOOP_MNTN_PUSH_BACKUP, /* push backup is in progress */
+};
+
+/*
+ * This define should be in sync with enum above.
+ * NB: PLOOP_MNTN_TRACK is handled separately because
+ * READ-requests may go fast-path even while tracking.
+ */
+#define FAST_PATH_DISABLED(t) ((t) > PLOOP_MNTN_NOFAST)
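+
+/* Illustrative sketch: before taking the fast path, the driver would
+ * test the current maintenance type, e.g.:
+ *
+ *	if (FAST_PATH_DISABLED(plo->maintenance_type))
+ *		...;	// fall back to the normal state machine
+ */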
+
+#define PLOOPCTLTYPE	'P'
+
+/* Add delta. Device must be offline */
+#define PLOOP_IOC_ADD_DELTA	_IOW(PLOOPCTLTYPE, 0, struct ploop_ctl)
+
+/* Close images, free all data, return the device to initial state  */
+#define PLOOP_IOC_CLEAR		_IO(PLOOPCTLTYPE, 1)
+
+/* Stop/start device. */
+#define PLOOP_IOC_STOP		_IO(PLOOPCTLTYPE, 2)
+#define PLOOP_IOC_START		_IO(PLOOPCTLTYPE, 3)
+
+/* Make new snapshot on running device */
+#define PLOOP_IOC_SNAPSHOT	_IOW(PLOOPCTLTYPE, 4, struct ploop_ctl)
+
+/* Remove delta. Argument is delta level. */
+#define PLOOP_IOC_DEL_DELTA	_IOW(PLOOPCTLTYPE, 5, __u32)
+
+struct ploop_track_extent
+{
+	__u64	start;
+	__u64	end;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+};
+
+/* Start tracking of top delta image. */
+#define PLOOP_IOC_TRACK_INIT	_IOR(PLOOPCTLTYPE, 6, struct ploop_track_extent)
+
+/* Stop tracking of top delta image. It is the responsibility of the
+ * caller to quiesce the device before stopping tracking. The ioctl
+ * will fail if tracking was aborted or if not all dirty bits were read.
+ */
+#define PLOOP_IOC_TRACK_STOP	_IO(PLOOPCTLTYPE, 7)
+
+/* Abort tracker, clear the state */
+#define PLOOP_IOC_TRACK_ABORT	_IO(PLOOPCTLTYPE, 8)
+
+/* User -> ploop : transferred up to this position */
+#define PLOOP_IOC_TRACK_SETPOS	_IOW(PLOOPCTLTYPE, 9, __u64)
+
+/* ploop -> user: get modified bits */
+#define PLOOP_IOC_TRACK_READ	_IOR(PLOOPCTLTYPE, 10, struct ploop_track_extent)
+
+/* sync cacheable state of deltas to disk */
+#define PLOOP_IOC_SYNC		_IO(PLOOPCTLTYPE, 11)
+
+/* Merge top delta to lower one and delete it. */
+#define PLOOP_IOC_MERGE		_IO(PLOOPCTLTYPE, 12)
+
+/* Replace alive delta with equivalent one. */
+#define PLOOP_IOC_REPLACE_DELTA	_IOW(PLOOPCTLTYPE, 13, struct ploop_ctl)
+
+/* Truncate the image file of a delta. */
+#define PLOOP_IOC_TRUNCATE	_IOW(PLOOPCTLTYPE, 14, struct ploop_truncate_ctl)
+
+/* Update in-core copy of L2 table */
+#define PLOOP_IOC_UPDATE_INDEX  _IOW(PLOOPCTLTYPE, 16, struct ploop_index_update_ctl)
+
+/* Increase size of block device */
+#define PLOOP_IOC_GROW		_IOW(PLOOPCTLTYPE, 17, struct ploop_ctl)
+
+/* Inquire current state of free block extents */
+#define PLOOP_IOC_FBGET		_IOW(PLOOPCTLTYPE, 18, struct ploop_freeblks_ctl)
+
+/* Start ballooning, inquire maintenance_type, or flush stale BALLOON state */
+#define PLOOP_IOC_BALLOON	_IOW(PLOOPCTLTYPE, 19, struct ploop_balloon_ctl)
+
+/* Load free blocks to ploop */
+#define PLOOP_IOC_FREEBLKS      _IOW(PLOOPCTLTYPE, 20, struct ploop_freeblks_ctl)
+
+/* Load blocks to relocate and initiate relocation process */
+#define PLOOP_IOC_RELOCBLKS     _IOW(PLOOPCTLTYPE, 21, struct ploop_relocblks_ctl)
+
+/* Search ploop_device global tree for first unused minor number */
+#define PLOOP_IOC_GETDEVICE    _IOW(PLOOPCTLTYPE, 22, struct ploop_getdevice_ctl)
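+
+/* Illustrative user-space sketch (the device node used to issue the
+ * ioctl is an assumption of this example, not defined here):
+ *
+ *	struct ploop_getdevice_ctl ctl = {};
+ *	int fd = open("/dev/ploop0", O_RDONLY);	// any ploop node
+ *	if (fd >= 0 && ioctl(fd, PLOOP_IOC_GETDEVICE, &ctl) == 0)
+ *		printf("first unused minor: %u\n", ctl.minor);
+ */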
+
+/* Start handling discard requests */
+#define PLOOP_IOC_DISCARD_INIT _IO(PLOOPCTLTYPE, 23)
+/* Stop handling discard requests */
+#define PLOOP_IOC_DISCARD_FINI _IO(PLOOPCTLTYPE, 24)
+/* Wait for a discard request */
+#define PLOOP_IOC_DISCARD_WAIT _IO(PLOOPCTLTYPE, 25)
+
+/* Drop current state of free block extents */
+#define PLOOP_IOC_FBDROP	_IO(PLOOPCTLTYPE, 26)
+
+/* Filter extents with sizes less than arg */
+#define PLOOP_IOC_FBFILTER	_IOR(PLOOPCTLTYPE, 27, unsigned long)
+
+/* Set maximum size for the top delta. */
+#define PLOOP_IOC_MAX_DELTA_SIZE _IOW(PLOOPCTLTYPE, 28, __u64)
+
+/* Start push backup */
+#define PLOOP_IOC_PUSH_BACKUP_INIT _IOR(PLOOPCTLTYPE, 29, struct ploop_push_backup_init_ctl)
+
+/* Wait for push backup out-of-order requests; or ACK them */
+#define PLOOP_IOC_PUSH_BACKUP_IO _IOR(PLOOPCTLTYPE, 30, struct ploop_push_backup_io_ctl)
+
+/* Stop push backup */
+#define PLOOP_IOC_PUSH_BACKUP_STOP _IOR(PLOOPCTLTYPE, 31, struct ploop_push_backup_stop_ctl)
+
+/* Freeze FS mounted over ploop */
+#define PLOOP_IOC_FREEZE	_IO(PLOOPCTLTYPE, 32)
+
+/* Unfreeze FS mounted over ploop */
+#define PLOOP_IOC_THAW		_IO(PLOOPCTLTYPE, 33)
+
+/* Events exposed via /sys/block/ploopN/pstate/event */
+#define PLOOP_EVENT_ABORTED	1
+#define PLOOP_EVENT_STOPPED	2
+#define PLOOP_EVENT_ENOSPC	3
+
+#ifdef __KERNEL__
+
+#define PLOOP_INTERNAL_MAGIC	0x284cd32c
+struct ploop_xops
+{
+	__u32		magic;
+
+	int		(*get_extent)(struct inode *inode, sector_t isec,
+				      unsigned int nr, sector_t *start,
+				      sector_t *psec, int creat);
+};
+
+#define PLOOP_IOC_INTERNAL	_IOR(PLOOPCTLTYPE, 15, struct ploop_xops)
+
+#endif
+
+#endif /* __PLOOP_IF_H__ */
--- /dev/null
+++ b/include/linux/ploop/ploop_stat.h
@@ -0,0 +1,55 @@
+/*
+ *  include/linux/ploop/ploop_stat.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+__DO(bio_in)
+__DO(bio_fast)
+__DO(bio_full)
+__DO(bio_out)
+__DO(bio_alloc)
+__DO(bio_alloc_whole)
+__DO(bio_splits)
+__DO(coal_back)
+__DO(coal_forw)
+__DO(coal_back2)
+__DO(coal_forw2)
+__DO(coal_oback)
+__DO(coal_oforw)
+__DO(coal_mback)
+__DO(coal_mforw)
+__DO(coal_overlap)
+__DO(coal_flush)
+__DO(bio_barriers)
+__DO(bio_rzero)
+__DO(bio_wzero)
+__DO(bio_syncwait)
+__DO(bio_fsync)
+__DO(bio_cows)
+__DO(bio_whole_cows)
+__DO(merge_neg_cluster)
+__DO(merge_neg_disable)
+__DO(fast_neg_nomap)
+__DO(fast_neg_noem)
+__DO(fast_neg_shortem)
+__DO(fast_neg_backing)
+__DO(bio_lockouts)
+__DO(map_lockouts)
+__DO(merge_lockouts)
+__DO(map_reads)
+__DO(map_merges)
+__DO(map_single_writes)
+__DO(map_multi_writes)
+__DO(map_multi_updates)
+__DO(bio_trans_whole)
+__DO(bio_trans_copy)
+__DO(bio_trans_alloc)
+__DO(bio_trans_index)
+__DO(bio_flush_in)
+__DO(bio_fua_in)
+__DO(bio_flush_out)
+__DO(bio_fua_out)
+__DO(bio_flush_skip)
+
--- a/include/linux/posix_acl.h
+++ b/include/linux/posix_acl.h
@@ -95,6 +95,8 @@ extern struct posix_acl *get_posix_acl(struct inode *, int);
 extern int set_posix_acl(struct inode *, int, struct posix_acl *);
 
 #ifdef CONFIG_FS_POSIX_ACL
+extern int posix_acl_update_mode(struct inode *, umode_t *, struct posix_acl **);
+
 static inline struct posix_acl **acl_by_type(struct inode *inode, int type)
 {
 	switch (type) {
--- a/include/linux/printk.h
+++ b/include/linux/printk.h
@@ -43,10 +43,9 @@ extern int console_printk[];
 #define minimum_console_loglevel (console_printk[2])
 #define default_console_loglevel (console_printk[3])
 
-static inline void console_silent(void)
-{
-	console_loglevel = 0;
-}
+#define VE0_LOG		1
+#define VE_LOG		2
+#define VE_LOG_BOTH	(VE0_LOG | VE_LOG)
 
 static inline void console_verbose(void)
 {
@@ -109,6 +108,8 @@ static inline __printf(1, 2) __cold
 void early_printk(const char *s, ...) { }
 #endif
 
+struct ve_struct;
+
 #ifdef CONFIG_PRINTK
 asmlinkage __printf(5, 0)
 int vprintk_emit(int facility, int level,
@@ -126,6 +127,17 @@ asmlinkage int printk_emit(int facility, int level,
 asmlinkage __printf(1, 2) __cold
 int printk(const char *fmt, ...);
 
+asmlinkage __printf(2, 0)
+int ve_vprintk(int dst, const char *fmt, va_list args);
+
+asmlinkage __printf(2, 3) __cold
+int ve_printk(int dst, const char *fmt, ...);
+
+int ve_log_init(struct ve_struct *ve);
+void ve_log_destroy(struct ve_struct *ve);
+asmlinkage __printf(2, 3) __cold
+int ve_log_printk(struct ve_struct *ve, const char *s, ...);
+
 /*
  * Special printk facility for scheduler/timekeeping use only, _DO_NOT_USE_ !
  */
@@ -165,6 +177,30 @@ int printk(const char *s, ...)
 {
 	return 0;
 }
+static inline __printf(2, 0)
+int ve_vprintk(int dst, const char *s, va_list args)
+{
+	return 0;
+}
+static inline __printf(2, 3) __cold
+int ve_printk(int dst, const char *s, ...)
+{
+	return 0;
+}
+static inline
+int ve_log_init(struct ve_struct *ve)
+{
+	return 0;
+}
+static inline
+void ve_log_destroy(struct ve_struct *ve)
+{
+}
+static inline __printf(2, 3) __cold
+int ve_log_printk(struct ve_struct *ve, const char *s, ...)
+{
+	return 0;
+}
 static inline __printf(1, 2) __cold
 int printk_deferred(const char *s, ...)
 {
@@ -329,9 +365,21 @@ extern void dump_stack(void) __cold;
 	if (__ratelimit(&_rs))						\
 		printk(fmt, ##__VA_ARGS__);				\
 })
+
+#define ve_printk_ratelimited(dst, fmt, ...)				\
+({									\
+	static DEFINE_RATELIMIT_STATE(_rs,				\
+				      DEFAULT_RATELIMIT_INTERVAL,	\
+				      DEFAULT_RATELIMIT_BURST);		\
+									\
+	if (__ratelimit(&_rs))						\
+		ve_printk(dst, fmt, ##__VA_ARGS__);			\
+})
 #else
 #define printk_ratelimited(fmt, ...)					\
 	no_printk(fmt, ##__VA_ARGS__)
+#define ve_printk_ratelimited(dst, fmt, ...)				\
+	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define pr_emerg_ratelimited(fmt, ...)					\
@@ -349,6 +397,20 @@ extern void dump_stack(void) __cold;
 #define pr_info_ratelimited(fmt, ...)					\
 	printk_ratelimited(KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
 /* no pr_cont_ratelimited, don't do that... */
+#define ve_pr_emerg_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_EMERG pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_alert_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_ALERT pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_crit_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_CRIT pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_err_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_ERR pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_warn_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_notice_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_NOTICE pr_fmt(fmt), ##__VA_ARGS__)
+#define ve_pr_info_ratelimited(dst, fmt, ...)					\
+	ve_printk_ratelimited(dst, KERN_INFO pr_fmt(fmt), ##__VA_ARGS__)
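+
+/* Illustrative sketch: callers name the destination log explicitly,
+ * e.g. a rate-limited warning sent to both host and container logs:
+ *
+ *	ve_pr_warn_ratelimited(VE_LOG_BOTH,
+ *			       "resource limit hit on %s\n", name);
+ */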
 
 #if defined(DEBUG)
 #define pr_devel_ratelimited(fmt, ...)					\
@@ -388,6 +450,17 @@ extern void print_hex_dump(const char *level, const char *prefix_str,
 extern void print_hex_dump_bytes(const char *prefix_str, int prefix_type,
 				 const void *buf, size_t len);
 #endif /* defined(CONFIG_DYNAMIC_DEBUG) */
+
+extern int console_silence_loglevel;
+
+static inline void console_silent(void)
+{
+	if (console_loglevel > console_silence_loglevel) {
+		printk(KERN_EMERG "console shuts up ...\n");
+		console_loglevel = 0;
+	}
+}
+
 #else
 static inline void print_hex_dump(const char *level, const char *prefix_str,
 				  int prefix_type, int rowsize, int groupsize,
--- a/include/linux/proc_fs.h
+++ b/include/linux/proc_fs.h
@@ -7,15 +7,45 @@
 #include <linux/types.h>
 #include <linux/fs.h>
 
-struct proc_dir_entry;
+struct proc_dir_entry {
+	unsigned int low_ino;
+	umode_t mode;
+	nlink_t nlink;
+	kuid_t uid;
+	kgid_t gid;
+	loff_t size;
+	const struct inode_operations *proc_iops;
+	const struct file_operations *proc_fops;
+#ifdef __GENKSYMS__
+	struct proc_dir_entry *next, *parent, *subdir;
+#else
+	struct proc_dir_entry *parent;
+	struct rb_root subdir;
+	struct rb_node subdir_node;
+#endif
+	void *data;
+	atomic_t count;		/* use count */
+	atomic_t in_use;	/* number of callers into module in progress; */
+			/* negative -> it's going away RSN */
+	struct completion *pde_unload_completion;
+	struct list_head pde_openers;	/* who did ->open, but not ->release */
+	spinlock_t pde_unload_lock; /* proc_fops checks and pde_users bumps */
+	u8 namelen;
+	char name[];
+};
 
 #ifdef CONFIG_PROC_FS
 
 extern void proc_root_init(void);
 extern void proc_flush_task(struct task_struct *);
 
-extern struct proc_dir_entry *proc_symlink(const char *,
-		struct proc_dir_entry *, const char *);
+extern struct proc_dir_entry *proc_symlink_mode(const char *name, umode_t mode,
+			struct proc_dir_entry *parent, const char *dest);
+static inline struct proc_dir_entry *proc_symlink(const char *name,
+			struct proc_dir_entry *parent, const char *dest)
+{
+	return proc_symlink_mode(name, S_IRWXUGO, parent, dest);
+}
 extern struct proc_dir_entry *proc_mkdir(const char *, struct proc_dir_entry *);
 extern struct proc_dir_entry *proc_mkdir_data(const char *, umode_t,
 					      struct proc_dir_entry *, void *);
@@ -26,6 +56,9 @@ extern struct proc_dir_entry *proc_create_data(const char *, umode_t,
 					       struct proc_dir_entry *,
 					       const struct file_operations *,
 					       void *);
+extern struct proc_dir_entry *proc_net_create_data(const char *name,
+				umode_t mode, struct proc_dir_entry *parent,
+				const struct file_operations *fops, void *data);
 
 static inline struct proc_dir_entry *proc_create(
 	const char *name, umode_t mode, struct proc_dir_entry *parent,
@@ -50,6 +83,8 @@ static inline void proc_flush_task(struct task_struct *task)
 
 static inline struct proc_dir_entry *proc_symlink(const char *name,
 		struct proc_dir_entry *parent,const char *dest) { return NULL;}
+static inline struct proc_dir_entry *proc_symlink_mode(const char *name,
+	umode_t m, struct proc_dir_entry *p, const char *d) { return NULL; }
 static inline struct proc_dir_entry *proc_mkdir(const char *name,
 	struct proc_dir_entry *parent) {return NULL;}
 static inline struct proc_dir_entry *proc_mkdir_data(const char *name,
@@ -75,7 +110,7 @@ struct net;
 static inline struct proc_dir_entry *proc_net_mkdir(
 	struct net *net, const char *name, struct proc_dir_entry *parent)
 {
-	return proc_mkdir_data(name, 0, parent, net);
+	return proc_mkdir_data(name, S_ISGID|S_IRUGO|S_IXUGO, parent, net);
 }
 
 #endif /* _LINUX_PROC_FS_H */
--- a/include/linux/proc_ns.h
+++ b/include/linux/proc_ns.h
@@ -4,6 +4,7 @@
 #ifndef _LINUX_PROC_NS_H
 #define _LINUX_PROC_NS_H
 
+struct super_block;
 struct pid_namespace;
 struct nsproxy;
 
@@ -49,6 +50,8 @@ extern int proc_alloc_inum(unsigned int *pino);
 extern void proc_free_inum(unsigned int inum);
 extern bool proc_ns_inode(struct inode *inode);
 
+extern bool proc_in_container(struct super_block *sb);
+
 #else /* CONFIG_PROC_FS */
 
 static inline int pid_ns_prepare_proc(struct pid_namespace *ns) { return 0; }
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -33,6 +33,7 @@
 #define PT_TRACE_SECCOMP	PT_EVENT_FLAG(PTRACE_EVENT_SECCOMP)
 
 #define PT_EXITKILL		(PTRACE_O_EXITKILL << PT_OPT_FLAG_SHIFT)
+#define PT_SUSPEND_SECCOMP	(PTRACE_O_SUSPEND_SECCOMP << PT_OPT_FLAG_SHIFT)
 
 /* single stepping state bits (used on ARM and PA-RISC) */
 #define PT_SINGLESTEP_BIT	31
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -42,6 +42,10 @@ void inode_add_rsv_space(struct inode *inode, qsize_t number);
 void inode_claim_rsv_space(struct inode *inode, qsize_t number);
 void inode_sub_rsv_space(struct inode *inode, qsize_t number);
 void inode_reclaim_rsv_space(struct inode *inode, qsize_t number);
+qsize_t *inode_reserved_space(struct inode * inode);
+qsize_t inode_get_rsv_space(struct inode *inode);
+void inode_incr_space(struct inode *inode, qsize_t number, int reserve);
+void inode_decr_space(struct inode *inode, qsize_t number, int reserve);
 
 void dquot_initialize(struct inode *inode);
 void dquot_drop(struct inode *inode);
--- a/include/linux/radix-tree.h
+++ b/include/linux/radix-tree.h
@@ -81,6 +81,8 @@ static inline int radix_tree_is_indirect_ptr(void *ptr)
 #define RADIX_TREE_TAG_LONGS	\
 	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
 
+#define RADIX_ROOT_TAG_MASK	(((1<<RADIX_TREE_MAX_TAGS)-1) << __GFP_BITS_SHIFT)
+
 #define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
 #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \
 					  RADIX_TREE_MAP_SHIFT))
@@ -295,6 +297,10 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
 int radix_tree_tag_get(struct radix_tree_root *root,
 			unsigned long index, unsigned int tag);
+void __radix_tree_root_tag_move_all_to_prev(struct radix_tree_root *root);
+void __radix_tree_prev_tag_clear(struct radix_tree_root *root,
+				 unsigned int tag);
+int radix_tree_prev_tag_get(struct radix_tree_root *root, unsigned int tag);
 unsigned int
 radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
 		unsigned long first_index, unsigned int max_items,
--- a/include/linux/rmap.h
+++ b/include/linux/rmap.h
@@ -55,6 +55,7 @@ struct anon_vma {
 	RH_KABI_EXTEND(unsigned degree)
 
 	RH_KABI_EXTEND(struct anon_vma *parent)	/* Parent of this anon_vma */
+	struct user_beancounter *anon_vma_ub;
 };
 
 /*
@@ -178,7 +179,7 @@ void page_add_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
 void do_page_add_anon_rmap(struct page *, struct vm_area_struct *,
 			   unsigned long, int);
 void page_add_new_anon_rmap(struct page *, struct vm_area_struct *, unsigned long);
-void page_add_file_rmap(struct page *);
+void page_add_file_rmap(struct page *, struct mm_struct *);
 void page_remove_rmap(struct page *);
 
 void hugepage_add_anon_rmap(struct page *, struct vm_area_struct *,
@@ -244,15 +245,32 @@ int try_to_munlock(struct page *);
 /*
  * Called by memory-failure.c to kill processes.
  */
-struct anon_vma *page_lock_anon_vma_read(struct page *page);
-void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
+extern struct anon_vma *page_lock_anon_vma_read(struct page *page);
+extern void page_unlock_anon_vma_read(struct anon_vma *anon_vma);
 int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);
 
 /*
+ * rmap_walk_control: To control rmap traversal for specific needs
+ *
+ * arg: passed to rmap_one() and invalid_vma()
+ * rmap_one: executed on each vma where the page is mapped
+ * done: checks the traversal termination condition
+ * anon_lock: takes the anon_vma lock in an optimized way rather than
+ *            the default
+ * invalid_vma: skips vmas that are not of interest
+ */
+struct rmap_walk_control {
+	void *arg;
+	int (*rmap_one)(struct page *page, struct vm_area_struct *vma,
+					unsigned long addr, void *arg);
+	int (*done)(struct page *page);
+	struct anon_vma *(*anon_lock)(struct page *page);
+	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
+};
+
+/*
  * Called by migrate.c to remove migration ptes, but might be used more later.
  */
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg);
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc);
 
 #else	/* !CONFIG_MMU */
 
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -47,15 +47,19 @@ struct sched_param {
 #include <linux/resource.h>
 #include <linux/timer.h>
 #include <linux/hrtimer.h>
+#include <linux/kcov.h>
 #include <linux/task_io_accounting.h>
 #include <linux/latencytop.h>
 #include <linux/cred.h>
 #include <linux/llist.h>
 #include <linux/uidgid.h>
 #include <linux/gfp.h>
+#include <linux/ve_proto.h>
 
 #include <asm/processor.h>
 
+#include <bc/task.h>
+
 #define SCHED_ATTR_SIZE_VER0	48	/* sizeof first published struct */
 
 /*
@@ -147,6 +151,8 @@ struct filename;
  */
 extern unsigned long avenrun[];		/* Load averages */
 extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
+extern void get_avenrun_ve(unsigned long *loads,
+			unsigned long offset, int shift);
 
 #define FSHIFT		11		/* nr of bits of precision */
 #define FIXED_1		(1<<FSHIFT)	/* 1.0 as fixed-point */
@@ -160,16 +166,23 @@ extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 	load += n*(FIXED_1-exp); \
 	load >>= FSHIFT;
 
+#define LOAD_INT(x) ((x) >> FSHIFT)
+#define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
+
 extern unsigned long total_forks;
 extern int nr_threads;
 DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern bool single_task_running(void);
+extern unsigned long nr_sleeping(void);
+extern unsigned long nr_stopped(void);
+extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_iowait(void);
 extern unsigned long nr_iowait_cpu(int cpu);
-extern unsigned long this_cpu_load(void);
-
+extern unsigned long nr_active_cpu(void);
+extern atomic_t nr_dead;
+extern unsigned long nr_zombie;
 
 extern void calc_global_load(unsigned long ticks);
 extern void update_cpu_load_nohz(void);
@@ -213,6 +226,8 @@ print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
 #define TASK_PARKED		512
 #define TASK_STATE_MAX		1024
 
+#define __TASK_IOTHROTTLED	1024
+
 #define TASK_STATE_TO_CHAR_STR "RSDTtZXxKWP"
 
 extern char ___assert_task_state[1 - 2*!!(
@@ -240,6 +255,7 @@ extern char ___assert_task_state[1 - 2*!!(
 #define task_contributes_to_load(task)	\
 				((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
 				 (task->flags & PF_FROZEN) == 0)
+#define task_iothrottled(task)	((task->state & __TASK_IOTHROTTLED) != 0)
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -448,7 +464,9 @@ extern int get_dumpable(struct mm_struct *mm);
 					/* leave room for more dump flags */
 #define MMF_VM_MERGEABLE	16	/* KSM may merge identical pages */
 #define MMF_VM_HUGEPAGE		17	/* set when VM_HUGEPAGE is set on vma */
-#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
+/* This one-shot flag was dropped because exe has to be changed once
+ * again on NFS restore */
+//#define MMF_EXE_FILE_CHANGED	18	/* see prctl_set_mm_exe_file() */
 
 #define MMF_HAS_UPROBES		19	/* has uprobes */
 #define MMF_RECALC_UPROBES	20	/* MMF_HAS_UPROBES can be wrong */
@@ -978,6 +996,11 @@ struct sched_domain {
 	unsigned int alb_failed;
 	unsigned int alb_pushed;
 
+	/* cpulimit balancing */
+	unsigned int clb_count;
+	unsigned int clb_failed;
+	unsigned int clb_pushed;
+
 	/* SD_BALANCE_EXEC stats */
 	unsigned int sbe_count;
 	unsigned int sbe_balanced;
@@ -1111,6 +1134,7 @@ struct sched_statistics {
 	u64			nr_migrations_cold;
 	u64			nr_failed_migrations_affine;
 	u64			nr_failed_migrations_running;
+	u64			nr_failed_migrations_cpulimit;
 	u64			nr_failed_migrations_hot;
 	u64			nr_forced_migrations;
 
@@ -1130,8 +1154,14 @@ struct sched_entity {
 	struct load_weight	load;		/* for load-balancing */
 	struct rb_node		run_node;
 	struct list_head	group_node;
+	struct list_head	cfs_rq_node;
 	unsigned int		on_rq;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	unsigned int            boosted;
+	struct list_head        boost_node;
+#endif
+
 	u64			exec_start;
 	u64			sum_exec_runtime;
 	u64			vruntime;
@@ -1145,6 +1175,7 @@ struct sched_entity {
 	struct cfs_rq		*cfs_rq;
 	/* rq "owned" by this entity/group: */
 	struct cfs_rq		*my_q;
+	int			depth;
 #endif
 
 /*
@@ -1339,17 +1370,22 @@ struct task_struct {
 	/* Used for emulating ABI behavior of previous Linux versions */
 	unsigned int personality;
 
+	/* scheduler bits, serialized by rq lock: */
+	unsigned sched_reset_on_fork:1;
+	/* The two below are really protected by pi_lock, but they are
+	 * modified only where nobody else can modify other fields under
+	 * rq->lock */
+	unsigned sched_contributes_to_load:1;
+	unsigned sched_interruptible_sleep:1;
+	unsigned woken_while_running:1;
+	unsigned :0; /* force alignment to the next boundary */
+
+	/* unserialized, strictly 'current' */
 	unsigned did_exec:1;
 	unsigned in_execve:1;	/* Tell the LSMs that the process is doing an
 				 * execve */
 	unsigned in_iowait:1;
-
-	/* task may not gain privileges */
-	unsigned no_new_privs:1;
-
-	/* Revert to default priority/policy when forking */
-	unsigned sched_reset_on_fork:1;
-	unsigned sched_contributes_to_load:1;
+	unsigned no_new_privs:1; /* task may not gain privileges */
+	unsigned may_throttle:1;
 
 	pid_t pid;
 	pid_t tgid;
@@ -1644,6 +1680,9 @@ struct task_struct {
 	unsigned long timer_slack_ns;
 	unsigned long default_timer_slack_ns;
 
+#ifdef CONFIG_KASAN
+	unsigned int kasan_depth;
+#endif
 #if defined(CONFIG_FUNCTION_GRAPH_TRACER) && !defined(CONFIG_S390)
 	/* Index of current stored address in ret_stack */
 	int curr_ret_stack;
@@ -1665,13 +1704,23 @@ struct task_struct {
 	/* bitmask and counter of trace recursion */
 	unsigned long trace_recursion;
 #endif /* CONFIG_TRACING */
+#ifdef CONFIG_KCOV
+	/* Coverage collection mode enabled for this task (0 if disabled). */
+	enum kcov_mode kcov_mode;
+	/* Size of the kcov_area. */
+	unsigned	kcov_size;
+	/* Buffer for coverage collection. */
+	void		*kcov_area;
+	/* kcov descriptor wired with this task or NULL. */
+	struct kcov	*kcov;
+#endif
+#ifdef CONFIG_BEANCOUNTERS
+	struct task_beancounter task_bc;
+#endif
+#ifdef CONFIG_VE
+	struct ve_struct *task_ve;
+#endif
 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
-	struct memcg_batch_info {
-		int do_batch;	/* incremented when batch uncharge started */
-		struct mem_cgroup *memcg; /* target memcg of uncharge */
-		unsigned long nr_pages;	/* uncharged usage */
-		unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
-	} memcg_batch;
 	unsigned int memcg_kmem_skip_account;
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
@@ -1995,6 +2044,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut,
 #define PF_KTHREAD	0x00200000	/* I am a kernel thread */
 #define PF_RANDOMIZE	0x00400000	/* randomize virtual address space */
 #define PF_SWAPWRITE	0x00800000	/* Allowed to write to swap */
+#define PF_MEMCG_RECLAIM  0x01000000	/* We are in memcg reclaim */
 #define PF_NO_SETAFFINITY 0x04000000	/* Userland is not allowed to meddle with cpus_allowed */
 #define PF_MCE_EARLY    0x08000000      /* Early kill for mce process policy */
 #define PF_MEMPOLICY	0x10000000	/* Non-default NUMA mempolicy */
@@ -2355,6 +2405,8 @@ extern void __set_special_pids(struct pid *pid);
 
 /* per-UID process charging. */
 extern struct user_struct * alloc_uid(kuid_t);
+extern struct user_struct * alloc_uid_ns(struct user_namespace *ns, kuid_t);
+
 static inline struct user_struct *get_uid(struct user_struct *u)
 {
 	atomic_inc(&u->__count);
@@ -2607,8 +2659,10 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
 
 static inline struct task_struct *next_thread(const struct task_struct *p)
 {
-	return list_entry_rcu(p->thread_group.next,
+	struct task_struct *tsk;
+	tsk = list_entry_rcu(p->thread_group.next,
 			      struct task_struct, thread_group);
+	return tsk;
 }
 
 static inline int thread_group_empty(struct task_struct *p)
@@ -2839,6 +2893,13 @@ extern int _cond_resched(void);
 	_cond_resched();			\
 })
 
+extern int _cond_resched_may_throttle(void);
+
+#define cond_resched_may_throttle() ({		\
+	__might_sleep(__FILE__, __LINE__, 0);	\
+	_cond_resched_may_throttle();		\
+})
+
 extern int __cond_resched_lock(spinlock_t *lock);
 
 #ifdef CONFIG_PREEMPT_COUNT
@@ -3048,6 +3109,31 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_CFS_CPULIMIT
+extern unsigned int task_nr_cpus(struct task_struct *p);
+extern unsigned int task_vcpu_id(struct task_struct *p);
+extern unsigned int sysctl_sched_vcpu_hotslice;
+extern unsigned int sysctl_sched_cpulimit_scale_cpufreq;
+extern unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq);
+#else
+static inline unsigned int task_nr_cpus(struct task_struct *p)
+{
+	return num_online_cpus();
+}
+
+static inline unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p);
+}
+
+static inline unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	return freq;
+}
+#endif
+
+#define num_online_vcpus() task_nr_cpus(current)
+
 extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
 extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
 
--- a/include/linux/seccomp.h
+++ b/include/linux/seccomp.h
@@ -87,4 +87,15 @@ static inline void get_seccomp_filter(struct task_struct *tsk)
 	return;
 }
 #endif /* CONFIG_SECCOMP_FILTER */
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
+extern long seccomp_get_filter(struct task_struct *task,
+			       unsigned long filter_off, void __user *data);
+#else
+static inline long seccomp_get_filter(struct task_struct *task,
+				      unsigned long n, void __user *data)
+{
+	return -EINVAL;
+}
+#endif /* CONFIG_SECCOMP_FILTER && CONFIG_CHECKPOINT_RESTORE */
 #endif /* _LINUX_SECCOMP_H */
--- a/include/linux/shmem_fs.h
+++ b/include/linux/shmem_fs.h
@@ -23,6 +23,7 @@ struct shmem_inode_info {
 	struct list_head	swaplist;	/* chain of maybes on swap */
 	struct simple_xattrs	xattrs;		/* list of xattrs */
 	struct inode		vfs_inode;
+	struct user_beancounter	*shmi_ub;
 };
 
 struct shmem_sb_info {
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -4,39 +4,71 @@
 /*
  * This struct is used to pass information from page reclaim to the shrinkers.
  * We consolidate the values for easier extention later.
+ *
+ * The 'gfp_mask' refers to the allocation we are currently trying to
+ * fulfil.
  */
 struct shrink_control {
 	gfp_t gfp_mask;
 
-	/* How many slab objects shrinker() should scan and try to reclaim */
+	/*
+	 * How many objects scan_objects should scan and try to reclaim.
+	 * This is reset before every call, so it is safe for callees
+	 * to modify.
+	 */
 	unsigned long nr_to_scan;
+
+	/* current node being shrunk (for NUMA aware shrinkers) */
+	int nid;
+
+	/* current memcg being shrunk (for memcg aware shrinkers) */
+	struct mem_cgroup *memcg;
+
+	bool for_drop_caches;
 };
 
+#define SHRINK_STOP (~0UL)
 /*
  * A callback you can register to apply pressure to ageable caches.
  *
- * 'sc' is passed shrink_control which includes a count 'nr_to_scan'
- * and a 'gfpmask'.  It should look through the least-recently-used
- * 'nr_to_scan' entries and attempt to free them up.  It should return
- * the number of objects which remain in the cache.  If it returns -1, it means
- * it cannot do any scanning at this time (eg. there is a risk of deadlock).
+ * @count_objects should return the number of freeable items in the cache. If
+ * there are no objects to free or the number of freeable items cannot be
+ * determined, it should return 0. No deadlock checks should be done during the
+ * count callback - the shrinker relies on aggregating scan counts that couldn't
+ * be executed due to potential deadlocks to be run at a later call when the
+ * deadlock condition is no longer pending.
  *
- * The 'gfpmask' refers to the allocation we are currently trying to
- * fulfil.
+ * @scan_objects will only be called if @count_objects returned a non-zero
+ * value for the number of freeable objects. The callout should scan the cache
+ * and attempt to free items from the cache. It should then return the number
+ * of objects freed during the scan, or SHRINK_STOP if progress cannot be made
+ * due to potential deadlocks. If SHRINK_STOP is returned, then no further
+ * attempts to call @scan_objects will be made from the current reclaim
+ * context.
  *
- * Note that 'shrink' will be passed nr_to_scan == 0 when the VM is
- * querying the cache size, so a fastpath for that case is appropriate.
+ * @flags determine the shrinker's abilities, like NUMA awareness
  */
 struct shrinker {
-	int (*shrink)(struct shrinker *, struct shrink_control *sc);
+	unsigned long (*count_objects)(struct shrinker *,
+				       struct shrink_control *sc);
+	unsigned long (*scan_objects)(struct shrinker *,
+				      struct shrink_control *sc);
+
 	int seeks;	/* seeks to recreate an obj */
 	long batch;	/* reclaim batch size, 0 = default */
+	unsigned long flags;
 
 	/* These are for internal use */
 	struct list_head list;
-	atomic_long_t nr_in_batch; /* objs pending delete */
+	/* objs pending delete, per node */
+	atomic_long_t *nr_deferred;
 };
 #define DEFAULT_SEEKS 2 /* A good number if you don't know better. */
-extern void register_shrinker(struct shrinker *);
+
+/* Flags */
+#define SHRINKER_NUMA_AWARE	(1 << 0)
+#define SHRINKER_MEMCG_AWARE	(1 << 1)
+
+extern int register_shrinker(struct shrinker *);
 extern void unregister_shrinker(struct shrinker *);
 #endif
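
To make the new contract concrete, a minimal shrinker under the split
count/scan API might look as follows; all demo_* names are hypothetical,
and a real NUMA-aware cache would typically use list_lru and honour
sc->nid:

	struct demo_entry {
		struct list_head lru;
	};

	static LIST_HEAD(demo_lru);
	static DEFINE_MUTEX(demo_lock);
	static atomic_long_t demo_nr_objects;

	static unsigned long demo_count(struct shrinker *sh,
					struct shrink_control *sc)
	{
		/* report freeable objects; no deadlock checks here */
		return atomic_long_read(&demo_nr_objects);
	}

	static unsigned long demo_scan(struct shrinker *sh,
				       struct shrink_control *sc)
	{
		unsigned long freed = 0;

		if (!mutex_trylock(&demo_lock))
			return SHRINK_STOP;	/* cannot make progress now */

		while (freed < sc->nr_to_scan && !list_empty(&demo_lru)) {
			struct demo_entry *e = list_first_entry(&demo_lru,
						struct demo_entry, lru);
			list_del(&e->lru);
			kfree(e);
			atomic_long_dec(&demo_nr_objects);
			freed++;
		}
		mutex_unlock(&demo_lock);
		return freed;	/* objects actually freed this call */
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
	};

Note that register_shrinker() now returns an error code, so registration
must be checked, e.g. if (register_shrinker(&demo_shrinker)) the cache
should fail its own initialization.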
--- a/include/linux/signal.h
+++ b/include/linux/signal.h
@@ -17,6 +17,9 @@ struct sigqueue {
 	int flags;
 	siginfo_t info;
 	struct user_struct *user;
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *sig_ub;
+#endif
 };
 
 /* flags values. */
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -4,6 +4,8 @@
  * (C) SGI 2006, Christoph Lameter
  * 	Cleaned up and restructured to ease the addition of alternative
  * 	implementations of SLAB allocators.
+ * (C) Linux Foundation 2008-2013
+ *      Unified interface for all slab allocators
  */
 
 #ifndef _LINUX_SLAB_H
@@ -77,10 +79,16 @@
 #else
 # define SLAB_FAILSLAB		0x00000000UL
 #endif
+#ifdef CONFIG_MEMCG_KMEM
+# define SLAB_ACCOUNT		0x04000000UL	/* Account to memcg */
+#else
+# define SLAB_ACCOUNT		0x00000000UL
+#endif
 
 /* The following flags affect the page allocator grouping pages by mobility */
 #define SLAB_RECLAIM_ACCOUNT	0x00020000UL		/* Objects are reclaimable */
 #define SLAB_TEMPORARY		SLAB_RECLAIM_ACCOUNT	/* Objects are short-lived */
+
 /*
  * ZERO_SIZE_PTR will be returned for zero sized kmalloc requests.
  *
@@ -94,6 +102,8 @@
 #define ZERO_OR_NULL_PTR(x) ((unsigned long)(x) <= \
 				(unsigned long)ZERO_SIZE_PTR)
 
+#include <linux/kmemleak.h>
+#include <linux/kasan.h>
 
 struct mem_cgroup;
 /*
@@ -105,12 +115,12 @@ int slab_is_available(void);
 struct kmem_cache *kmem_cache_create(const char *, size_t, size_t,
 			unsigned long,
 			void (*)(void *));
-struct kmem_cache *
-kmem_cache_create_memcg(struct mem_cgroup *, const char *, size_t, size_t,
-			unsigned long, void (*)(void *), struct kmem_cache *);
 void kmem_cache_destroy(struct kmem_cache *);
 int kmem_cache_shrink(struct kmem_cache *);
-void kmem_cache_free(struct kmem_cache *, void *);
+
+void memcg_create_kmem_cache(struct mem_cgroup *, struct kmem_cache *);
+void memcg_deactivate_kmem_caches(struct mem_cgroup *);
+void memcg_destroy_kmem_caches(struct mem_cgroup *);
 
 /*
  * Please use this macro to create slab caches. Simply specify the
@@ -146,35 +156,6 @@ size_t ksize(const void *);
 #define ARCH_KMALLOC_MINALIGN __alignof__(unsigned long long)
 #endif
 
-#ifdef CONFIG_SLOB
-/*
- * Common fields provided in kmem_cache by all slab allocators
- * This struct is either used directly by the allocator (SLOB)
- * or the allocator must include definitions for all fields
- * provided in kmem_cache_common in their definition of kmem_cache.
- *
- * Once we can do anonymous structs (C11 standard) we could put a
- * anonymous struct definition in these allocators so that the
- * separate allocations in the kmem_cache structure of SLAB and
- * SLUB is no longer needed.
- */
-struct kmem_cache {
-	unsigned int object_size;/* The original size of the object */
-	unsigned int size;	/* The aligned/padded/added on size  */
-	unsigned int align;	/* Alignment as calculated */
-	unsigned long flags;	/* Active flags on the slab */
-	const char *name;	/* Slab name for sysfs */
-	int refcount;		/* Use counter */
-	void (*ctor)(void *);	/* Called on object slot creation */
-	struct list_head list;	/* List of all slab caches on the system */
-};
-
-#define KMALLOC_MAX_SIZE (1UL << 30)
-
-#include <linux/slob_def.h>
-
-#else /* CONFIG_SLOB */
-
 /*
  * Kmalloc array related definitions
  */
@@ -195,7 +176,9 @@ struct kmem_cache {
 #ifndef KMALLOC_SHIFT_LOW
 #define KMALLOC_SHIFT_LOW	5
 #endif
-#else
+#endif
+
+#ifdef CONFIG_SLUB
 /*
  * SLUB allocates up to order 2 pages directly and otherwise
  * passes the request to the page allocator.
@@ -207,6 +190,19 @@ struct kmem_cache {
 #endif
 #endif
 
+#ifdef CONFIG_SLOB
+/*
+ * SLOB passes all page size and larger requests to the page allocator.
+ * No kmalloc array is necessary since objects of different sizes can
+ * be allocated from the same page.
+ */
+#define KMALLOC_SHIFT_MAX	30
+#define KMALLOC_SHIFT_HIGH	PAGE_SHIFT
+#ifndef KMALLOC_SHIFT_LOW
+#define KMALLOC_SHIFT_LOW	3
+#endif
+#endif
+
 /* Maximum allocatable size */
 #define KMALLOC_MAX_SIZE	(1UL << KMALLOC_SHIFT_MAX)
 /* Maximum size for which we actually use a slab cache */
@@ -221,6 +217,7 @@ struct kmem_cache {
 #define KMALLOC_MIN_SIZE (1 << KMALLOC_SHIFT_LOW)
 #endif
 
+#ifndef CONFIG_SLOB
 extern struct kmem_cache *kmalloc_caches[KMALLOC_SHIFT_HIGH + 1];
 #ifdef CONFIG_ZONE_DMA
 extern struct kmem_cache *kmalloc_dma_caches[KMALLOC_SHIFT_HIGH + 1];
@@ -275,15 +272,112 @@ static __always_inline int kmalloc_index(size_t size)
 	/* Will never be reached. Needed because the compiler may complain */
 	return -1;
 }
+#endif /* !CONFIG_SLOB */
 
-#ifdef CONFIG_SLAB
-#include <linux/slab_def.h>
-#elif defined(CONFIG_SLUB)
-#include <linux/slub_def.h>
+void *__kmalloc(size_t size, gfp_t flags);
+void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags);
+void kmem_cache_free(struct kmem_cache *, void *);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t flags, int node);
+void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 #else
-#error "Unknown slab allocator"
+static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
+{
+	return __kmalloc(size, flags);
+}
+
+static __always_inline void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t flags, int node)
+{
+	return kmem_cache_alloc(s, flags);
+}
 #endif
 
+#ifdef CONFIG_TRACING
+extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
+
+#ifdef CONFIG_NUMA
+extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
+					   gfp_t gfpflags,
+					   int node, size_t size);
+#else
+static __always_inline void *
+kmem_cache_alloc_node_trace(struct kmem_cache *s,
+			      gfp_t gfpflags,
+			      int node, size_t size)
+{
+	return kmem_cache_alloc_trace(s, gfpflags, size);
+}
+#endif /* CONFIG_NUMA */
+
+#else /* CONFIG_TRACING */
+static __always_inline void *kmem_cache_alloc_trace(struct kmem_cache *s,
+		gfp_t flags, size_t size)
+{
+	void *ret = kmem_cache_alloc(s, flags);
+
+	kasan_kmalloc(s, ret, size);
+	return ret;
+}
+
+static __always_inline void *
+kmem_cache_alloc_node_trace(struct kmem_cache *s,
+			      gfp_t gfpflags,
+			      int node, size_t size)
+{
+	void *ret = kmem_cache_alloc_node(s, gfpflags, node);
+
+	kasan_kmalloc(s, ret, size);
+	return ret;
+}
+#endif /* CONFIG_TRACING */
+
+extern void *kmalloc_order(size_t size, gfp_t flags, unsigned int order);
+
+#ifdef CONFIG_TRACING
+extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
+#else
+static __always_inline void *
+kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+	return kmalloc_order(size, flags, order);
+}
+#endif
+
+static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
+{
+	unsigned int order = get_order(size);
+	return kmalloc_order_trace(size, flags, order);
+}
+
+/**
+ * kmalloc - allocate memory
+ * @size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate (see kcalloc).
+ *
+ * kmalloc is the normal method of allocating memory
+ * for objects smaller than page size in the kernel.
+ */
+static __always_inline void *kmalloc(size_t size, gfp_t flags)
+{
+	if (__builtin_constant_p(size)) {
+		if (size > KMALLOC_MAX_CACHE_SIZE)
+			return kmalloc_large(size, flags);
+#ifndef CONFIG_SLOB
+		if (!(flags & GFP_DMA)) {
+			int index = kmalloc_index(size);
+
+			if (!index)
+				return ZERO_SIZE_PTR;
+
+			return kmem_cache_alloc_trace(kmalloc_caches[index],
+					flags, size);
+		}
+#endif
+	}
+	return __kmalloc(size, flags);
+}
+
 /*
  * Determine size used for the nth kmalloc cache.
  * return size or 0 if a kmalloc cache for that
@@ -291,6 +385,7 @@ static __always_inline int kmalloc_index(size_t size)
  */
 static __always_inline int kmalloc_size(int n)
 {
+#ifndef CONFIG_SLOB
 	if (n > 2)
 		return 1 << n;
 
@@ -299,10 +394,26 @@ static __always_inline int kmalloc_size(int n)
 
 	if (n == 2 && KMALLOC_MIN_SIZE <= 64)
 		return 192;
-
+#endif
 	return 0;
 }
-#endif /* !CONFIG_SLOB */
+
+static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
+{
+#ifndef CONFIG_SLOB
+	if (__builtin_constant_p(size) &&
+		size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
+		int i = kmalloc_index(size);
+
+		if (!i)
+			return ZERO_SIZE_PTR;
+
+		return kmem_cache_alloc_node_trace(kmalloc_caches[i],
+						flags, node, size);
+	}
+#endif
+	return __kmalloc_node(size, flags, node);
+}
 
 /*
  * Setting ARCH_SLAB_MINALIGN in arch headers allows a different alignment.
@@ -312,49 +423,42 @@ static __always_inline int kmalloc_size(int n)
 #ifndef ARCH_SLAB_MINALIGN
 #define ARCH_SLAB_MINALIGN __alignof__(unsigned long long)
 #endif
+
+struct memcg_cache_array {
+	struct rcu_head rcu;
+	struct kmem_cache *entries[0];
+};
+
 /*
  * This is the main placeholder for memcg-related information in kmem caches.
- * struct kmem_cache will hold a pointer to it, so the memory cost while
- * disabled is 1 pointer. The runtime cost while enabled, gets bigger than it
- * would otherwise be if that would be bundled in kmem_cache: we'll need an
- * extra pointer chase. But the trade off clearly lays in favor of not
- * penalizing non-users.
- *
  * Both the root cache and the child caches will have it. For the root cache,
  * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system.
+ * information about the currently limited memcgs in the system. To allow the
+ * array to be accessed without taking any locks, on relocation we free the old
+ * version only after a grace period.
  *
  * Child caches will hold extra metadata needed for its operation. Fields are:
  *
  * @memcg: pointer to the memcg this cache belongs to
- * @list: list_head for the list of all caches in this memcg
  * @root_cache: pointer to the global, root cache, this cache was derived from
- * @dead: set to true after the memcg dies; the cache may still be around.
- * @nr_pages: number of pages that belongs to this cache.
- * @destroy: worker to be called whenever we are ready, or believe we may be
- *           ready, to destroy this cache.
+ *
+ * Both root and child caches of the same kind are linked into a list chained
+ * through @list.
  */
 struct memcg_cache_params {
 	bool is_root_cache;
+	struct list_head list;
 	union {
-		struct kmem_cache *memcg_caches[0];
+		struct memcg_cache_array __rcu *memcg_caches;
 		struct {
 			struct mem_cgroup *memcg;
-			struct list_head list;
 			struct kmem_cache *root_cache;
-			bool dead;
-			atomic_t nr_pages;
-			struct work_struct destroy;
 		};
 	};
 };
 
 int memcg_update_all_caches(int num_memcgs);
 
-struct seq_file;
-int cache_show(struct kmem_cache *s, struct seq_file *m);
-void print_slabinfo_header(struct seq_file *m);
-
 /**
  * kmalloc_array - allocate memory for an array.
  * @n: number of elements.
@@ -434,36 +538,6 @@ static inline void *kcalloc(size_t n, size_t size, gfp_t flags)
 void kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
 int kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 
-#if !defined(CONFIG_NUMA) && !defined(CONFIG_SLOB)
-/**
- * kmalloc_node - allocate memory from a specific node
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kcalloc).
- * @node: node to allocate from.
- *
- * kmalloc() for non-local nodes, used to allocate from a specific node
- * if available. Equivalent to kmalloc() in the non-NUMA single-node
- * case.
- */
-static inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return kmalloc(size, flags);
-}
-
-static inline void *__kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return __kmalloc(size, flags);
-}
-
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
-
-static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep,
-					gfp_t flags, int node)
-{
-	return kmem_cache_alloc(cachep, flags);
-}
-#endif /* !CONFIG_NUMA && !CONFIG_SLOB */
-
 /*
  * kmalloc_track_caller is a special version of kmalloc that records the
  * calling function of the routine calling it for slab leak tracking instead
@@ -540,14 +614,7 @@ static inline void *kzalloc_node(size_t size, gfp_t flags, int node)
 	return kmalloc_node(size, flags | __GFP_ZERO, node);
 }
 
-/*
- * Determine the size of a slab object
- */
-static inline unsigned int kmem_cache_size(struct kmem_cache *s)
-{
-	return s->object_size;
-}
-
+unsigned int kmem_cache_size(struct kmem_cache *s);
 void __init kmem_cache_init_late(void);
 
 #endif	/* _LINUX_SLAB_H */
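
As a usage sketch of the pieces above: a cache created with SLAB_ACCOUNT
gets its objects charged to the allocating task's memcg, and a
constant-size kmalloc() is resolved at compile time to a kmalloc_caches[]
slot. struct demo_obj and demo_init() are hypothetical:

	struct demo_obj { int payload; };
	static struct kmem_cache *demo_cachep;

	static int __init demo_init(void)
	{
		void *p;

		/* objects of this cache are accounted to the memcg */
		demo_cachep = kmem_cache_create("demo_obj",
						sizeof(struct demo_obj), 0,
						SLAB_ACCOUNT, NULL);
		if (!demo_cachep)
			return -ENOMEM;

		/* constant size: becomes kmem_cache_alloc_trace() on
		 * kmalloc_caches[kmalloc_index(64)] in non-SLOB builds */
		p = kmalloc(64, GFP_KERNEL);
		kfree(p);
		return 0;
	}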
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -5,20 +5,6 @@
 
 /*
  * Definitions unique to the original Linux SLAB allocator.
- *
- * What we provide here is a way to optimize the frequent kmalloc
- * calls in the kernel by selecting the appropriate general cache
- * if kmalloc was called with a size that can be established at
- * compile time.
- */
-
-#include <linux/init.h>
-#include <linux/compiler.h>
-
-/*
- * struct kmem_cache
- *
- * manages a cache.
  */
 
 struct kmem_cache {
@@ -82,7 +68,7 @@ struct kmem_cache {
 	int obj_offset;
 #endif /* CONFIG_DEBUG_SLAB */
 #ifdef CONFIG_MEMCG_KMEM
-	struct memcg_cache_params *memcg_params;
+	struct memcg_cache_params memcg_params;
 #endif
 
 /* 6) per-cpu/per-node data, touched during every alloc/free */
@@ -104,96 +90,4 @@ struct kmem_cache {
 	 */
 };
 
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
-void *__kmalloc(size_t size, gfp_t flags);
-
-#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_trace(struct kmem_cache *, gfp_t, size_t);
-#else
-static __always_inline void *
-kmem_cache_alloc_trace(struct kmem_cache *cachep, gfp_t flags, size_t size)
-{
-	return kmem_cache_alloc(cachep, flags);
-}
-#endif
-
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
-{
-	struct kmem_cache *cachep;
-	void *ret;
-
-	if (__builtin_constant_p(size)) {
-		int i;
-
-		if (!size)
-			return ZERO_SIZE_PTR;
-
-		if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE))
-			return NULL;
-
-		i = kmalloc_index(size);
-
-#ifdef CONFIG_ZONE_DMA
-		if (flags & GFP_DMA)
-			cachep = kmalloc_dma_caches[i];
-		else
-#endif
-			cachep = kmalloc_caches[i];
-
-		ret = kmem_cache_alloc_trace(cachep, flags, size);
-
-		return ret;
-	}
-	return __kmalloc(size, flags);
-}
-
-#ifdef CONFIG_NUMA
-extern void *__kmalloc_node(size_t size, gfp_t flags, int node);
-extern void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
-
-#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
-					 gfp_t flags,
-					 int nodeid,
-					 size_t size);
-#else
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *cachep,
-			    gfp_t flags,
-			    int nodeid,
-			    size_t size)
-{
-	return kmem_cache_alloc_node(cachep, flags, nodeid);
-}
-#endif
-
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	struct kmem_cache *cachep;
-
-	if (__builtin_constant_p(size)) {
-		int i;
-
-		if (!size)
-			return ZERO_SIZE_PTR;
-
-		if (WARN_ON_ONCE(size > KMALLOC_MAX_SIZE))
-			return NULL;
-
-		i = kmalloc_index(size);
-
-#ifdef CONFIG_ZONE_DMA
-		if (flags & GFP_DMA)
-			cachep = kmalloc_dma_caches[i];
-		else
-#endif
-			cachep = kmalloc_caches[i];
-
-		return kmem_cache_alloc_node_trace(cachep, flags, node, size);
-	}
-	return __kmalloc_node(size, flags, node);
-}
-
-#endif	/* CONFIG_NUMA */
-
 #endif	/* _LINUX_SLAB_DEF_H */
--- a/include/linux/slob_def.h
+++ /dev/null
@@ -1,39 +0,0 @@
-#ifndef __LINUX_SLOB_DEF_H
-#define __LINUX_SLOB_DEF_H
-
-#include <linux/numa.h>
-
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
-
-static __always_inline void *kmem_cache_alloc(struct kmem_cache *cachep,
-					      gfp_t flags)
-{
-	return kmem_cache_alloc_node(cachep, flags, NUMA_NO_NODE);
-}
-
-void *__kmalloc_node(size_t size, gfp_t flags, int node);
-
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	return __kmalloc_node(size, flags, node);
-}
-
-/**
- * kmalloc - allocate memory
- * @size: how many bytes of memory are required.
- * @flags: the type of memory to allocate (see kcalloc).
- *
- * kmalloc is the normal method of allocating memory
- * in the kernel.
- */
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
-{
-	return __kmalloc_node(size, flags, NUMA_NO_NODE);
-}
-
-static __always_inline void *__kmalloc(size_t size, gfp_t flags)
-{
-	return kmalloc(size, flags);
-}
-
-#endif /* __LINUX_SLOB_DEF_H */
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -6,14 +6,8 @@
  *
  * (C) 2007 SGI, Christoph Lameter
  */
-#include <linux/types.h>
-#include <linux/gfp.h>
-#include <linux/bug.h>
-#include <linux/workqueue.h>
 #include <linux/kobject.h>
 
-#include <linux/kmemleak.h>
-
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
 	ALLOC_SLOWPATH,		/* Allocation by getting a new cpu slab */
@@ -91,8 +85,11 @@ struct kmem_cache {
 	struct kobject kobj;	/* For sysfs */
 #endif
 #ifdef CONFIG_MEMCG_KMEM
-	struct memcg_cache_params *memcg_params;
+	struct memcg_cache_params memcg_params;
 	int max_attr_size; /* for propagation, maximum size of a stored attr */
+#ifdef CONFIG_SYSFS
+	struct kset *memcg_kset;
+#endif
 #endif
 
 #ifdef CONFIG_NUMA
@@ -104,20 +101,6 @@ struct kmem_cache {
 	struct kmem_cache_node *node[MAX_NUMNODES];
 };
 
-void *kmem_cache_alloc(struct kmem_cache *, gfp_t);
-void *__kmalloc(size_t size, gfp_t flags);
-
-static __always_inline void *
-kmalloc_order(size_t size, gfp_t flags, unsigned int order)
-{
-	void *ret;
-
-	flags |= (__GFP_COMP | __GFP_KMEMCG);
-	ret = (void *) __get_free_pages(flags, order);
-	kmemleak_alloc(ret, size, 1, flags);
-	return ret;
-}
-
 /**
  * Calling this on allocated memory will check that the memory
  * is expected to be in use, and print warnings if not.
@@ -131,81 +114,32 @@ static inline bool verify_mem_not_deleted(const void *x)
 }
 #endif
 
-#ifdef CONFIG_TRACING
-extern void *
-kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size);
-extern void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order);
+#ifdef CONFIG_SYSFS
+#define SLAB_SUPPORTS_SYSFS
+void sysfs_slab_remove(struct kmem_cache *);
 #else
-static __always_inline void *
-kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
-{
-	return kmem_cache_alloc(s, gfpflags);
-}
-
-static __always_inline void *
-kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+static inline void sysfs_slab_remove(struct kmem_cache *s)
 {
-	return kmalloc_order(size, flags, order);
 }
 #endif
 
-static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
-{
-	unsigned int order = get_order(size);
-	return kmalloc_order_trace(size, flags, order);
-}
-
-static __always_inline void *kmalloc(size_t size, gfp_t flags)
-{
-	if (__builtin_constant_p(size)) {
-		if (size > KMALLOC_MAX_CACHE_SIZE)
-			return kmalloc_large(size, flags);
-
-		if (!(flags & GFP_DMA)) {
-			int index = kmalloc_index(size);
-
-			if (!index)
-				return ZERO_SIZE_PTR;
-
-			return kmem_cache_alloc_trace(kmalloc_caches[index],
-					flags, size);
-		}
-	}
-	return __kmalloc(size, flags);
-}
-
-#ifdef CONFIG_NUMA
-void *__kmalloc_node(size_t size, gfp_t flags, int node);
-void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node);
 
-#ifdef CONFIG_TRACING
-extern void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
-					   gfp_t gfpflags,
-					   int node, size_t size);
-#else
-static __always_inline void *
-kmem_cache_alloc_node_trace(struct kmem_cache *s,
-			      gfp_t gfpflags,
-			      int node, size_t size)
+/**
+ * virt_to_obj - returns the address of the beginning of an object.
+ * @s: object's kmem_cache
+ * @slab_page: address of the slab page
+ * @x: address within the object memory range
+ *
+ * Returns the address of the beginning of the object.
+ */
+static inline void *virt_to_obj(struct kmem_cache *s,
+				const void *slab_page,
+				const void *x)
 {
-	return kmem_cache_alloc_node(s, gfpflags, node);
+	return (void *)x - ((x - slab_page) % s->size);
 }
-#endif
-
-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
-{
-	if (__builtin_constant_p(size) &&
-		size <= KMALLOC_MAX_CACHE_SIZE && !(flags & GFP_DMA)) {
-		int index = kmalloc_index(size);
-
-		if (!index)
-			return ZERO_SIZE_PTR;
 
-		return kmem_cache_alloc_node_trace(kmalloc_caches[index],
-			       flags, node, size);
-	}
-	return __kmalloc_node(size, flags, node);
-}
-#endif
+void object_err(struct kmem_cache *s, struct page *page,
+		u8 *object, char *reason);
 
 #endif /* _LINUX_SLUB_DEF_H */
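
A short sketch of how a debugging consumer such as KASAN might combine
the two helpers above; demo_report() and its arguments are assumptions:

	static void demo_report(struct kmem_cache *s, struct page *page,
				const void *addr)
	{
		/* addr points somewhere inside an object on this slab
		 * page; recover the object's base and report it */
		void *object = virt_to_obj(s, page_address(page), addr);

		object_err(s, page, object, "demo: invalid access");
	}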
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -305,6 +305,15 @@ struct ucred {
 /* IPX options */
 #define IPX_TYPE	1
 
+#define MAX_SOCK_ADDR  128             /* 108 for Unix domain -
+					  16 for IP, 16 for IPX,
+					  24 for IPv6,
+					  about 80 for AX.25
+					  must be at least one bigger than
+					  the AF_UNIX size (see net/unix/af_unix.c
+					  :unix_mkname()).
+					*/
+
 extern void cred_to_ucred(struct pid *pid, const struct cred *cred, struct ucred *ucred);
 
 extern int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -241,9 +241,7 @@ struct swap_info_struct {
 void *workingset_eviction(struct address_space *mapping, struct page *page);
 bool workingset_refault(void *shadow);
 void workingset_activation(struct page *page);
-
-void workingset_remember_node(struct radix_tree_node *node);
-void workingset_forget_node(struct radix_tree_node *node);
+extern struct list_lru workingset_shadow_nodes;
 
 static inline unsigned int workingset_node_pages(struct radix_tree_node *node)
 {
@@ -317,12 +315,15 @@ static inline void lru_cache_add_file(struct page *page)
 	ClearPageActive(page);
 	__lru_cache_add(page);
 }
+extern void lru_cache_add_active_or_unevictable(struct page *page,
+						struct vm_area_struct *vma);
 
 /* linux/mm/vmscan.c */
 extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 					gfp_t gfp_mask, nodemask_t *mask);
 extern int __isolate_lru_page(struct page *page, isolate_mode_t mode);
 extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem,
+						  unsigned long nr_pages,
 						  gfp_t gfp_mask, bool noswap);
 extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
 						gfp_t gfp_mask, bool noswap,
@@ -346,6 +347,10 @@ static inline int zone_reclaim(struct zone *z, gfp_t mask, unsigned int order)
 }
 #endif
 
+#ifdef CONFIG_MEMCG
+extern int sysctl_force_scan_thresh;
+#endif
+
 extern int page_evictable(struct page *page);
 extern void check_move_unevictable_pages(struct page **, int nr_pages);
 
@@ -360,9 +365,13 @@ static inline int mem_cgroup_swappiness(struct mem_cgroup *mem)
 }
 #endif
 #ifdef CONFIG_MEMCG_SWAP
-extern void mem_cgroup_uncharge_swap(swp_entry_t ent);
+extern void mem_cgroup_swapout(struct page *page, swp_entry_t entry);
+extern void mem_cgroup_uncharge_swap(swp_entry_t entry);
 #else
-static inline void mem_cgroup_uncharge_swap(swp_entry_t ent)
+static inline void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+}
+static inline void mem_cgroup_uncharge_swap(swp_entry_t entry)
 {
 }
 #endif
@@ -422,7 +431,7 @@ extern void swap_shmem_alloc(swp_entry_t);
 extern int swap_duplicate(swp_entry_t);
 extern int swapcache_prepare(swp_entry_t);
 extern void swap_free(swp_entry_t);
-extern void swapcache_free(swp_entry_t, struct page *page);
+extern void swapcache_free(swp_entry_t);
 extern int free_swap_and_cache(swp_entry_t);
 extern int swap_type_of(dev_t, sector_t, struct block_device **);
 extern unsigned int count_swap_pages(int, int);
@@ -485,7 +494,7 @@ static inline void swap_free(swp_entry_t swp)
 {
 }
 
-static inline void swapcache_free(swp_entry_t swp, struct page *page)
+static inline void swapcache_free(swp_entry_t swp)
 {
 }
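
The nr_pages argument added to try_to_free_mem_cgroup_pages() above lets
callers name an explicit reclaim target instead of relying on an implicit
per-call batch. A hedged sketch of a charge path probing for progress
(demo_reclaim() is hypothetical):

	static bool demo_reclaim(struct mem_cgroup *memcg,
				 unsigned long nr_pages)
	{
		return try_to_free_mem_cgroup_pages(memcg, nr_pages,
						    GFP_KERNEL, false) > 0;
	}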
 
--- a/include/linux/swapops.h
+++ b/include/linux/swapops.h
@@ -54,7 +54,7 @@ static inline pgoff_t swp_offset(swp_entry_t entry)
 /* check whether a pte points to a swap entry */
 static inline int is_swap_pte(pte_t pte)
 {
-	return !pte_none(pte) && !pte_present(pte) && !pte_file(pte);
+	return !pte_none(pte) && !pte_present(pte);
 }
 #endif
 
@@ -66,7 +66,6 @@ static inline swp_entry_t pte_to_swp_entry(pte_t pte)
 {
 	swp_entry_t arch_entry;
 
-	BUG_ON(pte_file(pte));
 	if (pte_swp_soft_dirty(pte))
 		pte = pte_swp_clear_soft_dirty(pte);
 	arch_entry = __pte_to_swp_entry(pte);
@@ -82,7 +81,6 @@ static inline pte_t swp_entry_to_pte(swp_entry_t entry)
 	swp_entry_t arch_entry;
 
 	arch_entry = __swp_entry(swp_type(entry), swp_offset(entry));
-	BUG_ON(pte_file(__swp_entry_to_pte(arch_entry)));
 	return __swp_entry_to_pte(arch_entry);
 }
 
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -58,6 +58,15 @@ extern int proc_doulongvec_ms_jiffies_minmax(struct ctl_table *table, int,
 extern int proc_do_large_bitmap(struct ctl_table *, int,
 				void __user *, size_t *, loff_t *);
 
+extern int proc_dointvec_virtual(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int proc_doulongvec_minmax_virtual(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int proc_dointvec_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+extern int proc_dostring_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos);
+
 /*
  * Register a set of sysctl names by calling register_sysctl_table
  * with an initialised array of struct ctl_table's.  An entry with 
@@ -166,6 +175,8 @@ struct ctl_path {
 	const char *procname;
 };
 
+extern int ve_allow_module_load;
+
 #ifdef CONFIG_SYSCTL
 
 void proc_sys_poll_notify(struct ctl_table_poll *poll);
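
The virtualized/immutable proc handlers declared above plug into an
ordinary ctl_table. A minimal sketch (demo_value and the table are
hypothetical; proc_dointvec_virtual is assumed to behave like
proc_dointvec while presenting a per-container view):

	static int demo_value;

	static struct ctl_table demo_table[] = {
		{
			.procname	= "demo_value",
			.data		= &demo_value,
			.maxlen		= sizeof(int),
			.mode		= 0644,
			.proc_handler	= proc_dointvec_virtual,
		},
		{ }
	};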
--- a/include/linux/task_io_accounting_ops.h
+++ b/include/linux/task_io_accounting_ops.h
@@ -5,10 +5,13 @@
 #define __TASK_IO_ACCOUNTING_OPS_INCLUDED
 
 #include <linux/sched.h>
+#include <bc/io_acct.h>
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
+
 static inline void task_io_account_read(size_t bytes)
 {
+	ub_io_account_read(bytes);
 	current->ioac.read_bytes += bytes;
 }
 
@@ -23,6 +26,12 @@ static inline unsigned long task_io_get_inblock(const struct task_struct *p)
 
 static inline void task_io_account_write(size_t bytes)
 {
+	ub_io_account_write(bytes);
+	current->ioac.write_bytes += bytes;
+}
+
+static inline void task_io_account_dirty(size_t bytes)
+{
 	current->ioac.write_bytes += bytes;
 }
 
@@ -73,6 +82,10 @@ static inline unsigned long task_io_get_oublock(const struct task_struct *p)
 	return 0;
 }
 
+static inline void task_io_account_dirty(size_t bytes)
+{
+}
+
 static inline void task_io_account_cancelled_write(size_t bytes)
 {
 }
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -360,6 +360,11 @@ static inline struct tcp_sock *tcp_sk(const struct sock *sk)
 	return (struct tcp_sock *)sk;
 }
 
+static inline int tcp_urg_mode(const struct tcp_sock *tp)
+{
+	return tp->snd_una != tp->snd_up;
+}
+
 struct tcp_timewait_sock {
 	struct inet_timewait_sock tw_sk;
 	u32			  tw_rcv_nxt;
--- a/include/linux/thread_info.h
+++ b/include/linux/thread_info.h
@@ -56,13 +56,12 @@ extern long do_no_restart_syscall(struct restart_block *parm);
 #ifdef __KERNEL__
 
 #ifdef CONFIG_DEBUG_STACK_USAGE
-# define THREADINFO_GFP		(GFP_KERNEL | __GFP_NOTRACK | __GFP_ZERO)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK | \
+				 __GFP_ZERO)
 #else
-# define THREADINFO_GFP		(GFP_KERNEL | __GFP_NOTRACK)
+# define THREADINFO_GFP		(GFP_KERNEL_ACCOUNT | __GFP_NOTRACK)
 #endif
 
-#define THREADINFO_GFP_ACCOUNTED (THREADINFO_GFP | __GFP_KMEMCG)
-
 /*
  * flag set/clear/test wrappers
  * - pass TIF_xxxx constants to these functions
--- a/include/linux/threads.h
+++ b/include/linux/threads.h
@@ -25,6 +25,7 @@
  * This controls the default maximum pid allocated to a process
  */
 #define PID_MAX_DEFAULT (CONFIG_BASE_SMALL ? 0x1000 : 0x8000)
+#define PID_MAX_NS_DEFAULT	(PID_MAX_DEFAULT)
 
 /*
  * A maximum of 4 million PIDs should be enough for a while.
--- a/include/linux/tty.h
+++ b/include/linux/tty.h
@@ -281,6 +281,10 @@ struct tty_struct {
 	struct tty_port *port;
 
 	RH_KABI_EXTEND(struct ld_semaphore ldisc_sem)
+
+#ifdef CONFIG_BEANCOUNTERS
+	struct user_beancounter *ub;
+#endif
 };
 
 /* Each of a tty's open files has private_data pointing to tty_file_private */
@@ -313,6 +317,7 @@ struct tty_file_private {
 #define TTY_NO_WRITE_SPLIT 	17	/* Preserve write boundaries to driver */
 #define TTY_HUPPED 		18	/* Post driver->hangup() */
 #define TTY_LDISC_HALTED	22	/* Line discipline is halted */
+#define TTY_CHARGED		23	/* Charged as ub resource */
 
 #define TTY_WRITE_FLUSH(tty) tty_write_flush((tty))
 
--- a/include/linux/types.h
+++ b/include/linux/types.h
@@ -26,6 +26,11 @@ typedef __kernel_timer_t	timer_t;
 typedef __kernel_clockid_t	clockid_t;
 typedef __kernel_mqd_t		mqd_t;
 
+#ifndef __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#define __ENVID_T_DEFINED__
+#endif
+
 typedef _Bool			bool;
 
 typedef __kernel_uid32_t	uid_t;
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -7,6 +7,9 @@
 #include <linux/err.h>
 #include <linux/rh_kabi.h>
 
+#define UIDHASH_BITS   (CONFIG_BASE_SMALL ? 3 : 7)
+#define UIDHASH_SZ     (1 << UIDHASH_BITS)
+
 #define UID_GID_MAP_MAX_EXTENTS 5
 
 struct uid_gid_map {	/* 64 bytes -- 1 cache line */
@@ -27,6 +30,7 @@ struct user_namespace {
 	struct uid_gid_map	gid_map;
 	struct uid_gid_map	projid_map;
 	atomic_t		count;
+	struct hlist_head       uidhash_table[UIDHASH_SZ];
 	struct user_namespace	*parent;
 	kuid_t			owner;
 	kgid_t			group;
--- a/include/linux/utsname.h
+++ b/include/linux/utsname.h
@@ -19,13 +19,30 @@ enum uts_proc {
 struct user_namespace;
 extern struct user_namespace init_user_ns;
 
+#ifdef CONFIG_X86
+struct uts_vdso {
+	void			*addr;
+	struct page		**pages;
+	unsigned int		nr_pages;
+	unsigned int		size;
+	unsigned long		version_off;
+};
+#endif
+
 struct uts_namespace {
 	struct kref kref;
 	struct new_utsname name;
 	struct user_namespace *user_ns;
 	unsigned int proc_inum;
+#ifdef CONFIG_X86
+	struct uts_vdso vdso;
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	struct uts_vdso vdso32;
+#endif
 };
 extern struct uts_namespace init_uts_ns;
+extern struct new_utsname virt_utsname;
 
 #ifdef CONFIG_UTS_NS
 static inline void get_uts_ns(struct uts_namespace *ns)
--- /dev/null
+++ b/include/linux/ve.h
@@ -0,0 +1,305 @@
+/*
+ *  include/linux/ve.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VE_H
+#define _LINUX_VE_H
+
+#include <linux/types.h>
+#include <linux/capability.h>
+#include <linux/sysctl.h>
+#include <linux/net.h>
+#include <linux/vzstat.h>
+#include <linux/kobject.h>
+#include <linux/pid.h>
+#include <linux/path.h>
+#include <linux/socket.h>
+#include <linux/kthread.h>
+#include <linux/ve_proto.h>
+#include <net/inet_frag.h>
+#include <linux/cgroup.h>
+#include <linux/kmapset.h>
+#include <linux/binfmts.h>
+
+struct tty_driver;
+struct file_system_type;
+struct veip_struct;
+struct ve_monitor;
+struct nsproxy;
+struct user_namespace;
+extern struct user_namespace init_user_ns;
+
+struct ve_struct {
+	struct cgroup_subsys_state	css;
+
+	const char		*ve_name;
+
+	struct list_head	ve_list;
+
+	envid_t			veid;
+
+	unsigned int		class_id;
+	struct rw_semaphore	op_sem;
+	int			is_running;
+	int			is_pseudosuper;
+	atomic_t		suspend;
+	/* see vzcalluser.h for VE_FEATURE_XXX definitions */
+	__u64			features;
+
+	struct task_struct	*ve_kthread_task;
+	struct kthread_worker	ve_kthread_worker;
+
+	struct task_struct	*ve_umh_task;
+	struct kthread_worker	ve_umh_worker;
+
+/* VE's root */
+	struct path		root_path;
+
+	struct super_block	*dev_sb;
+	struct super_block	*devpts_sb;
+
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	struct binfmt_misc	*binfmt_misc;
+#endif
+
+	struct list_head	devices;
+
+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
+	struct veip_struct	*veip;
+	struct net_device	*_venet_dev;
+#endif
+
+/* per VE CPU stats*/
+	struct timespec		start_timespec;		/* monotonic time */
+	struct timespec		real_start_timespec;	/* boot based time */
+	u64			start_jiffies;	/* Deprecated */
+
+	struct kstat_lat_pcpu_struct	sched_lat_ve;
+
+#ifdef CONFIG_INET
+	struct venet_stat       *stat;
+#ifdef CONFIG_VE_IPTABLES
+/* core/netfilter.c virtualization */
+	__u64			ipt_mask;
+#endif /* CONFIG_VE_IPTABLES */
+#endif
+
+	void			*log_state;
+#define VE_LOG_BUF_LEN		4096
+
+	unsigned long		down_at;
+	struct list_head	cleanup_list;
+	unsigned char		disable_net;
+	struct ve_monitor	*monitor;
+	struct proc_dir_entry	*monitor_proc;
+	unsigned long		meminfo_val;
+	int			_randomize_va_space;
+
+	int			odirect_enable;
+	int			fsync_enable;
+
+	u64			_uevent_seqnum;
+	struct nsproxy __rcu	*ve_ns;
+	struct task_struct	*init_task;
+	struct cred		*init_cred;
+	struct net		*ve_netns;
+
+	struct list_head	devmnt_list;
+	struct mutex		devmnt_mutex;
+
+	struct kmapset_key	ve_sysfs_perms;
+
+#ifdef CONFIG_AIO
+	spinlock_t		aio_nr_lock;
+	unsigned long		aio_nr;
+	unsigned long		aio_max_nr;
+#endif
+	atomic_t		netns_avail_nr;
+	int			netns_max_nr;
+	atomic_t		netif_avail_nr;
+	int			netif_max_nr;
+	/* Number of mounts. May become unbalanced if VE0 mounts something
+	 * and the VE unmounts it. This is acceptable.
+	 */
+	int			mnt_nr;
+#ifdef CONFIG_COREDUMP
+	char			core_pattern[CORENAME_MAX_SIZE];
+#endif
+};
+
+struct ve_devmnt {
+	struct list_head	link;
+
+	dev_t                   dev;
+	char			*allowed_options;
+	char			*hidden_options; /* balloon_ino, etc. */
+};
+
+#define NETNS_MAX_NR_DEFAULT	256	/* number of net-namespaces per-VE */
+#define NETIF_MAX_NR_DEFAULT	256	/* number of net-interfaces per-VE */
+
+#define VE_MEMINFO_DEFAULT      1       /* default behaviour */
+#define VE_MEMINFO_SYSTEM       0       /* disable meminfo virtualization */
+
+#define capable_setveid() \
+	(ve_is_super(get_exec_env()) && capable(CAP_SYS_ADMIN))
+
+extern int nr_ve;
+extern struct proc_dir_entry *proc_vz_dir;
+extern struct cgroup_subsys ve_subsys;
+
+extern unsigned int sysctl_ve_mount_nr;
+
+#ifdef CONFIG_VE
+#define ve_uevent_seqnum       (get_exec_env()->_uevent_seqnum)
+
+extern struct kobj_ns_type_operations ve_ns_type_operations;
+extern struct kobject *kobject_create_and_add_ve(const char *name,
+						struct kobject *parent);
+
+extern struct kmapset_set ve_sysfs_perms;
+
+extern int vz_security_family_check(struct net *net, int family, int type);
+extern int vz_security_protocol_check(struct net *net, int protocol);
+
+extern struct task_struct *kthread_create_on_node_ve(struct ve_struct *ve,
+					int (*threadfn)(void *data),
+					void *data, int node,
+					const char namefmt[], ...);
+
+#define kthread_create_ve(ve, threadfn, data, namefmt, arg...) \
+	kthread_create_on_node_ve(ve, threadfn, data, -1, namefmt, ##arg)
+
+#define kthread_run_ve(ve, threadfn, data, namefmt, ...)		   \
+({									   \
+	struct task_struct *__k						   \
+		= kthread_create_ve(ve, threadfn, data, namefmt, ## __VA_ARGS__); \
+	if (!IS_ERR(__k))						   \
+		wake_up_process(__k);					   \
+	__k;								   \
+})
+
+struct subprocess_info;
+extern int call_usermodehelper_fns_ve(struct ve_struct *ve,
+	char *path, char **argv, char **envp, int wait,
+	int (*init)(struct subprocess_info *info, struct cred *new),
+	void (*cleanup)(struct subprocess_info *), void *data);
+
+static inline int
+call_usermodehelper_ve(struct ve_struct *ve, char *path, char **argv,
+		       char **envp, int wait)
+{
+	return call_usermodehelper_fns_ve(ve, path, argv, envp, wait,
+				       NULL, NULL, NULL);
+}
+void do_update_load_avg_ve(void);
+
+extern struct ve_struct *get_ve(struct ve_struct *ve);
+extern void put_ve(struct ve_struct *ve);
+
+struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id);
+
+static inline struct ve_struct *cgroup_ve(struct cgroup *cgroup)
+{
+	return container_of(cgroup_subsys_state(cgroup, ve_subsys_id),
+			struct ve_struct, css);
+}
+
+extern unsigned long long ve_relative_clock(struct timespec *ts);
+extern void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp);
+extern void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp);
+
+void ve_stop_ns(struct pid_namespace *ns);
+void ve_exit_ns(struct pid_namespace *ns);
+
+extern bool current_user_ns_initial(void);
+struct user_namespace *ve_init_user_ns(void);
+
+int ve_net_hide_sysctl(struct net *net);
+
+#ifdef CONFIG_TTY
+#define MAX_NR_VTTY_CONSOLES	(12)
+extern struct tty_driver *vtty_driver(dev_t dev, int *index);
+extern struct tty_driver *vtty_console_driver(int *index);
+extern int vtty_open_master(envid_t veid, int idx);
+extern void vtty_release(struct tty_struct *tty, struct tty_struct *o_tty,
+			 int *tty_closing, int *o_tty_closing);
+extern bool vtty_is_master(struct tty_struct *tty);
+#endif /* CONFIG_TTY */
+
+static inline int ve_mount_allowed(void)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	return ve_is_super(ve) || ve->mnt_nr < (int)sysctl_ve_mount_nr;
+}
+
+static inline void ve_mount_nr_inc(void)
+{
+	get_exec_env()->mnt_nr++;
+}
+
+static inline void ve_mount_nr_dec(void)
+{
+	get_exec_env()->mnt_nr--;
+}
+
+#else	/* CONFIG_VE */
+
+#define ve_uevent_seqnum uevent_seqnum
+
+static inline int vz_security_family_check(struct net *net, int family, int type) { return 0; }
+static inline int vz_security_protocol_check(struct net *net, int protocol) { return 0; }
+
+#define ve_utsname	system_utsname
+#define get_ve(ve)	(NULL)
+#define put_ve(ve)	do { } while (0)
+
+static inline void ve_stop_ns(struct pid_namespace *ns) { }
+static inline void ve_exit_ns(struct pid_namespace *ns) { }
+
+static inline bool current_user_ns_initial(void)
+{
+	return current_user_ns() == init_cred.user_ns;
+}
+
+static inline struct user_namespace *ve_init_user_ns(void)
+{
+	return &init_user_ns;
+}
+
+#define kthread_create_on_node_ve(ve, threadfn, data, node, namefmt...)	\
+	kthread_create_on_node(threadfn, data, node, namefmt)
+
+#define kobject_create_and_add_ve		kobject_create_and_add
+
+static inline void monotonic_abs_to_ve(clockid_t which_clock,
+				struct timespec *tp) { }
+static inline void monotonic_ve_to_abs(clockid_t which_clock,
+				struct timespec *tp) { }
+
+static inline int ve_mount_allowed(void) { return 1; }
+static inline void ve_mount_nr_inc(void) { }
+static inline void ve_mount_nr_dec(void) { }
+#endif	/* CONFIG_VE */
+
+struct seq_file;
+struct kernel_cpustat;
+
+#if defined(CONFIG_VE) && defined(CONFIG_CGROUP_SCHED)
+int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p);
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p);
+int ve_get_cpu_avenrun(struct ve_struct *ve, unsigned long *avenrun);
+int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat);
+#else
+static inline int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; }
+static inline int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p) { return -ENOSYS; }
+static inline int ve_get_cpu_avenrun(struct ve_struct *ve, unsigned long *avenrun) { return -ENOSYS; }
+static inline int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat) { return -ENOSYS; }
+#endif
+
+#endif /* _LINUX_VE_H */
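
The mount accounting helpers at the end of the CONFIG_VE block are meant
to bracket the actual mount operation; without CONFIG_VE they fall back
to no-ops. A hedged sketch (do_demo_mount() is hypothetical):

	static int demo_mount(void)
	{
		int err;

		if (!ve_mount_allowed())
			return -EACCES;

		err = do_demo_mount();	/* hypothetical mount helper */
		if (!err)
			ve_mount_nr_inc();	/* paired with ve_mount_nr_dec() on umount */
		return err;
	}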
--- /dev/null
+++ b/include/linux/ve_proto.h
@@ -0,0 +1,135 @@
+/*
+ *  include/linux/ve_proto.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VE_H__
+#define __VE_H__
+
+struct ve_struct;
+struct task_struct;
+struct seq_file;
+struct net;
+
+#ifdef CONFIG_VE
+
+extern struct ve_struct ve0;
+
+static inline struct ve_struct *get_ve0(void)
+{
+	return &ve0;
+}
+
+static inline bool ve_is_super(struct ve_struct *ve)
+{
+	return ve == &ve0;
+}
+
+#define get_exec_env()		(current->task_ve)
+
+const char *ve_name(struct ve_struct *ve);
+
+/* must be called under rcu_read_lock if task != current */
+const char *task_ve_name(struct task_struct *task);
+
+extern int ve_task_count(struct ve_struct *);
+
+typedef void (*ve_seq_print_t)(struct seq_file *, struct ve_struct *);
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t);
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t);
+
+#if defined(CONFIG_INET) && defined(CONFIG_VE_NETDEV)
+int venet_init(void);
+#endif
+
+extern struct list_head ve_list_head;
+#define for_each_ve(ve)	list_for_each_entry((ve), &ve_list_head, ve_list)
+extern struct mutex ve_list_lock;
+extern struct ve_struct *get_ve_by_id(envid_t);
+
+extern int nr_threads_ve(struct ve_struct *ve);
+
+enum {
+	VE_SS_CHAIN,
+	VE_SHUTDOWN_CHAIN,
+
+	VE_MAX_CHAINS
+};
+
+typedef int ve_hook_init_fn(void *data);
+typedef void ve_hook_fini_fn(void *data);
+
+struct ve_hook
+{
+	ve_hook_init_fn *init;
+	ve_hook_fini_fn *fini;
+	struct module *owner;
+
+	/* Functions are called in ascending priority */
+	int priority;
+
+	/* Private part */
+	struct list_head list;
+};
+
+enum {
+	HOOK_PRIO_DEFAULT = 0,
+
+	HOOK_PRIO_FS = HOOK_PRIO_DEFAULT,
+
+	HOOK_PRIO_NET_PRE,
+	HOOK_PRIO_NET,
+	HOOK_PRIO_NET_POST,
+	HOOK_PRIO_NET_ACCT = 100,
+	HOOK_PRIO_NET_ACCT_V6,
+
+	HOOK_PRIO_AFTERALL = INT_MAX-1,
+	HOOK_PRIO_FINISHING = INT_MAX,
+};
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos);
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos);
+void ve_seq_stop(struct seq_file *m, void *v);
+
+extern int ve_hook_iterate_init(int chain, void *data);
+extern void ve_hook_iterate_fini(int chain, void *data);
+
+extern void ve_hook_register(int chain, struct ve_hook *vh);
+extern void ve_hook_unregister(struct ve_hook *vh);
+#else /* CONFIG_VE */
+#define ve_hook_register(ch, vh)	do { } while (0)
+#define ve_hook_unregister(vh)		do { } while (0)
+
+static inline struct ve_struct *get_ve0(void)
+{
+	return NULL;
+}
+
+static inline struct ve_struct *get_exec_env(void)
+{
+	return NULL;
+}
+
+static inline bool ve_is_super(struct ve_struct *ve)
+{
+	return true;
+}
+
+static inline const char *ve_name(struct ve_struct *ve)
+{
+	return "0";
+}
+
+static inline const char *task_ve_name(struct task_struct *task)
+{
+	return "0";
+}
+
+#define nr_threads_ve(ve)	(nr_threads)
+
+#endif /* CONFIG_VE */
+#endif
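
From a module's point of view, the hook machinery above is used roughly
as follows (demo_* names are hypothetical): callbacks registered on
VE_SS_CHAIN run in ascending priority when a container starts and are
unwound when it stops.

	static int demo_start(void *data)
	{
		/* per-VE setup; data identifies the starting VE */
		return 0;
	}

	static void demo_stop(void *data)
	{
		/* per-VE teardown */
	}

	static struct ve_hook demo_hook = {
		.init		= demo_start,
		.fini		= demo_stop,
		.owner		= THIS_MODULE,
		.priority	= HOOK_PRIO_DEFAULT,
	};

	/* in module init / exit: */
	ve_hook_register(VE_SS_CHAIN, &demo_hook);
	ve_hook_unregister(&demo_hook);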
--- /dev/null
+++ b/include/linux/veip.h
@@ -0,0 +1,22 @@
+/*
+ *  include/linux/veip.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VE_IP_H_
+#define __VE_IP_H_
+
+struct ve_addr_struct {
+	int family;
+	__u32 key[4];
+};
+
+struct sockaddr;
+
+extern void veaddr_print(char *, int, struct ve_addr_struct *);
+extern int sockaddr_to_veaddr(struct sockaddr __user *uaddr, int addrlen,
+		struct ve_addr_struct *veaddr);
+
+#endif
--- /dev/null
+++ b/include/linux/venet.h
@@ -0,0 +1,103 @@
+/*
+ *  include/linux/venet.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _VENET_H
+#define _VENET_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/veip.h>
+#include <linux/netdevice.h>
+
+#define VEIP_HASH_SZ 512
+
+struct ve_struct;
+struct venet_stat;
+struct venet_stats {
+	struct net_device_stats	stats;
+	struct net_device_stats	*real_stats;
+};
+
+struct ip_entry_struct
+{
+	struct ve_addr_struct	addr;
+	struct ve_struct	*active_env;
+	struct veip_struct	*tgt_veip;
+	struct hlist_node 	ip_hash;
+	union {
+		struct list_head 	ve_list;
+		struct rcu_head		rcu;
+	};
+};
+
+struct ext_entry_struct
+{
+	struct list_head	list;
+	struct ve_addr_struct	addr;
+	struct rcu_head		rcu;
+};
+
+struct veip_struct
+{
+	struct list_head	src_lh;
+	struct list_head	dst_lh;
+	struct list_head	ip_lh;
+	struct list_head	list;
+	struct list_head	ext_lh;
+	envid_t			veid;
+	struct venet_stat	*stat;
+	struct rcu_head		rcu;
+};
+
+struct veip_pool_ops {
+	int (*veip_create)(struct ve_struct *);
+	void (*veip_release)(struct ve_struct *);
+	void (*veip_free)(struct veip_struct *);
+	struct ve_struct *(*veip_lookup)(struct ve_struct *, struct sk_buff *);
+};
+
+extern struct veip_pool_ops *veip_pool_ops;
+
+static inline struct net_device_stats *
+venet_stats(struct net_device *dev, int cpu)
+{
+	struct venet_stats *stats;
+	stats = (struct venet_stats *)dev->ml_priv;
+	return per_cpu_ptr(stats->real_stats, cpu);
+}
+
+void ip_entry_hash(struct ip_entry_struct *entry, struct veip_struct *veip);
+void ip_entry_unhash(struct ip_entry_struct *entry);
+struct ip_entry_struct *venet_entry_lookup(struct ve_addr_struct *);
+
+struct veip_struct *veip_findcreate(envid_t veid);
+int veip_put(struct veip_struct *veip);
+void veip_cleanup(void);
+
+int in4_to_veaddr(const char *addr, struct ve_addr_struct *veaddr);
+int in6_to_veaddr(const char *addr, struct ve_addr_struct *veaddr);
+
+extern struct list_head veip_lh;
+
+struct ext_entry_struct *venet_ext_lookup(struct ve_struct *ve,
+		struct ve_addr_struct *addr);
+
+extern struct hlist_head ip_entry_hash_table[];
+extern spinlock_t veip_lock;
+
+extern void (*venet_free_stat)(struct ve_struct *);
+
+#define NIPQUAD(addr) \
+	((unsigned char *)&addr)[0], \
+	((unsigned char *)&addr)[1], \
+	((unsigned char *)&addr)[2], \
+	((unsigned char *)&addr)[3]
+
+#endif
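
venet_stats() returns a single CPU's slice of the device counters, so
totals must be summed over CPUs; a minimal sketch (demo_venet_rx_bytes()
is hypothetical):

	static unsigned long demo_venet_rx_bytes(struct net_device *dev)
	{
		unsigned long rx = 0;
		int cpu;

		for_each_possible_cpu(cpu)
			rx += venet_stats(dev, cpu)->rx_bytes;
		return rx;
	}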
--- /dev/null
+++ b/include/linux/virtinfo.h
@@ -0,0 +1,84 @@
+/*
+ *  include/linux/virtinfo.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __LINUX_VIRTINFO_H
+#define __LINUX_VIRTINFO_H
+
+#include <linux/kernel.h>
+#include <linux/page-flags.h>
+#include <linux/notifier.h>
+#include <linux/mmzone.h>
+
+struct vnotifier_block
+{
+	int (*notifier_call)(struct vnotifier_block *self,
+			unsigned long, void *, int);
+	struct vnotifier_block *next;
+	int priority;
+};
+
+extern struct semaphore virtinfo_sem;
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb);
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb);
+int virtinfo_notifier_call(int type, unsigned long n, void *data);
+int virtinfo_notifier_call_irq(int type, unsigned long n, void *data);
+
+struct page_info {
+	unsigned long nr_file_dirty;
+	unsigned long nr_writeback;
+	unsigned long nr_anon_pages;
+	unsigned long nr_file_mapped;
+	unsigned long nr_slab_rec;
+	unsigned long nr_slab_unrec;
+	unsigned long nr_pagetable;
+	unsigned long nr_unstable_nfs;
+	unsigned long nr_bounce;
+	unsigned long nr_writeback_temp;
+};
+
+struct sysinfo;
+struct user_beancounter;
+
+struct meminfo {
+	struct sysinfo *si;
+	struct user_beancounter *ub;
+	unsigned long meminfo_val;
+	unsigned long pages[NR_LRU_LISTS];
+	unsigned long cached, dirty_pages, writeback_pages, locked, shmem;
+	unsigned long slab_reclaimable, slab_unreclaimable;
+};
+
+struct seq_file;
+
+int meminfo_proc_show_ub(struct seq_file *m, void *v,
+		struct user_beancounter *ub, unsigned long meminfo_val);
+
+#define VIRTINFO_MEMINFO	0
+#define VIRTINFO_SYSINFO	2
+#define VIRTINFO_VMSTAT		3
+#define VIRTINFO_OOMKILL	4
+
+#define VIRTINFO_IO_ACCOUNT	0
+#define VIRTINFO_IO_PREPARE	1
+#define VIRTINFO_IO_JOURNAL	2
+#define VIRTINFO_IO_READAHEAD	3
+#define VIRTINFO_IO_CONGESTION	4
+#define VIRTINFO_IO_OP_ACCOUNT	5
+#define VIRTINFO_IO_BALANCE_DIRTY	6
+#define VIRTINFO_IO_FUSE_REQ	7
+
+enum virt_info_types {
+	VITYPE_GENERAL,
+	VITYPE_QUOTA,
+	VITYPE_IO,
+
+	VIRT_TYPES
+};
+
+#endif /* __LINUX_VIRTINFO_H */
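
A hedged sketch of a consumer of the chains above (demo_* names are
hypothetical; the trailing int argument is taken here to carry the
running return value of earlier blocks, which the signature suggests but
this header does not spell out):

	static int demo_notify(struct vnotifier_block *self,
			       unsigned long event, void *data, int old_ret)
	{
		if (event == VIRTINFO_MEMINFO) {
			/* data is a struct meminfo *; a real handler
			 * would rewrite its sysinfo fields per-VE */
		}
		return old_ret;
	}

	static struct vnotifier_block demo_nb = {
		.notifier_call	= demo_notify,
		.priority	= 0,
	};

	/* in init: */
	virtinfo_notifier_register(VITYPE_GENERAL, &demo_nb);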
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -16,6 +16,8 @@ struct vm_area_struct;		/* vma defining user mapping in mm_types.h */
 #define VM_USERMAP	0x00000008	/* suitable for remap_vmalloc_range */
 #define VM_VPAGES	0x00000010	/* buffer for pages was vmalloc'ed */
 #define VM_UNLIST	0x00000020	/* vm_struct is not listed in vmlist */
+#define VM_NO_GUARD	0x00000040      /* don't add guard page */
+#define VM_KASAN	0x00000080      /* has allocated kasan shadow memory */
 /* bits [20..32] reserved for arch specific ioremap internals */
 
 /*
@@ -66,6 +68,8 @@ static inline void vmalloc_init(void)
 
 extern void *vmalloc(unsigned long size);
 extern void *vzalloc(unsigned long size);
+extern void *vmalloc_account(unsigned long size);
+extern void *vzalloc_account(unsigned long size);
 extern void *vmalloc_user(unsigned long size);
 extern void *vmalloc_node(unsigned long size, int node);
 extern void *vzalloc_node(unsigned long size, int node);
@@ -75,7 +79,9 @@ extern void *vmalloc_32_user(unsigned long size);
 extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot);
 extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
-			pgprot_t prot, int node, const void *caller);
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller);
+
 extern void vfree(const void *addr);
 
 extern void *vmap(struct page **pages, unsigned int count,
@@ -96,8 +102,12 @@ void vmalloc_sync_all(void);
 
 static inline size_t get_vm_area_size(const struct vm_struct *area)
 {
-	/* return actual size without guard page */
-	return area->size - PAGE_SIZE;
+	if (!(area->flags & VM_NO_GUARD))
+		/* return actual size without guard page */
+		return area->size - PAGE_SIZE;
+	else
+		return area->size;
 }
 
 extern struct vm_struct *get_vm_area(unsigned long size, unsigned long flags);
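
The new vm_flags parameter of __vmalloc_node_range() is what lets callers
such as KASAN map areas without the trailing guard page, with
get_vm_area_size() then reporting the full size. A hedged sketch
(demo_alloc_noguard() is hypothetical):

	static void *demo_alloc_noguard(unsigned long size)
	{
		return __vmalloc_node_range(size, 1,
					    VMALLOC_START, VMALLOC_END,
					    GFP_KERNEL, PAGE_KERNEL,
					    VM_NO_GUARD, NUMA_NO_NODE,
					    __builtin_return_address(0));
	}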
--- /dev/null
+++ b/include/linux/vzctl.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vzctl.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VZCTL_H
+#define _LINUX_VZCTL_H
+
+#include <linux/list.h>
+
+struct module;
+struct inode;
+struct file;
+struct vzioctlinfo {
+	unsigned type;
+	int (*ioctl)(struct file *, unsigned int, unsigned long);
+	int (*compat_ioctl)(struct file *, unsigned int, unsigned long);
+	struct module *owner;
+	struct list_head list;
+};
+
+extern void vzioctl_register(struct vzioctlinfo *inf);
+extern void vzioctl_unregister(struct vzioctlinfo *inf);
+
+#endif
--- /dev/null
+++ b/include/linux/vzevent.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vzevent.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __LINUX_VZ_EVENT_H__
+#define __LINUX_VZ_EVENT_H__
+
+#if defined(CONFIG_VZ_EVENT) || defined(CONFIG_VZ_EVENT_MODULE)
+extern int vzevent_send(int msg, const char *attrs_fmt, ...);
+#else
+static inline int vzevent_send(int msg, const char *attrs_fmt, ...)
+{
+	return 0;
+}
+#endif
+
+enum {
+	VE_EVENT_MOUNT,
+	VE_EVENT_UMOUNT,
+	VE_EVENT_START,
+	VE_EVENT_STOP,
+	VE_EVENT_REBOOT,
+};
+
+#endif /* __LINUX_VZ_EVENT_H__ */
--- /dev/null
+++ b/include/linux/vziolimit.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vziolimit.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VZIOLIMIT_H
+#define _LINUX_VZIOLIMIT_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#define VZIOLIMITTYPE 'I'
+
+struct iolimit_state {
+	unsigned int id;
+	unsigned int speed;
+	unsigned int burst;
+	unsigned int latency;
+};
+
+#define VZCTL_SET_IOLIMIT	_IOW(VZIOLIMITTYPE, 0, struct iolimit_state)
+#define VZCTL_GET_IOLIMIT	_IOR(VZIOLIMITTYPE, 1, struct iolimit_state)
+#define VZCTL_SET_IOPSLIMIT	_IOW(VZIOLIMITTYPE, 2, struct iolimit_state)
+#define VZCTL_GET_IOPSLIMIT	_IOR(VZIOLIMITTYPE, 3, struct iolimit_state)
+
+#endif /* _LINUX_VZIOLIMIT_H */
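
These ioctls are a userspace ABI; a hedged userspace sketch follows. The
open file descriptor, the veid value and the exact field units are
assumptions not defined by this header:

	struct iolimit_state st = {
		.id      = veid,
		.speed   = 10 << 20,	/* assumed bytes per second */
		.burst   = 20 << 20,
		.latency = 10 * 1000,	/* assumed milliseconds */
	};

	if (ioctl(fd, VZCTL_SET_IOLIMIT, &st))
		perror("VZCTL_SET_IOLIMIT");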
--- /dev/null
+++ b/include/linux/vziptable_defs.h
@@ -0,0 +1,21 @@
+/*
+ *  include/linux/vziptable_defs.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _LINUX_VZIPTABLE_DEFS_H
+#define _LINUX_VZIPTABLE_DEFS_H
+
+#include <linux/types.h>
+#include <linux/ve.h>
+
+#include <uapi/linux/vziptable_defs.h>
+
+static inline bool mask_ipt_allow(__u64 permitted, __u64 mask)
+{
+	return (permitted & mask) == mask;
+}
+
+#endif /* _LINUX_VZIPTABLE_DEFS_H */
--- /dev/null
+++ b/include/linux/vznetstat.h
@@ -0,0 +1,85 @@
+/*
+ * include/linux/vznetstat.h
+ *
+ * Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _VZNETSTAT_H
+#define _VZNETSTAT_H
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#define TC_CLASS_MAX	16
+
+struct acct_counter {
+	u64	bytes;
+	u32	pkts;
+	u32	__pad;
+};
+
+enum {
+	ACCT_IN,
+	ACCT_OUT,
+	ACCT_MAX
+};
+
+struct acct_stat {
+	struct acct_counter cnt[TC_CLASS_MAX][ACCT_MAX];
+};
+
+struct venet_stat {
+	struct list_head list;
+	envid_t  veid;
+	u16 base;
+	unsigned long flags;
+	atomic_t users;
+
+	struct acct_stat __percpu *ipv4_stat;
+	struct acct_stat __percpu *ipv6_stat;
+};
+
+static inline int venet_acct_skb_size(struct sk_buff *skb)
+{
+	return skb->data_len + (skb->tail - skb->network_header);
+}
+
+struct ve_addr_struct;
+
+#if IS_ENABLED(CONFIG_VE_NETDEV_ACCOUNTING)
+struct venet_stat *venet_acct_find_stat(envid_t veid);
+struct venet_stat *venet_acct_find_create_stat(envid_t veid);
+static inline void venet_acct_get_stat(struct venet_stat *stat)
+{
+	atomic_inc(&stat->users);
+}
+void   venet_acct_put_stat(struct venet_stat *);
+
+void venet_acct_classify_add_incoming(struct venet_stat *, struct sk_buff *skb);
+void venet_acct_classify_add_outgoing(struct venet_stat *, struct sk_buff *skb);
+void venet_acct_classify_sub_outgoing(struct venet_stat *, struct sk_buff *skb);
+
+void venet_acct_classify_add_incoming_plain(struct venet_stat *stat,
+		struct ve_addr_struct *src_addr, int data_size);
+void venet_acct_classify_add_outgoing_plain(struct venet_stat *stat,
+		struct ve_addr_struct *dst_addr, int data_size);
+
+#else /* !CONFIG_VE_NETDEV_ACCOUNTING */
+static inline void venet_acct_get_stat(struct venet_stat *stat) { }
+static inline void venet_acct_put_stat(struct venet_stat *stat) { }
+
+static inline void venet_acct_classify_add_incoming(struct venet_stat *stat,
+						struct sk_buff *skb) {}
+static inline void venet_acct_classify_add_outgoing(struct venet_stat *stat,
+						struct sk_buff *skb) {}
+static inline void venet_acct_classify_sub_outgoing(struct venet_stat *stat,
+						struct sk_buff *skb) {}
+
+static inline void venet_acct_classify_add_incoming_plain(struct venet_stat *stat,
+		struct ve_addr_struct *src_addr, int data_size) {}
+static inline void venet_acct_classify_add_outgoing_plain(struct venet_stat *stat,
+		struct ve_addr_struct *dst_addr, int data_size) {}
+#endif /* CONFIG_VE_NETDEV_ACCOUNTING */
+
+#endif
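
The users counter implies the usual find/get/put reference pattern. A hedged kernel-side sketch; the caller is hypothetical, and it assumes venet_acct_find_stat() returns with a reference already held:

	#include <linux/vznetstat.h>

	/* Charge one incoming skb to a VE's traffic statistics. */
	static void example_account_incoming(envid_t veid, struct sk_buff *skb)
	{
		struct venet_stat *stat = venet_acct_find_stat(veid);

		if (!stat)
			return;
		venet_acct_classify_add_incoming(stat, skb);
		venet_acct_put_stat(stat);
	}
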
--- /dev/null
+++ b/include/linux/vzprivnet.h
@@ -0,0 +1,28 @@
+/*
+ *  include/linux/vzprivnet.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __IP_VZPRIVNET_H__
+#define __IP_VZPRIVNET_H__
+
+extern int vzpn_handle_bridged;
+extern int vzpn_filter_host;
+
+struct proc_dir_entry;
+extern struct proc_dir_entry *vzpriv_proc_dir;
+
+struct seq_file;
+typedef void (*vzprivnet_show_fn)(struct seq_file *);
+void vzprivnet_reg_show(vzprivnet_show_fn);
+void vzprivnet_unreg_show(vzprivnet_show_fn);
+
+#define is_eol(ch)	((ch) == '\0' || (ch) == '\n')
+
+#define VZPRIVNET_STRONG       0
+#define VZPRIVNET_WEAK         1
+#define VZPRIVNET_INET         2
+
+#endif
--- /dev/null
+++ b/include/linux/vzstat.h
@@ -0,0 +1,126 @@
+/*
+ *  include/linux/vzstat.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VZSTAT_H__
+#define __VZSTAT_H__
+
+#include <linux/mmzone.h>
+
+struct swap_cache_info_struct {
+	unsigned long add_total;
+	unsigned long del_total;
+	unsigned long find_success;
+	unsigned long find_total;
+};
+
+struct kstat_lat_snap_struct {
+	u64 maxlat, totlat;
+	unsigned long count;
+};
+struct kstat_lat_pcpu_snap_struct {
+	u64 maxlat, totlat;
+	unsigned long count;
+	seqcount_t lock;
+} ____cacheline_aligned_in_smp;
+
+struct kstat_lat_struct {
+	struct kstat_lat_snap_struct cur, last;
+	u64 avg[3];
+};
+struct kstat_lat_pcpu_struct {
+	struct kstat_lat_pcpu_snap_struct *cur;
+	u64 max_snap;
+	struct kstat_lat_snap_struct last;
+	u64 avg[3];
+};
+
+struct kstat_perf_snap_struct {
+	u64 wall_tottime, cpu_tottime;
+	u64 wall_maxdur, cpu_maxdur;
+	unsigned long count;
+};
+
+struct kstat_perf_pcpu_snap_struct {
+	u64 wall_tottime, cpu_tottime;
+	u64 wall_maxdur, cpu_maxdur;
+	unsigned long count;
+	seqcount_t lock;
+};
+
+struct kstat_perf_pcpu_struct {
+	struct kstat_perf_pcpu_snap_struct *cur;
+	struct kstat_perf_snap_struct last;
+};
+
+struct kstat_zone_avg {
+	unsigned long		free_pages_avg[3],
+				nr_active_avg[3],
+				nr_inactive_avg[3];
+};
+
+enum {
+	KSTAT_ALLOCSTAT_ATOMIC,
+	KSTAT_ALLOCSTAT_LOW,
+	KSTAT_ALLOCSTAT_HIGH,
+	KSTAT_ALLOCSTAT_LOW_MP,
+	KSTAT_ALLOCSTAT_HIGH_MP,
+	KSTAT_ALLOCSTAT_NR,
+};
+
+struct kernel_stat_glob {
+	unsigned long nr_unint_avg[3];
+
+	unsigned long alloc_fails[NR_CPUS][KSTAT_ALLOCSTAT_NR];
+	struct kstat_lat_pcpu_struct alloc_lat[KSTAT_ALLOCSTAT_NR];
+	struct kstat_lat_pcpu_struct sched_lat;
+	struct kstat_lat_pcpu_struct page_in;
+	struct kstat_lat_struct swap_in;
+
+	struct kstat_perf_pcpu_struct ttfp, cache_reap,
+			refill_inact, shrink_icache, shrink_dcache;
+
+	struct kstat_zone_avg zone_avg[MAX_NR_ZONES];
+} ____cacheline_aligned;
+
+extern struct kernel_stat_glob kstat_glob ____cacheline_aligned;
+extern spinlock_t kstat_glb_lock;
+
+extern void kstat_init(void);
+
+#ifdef CONFIG_VE
+
+extern void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time,
+			   u64 cpu_time);
+
+#define KSTAT_PERF_ENTER(name)				\
+	u64 start, sleep_time;				\
+							\
+	start = ktime_to_ns(ktime_get());		\
+	sleep_time = current->se.statistics->sum_sleep_runtime; \
+
+#define KSTAT_PERF_LEAVE(name)				\
+	start = ktime_to_ns(ktime_get()) - start;	\
+	sleep_time = current->se.statistics->sum_sleep_runtime - sleep_time; \
+	KSTAT_PERF_ADD(&kstat_glob.name, start, start - sleep_time);
+
+extern void KSTAT_LAT_ADD(struct kstat_lat_struct *p, u64 dur);
+extern void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu, u64 dur);
+extern void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p);
+extern void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p);
+
+#else
+#define KSTAT_PERF_ADD(ptr, real_time, cpu_time)
+#define KSTAT_PERF_ENTER(name)
+#define KSTAT_PERF_LEAVE(name)
+#define KSTAT_LAT_ADD(p, dur)
+#define KSTAT_LAT_PCPU_ADD(p, cpu, dur)
+#define KSTAT_LAT_UPDATE(p)
+#define KSTAT_LAT_PCPU_UPDATE(p)
+#endif
+
+#endif /* __VZSTAT_H__ */
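
KSTAT_PERF_ENTER() declares its own locals, so the pair must bracket a function body starting at the top of the block. A hedged sketch against the ttfp slot of kstat_glob; the measured work is hypothetical:

	#include <linux/vzstat.h>

	static void example_timed_operation(void)
	{
		KSTAT_PERF_ENTER(ttfp)

		/* ... the work being measured; wall time minus time spent
		 * sleeping is what gets accounted as CPU time ... */

		KSTAT_PERF_LEAVE(ttfp)
	}
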
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -744,6 +744,31 @@ do {									\
 	__ret;								\
 })
 
+#define __wait_event_killable_exclusive(wq, condition, ret)		\
+do {									\
+	DEFINE_WAIT(__wait);						\
+									\
+	for (;;) {							\
+		prepare_to_wait_exclusive(&wq, &__wait, TASK_KILLABLE);	\
+		if (condition)						\
+			break;						\
+		if (!fatal_signal_pending(current)) {			\
+			schedule();					\
+			continue;					\
+		}							\
+		ret = -ERESTARTSYS;					\
+		break;							\
+	}								\
+	finish_wait(&wq, &__wait);					\
+} while (0)
+
+#define wait_event_killable_exclusive(wq, condition)			\
+({									\
+	int __ret = 0;							\
+	if (!(condition))						\
+		__wait_event_killable_exclusive(wq, condition, __ret);	\
+	__ret;								\
+})
 
 #define __wait_event_lock_irq(wq, condition, lock, cmd)			\
 do {									\
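
wait_event_killable_exclusive() parks the caller as an exclusive waiter, so one wake_up() releases a single task; it returns 0 once the condition holds, or -ERESTARTSYS if a fatal signal arrives first. A hedged producer/consumer sketch (all names hypothetical):

	#include <linux/wait.h>

	static DECLARE_WAIT_QUEUE_HEAD(item_wq);
	static int item_ready;

	static int example_consume(void)
	{
		int err = wait_event_killable_exclusive(item_wq, item_ready);

		if (err)
			return err;	/* fatally signalled while waiting */
		/* ... take the item ... */
		return 0;
	}

	static void example_produce(void)
	{
		item_ready = 1;
		wake_up(&item_wq);	/* wakes one exclusive waiter */
	}
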
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -92,13 +92,18 @@ struct writeback_control {
  */	
 struct bdi_writeback;
 void writeback_inodes_sb(struct super_block *, enum wb_reason reason);
+void writeback_inodes_sb_ub(struct super_block *, struct user_beancounter *,
+							enum wb_reason reason);
 void writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 							enum wb_reason reason);
 int try_to_writeback_inodes_sb(struct super_block *, enum wb_reason reason);
 int try_to_writeback_inodes_sb_nr(struct super_block *, unsigned long nr,
 				  enum wb_reason reason);
 void sync_inodes_sb(struct super_block *);
+void sync_inodes_sb_ub(struct super_block *, struct user_beancounter *ub);
 void wakeup_flusher_threads(long nr_pages, enum wb_reason reason);
+void wakeup_flusher_threads_ub(long nr_pages, struct user_beancounter *ub,
+			enum wb_reason reason);
 void inode_wait_for_writeback(struct inode *inode);
 
 /* writeback.h requires fs.h; it, too, is not included from here. */
@@ -131,6 +136,7 @@ extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
 extern unsigned int dirty_writeback_interval;
 extern unsigned int dirty_expire_interval;
+extern unsigned int dirtytime_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
@@ -147,6 +153,8 @@ extern int dirty_ratio_handler(struct ctl_table *table, int write,
 extern int dirty_bytes_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
+int dirtytime_interval_handler(struct ctl_table *table, int write,
+			       void __user *buffer, size_t *lenp, loff_t *ppos);
 
 struct ctl_table;
 int dirty_writeback_centisecs_handler(struct ctl_table *, int,
--- a/include/linux/xattr.h
+++ b/include/linux/xattr.h
@@ -10,7 +10,6 @@
 #ifndef _LINUX_XATTR_H
 #define _LINUX_XATTR_H
 
-
 #include <linux/slab.h>
 #include <linux/types.h>
 #include <linux/spinlock.h>
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -156,6 +156,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
 int ipv6_sock_mc_drop(struct sock *sk, int ifindex,
 		      const struct in6_addr *addr);
+void __ipv6_sock_mc_close(struct sock *sk);
 void ipv6_sock_mc_close(struct sock *sk);
 bool inet6_mc_check(struct sock *sk, const struct in6_addr *mc_addr,
 		    const struct in6_addr *src_addr);
--- a/include/net/dst.h
+++ b/include/net/dst.h
@@ -61,6 +61,7 @@ struct dst_entry {
 #define DST_XFRM_TUNNEL		0x0080
 #define DST_XFRM_QUEUE		0x0100
 #define DST_METADATA		0x0200
+#define DST_FREE		0x0400
 
 	unsigned short		pending_confirm;
 
@@ -194,6 +195,11 @@ dst_metric_raw(const struct dst_entry *dst, const int metric)
 	return p[metric-1];
 }
 
+void dst_dump_one(struct dst_entry *d);
+void ip_rt_dump_dsts(void);
+void dst_cache_dump(void);
+extern void (*ip6_rt_dump_dsts)(void);
+
 static inline u32
 dst_metric(const struct dst_entry *dst, const int metric)
 {
--- a/include/net/ip.h
+++ b/include/net/ip.h
@@ -201,14 +201,27 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 #define NET_ADD_STATS_BH(net, field, adnd) SNMP_ADD_STATS_BH((net)->mib.net_statistics, field, adnd)
 #define NET_ADD_STATS_USER(net, field, adnd) SNMP_ADD_STATS_USER((net)->mib.net_statistics, field, adnd)
 
-unsigned long snmp_fold_field(void __percpu *mib[], int offt);
+unsigned long __snmp_fold_field(void __percpu *mib[], int offt, const struct cpumask *mask);
+static inline unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+{
+	return __snmp_fold_field(mib, offt, cpu_possible_mask);
+}
 #if BITS_PER_LONG==32
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off);
+u64 __snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off,
+			const struct cpumask *mask);
+static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t sync_off)
+{
+	return __snmp_fold_field64(mib, offt, sync_off, cpu_possible_mask);
+}
 #else
 static inline u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_off)
 {
 	return snmp_fold_field(mib, offt);
 }
+static inline unsigned long __snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_off, const struct cpumask *mask)
+{
+	return __snmp_fold_field(mib, offt, mask);
+}
 #endif
 int snmp_mib_init(void __percpu *ptr[2], size_t mibsize, size_t align);
 
--- a/include/net/ip6_route.h
+++ b/include/net/ip6_route.h
@@ -64,6 +64,7 @@ static inline bool rt6_need_strict(const struct in6_addr *daddr)
 
 
 void ip6_route_input(struct sk_buff *skb);
+void __ip6_route_input(struct sk_buff *skb, struct in6_addr *daddr);
 
 struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
 				   struct flowi6 *fl6);
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -75,6 +75,13 @@ struct net {
 	unsigned int		dev_base_seq;	/* protected by rtnl_mutex */
 	int			ifindex;
 
+#ifdef CONFIG_VE
+	struct ve_struct	*owner_ve;
+#ifdef CONFIG_VE_IPTABLES
+	__u64			_iptables_modules;
+#endif
+#endif
+
 	/* core fib_rules */
 	struct list_head	rules_ops;
 
@@ -235,6 +242,11 @@ int net_eq(const struct net *net1, const struct net *net2)
 
 extern void net_drop_ns(void *);
 
+/* Returns whether curr can mess with net's objects */
+static inline int net_access_allowed(const struct net *net, const struct net *curr)
+{
+	return net_eq(curr, &init_net) || net_eq(curr, net);
+}
 #else
 
 static inline struct net *get_net(struct net *net)
@@ -258,6 +270,11 @@ int net_eq(const struct net *net1, const struct net *net2)
 }
 
 #define net_drop_ns NULL
+
+static inline int net_access_allowed(const struct net *net, const struct net *curr)
+{
+	return 1;
+}
 #endif
 
 
@@ -297,6 +314,16 @@ static inline struct net *read_pnet(possible_net_t const *pnet)
 #define __net_initconst	__initconst
 #endif
 
+#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+static inline void allow_conntrack_allocation(struct net *net)
+{
+	net->ct.can_alloc = true;
+	smp_wmb(); /* Pairs with rmb in resolve_normal_ct() */
+}
+#else
+static inline void allow_conntrack_allocation(struct net *net) { }
+#endif
+
 int peernet2id_alloc(struct net *net, struct net *peer);
 int peernet2id(struct net *net, struct net *peer);
 bool peernet_has_id(struct net *net, struct net *peer);
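
net_access_allowed() encodes the containment rule: the host's init_net may touch any namespace's objects, while every other namespace is confined to its own. A hedged sketch of the intended guard (the handler is hypothetical):

	#include <linux/nsproxy.h>
	#include <linux/sched.h>
	#include <net/net_namespace.h>

	static int example_handler(struct net *target)
	{
		struct net *curr = current->nsproxy->net_ns;

		if (!net_access_allowed(target, curr))
			return -EACCES;	/* a container poking another netns */
		/* ... operate on target's objects ... */
		return 0;
	}
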
--- a/include/net/netfilter/nf_conntrack_expect.h
+++ b/include/net/netfilter/nf_conntrack_expect.h
@@ -9,7 +9,6 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 
 extern unsigned int nf_ct_expect_hsize;
-extern unsigned int nf_ct_expect_max;
 
 struct nf_conntrack_expect {
 	/* Conntrack expectation list member */
--- a/include/net/netfilter/nf_log.h
+++ b/include/net/netfilter/nf_log.h
@@ -93,7 +93,7 @@ struct nf_log_buf;
 
 struct nf_log_buf *nf_log_buf_open(void);
 __printf(2, 3) int nf_log_buf_add(struct nf_log_buf *m, const char *f, ...);
-void nf_log_buf_close(struct nf_log_buf *m);
+void nf_log_buf_close(struct nf_log_buf *m, struct ve_struct *ve);
 
 /* common logging functions */
 int nf_log_dump_udp_header(struct nf_log_buf *m, const struct sk_buff *skb,
--- a/include/net/netlabel.h
+++ b/include/net/netlabel.h
@@ -525,10 +525,10 @@ static inline int netlbl_catmap_setrng(struct netlbl_lsm_catmap **catmap,
 {
 	return 0;
 }
-static int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
-				 u32 offset,
-				 unsigned long bitmap,
-				 gfp_t flags)
+static inline int netlbl_catmap_setlong(struct netlbl_lsm_catmap **catmap,
+					u32 offset,
+					unsigned long bitmap,
+					gfp_t flags)
 {
 	return 0;
 }
--- a/include/net/netns/conntrack.h
+++ b/include/net/netns/conntrack.h
@@ -73,8 +73,12 @@ struct ct_pcpu {
 
 struct netns_ct {
 	atomic_t		count;
+	bool			can_alloc; /* Initialized to false by net_alloc() */
+	unsigned int		max;
 	unsigned int		expect_count;
+	unsigned int		expect_max;
 #ifdef CONFIG_SYSCTL
+	struct ctl_table_header	*netfilter_header;
 	struct ctl_table_header	*sysctl_header;
 	struct ctl_table_header	*acct_sysctl_header;
 	struct ctl_table_header	*tstamp_sysctl_header;
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -42,7 +42,13 @@ static inline void *net_generic(const struct net *net, int id)
 	ptr = ng->ptr[id - 1];
 	rcu_read_unlock();
 
+#ifndef CONFIG_VE
+	/* May be NULL for disabled VE features */
 	BUG_ON(!ptr);
+#endif
 	return ptr;
 }
+
+extern int net_assign_generic(struct net *net, int id, void *data);
+
 #endif
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -23,6 +23,11 @@
 
 extern struct proto raw_prot;
 
+extern struct raw_hashinfo raw_v4_hashinfo;
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, __be32 raddr,
+			     __be32 laddr, int dif);
+
 void raw_icmp_error(struct sk_buff *, int, u32);
 int raw_local_deliver(struct sk_buff *, int);
 
--- a/include/net/rawv6.h
+++ b/include/net/rawv6.h
@@ -3,6 +3,13 @@
 
 #include <net/protocol.h>
 
+extern struct raw_hashinfo raw_v6_hashinfo;
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+			     unsigned short num, const struct in6_addr *loc_addr,
+			     const struct in6_addr *rmt_addr, int dif);
+
+int raw_abort(struct sock *sk, int err);
+
 void raw6_icmp_error(struct sk_buff *, int nexthdr,
 		u8 type, u8 code, int inner_offset, __be32);
 bool raw6_local_deliver(struct sk_buff *, int);
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -214,6 +214,7 @@ static inline void ip_rt_put(struct rtable *rt)
 #define IPTOS_RT_MASK	(IPTOS_TOS_MASK & ~3)
 
 extern const __u8 ip_tos2prio[16];
+extern int ip_rt_src_check;
 
 static inline char rt_tos2priority(u8 tos)
 {
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -733,11 +733,15 @@ struct psched_ratecfg {
 	u16	overhead;
 	u8	linklayer;
 	u8	shift;
+	u32	mpu;
 };
 
 static inline u64 psched_l2t_ns(const struct psched_ratecfg *r,
 				unsigned int len)
 {
+	if (len < r->mpu)
+		len = r->mpu;
+
 	len += r->overhead;
 
 	if (unlikely(r->linklayer == TC_LINKLAYER_ATM))
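
The new mpu field gives a rate a minimum billable packet size: with mpu = 64 and no overhead, a 40-byte ACK is charged the transmission time of 64 bytes. A user-space model of just the length clamp (psched_l2t_ns() then scales the result by the configured rate):

	/* Hedged model of the clamp added to psched_l2t_ns() above. */
	static unsigned int billable_len(unsigned int len,
					 unsigned int mpu, unsigned int overhead)
	{
		if (len < mpu)		/* e.g. len 40, mpu 64 -> billed as 64 */
			len = mpu;
		return len + overhead;
	}
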
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -54,11 +54,7 @@
 #include <linux/security.h>
 #include <linux/slab.h>
 #include <linux/uaccess.h>
-#ifdef __GENKSYMS__
-#include <linux/res_counter.h>
-#else
 #include <linux/page_counter.h>
-#endif
 #include <linux/memcontrol.h>
 #include <linux/static_key.h>
 #include <linux/aio.h>
@@ -373,8 +369,13 @@ struct sock {
 	atomic_t		sk_omem_alloc;
 	int			sk_sndbuf;
 	struct sk_buff_head	sk_write_queue;
+
+	/*
+	 * These bitfields are not atomic, so all
+	 * changes to them are protected by the socket lock.
+	 */
 	kmemcheck_bitfield_begin(flags);
-	unsigned int		sk_shutdown  : 2,
+	unsigned int		sk_padding  : 2,
 #ifdef __GENKSYMS__
 				sk_no_check : 2,
 #else
@@ -386,6 +387,7 @@ struct sock {
 #define SK_PROTOCOL_MAX U8_MAX
 				sk_type      : 16;
 	kmemcheck_bitfield_end(flags);
+
 	int			sk_wmem_queued;
 	gfp_t			sk_allocation;
 	u32			sk_pacing_rate; /* bytes per second */
@@ -394,6 +396,7 @@ struct sock {
 	int			sk_gso_type;
 	unsigned int		sk_gso_max_size;
 	u16			sk_gso_max_segs;
+	u8			sk_shutdown;
 	int			sk_rcvlowat;
 	unsigned long	        sk_lingertime;
 	struct sk_buff_head	sk_error_queue;
@@ -1243,6 +1246,7 @@ static inline void memcg_memory_allocated_add(struct cg_proto *prot,
 					      unsigned long amt,
 					      int *parent_status)
 {
+	memcg_charge_kmem_nofail(prot->memcg, amt);
 	page_counter_charge(prot->memory_allocated, amt);
 
 	if (page_counter_read(prot->memory_allocated) >
@@ -1254,6 +1258,7 @@ static inline void memcg_memory_allocated_sub(struct cg_proto *prot,
 					      unsigned long amt)
 {
 	page_counter_uncharge(prot->memory_allocated, amt);
+	memcg_uncharge_kmem(prot->memcg, amt);
 }
 
 static inline long
@@ -1464,6 +1469,7 @@ sk_rmem_schedule(struct sock *sk, struct sk_buff *skb, int size)
 {
 	if (!sk_has_account(sk))
 		return true;
+
 	return size<= sk->sk_forward_alloc ||
 		__sk_mem_schedule(sk, size, SK_MEM_RECV) ||
 		skb_pfmemalloc(skb);
@@ -2333,6 +2339,13 @@ static inline void sk_change_net(struct sock *sk, struct net *net)
 	}
 }
 
+static inline void sk_change_net_get(struct sock *sk, struct net *net)
+{
+	struct net *old_net = sock_net(sk);
+	sock_net_set(sk, get_net(net));
+	put_net(old_net);
+}
+
 static inline struct sock *skb_steal_sock(struct sk_buff *skb)
 {
 	if (skb->sk) {
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -46,6 +46,13 @@
 
 #include <linux/seq_file.h>
 #include <linux/memcontrol.h>
+#include <net/tcp_memcontrol.h>
+
+#define TCP_PAGE(sk)	(sk->sk_sndmsg_page)
+#define TCP_OFF(sk)	(sk->sk_sndmsg_off)
+
+#define TW_WSCALE_MASK		0x0f
+#define TW_WSCALE_SPEC		0x10
 
 extern struct inet_hashinfo tcp_hashinfo;
 
@@ -255,10 +262,13 @@ extern int sysctl_tcp_max_orphans;
 extern int sysctl_tcp_fack;
 extern int sysctl_tcp_reordering;
 extern int sysctl_tcp_dsack;
+extern int sysctl_tcp_mem[3];
 extern int sysctl_tcp_wmem[3];
 extern int sysctl_tcp_rmem[3];
 extern int sysctl_tcp_app_win;
+#ifndef sysctl_tcp_adv_win_scale
 extern int sysctl_tcp_adv_win_scale;
+#endif
 extern int sysctl_tcp_tw_reuse;
 extern int sysctl_tcp_frto;
 extern int sysctl_tcp_low_latency;
@@ -278,6 +288,7 @@ extern unsigned int sysctl_tcp_notsent_lowat;
 extern int sysctl_tcp_min_tso_segs;
 extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
+extern int sysctl_tcp_use_sg;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
@@ -287,7 +298,7 @@ extern int tcp_memory_pressure;
 static inline bool tcp_under_memory_pressure(const struct sock *sk)
 {
 	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
-		return !!sk->sk_cgrp->memory_pressure;
+		return *sk->sk_cgrp->memory_pressure;
 
 	return tcp_memory_pressure;
 }
@@ -318,11 +329,28 @@ static inline bool tcp_out_of_memory(struct sock *sk)
 
 void sk_forced_mem_schedule(struct sock *sk, int size);
 
+static inline void orphan_count_inc(struct sock *sk)
+{
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_inc(sk);
+	percpu_counter_inc(sk->sk_prot->orphan_count);
+}
+
+static inline void orphan_count_dec(struct sock *sk)
+{
+	percpu_counter_dec(sk->sk_prot->orphan_count);
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		cg_orphan_count_dec(sk);
+}
+
 static inline bool tcp_too_many_orphans(struct sock *sk, int shift)
 {
 	struct percpu_counter *ocp = sk->sk_prot->orphan_count;
 	int orphans = percpu_counter_read_positive(ocp);
 
+	if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
+		return cg_too_many_orphans(sk, shift);
+
 	if (orphans << shift > sysctl_tcp_max_orphans) {
 		orphans = percpu_counter_sum_positive(ocp);
 		if (orphans << shift > sysctl_tcp_max_orphans)
@@ -1073,6 +1101,7 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp)
 }
 
 bool tcp_prequeue(struct sock *sk, struct sk_buff *skb);
+int tcp_filter(struct sock *sk, struct sk_buff *skb);
 
 #undef STATE_TRACE
 
--- a/include/net/tcp_memcontrol.h
+++ b/include/net/tcp_memcontrol.h
@@ -6,8 +6,10 @@ struct tcp_memcontrol {
 	/* per-cgroup tcp memory pressure knobs */
 	struct page_counter tcp_memory_allocated;
 	struct percpu_counter tcp_sockets_allocated;
+	struct percpu_counter tcp_orphan_count;
 	/* those two are read-mostly, leave them at the end */
 	long tcp_prot_mem[3];
+	int tcp_max_orphans;
 	int tcp_memory_pressure;
 };
 
@@ -15,4 +17,8 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg);
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
 void tcp_destroy_cgroup(struct mem_cgroup *memcg);
 void tcp_prot_mem(struct mem_cgroup *memcg, long val, int idx);
+
+void cg_orphan_count_inc(struct sock *sk);
+void cg_orphan_count_dec(struct sock *sk);
+bool cg_too_many_orphans(struct sock *sk, int shift);
 #endif /* _TCP_MEMCG_H */
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -238,6 +238,7 @@ extern int udp_push_pending_frames(struct sock *sk);
 extern void udp_flush_pending_frames(struct sock *sk);
 extern int udp_rcv(struct sk_buff *skb);
 extern int udp_ioctl(struct sock *sk, int cmd, unsigned long arg);
+extern int __udp_disconnect(struct sock *sk, int flags);
 extern int udp_disconnect(struct sock *sk, int flags);
 extern unsigned int udp_poll(struct file *file, struct socket *sock,
 			     poll_table *wait);
@@ -331,4 +332,5 @@ extern void udp_encap_enable(void);
 #if IS_ENABLED(CONFIG_IPV6)
 extern void udpv6_encap_enable(void);
 #endif
+extern int udp_init_sock(struct sock *sk);
 #endif	/* _UDP_H */
--- /dev/null
+++ b/include/net/udp_memcontrol.h
@@ -0,0 +1,20 @@
+/*
+ *  include/net/udp_memcontrol.h
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UDP_MEMCG_H
+#define _UDP_MEMCG_H
+
+struct udp_memcontrol {
+	struct cg_proto cg_proto;
+	struct page_counter udp_memory_allocated;
+	long udp_prot_mem[3];
+};
+
+struct cg_proto *udp_proto_cgroup(struct mem_cgroup *memcg);
+int udp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss);
+void udp_destroy_cgroup(struct mem_cgroup *memcg);
+#endif /* _UDP_MEMCG_H */
--- a/include/trace/events/ext4.h
+++ b/include/trace/events/ext4.h
@@ -83,6 +83,36 @@ struct extent_status;
 	{ FALLOC_FL_ZERO_RANGE,		"ZERO_RANGE"})
 
 
+TRACE_EVENT(ext4_other_inode_update_time,
+	TP_PROTO(struct inode *inode, ino_t orig_ino),
+
+	TP_ARGS(inode, orig_ino),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	orig_ino		)
+		__field(	uid_t,	uid			)
+		__field(	gid_t,	gid			)
+		__field(	__u16, mode			)
+	),
+
+	TP_fast_assign(
+		__entry->orig_ino = orig_ino;
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->uid	= i_uid_read(inode);
+		__entry->gid	= i_gid_read(inode);
+		__entry->mode	= inode->i_mode;
+	),
+
+	TP_printk("dev %d,%d orig_ino %lu ino %lu mode 0%o uid %u gid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  (unsigned long) __entry->orig_ino,
+		  (unsigned long) __entry->ino, __entry->mode,
+		  __entry->uid, __entry->gid)
+);
+
 TRACE_EVENT(ext4_free_inode,
 	TP_PROTO(struct inode *inode),
 
@@ -887,6 +917,60 @@ TRACE_EVENT(ext4_sync_file_exit,
 		  __entry->ret)
 );
 
+TRACE_EVENT(ext4_sync_files_iterate,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int datasync),
+
+	TP_ARGS(dentry, tid, datasync),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	datasync		)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->datasync	= datasync;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+	),
+
+	TP_printk("dev %d,%d ino %ld parent %ld datasync %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->datasync,
+		  __entry->tid)
+);
+
+TRACE_EVENT(ext4_sync_files_exit,
+	TP_PROTO(struct dentry *dentry, tid_t tid, int barrier),
+
+	TP_ARGS(dentry, tid, barrier),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(	ino_t,	ino			)
+		__field(	ino_t,	parent			)
+		__field(	int,	barrier			)
+		__field(	unsigned int,	tid		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= dentry->d_inode->i_sb->s_dev;
+		__entry->ino		= dentry->d_inode->i_ino;
+		__entry->parent		= dentry->d_parent->d_inode->i_ino;
+		__entry->tid		= tid;
+		__entry->barrier	= barrier;
+	),
+
+	TP_printk("dev %d,%d ino %ld parent %ld explicit_barrier %d tid %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), (unsigned long) __entry->ino,
+		  (unsigned long) __entry->parent, __entry->barrier,
+		  __entry->tid)
+);
+
 TRACE_EVENT(ext4_sync_fs,
 	TP_PROTO(struct super_block *sb, int wait),
 
--- a/include/trace/events/gfpflags.h
+++ b/include/trace/events/gfpflags.h
@@ -34,7 +34,6 @@
 	{(unsigned long)__GFP_HARDWALL,		"GFP_HARDWALL"},	\
 	{(unsigned long)__GFP_THISNODE,		"GFP_THISNODE"},	\
 	{(unsigned long)__GFP_RECLAIMABLE,	"GFP_RECLAIMABLE"},	\
-	{(unsigned long)__GFP_KMEMCG,		"GFP_KMEMCG"},		\
 	{(unsigned long)__GFP_MOVABLE,		"GFP_MOVABLE"},		\
 	{(unsigned long)__GFP_NOTRACK,		"GFP_NOTRACK"},		\
 	{(unsigned long)__GFP_NO_KSWAPD,	"GFP_NO_KSWAPD"},	\
--- a/include/trace/events/kvm.h
+++ b/include/trace/events/kvm.h
@@ -358,14 +358,15 @@ TRACE_EVENT(
 #endif
 
 TRACE_EVENT(kvm_halt_poll_ns,
-	TP_PROTO(bool grow, unsigned int vcpu_id, int new, int old),
+	TP_PROTO(bool grow, unsigned int vcpu_id, unsigned int new,
+		 unsigned int old),
 	TP_ARGS(grow, vcpu_id, new, old),
 
 	TP_STRUCT__entry(
 		__field(bool, grow)
 		__field(unsigned int, vcpu_id)
-		__field(int, new)
-		__field(int, old)
+		__field(unsigned int, new)
+		__field(unsigned int, old)
 	),
 
 	TP_fast_assign(
@@ -375,7 +376,7 @@ TRACE_EVENT(kvm_halt_poll_ns,
 		__entry->old            = old;
 	),
 
-	TP_printk("vcpu %u: halt_poll_ns %d (%s %d)",
+	TP_printk("vcpu %u: halt_poll_ns %u (%s %u)",
 			__entry->vcpu_id,
 			__entry->new,
 			__entry->grow ? "grow" : "shrink",
--- a/include/trace/events/vmscan.h
+++ b/include/trace/events/vmscan.h
@@ -202,7 +202,7 @@ TRACE_EVENT(mm_shrink_slab_start,
 
 	TP_fast_assign(
 		__entry->shr = shr;
-		__entry->shrink = shr->shrink;
+		__entry->shrink = shr->scan_objects;
 		__entry->nr_objects_to_shrink = nr_objects_to_shrink;
 		__entry->gfp_flags = sc->gfp_mask;
 		__entry->pgs_scanned = pgs_scanned;
@@ -241,7 +241,7 @@ TRACE_EVENT(mm_shrink_slab_end,
 
 	TP_fast_assign(
 		__entry->shr = shr;
-		__entry->shrink = shr->shrink;
+		__entry->shrink = shr->scan_objects;
 		__entry->unused_scan = unused_scan_cnt;
 		__entry->new_scan = new_scan_cnt;
 		__entry->retval = shrinker_retval;
--- a/include/trace/events/writeback.h
+++ b/include/trace/events/writeback.h
@@ -18,6 +18,8 @@
 		{I_FREEING,		"I_FREEING"},		\
 		{I_CLEAR,		"I_CLEAR"},		\
 		{I_SYNC,		"I_SYNC"},		\
+		{I_DIRTY_TIME,		"I_DIRTY_TIME"},	\
+		{I_DIRTY_TIME_EXPIRED,	"I_DIRTY_TIME_EXPIRED"}, \
 		{I_REFERENCED,		"I_REFERENCED"}		\
 	)
 
@@ -69,6 +71,7 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 	TP_STRUCT__entry (
 		__array(char, name, 32)
 		__field(unsigned long, ino)
+		__field(unsigned long, state)
 		__field(unsigned long, flags)
 	),
 
@@ -79,16 +82,25 @@ DECLARE_EVENT_CLASS(writeback_dirty_inode_template,
 		strncpy(__entry->name,
 			bdi->dev ? dev_name(bdi->dev) : "(unknown)", 32);
 		__entry->ino		= inode->i_ino;
+		__entry->state		= inode->i_state;
 		__entry->flags		= flags;
 	),
 
-	TP_printk("bdi %s: ino=%lu flags=%s",
+	TP_printk("bdi %s: ino=%lu state=%s flags=%s",
 		__entry->name,
 		__entry->ino,
+		show_inode_state(__entry->state),
 		show_inode_state(__entry->flags)
 	)
 );
 
+DEFINE_EVENT(writeback_dirty_inode_template, writeback_mark_inode_dirty,
+
+	TP_PROTO(struct inode *inode, int flags),
+
+	TP_ARGS(inode, flags)
+);
+
 DEFINE_EVENT(writeback_dirty_inode_template, writeback_dirty_inode_start,
 
 	TP_PROTO(struct inode *inode, int flags),
@@ -599,6 +611,52 @@ DEFINE_EVENT(writeback_single_inode_template, writeback_single_inode,
 	TP_ARGS(inode, wbc, nr_to_write)
 );
 
+DECLARE_EVENT_CLASS(writeback_lazytime_template,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,	dev			)
+		__field(unsigned long,	ino			)
+		__field(unsigned long,	state			)
+		__field(	__u16, mode			)
+		__field(unsigned long, dirtied_when		)
+	),
+
+	TP_fast_assign(
+		__entry->dev	= inode->i_sb->s_dev;
+		__entry->ino	= inode->i_ino;
+		__entry->state	= inode->i_state;
+		__entry->mode	= inode->i_mode;
+		__entry->dirtied_when = inode->dirtied_when;
+	),
+
+	TP_printk("dev %d,%d ino %lu dirtied %lu state %s mode 0%o",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino, __entry->dirtied_when,
+		  show_inode_state(__entry->state), __entry->mode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_lazytime_iput,
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
+DEFINE_EVENT(writeback_lazytime_template, writeback_dirty_inode_enqueue,
+
+	TP_PROTO(struct inode *inode),
+
+	TP_ARGS(inode)
+);
+
 #endif /* _TRACE_WRITEBACK_H */
 
 /* This part must be outside protection */
--- a/include/uapi/asm-generic/ioctls.h
+++ b/include/uapi/asm-generic/ioctls.h
@@ -94,6 +94,8 @@
 #define TIOCMIWAIT	0x545C	/* wait for a change on serial input line(s) */
 #define TIOCGICOUNT	0x545D	/* read serial port inline interrupt counts */
 
+#define TIOSAK		_IO('T', 0x66)  /* "Secure Attention Key" */
+
 /*
  * Some arches already define FIOQSIZE due to a historical
  * conflict with a Hayes modem-specific ioctl value.
--- a/include/uapi/linux/Kbuild
+++ b/include/uapi/linux/Kbuild
@@ -62,6 +62,7 @@ header-y += auxvec.h
 header-y += ax25.h
 header-y += b1lli.h
 header-y += baycom.h
+header-y += beancounter.h
 header-y += bfs_fs.h
 header-y += binfmts.h
 header-y += blkpg.h
@@ -424,6 +425,12 @@ header-y += virtio_ring.h
 header-y += virtio_rng.h
 header-y += virtio_scsi.h
 header-y += vt.h
+header-y += vzcalluser.h
+header-y += vzctl_netstat.h
+header-y += vzctl_veth.h
+header-y += vzctl_venet.h
+header-y += vziptable_defs.h
+header-y += vzlist.h
 header-y += wait.h
 header-y += wanrouter.h
 header-y += watchdog.h
@@ -434,3 +441,4 @@ header-y += xattr.h
 header-y += xfrm.h
 header-y += hw_breakpoint.h
 header-y += userfaultfd.h
+header-y += compat.h
--- a/include/uapi/linux/aio_abi.h
+++ b/include/uapi/linux/aio_abi.h
@@ -44,6 +44,8 @@ enum {
 	IOCB_CMD_NOOP = 6,
 	IOCB_CMD_PREADV = 7,
 	IOCB_CMD_PWRITEV = 8,
+	IOCB_CMD_READ_ITER = 9,
+	IOCB_CMD_WRITE_ITER = 10,
 };
 
 /*
--- /dev/null
+++ b/include/uapi/linux/bc/statd.h
@@ -0,0 +1,76 @@
+/*
+ *  include/uapi/linux/bc/statd.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __BC_STATD_H_
+#define __BC_STATD_H_
+
+/* sys_ubstat commands list */
+#define UBSTAT_READ_ONE			0x010000
+#define UBSTAT_READ_ALL			0x020000
+#define UBSTAT_READ_FULL		0x030000
+#define UBSTAT_UBLIST			0x040000
+#define UBSTAT_UBPARMNUM		0x050000
+#define UBSTAT_GETTIME			0x060000
+
+#define UBSTAT_CMD(func)		((func) & 0xF0000)
+#define UBSTAT_PARMID(func)		((func) & 0x0FFFF)
+
+#define TIME_MAX_SEC		(LONG_MAX / HZ)
+#define TIME_MAX_JIF		(TIME_MAX_SEC * HZ)
+
+typedef unsigned long ubstattime_t;
+
+typedef struct {
+	ubstattime_t	start_time;
+	ubstattime_t	end_time;
+	ubstattime_t	cur_time;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+} ubgettime_t;
+
+typedef struct {
+	long		maxinterval;
+	int		signum;
+} ubnotifrq_t;
+
+typedef struct {
+	unsigned long	maxheld;
+	unsigned long	failcnt;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+} ubstatparm_t;
+
+typedef struct {
+	unsigned long	barrier;
+	unsigned long	limit;
+	unsigned long	held;
+	unsigned long	maxheld;
+	unsigned long	minheld;
+	unsigned long	failcnt;
+	unsigned long	__unused1;
+	unsigned long	__unused2;
+	/*
+	 * On any changes to this struct keep in mind fixing
+	 * all copy_to_user instances, initializing new fields/paddings
+	 * to prevent possible leaks from kernel-space.
+	 */
+} ubstatparmf_t;
+
+typedef struct {
+	ubstattime_t	start_time;
+	ubstattime_t	end_time;
+	ubstatparmf_t	param[0];
+} ubstatfull_t;
+
+#endif
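
A sys_ubstat command word is built by OR-ing one of the operations above with a 16-bit parameter id, and UBSTAT_CMD()/UBSTAT_PARMID() mask it back apart. A hedged sketch; UB_KMEMSIZE (0) is taken from the beancounter header added below, and the include paths assume installed uapi headers:

	#include <assert.h>
	#include <linux/bc/statd.h>
	#include <linux/beancounter.h>

	int main(void)
	{
		unsigned long func = UBSTAT_READ_ONE | UB_KMEMSIZE;

		assert(UBSTAT_CMD(func)    == UBSTAT_READ_ONE);	/* 0x010000 */
		assert(UBSTAT_PARMID(func) == UB_KMEMSIZE);	/* 0x0     */
		return 0;
	}
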
--- /dev/null
+++ b/include/uapi/linux/beancounter.h
@@ -0,0 +1,65 @@
+/*
+ *  include/uapi/linux/beancounter.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_BEANCOUNTER_H
+#define _UAPI_LINUX_BEANCOUNTER_H
+
+/*
+ * Resource list.
+ */
+#define UB_KMEMSIZE		0	/* Unswappable kernel memory size including
+					 * struct task, page directories, etc. */
+#define UB_LOCKEDPAGES		1	/* Mlock()ed pages. */
+#define UB_PRIVVMPAGES		2	/* Total number of pages, counting potentially
+					 * private pages as private and used. */
+#define UB_SHMPAGES		3	/* IPC SHM segment size. */
+#define UB_DUMMY		4	/* Dummy resource (compatibility) */
+#define UB_NUMPROC		5	/* Number of processes. */
+#define UB_PHYSPAGES		6	/* All resident pages, for swapout guarantee. */
+#define UB_VMGUARPAGES		7	/* Guarantee for memory allocation,
+					 * checked against PRIVVMPAGES. */
+#define UB_OOMGUARPAGES		8	/* Guarantees against OOM kill.
+					 * Only limit is used, no accounting. */
+#define UB_NUMTCPSOCK		9	/* Number of TCP sockets. */
+#define UB_NUMFLOCK		10	/* Number of file locks. */
+#define UB_NUMPTY		11	/* Number of PTYs. */
+#define UB_NUMSIGINFO		12	/* Number of siginfos. */
+#define UB_TCPSNDBUF		13	/* Total size of tcp send buffers. */
+#define UB_TCPRCVBUF		14	/* Total size of tcp receive buffers. */
+#define UB_OTHERSOCKBUF		15	/* Total size of other socket
+					 * send buffers (all buffers for PF_UNIX). */
+#define UB_DGRAMRCVBUF		16	/* Total size of other socket
+					 * receive buffers. */
+#define UB_NUMOTHERSOCK		17	/* Number of other sockets. */
+#define UB_DCACHESIZE		18	/* Size of busy dentry/inode cache. */
+#define UB_NUMFILE		19	/* Number of open files. */
+
+#define UB_RESOURCES_COMPAT	24
+
+/*
+ * Add new resources here.
+ */
+#define UB_NUMXTENT		23
+#define UB_SWAPPAGES		24
+#define UB_RESOURCES		25
+
+struct ubparm {
+	/*
+	 * A barrier over which resource allocations are failed gracefully.
+	 * If the amount of consumed memory is over the barrier, further
+	 * sbrk() or mmap() calls fail; the existing processes are not killed.
+	 */
+	unsigned long	barrier;
+	unsigned long	limit;		/* hard resource limit */
+	unsigned long	held;		/* consumed resources */
+	unsigned long	maxheld;	/* maximum amount of consumed resources over the last accounting period */
+	unsigned long	minheld;	/* minimum amount of consumed resources over the last accounting period */
+	unsigned long	failcnt;	/* count of failed charges */
+	int		max_precharge;	/* maximum percpu resource precharge */
+};
+
+#endif /* _UAPI_LINUX_BEANCOUNTER_H */
--- a/include/uapi/linux/blkpg.h
+++ b/include/uapi/linux/blkpg.h
@@ -41,6 +41,7 @@ struct blkpg_ioctl_arg {
 #define BLKPG_ADD_PARTITION	1
 #define BLKPG_DEL_PARTITION	2
 #define BLKPG_RESIZE_PARTITION	3
+#define BLKPG_GET_PARTITION	4
 
 /* Sizes of name fields. Unused at present. */
 #define BLKPG_DEVNAMELTH	64
--- a/include/uapi/linux/capability.h
+++ b/include/uapi/linux/capability.h
@@ -171,12 +171,9 @@ struct vfs_cap_data {
 
 #define CAP_NET_BROADCAST    11
 
-/* Allow interface configuration */
 /* Allow administration of IP firewall, masquerading and accounting */
 /* Allow setting debug option on sockets */
 /* Allow modification of routing tables */
-/* Allow setting arbitrary process / process group ownership on
-   sockets */
 /* Allow binding to any address for transparent proxying (also via NET_RAW) */
 /* Allow setting TOS (type of service) */
 /* Allow setting promiscuous mode */
@@ -207,6 +204,7 @@ struct vfs_cap_data {
 #define CAP_SYS_MODULE       16
 
 /* Allow ioperm/iopl access */
+/* Allow O_DIRECT access */
 /* Allow sending USB messages to any device via /proc/bus/usb */
 
 #define CAP_SYS_RAWIO        17
@@ -225,23 +223,18 @@ struct vfs_cap_data {
 
 /* Allow configuration of the secure attention key */
 /* Allow administration of the random device */
-/* Allow examination and configuration of disk quotas */
 /* Allow setting the domainname */
 /* Allow setting the hostname */
 /* Allow calling bdflush() */
-/* Allow mount() and umount(), setting up new smb connection */
+/* Allow setting up new smb connection */
 /* Allow some autofs root ioctls */
 /* Allow nfsservctl */
 /* Allow VM86_REQUEST_IRQ */
 /* Allow to read/write pci config on alpha */
 /* Allow irix_prctl on mips (setstacksize) */
 /* Allow flushing all cache on m68k (sys_cacheflush) */
-/* Allow removing semaphores */
-/* Used instead of CAP_CHOWN to "chown" IPC message queues, semaphores
-   and shared memory */
 /* Allow locking/unlocking of shared memory segment */
 /* Allow turning swap on/off */
-/* Allow forged pids on socket credentials passing */
 /* Allow setting readahead and flushing buffers on block devices */
 /* Allow setting geometry in floppy driver */
 /* Allow turning DMA on/off in xd driver */
--- a/include/uapi/linux/fadvise.h
+++ b/include/uapi/linux/fadvise.h
@@ -17,5 +17,9 @@
 #define POSIX_FADV_DONTNEED	4 /* Don't need these pages.  */
 #define POSIX_FADV_NOREUSE	5 /* Data will be accessed once.  */
 #endif
+#define FADV_DEACTIVATE		32 /* Mark pages as good candidates for reclaim */
 
+#ifdef __KERNEL__
+extern int generic_fadvise(struct file *file, loff_t off, loff_t len, int adv);
+#endif
 #endif	/* FADVISE_H_INCLUDED */
--- /dev/null
+++ b/include/uapi/linux/fairsched.h
@@ -0,0 +1,7 @@
+/*
+ *  include/uapi/linux/fairsched.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -4,6 +4,8 @@
 #define FALLOC_FL_KEEP_SIZE	0x01 /* default is extend size */
 #define FALLOC_FL_PUNCH_HOLE	0x02 /* de-allocates range */
 #define FALLOC_FL_NO_HIDE_STALE	0x04 /* reserved codepoint */
+#define FALLOC_FL_CONVERT_UNWRITTEN 0x100 /* mark extents as initialized */
+
 
 /*
  * FALLOC_FL_COLLAPSE_RANGE is used to remove a range of a file
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -53,9 +53,9 @@ struct files_stat_struct {
 };
 
 struct inodes_stat_t {
-	int nr_inodes;
-	int nr_unused;
-	int dummy[5];		/* padding for sysctl ABI compatibility */
+	long nr_inodes;
+	long nr_unused;
+	long dummy[5];		/* padding for sysctl ABI compatibility */
 };
 
 
@@ -90,6 +90,7 @@ struct inodes_stat_t {
 #define MS_KERNMOUNT	(1<<22) /* this is a kern_mount call */
 #define MS_I_VERSION	(1<<23) /* Update inode I_version field */
 #define MS_STRICTATIME	(1<<24) /* Always perform atime updates */
+#define MS_LAZYTIME	(1<<25) /* Update the on-disk [acm]times lazily */
 
 /* These sb flags are internal to the kernel */
 #define MS_NOSEC	(1<<28)
@@ -100,7 +101,8 @@ struct inodes_stat_t {
 /*
  * Superblock flags that can be altered by MS_REMOUNT
  */
-#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION)
+#define MS_RMT_MASK	(MS_RDONLY|MS_SYNCHRONOUS|MS_MANDLOCK|MS_I_VERSION|\
+			 MS_LAZYTIME)
 
 /*
  * Old magic mount flag and mask
@@ -151,6 +153,40 @@ struct inodes_stat_t {
 #define BLKROTATIONAL _IO(0x12,126)
 #define BLKZEROOUT _IO(0x12,127)
 
+/* Hole from 127..199 */
+struct blk_user_cbt_extent {
+	__u64 ce_physical; /* physical offset in bytes for the start
+			    * of the extent from the beginning of the disk */
+	__u64 ce_length;   /* length in bytes for this extent */
+	__u64 ce_reserved64[1];
+};
+
+struct blk_user_cbt_info {
+	__u8  ci_uuid[16];      /* Bitmap UUID */
+	__u64 ci_start;		/* start phisical range of mapping which
+				   userspace wants (in) */
+	__u64 ci_length;	/* phisical length of mapping which
+				 * userspace wants (in) */
+	__u32 ci_blksize;	/* cbt logical block size */
+	__u32 ci_flags;		/* CI_FLAG_* flags for request (in/out) */
+	__u32 ci_mapped_extents;/* number of extents that were mapped (out) */
+	__u32 ci_extent_count;  /* size of fm_extents array (in) */
+	__u32 ci_reserved;
+	struct blk_user_cbt_extent ci_extents[0]; /* array of mapped extents (out) */
+};
+
+enum CI_FLAGS
+{
+	CI_FLAG_ONCE = 1, /* BLKCBTGET will clear bits */
+	CI_FLAG_NEW_UUID = 2 /* BLKCBTSET update uuid */
+};
+
+#define BLKCBTSTART _IOR(0x12,200, struct blk_user_cbt_info)
+#define BLKCBTSTOP _IO(0x12,201)
+#define BLKCBTGET _IOWR(0x12,202,struct blk_user_cbt_info)
+#define BLKCBTSET _IOR(0x12,203,struct blk_user_cbt_info)
+#define BLKCBTCLR _IOR(0x12,204,struct blk_user_cbt_info)
+
 #define BMAP_IOCTL 1		/* obsolete - kept for compatibility */
 #define FIBMAP	   _IO(0x00,1)	/* bmap access */
 #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
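
A hedged userspace sketch of querying the changed-block-tracking bitmap with BLKCBTGET. The extents array is allocated inline after the header, as ci_extent_count (in) and ci_mapped_extents (out) suggest; treating ci_length = ~0 as "whole device" is an assumption:

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <unistd.h>
	#include <linux/fs.h>

	#define N_EXTENTS 128	/* hypothetical batch size */

	static int dump_changed_extents(const char *blkdev)
	{
		struct blk_user_cbt_info *ci;
		int fd, err = -1;
		__u32 i;

		ci = calloc(1, sizeof(*ci) +
			       N_EXTENTS * sizeof(struct blk_user_cbt_extent));
		if (!ci)
			return -1;
		ci->ci_length = ~0ULL;		/* assumed: whole device */
		ci->ci_extent_count = N_EXTENTS;

		fd = open(blkdev, O_RDONLY);
		if (fd >= 0 && ioctl(fd, BLKCBTGET, ci) == 0) {
			for (i = 0; i < ci->ci_mapped_extents; i++)
				printf("changed: off %llu len %llu\n",
				       (unsigned long long)ci->ci_extents[i].ce_physical,
				       (unsigned long long)ci->ci_extents[i].ce_length);
			err = 0;
		}
		if (fd >= 0)
			close(fd);
		free(ci);
		return err;
	}
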
--- a/include/uapi/linux/fuse.h
+++ b/include/uapi/linux/fuse.h
@@ -364,6 +364,7 @@ enum fuse_notify_code {
 	FUSE_NOTIFY_STORE = 4,
 	FUSE_NOTIFY_RETRIEVE = 5,
 	FUSE_NOTIFY_DELETE = 6,
+	FUSE_NOTIFY_INVAL_FILES = 77,
 	FUSE_NOTIFY_CODE_MAX,
 };
 
@@ -744,4 +745,8 @@ struct fuse_notify_retrieve_in {
 	uint64_t	dummy4;
 };
 
+struct fuse_notify_inval_files_out {
+	__u64	ino;
+};
+
 #endif /* _LINUX_FUSE_H */
--- a/include/uapi/linux/if.h
+++ b/include/uapi/linux/if.h
@@ -221,6 +221,7 @@ struct ifreq {
 		char	ifru_newname[IFNAMSIZ];
 		void __user *	ifru_data;
 		struct	if_settings ifru_settings;
+		unsigned int ifru_acctid;
 	} ifr_ifru;
 };
 
@@ -241,6 +242,7 @@ struct ifreq {
 #define ifr_qlen	ifr_ifru.ifru_ivalue	/* Queue length 	*/
 #define ifr_newname	ifr_ifru.ifru_newname	/* New name		*/
 #define ifr_settings	ifr_ifru.ifru_settings	/* Device/proto settings*/
+#define ifr_acctid	ifr_ifru.ifru_acctid	/* New ve accounting identifier */
 
 /*
  * Structure used in SIOCGIFCONF request.
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -57,6 +57,9 @@
 #define TUNSETVNETBE _IOW('T', 222, int)
 #define TUNGETVNETBE _IOR('T', 223, int)
 
+/* Requires CONFIG_VE_TUNTAP_ACCOUNTING */
+#define TUNSETACCTID _IOW('T', 300, struct ifreq)
+
 /* TUNSETIFF ifr flags */
 #define IFF_TUN		0x0001
 #define IFF_TAP		0x0002
--- a/include/uapi/linux/in6.h
+++ b/include/uapi/linux/in6.h
@@ -174,6 +174,8 @@ struct in6_flowlabel_req {
 #define IPV6_JOIN_ANYCAST	27
 #define IPV6_LEAVE_ANYCAST	28
 
+#define IPV6_HDRINCL		36
+
 /* IPV6_MTU_DISCOVER values */
 #define IPV6_PMTUDISC_DONT		0
 #define IPV6_PMTUDISC_WANT		1
--- a/include/uapi/linux/inet_diag.h
+++ b/include/uapi/linux/inet_diag.h
@@ -43,6 +43,23 @@ struct inet_diag_req_v2 {
 	struct inet_diag_sockid id;
 };
 
+/*
+ * SOCK_RAW sockets require the underlying protocol to be
+ * additionally specified, so we can use the @pad member for
+ * this, but we can't rename it because userspace programs
+ * may still depend on this name. Instead, let's use another
+ * structure definition as an alias for struct
+ * @inet_diag_req_v2.
+ */
+struct inet_diag_req_raw {
+	__u8	sdiag_family;
+	__u8	sdiag_protocol;
+	__u8	idiag_ext;
+	__u8	sdiag_raw_protocol;
+	__u32	idiag_states;
+	struct inet_diag_sockid id;
+};
+
 enum {
 	INET_DIAG_REQ_NONE,
 	INET_DIAG_REQ_BYTECODE,
--- /dev/null
+++ b/include/uapi/linux/kcov.h
@@ -0,0 +1,10 @@
+#ifndef _LINUX_KCOV_IOCTLS_H
+#define _LINUX_KCOV_IOCTLS_H
+
+#include <linux/types.h>
+
+#define KCOV_INIT_TRACE			_IOR('c', 1, unsigned long)
+#define KCOV_ENABLE			_IO('c', 100)
+#define KCOV_DISABLE			_IO('c', 101)
+
+#endif /* _LINUX_KCOV_IOCTLS_H */
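
These are the same ioctls as the upstream KCOV coverage collector, so the conventional usage pattern should apply: size the buffer with KCOV_INIT_TRACE (in words, not bytes), mmap it, enable, run the workload, then read PCs back with slot 0 holding the count. A sketch on that assumption:

	#include <fcntl.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/kcov.h>

	#define COVER_SIZE (64 << 10)	/* trace slots, not bytes */

	int main(void)
	{
		unsigned long *cover, n, i;
		int fd = open("/sys/kernel/debug/kcov", O_RDWR);

		if (fd < 0)
			return 1;
		if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
			return 1;
		cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
			     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (cover == MAP_FAILED)
			return 1;
		if (ioctl(fd, KCOV_ENABLE, 0))
			return 1;
		cover[0] = 0;			/* slot 0: number of PCs recorded */

		read(-1, NULL, 0);		/* the syscall under test */

		n = cover[0];
		for (i = 0; i < n; i++)
			printf("0x%lx\n", cover[i + 1]);
		ioctl(fd, KCOV_DISABLE, 0);
		return 0;
	}
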
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -147,6 +147,25 @@ struct kvm_pit_config {
 
 #define KVM_PIT_SPEAKER_DUMMY     1
 
+struct kvm_hyperv_exit {
+#define KVM_EXIT_HYPERV_SYNIC          1
+#define KVM_EXIT_HYPERV_HCALL          2
+	__u32 type;
+	union {
+		struct {
+			__u32 msr;
+			__u64 control;
+			__u64 evt_page;
+			__u64 msg_page;
+		} synic;
+		struct {
+			__u64 input;
+			__u64 result;
+			__u64 params[2];
+		} hcall;
+	} u;
+};
+
 #define KVM_EXIT_UNKNOWN          0
 #define KVM_EXIT_EXCEPTION        1
 #define KVM_EXIT_IO               2
@@ -171,7 +190,9 @@ struct kvm_pit_config {
 #define KVM_EXIT_WATCHDOG         21
 #define KVM_EXIT_S390_TSCH        22
 #define KVM_EXIT_EPR              23
+#define KVM_EXIT_SYSTEM_EVENT     24
 #define KVM_EXIT_IOAPIC_EOI       26
+#define KVM_EXIT_HYPERV           27
 
 /* For KVM_EXIT_INTERNAL_ERROR */
 /* Emulate instruction failed. */
@@ -306,6 +327,16 @@ struct kvm_run {
 		struct {
 			__u8 vector;
 		} eoi;
+		/* KVM_EXIT_SYSTEM_EVENT */
+		struct {
+#define KVM_SYSTEM_EVENT_SHUTDOWN       1
+#define KVM_SYSTEM_EVENT_RESET          2
+#define KVM_SYSTEM_EVENT_CRASH          3
+			__u32 type;
+			__u64 flags;
+		} system_event;
+		/* KVM_EXIT_HYPERV */
+		struct kvm_hyperv_exit hyperv;
 		/* Fix the size of the union. */
 		char padding[256];
 	};
@@ -691,6 +722,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_GUEST_DEBUG_HW_WPS 120
 #define KVM_CAP_SPLIT_IRQCHIP 121
 #define KVM_CAP_IOEVENTFD_ANY_LENGTH 122
+#define KVM_CAP_HYPERV_SYNIC 123
 #define KVM_CAP_X2APIC_API 129
 
 #ifdef KVM_CAP_IRQ_ROUTING
@@ -707,9 +739,15 @@ struct kvm_irq_routing_msi {
 	__u32 pad;
 };
 
+struct kvm_irq_routing_hv_sint {
+	__u32 vcpu;
+	__u32 sint;
+};
+
 /* gsi routing entry types */
 #define KVM_IRQ_ROUTING_IRQCHIP 1
 #define KVM_IRQ_ROUTING_MSI 2
+#define KVM_IRQ_ROUTING_HV_SINT 4
 
 struct kvm_irq_routing_entry {
 	__u32 gsi;
@@ -719,6 +757,7 @@ struct kvm_irq_routing_entry {
 	union {
 		struct kvm_irq_routing_irqchip irqchip;
 		struct kvm_irq_routing_msi msi;
+		struct kvm_irq_routing_hv_sint hv_sint;
 		__u32 pad[8];
 	} u;
 };
--- a/include/uapi/linux/netfilter/xt_DSCP.h
+++ b/include/uapi/linux/netfilter/xt_DSCP.h
@@ -13,6 +13,12 @@
 #include <linux/netfilter/xt_dscp.h>
 #include <linux/types.h>
 
+#define IPTOS_NORMALSVC 0
+
+struct ipt_tos_target_info {
+	u_int8_t tos;
+};
+
 /* target info */
 struct xt_DSCP_info {
 	__u8 dscp;
--- a/include/uapi/linux/netfilter/xt_connlimit.h
+++ b/include/uapi/linux/netfilter/xt_connlimit.h
@@ -22,8 +22,13 @@ struct xt_connlimit_info {
 #endif
 	};
 	unsigned int limit;
-	/* revision 1 */
-	__u32 flags;
+	union {
+		/* revision 0 */
+		unsigned int inverse;
+
+		/* revision 1 */
+		__u32 flags;
+	};
 
 	/* Used internally by the kernel */
 	struct xt_connlimit_data *data __attribute__((aligned(8)));
--- a/include/uapi/linux/netfilter/xt_connmark.h
+++ b/include/uapi/linux/netfilter/xt_connmark.h
@@ -18,11 +18,22 @@ enum {
 	XT_CONNMARK_RESTORE
 };
 
+struct xt_connmark_target_info {
+	unsigned long mark;
+	unsigned long mask;
+	__u8 mode;
+};
+
 struct xt_connmark_tginfo1 {
 	__u32 ctmark, ctmask, nfmask;
 	__u8 mode;
 };
 
+struct xt_connmark_info {
+	unsigned long mark, mask;
+	__u8 invert;
+};
+
 struct xt_connmark_mtinfo1 {
 	__u32 mark, mask;
 	__u8 invert;
--- a/include/uapi/linux/netfilter/xt_conntrack.h
+++ b/include/uapi/linux/netfilter/xt_conntrack.h
@@ -34,6 +34,41 @@ enum {
 	XT_CONNTRACK_STATE_ALIAS  = 1 << 13,
 };
 
+/* This is exposed to userspace, so remains frozen in time. */
+struct ip_conntrack_old_tuple
+{
+	struct {
+		__be32 ip;
+		union {
+			__u16 all;
+		} u;
+	} src;
+
+	struct {
+		__be32 ip;
+		union {
+			__u16 all;
+		} u;
+
+		/* The protocol. */
+		__u16 protonum;
+	} dst;
+};
+
+struct xt_conntrack_info
+{
+	unsigned int statemask, statusmask;
+	struct ip_conntrack_old_tuple tuple[IP_CT_DIR_MAX];
+	struct in_addr sipmsk[IP_CT_DIR_MAX], dipmsk[IP_CT_DIR_MAX];
+
+	unsigned long expires_min, expires_max;
+
+	/* Flags word */
+	__u8 flags;
+	/* Inverse flags */
+	__u8 invflags;
+};
+
 struct xt_conntrack_mtinfo1 {
 	union nf_inet_addr origsrc_addr, origsrc_mask;
 	union nf_inet_addr origdst_addr, origdst_mask;
--- a/include/uapi/linux/netfilter/xt_mark.h
+++ b/include/uapi/linux/netfilter/xt_mark.h
@@ -3,10 +3,32 @@
 
 #include <linux/types.h>
 
+/* Version 0 */
+struct xt_mark_target_info {
+	unsigned long mark;
+};
+
+/* Version 1 */
+enum {
+	XT_MARK_SET=0,
+	XT_MARK_AND,
+	XT_MARK_OR,
+};
+
+struct xt_mark_target_info_v1 {
+	unsigned long mark;
+	__u8 mode;
+};
+
 struct xt_mark_tginfo2 {
 	__u32 mark, mask;
 };
 
+struct xt_mark_info {
+	unsigned long mark, mask;
+	__u8 invert;
+};
+
 struct xt_mark_mtinfo1 {
 	__u32 mark, mask;
 	__u8 invert;
--- a/include/uapi/linux/netfilter/xt_owner.h
+++ b/include/uapi/linux/netfilter/xt_owner.h
@@ -9,6 +9,23 @@ enum {
 	XT_OWNER_SOCKET = 1 << 2,
 };
 
+struct ipt_owner_info {
+	uid_t uid;
+	gid_t gid;
+	pid_t pid;
+	pid_t sid;
+	char comm[16];
+	u_int8_t match, invert;     /* flags */
+};
+
+struct ip6t_owner_info {
+	uid_t uid;
+	gid_t gid;
+	pid_t pid;
+	pid_t sid;
+	u_int8_t match, invert;     /* flags */
+};
+
 struct xt_owner_match_info {
 	__u32 uid_min, uid_max;
 	__u32 gid_min, gid_max;
--- a/include/uapi/linux/netfilter_arp/arp_tables.h
+++ b/include/uapi/linux/netfilter_arp/arp_tables.h
@@ -71,9 +71,9 @@ struct arpt_arp {
 };
 
 /* Values for "flag" field in struct arpt_ip (general arp structure).
- * No flags defined yet.
  */
-#define ARPT_F_MASK		0x00	/* All possible flag bits mask. */
+#define ARPT_WDOGTMO		0x80
+#define ARPT_F_MASK		0x80	/* All possible flag bits mask. */
 
 /* Values for "inv" field in struct arpt_arp. */
 #define ARPT_INV_VIA_IN		0x0001	/* Invert the sense of IN IFACE. */
@@ -86,7 +86,8 @@ struct arpt_arp {
 #define ARPT_INV_ARPHRD		0x0080	/* Invert the sense of ARP HRD. */
 #define ARPT_INV_ARPPRO		0x0100	/* Invert the sense of ARP PRO. */
 #define ARPT_INV_ARPHLN		0x0200	/* Invert the sense of ARP HLN. */
-#define ARPT_INV_MASK		0x03FF	/* All possible flag bits mask. */
+#define ARPT_INV_WDOGTMO	0x8000	/* Invert the sense of the ARPT_WDOGTMO flag */
+#define ARPT_INV_MASK		0x83FF	/* All possible flag bits mask. */
 
 /* This structure defines each of the firewall rules.  Consists of 3
    parts which are 1) general ARP header stuff 2) match specific
--- a/include/uapi/linux/netlink.h
+++ b/include/uapi/linux/netlink.h
@@ -101,14 +101,17 @@ struct nlmsgerr {
 	struct nlmsghdr msg;
 };
 
-#define NETLINK_ADD_MEMBERSHIP	1
-#define NETLINK_DROP_MEMBERSHIP	2
-#define NETLINK_PKTINFO		3
-#define NETLINK_BROADCAST_ERROR	4
-#define NETLINK_NO_ENOBUFS	5
-#define NETLINK_RX_RING		6
-#define NETLINK_TX_RING		7
-#define NETLINK_LISTEN_ALL_NSID	8
+#define NETLINK_ADD_MEMBERSHIP		1
+#define NETLINK_DROP_MEMBERSHIP		2
+#define NETLINK_PKTINFO			3
+#define NETLINK_BROADCAST_ERROR		4
+#define NETLINK_NO_ENOBUFS		5
+#define NETLINK_RX_RING			6
+#define NETLINK_TX_RING			7
+#define NETLINK_LISTEN_ALL_NSID		8
+#define NETLINK_LIST_MEMBERSHIPS	9
+#define NETLINK_CAP_ACK			10
+#define NETLINK_REPAIR			11
 
 struct nl_pktinfo {
 	__u32	group;
--- a/include/uapi/linux/netlink_diag.h
+++ b/include/uapi/linux/netlink_diag.h
@@ -37,6 +37,7 @@ enum {
 	NETLINK_DIAG_GROUPS,
 	NETLINK_DIAG_RX_RING,
 	NETLINK_DIAG_TX_RING,
+	NETLINK_DIAG_FLAGS,
 
 	__NETLINK_DIAG_MAX,
 };
@@ -48,5 +49,14 @@ enum {
 #define NDIAG_SHOW_MEMINFO	0x00000001 /* show memory info of a socket */
 #define NDIAG_SHOW_GROUPS	0x00000002 /* show groups of a netlink socket */
 #define NDIAG_SHOW_RING_CFG	0x00000004 /* show ring configuration */
+#define NDIAG_SHOW_FLAGS	0x00000008 /* show flags of a netlink socket */
+
+/* flags */
+#define NDIAG_FLAG_CB_RUNNING		0x00000001
+#define NDIAG_FLAG_PKTINFO		0x00000002
+#define NDIAG_FLAG_BROADCAST_ERROR	0x00000004
+#define NDIAG_FLAG_NO_ENOBUFS		0x00000008
+#define NDIAG_FLAG_LISTEN_ALL_NSID	0x00000010
+#define NDIAG_FLAG_CAP_ACK		0x00000020
 
 #endif
--- a/include/uapi/linux/ptrace.h
+++ b/include/uapi/linux/ptrace.h
@@ -64,6 +64,8 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_GETSIGMASK	0x420a
 #define PTRACE_SETSIGMASK	0x420b
 
+#define PTRACE_SECCOMP_GET_FILTER	0x420c
+
 /* Read signals from a shared (process wide) queue */
 #define PTRACE_PEEKSIGINFO_SHARED	(1 << 0)
 
@@ -89,9 +91,11 @@ struct ptrace_peeksiginfo_args {
 #define PTRACE_O_TRACESECCOMP	(1 << PTRACE_EVENT_SECCOMP)
 
 /* eventless options */
-#define PTRACE_O_EXITKILL	(1 << 20)
+#define PTRACE_O_EXITKILL		(1 << 20)
+#define PTRACE_O_SUSPEND_SECCOMP	(1 << 21)
 
-#define PTRACE_O_MASK		(0x000000ff | PTRACE_O_EXITKILL)
+#define PTRACE_O_MASK		(\
+	0x000000ff | PTRACE_O_EXITKILL | PTRACE_O_SUSPEND_SECCOMP)
 
 #include <asm/ptrace.h>
 
--- a/include/uapi/linux/tcp.h
+++ b/include/uapi/linux/tcp.h
@@ -112,12 +112,22 @@ enum {
 #define TCP_FASTOPEN		23	/* Enable FastOpen on listeners */
 #define TCP_TIMESTAMP		24
 #define TCP_NOTSENT_LOWAT	25	/* limit number of unsent bytes in write queue */
+#define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
 
 struct tcp_repair_opt {
 	__u32	opt_code;
 	__u32	opt_val;
 };
 
+struct tcp_repair_window {
+	__u32	snd_wl1;
+	__u32	snd_wnd;
+	__u32	max_window;
+
+	__u32	rcv_wnd;
+	__u32	rcv_wup;
+};
+
 enum {
 	TCP_NO_QUEUE,
 	TCP_RECV_QUEUE,
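
TCP_REPAIR_WINDOW moves window state through [gs]etsockopt() on a socket already switched into repair mode, mirroring the upstream option of the same name. A hedged checkpoint-side sketch; TCP_REPAIR itself is assumed available, and an older libc may not expose these constants:

	#include <stdio.h>
	#include <sys/socket.h>
	#include <netinet/in.h>
	#include <linux/tcp.h>

	static int dump_window(int fd)
	{
		struct tcp_repair_window w;
		socklen_t len = sizeof(w);

		if (getsockopt(fd, IPPROTO_TCP, TCP_REPAIR_WINDOW, &w, &len))
			return -1;
		printf("snd_wl1 %u snd_wnd %u max %u rcv_wnd %u rcv_wup %u\n",
		       w.snd_wl1, w.snd_wnd, w.max_window, w.rcv_wnd, w.rcv_wup);
		return 0;
	}
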
--- /dev/null
+++ b/include/uapi/linux/venet-netlink.h
@@ -0,0 +1,30 @@
+/*
+ *  include/uapi/linux/venet-netlink.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __NET_VENET_H_
+#define __NET_VENET_H_
+
+enum {
+	VENET_INFO_UNSPEC,
+	VENET_INFO_CMD,
+
+	__VENET_INFO_MAX
+#define VENET_INFO_MAX   (__VENET_INFO_MAX - 1)
+};
+
+enum {
+	VENET_IP_ADD,
+	VENET_IP_DEL,
+};
+
+struct venetaddrmsg {
+	__u8		va_family;
+	__u8		va_cmd;
+	__u32		va_addr[4];
+};
+
+#endif
--- a/include/uapi/linux/veth.h
+++ b/include/uapi/linux/veth.h
@@ -1,3 +1,12 @@
+/*
+ *  include/uapi/linux/veth.h
+ *
+ *  Copyright (C) 2007  SWsoft
+ *  All rights reserved.
+ *
+ *  Licensing governed by "linux/COPYING.SWsoft" file.
+ *
+ */
 #ifndef __NET_VETH_H_
 #define __NET_VETH_H_
 
@@ -9,4 +18,7 @@ enum {
 #define VETH_INFO_MAX	(__VETH_INFO_MAX - 1)
 };
 
+#define SIOCSVENET	(SIOCDEVPRIVATE + 0xf)
+#define SIOCSFIXEDADDR	(SIOCDEVPRIVATE + 0xe)
+
 #endif
--- /dev/null
+++ b/include/uapi/linux/vzcalluser.h
@@ -0,0 +1,203 @@
+/*
+ *  include/uapi/linux/vzcalluser.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_VZCALLUSER_H
+#define _UAPI_LINUX_VZCALLUSER_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#include <linux/vziptable_defs.h>
+
+#ifndef __ENVID_T_DEFINED__
+# define __ENVID_T_DEFINED__
+typedef unsigned int envid_t;
+#endif
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+/*
+ * VE management ioctls
+ */
+
+#define VE_CREATE	1	/* Create VE, VE_ENTER added automatically */
+#define VE_EXCLUSIVE	2	/* Fail if exists */
+#define VE_ENTER	4	/* Enter existing VE */
+#define VE_TEST		8	/* Test if VE exists */
+#define VE_LOCK		16	/* Do not allow entering created VE */
+#define VE_SKIPLOCK	32	/* Allow entering an embryonic VE */
+
+struct vzctl_old_env_create {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				addr;
+};
+
+struct vzctl_mark_env_to_down {
+	envid_t			veid;
+};
+
+#define VE_USE_MAJOR	010	/* Test MAJOR supplied in rule */
+#define VE_USE_MINOR	030	/* Test MINOR supplied in rule */
+#define VE_USE_MASK	030	/* Testing mask, VE_USE_MAJOR|VE_USE_MINOR */
+
+struct vzctl_setdevperms {
+	envid_t				veid;
+	unsigned int			type;
+	unsigned int			dev;
+	unsigned int			mask;
+};
+
+#define VE_NETDEV_ADD  1
+#define VE_NETDEV_DEL  2
+
+struct vzctl_ve_netdev {
+	envid_t				veid;
+	int				op;
+	char __user			*dev_name;
+};
+
+#define VE_CONFIGURE_OS_RELEASE		2
+#define VE_CONFIGURE_CREATE_PROC_LINK	4
+#define VE_CONFIGURE_OPEN_TTY		5
+
+struct vzctl_ve_configure {
+	envid_t				veid;
+	unsigned int			key;
+	unsigned int			val;
+	unsigned int			size;
+	char				data[0];
+};
+
+struct vzctl_ve_meminfo {
+	envid_t				veid;
+	unsigned long			val;
+};
+
+struct vzctl_env_create_cid {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+};
+
+struct vzctl_env_create {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+};
+
+struct env_create_param {
+	__u64				iptables_mask;
+};
+
+#define VZCTL_ENV_CREATE_DATA_MINLEN	sizeof(struct env_create_param)
+
+struct env_create_param2 {
+	__u64				iptables_mask;
+	__u64				feature_mask;
+	__u32				total_vcpus;	/* 0 - don't care, same as in host */
+};
+
+struct env_create_param3 {
+	__u64				iptables_mask;
+	__u64				feature_mask;
+	__u32				total_vcpus;
+	__u32				pad;
+	__u64				known_features;
+};
+
+#define VE_FEATURE_SYSFS	(1ULL << 0)	/* deprecated */
+#define VE_FEATURE_NFS		(1ULL << 1)
+#define VE_FEATURE_DEF_PERMS	(1ULL << 2)	/* deprecated */
+#define VE_FEATURE_SIT		(1ULL << 3)
+#define VE_FEATURE_IPIP		(1ULL << 4)
+#define VE_FEATURE_PPP		(1ULL << 5)
+#define VE_FEATURE_IPGRE	(1ULL << 6)	/* deprecated */
+#define VE_FEATURE_BRIDGE	(1ULL << 7)
+#define VE_FEATURE_NFSD		(1ULL << 8)
+
+#define VE_FEATURES_OLD		(VE_FEATURE_SYSFS)
+#define VE_FEATURES_DEF		(VE_FEATURE_SYSFS | VE_FEATURE_DEF_PERMS)
+
+typedef struct env_create_param3 env_create_param_t;
+#define VZCTL_ENV_CREATE_DATA_MAXLEN	sizeof(env_create_param_t)
+
+struct vzctl_env_create_data {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+	env_create_param_t __user	*data;
+	int				datalen;
+};
+
+struct vz_load_avg {
+	int				val_int;
+	int				val_frac;
+};
+
+struct vz_cpu_stat {
+	unsigned long			user_jif;
+	unsigned long			nice_jif;
+	unsigned long			system_jif;
+	unsigned long			uptime_jif;
+	__u64				idle_clk;
+	__u64				strv_clk;
+	__u64				uptime_clk;
+	struct vz_load_avg		avenrun[3];	/* loadavg data */
+};
+
+struct vzctl_cpustatctl {
+	envid_t				veid;
+	struct vz_cpu_stat __user	*cpustat;
+};
+
+#define VZCTLTYPE			'.'
+#define VZCTL_OLD_ENV_CREATE		_IOW(VZCTLTYPE,  0, struct vzctl_old_env_create)
+#define VZCTL_MARK_ENV_TO_DOWN		_IOW(VZCTLTYPE,  1, struct vzctl_mark_env_to_down)
+#define VZCTL_SETDEVPERMS		_IOW(VZCTLTYPE,  2, struct vzctl_setdevperms) /* DEPRECATED */
+#define VZCTL_ENV_CREATE_CID		_IOW(VZCTLTYPE,  4, struct vzctl_env_create_cid)
+#define VZCTL_ENV_CREATE		_IOW(VZCTLTYPE,  5, struct vzctl_env_create)
+#define VZCTL_GET_CPU_STAT		_IOW(VZCTLTYPE,  6, struct vzctl_cpustatctl)
+#define VZCTL_ENV_CREATE_DATA		_IOW(VZCTLTYPE, 10, struct vzctl_env_create_data)
+#define VZCTL_VE_NETDEV			_IOW(VZCTLTYPE, 11, struct vzctl_ve_netdev)
+#define VZCTL_VE_MEMINFO		_IOW(VZCTLTYPE, 13, struct vzctl_ve_meminfo)
+#define VZCTL_VE_CONFIGURE		_IOW(VZCTLTYPE, 15, struct vzctl_ve_configure)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_ve_netdev {
+	envid_t				veid;
+	int				op;
+	compat_uptr_t			dev_name;
+};
+
+struct compat_vzctl_ve_meminfo {
+	envid_t				veid;
+	compat_ulong_t			val;
+};
+
+struct compat_vzctl_env_create_data {
+	envid_t				veid;
+	unsigned int			flags;
+	__u32				class_id;
+	compat_uptr_t			data;
+	int				datalen;
+};
+
+#define VZCTL_COMPAT_ENV_CREATE_DATA	_IOW(VZCTLTYPE, 10, struct compat_vzctl_env_create_data)
+#define VZCTL_COMPAT_VE_NETDEV		_IOW(VZCTLTYPE, 11, struct compat_vzctl_ve_netdev)
+#define VZCTL_COMPAT_VE_MEMINFO		_IOW(VZCTLTYPE, 13, struct compat_vzctl_ve_meminfo)
+
+#endif /* CONFIG_COMPAT */
+#endif /* __KERNEL__ */
+
+#endif /* _UAPI_LINUX_VZCALLUSER_H */
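
A minimal userspace sketch of driving these ioctls. Assumptions are
flagged in the comments: the /dev/vzctl node is created by the vzdev
module and may differ on a given setup, and VE ID 101 is chosen purely
for illustration.

	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/vzcalluser.h>

	int main(void)
	{
		struct vzctl_env_create_data data = { 0 };
		env_create_param_t param = { 0 };
		int fd, ret;

		fd = open("/dev/vzctl", O_RDWR);	/* assumed device node */
		if (fd < 0) {
			perror("open");
			return 1;
		}

		param.iptables_mask = VE_IP_DEFAULT;
		data.veid = 101;			/* example VE ID */
		data.flags = VE_CREATE | VE_ENTER;
		data.data = &param;
		data.datalen = sizeof(param);

		ret = ioctl(fd, VZCTL_ENV_CREATE_DATA, &data);
		if (ret < 0)
			perror("VZCTL_ENV_CREATE_DATA");
		close(fd);
		return ret < 0;
	}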
--- /dev/null
+++ b/include/uapi/linux/vzctl_netstat.h
@@ -0,0 +1,129 @@
+/*
+ *  include/uapi/linux/vzctl_netstat.h
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef __VZCTL_NETSTAT_H__
+#define __VZCTL_NETSTAT_H__
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+# define __ENVID_T_DEFINED__
+typedef unsigned int envid_t;
+#endif
+
+/*
+ * Traffic accounting management ioctls
+ */
+
+struct vz_tc_class_info {
+	__u32				cid;	/* class number */
+	__u32				addr;	/* Network byte order */
+	__u32				mask;	/* subnet mask */
+	/*
+	 * When changing this struct, remember to fix all copy_to_user()
+	 * instances and to initialize new fields/paddings to prevent
+	 * possible leaks from kernel space.
+	 */
+};
+
+struct vzctl_tc_classes {
+	struct vz_tc_class_info		*info;
+	int				length;
+};
+
+/* For IPv6 */
+struct vz_tc_class_info_v6 {
+	__u32				cid;	/* class number */
+	__u32				addr[4];/* Network byte order */
+	__u32				mask[4];/* subnet mask */
+	/*
+	 * When changing this struct, remember to fix all copy_to_user()
+	 * instances and to initialize new fields/paddings to prevent
+	 * possible leaks from kernel space.
+	 */
+};
+
+struct vzctl_tc_classes_v6 {
+	struct vz_tc_class_info_v6	*info;
+	int				length;
+};
+
+struct vzctl_tc_get_stat {
+	envid_t				veid;
+	__u64				*incoming;
+	__u64				*outgoing;
+	__u32				*incoming_pkt;
+	__u32				*outgoing_pkt;
+	int				length;
+};
+
+struct vzctl_tc_get_stat_list {
+	envid_t				*list;
+	int				length;
+};
+
+struct vzctl_tc_set_base {
+	envid_t				veid;
+	__u16				base;
+};
+
+#define VZTCCTLTYPE			'='
+#define VZCTL_TC_MAX_CLASS		_IO(VZTCCTLTYPE, 1)
+#define VZCTL_TC_CLASS_NUM		_IO(VZTCCTLTYPE, 2)
+#define VZCTL_TC_SET_CLASS_TABLE	_IOW(VZTCCTLTYPE, 3, struct vzctl_tc_classes)
+#define VZCTL_TC_GET_CLASS_TABLE	_IOR(VZTCCTLTYPE, 4, struct vzctl_tc_classes)
+#define VZCTL_TC_STAT_NUM		_IO(VZTCCTLTYPE, 5)
+#define VZCTL_TC_GET_STAT_LIST		_IOR(VZTCCTLTYPE, 6, struct vzctl_tc_get_stat_list)
+#define VZCTL_TC_GET_STAT		_IOR(VZTCCTLTYPE, 7, struct vzctl_tc_get_stat)
+#define VZCTL_TC_DESTROY_STAT		_IO(VZTCCTLTYPE, 8)
+#define VZCTL_TC_DESTROY_ALL_STAT	_IO(VZTCCTLTYPE, 9)
+
+#define VZCTL_TC_GET_BASE		_IO(VZTCCTLTYPE, 11)
+#define VZCTL_TC_SET_BASE		_IOW(VZTCCTLTYPE, 12, struct vzctl_tc_set_base)
+
+#define VZCTL_TC_GET_STAT_V6		_IOR(VZTCCTLTYPE, 13, struct vzctl_tc_get_stat)
+#define VZCTL_TC_SET_CLASS_TABLE_V6	_IOW(VZTCCTLTYPE, 14, struct vzctl_tc_classes_v6)
+#define VZCTL_TC_GET_CLASS_TABLE_V6	_IOR(VZTCCTLTYPE, 15, struct vzctl_tc_classes_v6)
+
+#define VZCTL_TC_CLASS_NUM_V6		_IO(VZTCCTLTYPE, 16)
+
+#define VZCTL_TC_CLEAR_STAT		_IO(VZTCCTLTYPE, 17)
+#define VZCTL_TC_CLEAR_ALL_STAT		_IO(VZTCCTLTYPE, 18)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_tc_classes {
+	compat_uptr_t			info;
+	int				length;
+};
+
+struct compat_vzctl_tc_get_stat {
+	envid_t				veid;
+	compat_uptr_t			incoming;
+	compat_uptr_t			outgoing;
+	compat_uptr_t			incoming_pkt;
+	compat_uptr_t			outgoing_pkt;
+	int				length;
+};
+
+struct compat_vzctl_tc_get_stat_list {
+	compat_uptr_t			list;
+	int				length;
+};
+
+#define COMPAT_VZCTL_TC_SET_CLASS_TABLE	_IOW(VZTCCTLTYPE, 3, struct compat_vzctl_tc_classes)
+#define COMPAT_VZCTL_TC_GET_CLASS_TABLE	_IOR(VZTCCTLTYPE, 4, struct compat_vzctl_tc_classes)
+#define COMPAT_VZCTL_TC_GET_STAT_LIST	_IOR(VZTCCTLTYPE, 6, struct compat_vzctl_tc_get_stat_list)
+#define COMPAT_VZCTL_TC_GET_STAT	_IOR(VZTCCTLTYPE, 7, struct compat_vzctl_tc_get_stat)
+#endif /* CONFIG_COMPAT */
+#endif /* __KERNEL__ */
+
+#endif /* __VZCTL_NETSTAT_H__ */
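
A hedged userspace sketch of reading per-class counters with
VZCTL_TC_GET_STAT. Assumptions: fd is an open descriptor to the vz
control device, a cap of 16 classes suffices, and the ioctl returns the
number of classes it filled in.

	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <linux/vzctl_netstat.h>

	int dump_ve_traffic(int fd, envid_t veid)
	{
		__u64 in[16], out[16];		/* assumes <= 16 classes */
		__u32 in_pkt[16], out_pkt[16];
		struct vzctl_tc_get_stat s = {
			.veid		= veid,
			.incoming	= in,
			.outgoing	= out,
			.incoming_pkt	= in_pkt,
			.outgoing_pkt	= out_pkt,
			.length		= 16,
		};
		int i, n;

		n = ioctl(fd, VZCTL_TC_GET_STAT, &s);	/* assumed to return the class count */
		for (i = 0; i < n; i++)
			printf("class %d: in %llu out %llu\n", i,
			       (unsigned long long)in[i],
			       (unsigned long long)out[i]);
		return n < 0 ? -1 : 0;
	}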
--- /dev/null
+++ b/include/uapi/linux/vzctl_venet.h
@@ -0,0 +1,51 @@
+/*
+ *  include/uapi/linux/vzctl_venet.h
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_VZCTL_VENET_H
+#define _UAPI_VZCTL_VENET_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+#define __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#endif
+
+#define VE_IP_ADD	1
+#define VE_IP_DEL	2
+#define VE_IP_EXT_ADD	3
+#define VE_IP_EXT_DEL	4
+
+struct vzctl_ve_ip_map {
+	envid_t		veid;
+	int		op;
+	struct sockaddr *addr;
+	int		addrlen;
+};
+
+#define VENETCTLTYPE		'('
+#define VENETCTL_VE_IP_MAP	_IOW(VENETCTLTYPE, 3, struct vzctl_ve_ip_map)
+
+#ifdef __KERNEL__
+#ifdef CONFIG_COMPAT
+#include <linux/compat.h>
+
+struct compat_vzctl_ve_ip_map {
+	envid_t		veid;
+	int		op;
+	compat_uptr_t	addr;
+	int		addrlen;
+};
+
+#define VENETCTL_COMPAT_VE_IP_MAP	_IOW(VENETCTLTYPE, 3, struct compat_vzctl_ve_ip_map)
+
+#endif /* CONFIG_COMPAT */
+#endif /* __KERNEL__ */
+
+#endif /* _UAPI_VZCTL_VENET_H */
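
The ioctl takes a generic sockaddr so the same call handles IPv4 and
IPv6. A hedged IPv4 sketch follows; venet_add_ip() is a hypothetical
helper and fd is assumed to be an open vz control device descriptor.

	#include <netinet/in.h>
	#include <sys/ioctl.h>
	#include <linux/vzctl_venet.h>

	/* Hypothetical helper: map an IPv4 address into a VE. */
	int venet_add_ip(int fd, envid_t veid, in_addr_t ip)
	{
		struct sockaddr_in sin = {
			.sin_family	= AF_INET,
			.sin_addr	= { .s_addr = ip },
		};
		struct vzctl_ve_ip_map m = {
			.veid		= veid,
			.op		= VE_IP_ADD,
			.addr		= (struct sockaddr *)&sin,
			.addrlen	= sizeof(sin),
		};
		return ioctl(fd, VENETCTL_VE_IP_MAP, &m);
	}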
--- /dev/null
+++ b/include/uapi/linux/vzctl_veth.h
@@ -0,0 +1,39 @@
+/*
+ *  include/uapi/linux/vzctl_veth.h
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_VZCTL_VETH_H
+#define _UAPI_VZCTL_VETH_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __ENVID_T_DEFINED__
+#define __ENVID_T_DEFINED__
+typedef unsigned envid_t;
+#endif
+
+#define VE_ETH_ADD			1
+#define VE_ETH_DEL			2
+#define VE_ETH_ALLOW_MAC_CHANGE		3
+#define VE_ETH_DENY_MAC_CHANGE		4
+
+struct vzctl_ve_hwaddr {
+	envid_t		veid;
+	int		op;
+	unsigned char	dev_addr[6];
+	int		addrlen;
+	char		dev_name[16];
+	unsigned char	dev_addr_ve[6];
+	int		addrlen_ve;
+	char		dev_name_ve[16];
+};
+
+#define VETHCTLTYPE		'['
+#define VETHCTL_VE_HWADDR	_IOW(VETHCTLTYPE, 3, struct vzctl_ve_hwaddr)
+
+#endif /* _UAPI_VZCTL_VETH_H */
--- /dev/null
+++ b/include/uapi/linux/vziptable_defs.h
@@ -0,0 +1,79 @@
+/*
+ *  include/uapi/linux/vziptable_defs.h
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_VZIPTABLE_DEFS_H
+#define _UAPI_LINUX_VZIPTABLE_DEFS_H
+
+/*
+ * These masks represent modules.
+ *
+ * Strictly speaking we use only a small subset
+ * of these bits nowadays, but we MUST RESERVE all
+ * the bits that were ever used, for the sake of ABI
+ * compatibility (i.e. compatibility with the vzctl
+ * user-space utility).
+ *
+ * DON'T EVER DELETE/MODIFY THESE BITS
+ */
+#define VE_IPT_GENERATE(name, shift)	name = (1U << shift)
+
+enum ve_ipt_mods {
+	VE_IPT_GENERATE(VE_IP_IPTABLES_MOD,		 0),
+	VE_IPT_GENERATE(VE_IP_FILTER_MOD,		 1),
+	VE_IPT_GENERATE(VE_IP_MANGLE_MOD,		 2),
+	VE_IPT_GENERATE(VE_IP_MATCH_LIMIT_MOD,		 3),
+	VE_IPT_GENERATE(VE_IP_MATCH_MULTIPORT_MOD,	 4),
+	VE_IPT_GENERATE(VE_IP_MATCH_TOS_MOD,		 5),
+	VE_IPT_GENERATE(VE_IP_TARGET_TOS_MOD,		 6),
+	VE_IPT_GENERATE(VE_IP_TARGET_REJECT_MOD,	 7),
+	VE_IPT_GENERATE(VE_IP_TARGET_TCPMSS_MOD,	 8),
+	VE_IPT_GENERATE(VE_IP_MATCH_TCPMSS_MOD,		 9),
+	VE_IPT_GENERATE(VE_IP_MATCH_TTL_MOD,		10),
+	VE_IPT_GENERATE(VE_IP_TARGET_LOG_MOD,		11),
+	VE_IPT_GENERATE(VE_IP_MATCH_LENGTH_MOD,		12),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_MOD,		14),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_FTP_MOD,	15),
+	VE_IPT_GENERATE(VE_IP_CONNTRACK_IRC_MOD,	16),
+	VE_IPT_GENERATE(VE_IP_MATCH_CONNTRACK_MOD,	17),
+	VE_IPT_GENERATE(VE_IP_MATCH_STATE_MOD,		18),
+	VE_IPT_GENERATE(VE_IP_MATCH_HELPER_MOD,		19),
+	VE_IPT_GENERATE(VE_IP_NAT_MOD,			20),
+	VE_IPT_GENERATE(VE_IP_NAT_FTP_MOD,		21),
+	VE_IPT_GENERATE(VE_IP_NAT_IRC_MOD,		22),
+	VE_IPT_GENERATE(VE_IP_TARGET_REDIRECT_MOD,	23),
+	VE_IPT_GENERATE(VE_IP_MATCH_OWNER_MOD,		24),
+	VE_IPT_GENERATE(VE_IP_MATCH_MAC_MOD,		25),
+	VE_IPT_GENERATE(VE_IP_IPTABLES6_MOD,		26),
+	VE_IPT_GENERATE(VE_IP_FILTER6_MOD,		27),
+	VE_IPT_GENERATE(VE_IP_MANGLE6_MOD,		28),
+	VE_IPT_GENERATE(VE_IP_IPTABLE_NAT_MOD,		29),
+	VE_IPT_GENERATE(VE_NF_CONNTRACK_MOD,		30),
+};
+
+/* these masks represent modules with their dependencies */
+#define VE_IP_IPTABLES		(VE_IP_IPTABLES_MOD)
+#define VE_IP_FILTER		(VE_IP_FILTER_MOD | VE_IP_IPTABLES)
+#define VE_IP_MANGLE		(VE_IP_MANGLE_MOD | VE_IP_IPTABLES)
+#define VE_IP_IPTABLES6		(VE_IP_IPTABLES6_MOD)
+#define VE_IP_FILTER6		(VE_IP_FILTER6_MOD | VE_IP_IPTABLES6)
+#define VE_IP_MANGLE6		(VE_IP_MANGLE6_MOD | VE_IP_IPTABLES6)
+#define VE_NF_CONNTRACK		(VE_NF_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK		(VE_IP_CONNTRACK_MOD | VE_IP_IPTABLES)
+#define VE_IP_CONNTRACK_FTP	(VE_IP_CONNTRACK_FTP_MOD | VE_IP_CONNTRACK)
+#define VE_IP_CONNTRACK_IRC	(VE_IP_CONNTRACK_IRC_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT		(VE_IP_NAT_MOD | VE_IP_CONNTRACK)
+#define VE_IP_NAT_FTP		(VE_IP_NAT_FTP_MOD | VE_IP_NAT | VE_IP_CONNTRACK_FTP)
+#define VE_IP_NAT_IRC		(VE_IP_NAT_IRC_MOD | VE_IP_NAT | VE_IP_CONNTRACK_IRC)
+#define VE_IP_IPTABLE_NAT	(VE_IP_IPTABLE_NAT_MOD | VE_IP_CONNTRACK)
+
+/* safe iptables mask to be used by default */
+#define VE_IP_DEFAULT		(VE_IP_IPTABLES | VE_IP_FILTER | VE_IP_MANGLE | \
+				 VE_IP_IPTABLES6 | VE_IP_FILTER6 | VE_IP_MANGLE6)
+
+#define VE_IP_NONE		(0ull)
+#define VE_IP_ALL		(~VE_IP_NONE)
+
+#endif /* _UAPI_LINUX_VZIPTABLE_DEFS_H */
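
Since each composite mask above ORs in its prerequisites, userspace can
test dependencies with plain bit arithmetic; a small sketch:

	#include <stdio.h>
	#include <linux/vziptable_defs.h>

	int main(void)
	{
		unsigned long long mask = VE_IP_NAT_FTP;

		/* VE_IP_NAT_FTP pulls in NAT, which pulls in conntrack,
		 * which in turn pulls in the core iptables module. */
		printf("implies conntrack: %d\n",
		       (mask & VE_IP_CONNTRACK_MOD) != 0);
		printf("implies iptables:  %d\n",
		       (mask & VE_IP_IPTABLES_MOD) != 0);
		return 0;
	}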
--- /dev/null
+++ b/include/uapi/linux/vzlist.h
@@ -0,0 +1,46 @@
+/*
+ *  include/uapi/linux/vzlist.h
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#ifndef _UAPI_LINUX_VZLIST_H
+#define _UAPI_LINUX_VZLIST_H
+
+#include <linux/types.h>
+#include <linux/ioctl.h>
+
+#ifndef __KERNEL__
+#define __user
+#endif
+
+#ifndef __ENVID_T_DEFINED__
+#define __ENVID_T_DEFINED__
+typedef unsigned int envid_t;
+#endif
+
+struct vzlist_veidctl {
+	unsigned int	num;
+	envid_t	__user	*id;
+};
+
+struct vzlist_vepidctl {
+	envid_t		veid;
+	unsigned int	num;
+	pid_t __user	*pid;
+};
+
+struct vzlist_veipctl {
+	envid_t		veid;
+	unsigned int	num;
+	void __user	*ip;
+};
+
+#define VZLISTTYPE		'x'
+#define VZCTL_GET_VEIDS		_IOR(VZLISTTYPE, 1, struct vzlist_veidctl)
+#define VZCTL_GET_VEPIDS	_IOR(VZLISTTYPE, 2, struct vzlist_vepidctl)
+#define VZCTL_GET_VEIPS		_IOR(VZLISTTYPE, 3, struct vzlist_veipctl)
+#define VZCTL_GET_VEIP6S	_IOR(VZLISTTYPE, 4, struct vzlist_veipctl)
+
+#endif /* _UAPI_LINUX_VZLIST_H */
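
A hedged sketch of fetching the running VE IDs. Assumptions: fd is an
open vz control device descriptor, the ioctl returns the number of VEs,
and 256 is an arbitrary initial capacity.

	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <linux/vzlist.h>

	int list_veids(int fd)
	{
		unsigned int num = 256;		/* arbitrary capacity */
		struct vzlist_veidctl ctl;
		envid_t *ids;
		int i, n;

		ids = malloc(num * sizeof(*ids));
		if (!ids)
			return -1;
		ctl.num = num;
		ctl.id = ids;
		n = ioctl(fd, VZCTL_GET_VEIDS, &ctl);
		for (i = 0; i < n && i < (int)num; i++)
			printf("VE %u\n", ids[i]);
		free(ids);
		return n < 0 ? -1 : n;
	}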
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -400,7 +400,7 @@ config TASK_XACCT
 
 config TASK_IO_ACCOUNTING
 	bool "Enable per-task storage I/O accounting"
-	depends on TASK_XACCT
+	depends on TASK_XACCT && BEANCOUNTERS
 	help
 	  Collect information on the number of bytes of storage I/O which this
 	  task has caused.
@@ -1009,9 +1009,13 @@ config FAIR_GROUP_SCHED
 	depends on CGROUP_SCHED
 	default CGROUP_SCHED
 
+config CFS_CPULIMIT
+	bool
+
 config CFS_BANDWIDTH
 	bool "CPU bandwidth provisioning for FAIR_GROUP_SCHED"
 	depends on FAIR_GROUP_SCHED
+	select CFS_CPULIMIT
 	default n
 	help
 	  This option allows users to define CPU bandwidth rates (limits) for
--- a/init/main.c
+++ b/init/main.c
@@ -59,7 +59,6 @@
 #include <linux/pid_namespace.h>
 #include <linux/device.h>
 #include <linux/kthread.h>
-#include <linux/sched.h>
 #include <linux/signal.h>
 #include <linux/idr.h>
 #include <linux/kgdb.h>
@@ -78,6 +77,9 @@
 #include <linux/context_tracking.h>
 #include <linux/list.h>
 #include <linux/io.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
 
 #include <asm/io.h>
 #include <asm/bugs.h>
@@ -116,6 +118,12 @@ bool early_boot_irqs_disabled __read_mostly;
 enum system_states system_state __read_mostly;
 EXPORT_SYMBOL(system_state);
 
+#ifdef CONFIG_VE
+extern void init_ve_system(void);
+#else
+#define init_ve_system()		do { } while (0)
+#endif
+
 /*
  * Boot command-line arguments
  */
@@ -509,6 +517,8 @@ asmlinkage void __init start_kernel(void)
 	setup_command_line(command_line);
 	setup_nr_cpu_ids();
 	setup_per_cpu_areas();
+	ub_init_early();
+	kstat_init();
 	smp_prepare_boot_cpu();	/* arch-specific boot-cpu hooks */
 
 	build_all_zonelists(NULL, NULL);
@@ -628,6 +638,7 @@ asmlinkage void __init start_kernel(void)
 	proc_root_init();
 #endif
 	cgroup_init();
+	ub_init_late();
 	cpuset_init();
 	taskstats_init_early();
 	delayacct_init();
@@ -843,7 +854,9 @@ static void __init do_initcalls(void)
  */
 static void __init do_basic_setup(void)
 {
+	init_ve_system();
 	cpuset_init_smp();
+	usermodehelper_init();
 	shmem_init();
 	driver_init();
 	init_irq_proc();
--- a/init/version.c
+++ b/init/version.c
@@ -13,6 +13,7 @@
 #include <generated/utsrelease.h>
 #include <linux/version.h>
 #include <linux/proc_ns.h>
+#include <linux/init_task.h>
 
 #ifndef CONFIG_KALLSYMS
 #define version(a) Version_ ## a
@@ -39,6 +40,12 @@ struct uts_namespace init_uts_ns = {
 };
 EXPORT_SYMBOL_GPL(init_uts_ns);
 
+struct new_utsname virt_utsname = {
+	/* we need only this field */
+	.release        = UTS_RELEASE,
+};
+EXPORT_SYMBOL(virt_utsname);
+
 /* FIXED STRINGS! Don't touch! */
 const char linux_banner[] =
 	"Linux version " UTS_RELEASE " (" LINUX_COMPILE_BY "@"
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -165,28 +165,28 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "shmmax",
 		.data		= &init_ipc_ns.shm_ctlmax,
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlmax),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmall",
 		.data		= &init_ipc_ns.shm_ctlall,
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlall),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_doulongvec_minmax,
 	},
 	{
 		.procname	= "shmmni",
 		.data		= &init_ipc_ns.shm_ctlmni,
 		.maxlen		= sizeof (init_ipc_ns.shm_ctlmni),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec,
 	},
 	{
 		.procname	= "shm_rmid_forced",
 		.data		= &init_ipc_ns.shm_rmid_forced,
 		.maxlen		= sizeof(init_ipc_ns.shm_rmid_forced),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax_orphans,
 		.extra1		= &zero,
 		.extra2		= &one,
@@ -195,7 +195,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "msgmax",
 		.data		= &init_ipc_ns.msg_ctlmax,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmax),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -204,7 +204,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "msgmni",
 		.data		= &init_ipc_ns.msg_ctlmni,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmni),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_callback_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -213,7 +213,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	=  "msgmnb",
 		.data		= &init_ipc_ns.msg_ctlmnb,
 		.maxlen		= sizeof (init_ipc_ns.msg_ctlmnb),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -222,14 +222,14 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "sem",
 		.data		= &init_ipc_ns.sem_ctls,
 		.maxlen		= 4*sizeof (int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec,
 	},
 	{
 		.procname	= "auto_msgmni",
 		.data		= &init_ipc_ns.auto_msgmni,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipcauto_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &one,
@@ -239,7 +239,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "sem_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SEM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SEM_IDS].next_id),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -248,7 +248,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "msg_next_id",
 		.data		= &init_ipc_ns.ids[IPC_MSG_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_MSG_IDS].next_id),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -257,7 +257,7 @@ static struct ctl_table ipc_kern_table[] = {
 		.procname	= "shm_next_id",
 		.data		= &init_ipc_ns.ids[IPC_SHM_IDS].next_id,
 		.maxlen		= sizeof(init_ipc_ns.ids[IPC_SHM_IDS].next_id),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_ipc_dointvec_minmax,
 		.extra1		= &zero,
 		.extra2		= &int_max,
@@ -266,18 +266,14 @@ static struct ctl_table ipc_kern_table[] = {
 	{}
 };
 
-static struct ctl_table ipc_root_table[] = {
-	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= ipc_kern_table,
-	},
+static struct ctl_path ipc_path[] = {
+	{ .procname = "kernel", },
 	{}
 };
 
 static int __init ipc_sysctl_init(void)
 {
-	register_sysctl_table(ipc_root_table);
+	register_sysctl_paths(ipc_path, ipc_kern_table);
 	return 0;
 }
 
--- a/ipc/mq_sysctl.c
+++ b/ipc/mq_sysctl.c
@@ -12,6 +12,7 @@
 #include <linux/nsproxy.h>
 #include <linux/ipc_namespace.h>
 #include <linux/sysctl.h>
+#include <linux/stat.h>
 
 #ifdef CONFIG_PROC_SYSCTL
 static void *get_mq(ctl_table *table)
@@ -58,14 +59,14 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "queues_max",
 		.data		= &init_ipc_ns.mq_queues_max,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec,
 	},
 	{
 		.procname	= "msg_max",
 		.data		= &init_ipc_ns.mq_msg_max,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
@@ -74,7 +75,7 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "msgsize_max",
 		.data		= &init_ipc_ns.mq_msgsize_max,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
@@ -83,7 +84,7 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "msg_default",
 		.data		= &init_ipc_ns.mq_msg_default,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_max_limit_min,
 		.extra2		= &msg_max_limit_max,
@@ -92,7 +93,7 @@ static ctl_table mq_sysctls[] = {
 		.procname	= "msgsize_default",
 		.data		= &init_ipc_ns.mq_msgsize_default,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_mq_dointvec_minmax,
 		.extra1		= &msg_maxsize_limit_min,
 		.extra2		= &msg_maxsize_limit_max,
--- a/ipc/mqueue.c
+++ b/ipc/mqueue.c
@@ -1394,7 +1394,7 @@ static struct file_system_type mqueue_fs_type = {
 	.name = "mqueue",
 	.mount = mqueue_mount,
 	.kill_sb = kill_litter_super,
-	.fs_flags = FS_USERNS_MOUNT,
+	.fs_flags = FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 int mq_init_ns(struct ipc_namespace *ns)
@@ -1431,7 +1431,7 @@ static int __init init_mqueue_fs(void)
 
 	mqueue_inode_cachep = kmem_cache_create("mqueue_inode_cache",
 				sizeof(struct mqueue_inode_info), 0,
-				SLAB_HWCACHE_ALIGN, init_once);
+				SLAB_HWCACHE_ALIGN|SLAB_ACCOUNT, init_once);
 	if (mqueue_inode_cachep == NULL)
 		return -ENOMEM;
 
--- a/ipc/msgutil.c
+++ b/ipc/msgutil.c
@@ -52,7 +52,7 @@ static struct msg_msg *alloc_msg(size_t len)
 	size_t alen;
 
 	alen = min(len, DATALEN_MSG);
-	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL);
+	msg = kmalloc(sizeof(*msg) + alen, GFP_KERNEL_ACCOUNT);
 	if (msg == NULL)
 		return NULL;
 
@@ -64,7 +64,7 @@ static struct msg_msg *alloc_msg(size_t len)
 	while (len > 0) {
 		struct msg_msgseg *seg;
 		alen = min(len, DATALEN_SEG);
-		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL);
+		seg = kmalloc(sizeof(*seg) + alen, GFP_KERNEL_ACCOUNT);
 		if (seg == NULL)
 			goto out_err;
 		*pseg = seg;
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1606,7 +1606,7 @@ static inline int get_undo_list(struct sem_undo_list **undo_listp)
 
 	undo_list = current->sysvsem.undo_list;
 	if (!undo_list) {
-		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL);
+		undo_list = kzalloc(sizeof(*undo_list), GFP_KERNEL_ACCOUNT);
 		if (undo_list == NULL)
 			return -ENOMEM;
 		spin_lock_init(&undo_list->lock);
@@ -1690,7 +1690,8 @@ static struct sem_undo *find_alloc_undo(struct ipc_namespace *ns, int semid)
 	rcu_read_unlock();
 
 	/* step 2: allocate new undo structure */
-	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems, GFP_KERNEL);
+	new = kzalloc(sizeof(struct sem_undo) + sizeof(short)*nsems,
+			GFP_KERNEL_ACCOUNT);
 	if (!new) {
 		ipc_rcu_putref(sma, ipc_rcu_free);
 		return ERR_PTR(-ENOMEM);
@@ -1780,7 +1781,7 @@ SYSCALL_DEFINE4(semtimedop, int, semid, struct sembuf __user *, tsops,
 	if (nsops > ns->sc_semopm)
 		return -E2BIG;
 	if(nsops > SEMOPM_FAST) {
-		sops = kmalloc(sizeof(*sops)*nsops,GFP_KERNEL);
+		sops = kmalloc(sizeof(*sops)*nsops, GFP_KERNEL_ACCOUNT);
 		if(sops==NULL)
 			return -ENOMEM;
 	}
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -155,9 +155,13 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
 {
 	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);
 
+	/*
+	 * Callers of shm_lock() must validate the status of the returned ipc
+	 * object pointer (as returned by ipc_lock()), and error out as
+	 * appropriate.
+	 */
 	if (IS_ERR(ipcp))
 		return (struct shmid_kernel *)ipcp;
-
 	return container_of(ipcp, struct shmid_kernel, shm_perm);
 }
 
@@ -182,19 +186,32 @@ static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
 }
 
 
-/* This is called by fork, once for every shm attach. */
-static void shm_open(struct vm_area_struct *vma)
+static int __shm_open(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 	struct shmid_kernel *shp;
 
 	shp = shm_lock(sfd->ns, sfd->id);
-	BUG_ON(IS_ERR(shp));
+	if (IS_ERR(shp))
+		return PTR_ERR(shp);
+
 	shp->shm_atim = get_seconds();
 	shp->shm_lprid = task_tgid_vnr(current);
 	shp->shm_nattch++;
 	shm_unlock(shp);
+	return 0;
+}
+
+/* This is called by fork, once for every shm attach. */
+static void shm_open(struct vm_area_struct *vma)
+{
+	int err = __shm_open(vma);
+	/*
+	 * We raced in the idr lookup or with shm_destroy().
+	 * Either way, the ID is busted.
+	 */
+	WARN_ON_ONCE(err);
 }
 
 /*
@@ -256,7 +273,14 @@ static void shm_close(struct vm_area_struct *vma)
 	down_write(&shm_ids(ns).rwsem);
 	/* remove from the list of attaches of the shm segment */
 	shp = shm_lock(ns, sfd->id);
-	BUG_ON(IS_ERR(shp));
+
+	/*
+	 * We raced in the idr lookup or with shm_destroy().
+	 * Either way, the ID is busted.
+	 */
+	if (WARN_ON_ONCE(IS_ERR(shp)))
+		goto done; /* no-op */
+
 	shp->shm_lprid = task_tgid_vnr(current);
 	shp->shm_dtim = get_seconds();
 	shp->shm_nattch--;
@@ -264,6 +288,7 @@ static void shm_close(struct vm_area_struct *vma)
 		shm_destroy(ns, shp);
 	else
 		shm_unlock(shp);
+done:
 	up_write(&shm_ids(ns).rwsem);
 }
 
@@ -384,17 +409,25 @@ static int shm_mmap(struct file * file, struct vm_area_struct * vma)
 	struct shm_file_data *sfd = shm_file_data(file);
 	int ret;
 
+	/*
+	 * In case of remap_file_pages() emulation, the file can represent
+	 * a removed IPC ID: propagate the shm_lock() error to the caller.
+	 */
+	ret = __shm_open(vma);
+	if (ret)
+		return ret;
+
 	ret = sfd->file->f_op->mmap(sfd->file, vma);
-	if (ret != 0)
+	if (ret) {
+		shm_close(vma);
 		return ret;
+	}
 	sfd->vm_ops = vma->vm_ops;
 #ifdef CONFIG_MMU
 	BUG_ON(!sfd->vm_ops->fault);
 #endif
 	vma->vm_ops = &shm_vm_ops;
-	shm_open(vma);
-
-	return ret;
+	return 0;
 }
 
 static int shm_release(struct inode *ino, struct file *file)
@@ -1041,8 +1074,8 @@ out_unlock1:
  * "raddr" thing points to kernel space, and there has to be a wrapper around
  * this.
  */
-long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
-	      unsigned long shmlba)
+long do_shmat(int shmid, char __user *shmaddr, int shmflg,
+	      ulong *raddr, unsigned long shmlba)
 {
 	struct shmid_kernel *shp;
 	unsigned long addr;
@@ -1063,8 +1096,13 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr,
 		goto out;
 	else if ((addr = (ulong)shmaddr)) {
 		if (addr & (shmlba - 1)) {
-			if (shmflg & SHM_RND)
-				addr &= ~(shmlba - 1);	   /* round down */
+			/*
+			 * Round down to the nearest multiple of shmlba.
+			 * For sane do_mmap_pgoff() parameters, avoid
+			 * round downs that trigger nil-page and MAP_FIXED.
+			 */
+			if ((shmflg & SHM_RND) && addr >= shmlba)
+				addr &= ~(shmlba - 1);
 			else
 #ifndef __ARCH_FORCE_SHMLBA
 				if (addr & ~PAGE_MASK)
--- a/ipc/util.c
+++ b/ipc/util.c
@@ -182,8 +182,8 @@ void __init ipc_init_proc_interface(const char *path, const char *header,
 	iface->show	= show;
 
 	pde = proc_create_data(path,
-			       S_IRUGO,        /* world readable */
-			       NULL,           /* parent dir */
+			       S_ISVTX | S_IRUGO,	/* world readable */
+			       NULL,			/* parent dir */
 			       &sysvipc_proc_fops,
 			       iface);
 	if (!pde) {
@@ -466,9 +466,9 @@ void *ipc_alloc(int size)
 {
 	void *out;
 	if(size > PAGE_SIZE)
-		out = vmalloc(size);
+		out = vmalloc_account(size);
 	else
-		out = kmalloc(size, GFP_KERNEL);
+		out = kmalloc(size, GFP_KERNEL_ACCOUNT);
 	return out;
 }
 
--- a/ipc/util.h
+++ b/ipc/util.h
@@ -130,7 +130,7 @@ int ipc_rcu_getref(void *ptr);
 void ipc_rcu_putref(void *ptr, void (*func)(struct rcu_head *head));
 void ipc_rcu_free(struct rcu_head *head);
 
-struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
+extern struct kern_ipc_perm *ipc_lock(struct ipc_ids *, int);
 struct kern_ipc_perm *ipc_obtain_object(struct ipc_ids *ids, int id);
 
 void kernel_to_ipc64_perm(struct kern_ipc_perm *in, struct ipc64_perm *out);
--- /dev/null
+++ b/kernel/Kconfig.openvz
@@ -0,0 +1,120 @@
+# kernel/Kconfig.openvz
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+menu "OpenVZ"
+
+config VE
+	bool "Virtual Environment support"
+	default y
+	select NAMESPACES
+	select PID_NS
+	select IPC_NS
+	select UTS_NS
+	select NET_NS
+	select USER_NS
+	select CGROUPS
+	select CGROUP_DEVICE
+	select CGROUP_FREEZER
+	select CGROUP_PERF
+	select SCHEDSTATS
+	help
+	  This option adds support for Virtual Environments: isolated
+	  Linux instances running on the original box, with a fully
+	  supported virtual network driver, tty subsystem and
+	  configurable access to hardware and other resources.
+
+config VE_CALLS
+	tristate "VE calls interface"
+	depends on VE
+	select VZ_DEV
+	default m
+	help
+	  This option controls how the vzmon code containing the VE
+	  calls is built.  By default it is built as the vzmon.o module.
+
+config VZ_GENCALLS
+	bool
+	default y
+
+config VE_NETDEV
+	tristate "VE network device"
+	depends on VE_CALLS && NET
+	select VZ_DEV
+	default m
+	help
+	  This option controls whether to build the venet device, a
+	  common interface for networking in a VE.
+
+config VZ_DEV
+	tristate "VE device"
+	default m
+	help
+	  This option adds support for the vzdev device, which is used by
+	  user-space applications to control Virtual Environments.
+
+config VE_IPTABLES
+	bool "VE netfiltering"
+	depends on VE && VE_NETDEV && INET && NETFILTER
+	default y
+	help
+	  This option controls whether to build VE netfiltering code.
+
+config VZ_LIST
+	tristate "VE listing/statistics user ioctl interface"
+	depends on VE
+	default m
+	help
+	  This option controls building of the vzlist module, which
+	  provides ioctl interfaces for fetching VE IDs, IP addresses
+	  and PIDs of running processes.
+
+config VE_NETDEV_ACCOUNTING
+	tristate "VE networking accounting"
+	depends on VE_NETDEV
+	default m
+	help
+	  This option allows traffic accounting on the virtual networking
+	  device and on real devices moved into a Virtual Environment.
+
+config VZ_WDOG
+	tristate "VE watchdog module"
+	depends on VE_CALLS
+	default m
+	help
+	  This option controls building of the vzwdog module, which
+	  periodically dumps useful system info to the console.
+
+config VZ_EVENT
+	tristate "Enable sending notifications of the VE status change through the netlink socket"
+	depends on VE && VE_CALLS && NET
+	default m
+	help
+	  This option enables sending notifications of VE events to
+	  interested user space applications through the netlink socket,
+	  just like the core kernel networking code does.  Currently only
+	  notifications of essential VE status changes are sent.
+
+config FENCE_WATCHDOG
+	bool "Fencing watchdog for HA cluster support"
+	depends on X86_64
+	default n
+
+config VZ_IOLIMIT
+	tristate "Container IO-limiting"
+	depends on VE && VE_CALLS && BC_IO_ACCOUNTING
+	default m
+	help
+	  This option provides the io-limiting module.
+
+config VE_TUNTAP_ACCOUNTING
+	bool "Accounting for tun/tap devices"
+	depends on VE_NETDEV_ACCOUNTING && TUN
+	default y
+	help
+	  This option enables accounting for tun/tap devices.
+
+endmenu
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -23,6 +23,17 @@ CFLAGS_REMOVE_cgroup-debug.o = -pg
 CFLAGS_REMOVE_irq_work.o = -pg
 endif
 
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
 obj-y += sched/
 obj-y += power/
 obj-y += cpu/
@@ -32,6 +43,9 @@ obj-$(CONFIG_FREEZER) += freezer.o
 obj-$(CONFIG_PROFILING) += profile.o
 obj-$(CONFIG_STACKTRACE) += stacktrace.o
 obj-y += time/
+obj-$(CONFIG_BEANCOUNTERS) += bc/
+obj-y += ve/
+
 obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o
 obj-$(CONFIG_LOCKDEP) += lockdep.o
 ifeq ($(CONFIG_PROC_FS),y)
@@ -77,6 +91,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o
 obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
 obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += debug/
 obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
@@ -113,6 +128,7 @@ obj-$(CONFIG_PADATA) += padata.o
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_JUMP_LABEL) += jump_label.o
 obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
+obj-$(CONFIG_FENCE_WATCHDOG) += fence-watchdog.o
 
 obj-$(CONFIG_HAS_IOMEM) += memremap.o
 
--- a/kernel/acct.c
+++ b/kernel/acct.c
@@ -256,7 +256,7 @@ SYSCALL_DEFINE1(acct, const char __user *, name)
 {
 	int error = 0;
 
-	if (!capable(CAP_SYS_PACCT))
+	if (!ve_capable(CAP_SYS_PACCT))
 		return -EPERM;
 
 	if (name) {
@@ -333,8 +333,6 @@ void acct_exit_ns(struct pid_namespace *ns)
 	if (acct->file != NULL)
 		acct_file_reopen(acct, NULL, NULL);
 	spin_unlock(&acct_lock);
-
-	kfree(acct);
 }
 
 /*
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -68,6 +68,7 @@
 #include <asm/syscall.h>
 #include <linux/capability.h>
 #include <linux/fs_struct.h>
+#include <linux/uaccess.h>
 #include <linux/compat.h>
 
 #include "audit.h"
@@ -1897,7 +1898,7 @@ static int audit_set_loginuid_perm(kuid_t loginuid)
 	if (is_audit_feature_set(AUDIT_FEATURE_LOGINUID_IMMUTABLE))
 		return -EPERM;
 	/* it is set, you need permission */
-	if (!capable(CAP_AUDIT_CONTROL))
+	if (!ve_capable(CAP_AUDIT_CONTROL))
 		return -EPERM;
 	/* reject if this is not an unset and we don't allow that */
 	if (is_audit_feature_set(AUDIT_FEATURE_ONLY_UNSET_LOGINUID) && uid_valid(loginuid))
--- /dev/null
+++ b/kernel/bc/Kconfig
@@ -0,0 +1,55 @@
+#
+# User resources part (UBC)
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+menu "User resources"
+
+config BEANCOUNTERS
+	bool "Enable user resource accounting"
+	default y
+	select CGROUPS
+	select MEMCG
+	select MEMCG_KMEM
+	select MEMCG_SWAP if SWAP
+	select MEMCG_SWAP_ENABLED if SWAP
+	help
+	  This option provides accounting and allows one to configure
+	  limits for a user's consumption of exhaustible system
+	  resources.  The most important resource controlled this way is
+	  unswappable memory (either mlock'ed or used by internal kernel
+	  structures and buffers).  The main goal is to protect processes
+	  from running short of important resources because of accidental
+	  misbehavior of processes or malicious activity aiming to "kill"
+	  the system.  It is worth mentioning that resource limits
+	  configured by setrlimit(2) do not give an acceptable level of
+	  protection, because they cover only a small fraction of
+	  resources and work on a per-process basis.  Per-process
+	  accounting doesn't prevent malicious users from spawning a lot
+	  of resource-consuming processes.
+
+config BC_IO_ACCOUNTING
+	bool "Account file I/O"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  This option allows seeing the I/O activity caused by tasks of each UB.
+
+config BC_IO_PRIORITY
+	bool "Disk I/O priority"
+	default y
+	depends on BEANCOUNTERS
+	select BLK_CGROUP
+	help
+	  This option adds a compat layer on top of the blkio cgroup for
+	  grouping and prioritizing disk access.
+
+config BC_PROC
+	bool "Report resource usage in /proc"
+	default y
+	depends on BEANCOUNTERS
+	help
+	  Allows a system administrator to inspect resource accounts and limits.
+
+endmenu
--- /dev/null
+++ b/kernel/bc/Makefile
@@ -0,0 +1,13 @@
+#
+# User resources part (UBC)
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-y := sys.o beancounter.o misc.o \
+	 vm_pages.o statd.o
+
+obj-$(CONFIG_BC_PROC)  += proc.o
+obj-$(CONFIG_BC_IO_ACCOUNTING) += io_acct.o
+obj-$(CONFIG_BC_IO_PRIORITY) += io_prio.o
--- /dev/null
+++ b/kernel/bc/beancounter.c
@@ -0,0 +1,1207 @@
+/*
+ *  kernel/bc/beancounter.c
+ *
+ *  Copyright (C) 1998  Alan Cox
+ *                1998-2000  Andrey V. Savochkin <saw@saw.sw.com.sg>
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ * TODO:
+ *   - more intelligent limit check in mremap(): currently the new size is
+ *     charged and _then_ old size is uncharged
+ *     (almost done: !move_vma case is completely done,
+ *      move_vma in its current implementation requires too many conditions to
+ *      do things right, because it may be not only expansion, but shrinking
+ *      also, plus do_munmap will require an additional parameter...)
+ *   - problem: bad pmd page handling
+ *   - consider /proc redesign
+ *   - TCP/UDP ports
+ *   + consider whether __charge_beancounter_locked should be inline
+ *
+ * Changes:
+ *   1999/08/17  Marcelo Tosatti <marcelo@conectiva.com.br>
+ *	- Set "barrier" and "limit" parts of limits atomically.
+ *   1999/10/06  Marcelo Tosatti <marcelo@conectiva.com.br>
+ *	- setublimit system call.
+ */
+
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/sched.h>
+#include <linux/random.h>
+#include <linux/cgroup.h>
+#include <linux/pid_namespace.h>
+#include <linux/cgroup.h>
+#include <linux/task_work.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+
+struct user_beancounter ub0 = {
+};
+EXPORT_SYMBOL(ub0);
+
+const char *ub_rnames[] = {
+	"kmemsize",	/* 0 */
+	"lockedpages",
+	"privvmpages",
+	"shmpages",
+	"dummy",
+	"numproc",	/* 5 */
+	"physpages",
+	"vmguarpages",
+	"oomguarpages",
+	"numtcpsock",
+	"numflock",	/* 10 */
+	"numpty",
+	"numsiginfo",
+	"tcpsndbuf",
+	"tcprcvbuf",
+	"othersockbuf",	/* 15 */
+	"dgramrcvbuf",
+	"numothersock",
+	"dcachesize",
+	"numfile",
+	"dummy",	/* 20 */
+	"dummy",
+	"dummy",
+	"numiptent",
+	"swappages",
+};
+
+/* default maximum per-cpu resource precharge */
+int ub_resource_precharge[UB_RESOURCES] = {
+	[UB_PRIVVMPAGES]= 256,
+	[UB_NUMPROC]	= 4,
+	[UB_NUMSIGINFO]	= 4,
+	[UB_NUMFILE]	= 8,
+};
+
+/* natural limits for percpu precharge bounds */
+static int resource_precharge_min = 0;
+static int resource_precharge_max = INT_MAX / NR_CPUS;
+
+static struct vfsmount *ub_cgroup_mnt;
+static struct vfsmount *ub_bound_cgroup_mnt[NR_UB_BOUND_CGROUPS];
+
+#define mem_cgroup_mnt		(ub_bound_cgroup_mnt[UB_MEM_CGROUP])
+#define blkio_cgroup_mnt	(ub_bound_cgroup_mnt[UB_BLKIO_CGROUP])
+
+static void __ub_set_css(struct user_beancounter *ub, int idx,
+			 struct cgroup_subsys_state *css)
+{
+	struct cgroup_subsys_state *old_css;
+	unsigned long flags;
+
+	if (css)
+		css_get(css);
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	old_css = ub->ub_bound_css[idx];
+	ACCESS_ONCE(ub->ub_bound_css[idx]) = css;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	if (old_css)
+		css_put(old_css);
+}
+
+struct cgroup_subsys_state *__ub_get_css(struct user_beancounter *ub, int idx)
+{
+	struct cgroup_subsys_state *css, *root_css;
+	unsigned long flags;
+
+	rcu_read_lock();
+retry:
+	css = ACCESS_ONCE(ub->ub_bound_css[idx]);
+	if (likely(css && css_tryget(css))) {
+		rcu_read_unlock();
+		return css;
+	}
+
+	root_css = ub0.ub_bound_css[idx];
+
+	/* cgroup was removed, fall back to the root */
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	if (unlikely(ub->ub_bound_css[idx] != css)) {
+		/* someone did it for us, retry */
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+		goto retry;
+	}
+	ACCESS_ONCE(ub->ub_bound_css[idx]) = root_css;
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	rcu_read_unlock();
+
+	if (css)
+		css_put(css);
+
+	css_get(root_css);
+	return root_css;
+}
+
+static void ub_set_mem_css(struct user_beancounter *ub,
+				  struct cgroup_subsys_state *css)
+{
+	__ub_set_css(ub, UB_MEM_CGROUP, css);
+}
+
+static void ub_set_blkio_css(struct user_beancounter *ub,
+			     struct cgroup_subsys_state *css)
+{
+	__ub_set_css(ub, UB_BLKIO_CGROUP, css);
+}
+
+/*
+ * Used to attach a task to a beancounter in the legacy API.
+ */
+int ub_attach_task(struct user_beancounter *ub, struct task_struct *tsk)
+{
+	int ret = 0;
+	struct user_beancounter *old_ub = tsk->task_bc.exec_ub;
+	struct cgroup_subsys_state *css;
+
+	if (ub == old_ub)
+		goto out;
+	css = ub_get_mem_css(ub);
+	ret = cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+	if (ret)
+		goto out;
+	css = ub_get_blkio_css(ub);
+	ret = cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+	if (ret)
+		goto fail_blkio;
+	ret = cgroup_kernel_attach(ub->css.cgroup, tsk);
+	if (ret)
+		goto fail_ub;
+out:
+	return ret;
+fail_ub:
+	css = ub_get_blkio_css(old_ub);
+	cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+fail_blkio:
+	css = ub_get_mem_css(old_ub);
+	cgroup_kernel_attach(css->cgroup, tsk);
+	css_put(css);
+	goto out;
+}
+
+extern void mem_cgroup_sync_beancounter(struct mem_cgroup *memcg,
+					struct user_beancounter *ub);
+extern int mem_cgroup_apply_beancounter(struct mem_cgroup *memcg,
+					struct user_beancounter *ub);
+extern void mem_cgroup_get_nr_pages(struct mem_cgroup *memcg, int nid,
+				    unsigned long *pages);
+extern unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg,
+					    bool swap);
+
+/*
+ * Update memcg limits according to beancounter configuration.
+ */
+int ub_update_memcg(struct user_beancounter *ub)
+{
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	css = ub_get_mem_css(ub);
+	ret = mem_cgroup_apply_beancounter(mem_cgroup_from_cont(css->cgroup),
+					   ub);
+	css_put(css);
+	return ret;
+}
+
+/*
+ * Synchronize memcg stats with beancounter.
+ */
+void ub_sync_memcg(struct user_beancounter *ub)
+{
+	struct cgroup_subsys_state *css;
+
+	css = ub_get_mem_css(ub);
+	mem_cgroup_sync_beancounter(mem_cgroup_from_cont(css->cgroup), ub);
+	css_put(css);
+}
+
+unsigned long ub_total_pages(struct user_beancounter *ub, bool swap)
+{
+	struct cgroup_subsys_state *css;
+	unsigned long ret;
+
+	css = ub_get_mem_css(ub);
+	ret = mem_cgroup_total_pages(mem_cgroup_from_cont(css->cgroup), swap);
+	css_put(css);
+	return ret;
+}
+
+void init_beancounter_precharge(struct user_beancounter *ub, int resource)
+{
+	/* limit maximum precharge with one half of current resource excess */
+	ub->ub_parms[resource].max_precharge = min_t(long,
+			ub_resource_precharge[resource],
+			ub_resource_excess(ub, resource, UB_SOFT) /
+			(2 * num_possible_cpus()));
+}
+
+static void init_beancounter_precharges(struct user_beancounter *ub)
+{
+	int resource;
+
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+		init_beancounter_precharge(ub, resource);
+}
+
+static void __init init_beancounter_precharges_early(struct user_beancounter *ub)
+{
+	int resource;
+
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ ) {
+
+		/* DEBUG: sanity checks for initial precharge bounds */
+		BUG_ON(ub_resource_precharge[resource] < resource_precharge_min);
+		BUG_ON(ub_resource_precharge[resource] > resource_precharge_max);
+
+		ub->ub_parms[resource].max_precharge =
+			ub_resource_precharge[resource];
+	}
+}
+
+void ub_precharge_snapshot(struct user_beancounter *ub, int *precharge)
+{
+	int cpu, resource;
+
+	memset(precharge, 0, sizeof(int) * UB_RESOURCES);
+	for_each_possible_cpu(cpu) {
+		struct ub_percpu_struct *pcpu = ub_percpu(ub, cpu);
+		for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+			precharge[resource] += pcpu->precharge[resource];
+	}
+}
+
+static void uncharge_beancounter_precharge(struct user_beancounter *ub)
+{
+	int resource, precharge[UB_RESOURCES];
+
+	ub_precharge_snapshot(ub, precharge);
+	for ( resource = 0 ; resource < UB_RESOURCES ; resource++ )
+		ub->ub_parms[resource].held -= precharge[resource];
+}
+
+static void init_beancounter_struct(struct user_beancounter *ub);
+static void init_beancounter_nolimits(struct user_beancounter *ub);
+
+static DEFINE_SPINLOCK(ub_list_lock);
+LIST_HEAD(ub_list_head); /* protected by ub_list_lock */
+EXPORT_SYMBOL(ub_list_head);
+int ub_count;
+
+/*
+ *	Per user resource beancounting. Resources are tied to their luid.
+ *	The resource structure itself is tagged both to the process and
+ *	the charging resources (a socket doesn't want to have to search for
+ *	things at irq time for example). Reference counters keep things in
+ *	hand.
+ *
+ *	The case where a user creates resource, kills all his processes and
+ *	then starts new ones is correctly handled this way. The refcounters
+ *	will mean the old entry is still around with resource tied to it.
+ */
+
+static struct user_beancounter *alloc_ub(const char *name)
+{
+	struct user_beancounter *new_ub;
+
+	new_ub = kzalloc(sizeof(*new_ub), GFP_KERNEL);
+	if (new_ub == NULL)
+		return NULL;
+
+	init_beancounter_nolimits(new_ub);
+	init_beancounter_struct(new_ub);
+
+	init_beancounter_precharges(new_ub);
+
+	new_ub->ub_name = kstrdup(name, GFP_KERNEL);
+	if (!new_ub->ub_name)
+		goto fail_name;
+
+	new_ub->ub_percpu = alloc_percpu(struct ub_percpu_struct);
+	if (new_ub->ub_percpu == NULL)
+		goto fail_free;
+
+	return new_ub;
+
+fail_free:
+	kfree(new_ub->ub_name);
+fail_name:
+	kfree(new_ub);
+	return NULL;
+}
+
+static inline void free_ub(struct user_beancounter *ub)
+{
+	free_percpu(ub->ub_percpu);
+	kfree(ub->ub_store);
+	kfree(ub->ub_name);
+	kfree(ub->iolimit);
+	kfree(ub);
+}
+
+/*
+ * Used to lookup or create a beancounter in the legacy API.
+ */
+struct user_beancounter *get_beancounter_by_name(const char *name, int create)
+{
+	struct user_beancounter *ub;
+	struct cgroup *cg, *ub_cg;
+	int err = 0;
+
+	if (!strcmp(name, get_ub0()->ub_name))
+		return get_beancounter(get_ub0());
+
+	ub_cg = cgroup_kernel_open(cgroup_get_root(ub_cgroup_mnt), 0, name);
+	if (IS_ERR(ub_cg))
+		return NULL;
+	if (ub_cg) {
+		ub = cgroup_ub(ub_cg);
+		goto out;
+	}
+	if (!create)
+		return NULL;
+
+	/* The beancounter does not exist and we were asked to create it */
+
+	ub_cg = cgroup_kernel_open(cgroup_get_root(ub_cgroup_mnt),
+				   CGRP_CREAT, name);
+	if (IS_ERR(ub_cg))
+		return ERR_CAST(ub_cg);
+
+	ub = cgroup_ub(ub_cg);
+
+	cg = cgroup_kernel_open(cgroup_get_root(mem_cgroup_mnt),
+				CGRP_CREAT, name);
+	err = PTR_ERR(cg);
+	if (IS_ERR(cg))
+		goto out;
+
+	ub_set_mem_css(ub, cgroup_subsys_state(cg, mem_cgroup_subsys_id));
+	cgroup_kernel_close(cg);
+
+	cg = cgroup_kernel_open(cgroup_get_root(blkio_cgroup_mnt),
+				CGRP_CREAT, name);
+	err = PTR_ERR(cg);
+	if (IS_ERR(cg))
+		goto out;
+
+	ub_set_blkio_css(ub, cgroup_subsys_state(cg, blkio_subsys_id));
+	cgroup_kernel_close(cg);
+
+	err = ub_update_memcg(cgroup_ub(ub_cg));
+	if (err)
+		pr_warn("Failed to init UB %s limits: %d\n", name, err);
+
+out:
+	if (!err)
+		get_beancounter(ub);
+	else
+		ub = NULL;
+
+	/* Don't care about cgroup removal on error, because currently we
+	 * never clean up beancounter cgroups in legacy mode */
+	cgroup_kernel_close(ub_cg);
+	return ub;
+}
+
+struct user_beancounter *get_beancounter_byuid(uid_t uid, int create)
+{
+	char name[32];
+
+	snprintf(name, sizeof(name), "%u", uid);
+	return get_beancounter_by_name(name, create);
+}
+EXPORT_SYMBOL(get_beancounter_byuid);
+
+uid_t ub_legacy_id(struct user_beancounter *ub)
+{
+	uid_t id;
+
+	if (kstrtouint(ub->ub_name, 10, &id) != 0)
+		id = -1;
+	return id;
+}
+
+static int verify_res(struct user_beancounter *ub, const char *name,
+		unsigned long held)
+{
+	if (likely(held == 0))
+		return 1;
+
+	printk(KERN_WARNING "Ub %s holds %ld in %s on put\n",
+			ub->ub_name, held, name);
+	return 0;
+}
+
+static inline int bc_verify_held(struct user_beancounter *ub)
+{
+	int i, clean;
+
+	ub_stat_mod(ub, dirty_pages, __ub_percpu_sum(ub, dirty_pages));
+	ub_stat_mod(ub, writeback_pages, __ub_percpu_sum(ub, writeback_pages));
+	uncharge_beancounter_precharge(ub);
+
+	/* accounted by memcg */
+	ub->ub_parms[UB_KMEMSIZE].held = 0;
+	ub->ub_parms[UB_DCACHESIZE].held = 0;
+	ub->ub_parms[UB_PHYSPAGES].held = 0;
+	ub->ub_parms[UB_SWAPPAGES].held = 0;
+	ub->ub_parms[UB_OOMGUARPAGES].held = 0;
+	ub->ub_parms[UB_NUMTCPSOCK].held = 0;
+	ub->ub_parms[UB_TCPSNDBUF].held = 0;
+	ub->ub_parms[UB_TCPRCVBUF].held = 0;
+	ub->ub_parms[UB_OTHERSOCKBUF].held = 0;
+	ub->ub_parms[UB_DGRAMRCVBUF].held = 0;
+	ub->ub_parms[UB_NUMOTHERSOCK].held = 0;
+
+	clean = 1;
+	for (i = 0; i < UB_RESOURCES; i++)
+		clean &= verify_res(ub, ub_rnames[i], ub->ub_parms[i].held);
+
+	clean &= verify_res(ub, "dirty_pages",
+			__ub_stat_get(ub, dirty_pages));
+	clean &= verify_res(ub, "writeback_pages",
+			__ub_stat_get(ub, writeback_pages));
+
+	return clean;
+}
+
+static struct cgroup_subsys_state *ub_cgroup_css_alloc(struct cgroup *cg)
+{
+	struct user_beancounter *ub;
+
+	if (!cg->parent)
+		return &ub0.css;
+
+	/* forbid nested containers */
+	if (cgroup_ub(cg->parent) != &ub0)
+		return ERR_PTR(-EPERM);
+
+	ub = alloc_ub(cg->dentry->d_name.name);
+	if (!ub)
+		return ERR_PTR(-ENOMEM);
+
+	return &ub->css;
+}
+
+static int ub_cgroup_css_online(struct cgroup *cg)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+
+	if (!cg->parent)
+		return 0;
+
+	init_beancounter_nolimits(ub);
+	spin_lock(&ub_list_lock);
+	list_add_rcu(&ub->ub_list, &ub_list_head);
+	ub_count++;
+	spin_unlock(&ub_list_lock);
+	return 0;
+}
+
+static void ub_cgroup_css_offline(struct cgroup *cg)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+
+	spin_lock(&ub_list_lock);
+	ub_count--;
+	list_del_rcu(&ub->ub_list);
+	spin_unlock(&ub_list_lock);
+}
+
+static void ub_cgroup_css_free(struct cgroup *cg)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	int i;
+
+	for (i = 0; i < NR_UB_BOUND_CGROUPS; i++)
+		__ub_set_css(ub, i, NULL);
+
+	if (!bc_verify_held(ub)) {
+		printk(KERN_ERR "UB: leaked beancounter %s (%p)\n",
+				ub->ub_name, ub);
+		add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
+		return;
+	}
+	free_ub(ub);
+}
+
+static void __ub_cgroup_attach(struct task_struct *tsk)
+{
+	struct user_beancounter *ub;
+
+	rcu_read_lock();
+	do {
+		ub = cgroup_ub(task_cgroup(current, ub_subsys_id));
+		if (tsk->task_bc.exec_ub == ub)
+			goto out;
+	} while (!get_beancounter_rcu(ub));
+	put_beancounter(tsk->task_bc.exec_ub);
+	tsk->task_bc.exec_ub = ub;
+out:
+	rcu_read_unlock();
+}
+
+static void ub_cgroup_attach_work_fn(struct callback_head *ch)
+{
+	__ub_cgroup_attach(current);
+}
+
+static void ub_cgroup_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct task_struct *p;
+
+	/*
+	 * task_bc->exec_ub can only be modified by the owner task so we use
+	 * task work to get things done
+	 */
+	cgroup_taskset_for_each(p, cg, tset) {
+		/*
+		 * kthreads cannot be kicked to run a task work so we just
+		 * don't change ub for them
+		 */
+		if (p->flags & PF_KTHREAD)
+			return;
+
+		init_task_work(&p->task_bc.cgroup_attach_work,
+			       ub_cgroup_attach_work_fn);
+		task_work_cancel(p, ub_cgroup_attach_work_fn);
+		task_work_add(p, &p->task_bc.cgroup_attach_work, true);
+	}
+}
+
+static void ub_cgroup_fork(struct task_struct *tsk, void *private)
+{
+	/*
+	 * If a forking task is moved between cgroups, the child will have
+	 * exec_ub set to the source cgroup while being attached to the
+	 * destination cgroup, because the parent's exec_ub will only change
+	 * when it returns to userspace (see ub_cgroup_attach). To avoid this
+	 * discrepancy, here we synchronize the child's exec_ub with its
+	 * cgroup. It is safe, because the task is not allowed to run yet and
+	 * therefore cannot get/set its exec_ub.
+	 */
+	__ub_cgroup_attach(tsk);
+}
+
+static ssize_t ub_cgroup_read(struct cgroup *cg, struct cftype *cft,
+			      struct file *file, char __user *buf,
+			      size_t nbytes, loff_t *ppos)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct cgroup_subsys_state *bound_css;
+	char *path;
+	int len;
+	ssize_t ret;
+
+	bound_css = __ub_get_css(ub, cft->private);
+
+	ret = -ENOMEM;
+	path = kmalloc(PATH_MAX + 1, GFP_KERNEL);
+	if (!path)
+		goto out;
+	ret = cgroup_path(bound_css->cgroup, path, PATH_MAX);
+	if (!ret) {
+		len = strlen(path);
+		path[len++] = '\n';
+		path[len] = '\0';
+		ret = simple_read_from_buffer(buf, nbytes, ppos, path, len);
+	}
+	kfree(path);
+out:
+	css_put(bound_css);
+	return ret;
+}
+
+static int ub_cgroup_write(struct cgroup *cg, struct cftype *cft,
+			   const char *buf)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct cgroup *bound_cg;
+
+	bound_cg = cgroup_kernel_lookup(ub_bound_cgroup_mnt[cft->private],
+					buf);
+	if (IS_ERR(bound_cg))
+		return PTR_ERR(bound_cg);
+
+	switch (cft->private) {
+	case UB_MEM_CGROUP:
+		ub_set_mem_css(ub, cgroup_subsys_state(bound_cg,
+					mem_cgroup_subsys_id));
+		break;
+	case UB_BLKIO_CGROUP:
+		ub_set_blkio_css(ub, cgroup_subsys_state(bound_cg,
+					blkio_subsys_id));
+		break;
+	}
+
+	cgroup_kernel_close(bound_cg);
+	return 0;
+}
+
+static struct cftype ub_cgroup_files[] = {
+	{
+		.name = "memory",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_MEM_CGROUP,
+		.write_string = ub_cgroup_write,
+		.read = ub_cgroup_read,
+	},
+	{
+		.name = "blkio",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_BLKIO_CGROUP,
+		.write_string = ub_cgroup_write,
+		.read = ub_cgroup_read,
+	},
+	{ },	/* terminate */
+};
+
+enum {
+	UB_CGROUP_ATTR_HELD,
+	UB_CGROUP_ATTR_MAXHELD,
+	UB_CGROUP_ATTR_BARRIER,
+	UB_CGROUP_ATTR_LIMIT,
+	UB_CGROUP_ATTR_FAILCNT,
+	UB_CGROUP_NR_ATTRS,
+};
+
+#define UB_CGROUP_PRIVATE(res, attr)	(((res) << 16) | (attr))
+#define UB_CGROUP_RES(val)		(((val) >> 16) & 0xffff)
+#define UB_CGROUP_ATTR(val)		((val) & 0xffff)
+
+static ssize_t ub_cgroup_resource_read(struct cgroup *cg, struct cftype *cft,
+				       struct file *file, char __user *buf,
+				       size_t nbytes, loff_t *ppos)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct ubparm *ubparm;
+	unsigned long val;
+	int res, attr;
+	int len;
+	char str[32];
+
+	res = UB_CGROUP_RES(cft->private);
+	attr = UB_CGROUP_ATTR(cft->private);
+
+	ubparm = &ub->ub_parms[res];
+
+	switch (attr) {
+	case UB_CGROUP_ATTR_HELD:
+		val = ubparm->held;
+		break;
+	case UB_CGROUP_ATTR_MAXHELD:
+		val = ubparm->maxheld;
+		break;
+	case UB_CGROUP_ATTR_BARRIER:
+		val = ubparm->barrier;
+		break;
+	case UB_CGROUP_ATTR_LIMIT:
+		val = ubparm->limit;
+		break;
+	case UB_CGROUP_ATTR_FAILCNT:
+		val = ubparm->failcnt;
+		break;
+	default:
+		BUG();
+	}
+
+	len = scnprintf(str, sizeof(str), "%lu\n", val);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int ub_cgroup_resource_write(struct cgroup *cg, struct cftype *cft,
+				    u64 val)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct ubparm *ubparm;
+	int res, attr;
+
+	if (val > UB_MAXVALUE)
+		return -EINVAL;
+
+	res = UB_CGROUP_RES(cft->private);
+	attr = UB_CGROUP_ATTR(cft->private);
+
+	ubparm = &ub->ub_parms[res];
+
+	spin_lock_irq(&ub->ub_lock);
+	switch (attr) {
+	case UB_CGROUP_ATTR_BARRIER:
+		ubparm->barrier = val;
+		break;
+	case UB_CGROUP_ATTR_LIMIT:
+		ubparm->limit = val;
+		break;
+	default:
+		BUG();
+	}
+	init_beancounter_precharge(ub, res);
+	spin_unlock_irq(&ub->ub_lock);
+	return 0;
+}
+
+static __init int ub_cgroup_init(void)
+{
+	static struct cftype cgroup_files[UB_RESOURCES * UB_CGROUP_NR_ATTRS + 1];
+	struct cftype *cft;
+	int i, j;
+
+	for (i = 0, j = 0; i < UB_RESOURCES; i++) {
+		if (!strcmp(ub_rnames[i], "dummy"))
+			continue;
+
+		/* accounted by memcg */
+		switch (i) {
+		case UB_KMEMSIZE:
+		case UB_DCACHESIZE:
+		case UB_PHYSPAGES:
+		case UB_SWAPPAGES:
+		case UB_OOMGUARPAGES:
+		case UB_NUMTCPSOCK:
+		case UB_TCPSNDBUF:
+		case UB_TCPRCVBUF:
+		case UB_OTHERSOCKBUF:
+		case UB_DGRAMRCVBUF:
+		case UB_NUMOTHERSOCK:
+			continue;
+		}
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.held", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_HELD);
+		cft->read = ub_cgroup_resource_read;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 1];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.maxheld", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_MAXHELD);
+		cft->read = ub_cgroup_resource_read;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 2];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.barrier", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_BARRIER);
+		cft->read = ub_cgroup_resource_read;
+		cft->write_u64 = ub_cgroup_resource_write;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 3];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.limit", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_LIMIT);
+		cft->read = ub_cgroup_resource_read;
+		cft->write_u64 = ub_cgroup_resource_write;
+
+		cft = &cgroup_files[j * UB_CGROUP_NR_ATTRS + 4];
+		snprintf(cft->name, MAX_CFTYPE_NAME, "%s.failcnt", ub_rnames[i]);
+		cft->flags = CFTYPE_NOT_ON_ROOT;
+		cft->private = UB_CGROUP_PRIVATE(i, UB_CGROUP_ATTR_FAILCNT);
+		cft->read = ub_cgroup_resource_read;
+
+		j++;
+	}
+
+	WARN_ON(cgroup_add_cftypes(&ub_subsys, cgroup_files));
+
+	return 0;
+}
+module_init(ub_cgroup_init);
+
+struct cgroup_subsys ub_subsys = {
+	.name = "beancounter",
+	.subsys_id = ub_subsys_id,
+	.css_alloc = ub_cgroup_css_alloc,
+	.css_online = ub_cgroup_css_online,
+	.css_offline = ub_cgroup_css_offline,
+	.css_free = ub_cgroup_css_free,
+	.attach = ub_cgroup_attach,
+	.fork = ub_cgroup_fork,
+	.base_cftypes = ub_cgroup_files,
+	.use_id = true,
+};
+EXPORT_SYMBOL(ub_subsys);
+
+/*
+ *	Generic resource charging stuff
+ */
+
+int __charge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	/*
+	 * ub_value <= UB_MAXVALUE, val <= UB_MAXVALUE, and only one addition
+	 * can be in flight at a time, so an overflow is impossible.
+	 */
+	ub->ub_parms[resource].held += val;
+
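+	/*
+	 * Deliberate fall-through: UB_HARD charges fail above the barrier,
+	 * UB_SOFT charges fail above the limit, and UB_FORCE always
+	 * succeeds.  A break lands on the failure path below.
+	 */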
+	switch (strict & ~UB_SEV_FLAGS) {
+		case UB_HARD:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].barrier)
+				break;
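+			/* fall through */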
+		case UB_SOFT:
+			if (ub->ub_parms[resource].held >
+					ub->ub_parms[resource].limit)
+				break;
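+			/* fall through */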
+		case UB_FORCE:
+			ub_adjust_maxheld(ub, resource);
+			return 0;
+		default:
+			BUG();
+	}
+
+	if (!(strict & UB_TEST)) {
+		if (strict == UB_SOFT && __ratelimit(&ub->ub_ratelimit))
+			printk(KERN_INFO "Fatal resource shortage: %s, UB %s.\n",
+			       ub_rnames[resource], ub->ub_name);
+		ub->ub_parms[resource].failcnt++;
+	}
+	ub->ub_parms[resource].held -= val;
+	return -ENOMEM;
+}
+
+int charge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	int retval;
+	unsigned long flags;
+
+	retval = -EINVAL;
+	if (val > UB_MAXVALUE)
+		goto out;
+
+	if (ub) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		retval = __charge_beancounter_locked(ub, resource, val, strict);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+out:
+	return retval;
+}
+
+EXPORT_SYMBOL(charge_beancounter);
+
+void uncharge_warn(struct user_beancounter *ub, const char *resource,
+		unsigned long val, unsigned long held)
+{
+	printk(KERN_ERR "Uncharging too much: val %lu, held %lu, res %s, ub %s\n",
+			val, held, resource, ub->ub_name);
+}
+
+void __uncharge_beancounter_locked(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	if (ub->ub_parms[resource].held < val) {
+		uncharge_warn(ub, ub_rnames[resource],
+				val, ub->ub_parms[resource].held);
+		val = ub->ub_parms[resource].held;
+	}
+	ub->ub_parms[resource].held -= val;
+}
+
+void uncharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+
+	if (ub) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		__uncharge_beancounter_locked(ub, resource, val);
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+}
+
+EXPORT_SYMBOL(uncharge_beancounter);
+
+/* called with disabled interrupts */
+static int __precharge_beancounter_percpu(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	struct ub_percpu_struct *ub_pcpu = ub_percpu(ub, smp_processor_id());
+	int charge, retval;
+
+	BUG_ON(ub->ub_parms[resource].max_precharge < 0);
+
+	if (likely(ub_pcpu->precharge[resource] >= val))
+		return 0;
+
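+	/* top up the per-cpu reserve to max(val, max_precharge / 2) */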
+	spin_lock(&ub->ub_lock);
+	charge = max((int)val, ub->ub_parms[resource].max_precharge >> 1) -
+		ub_pcpu->precharge[resource];
+	retval = __charge_beancounter_locked(ub, resource,
+			charge, UB_SOFT | UB_TEST);
+	if (!retval)
+		ub_pcpu->precharge[resource] += charge;
+	spin_unlock(&ub->ub_lock);
+
+	return retval;
+}
+
+/* called with disabled interrupts */
+int __charge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val, enum ub_severity strict)
+{
+	int retval, precharge;
+
+	spin_lock(&ub->ub_lock);
+	precharge = max(0, (ub->ub_parms[resource].max_precharge >> 1) -
+			ub_pcpu->precharge[resource]);
+	retval = __charge_beancounter_locked(ub, resource,
+			val + precharge, UB_SOFT | UB_TEST);
+	if (!retval)
+		ub_pcpu->precharge[resource] += precharge;
+	else {
+		init_beancounter_precharge(ub, resource);
+		retval = __charge_beancounter_locked(ub, resource,
+				val, strict);
+	}
+	spin_unlock(&ub->ub_lock);
+
+	return retval;
+}
+EXPORT_SYMBOL(__charge_beancounter_percpu);
+
+/* called with disabled interrupts */
+void __uncharge_beancounter_percpu(struct user_beancounter *ub,
+		struct ub_percpu_struct *ub_pcpu,
+		int resource, unsigned long val)
+{
+	int uncharge;
+
+	spin_lock(&ub->ub_lock);
+	if (ub->ub_parms[resource].max_precharge !=
+			ub_resource_precharge[resource])
+		init_beancounter_precharge(ub, resource);
+	uncharge = max(0, ub_pcpu->precharge[resource] -
+			(ub->ub_parms[resource].max_precharge >> 1));
+	ub_pcpu->precharge[resource] -= uncharge;
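+	/* pairs with the smp_rmb() in __get_beancounter_usage_percpu() */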
+	smp_wmb();
+	__uncharge_beancounter_locked(ub, resource, val + uncharge);
+	spin_unlock(&ub->ub_lock);
+}
+EXPORT_SYMBOL(__uncharge_beancounter_percpu);
+
+unsigned long __get_beancounter_usage_percpu(struct user_beancounter *ub,
+		int resource)
+{
+	long held, precharge;
+
+	held = ub->ub_parms[resource].held;
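+	/* pairs with the smp_wmb() in __uncharge_beancounter_percpu() */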
+	smp_rmb();
+	precharge = __ub_percpu_sum(ub, precharge[resource]);
+
+	return max(0l, held - precharge);
+}
+
+int precharge_beancounter(struct user_beancounter *ub,
+		int resource, unsigned long val)
+{
+	unsigned long flags;
+	int retval;
+
+	retval = -EINVAL;
+	if (val > UB_MAXVALUE)
+		goto out;
+
+	local_irq_save(flags);
+	if (ub)
+		retval = __precharge_beancounter_percpu(ub, resource, val);
+	local_irq_restore(flags);
+out:
+	return retval;
+}
+EXPORT_SYMBOL(precharge_beancounter);
+
+/*
+ *	Initialization
+ *
+ *	struct user_beancounter contains
+ *	 - limits and other configuration settings,
+ *	   with a copy stored for accounting purposes,
+ *	 - structural fields: lists, spinlocks and so on.
+ *
+ *	Before these parts are initialized, the structure should be memset
+ *	to 0 or copied from a known clean structure.  That takes care of a lot
+ *	of fields not initialized explicitly.
+ */
+
+static void init_beancounter_struct(struct user_beancounter *ub)
+{
+	ub->ub_magic = UB_MAGIC;
+	spin_lock_init(&ub->ub_lock);
+}
+
+static void init_beancounter_nolimits(struct user_beancounter *ub)
+{
+	int k;
+
+	for (k = 0; k < UB_RESOURCES; k++) {
+		ub->ub_parms[k].limit = UB_MAXVALUE;
+		ub->ub_parms[k].barrier = UB_MAXVALUE;
+	}
+
+	/*
+	 * An unlimited vmguarpages barrier grants immunity from the
+	 * system-wide overcommit policy.  That makes sense in some setups,
+	 * but by default we must obey it.
+	 */
+	ub->ub_parms[UB_VMGUARPAGES].barrier = 0;
+
+	/*
+	 * Unlimited oomguarpages makes a container or host mostly immune to
+	 * the OOM-killer while other containers exist.  However, we cannot
+	 * set it to zero either, otherwise a single unconfigured container
+	 * would be the first target for the OOM-killer.  75% of RAM looks
+	 * like a sane default.
+	 */
+	ub->ub_parms[UB_OOMGUARPAGES].barrier = totalram_pages * 3 / 4;
+
+	/* Ratelimit for messages in the kernel log */
+	ub->ub_ratelimit.burst = 4;
+	ub->ub_ratelimit.interval = 300*HZ;
+}
+
+static DEFINE_PER_CPU(struct ub_percpu_struct, ub0_percpu);
+
+void __init ub_init_early(void)
+{
+	struct user_beancounter *ub;
+
+	ub = get_ub0();
+	ub->ub_name = "0";
+	init_beancounter_nolimits(ub);
+	init_beancounter_struct(ub);
+	init_beancounter_precharges_early(ub);
+	ub->ub_percpu = &ub0_percpu;
+
+	memset(&current->task_bc, 0, sizeof(struct task_beancounter));
+	(void)set_exec_ub(ub);
+	current->task_bc.task_ub = get_beancounter(ub);
+	__charge_beancounter_locked(ub, UB_NUMPROC, 1, UB_FORCE);
+	init_mm.mm_ub = get_beancounter(ub);
+
+	list_add(&ub->ub_list, &ub_list_head);
+	ub_count++;
+}
+
+static int proc_resource_precharge(ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	static DEFINE_MUTEX(lock);
+	struct user_beancounter *ub;
+	int err;
+
+	mutex_lock(&lock);
+
+	err = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (err || !write)
+		goto out;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		spin_lock_irq(&ub->ub_lock);
+		init_beancounter_precharges(ub);
+		spin_unlock_irq(&ub->ub_lock);
+	}
+	rcu_read_unlock();
+
+out:
+	mutex_unlock(&lock);
+	return err;
+}
+
+static ctl_table ub_sysctl_table[] = {
+	{
+		.procname	= "resource_precharge",
+		.data		= &ub_resource_precharge,
+		.extra1		= &resource_precharge_min,
+		.extra2		= &resource_precharge_max,
+		.maxlen		= sizeof(ub_resource_precharge),
+		.mode		= 0644,
+		.proc_handler	= &proc_resource_precharge,
+	},
+	{
+		.procname	= "overcommit_memory",
+		.data		= &ub_overcommit_memory,
+		.maxlen		= sizeof(ub_overcommit_memory),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#ifdef CONFIG_BC_IO_ACCOUNTING
+	{
+		.procname	= "dirty_ratio",
+		.data		= &ub_dirty_ratio,
+		.maxlen		= sizeof(ub_dirty_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.procname	= "dirty_background_ratio",
+		.data		= &ub_dirty_background_ratio,
+		.maxlen		= sizeof(ub_dirty_background_ratio),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif /* CONFIG_BC_IO_ACCOUNTING */
+	{ }
+};
+
+static ctl_table ub_sysctl_root[] = {
+	{
+		.procname	= "ubc",
+		.mode		= 0555,
+		.child		= ub_sysctl_table,
+	},
+	{ }
+};
+
+void __init ub_init_late(void)
+{
+	ub_set_mem_css(&ub0, task_subsys_state_check(&init_task,
+				mem_cgroup_subsys_id, true));
+	ub_set_blkio_css(&ub0, task_subsys_state_check(&init_task,
+				blkio_subsys_id, true));
+
+	register_sysctl_table(ub_sysctl_root);
+}
+
+int __init ub_init_cgroup(void)
+{
+	struct cgroup_sb_opts blkio_opts = {
+		.subsys_mask    = (1ul << blkio_subsys_id),
+	};
+	struct cgroup_sb_opts mem_opts = {
+		.subsys_mask    = (1ul << mem_cgroup_subsys_id),
+	};
+	struct cgroup_sb_opts ub_opts = {
+		.subsys_mask	= (1ul << ub_subsys_id),
+	};
+
+	blkio_cgroup_mnt = cgroup_kernel_mount(&blkio_opts);
+	if (IS_ERR(blkio_cgroup_mnt))
+		panic("Failed to mount blkio cgroup: %ld\n",
+		      PTR_ERR(blkio_cgroup_mnt));
+
+	mem_cgroup_mnt = cgroup_kernel_mount(&mem_opts);
+	if (IS_ERR(mem_cgroup_mnt))
+		panic("Failed to mount memory cgroup: %ld\n",
+		      PTR_ERR(mem_cgroup_mnt));
+
+	ub_cgroup_mnt = cgroup_kernel_mount(&ub_opts);
+	if (IS_ERR(ub_cgroup_mnt))
+		panic("Failed to mount beancounter cgroup: %ld\n",
+		      PTR_ERR(ub_cgroup_mnt));
+
+	return 0;
+}
+late_initcall(ub_init_cgroup);
--- /dev/null
+++ b/kernel/bc/io_acct.c
@@ -0,0 +1,335 @@
+/*
+ *  kernel/bc/io_acct.c
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ *  Pavel Emelianov <xemul@openvz.org>
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mempool.h>
+#include <linux/proc_fs.h>
+#include <linux/virtinfo.h>
+#include <linux/pagemap.h>
+#include <linux/module.h>
+#include <linux/writeback.h>
+#include <linux/backing-dev.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/proc.h>
+#include <bc/vmpages.h>
+
+/*
+ * Start writeback once dirty memory reaches this percentage of the
+ * physpages limit.
+ */
+int ub_dirty_ratio = 50;
+int ub_dirty_background_ratio = 30;
+
+/* called with mapping->tree_lock held for writing */
+
+void ub_io_account_dirty(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	WARN_ON_ONCE(!radix_tree_tagged(&mapping->page_tree,
+				PAGECACHE_TAG_DIRTY));
+
+	if (!ub)
+		ub = mapping->dirtied_ub = get_beancounter(get_io_ub());
+
+	ub_stat_inc(ub, dirty_pages);
+}
+EXPORT_SYMBOL_GPL(ub_io_account_dirty);
+
+void ub_io_account_clean(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+	size_t bytes = PAGE_SIZE;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, dirty_pages);
+
+	ub_percpu_inc(ub, async_write_complete);
+
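+	/*
+	 * Temporarily switch the exec ub so that the notifier accounts the
+	 * completed write to the beancounter that dirtied the pages.
+	 */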
+	ub = set_exec_ub(ub);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_ACCOUNT, &bytes);
+	ub = set_exec_ub(ub);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) ||
+	     !mapping_cap_account_writeback(mapping))) {
+		mapping->dirtied_ub = NULL;
+		put_beancounter(ub);
+	}
+}
+
+void ub_io_account_cancel(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, dirty_pages);
+
+	ub_percpu_inc(ub, async_write_canceled);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) ||
+	     !mapping_cap_account_writeback(mapping))) {
+		mapping->dirtied_ub = NULL;
+		put_beancounter(ub);
+	}
+}
+
+void ub_io_writeback_inc(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	WARN_ON_ONCE(!radix_tree_tagged(&mapping->page_tree,
+				PAGECACHE_TAG_WRITEBACK));
+
+	if (!ub)
+		ub = mapping->dirtied_ub = get_beancounter(get_io_ub());
+
+	ub_stat_inc(ub, writeback_pages);
+}
+
+void ub_io_writeback_dec(struct address_space *mapping)
+{
+	struct user_beancounter *ub = mapping->dirtied_ub;
+
+	if (unlikely(!ub)) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
+	ub_stat_dec(ub, writeback_pages);
+
+	if (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_WRITEBACK) &&
+	    (!radix_tree_tagged(&mapping->page_tree, PAGECACHE_TAG_DIRTY) ||
+	     !mapping_cap_account_dirty(mapping))) {
+		mapping->dirtied_ub = NULL;
+		put_beancounter(ub);
+	}
+}
+
+static bool __ub_over_bground_thresh(struct user_beancounter *ub)
+{
+	unsigned long background_thresh, dirty_thresh;
+	unsigned long ub_dirty, ub_writeback;
+
+	ub_dirty_limits(&background_thresh, &dirty_thresh, ub);
+
+	ub_dirty = ub_stat_get(ub, dirty_pages);
+	ub_writeback = ub_stat_get(ub, writeback_pages);
+
+	return ub_dirty + ub_writeback >= background_thresh;
+}
+
+bool ub_over_bground_thresh(void)
+{
+	struct user_beancounter *ub;
+	bool ret = false;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (ub == get_ub0())
+			continue;
+		if (__ub_over_bground_thresh(ub)) {
+			ret = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+int ub_dirty_limits(unsigned long *pbackground,
+		    long *pdirty, struct user_beancounter *ub)
+{
+	int dirty_ratio;
+	unsigned long available_memory;
+
+	*pdirty = *pbackground = LONG_MAX;
+
+	dirty_ratio = ub_dirty_ratio;
+	if (!dirty_ratio)
+		return 0;
+
+	available_memory = ub_total_pages(ub, false);
+	if (available_memory == ULONG_MAX || available_memory == 0)
+		return 0;
+
+	*pdirty = (dirty_ratio * available_memory) / 100;
+
+	dirty_ratio = ub_dirty_background_ratio;
+	*pbackground = (dirty_ratio * available_memory) / 100;
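+	/* fall back to half the dirty threshold if unset or not lower */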
+	if (!dirty_ratio || *pbackground >= *pdirty)
+		*pbackground = *pdirty / 2;
+
+	return 1;
+}
+
+bool ub_should_skip_writeback(struct user_beancounter *ub, struct inode *inode)
+{
+	struct user_beancounter *dirtied_ub;
+	bool ret;
+
+	rcu_read_lock();
+	dirtied_ub = rcu_dereference(inode->i_mapping->dirtied_ub);
+	if (ub)
+		ret = (ub != dirtied_ub);
+	else
+		ret = (dirtied_ub && !__ub_over_bground_thresh(dirtied_ub));
+	rcu_read_unlock();
+
+	return ret;
+}
+
+#ifdef CONFIG_PROC_FS
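+/* started minus completed, clamped at zero against per-cpu summation skew */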
+#define in_flight(var)	(var > var##_done ? var - var##_done : 0)
+
+static int bc_ioacct_show(struct seq_file *f, void *v)
+{
+	int i;
+	unsigned long long read, write, cancel;
+	unsigned long sync, sync_done;
+	unsigned long fsync, fsync_done;
+	unsigned long fdsync, fdsync_done;
+	unsigned long frsync, frsync_done;
+	struct user_beancounter *ub;
+	unsigned long dirty_pages;
+	unsigned long long dirtied;
+	unsigned long fuse_requests, fuse_bytes;
+
+	ub = seq_beancounter(f);
+
+	dirty_pages = __ub_stat_get(ub, dirty_pages);
+
+	read = write = cancel = 0;
+	sync = sync_done = fsync = fsync_done =
+		fdsync = fdsync_done = frsync = frsync_done = 0;
+	fuse_requests = fuse_bytes = 0;
+	for_each_online_cpu(i) {
+		struct ub_percpu_struct *ub_percpu;
+		ub_percpu = per_cpu_ptr(ub->ub_percpu, i);
+
+		read += ub_percpu->sync_read_bytes;
+		write += ub_percpu->sync_write_bytes;
+
+		dirty_pages += ub_percpu->dirty_pages;
+		write += (u64)ub_percpu->async_write_complete << PAGE_SHIFT;
+		cancel += (u64)ub_percpu->async_write_canceled << PAGE_SHIFT;
+
+		sync += ub_percpu->sync;
+		fsync += ub_percpu->fsync;
+		fdsync += ub_percpu->fdsync;
+		frsync += ub_percpu->frsync;
+		sync_done += ub_percpu->sync_done;
+		fsync_done += ub_percpu->fsync_done;
+		fdsync_done += ub_percpu->fdsync_done;
+		frsync_done += ub_percpu->frsync_done;
+
+		fuse_requests += ub_percpu->fuse_requests;
+		fuse_bytes += ub_percpu->fuse_bytes;
+	}
+
+	if ((long)dirty_pages < 0)
+		dirty_pages = 0;
+
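+	/* everything ever dirtied = written back + canceled + still dirty */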
+	dirtied = write + cancel;
+	dirtied += (u64)dirty_pages << PAGE_SHIFT;
+
+	seq_printf(f, bc_proc_llu_fmt, "read", read);
+	seq_printf(f, bc_proc_llu_fmt, "write", write);
+	seq_printf(f, bc_proc_llu_fmt, "dirty", dirtied);
+	seq_printf(f, bc_proc_llu_fmt, "cancel", cancel);
+	seq_printf(f, bc_proc_llu_fmt, "missed", 0ull);
+
+	seq_printf(f, bc_proc_lu_lfmt, "syncs_total", sync);
+	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_total", fsync);
+	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_total", fdsync);
+	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_total", frsync);
+
+	seq_printf(f, bc_proc_lu_lfmt, "syncs_active", in_flight(sync));
+	seq_printf(f, bc_proc_lu_lfmt, "fsyncs_active", in_flight(fsync));
+	seq_printf(f, bc_proc_lu_lfmt, "fdatasyncs_active", in_flight(fdsync));
+	seq_printf(f, bc_proc_lu_lfmt, "range_syncs_active", in_flight(frsync));
+
+	seq_printf(f, bc_proc_lu_lfmt, "io_pbs", dirty_pages);
+
+	seq_printf(f, bc_proc_lu_lfmt, "fuse_requests", fuse_requests);
+	seq_printf(f, bc_proc_lu_lfmt, "fuse_bytes", fuse_bytes);
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_ioacct_entry = {
+	.name = "ioacct",
+	.u.show = bc_ioacct_show,
+};
+
+static int bc_ioacct_notify(struct vnotifier_block *self,
+		unsigned long event, void *arg, int old_ret)
+{
+	struct user_beancounter *ub;
+	struct ub_percpu_struct *ub_pcpu;
+	unsigned long *vm_events;
+	unsigned long long bin, bout;
+	int i;
+
+	if (event != VIRTINFO_VMSTAT)
+		return old_ret;
+
+	ub = get_exec_ub();
+	if (ub == get_ub0())
+		return old_ret;
+
+	/* Think over: do we need to account here bytes_dirty_missed? */
+	bout = 0;
+	bin = 0;
+	for_each_online_cpu(i) {
+		ub_pcpu = per_cpu_ptr(ub->ub_percpu, i);
+		bout += (u64)ub_pcpu->async_write_complete << PAGE_SHIFT;
+		bout += ub_pcpu->sync_write_bytes;
+		bin += ub_pcpu->sync_read_bytes;
+	}
+
+	/* convert to Kbytes */
+	bout >>= 10;
+	bin >>= 10;
+
+	vm_events = ((unsigned long *)arg) + NR_VM_ZONE_STAT_ITEMS;
+	vm_events[PGPGOUT] = (unsigned long)bout;
+	vm_events[PGPGIN] = (unsigned long)bin;
+	return NOTIFY_OK;
+}
+
+static struct vnotifier_block bc_ioacct_nb = {
+	.notifier_call = bc_ioacct_notify,
+};
+
+static int __init bc_ioacct_init(void)
+{
+	bc_register_proc_entry(&bc_ioacct_entry);
+
+	virtinfo_notifier_register(VITYPE_GENERAL, &bc_ioacct_nb);
+	return 0;
+}
+
+late_initcall(bc_ioacct_init);
+#endif
--- /dev/null
+++ b/kernel/bc/io_prio.c
@@ -0,0 +1,173 @@
+/*
+ *  kernel/bc/io_prio.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/cgroup.h>
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+
+static unsigned int ioprio_weight[UB_IOPRIO_MAX] = {
+	320, 365, 410, 460, 500, 550, 600, 640,
+};
+
+extern unsigned int blkcg_get_weight(struct cgroup *cgrp);
+extern int blkcg_set_weight(struct cgroup *cgrp, unsigned int weight);
+extern void blkcg_show_ub_iostat(struct cgroup *cgrp, struct seq_file *sf);
+
+int ub_set_ioprio(int id, int ioprio)
+{
+	struct user_beancounter *ub;
+	struct cgroup_subsys_state *css;
+	int ret;
+
+	ret = -ERANGE;
+	if (ioprio < UB_IOPRIO_MIN || ioprio >= UB_IOPRIO_MAX)
+		goto out;
+
+	ret = -ESRCH;
+	ub = get_beancounter_byuid(id, 0);
+	if (!ub)
+		goto out;
+
+	css = ub_get_blkio_css(ub);
+	ret = blkcg_set_weight(css->cgroup, ioprio_weight[ioprio]);
+	css_put(css);
+	put_beancounter(ub);
+out:
+	return ret;
+}
+
+#ifdef CONFIG_PROC_FS
+
+static int bc_iostat(struct seq_file *f, struct user_beancounter *bc)
+{
+	struct cgroup_subsys_state *css;
+
+	seq_printf(f, "flush %s . 0 0 0 0 0 %ld %ld 0 0\n",
+			bc->ub_name,
+			ub_stat_get_exact(bc, wb_requests),
+			ub_stat_get_exact(bc, wb_sectors));
+
+	seq_printf(f, "fuse %s . 0 0 0 0 0 %lu %lu 0 0\n",
+			bc->ub_name,
+			__ub_percpu_sum(bc, fuse_requests),
+			__ub_percpu_sum(bc, fuse_bytes) >> 9);
+
+	css = ub_get_blkio_css(bc);
+	blkcg_show_ub_iostat(css->cgroup, f);
+	css_put(css);
+	return 0;
+}
+
+static int bc_iostat_single(struct seq_file *f, void *v)
+{
+	return bc_iostat(f, seq_beancounter(f));
+}
+
+static struct bc_proc_entry bc_iostat_entry = {
+	.name = "iostat",
+	.u.show = bc_iostat_single,
+};
+
+static void *bc_iostat_start(struct seq_file *f, loff_t *ppos)
+{
+	struct user_beancounter *ub;
+	unsigned long pos = *ppos;
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (!pos--)
+			return ub;
+	}
+	return NULL;
+}
+
+static void *bc_iostat_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+	struct user_beancounter *ub = v;
+
+	list_for_each_entry_continue_rcu(ub, &ub_list_head, ub_list) {
+		(*ppos)++;
+		return ub;
+	}
+	return NULL;
+}
+
+static int bc_iostat_show(struct seq_file *f, void *v)
+{
+	f->private = v;
+	return bc_iostat(f, v);
+}
+
+static void bc_iostat_stop(struct seq_file *f, void *v)
+{
+	rcu_read_unlock();
+}
+
+static struct seq_operations iostat_seq_ops = {
+	.start = bc_iostat_start,
+	.next  = bc_iostat_next,
+	.stop  = bc_iostat_stop,
+	.show  = bc_iostat_show,
+};
+
+static int bc_iostat_open(struct inode *inode, struct file *filp)
+{
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &iostat_seq_ops);
+}
+
+static struct file_operations bc_iostat_ops = {
+	.open		= bc_iostat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
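+/* pack the resource index and attribute id into cftype->private */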
+static struct bc_proc_entry bc_root_iostat_entry = {
+	.name = "iostat",
+	.u.fops = &bc_iostat_ops,
+};
+
+static int bc_ioprio_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *bc;
+	struct cgroup_subsys_state *css;
+	unsigned int weight;
+	int ioprio;
+
+	bc = seq_beancounter(f);
+
+	css = ub_get_blkio_css(bc);
+	weight = blkcg_get_weight(css->cgroup);
+	css_put(css);
+
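+	/* map the blkcg weight back to the highest ioprio whose weight fits */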
+	ioprio = UB_IOPRIO_MAX - 1;
+	while (ioprio && weight < ioprio_weight[ioprio])
+		ioprio--;
+
+	seq_printf(f, "prio: %d\n", ioprio);
+	return 0;
+}
+
+static struct bc_proc_entry bc_ioprio_entry = {
+	.name = "ioprio",
+	.u.show = bc_ioprio_show,
+};
+
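+	/*
+	 * Stream uids to userspace a page at a time: copy_to_user() may
+	 * sleep, so the RCU lock is dropped (and the current ub pinned)
+	 * around each copy.
+	 */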
+static int __init bc_iostat_init(void)
+{
+	bc_register_proc_entry(&bc_ioprio_entry);
+	bc_register_proc_entry(&bc_iostat_entry);
+	bc_register_proc_root_entry(&bc_root_iostat_entry);
+	return 0;
+}
+late_initcall(bc_iostat_init);
+
+#endif /* CONFIG_PROC_FS */
--- /dev/null
+++ b/kernel/bc/misc.c
@@ -0,0 +1,168 @@
+/*
+ *  kernel/bc/misc.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/signal.h>
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+
+/*
+ * Task stuff
+ */
+
+int ub_task_charge(struct user_beancounter *ub)
+{
+	return charge_beancounter_fast(ub, UB_NUMPROC, 1, UB_HARD);
+}
+
+void ub_task_uncharge(struct user_beancounter *ub)
+{
+	uncharge_beancounter_fast(ub, UB_NUMPROC, 1);
+}
+
+void ub_task_get(struct user_beancounter *ub, struct task_struct *task)
+{
+	struct task_beancounter *new_bc = &task->task_bc;
+
+	new_bc->task_ub = get_beancounter(ub);
+	new_bc->exec_ub = get_beancounter(ub);
+}
+
+void ub_task_put(struct task_struct *task)
+{
+	struct task_beancounter *task_bc;
+
+	task_bc = &task->task_bc;
+
+	put_beancounter(task_bc->exec_ub);
+	put_beancounter(task_bc->task_ub);
+
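+	/* poison the pointers to catch use-after-put */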
+	task_bc->exec_ub = (struct user_beancounter *)0xdeadbcbc;
+	task_bc->task_ub = (struct user_beancounter *)0xdead100c;
+}
+
+int ub_file_charge(struct file *f)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	int err;
+
+	err = charge_beancounter_fast(ub, UB_NUMFILE, 1, UB_HARD);
+	if (unlikely(err))
+		goto no_file;
+
+	f->f_ub = get_beancounter(ub);
+
+	return 0;
+
+no_file:
+	return err;
+}
+
+void ub_file_uncharge(struct file *f)
+{
+	struct user_beancounter *ub = f->f_ub;
+
+	uncharge_beancounter_fast(ub, UB_NUMFILE, 1);
+	put_beancounter(ub);
+}
+
+int ub_flock_charge(struct file_lock *fl, int hard)
+{
+	struct user_beancounter *ub;
+	int err;
+
+	ub = fl->fl_ub;
+	if (ub == NULL)
+		return 0;
+
+	err = charge_beancounter(ub, UB_NUMFLOCK, 1, hard ? UB_HARD : UB_SOFT);
+	if (!err)
+		fl->fl_charged = 1;
+	return err;
+}
+
+void ub_flock_uncharge(struct file_lock *fl)
+{
+	struct user_beancounter *ub;
+
+	ub = fl->fl_ub;
+	if (ub == NULL || !fl->fl_charged)
+		return;
+
+	uncharge_beancounter(ub, UB_NUMFLOCK, 1);
+	fl->fl_charged = 0;
+}
+
+/*
+ * Signal handling
+ */
+
+int ub_siginfo_charge(struct sigqueue *sq, struct user_beancounter *ub,
+			gfp_t gfp_mask)
+{
+	if (charge_beancounter_fast(ub, UB_NUMSIGINFO, 1, UB_HARD))
+		return -ENOMEM;
+
+	sq->sig_ub = get_beancounter(ub);
+	return 0;
+}
+EXPORT_SYMBOL(ub_siginfo_charge);
+
+void ub_siginfo_uncharge(struct sigqueue *sq)
+{
+	struct user_beancounter *ub;
+
+	ub = sq->sig_ub;
+	sq->sig_ub = NULL;
+	uncharge_beancounter_fast(ub, UB_NUMSIGINFO, 1);
+	put_beancounter(ub);
+}
+
+/*
+ * PTYs
+ */
+
+int ub_pty_charge(struct tty_struct *tty)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	int retval;
+
+	retval = 0;
+	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+			!test_bit(TTY_CHARGED, &tty->flags)) {
+		retval = charge_beancounter(ub, UB_NUMPTY, 1, UB_HARD);
+		if (!retval) {
+			set_bit(TTY_CHARGED, &tty->flags);
+			tty->ub = get_beancounter(ub);
+		}
+	}
+	return retval;
+}
+
+void ub_pty_uncharge(struct tty_struct *tty)
+{
+	struct user_beancounter *ub;
+
+	ub = tty->ub;
+	if (ub && tty->driver->subtype == PTY_TYPE_MASTER &&
+			test_bit(TTY_CHARGED, &tty->flags)) {
+		uncharge_beancounter(ub, UB_NUMPTY, 1);
+		clear_bit(TTY_CHARGED, &tty->flags);
+		put_beancounter(ub);
+	}
+}
--- /dev/null
+++ b/kernel/bc/proc.c
@@ -0,0 +1,760 @@
+/*
+ *  kernel/bc/proc.c
+ *
+ *  Copyright (c) 2006-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/ve_proto.h>
+#include <linux/virtinfo.h>
+#include <linux/mount.h>
+#include <linux/nsproxy.h>
+#include <linux/mnt_namespace.h>
+#include <linux/lglock.h>
+#include <linux/ve.h>
+#include <linux/memcontrol.h>
+
+#include <bc/beancounter.h>
+#include <bc/proc.h>
+
+/* Generic output formats */
+#if BITS_PER_LONG == 32
+const char *bc_proc_lu_fmt = "\t%-20s %10lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %10lu %10lu\n";
+#else
+const char *bc_proc_lu_fmt = "\t%-20s %21lu\n";
+const char *bc_proc_lu_lfmt = "\t%-20s %21lu\n";
+const char *bc_proc_llu_fmt = "\t%-20s %21llu\n";
+const char *bc_proc_lu_lu_fmt = "\t%-20s %21lu %21lu\n";
+#endif
+
+#if BITS_PER_LONG == 32
+static const char *head_fmt = "%10s  %-12s %10s %10s %10s %10s %10s\n";
+static const char *res_fmt = "%10s%c %-12s %10lu %10lu %10lu %10lu %10lu\n";
+#else
+static const char *head_fmt = "%10s  %-12s %20s %20s %20s %20s %20s\n";
+static const char *res_fmt = "%10s%c %-12s %20lu %20lu %20lu %20lu %20lu\n";
+#endif
+
+static void ub_show_res(struct seq_file *f, struct user_beancounter *ub,
+		int r, int precharge, int show_uid)
+{
+	struct ubparm *p;
+	unsigned long held;
+
+	p = &ub->ub_parms[r];
+	held = p->held;
+	held = (held > precharge) ? (held - precharge) : 0;
+
+	seq_printf(f, res_fmt,
+			show_uid && r == 0 ? ub->ub_name : "",
+			show_uid && r == 0 ? ':' : ' ',
+			ub_rnames[r],
+			held,
+			p->maxheld,
+			p->barrier,
+			p->limit,
+			p->failcnt);
+}
+
+static void __show_resources(struct seq_file *f, struct user_beancounter *ub,
+		int show_uid)
+{
+	int i, precharge[UB_RESOURCES];
+
+	ub_sync_memcg(ub);
+	ub_precharge_snapshot(ub, precharge);
+
+	for (i = 0; i < UB_RESOURCES_COMPAT; i++)
+		if (strcmp(ub_rnames[i], "dummy") != 0)
+			ub_show_res(f, ub, i, precharge[i], show_uid);
+
+	for (i = UB_RESOURCES_COMPAT; i < UB_RESOURCES; i++)
+		ub_show_res(f, ub, i, precharge[i], show_uid);
+}
+
+static int bc_resources_show(struct seq_file *f, void *v)
+{
+	__show_resources(f, seq_beancounter(f), 0);
+	return 0;
+}
+
+static struct bc_proc_entry bc_resources_entry = {
+	.name = "resources",
+	.u.show = bc_resources_show,
+};
+
+static int bc_precharge_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+	int i, cpus = num_possible_cpus();
+	int precharge[UB_RESOURCES];
+
+	seq_printf(f, "%-12s %16s %10s %10s\n",
+			"resource", "real_held", "precharge", "max_precharge");
+
+	ub = seq_beancounter(f);
+	ub_precharge_snapshot(ub, precharge);
+	for (i = 0; i < UB_RESOURCES; i++) {
+		if (!strcmp(ub_rnames[i], "dummy"))
+			continue;
+		seq_printf(f, "%-12s %16lu %10d %10d\n", ub_rnames[i],
+				ub->ub_parms[i].held,
+				precharge[i],
+				ub->ub_parms[i].max_precharge * cpus);
+	}
+
+	return 0;
+}
+
+static struct bc_proc_entry bc_precharge_entry = {
+	.name = "precharge",
+	.u.show = bc_precharge_show,
+};
+
+static int bc_proc_meminfo_show(struct seq_file *f, void *v)
+{
+	return meminfo_proc_show_ub(f, NULL,
+			seq_beancounter(f), VE_MEMINFO_DEFAULT);
+}
+
+static struct bc_proc_entry bc_meminfo_entry = {
+	.name = "meminfo",
+	.u.show = bc_proc_meminfo_show,
+};
+
+extern void mem_cgroup_get_nr_pages(struct mem_cgroup *memcg, int nid,
+				    unsigned long *pages);
+
+#define K(x) ((x) << (PAGE_SHIFT - 10))
+static int bc_proc_nodeinfo_show(struct seq_file *f, void *v)
+{
+	int nid;
+	struct cgroup_subsys_state *css;
+	unsigned long pages[NR_LRU_LISTS];
+
+	css = ub_get_mem_css(seq_beancounter(f));
+	for_each_node_state(nid, N_HIGH_MEMORY) {
+		memset(pages, 0, sizeof(pages));
+		mem_cgroup_get_nr_pages(mem_cgroup_from_cont(css->cgroup),
+					nid, pages);
+		seq_printf(f,
+			"Node %d Active:         %8lu kB\n"
+			"Node %d Inactive:       %8lu kB\n"
+			"Node %d Active(anon):   %8lu kB\n"
+			"Node %d Inactive(anon): %8lu kB\n"
+			"Node %d Active(file):   %8lu kB\n"
+			"Node %d Inactive(file): %8lu kB\n"
+			"Node %d Unevictable:    %8lu kB\n",
+			nid, K(pages[LRU_ACTIVE_ANON] +
+			       pages[LRU_ACTIVE_FILE]),
+			nid, K(pages[LRU_INACTIVE_ANON] +
+			       pages[LRU_INACTIVE_FILE]),
+			nid, K(pages[LRU_ACTIVE_ANON]),
+			nid, K(pages[LRU_INACTIVE_ANON]),
+			nid, K(pages[LRU_ACTIVE_FILE]),
+			nid, K(pages[LRU_INACTIVE_FILE]),
+			nid, K(pages[LRU_UNEVICTABLE]));
+	}
+	css_put(css);
+	return 0;
+}
+#undef K
+
+static struct bc_proc_entry bc_nodeinfo_entry = {
+	.name = "nodeinfo",
+	.u.show = bc_proc_nodeinfo_show,
+};
+
+static int ub_show(struct seq_file *f, void *v)
+{
+	int i, precharge[UB_RESOURCES];
+	struct user_beancounter *ub = v;
+
+	ub_sync_memcg(ub);
+	ub_precharge_snapshot(ub, precharge);
+
+	for (i = 0; i < UB_RESOURCES_COMPAT; i++)
+		ub_show_res(f, ub, i, precharge[i], 1);
+	return 0;
+}
+
+static int res_show(struct seq_file *f, void *v)
+{
+	__show_resources(f, (struct user_beancounter *)v, 1);
+	return 0;
+}
+
+static int ub_accessible(struct user_beancounter *exec,
+		struct user_beancounter *target)
+{
+	return (exec == get_ub0() || exec == target);
+}
+
+static void ub_show_header(struct seq_file *f)
+{
+	seq_printf(f, "Version: 2.5\n");
+	seq_printf(f, head_fmt, "uid", "resource",
+			"held", "maxheld", "barrier", "limit", "failcnt");
+}
+
+static void *ub_start(struct seq_file *f, loff_t *ppos)
+{
+	struct user_beancounter *ub, *ret = NULL;
+	struct user_beancounter *exec_ub;
+	unsigned long pos;
+
+	pos = *ppos;
+	if (pos == 0)
+		ub_show_header(f);
+
+	exec_ub = get_exec_ub();
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		if (!ub_accessible(exec_ub, ub))
+			continue;
+		if (!get_beancounter_rcu(ub))
+			continue;
+		if (pos-- == 0) {
+			ret = ub;
+			break;
+		}
+		put_beancounter(ub);
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static void *ub_next(struct seq_file *f, void *v, loff_t *ppos)
+{
+	struct user_beancounter *ub, *ret = NULL;
+	struct user_beancounter *exec_ub;
+
+	exec_ub = get_exec_ub();
+	ub = (struct user_beancounter *)v;
+	rcu_read_lock();
+	put_beancounter(ub);
+	list_for_each_entry_continue_rcu(ub, &ub_list_head, ub_list) {
+		if (!ub_accessible(exec_ub, ub))
+			continue;
+		if (!get_beancounter_rcu(ub))
+			continue;
+		(*ppos)++;
+		ret = ub;
+		break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static void ub_stop(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+
+	ub = (struct user_beancounter *)v;
+	put_beancounter(ub);
+}
+
+static struct seq_operations ub_seq_ops = {
+	.start = ub_start,
+	.next  = ub_next,
+	.stop  = ub_stop,
+	.show  = ub_show,
+};
+
+static int ub_open(struct inode *inode, struct file *filp)
+{
+	if (!(ve_capable(CAP_DAC_OVERRIDE) && ve_capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &ub_seq_ops);
+}
+
+static struct file_operations ub_file_operations = {
+	.open		= ub_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct seq_operations res_seq_ops = {
+	.start = ub_start,
+	.next  = ub_next,
+	.stop  = ub_stop,
+	.show  = res_show,
+};
+
+static int res_open(struct inode *inode, struct file *filp)
+{
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EACCES;
+
+	return seq_open(filp, &res_seq_ops);
+}
+
+static struct file_operations resources_operations = {
+	.open		= res_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct bc_proc_entry bc_all_resources_entry = {
+	.name = "resources",
+	.u.fops = &resources_operations,
+};
+
+/*
+ * Generic showing stuff
+ */
+
+static int cookies, num_entries;
+static struct bc_proc_entry *bc_entries __read_mostly;
+static struct bc_proc_entry *bc_root_entries __read_mostly;
+static DEFINE_SPINLOCK(bc_entries_lock);
+static struct proc_dir_entry *bc_proc_root;
+
+void bc_register_proc_entry(struct bc_proc_entry *e)
+{
+	spin_lock(&bc_entries_lock);
+	e->cookie = ++cookies;
+	e->next = bc_entries;
+	bc_entries = e;
+	num_entries++;
+	spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_entry);
+
+void bc_register_proc_root_entry(struct bc_proc_entry *e)
+{
+	spin_lock(&bc_entries_lock);
+	e->cookie = ++cookies;
+	e->next = bc_root_entries;
+	bc_root_entries = e;
+	bc_proc_root->nlink++;
+	spin_unlock(&bc_entries_lock);
+}
+
+EXPORT_SYMBOL(bc_register_proc_root_entry);
+
+/*
+ * small helpers
+ */
+
+static inline unsigned long bc_make_ino(struct user_beancounter *ub)
+{
+	return 0xbc000000 | (css_id(&ub->css) + 1);
+}
+
+static inline unsigned long bc_make_file_ino(struct bc_proc_entry *de)
+{
+	return 0xbe000000 + de->cookie;
+}
+
+static int bc_d_delete(const struct dentry *d)
+{
+	return 1;
+}
+
+static void bc_d_release(struct dentry *d)
+{
+	put_beancounter((struct user_beancounter *)d->d_fsdata);
+}
+
+static struct inode_operations bc_entry_iops;
+static struct file_operations bc_entry_fops;
+static struct dentry_operations bc_dentry_ops = {
+	.d_delete = bc_d_delete,
+	.d_release = bc_d_release,
+};
+
+/*
+ * common directory operations' helpers
+ */
+
+static int bc_readdir(struct file *file, filldir_t filler, void *data,
+		struct user_beancounter *parent)
+{
+	int err = 0;
+	loff_t pos, filled;
+	struct user_beancounter *ub, *prev;
+	struct bc_proc_entry *pde;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return -EPERM;
+
+	pos = file->f_pos;
+	if (pos == 0) {
+		err = (*filler)(data, ".", 1, pos,
+				file->f_dentry->d_inode->i_ino, DT_DIR);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	if (pos == 1) {
+		err = (*filler)(data, "..", 2, pos,
+				parent_ino(file->f_dentry), DT_DIR);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	filled = 2;
+	for (pde = (parent == NULL ? bc_root_entries : bc_entries);
+			pde != NULL; pde = pde->next) {
+		if (filled++ < pos)
+			continue;
+
+		err = (*filler)(data, pde->name, strlen(pde->name), pos,
+				bc_make_file_ino(pde), DT_REG);
+		if (err < 0) {
+			err = 0;
+			goto out;
+		}
+		pos++;
+	}
+
+	if (parent)
+		goto out;
+
+	rcu_read_lock();
+	prev = NULL;
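+	/* prime the cursor at the list head; the first step yields the first ub */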
+	ub = list_entry(&ub_list_head, struct user_beancounter, ub_list);
+	while (1) {
+		ub = list_entry(rcu_dereference(ub->ub_list.next),
+				struct user_beancounter, ub_list);
+		if (&ub->ub_list == &ub_list_head)
+			break;
+
+		if (!get_beancounter_rcu(ub))
+			continue;
+
+		if (filled++ < pos) {
+			put_beancounter(ub);
+			continue;
+		}
+
+		rcu_read_unlock();
+		put_beancounter(prev);
+
+		err = (*filler)(data, ub->ub_name, strlen(ub->ub_name),
+				pos, bc_make_ino(ub), DT_DIR);
+		if (err < 0) {
+			err = 0;
+			put_beancounter(ub);
+			goto out;
+		}
+
+		rcu_read_lock();
+		prev = ub;
+		pos++;
+	}
+	rcu_read_unlock();
+	put_beancounter(prev);
+out:
+	file->f_pos = pos;
+	return err;
+}
+
+static int bc_looktest(struct inode *ino, void *data)
+{
+	return ino->i_op == &bc_entry_iops && ino->i_private == data;
+}
+
+static int bc_lookset(struct inode *ino, void *data)
+{
+	struct user_beancounter *ub;
+
+	ub = (struct user_beancounter *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_ino(ub);
+	ino->i_fop = &bc_entry_fops;
+	ino->i_op = &bc_entry_iops;
+	ino->i_mode = S_IFDIR | S_IRUSR | S_IXUSR;
+	/* subbeancounters are not included, but who cares? */
+	ino->__i_nlink = num_entries + 2;
+	ino->i_gid = GLOBAL_ROOT_GID;
+	ino->i_uid = GLOBAL_ROOT_UID;
+	return 0;
+}
+
+static struct dentry *bc_lookup(struct user_beancounter *ub, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *ino;
+
+	ino = iget5_locked(dir->i_sb, css_id(&ub->css), bc_looktest, bc_lookset, ub);
+	if (ino == NULL)
+		goto out_put;
+
+	if (ino->i_state & I_NEW)
+		unlock_new_inode(ino);
+	d_set_d_op(dentry, &bc_dentry_ops);
+	dentry->d_fsdata = ub;
+	d_add(dentry, ino);
+	return NULL;
+
+out_put:
+	put_beancounter(ub);
+	return ERR_PTR(-ENOENT);
+}
+
+/*
+ * files (bc_proc_entry) manipulations
+ */
+
+static struct dentry *bc_lookup_file(struct inode *dir,
+		struct dentry *dentry, struct bc_proc_entry *root,
+		int (*test)(struct inode *, void *),
+		int (*set)(struct inode *, void *))
+{
+	struct bc_proc_entry *pde;
+	struct inode *ino;
+
+	for (pde = root; pde != NULL; pde = pde->next)
+		if (strcmp(pde->name, dentry->d_name.name) == 0)
+			break;
+
+	if (pde == NULL)
+		return ERR_PTR(-ESRCH);
+
+	ino = iget5_locked(dir->i_sb, pde->cookie, test, set, pde);
+	if (ino == NULL)
+		return ERR_PTR(-ENOENT);
+
+	if (ino->i_state & I_NEW)
+		unlock_new_inode(ino);
+	d_set_d_op(dentry, &bc_dentry_ops);
+	d_add(dentry, ino);
+	return NULL;
+}
+
+static int bc_file_open(struct inode *ino, struct file *filp)
+{
+	struct bc_proc_entry *de;
+	struct user_beancounter *ub;
+
+	de = (struct bc_proc_entry *)ino->i_private;
+	ub = (struct user_beancounter *)filp->f_dentry->d_parent->d_fsdata;
+	BUG_ON(ub->ub_magic != UB_MAGIC);
+
+	/*
+	 * ub can't disappear: we hold d_parent, he holds the beancounter
+	 */
+	return single_open(filp, de->u.show, ub);
+}
+
+static struct file_operations bc_file_ops = {
+	.open		= bc_file_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int bc_looktest_entry(struct inode *ino, void *data)
+{
+	return ino->i_fop == &bc_file_ops && ino->i_private == data;
+}
+
+static int bc_lookset_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_file_ino(de);
+	ino->i_fop = &bc_file_ops;
+	ino->i_mode = S_IFREG | S_IRUSR;
+	ino->__i_nlink = 1;
+	ino->i_gid = GLOBAL_ROOT_GID;
+	ino->i_uid = GLOBAL_ROOT_UID;
+	return 0;
+}
+
+static inline struct dentry *bc_lookup_files(struct inode *dir,
+		struct dentry *de)
+{
+	return bc_lookup_file(dir, de, bc_entries,
+			bc_looktest_entry, bc_lookset_entry);
+}
+
+static int bc_looktest_root_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	return ino->i_fop == de->u.fops && ino->i_private == data;
+}
+
+static int bc_lookset_root_entry(struct inode *ino, void *data)
+{
+	struct bc_proc_entry *de;
+
+	de = (struct bc_proc_entry *)data;
+	ino->i_private = data;
+	ino->i_ino = bc_make_file_ino(de);
+	ino->i_fop = de->u.fops;
+	ino->i_mode = S_IFREG | S_IRUSR;
+	ino->__i_nlink = 1;
+	ino->i_gid = GLOBAL_ROOT_GID;
+	ino->i_uid = GLOBAL_ROOT_UID;
+	return 0;
+}
+
+static inline struct dentry *bc_lookup_root_files(struct inode *dir,
+		struct dentry *de)
+{
+	return bc_lookup_file(dir, de, bc_root_entries,
+			bc_looktest_root_entry, bc_lookset_root_entry);
+}
+
+/*
+ * /proc/bc/.../<id> directory operations
+ */
+
+static int bc_entry_readdir(struct file *file, void *data, filldir_t filler)
+{
+	return bc_readdir(file, filler, data,
+			(struct user_beancounter *)file->f_dentry->d_fsdata);
+}
+
+static struct dentry *bc_entry_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	struct dentry *de;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return ERR_PTR(-EPERM);
+
+	de = bc_lookup_files(dir, dentry);
+	if (de != ERR_PTR(-ESRCH))
+		return de;
+
+	return ERR_PTR(-ENOENT);
+}
+
+static int bc_entry_getattr(struct vfsmount *mnt, struct dentry *dentry,
+		struct kstat *stat)
+{
+	generic_fillattr(dentry->d_inode, stat);
+	stat->nlink = 2;
+	return 0;
+}
+
+static struct file_operations bc_entry_fops = {
+	.read = generic_read_dir,
+	.readdir = bc_entry_readdir,
+};
+
+static struct inode_operations bc_entry_iops = {
+	.lookup = bc_entry_lookup,
+	.getattr = bc_entry_getattr,
+};
+
+/*
+ * /proc/bc directory operations
+ */
+
+static int bc_root_readdir(struct file *file, void *data, filldir_t filler)
+{
+	return bc_readdir(file, filler, data, NULL);
+}
+
+static struct dentry *bc_root_lookup(struct inode *dir, struct dentry *dentry,
+		unsigned int flags)
+{
+	struct user_beancounter *ub;
+	struct dentry *de;
+
+	if (!(capable(CAP_DAC_OVERRIDE) && capable(CAP_DAC_READ_SEARCH)))
+		return ERR_PTR(-EPERM);
+
+	de = bc_lookup_root_files(dir, dentry);
+	if (de != ERR_PTR(-ESRCH))
+		return de;
+
+	ub = get_beancounter_by_name(dentry->d_name.name, 0);
+	if (IS_ERR_OR_NULL(ub))
+		return ub ? ERR_CAST(ub) : ERR_PTR(-ENOENT);
+
+	return bc_lookup(ub, dir, dentry);
+}
+
+static int bc_root_getattr(struct vfsmount *mnt, struct dentry *dentry,
+	struct kstat *stat)
+{
+	generic_fillattr(dentry->d_inode, stat);
+	stat->nlink = ub_count + 2;
+	return 0;
+}
+
+static struct file_operations bc_root_fops = {
+	.read = generic_read_dir,
+	.readdir = bc_root_readdir,
+};
+
+static struct inode_operations bc_root_iops = {
+	.lookup = bc_root_lookup,
+	.getattr = bc_root_getattr,
+};
+
+static int ub_vswap_show(struct seq_file *f, void *unused)
+{
+	seq_puts(f, "Version: 1.0\n");
+	return 0;
+}
+
+static int ub_vswap_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, ub_vswap_show, NULL);
+}
+
+static struct file_operations ub_vswap_fops = {
+	.open		= ub_vswap_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init ub_init_proc(void)
+{
+	struct proc_dir_entry *entry;
+
+	bc_proc_root = proc_mkdir_mode("bc", 0, NULL);
+	if (bc_proc_root == NULL)
+		panic("Can't create /proc/bc entry");
+
+	bc_proc_root->proc_fops = &bc_root_fops;
+	bc_proc_root->proc_iops = &bc_root_iops;
+
+	bc_register_proc_entry(&bc_resources_entry);
+	bc_register_proc_entry(&bc_precharge_entry);
+	bc_register_proc_root_entry(&bc_all_resources_entry);
+	bc_register_proc_entry(&bc_meminfo_entry);
+	bc_register_proc_entry(&bc_nodeinfo_entry);
+
+	entry = proc_create("user_beancounters",
+			S_IRUSR|S_ISVTX, NULL, &ub_file_operations);
+	proc_create("vswap", S_IRUSR, proc_vz_dir, &ub_vswap_fops);
+	return 0;
+}
+
+core_initcall(ub_init_proc);
--- /dev/null
+++ b/kernel/bc/statd.c
@@ -0,0 +1,527 @@
+/*
+ *  kernel/bc/statd.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/timer.h>
+#include <linux/sched.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <asm/uaccess.h>
+#include <asm/param.h>
+
+#include <bc/beancounter.h>
+#include <uapi/linux/bc/statd.h>
+
+static DEFINE_SPINLOCK(ubs_notify_lock);
+static LIST_HEAD(ubs_notify_list);
+static long ubs_min_interval;
+static ubstattime_t ubs_start_time, ubs_end_time;
+static struct timer_list ubs_timer;
+
+struct ub_stat_notify {
+	struct list_head	list;
+	struct task_struct	*task;
+	int			signum;
+};
+
+static int ubstat_get_list(void __user *buf, long size)
+{
+	int retval;
+	struct user_beancounter *ub, *ubp;
+	long *page, *ptr, *end;
+	int len;
+
+	page = (long *)__get_free_page(GFP_KERNEL);
+	if (page == NULL)
+		return -ENOMEM;
+
+	retval = 0;
+	ubp = NULL;
+	ptr = page;
+	end = page + PAGE_SIZE / sizeof(*ptr);
+
+	rcu_read_lock();
+	for_each_beancounter(ub) {
+		uid_t uid = ub_legacy_id(ub);
+
+		if (uid == -1)
+			continue;
+
+		*ptr++ = uid;
+		if (ptr != end)
+			continue;
+
+		if (!get_beancounter_rcu(ub)) {
+			ptr--;
+			continue;
+		}
+		rcu_read_unlock();
+
+		put_beancounter(ubp);
+		ubp = ub;
+
+		len = min_t(long, (ptr - page) * sizeof(*ptr), size);
+		if (copy_to_user(buf, page, len)) {
+			retval = -EFAULT;
+			goto out_put;
+		}
+		retval += len;
+		if (len < PAGE_SIZE)
+			goto out_put;
+		buf += len;
+		size -= len;
+
+		ptr = page;
+		end = page + PAGE_SIZE / sizeof(*ptr);
+
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	size = min_t(long, (ptr - page) * sizeof(*ptr), size);
+	if (size > 0 && copy_to_user(buf, page, size)) {
+		retval = -EFAULT;
+		goto out_put;
+	}
+	retval += size;
+
+out_put:
+	put_beancounter(ubp);
+	free_page((unsigned long)page);
+	return retval;
+}
+
+static int ubstat_gettime(void __user *buf, long size)
+{
+	ubgettime_t data;
+	int retval;
+
+	spin_lock(&ubs_notify_lock);
+	data.start_time = ubs_start_time;
+	data.end_time = ubs_end_time;
+	data.cur_time = ubs_start_time + (jiffies - ubs_start_time * HZ) / HZ;
+	spin_unlock(&ubs_notify_lock);
+
+	retval = min_t(long, sizeof(data), size);
+	if (copy_to_user(buf, &data, retval))
+		retval = -EFAULT;
+	return retval;
+}
+
+static int ubstat_do_read_one(struct user_beancounter *ub, int res, void *kbuf)
+{
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparm_t	param[1];
+	} *data;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+
+	data->param[0].maxheld = ub->ub_store[res].maxheld;
+	data->param[0].failcnt = ub->ub_store[res].failcnt;
+
+	return sizeof(*data);
+}
+
+static int ubstat_do_read_all(struct user_beancounter *ub, void *kbuf, int size)
+{
+	int wrote;
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparm_t	param[UB_RESOURCES];
+	} *data;
+	int resource;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		if (size < wrote + sizeof(data->param[resource]))
+			break;
+		data->param[resource].maxheld = ub->ub_store[resource].maxheld;
+		data->param[resource].failcnt = ub->ub_store[resource].failcnt;
+		wrote += sizeof(data->param[resource]);
+	}
+
+	return wrote;
+}
+
+static int ubstat_do_read_full(struct user_beancounter *ub, void *kbuf,
+		int size)
+{
+	int wrote;
+	struct {
+		ubstattime_t	start_time;
+		ubstattime_t	end_time;
+		ubstatparmf_t	param[UB_RESOURCES];
+	} *data;
+	int resource;
+
+	data = kbuf;
+	data->start_time = ubs_start_time;
+	data->end_time = ubs_end_time;
+	wrote = sizeof(data->start_time) + sizeof(data->end_time);
+
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		ubstatparmf_t *p = &data->param[resource];
+		struct ubparm *s = &ub->ub_store[resource];
+
+		if (size < wrote + sizeof(data->param[resource]))
+			break;
+
+		p->barrier	= s->barrier;
+		p->limit	= s->limit;
+		p->held		= s->held;
+		p->maxheld	= s->maxheld;
+		p->minheld	= s->minheld;
+		p->failcnt	= s->failcnt;
+		p->__unused1	= 0;
+		p->__unused2	= 0;
+
+		wrote += sizeof(data->param[resource]);
+	}
+	return wrote;
+}
+
+int ubstat_alloc_store(struct user_beancounter *ub)
+{
+	if (ub->ub_store == NULL) {
+		struct ubparm *store;
+
+		store = kmemdup(ub->ub_parms,
+				UB_RESOURCES * sizeof(struct ubparm),
+				GFP_KERNEL);
+		if (store == NULL)
+			return -ENOMEM;
+
+		spin_lock(&ubs_notify_lock);
+		if (ub->ub_store != NULL)
+			kfree(store);
+		else
+			ub->ub_store = store;
+		spin_unlock(&ubs_notify_lock);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(ubstat_alloc_store);
+
+static bool ubstat_need_memcg_sync(long cmd)
+{
+	if (UBSTAT_CMD(cmd) != UBSTAT_READ_ONE)
+		return true;
+
+	switch (UBSTAT_PARMID(cmd)) {
+		case UB_KMEMSIZE:
+		case UB_DCACHESIZE:
+		case UB_PHYSPAGES:
+		case UB_SWAPPAGES:
+		case UB_OOMGUARPAGES:
+			return true;
+	}
+	return false;
+}
+
+static int ubstat_check_cmd(long cmd)
+{
+	switch (UBSTAT_CMD(cmd)) {
+		case UBSTAT_READ_ONE:
+			if (UBSTAT_PARMID(cmd) >= UB_RESOURCES)
+				break;
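+			/* fall through */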
+		case UBSTAT_READ_ALL:
+		case UBSTAT_READ_FULL:
+			return 0;
+	}
+	return -EINVAL;
+}
+
+static int ubstat_get_stat(struct user_beancounter *ub, long cmd,
+		void __user *buf, long size)
+{
+	void *kbuf;
+	int retval;
+
+	retval = ubstat_check_cmd(cmd);
+	if (retval)
+		return retval;
+
+	kbuf = (void *)__get_free_page(GFP_KERNEL);
+	if (kbuf == NULL)
+		return -ENOMEM;
+
+	retval = ubstat_alloc_store(ub);
+	if (retval)
+		goto out;
+
+	if (ubstat_need_memcg_sync(cmd))
+		ub_sync_memcg(ub);
+
+	spin_lock(&ubs_notify_lock);
+	switch (UBSTAT_CMD(cmd)) {
+		case UBSTAT_READ_ONE:
+			retval = ubstat_do_read_one(ub,
+					UBSTAT_PARMID(cmd), kbuf);
+			break;
+		case UBSTAT_READ_ALL:
+			retval = ubstat_do_read_all(ub, kbuf, PAGE_SIZE);
+			break;
+		case UBSTAT_READ_FULL:
+			retval = ubstat_do_read_full(ub, kbuf, PAGE_SIZE);
+			break;
+		default:
+			retval = -EINVAL;
+			__WARN_printf("%s: we shouldn't get here\ncmd: %ld\n",
+					__func__, UBSTAT_CMD(cmd));
+	}
+	spin_unlock(&ubs_notify_lock);
+
+	if (retval > 0) {
+		retval = min_t(long, retval, size);
+		if (copy_to_user(buf, kbuf, retval))
+			retval = -EFAULT;
+	}
+out:
+	free_page((unsigned long)kbuf);
+	return retval;
+}
+
+static int ubstat_handle_notifrq(ubnotifrq_t *req)
+{
+	int retval;
+	struct ub_stat_notify *new_notify;
+	struct list_head *entry;
+	struct task_struct *tsk_to_free;
+
+	new_notify = kmalloc(sizeof(*new_notify), GFP_KERNEL);
+	if (new_notify == NULL)
+		return -ENOMEM;
+
+	tsk_to_free = NULL;
+	INIT_LIST_HEAD(&new_notify->list);
+
+	spin_lock(&ubs_notify_lock);
+	list_for_each(entry, &ubs_notify_list) {
+		struct ub_stat_notify *notify;
+
+		notify = list_entry(entry, struct ub_stat_notify, list);
+		if (notify->task == current) {
+			kfree(new_notify);
+			new_notify = notify;
+			break;
+		}
+	}
+
+	retval = -EINVAL;
+	if (req->maxinterval < 1)
+		goto out_unlock;
+	if (req->maxinterval > TIME_MAX_SEC)
+		req->maxinterval = TIME_MAX_SEC;
+	if (req->maxinterval < ubs_min_interval) {
+		unsigned long dif;
+
+		ubs_min_interval = req->maxinterval;
+		dif = (ubs_timer.expires - jiffies + HZ - 1) / HZ;
+		if (dif > req->maxinterval)
+			mod_timer(&ubs_timer,
+					ubs_timer.expires -
+					(dif - req->maxinterval) * HZ);
+	}
+
+	if (entry != &ubs_notify_list) {
+		list_del(&new_notify->list);
+		tsk_to_free = new_notify->task;
+	}
+	if (req->signum) {
+		new_notify->task = current;
+		get_task_struct(new_notify->task);
+		new_notify->signum = req->signum;
+		list_add(&new_notify->list, &ubs_notify_list);
+	} else
+		kfree(new_notify);
+	retval = 0;
+out_unlock:
+	spin_unlock(&ubs_notify_lock);
+	if (tsk_to_free != NULL)
+		put_task_struct(tsk_to_free);
+	return retval;
+}
+
+/*
+ * former sys_ubstat
+ */
+long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
+		void __user *buf, long size)
+{
+	int retval;
+	struct user_beancounter *ub;
+
+	if (func == UBSTAT_UBPARMNUM)
+		return UB_RESOURCES;
+	if (func == UBSTAT_UBLIST)
+		return ubstat_get_list(buf, size);
+
+	if (func == UBSTAT_GETTIME) {
+		retval = ubstat_gettime(buf, size);
+		goto notify;
+	}
+
+	ub = get_exec_ub();
+	if (ub != NULL && ub_legacy_id(ub) == arg1 && (uid_t)arg1 != -1)
+		get_beancounter(ub);
+	else /* FIXME must be if (ve_is_super) */
+		ub = get_beancounter_byuid(arg1, 0);
+
+	if (ub == NULL)
+		return -ESRCH;
+
+	retval = ubstat_get_stat(ub, func, buf, size);
+	put_beancounter(ub);
+notify:
+	/* Handle request for notification */
+	if (retval >= 0) {
+		ubnotifrq_t notifrq;
+		int err;
+
+		err = -EFAULT;
+		if (!copy_from_user(&notifrq, (void __user *)arg2,
+					sizeof(notifrq)))
+			err = ubstat_handle_notifrq(&notifrq);
+		if (err)
+			retval = err;
+	}
+
+	return retval;
+}
+
+static void ubstat_save_onestat(struct user_beancounter *ub)
+{
+	int resource;
+
+	if (ub->ub_store == NULL)
+		return;
+
+	/* called with local irq disabled */
+	spin_lock(&ub->ub_lock);
+	for (resource = 0; resource < UB_RESOURCES; resource++) {
+		memcpy(&ub->ub_store[resource], &ub->ub_parms[resource],
+			sizeof(struct ubparm));
+		ub->ub_parms[resource].minheld =
+			ub->ub_parms[resource].maxheld =
+			ub->ub_parms[resource].held;
+	}
+	spin_unlock(&ub->ub_lock);
+}
+
+static void ubstat_save_statistics(void)
+{
+	unsigned long flags;
+	struct user_beancounter *ub;
+
+	local_irq_save(flags);
+	for_each_beancounter(ub)
+		ubstat_save_onestat(ub);
+	local_irq_restore(flags);
+}
+
+static void ubstatd_timeout(unsigned long __data)
+{
+	struct task_struct *p;
+
+	p = (struct task_struct *) __data;
+	wake_up_process(p);
+}
+
+/*
+ * Safe wrapper for send_sig. It prevents a race with release_task
+ * for sighand.
+ * Should be called under tasklist_lock.
+ */
+static void task_send_sig(struct ub_stat_notify *notify)
+{
+	if (likely(notify->task->sighand != NULL))
+		send_sig(notify->signum, notify->task, 1);
+}
+
+static inline void do_notifies(void)
+{
+	LIST_HEAD(notif_free_list);
+	struct ub_stat_notify *notify;
+	struct ub_stat_notify *tmp;
+
+	spin_lock(&ubs_notify_lock);
+	ubs_start_time = ubs_end_time;
+	/*
+	 * the expression below relies on time being unsigned long and
+	 * arithmetic promotion rules
+	 */
+	ubs_end_time += (ubs_timer.expires - ubs_start_time * HZ) / HZ;
+	mod_timer(&ubs_timer, ubs_timer.expires + ubs_min_interval * HZ);
+	ubs_min_interval = TIME_MAX_SEC;
+	/* save statistics accumulated for the interval */
+	ubstat_save_statistics();
+	/* send signals */
+	read_lock(&tasklist_lock);
+	list_for_each_entry_safe(notify, tmp, &ubs_notify_list, list) {
+		task_send_sig(notify);
+		list_move(&notify->list, &notif_free_list);
+	}
+	read_unlock(&tasklist_lock);
+	spin_unlock(&ubs_notify_lock);
+
+	list_for_each_entry_safe(notify, tmp, &notif_free_list, list) {
+		put_task_struct(notify->task);
+		list_del(&notify->list);
+		kfree(notify);
+	}
+}
+
+/*
+ * Kernel thread
+ */
+static int ubstatd(void *unused)
+{
+	ubs_timer.data = (unsigned long)current;
+	ubs_timer.function = ubstatd_timeout;
+	add_timer(&ubs_timer);
+
+	while (1) {
+		set_task_state(current, TASK_INTERRUPTIBLE);
+		if (time_after(ubs_timer.expires, jiffies)) {
+			schedule();
+			try_to_freeze();
+			continue;
+		}
+
+		__set_task_state(current, TASK_RUNNING);
+		do_notifies();
+	}
+	return 0;
+}
+
+static int __init ubstatd_init(void)
+{
+	init_timer(&ubs_timer);
+	ubs_timer.expires = TIME_MAX_JIF;
+	ubs_min_interval = TIME_MAX_SEC;
+	ubs_start_time = ubs_end_time = 0;
+	kthread_run(ubstatd, NULL, "ubstatd");
+	return 0;
+}
+
+module_init(ubstatd_init);
--- /dev/null
+++ b/kernel/bc/sys.c
@@ -0,0 +1,161 @@
+/*
+ *  kernel/bc/sys.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/virtinfo.h>
+#include <linux/compat.h>
+#include <linux/syscalls.h>
+#include <linux/sched.h>
+#include <asm/uaccess.h>
+
+#include <bc/beancounter.h>
+
+/*
+ *	The (rather boring) getluid syscall
+ */
+SYSCALL_DEFINE0(getluid)
+{
+	struct user_beancounter *ub;
+	uid_t uid;
+
+	ub = get_exec_ub();
+	if (ub == NULL)
+		return -EINVAL;
+
+	uid = ub_legacy_id(ub);
+	if (uid == (uid_t)-1)
+		return -EINVAL;
+
+	return uid;
+}
+
+/*
+ *	The setluid syscall
+ */
+SYSCALL_DEFINE1(setluid, uid_t, uid)
+{
+	struct user_beancounter *ub;
+	int error;
+
+	/* You may not disown a setluid */
+	error = -EINVAL;
+	if (uid == (uid_t)-1)
+		goto out;
+
+	/* You may only set an ub as root */
+	error = -EPERM;
+	if (!capable(CAP_SETUID))
+		goto out;
+
+	/* Ok - set up a beancounter entry for this user */
+	error = -ENOBUFS;
+	ub = get_beancounter_byuid(uid, 1);
+	if (ub == NULL)
+		goto out;
+	error = ub_attach_task(ub, current);
+	put_beancounter(ub);
+out:
+	return error;
+}
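For illustration only, userspace drives these two calls as raw syscalls; __NR_setluid and __NR_getluid are OpenVZ-specific, per-architecture numbers assumed to come from the host's headers:

```c
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

int main(void)
{
	/* Requires CAP_SETUID; cannot be unset later ((uid_t)-1 is rejected). */
	if (syscall(__NR_setluid, (uid_t)101) != 0)
		perror("setluid");

	/* Returns the luid set above, or -EINVAL if none is set. */
	long luid = syscall(__NR_getluid);
	printf("luid: %ld\n", luid);
	return 0;
}
```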
+
+long do_setublimit(uid_t uid, unsigned long resource,
+		unsigned long *new_limits)
+{
+	int error;
+	unsigned long flags;
+	struct user_beancounter *ub;
+
+	error = -EINVAL;
+	if (resource >= UB_RESOURCES)
+		goto out;
+
+	error = -EINVAL;
+	if (new_limits[0] > UB_MAXVALUE || new_limits[1] > UB_MAXVALUE)
+		goto out;
+
+	error = -ENOENT;
+	ub = get_beancounter_byuid(uid, 0);
+	if (ub == NULL)
+		goto out;
+
+	ub_sync_memcg(ub);
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	ub->ub_parms[resource].barrier = new_limits[0];
+	ub->ub_parms[resource].limit = new_limits[1];
+	init_beancounter_precharge(ub, resource);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+
+	error = ub_update_memcg(ub);
+
+	put_beancounter(ub);
+out:
+	return error;
+}
+
+/*
+ *	The setublimit syscall
+ */
+SYSCALL_DEFINE3(setublimit, uid_t, uid, unsigned long, resource,
+		unsigned long __user *, limits)
+{
+	unsigned long new_limits[2];
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (copy_from_user(&new_limits, limits, sizeof(new_limits)))
+		return -EFAULT;
+
+	return do_setublimit(uid, resource, new_limits);
+}
+
+extern long do_ubstat(int func, unsigned long arg1, unsigned long arg2,
+		void __user *buf, long size);
+
+SYSCALL_DEFINE5(ubstat, int, func, unsigned long, arg1, unsigned long, arg2,
+		void __user *, buf, long, size)
+{
+	if (!capable(CAP_DAC_OVERRIDE) && !capable(CAP_DAC_READ_SEARCH))
+		return -EPERM;
+
+	return do_ubstat(func, arg1, arg2, buf, size);
+}
+
+#ifdef CONFIG_COMPAT
+#define UB_MAXVALUE_COMPAT ((1UL << (sizeof(compat_long_t) * 8 - 1)) - 1)
+
+asmlinkage long compat_sys_setublimit(uid_t uid,
+		compat_long_t resource,
+		compat_long_t __user *limits)
+{
+	compat_long_t u_new_limits[2];
+	unsigned long new_limits[2];
+
+	if (!capable(CAP_SYS_RESOURCE))
+		return -EPERM;
+
+	if (copy_from_user(&u_new_limits, limits, sizeof(u_new_limits)))
+		return -EFAULT;
+
+	new_limits[0] = u_new_limits[0];
+	new_limits[1] = u_new_limits[1];
+
+	if (u_new_limits[0] == UB_MAXVALUE_COMPAT)
+		new_limits[0] = UB_MAXVALUE;
+	if (u_new_limits[1] == UB_MAXVALUE_COMPAT)
+		new_limits[1] = UB_MAXVALUE;
+
+	return do_setublimit(uid, resource, new_limits);
+}
+
+asmlinkage long compat_sys_ubstat(int func, unsigned int arg1,
+		unsigned int arg2, compat_uptr_t *buf, long size)
+{
+	return sys_ubstat(func, arg1, arg2, buf, size);
+}
+#endif
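The compat wrapper exists because a 32-bit caller cannot express the native 64-bit "unlimited" sentinel. A worked sketch of the widening, using this file's macros and assuming the usual ILP32-compat-on-LP64 layout:

```c
#include <linux/compat.h>

/*
 * sizeof(compat_long_t) == 4, so
 *   UB_MAXVALUE_COMPAT == (1UL << 31) - 1 == 0x7fffffff,
 * the largest value a 32-bit caller can pass to mean "no limit".
 */
static unsigned long widen_compat_limit(compat_long_t v)
{
	if (v == UB_MAXVALUE_COMPAT)
		return UB_MAXVALUE;	/* native "unlimited" sentinel */
	return (unsigned long)v;	/* ordinary limits pass through */
}
```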
--- /dev/null
+++ b/kernel/bc/vm_pages.c
@@ -0,0 +1,300 @@
+/*
+ *  kernel/bc/vm_pages.c
+ *
+ *  Copyright (c) 2005-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/highmem.h>
+#include <linux/virtinfo.h>
+#include <linux/module.h>
+#include <linux/shmem_fs.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/ve.h>
+
+#include <asm/pgtable.h>
+#include <asm/page.h>
+
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+#include <bc/proc.h>
+
+int ub_overcommit_memory;
+
+int ub_memory_charge(struct mm_struct *mm, unsigned long size,
+		unsigned vm_flags, struct file *vm_file, int sv)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return 0;
+
+	size >>= PAGE_SHIFT;
+	if (size > UB_MAXVALUE)
+		return -EINVAL;
+
+	BUG_ON(sv != UB_SOFT && sv != UB_HARD);
+
+	if (vm_flags & VM_LOCKED) {
+		if (charge_beancounter(ub, UB_LOCKEDPAGES, size, sv))
+			goto out_err;
+	}
+	if (VM_UB_PRIVATE(vm_flags, vm_file)) {
+		if (charge_beancounter_fast(ub, UB_PRIVVMPAGES, size, sv))
+			goto out_private;
+	}
+	return 0;
+
+out_private:
+	if (vm_flags & VM_LOCKED)
+		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+out_err:
+	return -ENOMEM;
+}
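ub_memory_charge() above is an instance of the standard charge-then-roll-back idiom: take resources in a fixed order and, on failure, release the ones already taken in reverse order. Distilled into a sketch with this file's helpers (error handling as above):

```c
static int demo_charge_pair(struct user_beancounter *ub,
			    unsigned long pages, int strict)
{
	if (charge_beancounter(ub, UB_LOCKEDPAGES, pages, strict))
		goto out_err;			/* nothing to undo yet */
	if (charge_beancounter_fast(ub, UB_PRIVVMPAGES, pages, strict))
		goto out_locked;		/* undo the first charge */
	return 0;

out_locked:
	uncharge_beancounter(ub, UB_LOCKEDPAGES, pages);
out_err:
	return -ENOMEM;
}
```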
+
+void ub_memory_uncharge(struct mm_struct *mm, unsigned long size,
+		unsigned vm_flags, struct file *vm_file)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return;
+
+	size >>= PAGE_SHIFT;
+
+	if (vm_flags & VM_LOCKED)
+		uncharge_beancounter(ub, UB_LOCKEDPAGES, size);
+	if (VM_UB_PRIVATE(vm_flags, vm_file))
+		uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, size);
+}
+
+int ub_locked_charge(struct mm_struct *mm, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return 0;
+
+	return charge_beancounter(ub, UB_LOCKEDPAGES,
+			size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_locked_uncharge(struct mm_struct *mm, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = mm->mm_ub;
+	if (ub == NULL)
+		return;
+
+	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+int ub_lockedshm_charge(struct shmem_inode_info *shi, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = shi->shmi_ub;
+	if (ub == NULL)
+		return 0;
+
+	return charge_beancounter(ub, UB_LOCKEDPAGES,
+			size >> PAGE_SHIFT, UB_HARD);
+}
+
+void ub_lockedshm_uncharge(struct shmem_inode_info *shi, unsigned long size)
+{
+	struct user_beancounter *ub;
+
+	ub = shi->shmi_ub;
+	if (ub == NULL)
+		return;
+
+	uncharge_beancounter(ub, UB_LOCKEDPAGES, size >> PAGE_SHIFT);
+}
+
+extern int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages);
+
+int ub_enough_memory(struct mm_struct *mm, long pages)
+{
+	struct user_beancounter *ub;
+	struct cgroup_subsys_state *css;
+	unsigned long flags;
+	int ret;
+
+	if (!mm)
+		return 0;
+
+	ub = mm->mm_ub;
+
+	if (ub->ub_parms[UB_PRIVVMPAGES].held >
+	    ub->ub_parms[UB_PRIVVMPAGES].barrier) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (ub == get_ub0() || ub_overcommit_memory)
+		return 0;
+
+	css = ub_get_mem_css(ub);
+	ret = mem_cgroup_enough_memory(mem_cgroup_from_cont(css->cgroup), pages);
+	css_put(css);
+out:
+	if (unlikely(ret < 0)) {
+		spin_lock_irqsave(&ub->ub_lock, flags);
+		ub->ub_parms[UB_PRIVVMPAGES].failcnt++;
+		spin_unlock_irqrestore(&ub->ub_lock, flags);
+	}
+	return ret;
+}
+
+static int bc_fill_sysinfo(struct user_beancounter *ub,
+		unsigned long meminfo_val, struct sysinfo *si)
+{
+	unsigned long used, total;
+	unsigned long totalram, totalswap;
+
+	/* No virtualization */
+	if (meminfo_val == VE_MEMINFO_SYSTEM)
+		return NOTIFY_DONE | NOTIFY_STOP_MASK;
+
+	totalram = si->totalram;
+	totalswap = si->totalswap;
+
+	memset(si, 0, sizeof(*si));
+
+	ub_sync_memcg(ub);
+
+	total = ub->ub_parms[UB_PHYSPAGES].limit;
+	used = ub->ub_parms[UB_PHYSPAGES].held;
+
+	if (total == UB_MAXVALUE)
+		total = totalram;
+
+	si->totalram = total;
+	si->freeram = (total > used ? total - used : 0);
+
+	total = ub->ub_parms[UB_SWAPPAGES].limit;
+	used = ub->ub_parms[UB_SWAPPAGES].held;
+
+	if (total == UB_MAXVALUE)
+		total = totalswap;
+
+	si->totalswap = total;
+	si->freeswap = (total > used ? total - used : 0);
+
+	si->mem_unit = PAGE_SIZE;
+
+	return NOTIFY_OK;
+}
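A distilled sketch of the clamping performed above; held can transiently exceed the limit, which is why freeram uses a clamp rather than a bare subtraction:

```c
static void demo_fill_virt_ram(struct sysinfo *si, unsigned long limit,
			       unsigned long held, unsigned long host_total)
{
	/* UB_MAXVALUE means "unlimited": fall back to the host's figure. */
	unsigned long total = (limit == UB_MAXVALUE) ? host_total : limit;

	si->totalram = total;
	si->freeram = (total > held) ? total - held : 0;
}
```

For example, with a PHYSPAGES limit of 262144 pages (1 GiB at 4 KiB per page) and 98304 pages held, the container sees totalram = 262144 and freeram = 163840 pages.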
+
+extern void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi);
+
+static int bc_fill_meminfo(struct user_beancounter *ub,
+		unsigned long meminfo_val, struct meminfo *mi)
+{
+	struct cgroup_subsys_state *css;
+	int cpu, ret;
+
+	ret = bc_fill_sysinfo(ub, meminfo_val, mi->si);
+	if (ret & NOTIFY_STOP_MASK)
+		goto out;
+
+	css = ub_get_mem_css(ub);
+	mem_cgroup_fill_meminfo(mem_cgroup_from_cont(css->cgroup), mi);
+	css_put(css);
+
+	mi->locked = ub->ub_parms[UB_LOCKEDPAGES].held;
+
+	mi->dirty_pages = __ub_stat_get(ub, dirty_pages);
+	mi->writeback_pages = __ub_stat_get(ub, writeback_pages);
+	for_each_possible_cpu(cpu) {
+		struct ub_percpu_struct *pcpu = ub_percpu(ub, cpu);
+
+		mi->dirty_pages	+= pcpu->dirty_pages;
+		mi->writeback_pages += pcpu->writeback_pages;
+	}
+
+	mi->dirty_pages = max_t(long, 0, mi->dirty_pages);
+	mi->writeback_pages = max_t(long, 0, mi->writeback_pages);
+out:
+	return ret;
+}
+
+static int bc_fill_vmstat(struct user_beancounter *ub, unsigned long *stat)
+{
+	/* FIXME: show swapin/swapout? */
+	return NOTIFY_OK;
+}
+
+static int bc_mem_notify(struct vnotifier_block *self,
+		unsigned long event, void *arg, int old_ret)
+{
+	switch (event) {
+	case VIRTINFO_MEMINFO: {
+		struct meminfo *mi = arg;
+		return bc_fill_meminfo(mi->ub, mi->meminfo_val, mi);
+	}
+	case VIRTINFO_SYSINFO:
+		return bc_fill_sysinfo(get_exec_ub(),
+				get_exec_env()->meminfo_val, arg);
+	case VIRTINFO_VMSTAT:
+		return bc_fill_vmstat(get_exec_ub(), arg);
+	};
+
+	return old_ret;
+}
+
+static struct vnotifier_block bc_mem_notifier_block = {
+	.notifier_call = bc_mem_notify,
+};
+
+static int __init init_vmguar_notifier(void)
+{
+	virtinfo_notifier_register(VITYPE_GENERAL, &bc_mem_notifier_block);
+	return 0;
+}
+
+static void __exit fini_vmguar_notifier(void)
+{
+	virtinfo_notifier_unregister(VITYPE_GENERAL, &bc_mem_notifier_block);
+}
+
+module_init(init_vmguar_notifier);
+module_exit(fini_vmguar_notifier);
+
+#ifdef CONFIG_PROC_FS
+static int bc_vmaux_show(struct seq_file *f, void *v)
+{
+	struct user_beancounter *ub;
+
+	ub = seq_beancounter(f);
+
+	ub_sync_memcg(ub);
+
+	seq_printf(f, bc_proc_lu_fmt, "swapin", ub->swapin);
+	seq_printf(f, bc_proc_lu_fmt, "swapout", ub->swapout);
+
+	seq_printf(f, bc_proc_lu_fmt, "ram", ub->ub_parms[UB_PHYSPAGES].held);
+
+	return 0;
+}
+static struct bc_proc_entry bc_vmaux_entry = {
+	.name = "vmaux",
+	.u.show = bc_vmaux_show,
+};
+
+static int __init bc_vmaux_init(void)
+{
+	bc_register_proc_entry(&bc_vmaux_entry);
+	return 0;
+}
+
+late_initcall(bc_vmaux_init);
+#endif
--- a/kernel/capability.c
+++ b/kernel/capability.c
@@ -16,6 +16,7 @@
 #include <linux/pid_namespace.h>
 #include <linux/user_namespace.h>
 #include <asm/uaccess.h>
+#include <linux/ve.h>
 
 /*
  * Leveraged for setting/resetting capabilities
@@ -396,6 +397,25 @@ bool ns_capable(struct user_namespace *ns, int cap)
 }
 EXPORT_SYMBOL(ns_capable);
 
+#ifdef CONFIG_VE
+bool ve_capable(int cap)
+{
+	struct cred *cred = get_exec_env()->init_cred;
+
+	if (cred == NULL) /* ve isn't running */
+		cred = ve0.init_cred;
+
+	return ns_capable(cred->user_ns, cap);
+}
+#else
+bool ve_capable(int cap)
+{
+	return capable(cap);
+}
+#endif
+
+EXPORT_SYMBOL_GPL(ve_capable);
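A hedged usage sketch: code that must check a capability against the container's user namespace, rather than the calling task's credentials, uses ve_capable() like this (the surrounding hook is hypothetical):

```c
/* Hypothetical permission check running on behalf of a container. */
static long demo_perm_check(void)
{
	/* Evaluated against the VE's init credentials, not current's. */
	if (!ve_capable(CAP_NET_ADMIN))
		return -EPERM;
	return 0;
}
```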
+
 /**
  * file_ns_capable - Determine if the file's opener had a capability in effect
  * @file:  The file we want to check
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -39,7 +39,6 @@
 #include <linux/pagemap.h>
 #include <linux/proc_fs.h>
 #include <linux/rcupdate.h>
-#include <linux/sched.h>
 #include <linux/backing-dev.h>
 #include <linux/seq_file.h>
 #include <linux/slab.h>
@@ -60,12 +59,10 @@
 #include <linux/poll.h>
 #include <linux/flex_array.h> /* used in cgroup_attach_task */
 #include <linux/kthread.h>
+#include <linux/ve.h>
 
 #include <linux/atomic.h>
 
-/* css deactivation bias, makes css->refcnt negative to deny new trygets */
-#define CSS_DEACT_BIAS		INT_MIN
-
 /*
  * cgroup_mutex is the master lock.  Any modification to cgroup or its
  * hierarchy must be performed while holding it.
@@ -215,23 +212,11 @@ static struct cgroup_name root_cgroup_name = { .name = "/" };
  */
 static int need_forkexit_callback __read_mostly;
 
+static void cgroup_offline_fn(struct work_struct *work);
 static int cgroup_destroy_locked(struct cgroup *cgrp);
 static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys,
 			      struct cftype cfts[], bool is_add);
 
-static int css_unbias_refcnt(int refcnt)
-{
-	return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS;
-}
-
-/* the current nr of refs, always >= 0 whether @css is deactivated or not */
-static int css_refcnt(struct cgroup_subsys_state *css)
-{
-	int v = atomic_read(&css->refcnt);
-
-	return css_unbias_refcnt(v);
-}
-
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
 {
@@ -838,7 +823,7 @@ static struct cgroup_name *cgroup_alloc_name(struct dentry *dentry)
 
 static void cgroup_free_fn(struct work_struct *work)
 {
-	struct cgroup *cgrp = container_of(work, struct cgroup, free_work);
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
 	struct cgroup_subsys *ss;
 
 	mutex_lock(&cgroup_mutex);
@@ -883,7 +868,8 @@ static void cgroup_free_rcu(struct rcu_head *head)
 {
 	struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head);
 
-	queue_work(cgroup_destroy_wq, &cgrp->free_work);
+	INIT_WORK(&cgrp->destroy_work, cgroup_free_fn);
+	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
 }
 
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
@@ -1103,6 +1089,7 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	if (strlen(root->release_agent_path))
 		seq_show_option(seq, "release_agent",
 				root->release_agent_path);
+
 	if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags))
 		seq_puts(seq, ",clone_children");
 	if (strlen(root->name))
@@ -1111,19 +1098,6 @@ static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry)
 	return 0;
 }
 
-struct cgroup_sb_opts {
-	unsigned long subsys_mask;
-	unsigned long flags;
-	char *release_agent;
-	bool cpuset_clone_children;
-	char *name;
-	/* User explicitly requested empty subsystem */
-	bool none;
-
-	struct cgroupfs_root *new_root;
-
-};
-
 /*
  * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
  * with cgroup_mutex held to protect the subsys[] array. This function takes
@@ -1400,10 +1374,60 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	return ret;
 }
 
+#ifdef CONFIG_VE
+static int cgroup_show_path(struct seq_file *m, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct cgroup *cgrp = __d_cgrp(dentry);
+	char *buf;
+	int ret;
+
+	/*
+	 * The dentry may refer to a cgroup file, not only to a
+	 * directory, so in that case use the parent's cgroup to
+	 * build the directory path, appending the dentry name at
+	 * the end.
+	 */
+	if (!inode || !S_ISDIR(inode->i_mode))
+		cgrp = __d_cgrp(dentry->d_parent);
+
+	ret = -ENOMEM;
+	buf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!buf)
+		goto out;
+
+	ret = cgroup_path_ve(cgrp, buf, PATH_MAX);
+	if (ret < 0)
+		goto out_free;
+
+	ret = seq_puts(m, buf);
+	if (ret < 0)
+		goto out_free;
+
+	if (!inode || !S_ISDIR(inode->i_mode)) {
+		if (buf[1] != '\0') {
+			ret = seq_putc(m, '/');
+			if (ret < 0)
+				goto out_free;
+		}
+
+		ret = seq_puts(m, dentry->d_name.name);
+		if (ret < 0)
+			goto out_free;
+	}
+out_free:
+	kfree(buf);
+out:
+	return ret;
+}
+#endif
+
 static const struct super_operations cgroup_ops = {
 	.statfs = simple_statfs,
 	.drop_inode = generic_delete_inode,
 	.show_options = cgroup_show_options,
+#ifdef CONFIG_VE
+	.show_path = cgroup_show_path,
+#endif
 	.remount_fs = cgroup_remount,
 };
 
@@ -1416,7 +1440,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp)
 	INIT_LIST_HEAD(&cgrp->allcg_node);
 	INIT_LIST_HEAD(&cgrp->release_list);
 	INIT_LIST_HEAD(&cgrp->pidlists);
-	INIT_WORK(&cgrp->free_work, cgroup_free_fn);
 	mutex_init(&cgrp->pidlist_mutex);
 	INIT_LIST_HEAD(&cgrp->event_list);
 	spin_lock_init(&cgrp->event_list_lock);
@@ -1586,10 +1609,23 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type,
 	struct cgroupfs_root *new_root;
 	struct inode *inode;
 
+#ifdef CONFIG_VE
+	if (!ve_is_super(get_exec_env()) && !(flags & MS_KERNMOUNT)) {
+		if (!get_exec_env()->is_pseudosuper)
+			return ERR_PTR(-EACCES);
+	}
+#endif
+
 	/* First find the desired set of subsystems */
-	mutex_lock(&cgroup_mutex);
-	ret = parse_cgroupfs_options(data, &opts);
-	mutex_unlock(&cgroup_mutex);
+	if (!(flags & MS_KERNMOUNT)) {
+		mutex_lock(&cgroup_mutex);
+		ret = parse_cgroupfs_options(data, &opts);
+		mutex_unlock(&cgroup_mutex);
+	} else {
+		opts = *(struct cgroup_sb_opts *)data;
+		opts.name = kstrdup(opts.name, GFP_KERNEL);
+		opts.release_agent = kstrdup(opts.release_agent, GFP_KERNEL);
+	}
 	if (ret)
 		goto out_err;
 
@@ -1781,6 +1817,7 @@ static struct file_system_type cgroup_fs_type = {
 	.name = "cgroup",
 	.mount = cgroup_mount,
 	.kill_sb = cgroup_kill_sb,
+	.fs_flags = FS_VIRTUALIZED,
 };
 
 static struct kobject *cgroup_kobj;
@@ -1798,7 +1835,8 @@ static struct kobject *cgroup_kobj;
  * inode's i_mutex, while on the other hand cgroup_path() can be called
  * with some irq-safe spinlocks held.
  */
-int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+static int __cgroup_path(const struct cgroup *cgrp, char *buf, int buflen,
+			 bool virt)
 {
 	int ret = -ENAMETOOLONG;
 	char *start;
@@ -1817,6 +1855,22 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 		const char *name = cgroup_name(cgrp);
 		int len;
 
+#ifdef CONFIG_VE
+		if (virt && test_bit(CGRP_VE_ROOT, &cgrp->flags)) {
+			/*
+			 * Container cgroups are bind-mounted from the
+			 * node, so from inside they look like '/'; thus
+			 * we have to mangle the cgroup path output.
+			 */
+			if (*start != '/') {
+				if (--start < buf)
+					goto out;
+				*start = '/';
+			}
+			break;
+		}
+#endif
+
 		len = strlen(name);
 		if ((start -= len) < buf)
 			goto out;
@@ -1834,8 +1888,18 @@ out:
 	rcu_read_unlock();
 	return ret;
 }
+
+int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
+{
+	return __cgroup_path(cgrp, buf, buflen, false);
+}
 EXPORT_SYMBOL_GPL(cgroup_path);
 
+int cgroup_path_ve(const struct cgroup *cgrp, char *buf, int buflen)
+{
+	return __cgroup_path(cgrp, buf, buflen, !ve_is_super(get_exec_env()));
+}
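A worked example of the mangling (paths hypothetical): for a cgroup living under a container's VE root on the node, the two helpers report different views:

```c
static void demo_show_both_views(const struct cgroup *cgrp)
{
	char buf[PATH_MAX];

	/* Node view, e.g. "/machine.slice/100/docker/abc". */
	cgroup_path(cgrp, buf, sizeof(buf));

	/*
	 * Inside the CT the walk stops at the CGRP_VE_ROOT ancestor,
	 * yielding "/docker/abc"; the VE root itself reads as "/".
	 */
	cgroup_path_ve(cgrp, buf, sizeof(buf));
}
```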
+
 /*
  * Control Group taskset
  */
@@ -2228,10 +2292,13 @@ static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft,
 				      const char *buffer)
 {
 	BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX);
+
 	if (strlen(buffer) >= PATH_MAX)
 		return -EINVAL;
+
 	if (!cgroup_lock_live_group(cgrp))
 		return -ENODEV;
+
 	mutex_lock(&cgroup_root_mutex);
 	strcpy(cgrp->root->release_agent_path, buffer);
 	mutex_unlock(&cgroup_root_mutex);
@@ -2334,6 +2401,31 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
 	struct cftype *cft = __d_cft(file->f_dentry);
 	struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
+#ifdef CONFIG_VE
+	/*
+	 * For the sake of Docker we might bind-mount cgroups so
+	 * that they look like
+	 *
+	 * Node				Container
+	 * /sys/fs/cgroup/memory/CTID	/sys/fs/cgroup/memory
+	 *
+	 * but we must not allow modifying these top-level
+	 * cgroups, only nested ones, because the top level
+	 * carries the container's resource limits, settings, etc.
+	 *
+	 * At the same time, the ve cgroup must be writable during
+	 * container startup (to modify the @ve.state entry which
+	 * kicks the container into running), but once the ve is up
+	 * and running, userspace from ve0 should *never* bind-mount
+	 * it inside a container FS.
+	 */
+	if (!ve_is_super(get_exec_env())
+	    && test_bit(CGRP_VE_ROOT, &cgrp->flags)
+	    && !get_exec_env()->is_pseudosuper
+	    && !(cft->flags & CFTYPE_VE_WRITABLE))
+		return -EPERM;
+#endif
+
 	if (cgroup_is_removed(cgrp))
 		return -ENODEV;
 	if (cft->write)
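An illustration of the resulting userspace behaviour, assuming the Docker-style bind mount from the comment above and a caller inside a CT that is not pseudosuper:

```c
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	/* VE-root file without CFTYPE_VE_WRITABLE: write fails with EPERM. */
	int fd = open("/sys/fs/cgroup/memory/memory.limit_in_bytes", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1073741824", 10);
		close(fd);
	}

	/* "tasks" is marked CFTYPE_VE_WRITABLE, so this write is allowed. */
	fd = open("/sys/fs/cgroup/memory/tasks", O_WRONLY);
	if (fd >= 0) {
		write(fd, "1", 1);
		close(fd);
	}
	return 0;
}
```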
@@ -2880,6 +2972,7 @@ int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts)
 	cgroup_cfts_commit(ss, NULL, false);
 	return -ENOENT;
 }
+EXPORT_SYMBOL_GPL(cgroup_rm_cftypes);
 
 /**
  * cgroup_task_count - count the number of tasks in a cgroup.
@@ -3973,6 +4066,22 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 	return 0;
 }
 
+static u64 cgroup_read_subgroups_limit(struct cgroup *cgrp,
+				struct cftype *cft)
+{
+	return cgrp->subgroups_limit;
+}
+static int cgroup_write_subgroups_limit(struct cgroup *cgrp,
+					struct cftype *cft,
+					u64 val)
+{
+	if (!test_bit(CGRP_VE_ROOT, &cgrp->flags))
+		return -EACCES;
+
+	cgrp->subgroups_limit = val;
+	return 0;
+}
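A usage sketch from the node side (the cgroup path is hypothetical; per the check above, writes to anything but a VE root fail with EACCES, and 0 disables the limit):

```c
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/sys/fs/cgroup/memory/machine.slice/100/"
		      "cgroup.subgroups_limit", O_WRONLY);
	if (fd >= 0) {
		/* Cap the CT at 512 descendant cgroups. */
		write(fd, "512", 3);
		close(fd);
	}
	return 0;
}
```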
+
 /*
  * for the common functions, 'private' gives the type of file
  */
@@ -3981,6 +4090,7 @@ static int cgroup_clone_children_write(struct cgroup *cgrp,
 static struct cftype files[] = {
 	{
 		.name = "tasks",
+		.flags = CFTYPE_VE_WRITABLE,
 		.open = cgroup_tasks_open,
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_pidlist_release,
@@ -3988,6 +4098,7 @@ static struct cftype files[] = {
 	},
 	{
 		.name = CGROUP_FILE_GENERIC_PREFIX "procs",
+		.flags = CFTYPE_VE_WRITABLE,
 		.open = cgroup_procs_open,
 		.write_u64 = cgroup_procs_write,
 		.release = cgroup_pidlist_release,
@@ -4021,6 +4132,12 @@ static struct cftype files[] = {
 		.write_string = cgroup_release_agent_write,
 		.max_write_len = PATH_MAX,
 	},
+	{
+		.name = "cgroup.subgroups_limit",
+		.read_u64 = cgroup_read_subgroups_limit,
+		.write_u64 = cgroup_write_subgroups_limit,
+		.mode = S_IRUGO | S_IWUSR,
+	},
 	{ }	/* terminate */
 };
 
@@ -4072,15 +4189,23 @@ static void css_dput_fn(struct work_struct *work)
 	struct cgroup_subsys_state *css =
 		container_of(work, struct cgroup_subsys_state, dput_work);
 
+	percpu_ref_exit(&css->refcnt);
 	cgroup_dput(css->cgroup);
 }
 
+static void css_release(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	queue_work(cgroup_destroy_wq, &css->dput_work);
+}
+
 static void init_cgroup_css(struct cgroup_subsys_state *css,
 			       struct cgroup_subsys *ss,
 			       struct cgroup *cgrp)
 {
 	css->cgroup = cgrp;
-	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
 	css->id = NULL;
 	if (cgrp == dummytop)
@@ -4128,6 +4253,56 @@ static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp)
 	cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE;
 }
 
+static int subgroups_count(struct cgroup *cgroup)
+{
+	struct cgroup *pos;
+	int cgrps_count = 0;
+
+	rcu_read_lock();
+	cgroup_for_each_descendant_post(pos, cgroup)
+		cgrps_count++;
+	rcu_read_unlock();
+
+	return cgrps_count;
+}
+
+#ifdef CONFIG_VE
+void cgroup_mark_ve_root(struct ve_struct *ve)
+{
+	struct cgroup *cgrp;
+	struct cgroupfs_root *root;
+
+	mutex_lock(&cgroup_mutex);
+	for_each_active_root(root) {
+		cgrp = task_cgroup_from_root(ve->init_task, root);
+		set_bit(CGRP_VE_ROOT, &cgrp->flags);
+	}
+	mutex_unlock(&cgroup_mutex);
+}
+
+static struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
+{
+	struct cgroup *ve_root = NULL;
+
+	rcu_read_lock();
+	do {
+		if (test_bit(CGRP_VE_ROOT, &cgrp->flags)) {
+			ve_root = cgrp;
+			break;
+		}
+		cgrp = cgrp->parent;
+	} while (cgrp);
+	rcu_read_unlock();
+
+	return ve_root;
+}
+#else
+static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
+{
+	return NULL;
+}
+#endif
+
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
@@ -4145,6 +4320,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	int err = 0;
 	struct cgroup_subsys *ss;
 	struct super_block *sb = root->sb;
+	struct cgroup *ve_root = parent;
 
 	/* allocate the cgroup and its ID, 0 is reserved for the root */
 	cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL);
@@ -4160,6 +4336,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 	if (cgrp->id < 0)
 		goto err_free_name;
 
+	ve_root = cgroup_get_ve_root(parent);
+	if (ve_root && ve_root->subgroups_limit > 0 &&
+			subgroups_count(ve_root) >= ve_root->subgroups_limit) {
+		err = -EACCES;
+		goto err_free_name;
+	}
+
 	/*
 	 * Only live parents can have children.  Note that the liveliness
 	 * check isn't strictly necessary because cgroup_mkdir() and
@@ -4201,7 +4384,13 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			err = PTR_ERR(css);
 			goto err_free_all;
 		}
+
+		err = percpu_ref_init(&css->refcnt, css_release, 0, GFP_KERNEL);
+		if (err)
+			goto err_free_all;
+
 		init_cgroup_css(css, ss, cgrp);
+
 		if (ss->use_id) {
 			err = alloc_css_id(ss, parent, cgrp);
 			if (err)
@@ -4258,8 +4447,12 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
 err_free_all:
 	for_each_subsys(root, ss) {
-		if (cgrp->subsys[ss->subsys_id])
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+
+		if (css) {
+			percpu_ref_exit(&css->refcnt);
 			ss->css_free(cgrp);
+		}
 	}
 	mutex_unlock(&cgroup_mutex);
 	/* Release the reference count that we took on the superblock */
@@ -4287,63 +4480,122 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
 	return cgroup_create(c_parent, dentry, mode | S_IFDIR);
 }
 
+static void cgroup_css_killed(struct cgroup *cgrp)
+{
+	if (!atomic_dec_and_test(&cgrp->css_kill_cnt))
+		return;
+
+	/* percpu ref's of all css's are killed, kick off the next step */
+	INIT_WORK(&cgrp->destroy_work, cgroup_offline_fn);
+	queue_work(cgroup_destroy_wq, &cgrp->destroy_work);
+}
+
+static void css_ref_killed_fn(struct percpu_ref *ref)
+{
+	struct cgroup_subsys_state *css =
+		container_of(ref, struct cgroup_subsys_state, refcnt);
+
+	cgroup_css_killed(css->cgroup);
+}
+
+/**
+ * cgroup_destroy_locked - the first stage of cgroup destruction
+ * @cgrp: cgroup to be destroyed
+ *
+ * css's make use of percpu refcnts whose killing latency shouldn't be
+ * exposed to userland and are RCU protected.  Also, cgroup core needs to
+ * guarantee that css_tryget() won't succeed by the time ->css_offline() is
+ * invoked.  To satisfy all the requirements, destruction is implemented in
+ * the following two steps.
+ *
+ * s1. Verify @cgrp can be destroyed and mark it dying.  Remove all
+ *     userland visible parts and start killing the percpu refcnts of
+ *     css's.  Set up so that the next stage will be kicked off once all
+ *     the percpu refcnts are confirmed to be killed.
+ *
+ * s2. Invoke ->css_offline(), mark the cgroup dead and proceed with the
+ *     rest of destruction.  Once all cgroup references are gone, the
+ *     cgroup is RCU-freed.
+ *
+ * This function implements s1.  After this step, @cgrp is gone as far as
+ * the userland is concerned and a new cgroup with the same name may be
+ * created.  As cgroup doesn't care about the names internally, this
+ * doesn't cause any problem.
+ */
 static int cgroup_destroy_locked(struct cgroup *cgrp)
 	__releases(&cgroup_mutex) __acquires(&cgroup_mutex)
 {
 	struct dentry *d = cgrp->dentry;
-	struct cgroup *parent = cgrp->parent;
 	struct cgroup_event *event, *tmp;
 	struct cgroup_subsys *ss;
+	struct cgroup *child;
+	bool empty;
 
 	lockdep_assert_held(&d->d_inode->i_mutex);
 	lockdep_assert_held(&cgroup_mutex);
 
-	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children))
+	if (atomic_read(&cgrp->count))
 		return -EBUSY;
 
 	/*
-	 * Block new css_tryget() by deactivating refcnt and mark @cgrp
-	 * removed.  This makes future css_tryget() and child creation
-	 * attempts fail thus maintaining the removal conditions verified
-	 * above.
+	 * Make sure there's no live children.  We can't test ->children
+	 * emptiness as dead children linger on it while being destroyed;
+	 * otherwise, "rmdir parent/child parent" may fail with -EBUSY.
+	 */
+	empty = true;
+	rcu_read_lock();
+	list_for_each_entry_rcu(child, &cgrp->children, sibling) {
+		empty = cgroup_is_removed(child);
+		if (!empty)
+			break;
+	}
+	rcu_read_unlock();
+	if (!empty)
+		return -EBUSY;
+
+	/*
+	 * Block new css_tryget() by killing css refcnts.  cgroup core
+	 * guarantees that, by the time ->css_offline() is invoked, no new
+	 * css reference will be given out via css_tryget().  We can't
+	 * simply call percpu_ref_kill() and proceed to offlining css's
+	 * because percpu_ref_kill() doesn't guarantee that the ref is seen
+	 * as killed on all CPUs on return.
+	 *
+	 * Use percpu_ref_kill_and_confirm() to get notifications as each
+	 * css is confirmed to be seen as killed on all CPUs.  The
+	 * notification callback keeps track of the number of css's to be
+	 * killed and schedules cgroup_offline_fn() to perform the rest of
+	 * destruction once the percpu refs of all css's are confirmed to
+	 * be killed.
 	 */
+	atomic_set(&cgrp->css_kill_cnt, 1);
 	for_each_subsys(cgrp->root, ss) {
 		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
 
-		WARN_ON(atomic_read(&css->refcnt) < 0);
-		atomic_add(CSS_DEACT_BIAS, &css->refcnt);
-	}
-	set_bit(CGRP_REMOVED, &cgrp->flags);
+		/*
+		 * Killing would put the base ref, but we need to keep it
+		 * alive until after ->css_offline.
+		 */
+		percpu_ref_get(&css->refcnt);
 
-	/* tell subsystems to initate destruction */
-	for_each_subsys(cgrp->root, ss)
-		offline_css(ss, cgrp);
+		atomic_inc(&cgrp->css_kill_cnt);
+		percpu_ref_kill_and_confirm(&css->refcnt, css_ref_killed_fn);
+	}
+	cgroup_css_killed(cgrp);
 
-	/*
-	 * Put all the base refs.  Each css holds an extra reference to the
-	 * cgroup's dentry and cgroup removal proceeds regardless of css
-	 * refs.  On the last put of each css, whenever that may be, the
-	 * extra dentry ref is put so that dentry destruction happens only
-	 * after all css's are released.
-	 */
-	for_each_subsys(cgrp->root, ss)
-		css_put(cgrp->subsys[ss->subsys_id]);
+	set_bit(CGRP_REMOVED, &cgrp->flags);
 
 	raw_spin_lock(&release_list_lock);
 	if (!list_empty(&cgrp->release_list))
 		list_del_init(&cgrp->release_list);
 	raw_spin_unlock(&release_list_lock);
 
-	/* delete this cgroup from parent->children */
-	list_del_rcu(&cgrp->sibling);
-	list_del_init(&cgrp->allcg_node);
-
+	/*
+	 * Remove @cgrp directory.  The removal puts the base ref but we
+	 * aren't quite done with @cgrp yet, so hold onto it.
+	 */
 	dget(d);
 	cgroup_d_remove_dir(d);
-	dput(d);
-
-	set_bit(CGRP_RELEASABLE, &parent->flags);
-	check_for_release(parent);
 
 	/*
 	 * Unregister events and notify userspace.
@@ -4358,6 +4610,54 @@ static int cgroup_destroy_locked(struct cgroup *cgrp)
 	spin_unlock(&cgrp->event_list_lock);
 
 	return 0;
+}
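The css_kill_cnt handshake used above is a small completion-barrier idiom: bias the counter by one for the dispatcher, add one per outstanding confirmation, and let whoever performs the final decrement kick off the next stage. A distilled sketch (the two helpers are hypothetical):

```c
#include <linux/atomic.h>

static atomic_t pending;

static void next_stage(void);			   /* hypothetical: stage two */
static void kick_async(int i, void (*done)(void)); /* hypothetical */

static void one_done(void)
{
	/* The final decrement, wherever it lands, kicks stage two. */
	if (atomic_dec_and_test(&pending))
		next_stage();
}

static void start(int nr)
{
	int i;

	atomic_set(&pending, 1);	/* bias: hold stage two while dispatching */
	for (i = 0; i < nr; i++) {
		atomic_inc(&pending);
		kick_async(i, one_done);
	}
	one_done();			/* drop the bias */
}
```

The bias guarantees next_stage() cannot run before the dispatch loop has finished, even if every callback completes immediately.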
+
+/**
+ * cgroup_offline_fn - the second step of cgroup destruction
+ * @work: cgroup->destroy_free_work
+ *
+ * This function is invoked from a work item for a cgroup which is being
+ * destroyed after the percpu refcnts of all css's are guaranteed to be
+ * seen as killed on all CPUs, and performs the rest of destruction.  This
+ * is the second step of destruction described in the comment above
+ * cgroup_destroy_locked().
+ */
+static void cgroup_offline_fn(struct work_struct *work)
+{
+	struct cgroup *cgrp = container_of(work, struct cgroup, destroy_work);
+	struct cgroup *parent = cgrp->parent;
+	struct dentry *d = cgrp->dentry;
+	struct cgroup_subsys *ss;
+
+	mutex_lock(&cgroup_mutex);
+
+	/*
+	 * css_tryget() is guaranteed to fail now.  Tell subsystems to
+	 * initiate destruction.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		offline_css(ss, cgrp);
+
+	/*
+	 * Put the css refs from cgroup_destroy_locked().  Each css holds
+	 * an extra reference to the cgroup's dentry and cgroup removal
+	 * proceeds regardless of css refs.  On the last put of each css,
+	 * whenever that may be, the extra dentry ref is put so that dentry
+	 * destruction happens only after all css's are released.
+	 */
+	for_each_subsys(cgrp->root, ss)
+		css_put(cgrp->subsys[ss->subsys_id]);
+
+	/* delete this cgroup from parent->children */
+	list_del_rcu(&cgrp->sibling);
+	list_del_init(&cgrp->allcg_node);
+
+	dput(d);
+
+	set_bit(CGRP_RELEASABLE, &parent->flags);
+	check_for_release(parent);
+
+	mutex_unlock(&cgroup_mutex);
 }
 
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
@@ -4693,7 +4993,7 @@ int __init cgroup_init(void)
 		goto out;
 	}
 
-	proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations);
+	proc_create("cgroups", S_ISVTX, NULL, &proc_cgroupstats_operations);
 
 out:
 	if (err)
@@ -4718,6 +5018,13 @@ static int __init cgroup_wq_init(void)
 }
 core_initcall(cgroup_wq_init);
 
+static int ve_hide_cgroups(struct cgroupfs_root *root)
+{
+	/* Hide cpuset cgroup in CT for docker */
+	return !ve_is_super(get_exec_env())
+	       && (root->subsys_mask & (1UL << cpuset_subsys_id));
+}
+
 /*
  * proc_cgroup_show()
  *  - Print task's cgroup paths into seq_file, one line for each hierarchy
@@ -4759,6 +5066,8 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 		struct cgroup *cgrp;
 		int count = 0;
 
+		if (ve_hide_cgroups(root))
+			continue;
 		seq_printf(m, "%d:", root->hierarchy_id);
 		for_each_subsys(root, ss)
 			seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -4767,7 +5076,7 @@ int proc_cgroup_show(struct seq_file *m, void *v)
 				   root->name);
 		seq_putc(m, ':');
 		cgrp = task_cgroup_from_root(tsk, root);
-		retval = cgroup_path(cgrp, buf, PAGE_SIZE);
+		retval = cgroup_path_ve(cgrp, buf, PAGE_SIZE);
 		if (retval < 0)
 			goto out_unlock;
 		seq_puts(m, buf);
@@ -4783,6 +5092,8 @@ out:
 	return retval;
 }
 
+#define _cg_virtualized(x) ((ve_is_super(get_exec_env())) ? (x) : 1)
+
 /* Display information about each subsystem and each hierarchy */
 static int proc_cgroupstats_show(struct seq_file *m, void *v)
 {
@@ -4797,11 +5108,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
 	mutex_lock(&cgroup_mutex);
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
+		int num;
+
 		if (ss == NULL)
 			continue;
+		if (ve_hide_cgroups(ss->root))
+			continue;
+		num = _cg_virtualized(ss->root->number_of_cgroups);
 		seq_printf(m, "%s\t%d\t%d\t%d\n",
 			   ss->name, ss->root->hierarchy_id,
-			   ss->root->number_of_cgroups, !ss->disabled);
+			   num, !ss->disabled);
 	}
 	mutex_unlock(&cgroup_mutex);
 	return 0;
@@ -5077,34 +5393,6 @@ static void check_for_release(struct cgroup *cgrp)
 	}
 }
 
-/* Caller must verify that the css is not for root cgroup */
-bool __css_tryget(struct cgroup_subsys_state *css)
-{
-	while (true) {
-		int t, v;
-
-		v = css_refcnt(css);
-		t = atomic_cmpxchg(&css->refcnt, v, v + 1);
-		if (likely(t == v))
-			return true;
-		else if (t < 0)
-			return false;
-		cpu_relax();
-	}
-}
-EXPORT_SYMBOL_GPL(__css_tryget);
-
-/* Caller must verify that the css is not for root cgroup */
-void __css_put(struct cgroup_subsys_state *css)
-{
-	int v;
-
-	v = css_unbias_refcnt(atomic_dec_return(&css->refcnt));
-	if (v == 0)
-		queue_work(cgroup_destroy_wq, &css->dput_work);
-}
-EXPORT_SYMBOL_GPL(__css_put);
-
 /*
  * Notify userspace when a cgroup is released, by running the
  * configured release agent with the name of the cgroup (path
@@ -5135,7 +5423,7 @@ static void cgroup_release_agent(struct work_struct *work)
 	raw_spin_lock(&release_list_lock);
 	while (!list_empty(&release_list)) {
 		char *argv[3], *envp[3];
-		int i;
+		int i, err;
 		char *pathbuf = NULL, *agentbuf = NULL;
 		struct cgroup *cgrp = list_entry(release_list.next,
 						    struct cgroup,
@@ -5166,7 +5454,12 @@ static void cgroup_release_agent(struct work_struct *work)
 		 * since the exec could involve hitting disk and hence
 		 * be a slow process */
 		mutex_unlock(&cgroup_mutex);
-		call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		err = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
+		if (err < 0)
+			pr_warn_ratelimited("cgroup release_agent "
+					    "%s %s failed: %d\n",
+					    agentbuf, pathbuf, err);
+
 		mutex_lock(&cgroup_mutex);
  continue_free:
 		kfree(pathbuf);
@@ -5224,7 +5517,7 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 	 * on this or this is under rcu_read_lock(). Once css->id is allocated,
 	 * it's unchanged until freed.
 	 */
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
+	cssid = rcu_dereference_raw(css->id);
 
 	if (cssid)
 		return cssid->id;
@@ -5232,18 +5525,6 @@ unsigned short css_id(struct cgroup_subsys_state *css)
 }
 EXPORT_SYMBOL_GPL(css_id);
 
-unsigned short css_depth(struct cgroup_subsys_state *css)
-{
-	struct css_id *cssid;
-
-	cssid = rcu_dereference_check(css->id, css_refcnt(css));
-
-	if (cssid)
-		return cssid->depth;
-	return 0;
-}
-EXPORT_SYMBOL_GPL(css_depth);
-
 /**
  *  css_is_ancestor - test "root" css is an ancestor of "child"
  * @child: the css to be tested.
@@ -5569,3 +5850,85 @@ struct cgroup_subsys debug_subsys = {
 	.base_cftypes = debug_files,
 };
 #endif /* CONFIG_CGROUP_DEBUG */
+
+
+struct vfsmount *cgroup_kernel_mount(struct cgroup_sb_opts *opts)
+{
+	return kern_mount_data(&cgroup_fs_type, opts);
+}
+
+struct cgroup *cgroup_get_root(struct vfsmount *mnt)
+{
+	return mnt->mnt_root->d_fsdata;
+}
+
+struct cgroup *cgroup_kernel_lookup(struct vfsmount *mnt,
+				    const char *pathname)
+{
+	int err;
+	struct path path;
+	struct dentry *dentry;
+	struct cgroup *cgrp;
+
+	err = vfs_path_lookup(mnt->mnt_root, mnt, pathname,
+			      LOOKUP_DIRECTORY, &path);
+	if (err)
+		return ERR_PTR(err);
+	dentry = path.dentry;
+	if (dentry->d_inode) {
+		cgrp = __d_cgrp(dentry);
+		atomic_inc(&cgrp->count);
+	} else
+		cgrp = ERR_PTR(-ENOENT);
+	path_put(&path);
+	return cgrp;
+}
+
+struct cgroup *cgroup_kernel_open(struct cgroup *parent,
+		enum cgroup_open_flags flags, const char *name)
+{
+	struct dentry *dentry;
+	struct cgroup *cgrp;
+	int ret = 0;
+
+	mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT);
+	dentry = lookup_one_len(name, parent->dentry, strlen(name));
+	cgrp = ERR_CAST(dentry);
+	if (IS_ERR(dentry))
+		goto out;
+
+	if (flags & CGRP_CREAT) {
+		if ((flags & CGRP_EXCL) && dentry->d_inode)
+			ret = -EEXIST;
+		else if (!dentry->d_inode)
+			ret = vfs_mkdir(parent->dentry->d_inode, dentry, 0755);
+	}
+	if (!ret && dentry->d_inode) {
+		cgrp = __d_cgrp(dentry);
+		atomic_inc(&cgrp->count);
+	} else
+		cgrp = ret ? ERR_PTR(ret) : NULL;
+	dput(dentry);
+out:
+	mutex_unlock(&parent->dentry->d_inode->i_mutex);
+	return cgrp;
+}
+
+int cgroup_kernel_attach(struct cgroup *cgrp, struct task_struct *tsk)
+{
+	int ret;
+
+	if (!cgroup_lock_live_group(cgrp))
+		return -ENODEV;
+	ret = cgroup_attach_task(cgrp, tsk, true);
+	mutex_unlock(&cgroup_mutex);
+	return ret;
+}
+
+void cgroup_kernel_close(struct cgroup *cgrp)
+{
+	if (atomic_dec_and_test(&cgrp->count)) {
+		set_bit(CGRP_RELEASABLE, &cgrp->flags);
+		check_for_release(cgrp);
+	}
+}
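A hedged sketch of how VE code is expected to drive this kernel-internal API (the CTID name is hypothetical, error handling is simplified, and the mount is assumed to come from cgroup_kernel_mount()):

```c
static int demo_attach(struct vfsmount *mnt, struct task_struct *tsk)
{
	struct cgroup *parent, *cgrp;
	int err;

	parent = cgroup_get_root(mnt);
	cgrp = cgroup_kernel_open(parent, CGRP_CREAT, "100");
	if (IS_ERR_OR_NULL(cgrp))
		return cgrp ? PTR_ERR(cgrp) : -ENOENT;

	err = cgroup_kernel_attach(cgrp, tsk);
	cgroup_kernel_close(cgrp);	/* drop the reference open took */
	return err;
}
```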
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -875,14 +875,10 @@ static void update_tasks_cpumask(struct cpuset *cs, struct ptr_heap *heap)
 	cgroup_scan_tasks(&scan);
 }
 
-/**
- * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
- * @cs: the cpuset to consider
- * @buf: buffer of cpu numbers written to this cpuset
- */
-static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
-			  const char *buf)
+static int __update_cpumask(struct cpuset *cs,
+			    const struct cpumask *cpus_allowed)
 {
+	struct cpuset *trialcs;
 	struct ptr_heap heap;
 	int retval;
 	int is_load_balanced;
@@ -891,33 +887,26 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 	if (cs == &top_cpuset)
 		return -EACCES;
 
-	/*
-	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
-	 * Since cpulist_parse() fails on an empty mask, we special case
-	 * that parsing.  The validate_change() call ensures that cpusets
-	 * with tasks have cpus.
-	 */
-	if (!*buf) {
-		cpumask_clear(trialcs->cpus_allowed);
-	} else {
-		retval = cpulist_parse(buf, trialcs->cpus_allowed);
-		if (retval < 0)
-			return retval;
-
-		if (!cpumask_subset(trialcs->cpus_allowed, cpu_active_mask))
-			return -EINVAL;
-	}
+	if (!cpumask_subset(cpus_allowed, cpu_active_mask))
+		return -EINVAL;
+
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs)
+		return -ENOMEM;
+
+	cpumask_copy(trialcs->cpus_allowed, cpus_allowed);
+
 	retval = validate_change(cs, trialcs);
 	if (retval < 0)
-		return retval;
+		goto done;
 
 	/* Nothing to do if the cpus didn't change */
 	if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
-		return 0;
+		goto done;
 
 	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
 	if (retval)
-		return retval;
+		goto done;
 
 	is_load_balanced = is_sched_load_balance(trialcs);
 
@@ -935,7 +924,42 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 
 	if (is_load_balanced)
 		rebuild_sched_domains_locked();
-	return 0;
+
+	retval = 0;
+done:
+	free_trial_cpuset(trialcs);
+	return retval;
+}
+
+/**
+ * update_cpumask - update the cpus_allowed mask of a cpuset and all tasks in it
+ * @cs: the cpuset to consider
+ * @buf: buffer of cpu numbers written to this cpuset
+ */
+static int update_cpumask(struct cpuset *cs, const char *buf)
+{
+	cpumask_var_t cpus_allowed;
+	int retval = 0;
+
+	if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * An empty cpus_allowed is ok only if the cpuset has no tasks.
+	 * Since cpulist_parse() fails on an empty mask, we special case
+	 * that parsing.  The validate_change() call ensures that cpusets
+	 * with tasks have cpus.
+	 */
+	if (!*buf)
+		cpumask_clear(cpus_allowed);
+	else
+		retval = cpulist_parse(buf, cpus_allowed);
+
+	if (retval == 0)
+		retval = __update_cpumask(cs, cpus_allowed);
+
+	free_cpumask_var(cpus_allowed);
+	return retval;
 }
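For reference, cpulist_parse() accepts the usual range syntax, so after this split a write of "0-3,8" reaches __update_cpumask() as a ready-made mask. A minimal sketch of the parsing half in isolation:

```c
static int demo_parse_cpus(const char *buf)
{
	cpumask_var_t mask;
	int ret = 0;

	if (!alloc_cpumask_var(&mask, GFP_KERNEL))
		return -ENOMEM;

	if (!*buf)
		cpumask_clear(mask);		/* empty write clears the set */
	else
		ret = cpulist_parse(buf, mask);	/* e.g. "0-3,8" */

	free_cpumask_var(mask);
	return ret;
}
```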
 
 /*
@@ -1103,9 +1126,10 @@ static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
  * lock each such tasks mm->mmap_sem, scan its vma's and rebind
  * their mempolicies to the cpusets new mems_allowed.
  */
-static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
-			   const char *buf)
+static int __update_nodemask(struct cpuset *cs,
+			   const nodemask_t *mems_allowed)
 {
+	struct cpuset *trialcs = NULL;
 	NODEMASK_ALLOC(nodemask_t, oldmem, GFP_KERNEL);
 	int retval;
 	struct ptr_heap heap;
@@ -1122,25 +1146,19 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 		goto done;
 	}
 
-	/*
-	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
-	 * Since nodelist_parse() fails on an empty mask, we special case
-	 * that parsing.  The validate_change() call ensures that cpusets
-	 * with tasks have memory.
-	 */
-	if (!*buf) {
-		nodes_clear(trialcs->mems_allowed);
-	} else {
-		retval = nodelist_parse(buf, trialcs->mems_allowed);
-		if (retval < 0)
-			goto done;
+	if (!nodes_subset(*mems_allowed, node_states[N_MEMORY])) {
+		retval = -EINVAL;
+		goto done;
+	}
 
-		if (!nodes_subset(trialcs->mems_allowed,
-				node_states[N_MEMORY])) {
-			retval =  -EINVAL;
-			goto done;
-		}
+	trialcs = alloc_trial_cpuset(cs);
+	if (!trialcs) {
+		retval = -ENOMEM;
+		goto done;
 	}
+
+	trialcs->mems_allowed = *mems_allowed;
+
 	*oldmem = cs->mems_allowed;
 	if (nodes_equal(*oldmem, trialcs->mems_allowed)) {
 		retval = 0;		/* Too easy - nothing to do */
@@ -1162,10 +1180,38 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 
 	heap_free(&heap);
 done:
+	if (trialcs)
+		free_trial_cpuset(trialcs);
 	NODEMASK_FREE(oldmem);
 	return retval;
 }
 
+static int update_nodemask(struct cpuset *cs, const char *buf)
+{
+	NODEMASK_ALLOC(nodemask_t, mems_allowed, GFP_KERNEL);
+	int retval = 0;
+
+	if (!mems_allowed)
+		return -ENOMEM;
+
+	/*
+	 * An empty mems_allowed is ok iff there are no tasks in the cpuset.
+	 * Since nodelist_parse() fails on an empty mask, we special case
+	 * that parsing.  The validate_change() call ensures that cpusets
+	 * with tasks have memory.
+	 */
+	if (!*buf)
+		nodes_clear(*mems_allowed);
+	else
+		retval = nodelist_parse(buf, *mems_allowed);
+
+	if (retval == 0)
+		retval = __update_nodemask(cs, mems_allowed);
+
+	NODEMASK_FREE(mems_allowed);
+	return retval;
+}
+
 int current_cpuset_is_being_rebound(void)
 {
 	int ret;
@@ -1593,7 +1639,6 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 				const char *buf)
 {
 	struct cpuset *cs = cgroup_cs(cgrp);
-	struct cpuset *trialcs;
 	int retval = -ENODEV;
 
 	/*
@@ -1618,25 +1663,18 @@ static int cpuset_write_resmask(struct cgroup *cgrp, struct cftype *cft,
 	if (!is_cpuset_online(cs))
 		goto out_unlock;
 
-	trialcs = alloc_trial_cpuset(cs);
-	if (!trialcs) {
-		retval = -ENOMEM;
-		goto out_unlock;
-	}
-
 	switch (cft->private) {
 	case FILE_CPULIST:
-		retval = update_cpumask(cs, trialcs, buf);
+		retval = update_cpumask(cs, buf);
 		break;
 	case FILE_MEMLIST:
-		retval = update_nodemask(cs, trialcs, buf);
+		retval = update_nodemask(cs, buf);
 		break;
 	default:
 		retval = -EINVAL;
 		break;
 	}
 
-	free_trial_cpuset(trialcs);
 out_unlock:
 	mutex_unlock(&cpuset_mutex);
 	return retval;
@@ -2648,7 +2686,7 @@ int proc_cpuset_show(struct seq_file *m, void *unused_v)
 
 	rcu_read_lock();
 	css = task_subsys_state(tsk, cpuset_subsys_id);
-	retval = cgroup_path(css->cgroup, buf, PAGE_SIZE);
+	retval = cgroup_path_ve(css->cgroup, buf, PAGE_SIZE);
 	rcu_read_unlock();
 	if (retval < 0)
 		goto out_put_task;
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -55,6 +55,7 @@ struct cred init_cred = {
 	.user_ns		= &init_user_ns,
 	.group_info		= &init_groups,
 };
+EXPORT_SYMBOL(init_cred);
 
 static inline void set_cred_subscribers(struct cred *cred, int n)
 {
@@ -561,8 +562,8 @@ EXPORT_SYMBOL(revert_creds);
 void __init cred_init(void)
 {
 	/* allocate a slab in which we can store credentials */
-	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
-				     0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0,
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 }
 
 /**
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable);
 
 void delayacct_init(void)
 {
-	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC);
+	delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT);
 	delayacct_tsk_init(&init_task);
 }
 
--- a/kernel/events/uprobes.c
+++ b/kernel/events/uprobes.c
@@ -123,6 +123,11 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 	/* For mmu_notifiers */
 	const unsigned long mmun_start = addr;
 	const unsigned long mmun_end   = addr + PAGE_SIZE;
+	struct mem_cgroup *memcg;
+
+	err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg);
+	if (err)
+		return err;
 
 	/* For try_to_free_swap() and munlock_vma_page() below */
 	lock_page(page);
@@ -135,6 +140,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	get_page(kpage);
 	page_add_new_anon_rmap(kpage, vma, addr);
+	mem_cgroup_commit_charge(kpage, memcg, false);
+	lru_cache_add_active_or_unevictable(kpage, vma);
 
 	if (!PageAnon(page)) {
 		dec_mm_counter(mm, mm_counter_file(page));
@@ -156,6 +163,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr,
 
 	err = 0;
  unlock:
+	if (err)
+		mem_cgroup_cancel_charge(kpage, memcg);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 	unlock_page(page);
 	return err;
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -48,19 +48,21 @@
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/perf_event.h>
+#include <linux/ve.h>
 #include <trace/events/sched.h>
 #include <linux/hw_breakpoint.h>
 #include <linux/oom.h>
 #include <linux/writeback.h>
 #include <linux/shm.h>
+#include <linux/kcov.h>
+
+#include <bc/misc.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/pgtable.h>
 #include <asm/mmu_context.h>
 
-static void exit_mm(struct task_struct * tsk);
-
 static void __unhash_process(struct task_struct *p, bool group_dead)
 {
 	nr_threads--;
@@ -185,6 +187,8 @@ repeat:
 	tasklist_write_lock_irq();
 	ptrace_release_task(p);
 	__exit_signal(p);
+	nr_zombie--;
+	atomic_inc(&nr_dead);
 
 	/*
 	 * If we are the last non-leader member of the thread
@@ -207,6 +211,7 @@ repeat:
 	write_unlock_irq(&tasklist_lock);
 	cgroup_pids_release(p);
 	release_thread(p);
+	ub_task_uncharge(get_task_ub(p));
 	call_rcu(&p->rcu, delayed_put_task_struct);
 
 	p = leader;
@@ -380,6 +385,7 @@ retry:
 	 */
 	if (mm->owner != p)
 		return;
+
 	/*
 	 * The current owner is exiting/execing and there are no other
 	 * candidates.  Do not leave the mm pointing to a possibly
@@ -462,6 +468,19 @@ static void exit_mm(struct task_struct * tsk)
 	if (!mm)
 		return;
 	sync_mm_rss(mm);
+
+#ifdef CONFIG_VE
+#define K(x) ((x) << (PAGE_SHIFT-10))
+	if (tsk->task_ve != &ve0 &&
+	    test_tsk_thread_flag(tsk, TIF_MEMDIE))
+		ve_printk(VE_LOG, KERN_ERR "OOM killed process %d (%s) "
+			  "total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
+			  task_pid_vnr(tsk), tsk->comm, K(mm->total_vm),
+			  K(get_mm_counter(mm, MM_ANONPAGES)),
+			  K(get_mm_counter(mm, MM_FILEPAGES)));
+#undef K
+#endif
+
 	/*
 	 * Serialize with any possible pending coredump.
 	 * We must hold mmap_sem around checking core_state
@@ -678,6 +697,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	}
 
 	tsk->exit_state = autoreap ? EXIT_DEAD : EXIT_ZOMBIE;
+	nr_zombie++;
 
 	/* mt-exec, de_thread() is waiting for group leader */
 	if (unlikely(tsk->signal->notify_count < 0))
@@ -720,6 +740,7 @@ void do_exit(long code)
 	int group_dead;
 
 	profile_task_exit(tsk);
+	kcov_task_exit(tsk);
 
 	WARN_ON(blk_needs_flush_plug(tsk));
 
@@ -808,6 +829,10 @@ void do_exit(long code)
 		disassociate_ctty(1);
 	exit_task_namespaces(tsk);
 	exit_task_work(tsk);
+
+	if (test_thread_flag(TIF_MEMDIE))
+		exit_oom_victim();
+
 	check_stack_usage();
 	exit_thread();
 
@@ -901,7 +926,6 @@ void complete_and_exit(struct completion *comp, long code)
 
 	do_exit(code);
 }
-
 EXPORT_SYMBOL(complete_and_exit);
 
 SYSCALL_DEFINE1(exit, int, error_code)
@@ -973,12 +997,40 @@ struct pid *task_pid_type(struct task_struct *task, enum pid_type type)
 	return task->pids[type].pid;
 }
 
-static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+static int __eligible_pid(struct wait_opts *wo, struct task_struct *p)
 {
 	return	wo->wo_type == PIDTYPE_MAX ||
 		task_pid_type(p, wo->wo_type) == wo->wo_pid;
 }
 
+static int __entered_pid(struct wait_opts *wo, struct task_struct *p)
+{
+	struct pid *pid, *wo_pid;
+
+	wo_pid = wo->wo_pid;
+	if ((wo_pid == NULL) || (wo_pid->level != 0))
+		return 0;
+
+	pid = task_pid_type(p, wo->wo_type);
+	if (pid->level != 1)
+		return 0;
+
+	if (wo_pid->numbers[0].nr != pid->numbers[0].nr)
+		return 0;
+
+	wo->wo_pid = get_pid(pid);
+	put_pid(wo_pid);
+	return 1;
+}
+
+static int eligible_pid(struct wait_opts *wo, struct task_struct *p)
+{
+	if (__eligible_pid(wo, p))
+		return 1;
+	else
+		return __entered_pid(wo, p);
+}
+
 static int eligible_child(struct wait_opts *wo, struct task_struct *p)
 {
 	if (!eligible_pid(wo, p))
@@ -1638,7 +1690,7 @@ SYSCALL_DEFINE5(waitid, int, which, pid_t, upid, struct siginfo __user *,
 			ret = put_user(0, &infop->si_status);
 	}
 
-	put_pid(pid);
+	put_pid(wo.wo_pid);
 	return ret;
 }
 
@@ -1674,7 +1726,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 	wo.wo_stat	= stat_addr;
 	wo.wo_rusage	= ru;
 	ret = do_wait(&wo);
-	put_pid(pid);
+	put_pid(wo.wo_pid);
 
 	return ret;
 }
--- /dev/null
+++ b/kernel/fence-watchdog.c
@@ -0,0 +1,198 @@
+/*
+ *  kernel/fence-watchdog.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Provide userspace with an interface to forbid kernel to work
+ * without an userspace daemon.
+ *
+ * The daemon should write number of seconds before fencing to the
+ * file /sys/kernel/watchdog_timer, and must renew it, until the
+ * time elapses.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/kobject.h>
+#include <linux/jiffies.h>
+#include <linux/reboot.h>
+#include <linux/fence-watchdog.h>
+#include <linux/device.h>
+#include <linux/kmsg_dump.h>
+
+#define MAX_U64			(~(u64)0)
+#define MAX_JIFFIES_DELTA	(10 * 365UL * 24UL * 3600UL * HZ)
+#define ACTION_NAME_LEN		16
+
+enum {
+	FENCE_WDOG_CRASH = 0,
+	FENCE_WDOG_REBOOT = 1,
+	FENCE_WDOG_POWEROFF = 2,
+	FENCE_WDOG_NETFILTER = 3,
+};
+
+const char *action_names[] = {"crash", "reboot", "halt", "netfilter", NULL};
+
+
+DEFINE_VVAR(volatile unsigned long, fence_wdog_jiffies64) = MAX_U64;
+static int fence_wdog_action = FENCE_WDOG_CRASH;
+static atomic_t not_fenced = ATOMIC_INIT(-1);
+
+static void do_halt_or_reboot(struct work_struct *dummy)
+{
+	printk(KERN_EMERG"fence-watchdog: %s\n",
+	       action_names[fence_wdog_action]);
+	switch (fence_wdog_action) {
+	case FENCE_WDOG_REBOOT:
+		emergency_restart();
+		break;
+	case FENCE_WDOG_POWEROFF:
+		kernel_halt();
+		break;
+	}
+}
+
+static DECLARE_WORK(halt_or_reboot_work, do_halt_or_reboot);
+
+void fence_wdog_do_fence(void)
+{
+	if (fence_wdog_action == FENCE_WDOG_CRASH)
+		panic("fence-watchdog: %s\n",
+		      action_names[fence_wdog_action]);
+	else
+		schedule_work(&halt_or_reboot_work);
+}
+
+inline int fence_wdog_check_timer(void)
+{
+	if (unlikely(get_jiffies_64() > fence_wdog_jiffies64 &&
+			fence_wdog_action != FENCE_WDOG_NETFILTER)) {
+		if (atomic_inc_not_zero(&not_fenced))
+			fence_wdog_do_fence();
+		return 1;
+	}
+
+	return 0;
+}
+
+bool fence_wdog_tmo_match(void)
+{
+	return get_jiffies_64() > fence_wdog_jiffies64;
+}
+EXPORT_SYMBOL(fence_wdog_tmo_match);
+
+static ssize_t fence_wdog_timer_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	ssize_t ret;
+	u64 jiffies_delta = fence_wdog_jiffies64 - get_jiffies_64();
+	struct timespec t;
+
+	if (jiffies_delta > MAX_JIFFIES_DELTA) {
+		ret =  sprintf(buf, "inf\n");
+	} else {
+		jiffies_to_timespec(jiffies_delta, &t);
+		ret =  sprintf(buf, "%ld\n", t.tv_sec);
+	}
+
+	return ret;
+}
+
+static ssize_t fence_wdog_timer_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	unsigned long long val;
+	unsigned long jiffies_delta;
+	struct timespec t;
+
+	if (strict_strtoull(buf, 10, &val))
+		return -EINVAL;
+
+	if (val == 0) {
+		fence_wdog_jiffies64 = MAX_U64;
+		return count;
+	}
+
+	t.tv_sec = val;
+	t.tv_nsec = 0;
+
+	jiffies_delta = timespec_to_jiffies(&t);
+	if (jiffies_delta > MAX_JIFFIES_DELTA)
+		return -EINVAL;
+
+	fence_wdog_jiffies64 = get_jiffies_64() + jiffies_delta;
+
+	return count;
+}
+
+static ssize_t fence_wdog_action_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%s\n", action_names[fence_wdog_action]);
+}
+
+static ssize_t fence_wdog_action_store(struct kobject *kobj,
+		struct kobj_attribute *attr, const char *buf, size_t count)
+{
+	char str_action[ACTION_NAME_LEN];
+	int i = 0;
+
+	if (sscanf(buf, "%15s", str_action) != 1)
+		return -EINVAL;
+
+	for (i = 0; action_names[i]; i++) {
+		if ((!strnicmp(str_action, action_names[i], ACTION_NAME_LEN))) {
+			fence_wdog_action = i;
+			return count;
+		}
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t fence_wdog_available_actions_show(struct kobject *kobj,
+		struct kobj_attribute *attr, char *buf)
+{
+	int i, ret = 0;
+
+	for (i = 0; action_names[i] != NULL; i++)
+		ret += sprintf(&buf[ret], "%s ", action_names[i]);
+
+	ret += sprintf(&buf[ret], "\n");
+	return ret;
+}
+
+static struct kobj_attribute fence_wdog_timer_attr =
+	__ATTR(watchdog_timer, 0644,
+		fence_wdog_timer_show, fence_wdog_timer_store);
+
+static struct kobj_attribute fence_wdog_action_attr =
+	__ATTR(watchdog_action, 0644,
+		fence_wdog_action_show, fence_wdog_action_store);
+
+static struct kobj_attribute fence_wdog_available_actions_attr =
+	__ATTR(watchdog_available_actions, 0644,
+		fence_wdog_available_actions_show, NULL);
+
+static struct attribute *fence_wdog_attrs[] = {
+	&fence_wdog_timer_attr.attr,
+	&fence_wdog_action_attr.attr,
+	&fence_wdog_available_actions_attr.attr,
+	NULL,
+};
+
+static struct attribute_group fence_wdog_attr_group = {
+	.attrs = fence_wdog_attrs,
+};
+
+static int __init fence_wdog_init(void)
+{
+	sysfs_update_group(kernel_kobj, &fence_wdog_attr_group);
+	return 0;
+}
+
+module_init(fence_wdog_init)
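A hedged userspace sketch of the daemon side: pick an action once, then keep renewing the deadline faster than it expires (file names match the attributes registered above; the interval is arbitrary):

```c
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int afd = open("/sys/kernel/watchdog_action", O_WRONLY);
	int tfd = open("/sys/kernel/watchdog_timer", O_WRONLY);

	if (afd >= 0) {
		write(afd, "reboot", 6);  /* crash | reboot | halt | netfilter */
		close(afd);
	}

	for (;;) {
		if (tfd >= 0)
			write(tfd, "60", 2);	/* fence in 60 s unless renewed */
		sleep(30);			/* renew twice per deadline */
	}
	/* Not reached; writing "0" resets the timeout to "inf". */
}
```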
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -18,6 +18,7 @@
 #include <linux/vmalloc.h>
 #include <linux/completion.h>
 #include <linux/personality.h>
+#include <linux/ratelimit.h>
 #include <linux/mempolicy.h>
 #include <linux/sem.h>
 #include <linux/file.h>
@@ -55,6 +56,7 @@
 #include <linux/acct.h>
 #include <linux/tsacct_kern.h>
 #include <linux/cn_proc.h>
+#include <linux/kcov.h>
 #include <linux/freezer.h>
 #include <linux/delayacct.h>
 #include <linux/taskstats_kern.h>
@@ -71,6 +73,7 @@
 #include <linux/signalfd.h>
 #include <linux/uprobes.h>
 #include <linux/aio.h>
+#include <linux/ve.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -79,6 +82,9 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <bc/misc.h>
+#include <bc/vmpages.h>
+
 #include <trace/events/sched.h>
 
 #define CREATE_TRACE_POINTS
@@ -89,6 +95,7 @@
  */
 unsigned long total_forks;	/* Handle normal Linux uptimes. */
 int nr_threads;			/* The idle threads do not count.. */
+EXPORT_SYMBOL(nr_threads);
 
 int max_threads;		/* tunable limit on nr_threads */
 
@@ -98,6 +105,7 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 __attribute__((__section__(".data..cacheline_aligned")))
 static atomic_t tasklist_waiters = ATOMIC_INIT(0);
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
+EXPORT_SYMBOL(tasklist_lock);
 
 void tasklist_write_lock_irq(void)
 {
@@ -175,7 +183,7 @@ void __weak arch_release_thread_info(struct thread_info *ti)
 static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 						  int node)
 {
-	struct page *page = alloc_pages_node(node, THREADINFO_GFP_ACCOUNTED,
+	struct page *page = alloc_pages_node(node, THREADINFO_GFP,
 					     THREAD_SIZE_ORDER);
 
 	return page ? page_address(page) : NULL;
@@ -183,7 +191,7 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
 
 static inline void free_thread_info(struct thread_info *ti)
 {
-	free_memcg_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+	free_pages((unsigned long)ti, THREAD_SIZE_ORDER);
 }
 # else
 static struct kmem_cache *thread_info_cache;
@@ -221,7 +229,7 @@ struct kmem_cache *files_cachep;
 struct kmem_cache *fs_cachep;
 
 /* SLAB cache for vm_area_struct structures */
-struct kmem_cache *vm_area_cachep;
+struct kmem_cache *__vm_area_cachep;
 
 /* SLAB cache for mm_struct structures (tsk->mm) */
 static struct kmem_cache *mm_cachep;
@@ -265,11 +273,13 @@ void __put_task_struct(struct task_struct *tsk)
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
 
+	ub_task_put(tsk);
 	security_task_free(tsk);
 	exit_creds(tsk);
 	delayacct_tsk_free(tsk);
 	put_signal_struct(tsk->signal);
 
+	atomic_dec(&nr_dead);
 	if (!profile_handoff_task(tsk))
 		free_task(tsk);
 }
@@ -284,9 +294,9 @@ void __init fork_init(unsigned long mempages)
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
 #endif
 	/* create a slab on which task_structs can be allocated */
-	task_struct_cachep =
-		kmem_cache_create("task_struct", sizeof(struct task_struct),
-			ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL);
+	task_struct_cachep = kmem_cache_create("task_struct",
+			sizeof(struct task_struct), ARCH_MIN_TASKALIGN,
+			SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL);
 #endif
 
 	/* do the arch specific task caches init */
@@ -364,6 +374,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
 
 	account_kernel_stack(ti, 1);
 
+	kcov_task_init(tsk);
+
 	return tsk;
 
 free_ti:
@@ -422,6 +434,10 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 			continue;
 		}
 		charge = 0;
+		if (ub_memory_charge(mm, mpnt->vm_end - mpnt->vm_start,
+					mpnt->vm_flags & ~VM_LOCKED,
+					mpnt->vm_file, UB_HARD))
+			goto fail_noch;
 		if (mpnt->vm_flags & VM_ACCOUNT) {
 			unsigned long len = vma_pages(mpnt);
 
@@ -429,7 +445,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				goto fail_nomem;
 			charge = len;
 		}
-		tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		tmp = allocate_vma(mm, GFP_KERNEL);
 		if (!tmp)
 			goto fail_nomem;
 		*tmp = *mpnt;
@@ -445,6 +461,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 		tmp->vm_flags &= ~(VM_LOCKED|VM_UFFD_MISSING|VM_UFFD_WP);
 		tmp->vm_next = tmp->vm_prev = NULL;
 		tmp->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+		tmp->vm_private_data2 = NULL;
 		file = tmp->vm_file;
 		if (file) {
 			struct inode *inode = file_inode(file);
@@ -458,12 +475,8 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 				atomic_inc(&mapping->i_mmap_writable);
 			flush_dcache_mmap_lock(mapping);
 			/* insert tmp into the share list, just after mpnt */
-			if (unlikely(tmp->vm_flags & VM_NONLINEAR))
-				vma_nonlinear_insert(tmp,
-						&mapping->i_mmap_nonlinear);
-			else
-				vma_interval_tree_insert_after(tmp, mpnt,
-							&mapping->i_mmap);
+			vma_interval_tree_insert_after(tmp, mpnt,
+					&mapping->i_mmap);
 			flush_dcache_mmap_unlock(mapping);
 			mutex_unlock(&mapping->i_mmap_mutex);
 		}
@@ -509,8 +522,11 @@ out:
 fail_nomem_anon_vma_fork:
 	mpol_put(pol);
 fail_nomem_policy:
-	kmem_cache_free(vm_area_cachep, tmp);
+	free_vma(mm, tmp);
 fail_nomem:
+	ub_memory_uncharge(mm, mpnt->vm_end - mpnt->vm_start,
+			mpnt->vm_flags & ~VM_LOCKED, mpnt->vm_file);
+fail_noch:
 	retval = -ENOMEM;
 	vm_unacct_memory(charge);
 	goto out;
@@ -542,7 +558,32 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock);
 
-#define allocate_mm()	(kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#ifdef CONFIG_BEANCOUNTERS
+
+static inline struct mm_struct *allocate_mm(struct user_beancounter *ub)
+{
+	return kmem_cache_alloc(mm_cachep, GFP_KERNEL);
+}
+
+static inline void set_mm_ub(struct mm_struct *mm, struct user_beancounter *ub)
+{
+	mm->mm_ub = get_beancounter(ub);
+}
+
+static inline void put_mm_ub(struct mm_struct *mm)
+{
+	put_beancounter(mm->mm_ub);
+	mm->mm_ub = NULL;
+}
+
+#else /* CONFIG_BEANCOUNTERS */
+
+#define allocate_mm(ub)  (kmem_cache_alloc(mm_cachep, GFP_KERNEL))
+#define set_mm_ub(mm, ub)
+#define put_mm_ub(mm)
+
+#endif /* CONFIG_BEANCOUNTERS */
+
 #define free_mm(mm)	(kmem_cache_free(mm_cachep, (mm)))
 
 static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT;
@@ -597,6 +638,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 		return mm;
 	}
 
+	put_mm_ub(mm);
 	free_mm(mm);
 	return NULL;
 }
@@ -625,11 +667,12 @@ struct mm_struct *mm_alloc(void)
 {
 	struct mm_struct *mm;
 
-	mm = allocate_mm();
+	mm = allocate_mm(get_exec_ub());
 	if (!mm)
 		return NULL;
 
 	memset(mm, 0, sizeof(*mm));
+	set_mm_ub(mm, get_exec_ub());
 	mm_init_cpumask(mm);
 	return mm_init(mm, current);
 }
@@ -642,6 +685,8 @@ struct mm_struct *mm_alloc(void)
 void __mmdrop(struct mm_struct *mm)
 {
 	BUG_ON(mm == &init_mm);
+	if (unlikely(atomic_read(&mm->mm_users)))
+		put_mm_ub(mm);
 	mm_free_pgd(mm);
 	destroy_context(mm);
 	mmu_notifier_mm_destroy(mm);
@@ -671,6 +716,7 @@ void mmput(struct mm_struct *mm)
 		}
 		if (mm->binfmt)
 			module_put(mm->binfmt->module);
+		put_mm_ub(mm);
 		mmdrop(mm);
 	}
 }
@@ -901,7 +947,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	if (!oldmm)
 		return NULL;
 
-	mm = allocate_mm();
+	mm = allocate_mm(tsk->task_bc.task_ub);
 	if (!mm)
 		goto fail_nomem;
 
@@ -911,6 +957,7 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
 	mm->pmd_huge_pte = NULL;
 #endif
+	set_mm_ub(mm, tsk->task_bc.task_ub);
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
 
@@ -942,6 +989,7 @@ fail_nocontext:
 	 * If init_new_context() failed, we cannot use mmput() to free the mm
 	 * because it calls destroy_context()
 	 */
+	put_mm_ub(mm);
 	mm_free_pgd(mm);
 	free_mm(mm);
 	return NULL;
@@ -1269,10 +1317,9 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	/*
 	 * If the new process will be in a different pid or user namespace
-	 * do not allow it to share a thread group or signal handlers or
-	 * parent with the forking task.
+	 * do not allow it to share a thread group with the forking task.
 	 */
-	if (clone_flags & CLONE_SIGHAND) {
+	if (clone_flags & CLONE_THREAD) {
 		if ((clone_flags & (CLONE_NEWUSER | CLONE_NEWPID)) ||
 		    (task_active_pid_ns(current) !=
 				current->nsproxy->pid_ns))
@@ -1284,9 +1331,14 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto fork_out;
 
 	retval = -ENOMEM;
+	if (ub_task_charge(get_exec_ub()))
+		goto fork_out;
+
 	p = dup_task_struct(current, node);
 	if (!p)
-		goto fork_out;
+		goto bad_fork_uncharge;
+
+	ub_task_get(get_exec_ub(), p);
 
 	ftrace_graph_init_task(p);
 	get_seccomp_filter(p);
@@ -1402,10 +1454,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-#ifdef CONFIG_MEMCG
-	p->memcg_batch.do_batch = 0;
-	p->memcg_batch.memcg = NULL;
-#endif
 #ifdef CONFIG_BCACHE
 	p->sequential_io	= 0;
 	p->sequential_io_avg	= 0;
@@ -1648,7 +1696,10 @@ bad_fork_cleanup_count:
 	atomic_dec(&p->cred->user->processes);
 	exit_creds(p);
 bad_fork_free:
+	ub_task_put(p);
 	free_task(p);
+bad_fork_uncharge:
+	ub_task_uncharge(get_exec_ub());
 fork_out:
 	return ERR_PTR(retval);
 }
@@ -1822,16 +1873,19 @@ void __init proc_caches_init(void)
 	sighand_cachep = kmem_cache_create("sighand_cache",
 			sizeof(struct sighand_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU|
-			SLAB_NOTRACK, sighand_ctor);
+			SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor);
 	signal_cachep = kmem_cache_create("signal_cache",
 			sizeof(struct signal_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	files_cachep = kmem_cache_create("files_cache",
 			sizeof(struct files_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	fs_cachep = kmem_cache_create("fs_cache",
 			sizeof(struct fs_struct), 0,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
 	/*
 	 * FIXME! The "sizeof(struct mm_struct)" currently includes the
 	 * whole struct cpumask for the OFFSTACK case. We could change
@@ -1841,8 +1895,9 @@ void __init proc_caches_init(void)
 	 */
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
-			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL);
-	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
+			SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT,
+			NULL);
+	__vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT);
 	mmap_init();
 	nsproxy_cache_init();
 }
@@ -1928,7 +1983,8 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 	int err;
 
 	/*
-	 * If unsharing a user namespace must also unshare the thread.
+	 * If unsharing a user namespace must also unshare the thread group
+	 * and unshare the filesystem root and working directories.
 	 */
 	if (unshare_flags & CLONE_NEWUSER)
 		unshare_flags |= CLONE_THREAD | CLONE_FS;
--- a/kernel/freezer.c
+++ b/kernel/freezer.c
@@ -36,6 +36,12 @@ bool freezing_slow_path(struct task_struct *p)
 	if (p->flags & (PF_NOFREEZE | PF_SUSPEND_TASK))
 		return false;
 
+	if (test_thread_flag(TIF_MEMDIE))
+		return false;
+
+	if (p->jobctl & JOBCTL_TRAPPING)
+		return false;
+
 	if (pm_nosig_freezing || cgroup_freezing(p))
 		return true;
 
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -64,6 +64,7 @@
 #include <linux/freezer.h>
 #include <linux/bootmem.h>
 #include <linux/hugetlb.h>
+#include <linux/ve.h>
 
 #include <asm/futex.h>
 
@@ -786,7 +787,8 @@ void exit_pi_state_list(struct task_struct *curr)
  */
 static int
 lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
-		union futex_key *key, struct futex_pi_state **ps)
+		union futex_key *key, struct futex_pi_state **ps,
+		struct task_struct *task)
 {
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_q *this, *next;
@@ -1029,7 +1031,7 @@ retry:
 	 * We dont have the lock. Look up the PI state (or create it if
 	 * we are the first waiter):
 	 */
-	ret = lookup_pi_state(uval, hb, key, ps);
+	ret = lookup_pi_state(uval, hb, key, ps, task);
 
 	if (unlikely(ret)) {
 		switch (ret) {
@@ -1438,7 +1440,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
  *
  * Return:
  *  0 - failed to acquire the lock atomically;
- *  1 - acquired the lock;
+ * >0 - acquired the lock, return value is vpid of the top_waiter
  * <0 - error
  */
 static int futex_proxy_trylock_atomic(u32 __user *pifutex,
@@ -1449,7 +1451,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 {
 	struct futex_q *top_waiter = NULL;
 	u32 curval;
-	int ret;
+	int ret, vpid;
 
 	if (get_futex_value_locked(&curval, pifutex))
 		return -EFAULT;
@@ -1477,11 +1479,13 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
 	 * the contended case or if set_waiters is 1.  The pi_state is returned
 	 * in ps in contended cases.
 	 */
+	vpid = task_pid_vnr(top_waiter->task);
 	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
 				   set_waiters);
-	if (ret == 1)
+	if (ret == 1) {
 		requeue_pi_wake_futex(top_waiter, key2, hb2);
-
+		return vpid;
+	}
 	return ret;
 }
 
@@ -1512,7 +1516,6 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
 	struct futex_pi_state *pi_state = NULL;
 	struct futex_hash_bucket *hb1, *hb2;
 	struct futex_q *this, *next;
-	u32 curval2;
 
 	if (requeue_pi) {
 		/*
@@ -1616,16 +1619,25 @@ retry_private:
 		 * At this point the top_waiter has either taken uaddr2 or is
 		 * waiting on it.  If the former, then the pi_state will not
 		 * exist yet, look it up one more time to ensure we have a
-		 * reference to it.
+		 * reference to it. If the lock was taken, ret contains the
+		 * vpid of the top waiter task.
 		 */
-		if (ret == 1) {
+		if (ret > 0) {
 			WARN_ON(pi_state);
 			drop_count++;
 			task_count++;
-			ret = get_futex_value_locked(&curval2, uaddr2);
-			if (!ret)
-				ret = lookup_pi_state(curval2, hb2, &key2,
-						      &pi_state);
+			/*
+			 * If we acquired the lock, then the user
+			 * space value of uaddr2 should be vpid. It
+			 * cannot be changed by the top waiter as it
+			 * is blocked on hb2 lock if it tries to do
+			 * so. If something fiddled with it behind our
+			 * back the pi state lookup might unearth
+			 * it. So we rather use the known value than
+			 * rereading and handing potential crap to
+			 * lookup_pi_state.
+			 */
+			ret = lookup_pi_state(ret, hb2, &key2, &pi_state, NULL);
 		}
 
 		switch (ret) {
@@ -2555,7 +2567,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 {
 	struct hrtimer_sleeper timeout, *to = NULL;
 	struct rt_mutex_waiter rt_waiter;
-	struct rt_mutex *pi_mutex = NULL;
 	struct futex_hash_bucket *hb;
 	union futex_key key2 = FUTEX_KEY_INIT;
 	struct futex_q q = futex_q_init;
@@ -2641,6 +2652,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 			spin_unlock(q.lock_ptr);
 		}
 	} else {
+		struct rt_mutex *pi_mutex;
+
 		/*
 		 * We have been woken up by futex_unlock_pi(), a timeout, or a
 		 * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
@@ -2664,18 +2677,19 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
 		if (res)
 			ret = (res < 0) ? res : 0;
 
+		/*
+		 * If fixup_pi_state_owner() faulted and was unable to handle
+		 * the fault, unlock the rt_mutex and return the fault to
+		 * userspace.
+		 */
+		if (ret && rt_mutex_owner(pi_mutex) == current)
+			rt_mutex_unlock(pi_mutex);
+
 		/* Unqueue and drop the lock. */
 		unqueue_me_pi(&q);
 	}
 
-	/*
-	 * If fixup_pi_state_owner() faulted and was unable to handle the
-	 * fault, unlock the rt_mutex and return the fault to userspace.
-	 */
-	if (ret == -EFAULT) {
-		if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
-			rt_mutex_unlock(pi_mutex);
-	} else if (ret == -EINTR) {
+	if (ret == -EINTR) {
 		/*
 		 * We've already been requeued, but cannot restart by calling
 		 * futex_lock_pi() directly. We could restart this syscall, but
@@ -2923,6 +2937,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 {
 	int cmd = op & FUTEX_CMD_MASK;
 	unsigned int flags = 0;
+	ktime_t abs_time;
 
 	if (!(op & FUTEX_PRIVATE_FLAG))
 		flags |= FLAGS_SHARED;
@@ -2931,6 +2946,12 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
 		flags |= FLAGS_CLOCKRT;
 		if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
 			return -ENOSYS;
+	} else if (timeout) {
+		if (cmd == FUTEX_WAIT_BITSET || cmd == FUTEX_WAIT_REQUEUE_PI) {
+			abs_time = ktime_add(*timeout, timespec_to_ktime(
+					     get_exec_env()->start_timespec));
+			timeout = &abs_time;
+		}
 	}
 
 	switch (cmd) {
--- a/kernel/groups.c
+++ b/kernel/groups.c
@@ -21,7 +21,8 @@ struct group_info *groups_alloc(int gidsetsize)
 	nblocks = (gidsetsize + NGROUPS_PER_BLOCK - 1) / NGROUPS_PER_BLOCK;
 	/* Make sure we always allocate at least one indirect block pointer */
 	nblocks = nblocks ? : 1;
-	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *), GFP_USER);
+	group_info = kmalloc(sizeof(*group_info) + nblocks*sizeof(gid_t *),
+			GFP_USER | __GFP_NOWARN | __GFP_ACCOUNT);
 	if (!group_info)
 		return NULL;
 	group_info->ngroups = gidsetsize;
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -286,9 +286,9 @@ SYSCALL_DEFINE3(setitimer, int, which, struct itimerval __user *, value,
 			return -EFAULT;
 	} else {
 		memset(&set_buffer, 0, sizeof(set_buffer));
-		printk_once(KERN_WARNING "%s calls setitimer() with new_value NULL pointer."
+		printk_once(KERN_WARNING "cmd: %s CT: %s calls setitimer() with new_value NULL pointer."
 			    " Misfeature support will be removed\n",
-			    current->comm);
+			    current->comm, task_ve_name(current));
 	}
 
 	error = do_setitimer(which, &set_buffer, ovalue ? &get_buffer : NULL);
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type)
  */
 static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type)
 {
-	long ret;
+	long t1, t2;
 
-	ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type);
+	t1 = kptr_obfuscate((long)v1, type);
+	t2 = kptr_obfuscate((long)v2, type);
 
-	return (ret < 0) | ((ret > 0) << 1);
+	return (t1 < t2) | ((t1 > t2) << 1);
 }
 
 /* The caller must have pinned the task */
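
The kcmp_ptr() change above fixes a real ordering bug: the old code derived the
result from the sign of a subtraction, which overflows (undefined behaviour)
when the obfuscated pointers are far apart and can then report the inverse
ordering. A self-contained sketch of both encodings (0 = equal, 1 = less,
2 = greater), assuming only standard C:

	#include <limits.h>
	#include <stdio.h>

	/* Old, subtraction-based encoding: the difference can overflow. */
	static int cmp_sub(long a, long b)
	{
		long d = a - b;		/* undefined behaviour on overflow */

		return (d < 0) | ((d > 0) << 1);
	}

	/* New, overflow-safe encoding used by kcmp_ptr() above. */
	static int cmp_rel(long a, long b)
	{
		return (a < b) | ((a > b) << 1);
	}

	int main(void)
	{
		long a = LONG_MIN, b = 1;

		/* cmp_rel() reports 1 (a < b); cmp_sub() typically wraps to a
		 * positive difference and wrongly reports 2 (a > b). */
		printf("sub=%d rel=%d\n", cmp_sub(a, b), cmp_rel(a, b));
		return 0;
	}
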
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,283 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#define DISABLE_BRANCH_PROFILING
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/preempt_mask.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ *  - initial state after open()
+ *  - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ *  - then, mmap() call (several calls are allowed but not useful)
+ *  - then, repeated enable/disable for a task (only one task at a time allowed)
+ */
+struct kcov {
+	/*
+	 * Reference counter. We keep one for:
+	 *  - opened file descriptor
+	 *  - task with enabled coverage (we can't unwire it from another task)
+	 */
+	atomic_t		refcount;
+	/* The lock protects mode, size, area and t. */
+	spinlock_t		lock;
+	enum kcov_mode		mode;
+	/* Size of arena (in longs for KCOV_MODE_TRACE). */
+	unsigned		size;
+	/* Coverage buffer shared with user space. */
+	void			*area;
+	/* Task for which we collect coverage, or NULL. */
+	struct task_struct	*t;
+};
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+	struct task_struct *t;
+	enum kcov_mode mode;
+
+	t = current;
+	/*
+	 * We are interested in code coverage as a function of syscall inputs,
+	 * so we ignore code executed in interrupts.
+	 * The checks for whether we are in an interrupt are open-coded, because
+	 * 1. We can't use in_interrupt() here, since it also returns true
+	 *    when we are inside local_bh_disable() section.
+	 * 2. We don't want to use (in_irq() | in_serving_softirq() | in_nmi()),
+	 *    since that leads to slower generated code (three separate tests,
+	 *    one for each of the flags).
+	 */
+	if (!t || (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_OFFSET
+							| NMI_MASK)))
+		return;
+	mode = READ_ONCE(t->kcov_mode);
+	if (mode == KCOV_MODE_TRACE) {
+		unsigned long *area;
+		unsigned long pos;
+
+		/*
+		 * There is some code that runs in interrupts but for which
+		 * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+		 * READ_ONCE()/barrier() effectively provides load-acquire
+		 * semantics with respect to interrupts; the paired
+		 * barrier()/WRITE_ONCE() are in kcov_ioctl_locked().
+		 */
+		barrier();
+		area = t->kcov_area;
+		/* The first word is the number of subsequent PCs. */
+		pos = READ_ONCE(area[0]) + 1;
+		if (likely(pos < t->kcov_size)) {
+			area[pos] = _RET_IP_;
+			WRITE_ONCE(area[0], pos);
+		}
+	}
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+static void kcov_get(struct kcov *kcov)
+{
+	atomic_inc(&kcov->refcount);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+	if (atomic_dec_and_test(&kcov->refcount)) {
+		vfree(kcov->area);
+		kfree(kcov);
+	}
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+	t->kcov_mode = KCOV_MODE_DISABLED;
+	t->kcov_size = 0;
+	t->kcov_area = NULL;
+	t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+	struct kcov *kcov;
+
+	kcov = t->kcov;
+	if (kcov == NULL)
+		return;
+	spin_lock(&kcov->lock);
+	if (WARN_ON(kcov->t != t)) {
+		spin_unlock(&kcov->lock);
+		return;
+	}
+	/* Just to not leave dangling references behind. */
+	kcov_task_init(t);
+	kcov->t = NULL;
+	spin_unlock(&kcov->lock);
+	kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+	int res = 0;
+	void *area;
+	struct kcov *kcov = vma->vm_file->private_data;
+	unsigned long size, off;
+	struct page *page;
+
+	area = vmalloc_user(vma->vm_end - vma->vm_start);
+	if (!area)
+		return -ENOMEM;
+
+	spin_lock(&kcov->lock);
+	size = kcov->size * sizeof(unsigned long);
+	if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 ||
+	    vma->vm_end - vma->vm_start != size) {
+		res = -EINVAL;
+		goto exit;
+	}
+	if (!kcov->area) {
+		kcov->area = area;
+		vma->vm_flags |= VM_DONTEXPAND;
+		spin_unlock(&kcov->lock);
+		for (off = 0; off < size; off += PAGE_SIZE) {
+			page = vmalloc_to_page(kcov->area + off);
+			if (vm_insert_page(vma, vma->vm_start + off, page))
+				WARN_ONCE(1, "vm_insert_page() failed");
+		}
+		return 0;
+	}
+exit:
+	spin_unlock(&kcov->lock);
+	vfree(area);
+	return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+	struct kcov *kcov;
+
+	kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+	if (!kcov)
+		return -ENOMEM;
+	atomic_set(&kcov->refcount, 1);
+	spin_lock_init(&kcov->lock);
+	filep->private_data = kcov;
+	return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+	kcov_put(filep->private_data);
+	return 0;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+			     unsigned long arg)
+{
+	struct task_struct *t;
+	unsigned long size, unused;
+
+	switch (cmd) {
+	case KCOV_INIT_TRACE:
+		/*
+		 * Enable kcov in trace mode and set up the buffer size.
+		 * Must happen before anything else.
+		 */
+		if (kcov->mode != KCOV_MODE_DISABLED)
+			return -EBUSY;
+		/*
+		 * Size must be at least 2 to hold the current position and one PC.
+		 * Later we allocate size * sizeof(unsigned long) memory,
+		 * which must not overflow.
+		 */
+		size = arg;
+		if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+			return -EINVAL;
+		kcov->size = size;
+		kcov->mode = KCOV_MODE_TRACE;
+		return 0;
+	case KCOV_ENABLE:
+		/*
+		 * Enable coverage for the current task.
+		 * At this point the user must have enabled trace mode and
+		 * mmapped the file. Coverage collection is disabled only
+		 * at task exit or voluntarily by KCOV_DISABLE. After that it can
+		 * be enabled for another task.
+		 */
+		unused = arg;
+		if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED ||
+		    kcov->area == NULL)
+			return -EINVAL;
+		if (kcov->t != NULL)
+			return -EBUSY;
+		t = current;
+		/* Cache in task struct for performance. */
+		t->kcov_size = kcov->size;
+		t->kcov_area = kcov->area;
+		/* See comment in __sanitizer_cov_trace_pc(). */
+		barrier();
+		WRITE_ONCE(t->kcov_mode, kcov->mode);
+		t->kcov = kcov;
+		kcov->t = t;
+		/* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+		kcov_get(kcov);
+		return 0;
+	case KCOV_DISABLE:
+		/* Disable coverage for the current task. */
+		unused = arg;
+		if (unused != 0 || current->kcov != kcov)
+			return -EINVAL;
+		t = current;
+		if (WARN_ON(kcov->t != t))
+			return -EINVAL;
+		kcov_task_init(t);
+		kcov->t = NULL;
+		kcov_put(kcov);
+		return 0;
+	default:
+		return -ENOTTY;
+	}
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+	struct kcov *kcov;
+	int res;
+
+	kcov = filep->private_data;
+	spin_lock(&kcov->lock);
+	res = kcov_ioctl_locked(kcov, cmd, arg);
+	spin_unlock(&kcov->lock);
+	return res;
+}
+
+static const struct file_operations kcov_fops = {
+	.open		= kcov_open,
+	.unlocked_ioctl	= kcov_ioctl,
+	.mmap		= kcov_mmap,
+	.release        = kcov_close,
+};
+
+static int __init kcov_init(void)
+{
+	if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+		pr_err("failed to create kcov in debugfs\n");
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+device_initcall(kcov_init);
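
The state machine described at the top of kcov.c translates into a short
user-space protocol: open the debugfs file, size the buffer with
KCOV_INIT_TRACE, mmap it, then toggle collection with KCOV_ENABLE/KCOV_DISABLE.
A minimal sketch, assuming the KCOV_* ioctl definitions are provided by the
accompanying uapi header (not shown in this patch):

	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <sys/ioctl.h>
	#include <sys/mman.h>
	#include <unistd.h>
	#include <linux/kcov.h>		/* assumed uapi header for KCOV_* */

	#define COVER_SIZE (64 << 10)	/* in longs; word 0 is the PC count */

	int main(void)
	{
		int fd = open("/sys/kernel/debug/kcov", O_RDWR);
		unsigned long *cover, n, i;

		if (fd == -1)
			exit(1);
		/* Size the arena first; the argument is in longs, not bytes. */
		if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE))
			exit(1);
		cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
			     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
		if (cover == MAP_FAILED)
			exit(1);
		/* Start collecting for this task only. */
		if (ioctl(fd, KCOV_ENABLE, 0))
			exit(1);
		__atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);

		read(-1, NULL, 0);	/* the code we want covered */

		n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
		for (i = 0; i < n; i++)
			printf("0x%lx\n", cover[i + 1]);
		if (ioctl(fd, KCOV_DISABLE, 0))
			exit(1);
		return 0;
	}
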
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -28,7 +28,7 @@
 #include <linux/cred.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
-#include <linux/workqueue.h>
+#include <linux/kthread.h>
 #include <linux/security.h>
 #include <linux/mount.h>
 #include <linux/kernel.h>
@@ -39,12 +39,17 @@
 #include <linux/rwsem.h>
 #include <linux/ptrace.h>
 #include <linux/async.h>
+#include <linux/ve.h>
+#include <linux/netfilter.h>
+#include <linux/sysctl.h>
 #include <asm/uaccess.h>
 
 #include <trace/events/module.h>
 
 extern int max_threads;
 
+static DEFINE_KTHREAD_WORKER(khelper_worker);
+
 /*
  * kmod_thread_locker is used for deadlock avoidance.  There is no explicit
  * locking to protect this global - it is private to the singleton khelper
@@ -69,11 +74,14 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
 
 static void free_modprobe_argv(struct subprocess_info *info)
 {
-	kfree(info->argv[3]); /* check call_modprobe() */
+	kfree(info->argv[4]); /* check call_modprobe() */
 	kfree(info->argv);
 }
 
-static int call_modprobe(char *module_name, int wait)
+static int __call_usermodehelper_exec(struct kthread_worker *worker,
+		struct subprocess_info *sub_info, int wait);
+
+static int call_modprobe(char *module_name, int wait, int blacklist)
 {
 	struct subprocess_info *info;
 	static char *envp[] = {
@@ -83,7 +91,7 @@ static int call_modprobe(char *module_name, int wait)
 		NULL
 	};
 
-	char **argv = kmalloc(sizeof(char *[5]), GFP_KERNEL);
+	char **argv = kmalloc(sizeof(char *[6]), GFP_KERNEL);
 	if (!argv)
 		goto out;
 
@@ -93,16 +101,24 @@ static int call_modprobe(char *module_name, int wait)
 
 	argv[0] = modprobe_path;
 	argv[1] = "-q";
-	argv[2] = "--";
-	argv[3] = module_name;	/* check free_modprobe_argv() */
-	argv[4] = NULL;
+	if (blacklist)
+		argv[2] = "-b";
+	else
+		argv[2] = "-q"; /* just repeat argv[1] */
+	argv[3] = "--";
+	argv[4] = module_name;	/* check free_modprobe_argv() */
+	argv[5] = NULL;
 
 	info = call_usermodehelper_setup(modprobe_path, argv, envp, GFP_KERNEL,
 					 NULL, free_modprobe_argv, NULL);
 	if (!info)
 		goto free_module_name;
 
-	return call_usermodehelper_exec(info, wait | UMH_KILLABLE);
+	/*
+	 * We enter this function with the right permissions, so it is
+	 * safe to call __call_usermodehelper_exec() directly.
+	 */
+	return __call_usermodehelper_exec(&khelper_worker, info, wait | UMH_KILLABLE);
 
 free_module_name:
 	kfree(module_name);
@@ -113,10 +129,10 @@ out:
 }
 
 /**
- * __request_module - try to load a kernel module
+ * ___request_module - try to load a kernel module
  * @wait: wait (or not) for the operation to complete
- * @fmt: printf style format string for the name of the module
- * @...: arguments as specified in the format string
+ * @blacklist: tell modprobe to ignore blacklisted modules
+ * @module_name: name of the requested module
  *
  * Load a module using the user mode module loader. The function returns
  * zero on success or a negative errno code on failure. Note that a
@@ -127,10 +143,8 @@ out:
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
  */
-int __request_module(bool wait, const char *fmt, ...)
+static int ___request_module(bool wait, bool blacklist, char *module_name)
 {
-	va_list args;
-	char module_name[MODULE_NAME_LEN];
 	unsigned int max_modprobes;
 	int ret;
 	static atomic_t kmod_concurrent = ATOMIC_INIT(0);
@@ -145,12 +159,6 @@ int __request_module(bool wait, const char *fmt, ...)
 	 */
 	WARN_ON_ONCE(wait && current_is_async());
 
-	va_start(args, fmt);
-	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
-	va_end(args);
-	if (ret >= MODULE_NAME_LEN)
-		return -ENAMETOOLONG;
-
 	ret = security_kernel_module_request(module_name);
 	if (ret)
 		return ret;
@@ -183,11 +191,303 @@ int __request_module(bool wait, const char *fmt, ...)
 
 	trace_module_request(module_name, wait, _RET_IP_);
 
-	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
+	ret = call_modprobe(module_name, wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC, blacklist);
 
 	atomic_dec(&kmod_concurrent);
 	return ret;
 }
+
+#ifdef CONFIG_VE_IPTABLES
+
+/* ve0 allowed iptables modules */
+static struct {
+	const char *name;
+	u64 perm;
+} ve0_ipt_am[] = {
+	{ "ip_tables",		VE_IP_IPTABLES	},
+	{ "ip6_tables",		VE_IP_IPTABLES6	},
+	{ "iptable_filter",	VE_IP_FILTER	},
+	{ "iptable_raw",	VE_IP_IPTABLES	},
+	{ "iptable_nat",	VE_IP_NAT	},
+	{ "iptable_mangle",	VE_IP_MANGLE	},
+	{ "ip6table_filter",	VE_IP_FILTER6	},
+	{ "ip6table_nat",	VE_IP_NAT	},
+	{ "ip6table_mangle",	VE_IP_MANGLE6	},
+	{ "ip6table_raw",	VE_IP_IPTABLES6	},
+
+	{ "xt_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_NOTRACK",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_cluster",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_helper",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "xt_socket",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "xt_connlabel",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+
+	{ "ipt_CLUSTERIP",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_NOTRACK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_cluster",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_helper",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ipt_socket",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ipt_MASQUERADE",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_NETMAP",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_REDIRECT",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "ipt_connlabel",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ipt_SYNPROXY",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+
+	{ "ip6t_CONNMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_CONNSECMARK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_NOTRACK",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_cluster",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connbytes",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connlimit",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_connmark",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_helper",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_state",		VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip6t_socket",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ip6t_MASQUERADE",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT|VE_IP_IPTABLES6	},
+	{ "ip6t_connlabel",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+	{ "ip6t_SYNPROXY",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_IPTABLES6			},
+
+	{ "nf-nat-ipv4",	VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "nf-nat",		VE_NF_CONNTRACK|VE_IP_CONNTRACK|
+				VE_IP_NAT			},
+	{ "nf_conntrack-2",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack_ipv4",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "ip_conntrack",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack-10",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+	{ "nf_conntrack_ipv6",	VE_NF_CONNTRACK|VE_IP_CONNTRACK },
+
+	{ "nft-set",		VE_IP_IPTABLES			},
+	{ "nft-afinfo-2",	VE_IP_IPTABLES			}, /* IPV4 */
+	{ "nft-afinfo-3",	VE_IP_IPTABLES			}, /* ARP  */
+	{ "nft-afinfo-10",	VE_IP_IPTABLES6			}, /* IPV6 */
+
+	{ "nft-chain-2-nat",	VE_IP_IPTABLES|VE_IP_NAT	},
+	{ "nft-chain-2-route",	VE_IP_IPTABLES			},
+
+	{ "nft-chain-10-nat",	VE_IP_IPTABLES6|VE_IP_NAT	},
+	{ "nft-chain-10-route",	VE_IP_IPTABLES6		},
+
+	{ "nft-expr-2-reject",	VE_IP_IPTABLES			},
+	{ "nft-expr-10-reject",	VE_IP_IPTABLES6			},
+	{ "nf-logger-2-0",	VE_IP_IPTABLES			},
+	{ "nf-logger-10-0",	VE_IP_IPTABLES6			},
+};
+
+/*
+ *  Check if a module named nft-expr-<name> is allowed.
+ *  Only the tail part of the name is passed to this function.
+ */
+static bool nft_expr_allowed(const char *name)
+{
+	u64 permitted = get_exec_env()->ipt_mask;
+
+	if (!name[0])
+		return false;
+
+	if (!strcmp(name, "ct"))
+		return mask_ipt_allow(permitted, VE_IP_CONNTRACK);
+
+	if (!strcmp(name, "nat"))
+		return mask_ipt_allow(permitted, VE_IP_NAT);
+
+	/*
+	 * We are interested in modules like nft-expr-xxx.
+	 * Expressions like nft-expr-xxx-yyy are currently handled
+	 * in the ve0_ipt_am table, so the expression name must not
+	 * contain a dash.
+	 */
+	if (!strchr(name, '-'))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES) |
+		       mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+	return false;
+}
+
+/*
+ * module_payload_iptable_allowed - check if iptables functionality is allowed
+ *			    to be used inside the current virtual environment.
+ *
+ * Returns:
+ *   0 if the iptables module is not allowed to load
+ *   1 if it is allowed or we're in ve0
+ *   -1 if the module isn't an iptables module
+ */
+static inline int module_payload_iptable_allowed(const char *module)
+{
+	u64 permitted = get_exec_env()->ipt_mask;
+	int i;
+
+	/* Look for full module name in ve0_ipt_am table */
+	for (i = 0; i < ARRAY_SIZE(ve0_ipt_am); i++) {
+		if (!strcmp(ve0_ipt_am[i].name, module))
+			return mask_ipt_allow(permitted, ve0_ipt_am[i].perm);
+	}
+
+	/* The rest of the xt_* modules are allowed in both ipv4 and ipv6 modes */
+	if (!strncmp("xt_", module, 3))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES) ||
+		       mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+
+	/* The rest of ipt_* modules */
+	if (!strncmp("ipt_", module, 4))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES);
+
+	/* The rest of ip6t_* modules */
+	if (!strncmp("ip6t_", module, 5))
+		return mask_ipt_allow(permitted, VE_IP_IPTABLES6);
+
+	/* The rest of arpt_* modules */
+	if (!strncmp("arpt_", module, 5))
+		return 1;
+
+	/* The rest of ebt_* modules */
+	if (!strncmp("ebt_", module, 4))
+		return 1;
+
+	/* The rest of nft- modules */
+	if (!strncmp("nft-expr-", module, 9))
+		return nft_expr_allowed(module + 9);
+
+	return -1;
+}
+
+/* ve0 allowed modules */
+static const char * const ve0_allowed_mod[] = {
+	"fs-binfmt_misc",
+	"fs-overlay",
+
+	/* inet_diag, inet6_diag  */
+	"net-pf-16-proto-4-type-2",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET */
+	"net-pf-16-proto-4-type-10",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET6 */
+
+	/* tcp_diag */
+	"net-pf-16-proto-4-type-2-6",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_TCP */
+
+	/* udp_diag */
+	"net-pf-16-proto-4-type-2-17",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_UDP */
+	"net-pf-16-proto-4-type-2-136",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_INET - IPPROTO_UDPLITE */
+
+	/* nfnetlink  */
+	"net-pf-16-proto-12",		/* PF_NETLINK, NETLINK_NETFILTER */
+	"nfnetlink-subsys-1",		/* NFNL_SUBSYS_CTNETLINK */
+	"nfnetlink-subsys-2",		/* NFNL_SUBSYS_CTNETLINK_EXP */
+
+	/* unix_diag */
+	"net-pf-16-proto-4-type-1",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_LOCAL */
+
+	/* af_packet_diag */
+	"net-pf-16-proto-4-type-17",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_PACKET */
+
+	/* netlink_diag */
+	"net-pf-16-proto-4-type-16",	/* PF_NETLINK, NETLINK_SOCK_DIAG, AF_NETLINK */
+
+	/* ip_set */
+	"nfnetlink-subsys-6",		/* NFNL_SUBSYS_IPSET */
+	"ip_set_bitmap:ip",
+	"ip_set_bitmap:ip,mac",
+	"ip_set_bitmap:port",
+	"ip_set_hash:ip",
+	"ip_set_hash:ip,port",
+	"ip_set_hash:ip,port,ip",
+	"ip_set_hash:net",
+	"ip_set_hash:net,port",
+	"ip_set_hash:ip,port,net",
+	"ip_set_hash:net,iface",
+	"ip_set_list:set",
+
+	"rtnl-link-dummy",
+	"rtnl-link-vxlan",
+
+	/* NFS */
+	"nfsv3",
+	"nfsv4",
+};
+
+/*
+ * module_payload_allowed - check if module functionality is allowed
+ *			    to be used inside the current virtual environment.
+ *
+ * Returns true if it is allowed or we're in ve0, false otherwise.
+ */
+bool module_payload_allowed(const char *module)
+{
+	int i;
+	int ret;
+
+	if (ve_is_super(get_exec_env()))
+		return true;
+
+	ret = module_payload_iptable_allowed(module);
+	if (ret >= 0)
+		return !!ret;
+
+	for (i = 0; i < ARRAY_SIZE(ve0_allowed_mod); i++) {
+		if (!strcmp(ve0_allowed_mod[i], module))
+			return true;
+	}
+
+	return false;
+}
+
+#endif
+
+int __request_module(bool wait, const char *fmt, ...)
+{
+	char module_name[MODULE_NAME_LEN];
+	bool blacklist;
+	va_list args;
+	int ret;
+
+	va_start(args, fmt);
+	ret = vsnprintf(module_name, MODULE_NAME_LEN, fmt, args);
+	va_end(args);
+
+	if (ret >= MODULE_NAME_LEN)
+		return -ENAMETOOLONG;
+
+	/* Check that autoload is not prohibited via the /proc interface */
+	if (!ve_is_super(get_exec_env()) &&
+	    !ve_allow_module_load)
+		return -EPERM;
+
+	/* Check that module functionality is permitted */
+	if (!module_payload_allowed(module_name))
+		return -EPERM;
+
+	/*
+	 * This function may be called from ve0, where the standard
+	 * behaviour is not to use the blacklist. So we request blacklist
+	 * handling only when we're inside a container.
+	 */
+	blacklist = !ve_is_super(get_exec_env());
+
+	return ___request_module(wait, blacklist, module_name);
+}
 EXPORT_SYMBOL(__request_module);
 #endif /* CONFIG_MODULES */
 
@@ -311,7 +611,7 @@ static int wait_for_helper(void *data)
 }
 
 /* This is run by khelper thread  */
-static void __call_usermodehelper(struct work_struct *work)
+static void __call_usermodehelper(struct kthread_work *work)
 {
 	struct subprocess_info *sub_info =
 		container_of(work, struct subprocess_info, work);
@@ -533,7 +833,7 @@ struct subprocess_info *call_usermodehelper_setup(char *path, char **argv,
 	if (!sub_info)
 		goto out;
 
-	INIT_WORK(&sub_info->work, __call_usermodehelper);
+	init_kthread_work(&sub_info->work, __call_usermodehelper);
 	sub_info->path = path;
 	sub_info->argv = argv;
 	sub_info->envp = envp;
@@ -558,7 +858,8 @@ EXPORT_SYMBOL(call_usermodehelper_setup);
  * asynchronously if wait is not set, and runs as a child of keventd.
  * (ie. it runs with full root capabilities).
  */
-int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+static int __call_usermodehelper_exec(struct kthread_worker *worker,
+		struct subprocess_info *sub_info, int wait)
 {
 	DECLARE_COMPLETION_ONSTACK(done);
 	int retval = 0;
@@ -590,7 +891,7 @@ int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
 	sub_info->complete = &done;
 	sub_info->wait = wait;
 
-	queue_work(system_unbound_wq, &sub_info->work);
+	queue_kthread_work(worker, &sub_info->work);
 	if (wait == UMH_NO_WAIT)	/* task has freed sub_info */
 		goto unlock;
 
@@ -614,6 +915,14 @@ unlock:
 	helper_unlock();
 	return retval;
 }
+
+int call_usermodehelper_exec(struct subprocess_info *sub_info, int wait)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+
+	return __call_usermodehelper_exec(&khelper_worker, sub_info, wait);
+}
 EXPORT_SYMBOL(call_usermodehelper_exec);
 
 /**
@@ -631,17 +940,75 @@ EXPORT_SYMBOL(call_usermodehelper_exec);
  */
 int call_usermodehelper(char *path, char **argv, char **envp, int wait)
 {
+	return call_usermodehelper_by(&khelper_worker, path, argv, envp,
+			wait, NULL, NULL, NULL);
+}
+EXPORT_SYMBOL(call_usermodehelper);
+
+#ifdef CONFIG_VE
+int call_usermodehelper_fns_ve(struct ve_struct *ve,
+	char *path, char **argv, char **envp, int wait,
+	int (*init)(struct subprocess_info *info, struct cred *new),
+	void (*cleanup)(struct subprocess_info *), void *data)
+{
+	int err;
+	struct kthread_worker *khelper;
+
+	ve = get_ve(ve);
+	if (!ve)
+		return -EFAULT;
+
+	khelper = ve_is_super(ve) ? &khelper_worker : &ve->ve_umh_worker;
+
+	if (ve_is_super(ve) || (get_exec_env() == ve)) {
+		err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
+					     cleanup, data);
+		goto out_put;
+	}
+
+	if (wait > UMH_WAIT_EXEC) {
+		printk(KERN_ERR "VE#%s: Sleeping call for containers UMH is "
+				"not supported\n", ve->ve_name);
+		err = -EINVAL;
+		goto out_put;
+	}
+
+	down_read(&ve->op_sem);
+	err = -EPIPE;
+	if (!ve->is_running)
+		goto out;
+
+	err = call_usermodehelper_by(khelper, path, argv, envp, wait, init,
+				     cleanup, data);
+
+out:
+	up_read(&ve->op_sem);
+out_put:
+	put_ve(ve);
+	return err;
+}
+EXPORT_SYMBOL(call_usermodehelper_fns_ve);
+#endif
+
+int call_usermodehelper_by(struct kthread_worker *worker,
+	char *path, char **argv, char **envp, int wait,
+	int (*init)(struct subprocess_info *info, struct cred *new),
+	void (*cleanup)(struct subprocess_info *), void *data)
+{
 	struct subprocess_info *info;
 	gfp_t gfp_mask = (wait == UMH_NO_WAIT) ? GFP_ATOMIC : GFP_KERNEL;
 
+	if (worker == &khelper_worker && !ve_is_super(get_exec_env()))
+		return -EPERM;
+
 	info = call_usermodehelper_setup(path, argv, envp, gfp_mask,
-					 NULL, NULL, NULL);
+					 init, cleanup, data);
 	if (info == NULL)
 		return -ENOMEM;
 
-	return call_usermodehelper_exec(info, wait);
+	return __call_usermodehelper_exec(worker, info, wait);
 }
-EXPORT_SYMBOL(call_usermodehelper);
+EXPORT_SYMBOL(call_usermodehelper_by);
 
 static int proc_cap_handler(struct ctl_table *table, int write,
 			 void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -720,3 +1087,11 @@ struct ctl_table usermodehelper_table[] = {
 	},
 	{ }
 };
+
+void __init usermodehelper_init(void)
+{
+	struct task_struct *t;
+
+	t = kthread_run(kthread_worker_fn, &khelper_worker, "khelper");
+	BUG_ON(IS_ERR(t));
+}
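
A minimal kernel-side sketch of the per-VE helper entry point added above;
example_run_in_ve() and its ve argument are hypothetical. Note that for a
container target only non-sleeping waits are accepted (wait > UMH_WAIT_EXEC is
rejected with -EINVAL):

	/* Sketch only: assumes <linux/kmod.h> and <linux/ve.h>. */
	static int example_run_in_ve(struct ve_struct *ve)
	{
		char *argv[] = { "/bin/true", NULL };
		char *envp[] = { "HOME=/", "PATH=/sbin:/bin:/usr/sbin:/usr/bin",
				 NULL };

		/* Containers only accept non-sleeping waits: UMH_NO_WAIT or
		 * UMH_WAIT_EXEC; UMH_WAIT_PROC would return -EINVAL above. */
		return call_usermodehelper_fns_ve(ve, argv[0], argv, envp,
						  UMH_WAIT_EXEC,
						  NULL, NULL, NULL);
	}
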
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -18,6 +18,7 @@
 #include <linux/stat.h>
 #include <linux/sched.h>
 #include <linux/capability.h>
+#include <linux/ve.h>
 
 #define KERNEL_ATTR_RO(_name) \
 static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
@@ -30,7 +31,7 @@ static struct kobj_attribute _name##_attr = \
 static ssize_t uevent_seqnum_show(struct kobject *kobj,
 				  struct kobj_attribute *attr, char *buf)
 {
-	return sprintf(buf, "%llu\n", (unsigned long long)uevent_seqnum);
+	return sprintf(buf, "%llu\n", (unsigned long long)ve_uevent_seqnum);
 }
 KERNEL_ATTR_RO(uevent_seqnum);
 
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -24,20 +24,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
-struct kthread_create_info
-{
-	/* Information passed to kthread() from kthreadd. */
-	int (*threadfn)(void *data);
-	void *data;
-	int node;
-
-	/* Result passed back to kthread_create() from kthreadd. */
-	struct task_struct *result;
-	struct completion done;
-
-	struct list_head list;
-};
-
 struct kthread {
 	unsigned long flags;
 	unsigned int cpu;
@@ -213,7 +199,7 @@ int tsk_fork_get_node(struct task_struct *tsk)
 	return NUMA_NO_NODE;
 }
 
-static void create_kthread(struct kthread_create_info *create)
+void create_kthread(struct kthread_create_info *create)
 {
 	int pid;
 
@@ -227,6 +213,16 @@ static void create_kthread(struct kthread_create_info *create)
 		complete(&create->done);
 	}
 }
+EXPORT_SYMBOL(create_kthread);
+
+static void kthread_add_to_kthreadd(void *data, struct kthread_create_info *create)
+{
+	spin_lock(&kthread_create_lock);
+	list_add_tail(&create->list, &kthread_create_list);
+	spin_unlock(&kthread_create_lock);
+	wake_up_process(kthreadd_task);
+	wait_for_completion(&create->done);
+}
 
 /**
  * kthread_create_on_node - create a kthread.
@@ -250,10 +246,13 @@ static void create_kthread(struct kthread_create_info *create)
  *
  * Returns a task_struct or ERR_PTR(-ENOMEM).
  */
-struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
-					   void *data, int node,
-					   const char namefmt[],
-					   ...)
+struct task_struct *__kthread_create_on_node(
+		void (*addfn)(void *data, struct kthread_create_info *create),
+		void *add_data,
+		int (*threadfn)(void *data),
+		void *data, int node,
+		const char namefmt[],
+		...)
 {
 	struct kthread_create_info create;
 
@@ -262,12 +261,10 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	create.node = node;
 	init_completion(&create.done);
 
-	spin_lock(&kthread_create_lock);
-	list_add_tail(&create.list, &kthread_create_list);
-	spin_unlock(&kthread_create_lock);
+	if (addfn == NULL)
+		addfn = kthread_add_to_kthreadd;
 
-	wake_up_process(kthreadd_task);
-	wait_for_completion(&create.done);
+	addfn(add_data, &create);
 
 	if (!IS_ERR(create.result)) {
 		static const struct sched_param param = { .sched_priority = 0 };
@@ -286,7 +283,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
 	}
 	return create.result;
 }
-EXPORT_SYMBOL(kthread_create_on_node);
+EXPORT_SYMBOL(__kthread_create_on_node);
 
 static void __kthread_bind(struct task_struct *p, unsigned int cpu, long state)
 {
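
The refactoring above splits kthread creation into filling a
kthread_create_info and handing it to an addfn that picks the spawning context;
passing NULL falls back to kthread_add_to_kthreadd() and the global kthreadd.
A hypothetical sketch of a custom addfn that performs the creation
synchronously via the now-exported create_kthread(), relying on create->done
being completed once create->result is valid:

	/* Sketch only: run the creation in the caller's context instead of
	 * queueing the request to the global kthreadd list. */
	static void my_add_fn(void *data, struct kthread_create_info *create)
	{
		create_kthread(create);
		/* create->done is completed once create->result holds the
		 * new task_struct or an ERR_PTR on failure. */
		wait_for_completion(&create->done);
	}

	static struct task_struct *example_spawn(int (*fn)(void *), void *arg)
	{
		return __kthread_create_on_node(my_add_fn, NULL, fn, arg,
						NUMA_NO_NODE, "example/%d", 0);
	}
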
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -712,8 +712,6 @@ static int module_unload_init(struct module *mod)
 
 	/* Hold reference count during initialization. */
 	__this_cpu_write(mod->refptr->incs, 1);
-	/* Backwards compatibility macros put refcount during init. */
-	mod->waiter = current;
 
 	return 0;
 }
@@ -839,16 +837,9 @@ static int __try_stop_module(void *_sref)
 
 static int try_stop_module(struct module *mod, int flags, int *forced)
 {
-	if (flags & O_NONBLOCK) {
-		struct stopref sref = { mod, flags, forced };
+	struct stopref sref = { mod, flags, forced };
 
-		return stop_machine(__try_stop_module, &sref, NULL);
-	} else {
-		/* We don't need to stop the machine for this. */
-		mod->state = MODULE_STATE_GOING;
-		synchronize_sched();
-		return 0;
-	}
+	return stop_machine(__try_stop_module, &sref, NULL);
 }
 
 unsigned long module_refcount(struct module *mod)
@@ -881,21 +872,6 @@ EXPORT_SYMBOL(module_refcount);
 /* This exists whether we can unload or not */
 static void free_module(struct module *mod);
 
-static void wait_for_zero_refcount(struct module *mod)
-{
-	/* Since we might sleep for some time, release the mutex first */
-	mutex_unlock(&module_mutex);
-	for (;;) {
-		pr_debug("Looking at refcount...\n");
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (module_refcount(mod) == 0)
-			break;
-		schedule();
-	}
-	current->state = TASK_RUNNING;
-	mutex_lock(&module_mutex);
-}
-
 SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		unsigned int, flags)
 {
@@ -927,8 +903,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 
 	/* Doing init or already dying? */
 	if (mod->state != MODULE_STATE_LIVE) {
-		/* FIXME: if (force), slam module count and wake up
-                   waiter --RR */
+		/* FIXME: if (force), slam module count damn the torpedoes */
 		pr_debug("%s already dying\n", mod->name);
 		ret = -EBUSY;
 		goto out;
@@ -944,18 +919,11 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 		}
 	}
 
-	/* Set this up before setting mod->state */
-	mod->waiter = current;
-
 	/* Stop the machine so refcounts can't move and disable module. */
 	ret = try_stop_module(mod, flags, &forced);
 	if (ret != 0)
 		goto out;
 
-	/* Never wait if forced. */
-	if (!forced && module_refcount(mod) != 0)
-		wait_for_zero_refcount(mod);
-
 	mutex_unlock(&module_mutex);
 	/* Final destruction now no one is using it. */
 	if (mod->exit != NULL)
@@ -978,8 +946,9 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod)
 {
 	struct module_use *use;
 	int printed_something = 0;
+	bool in_container = !ve_is_super(get_exec_env());
 
-	seq_printf(m, " %lu ", module_refcount(mod));
+	seq_printf(m, " %lu ", in_container ? 1 : module_refcount(mod));
 
 	/* Always include a trailing , so userspace can differentiate
            between this and the old multi-field proc format. */
@@ -1073,9 +1042,6 @@ void module_put(struct module *module)
 		__this_cpu_inc(module->refptr->decs);
 
 		trace_module_put(module, _RET_IP_);
-		/* Maybe they're waiting for us to drop reference? */
-		if (unlikely(!module_is_live(module)))
-			wake_up_process(module->waiter);
 		preempt_enable();
 	}
 }
@@ -3833,6 +3799,7 @@ static void m_stop(struct seq_file *m, void *p)
 static int m_show(struct seq_file *m, void *p)
 {
 	struct module *mod = list_entry(p, struct module, list);
+	bool in_container = !ve_is_super(get_exec_env());
 	char buf[8];
 
 	/* We always ignore unformed modules. */
@@ -3840,7 +3807,7 @@ static int m_show(struct seq_file *m, void *p)
 		return 0;
 
 	seq_printf(m, "%s %u",
-		   mod->name, mod->init_size + mod->core_size);
+		   mod->name, in_container ? 4242 : mod->init_size + mod->core_size);
 	print_unload_info(m, mod);
 
 	/* Informative for users. */
@@ -3852,7 +3819,7 @@ static int m_show(struct seq_file *m, void *p)
 	seq_printf(m, " 0x%pK", mod->module_core);
 
 	/* Taints info */
-	if (mod->taints)
+	if (mod->taints && !in_container)
 		seq_printf(m, " %s", module_flags(mod, buf));
 
 	seq_printf(m, "\n");
@@ -3885,7 +3852,7 @@ static const struct file_operations proc_modules_operations = {
 
 static int __init proc_modules_init(void)
 {
-	proc_create("modules", 0, NULL, &proc_modules_operations);
+	proc_create("modules", S_ISVTX, NULL, &proc_modules_operations);
 	return 0;
 }
 module_init(proc_modules_init);
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -165,6 +165,7 @@ out:
 	put_nsproxy(old_ns);
 	return err;
 }
+EXPORT_SYMBOL(copy_namespaces);
 
 void free_nsproxy(struct nsproxy *ns)
 {
@@ -179,6 +180,7 @@ void free_nsproxy(struct nsproxy *ns)
 	put_net(ns->net_ns);
 	kmem_cache_free(nsproxy_cachep, ns);
 }
+EXPORT_SYMBOL(free_nsproxy);
 
 /*
  * Called from unshare. Unshare all the namespaces part of nsproxy.
@@ -223,6 +225,7 @@ void switch_task_namespaces(struct task_struct *p, struct nsproxy *new)
 	if (ns && atomic_dec_and_test(&ns->count))
 		free_nsproxy(ns);
 }
+EXPORT_SYMBOL_GPL(switch_task_namespaces);
 
 void exit_task_namespaces(struct task_struct *p)
 {
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -336,6 +336,12 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok)
 		printk(KERN_WARNING
 		       "Disabling lock debugging due to kernel taint\n");
 
+	/* Do not confuse people with call traces on proprietary modules */
+	if (flag != TAINT_PROPRIETARY_MODULE && flag != TAINT_OOT_MODULE &&
+	    flag != TAINT_UNSIGNED_MODULE) {
+		printk(KERN_WARNING "Tainting kernel with flag 0x%x\n", flag);
+		dump_stack();
+	}
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -45,8 +45,6 @@ static struct hlist_head *pid_hash;
 static unsigned int pidhash_shift = 4;
 struct pid init_struct_pid = INIT_STRUCT_PID;
 
-int pid_max = PID_MAX_DEFAULT;
-
 #define RESERVED_PIDS		300
 
 int pid_max_min = RESERVED_PIDS + 1;
@@ -153,7 +151,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	struct pidmap *map;
 
 	pid = last + 1;
-	if (pid >= pid_max)
+	if (pid >= pid_ns->pid_max)
 		pid = RESERVED_PIDS;
 	offset = pid & BITS_PER_PAGE_MASK;
 	map = &pid_ns->pidmap[pid/BITS_PER_PAGE];
@@ -162,7 +160,7 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 	 * want to scan this bitmap block twice, the second time
 	 * we start with offset == 0 (or RESERVED_PIDS).
 	 */
-	max_scan = DIV_ROUND_UP(pid_max, BITS_PER_PAGE) - !offset;
+	max_scan = DIV_ROUND_UP(pid_ns->pid_max, BITS_PER_PAGE) - !offset;
 	for (i = 0; i <= max_scan; ++i) {
 		if (unlikely(!map->page)) {
 			void *page = kzalloc(PAGE_SIZE, GFP_KERNEL);
@@ -191,11 +189,11 @@ static int alloc_pidmap(struct pid_namespace *pid_ns)
 				if (offset >= BITS_PER_PAGE)
 					break;
 				pid = mk_pid(pid_ns, map, offset);
-				if (pid >= pid_max)
+				if (pid >= pid_ns->pid_max)
 					break;
 			}
 		}
-		if (map < &pid_ns->pidmap[(pid_max-1)/BITS_PER_PAGE]) {
+		if (map < &pid_ns->pidmap[(pid_ns->pid_max-1)/BITS_PER_PAGE]) {
 			++map;
 			offset = 0;
 		} else {
@@ -229,6 +227,7 @@ int next_pidmap(struct pid_namespace *pid_ns, unsigned int last)
 	}
 	return -1;
 }
+EXPORT_SYMBOL(next_pidmap);
 
 void put_pid(struct pid *pid)
 {
@@ -262,7 +261,9 @@ void free_pid(struct pid *pid)
 	for (i = 0; i <= pid->level; i++) {
 		struct upid *upid = pid->numbers + i;
 		struct pid_namespace *ns = upid->ns;
+
 		hlist_del_rcu(&upid->pid_chain);
+
 		switch(--ns->nr_hashed) {
 		case 2:
 		case 1:
@@ -272,6 +273,11 @@ void free_pid(struct pid *pid)
 			 */
 			wake_up_process(ns->child_reaper);
 			break;
+		case PIDNS_HASH_ADDING:
+			/* Handle a fork failure of the first process */
+			WARN_ON(ns->child_reaper);
+			ns->nr_hashed = 0;
+			/* fall through */
 		case 0:
 			schedule_work(&ns->proc_work);
 			break;
@@ -310,8 +316,10 @@ struct pid *alloc_pid(struct pid_namespace *ns)
 	}
 
 	if (unlikely(is_child_reaper(pid))) {
-		if (pid_ns_prepare_proc(ns))
+		if (pid_ns_prepare_proc(ns)) {
+			disable_pid_allocation(ns);
 			goto out_free;
+		}
 	}
 
 	get_pid_ns(ns);
@@ -436,6 +444,51 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type)
 }
 EXPORT_SYMBOL(pid_task);
 
+int change_active_pid_ns(struct task_struct *task, struct pid_namespace *ns)
+{
+	struct pid *pid = task_pid(task);
+	struct upid *upid = pid->numbers + pid->level;
+	int nr, err;
+
+	if (upid->ns != ns->parent || upid->ns->pid_cachep != ns->pid_cachep)
+		return -EINVAL;
+
+	nr = alloc_pidmap(ns);
+	if (nr < 0)
+		return -ENOMEM;
+
+	get_pid_ns(ns);
+	put_pid_ns(upid->ns);
+
+	upid++;
+	upid->nr = nr;
+	upid->ns = ns;
+	smp_wmb();
+	pid->level++;
+
+	if (is_child_reaper(pid)) {
+		err = pid_ns_prepare_proc(ns);
+		if (err)
+			goto undo;
+		ns->child_reaper = task;
+	}
+
+	spin_lock_irq(&pidmap_lock);
+	hlist_add_head_rcu(&upid->pid_chain, &pid_hash[pid_hashfn(nr, ns)]);
+	ns->nr_hashed++;
+	spin_unlock_irq(&pidmap_lock);
+
+	return 0;
+
+undo:
+	pid->level--;
+	free_pidmap(upid);
+	get_pid_ns(ns->parent);
+	put_pid_ns(ns);
+	return err;
+}
+EXPORT_SYMBOL_GPL(change_active_pid_ns);
+
 /*
  * Must be called under rcu_read_lock().
  */
@@ -527,6 +580,18 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
 }
 EXPORT_SYMBOL(__task_pid_nr_ns);
 
+pid_t ve_task_ppid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+{
+	pid_t ppid;
+	rcu_read_lock();
+	ppid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
+	rcu_read_unlock();
+	/* It's a dirty hack: some old utilities don't work if ppid is zero. */
+	if (ppid == 0 && ns->child_reaper != tsk)
+		ppid = 1;
+	return ppid;
+}
+
 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
 	return pid_nr_ns(task_tgid(tsk), ns);
@@ -583,11 +648,11 @@ void __init pidmap_init(void)
 	BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
 
 	/* bump default and minimum pid_max based on number of cpus */
-	pid_max = min(pid_max_max, max_t(int, pid_max,
+	init_pid_ns.pid_max = min(pid_max_max, max_t(int, PID_MAX_DEFAULT,
 				PIDS_PER_CPU_DEFAULT * num_possible_cpus()));
 	pid_max_min = max_t(int, pid_max_min,
 				PIDS_PER_CPU_MIN * num_possible_cpus());
-	pr_info("pid_max: default: %u minimum: %u\n", pid_max, pid_max_min);
+	pr_info("pid_max: default: %u minimum: %u\n", init_pid_ns.pid_max, pid_max_min);
 
 	init_pid_ns.pidmap[0].page = kzalloc(PAGE_SIZE, GFP_KERNEL);
 	/* Reserve PID 0. We never call free_pidmap(0) */
@@ -596,5 +661,5 @@ void __init pidmap_init(void)
 	init_pid_ns.nr_hashed = PIDNS_HASH_ADDING;
 
 	init_pid_ns.pid_cachep = KMEM_CACHE(pid,
-			SLAB_HWCACHE_ALIGN | SLAB_PANIC);
+			SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT);
 }
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -18,6 +18,9 @@
 #include <linux/proc_ns.h>
 #include <linux/reboot.h>
 #include <linux/export.h>
+#include <linux/module.h>
+#include <linux/ve.h>
+#include <linux/kthread.h>
 
 struct pid_cache {
 	int nr_ids;
@@ -40,6 +43,9 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
 	struct pid_cache *pcache;
 	struct kmem_cache *cachep;
 
+	if (nr_ids <= 2)
+		return init_pid_ns.pid_cachep;
+
 	mutex_lock(&pid_caches_mutex);
 	list_for_each_entry(pcache, &pid_caches_lh, list)
 		if (pcache->nr_ids == nr_ids)
@@ -51,7 +57,7 @@ static struct kmem_cache *create_pid_cachep(int nr_ids)
 
 	snprintf(pcache->name, sizeof(pcache->name), "pid_%d", nr_ids);
 	cachep = kmem_cache_create(pcache->name,
-			sizeof(struct pid) + (nr_ids - 1) * sizeof(struct upid),
+			sizeof(struct pid) + (nr_ids - 2) * sizeof(struct upid),
 			0, SLAB_HWCACHE_ALIGN, NULL);
 	if (cachep == NULL)
 		goto err_cachep;
@@ -115,6 +121,7 @@ static struct pid_namespace *create_pid_namespace(struct user_namespace *user_ns
 	ns->user_ns = get_user_ns(user_ns);
 	ns->nr_hashed = PIDNS_HASH_ADDING;
 	INIT_WORK(&ns->proc_work, proc_cleanup_work);
+	ns->pid_max = PID_MAX_NS_DEFAULT;
 
 	set_bit(0, ns->pidmap[0].page);
 	atomic_set(&ns->pidmap[0].nr_free, BITS_PER_PAGE - 1);
@@ -140,6 +147,10 @@ static void destroy_pid_namespace(struct pid_namespace *ns)
 	for (i = 0; i < PIDMAP_ENTRIES; i++)
 		kfree(ns->pidmap[i].page);
 	put_user_ns(ns->user_ns);
+
+#ifdef CONFIG_BSD_PROCESS_ACCT
+	kfree(ns->bacct);
+#endif
 	kmem_cache_free(pid_ns_cachep, ns);
 }
 
@@ -189,6 +200,8 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	me->sighand->action[SIGCHLD - 1].sa.sa_handler = SIG_IGN;
 	spin_unlock_irq(&me->sighand->siglock);
 
+	ve_stop_ns(pid_ns);
+
 	/*
 	 * The last thread in the cgroup-init thread group is terminating.
 	 * Find remaining pid_ts in the namespace, signal and wait for them
@@ -239,6 +252,9 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 		current->signal->group_exit_code = pid_ns->reboot;
 
 	acct_exit_ns(pid_ns);
+
+	ve_exit_ns(pid_ns);
+
 	return;
 }
 
@@ -259,6 +275,7 @@ static int pid_ns_ctl_handler(struct ctl_table *table, int write,
 	 */
 
 	tmp.data = &pid_ns->last_pid;
+	tmp.extra2 = &pid_ns->pid_max;
 	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
 }
 
@@ -268,10 +285,9 @@ static struct ctl_table pid_ns_ctl_table[] = {
 	{
 		.procname = "ns_last_pid",
 		.maxlen = sizeof(int),
-		.mode = 0666, /* permissions are checked in the handler */
+		.mode = 0666 | S_ISVTX, /* permissions are checked in the handler */
 		.proc_handler = pid_ns_ctl_handler,
 		.extra1 = &zero,
-		.extra2 = &pid_max,
 	},
 	{ }
 };
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -12,6 +12,7 @@
 #include <linux/random.h>
 #include <linux/tick.h>
 #include <linux/workqueue.h>
+#include <linux/module.h>
 
 /*
  * Called after updating RLIMIT_CPU to run cpu timer and update
--- a/kernel/posix-timers.c
+++ b/kernel/posix-timers.c
@@ -48,6 +48,9 @@
 #include <linux/workqueue.h>
 #include <linux/export.h>
 #include <linux/hashtable.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
 
 #include "time/timekeeping.h"
 
@@ -126,6 +129,39 @@ static DEFINE_SPINLOCK(hash_lock);
 
 static struct k_clock posix_clocks[MAX_CLOCKS];
 
+#define clock_is_monotonic(which_clock) \
+	((which_clock) == CLOCK_MONOTONIC || \
+	 (which_clock) == CLOCK_MONOTONIC_RAW || \
+	 (which_clock) == CLOCK_MONOTONIC_COARSE)
+
+#ifdef CONFIG_VE
+static struct timespec zero_time;
+
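+/*
+ * Monotonic clock values inside a VE are reported relative to the VE
+ * start time: _abs_to_ve subtracts the start timestamp, _ve_to_abs
+ * adds it back and clamps the result to stay strictly positive.
+ */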
+void monotonic_abs_to_ve(clockid_t which_clock, struct timespec *tp)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	if (clock_is_monotonic(which_clock))
+		set_normalized_timespec(tp,
+				tp->tv_sec - ve->start_timespec.tv_sec,
+				tp->tv_nsec - ve->start_timespec.tv_nsec);
+}
+
+void monotonic_ve_to_abs(clockid_t which_clock, struct timespec *tp)
+{
+	struct ve_struct *ve = get_exec_env();
+
+	if (clock_is_monotonic(which_clock))
+		set_normalized_timespec(tp,
+				tp->tv_sec + ve->start_timespec.tv_sec,
+				tp->tv_nsec + ve->start_timespec.tv_nsec);
+	if (timespec_compare(tp, &zero_time) <= 0) {
+		tp->tv_sec = 0;
+		tp->tv_nsec = 1;
+	}
+}
+#endif
+
 /*
  * These ones are defined below.
  */
@@ -341,8 +377,8 @@ static __init int init_posix_timers(void)
 	posix_timers_register_clock(CLOCK_TAI, &clock_tai);
 
 	posix_timers_cache = kmem_cache_create("posix_timers_cache",
-					sizeof (struct k_itimer), 0, SLAB_PANIC,
-					NULL);
+					sizeof (struct k_itimer), 0,
+					SLAB_PANIC | SLAB_ACCOUNT, NULL);
 	return 0;
 }
 
@@ -416,8 +452,14 @@ int posix_timer_event(struct k_itimer *timr, int si_private)
 	rcu_read_lock();
 	task = pid_task(timr->it_pid, PIDTYPE_PID);
 	if (task) {
+		struct user_beancounter *ub;
+
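+		/* account the signal against the timer owner's beancounter */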
+		ub = set_exec_ub(task->task_bc.task_ub);
+
 		shared = !(timr->it_sigev_notify & SIGEV_THREAD_ID);
 		ret = send_sigqueue(timr->sigq, task, shared);
+
+		(void)set_exec_ub(ub);
 	}
 	rcu_read_unlock();
 	/* If we failed to send the signal the timer stops. */
@@ -895,6 +937,9 @@ retry:
 	if (!timr)
 		return -EINVAL;
 
+	if ((flags & TIMER_ABSTIME) &&
+	    (new_spec.it_value.tv_sec || new_spec.it_value.tv_nsec))
+		monotonic_ve_to_abs(timr->it_clock, &new_spec.it_value);
 	kc = clockid_to_kclock(timr->it_clock);
 	if (WARN_ON_ONCE(!kc || !kc->timer_set))
 		error = -EINVAL;
@@ -1028,6 +1073,7 @@ SYSCALL_DEFINE2(clock_gettime, const clockid_t, which_clock,
 
 	error = kc->clock_get(which_clock, &kernel_tp);
 
+	monotonic_abs_to_ve(which_clock, &kernel_tp);
 	if (!error && copy_to_user(tp, &kernel_tp, sizeof (kernel_tp)))
 		error = -EFAULT;
 
@@ -1104,6 +1150,9 @@ SYSCALL_DEFINE4(clock_nanosleep, const clockid_t, which_clock, int, flags,
 	if (!timespec_valid(&t))
 		return -EINVAL;
 
+	if (flags & TIMER_ABSTIME)
+		monotonic_ve_to_abs(which_clock, &t);
+
 	return kc->nsleep(which_clock, flags, &t, rmtp);
 }
 
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -45,6 +45,8 @@
 #include <linux/poll.h>
 #include <linux/irq_work.h>
 #include <linux/utsname.h>
+#include <linux/vermagic.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 
@@ -125,6 +127,7 @@ EXPORT_SYMBOL(console_set_on_cmdline);
 
 /* Flag: console code may call schedule() */
 static int console_may_schedule;
+int console_silence_loglevel;
 
 /*
  * The printk log buffer consists of a chain of concatenated variable
@@ -218,30 +221,6 @@ struct log {
 static DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #ifdef CONFIG_PRINTK
-DECLARE_WAIT_QUEUE_HEAD(log_wait);
-/* the next printk record to read by syslog(READ) or /proc/kmsg */
-static u64 syslog_seq;
-static u32 syslog_idx;
-static enum log_flags syslog_prev;
-static size_t syslog_partial;
-
-/* index and sequence number of the first record stored in the buffer */
-static u64 log_first_seq;
-static u32 log_first_idx;
-
-/* index and sequence number of the next record to store in the buffer */
-static u64 log_next_seq;
-static u32 log_next_idx;
-
-/* the next printk record to write to the console */
-static u64 console_seq;
-static u32 console_idx;
-static enum log_flags console_prev;
-
-/* the next printk record to read after the last 'clear' command */
-static u64 clear_seq;
-static u32 clear_idx;
-
 #define PREFIX_MAX		32
 #define LOG_LINE_MAX		1024 - PREFIX_MAX
 
@@ -253,8 +232,92 @@ static u32 clear_idx;
 #endif
 #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT)
 static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN);
-static char *log_buf = __log_buf;
-static u32 log_buf_len = __LOG_BUF_LEN;
+
+/*
+ * Continuation lines are buffered, and not committed to the record buffer
+ * until the line is complete, or a race forces it. The line fragments
+ * though, are printed immediately to the consoles to ensure everything has
+ * reached the console in case of a kernel crash.
+ */
+struct cont {
+	char buf[LOG_LINE_MAX];
+	size_t len;			/* length == 0 means unused buffer */
+	size_t cons;			/* bytes written to console */
+	struct task_struct *owner;	/* task of first print */
+	u64 ts_nsec;			/* time of first print */
+	u8 level;			/* log level of first message */
+	u8 facility;		/* syslog facility of first message */
+	enum log_flags flags;		/* prefix, newline flags */
+	bool flushed:1;			/* buffer sealed and committed */
+};
+
+static struct log_state {
+	char *buf;
+	u32 buf_len;
+
+	/* the next printk record to read by syslog(READ) or /proc/kmsg */
+	u64 syslog_seq;
+	u32 syslog_idx;
+	enum log_flags syslog_prev;
+	size_t syslog_partial;
+
+	/* index and sequence number of the first record stored in the buffer */
+	u64 first_seq;
+	u32 first_idx;
+
+	/* index and sequence number of the next record to store in the buffer */
+	u64 next_seq;
+	u32 next_idx;
+
+	/* the next printk record to write to the console */
+	u64 console_seq;
+	u32 console_idx;
+	enum log_flags console_prev;
+
+	/* the next printk record to read after the last 'clear' command */
+	u64 clear_seq;
+	u32 clear_idx;
+
+	u64 seen_seq;
+
+	struct cont cont;
+
+	wait_queue_head_t wait;
+} init_log_state = {
+	.buf = __log_buf,
+	.buf_len = __LOG_BUF_LEN,
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_log_state.wait),
+};
+
+/* kdump relies on some log_* symbols, let's make it happy */
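+/*
+ * Each alias emits a global absolute symbol <name> that resolves to
+ * &inst.memb, so tools looking up e.g. "log_buf" by name still find
+ * the field now embedded in init_log_state.
+ */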
+#define DEFINE_STRUCT_MEMBER_ALIAS(name, inst, memb)			\
+static void ____ ## name ## _definition(void) __attribute__((used));	\
+static void ____ ## name ## _definition(void)				\
+{									\
+	asm (".globl " #name "\n\t.set " #name ", " #inst "+%c0"	\
+	     : : "g" (offsetof(typeof(inst), memb)));			\
+}									\
+extern typeof(inst.memb) name;
+DEFINE_STRUCT_MEMBER_ALIAS(log_buf, init_log_state, buf);
+DEFINE_STRUCT_MEMBER_ALIAS(log_buf_len, init_log_state, buf_len);
+DEFINE_STRUCT_MEMBER_ALIAS(log_first_idx, init_log_state, first_idx);
+DEFINE_STRUCT_MEMBER_ALIAS(log_next_idx, init_log_state, next_idx);
+#undef DEFINE_STRUCT_MEMBER_ALIAS
+
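+/* return the current VE's log state, falling back to the host's */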
+static inline struct log_state *ve_log_state(void)
+{
+	struct log_state *log = &init_log_state;
+#ifdef CONFIG_VE
+	if (get_exec_env()->log_state)
+		log = get_exec_env()->log_state;
+#endif
+	return log;
+}
+
+void log_poll_wait(struct file *filp, poll_table *p)
+{
+	poll_wait(filp, &ve_log_state()->wait, p);
+}
 
 /* Return log buffer address */
 char *log_buf_addr_get(void)
@@ -281,23 +344,23 @@ static char *log_dict(const struct log *msg)
 }
 
 /* get record by index; idx must point to valid msg */
-static struct log *log_from_idx(u32 idx)
+static struct log *log_from_idx(struct log_state *log, u32 idx)
 {
-	struct log *msg = (struct log *)(log_buf + idx);
+	struct log *msg = (struct log *)(log->buf + idx);
 
 	/*
 	 * A length == 0 record is the end of buffer marker. Wrap around and
 	 * read the message at the start of the buffer.
 	 */
 	if (!msg->len)
-		return (struct log *)log_buf;
+		return (struct log *)log->buf;
 	return msg;
 }
 
 /* get next record; idx must point to valid msg */
-static u32 log_next(u32 idx)
+static u32 log_next(struct log_state *log, u32 idx)
 {
-	struct log *msg = (struct log *)(log_buf + idx);
+	struct log *msg = (struct log *)(log->buf + idx);
 
 	/* length == 0 indicates the end of the buffer; wrap */
 	/*
@@ -306,14 +369,15 @@ static u32 log_next(u32 idx)
 	 * return the one after that.
 	 */
 	if (!msg->len) {
-		msg = (struct log *)log_buf;
+		msg = (struct log *)log->buf;
 		return msg->len;
 	}
 	return idx + msg->len;
 }
 
 /* insert record into the buffer, discard old ones, update heads */
-static void log_store(int facility, int level,
+static void log_store(struct log_state *log,
+		      int facility, int level,
 		      enum log_flags flags, u64 ts_nsec,
 		      const char *dict, u16 dict_len,
 		      const char *text, u16 text_len)
@@ -326,34 +390,35 @@ static void log_store(int facility, int level,
 	pad_len = (-size) & (LOG_ALIGN - 1);
 	size += pad_len;
 
-	while (log_first_seq < log_next_seq) {
+	while (log->first_seq < log->next_seq) {
 		u32 free;
 
-		if (log_next_idx > log_first_idx)
-			free = max(log_buf_len - log_next_idx, log_first_idx);
+		if (log->next_idx > log->first_idx)
+			free = max(log->buf_len - log->next_idx,
+				   log->first_idx);
 		else
-			free = log_first_idx - log_next_idx;
+			free = log->first_idx - log->next_idx;
 
 		if (free > size + sizeof(struct log))
 			break;
 
 		/* drop old messages until we have enough contiguous space */
-		log_first_idx = log_next(log_first_idx);
-		log_first_seq++;
+		log->first_idx = log_next(log, log->first_idx);
+		log->first_seq++;
 	}
 
-	if (log_next_idx + size + sizeof(struct log) >= log_buf_len) {
+	if (log->next_idx + size + sizeof(struct log) >= log->buf_len) {
 		/*
 		 * This message + an additional empty header does not fit
 		 * at the end of the buffer. Add an empty header with len == 0
 		 * to signify a wrap around.
 		 */
-		memset(log_buf + log_next_idx, 0, sizeof(struct log));
-		log_next_idx = 0;
+		memset(log->buf + log->next_idx, 0, sizeof(struct log));
+		log->next_idx = 0;
 	}
 
 	/* fill message */
-	msg = (struct log *)(log_buf + log_next_idx);
+	msg = (struct log *)(log->buf + log->next_idx);
 	memcpy(log_text(msg), text, text_len);
 	msg->text_len = text_len;
 	memcpy(log_dict(msg), dict, dict_len);
@@ -369,8 +434,8 @@ static void log_store(int facility, int level,
 	msg->len = sizeof(struct log) + text_len + dict_len + pad_len;
 
 	/* insert message */
-	log_next_idx += msg->len;
-	log_next_seq++;
+	log->next_idx += msg->len;
+	log->next_seq++;
 }
 
 #ifdef CONFIG_SECURITY_DMESG_RESTRICT
@@ -401,13 +466,13 @@ static int check_syslog_permissions(int type, bool from_file)
 		return 0;
 
 	if (syslog_action_restricted(type)) {
-		if (capable(CAP_SYSLOG))
+		if (ve_capable(CAP_SYSLOG))
 			return 0;
 		/*
 		 * For historical reasons, accept CAP_SYS_ADMIN too, with
 		 * a warning.
 		 */
-		if (capable(CAP_SYS_ADMIN)) {
+		if (ve_capable(CAP_SYS_ADMIN)) {
 			pr_warn_once("%s (%d): Attempt to access syslog with "
 				     "CAP_SYS_ADMIN but no CAP_SYSLOG "
 				     "(deprecated).\n",
@@ -439,6 +504,9 @@ static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv,
 	size_t len = iov_length(iv, count);
 	ssize_t ret = len;
 
+	if (ve_log_state() != &init_log_state)
+		return count;
+
 	if (len > LOG_LINE_MAX)
 		return -EINVAL;
 	buf = kmalloc(len+1, GFP_KERNEL);
@@ -488,6 +556,7 @@ out:
 static ssize_t devkmsg_read(struct file *file, char __user *buf,
 			    size_t count, loff_t *ppos)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user = file->private_data;
 	struct log *msg;
 	u64 ts_usec;
@@ -503,7 +572,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 	if (ret)
 		return ret;
 	raw_spin_lock_irq(&logbuf_lock);
-	while (user->seq == log_next_seq) {
+	while (user->seq == log->next_seq) {
 		if (file->f_flags & O_NONBLOCK) {
 			ret = -EAGAIN;
 			raw_spin_unlock_irq(&logbuf_lock);
@@ -511,23 +580,23 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		}
 
 		raw_spin_unlock_irq(&logbuf_lock);
-		ret = wait_event_interruptible(log_wait,
-					       user->seq != log_next_seq);
+		ret = wait_event_interruptible(log->wait,
+				user->seq != log->next_seq);
 		if (ret)
 			goto out;
 		raw_spin_lock_irq(&logbuf_lock);
 	}
 
-	if (user->seq < log_first_seq) {
+	if (user->seq < log->first_seq) {
 		/* our last seen message is gone, return error and reset */
-		user->idx = log_first_idx;
-		user->seq = log_first_seq;
+		user->idx = log->first_idx;
+		user->seq = log->first_seq;
 		ret = -EPIPE;
 		raw_spin_unlock_irq(&logbuf_lock);
 		goto out;
 	}
 
-	msg = log_from_idx(user->idx);
+	msg = log_from_idx(log, user->idx);
 	ts_usec = msg->ts_nsec;
 	do_div(ts_usec, 1000);
 
@@ -588,7 +657,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf,
 		user->buf[len++] = '\n';
 	}
 
-	user->idx = log_next(user->idx);
+	user->idx = log_next(log, user->idx);
 	user->seq++;
 	raw_spin_unlock_irq(&logbuf_lock);
 
@@ -609,6 +678,7 @@ out:
 
 static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user = file->private_data;
 	loff_t ret = 0;
 
@@ -621,8 +691,8 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 	switch (whence) {
 	case SEEK_SET:
 		/* the first record */
-		user->idx = log_first_idx;
-		user->seq = log_first_seq;
+		user->idx = log->first_idx;
+		user->seq = log->first_seq;
 		break;
 	case SEEK_DATA:
 		/*
@@ -630,13 +700,13 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 		 * like issued by 'dmesg -c'. Reading /dev/kmsg itself
 		 * changes no global state, and does not clear anything.
 		 */
-		user->idx = clear_idx;
-		user->seq = clear_seq;
+		user->idx = log->clear_idx;
+		user->seq = log->clear_seq;
 		break;
 	case SEEK_END:
 		/* after the last record */
-		user->idx = log_next_idx;
-		user->seq = log_next_seq;
+		user->idx = log->next_idx;
+		user->seq = log->next_seq;
 		break;
 	default:
 		ret = -EINVAL;
@@ -647,18 +717,19 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence)
 
 static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user = file->private_data;
 	int ret = 0;
 
 	if (!user)
 		return POLLERR|POLLNVAL;
 
-	poll_wait(file, &log_wait, wait);
+	poll_wait(file, &log->wait, wait);
 
 	raw_spin_lock_irq(&logbuf_lock);
-	if (user->seq < log_next_seq) {
+	if (user->seq < log->next_seq) {
 		/* return error when data has vanished underneath us */
-		if (user->seq < log_first_seq)
+		if (user->seq < log->first_seq)
 			ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI;
 		else
 			ret = POLLIN|POLLRDNORM;
@@ -670,6 +741,7 @@ static unsigned int devkmsg_poll(struct file *file, poll_table *wait)
 
 static int devkmsg_open(struct inode *inode, struct file *file)
 {
+	struct log_state *log = ve_log_state();
 	struct devkmsg_user *user;
 	int err;
 
@@ -689,8 +761,8 @@ static int devkmsg_open(struct inode *inode, struct file *file)
 	mutex_init(&user->lock);
 
 	raw_spin_lock_irq(&logbuf_lock);
-	user->idx = log_first_idx;
-	user->seq = log_first_seq;
+	user->idx = log->first_idx;
+	user->seq = log->first_seq;
 	raw_spin_unlock_irq(&logbuf_lock);
 
 	file->private_data = user;
@@ -745,6 +817,19 @@ void log_buf_kexec_setup(void)
 }
 #endif
 
+static int __init setup_console_silencelevel(char *str)
+{
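+	/* "silencelevel=<n>" kernel parameter sets console_silence_loglevel */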
+	int level;
+
+	if (get_option(&str, &level) != 1)
+		return 0;
+
+	console_silence_loglevel = level;
+	return 1;
+}
+
+__setup("silencelevel=", setup_console_silencelevel);
+
 /* requested log_buf_len from kernel cmdline */
 static unsigned long __initdata new_log_buf_len;
 
@@ -755,7 +840,7 @@ static int __init log_buf_len_setup(char *str)
 
 	if (size)
 		size = roundup_pow_of_two(size);
-	if (size > log_buf_len)
+	if (size > init_log_state.buf_len)
 		new_log_buf_len = size;
 
 	return 0;
@@ -764,6 +849,7 @@ early_param("log_buf_len", log_buf_len_setup);
 
 void __init setup_log_buf(int early)
 {
+	struct log_state *log = &init_log_state;
 	unsigned long flags;
 	char *new_log_buf;
 	int free;
@@ -789,14 +875,14 @@ void __init setup_log_buf(int early)
 	}
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	log_buf_len = new_log_buf_len;
-	log_buf = new_log_buf;
+	log->buf_len = new_log_buf_len;
+	log->buf = new_log_buf;
 	new_log_buf_len = 0;
-	free = __LOG_BUF_LEN - log_next_idx;
-	memcpy(log_buf, __log_buf, __LOG_BUF_LEN);
+	free = __LOG_BUF_LEN - log->next_idx;
+	memcpy(log->buf, __log_buf, __LOG_BUF_LEN);
 	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
-	pr_info("log_buf_len: %d\n", log_buf_len);
+	pr_info("log_buf_len: %d\n", log->buf_len);
 	pr_info("early log buf free: %d(%d%%)\n",
 		free, (free * 100) / __LOG_BUF_LEN);
 }
@@ -976,7 +1062,7 @@ static size_t msg_print_text(const struct log *msg, enum log_flags prev,
 	return len;
 }
 
-static int syslog_print(char __user *buf, int size)
+static int syslog_print(struct log_state *log, char __user *buf, int size)
 {
 	char *text;
 	struct log *msg;
@@ -991,33 +1077,33 @@ static int syslog_print(char __user *buf, int size)
 		size_t skip;
 
 		raw_spin_lock_irq(&logbuf_lock);
-		if (syslog_seq < log_first_seq) {
+		if (log->syslog_seq < log->first_seq) {
 			/* messages are gone, move to first one */
-			syslog_seq = log_first_seq;
-			syslog_idx = log_first_idx;
-			syslog_prev = 0;
-			syslog_partial = 0;
+			log->syslog_seq = log->first_seq;
+			log->syslog_idx = log->first_idx;
+			log->syslog_prev = 0;
+			log->syslog_partial = 0;
 		}
-		if (syslog_seq == log_next_seq) {
+		if (log->syslog_seq == log->next_seq) {
 			raw_spin_unlock_irq(&logbuf_lock);
 			break;
 		}
 
-		skip = syslog_partial;
-		msg = log_from_idx(syslog_idx);
-		n = msg_print_text(msg, syslog_prev, true, text,
+		skip = log->syslog_partial;
+		msg = log_from_idx(log, log->syslog_idx);
+		n = msg_print_text(msg, log->syslog_prev, true, text,
 				   LOG_LINE_MAX + PREFIX_MAX);
-		if (n - syslog_partial <= size) {
+		if (n - log->syslog_partial <= size) {
 			/* message fits into buffer, move forward */
-			syslog_idx = log_next(syslog_idx);
-			syslog_seq++;
-			syslog_prev = msg->flags;
-			n -= syslog_partial;
-			syslog_partial = 0;
+			log->syslog_idx = log_next(log, log->syslog_idx);
+			log->syslog_seq++;
+			log->syslog_prev = msg->flags;
+			n -= log->syslog_partial;
+			log->syslog_partial = 0;
 		} else if (!len){
 			/* partial read(), remember position */
 			n = size;
-			syslog_partial += n;
+			log->syslog_partial += n;
 		} else
 			n = 0;
 		raw_spin_unlock_irq(&logbuf_lock);
@@ -1040,7 +1126,8 @@ static int syslog_print(char __user *buf, int size)
 	return len;
 }
 
-static int syslog_print_all(char __user *buf, int size, bool clear)
+static int syslog_print_all(struct log_state *log,
+			    char __user *buf, int size, bool clear)
 {
 	char *text;
 	int len = 0;
@@ -1056,48 +1143,48 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 		u32 idx;
 		enum log_flags prev;
 
-		if (clear_seq < log_first_seq) {
+		if (log->clear_seq < log->first_seq) {
 			/* messages are gone, move to first available one */
-			clear_seq = log_first_seq;
-			clear_idx = log_first_idx;
+			log->clear_seq = log->first_seq;
+			log->clear_idx = log->first_idx;
 		}
 
 		/*
 		 * Find first record that fits, including all following records,
 		 * into the user-provided buffer for this dump.
 		 */
-		seq = clear_seq;
-		idx = clear_idx;
+		seq = log->clear_seq;
+		idx = log->clear_idx;
 		prev = 0;
-		while (seq < log_next_seq) {
-			struct log *msg = log_from_idx(idx);
+		while (seq < log->next_seq) {
+			struct log *msg = log_from_idx(log, idx);
 
 			len += msg_print_text(msg, prev, true, NULL, 0);
 			prev = msg->flags;
-			idx = log_next(idx);
+			idx = log_next(log, idx);
 			seq++;
 		}
 
 		/* move first record forward until length fits into the buffer */
-		seq = clear_seq;
-		idx = clear_idx;
+		seq = log->clear_seq;
+		idx = log->clear_idx;
 		prev = 0;
-		while (len > size && seq < log_next_seq) {
-			struct log *msg = log_from_idx(idx);
+		while (len > size && seq < log->next_seq) {
+			struct log *msg = log_from_idx(log, idx);
 
 			len -= msg_print_text(msg, prev, true, NULL, 0);
 			prev = msg->flags;
-			idx = log_next(idx);
+			idx = log_next(log, idx);
 			seq++;
 		}
 
 		/* last message fitting into this dump */
-		next_seq = log_next_seq;
+		next_seq = log->next_seq;
 
 		len = 0;
 		prev = 0;
 		while (len >= 0 && seq < next_seq) {
-			struct log *msg = log_from_idx(idx);
+			struct log *msg = log_from_idx(log, idx);
 			int textlen;
 
 			textlen = msg_print_text(msg, prev, true, text,
@@ -1106,7 +1193,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 				len = textlen;
 				break;
 			}
-			idx = log_next(idx);
+			idx = log_next(log, idx);
 			seq++;
 			prev = msg->flags;
 
@@ -1117,18 +1204,18 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 				len += textlen;
 			raw_spin_lock_irq(&logbuf_lock);
 
-			if (seq < log_first_seq) {
+			if (seq < log->first_seq) {
 				/* messages are gone, move to next one */
-				seq = log_first_seq;
-				idx = log_first_idx;
+				seq = log->first_seq;
+				idx = log->first_idx;
 				prev = 0;
 			}
 		}
 	}
 
 	if (clear) {
-		clear_seq = log_next_seq;
-		clear_idx = log_next_idx;
+		log->clear_seq = log->next_seq;
+		log->clear_idx = log->next_idx;
 	}
 	raw_spin_unlock_irq(&logbuf_lock);
 
@@ -1138,6 +1225,7 @@ static int syslog_print_all(char __user *buf, int size, bool clear)
 
 int do_syslog(int type, char __user *buf, int len, bool from_file)
 {
+	struct log_state *log = ve_log_state();
 	bool clear = false;
 	static int saved_console_loglevel = LOGLEVEL_DEFAULT;
 	int error;
@@ -1150,6 +1238,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	if (error)
 		return error;
 
+	error = 0;
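+	/* a VE has no console: treat CONSOLE_OFF/ON as successful no-ops */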
+	if (log != &init_log_state &&
+	    (type == SYSLOG_ACTION_CONSOLE_OFF ||
+	     type == SYSLOG_ACTION_CONSOLE_ON))
+		goto out;
+
 	switch (type) {
 	case SYSLOG_ACTION_CLOSE:	/* Close log */
 		break;
@@ -1166,11 +1260,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			error = -EFAULT;
 			goto out;
 		}
-		error = wait_event_interruptible(log_wait,
-						 syslog_seq != log_next_seq);
+		error = wait_event_interruptible(log->wait,
+				log->syslog_seq != log->next_seq);
 		if (error)
 			goto out;
-		error = syslog_print(buf, len);
+		error = syslog_print(log, buf, len);
 		break;
 	/* Read/clear last kernel messages */
 	case SYSLOG_ACTION_READ_CLEAR:
@@ -1188,11 +1282,11 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			error = -EFAULT;
 			goto out;
 		}
-		error = syslog_print_all(buf, len, clear);
+		error = syslog_print_all(log, buf, len, clear);
 		break;
 	/* Clear ring buffer */
 	case SYSLOG_ACTION_CLEAR:
-		syslog_print_all(NULL, 0, true);
+		syslog_print_all(log, NULL, 0, true);
 		break;
 	/* Disable logging to console */
 	case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1212,6 +1306,10 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 		error = -EINVAL;
 		if (len < 1 || len > 8)
 			goto out;
+		error = 0;
+		/* VE has no console, so return success */
+		if (log != &init_log_state)
+			goto out;
 		if (len < minimum_console_loglevel)
 			len = minimum_console_loglevel;
 		console_loglevel = len;
@@ -1222,12 +1320,12 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 	/* Number of chars in the log buffer */
 	case SYSLOG_ACTION_SIZE_UNREAD:
 		raw_spin_lock_irq(&logbuf_lock);
-		if (syslog_seq < log_first_seq) {
+		if (log->syslog_seq < log->first_seq) {
 			/* messages are gone, move to first one */
-			syslog_seq = log_first_seq;
-			syslog_idx = log_first_idx;
-			syslog_prev = 0;
-			syslog_partial = 0;
+			log->syslog_seq = log->first_seq;
+			log->syslog_idx = log->first_idx;
+			log->syslog_prev = 0;
+			log->syslog_partial = 0;
 		}
 		if (from_file) {
 			/*
@@ -1235,28 +1333,28 @@ int do_syslog(int type, char __user *buf, int len, bool from_file)
 			 * for pending data, not the size; return the count of
 			 * records, not the length.
 			 */
-			error = log_next_idx - syslog_idx;
+			error = log->next_idx - log->syslog_idx;
 		} else {
-			u64 seq = syslog_seq;
-			u32 idx = syslog_idx;
-			enum log_flags prev = syslog_prev;
+			u64 seq = log->syslog_seq;
+			u32 idx = log->syslog_idx;
+			enum log_flags prev = log->syslog_prev;
 
 			error = 0;
-			while (seq < log_next_seq) {
-				struct log *msg = log_from_idx(idx);
+			while (seq < log->next_seq) {
+				struct log *msg = log_from_idx(log, idx);
 
 				error += msg_print_text(msg, prev, true, NULL, 0);
-				idx = log_next(idx);
+				idx = log_next(log, idx);
 				seq++;
 				prev = msg->flags;
 			}
-			error -= syslog_partial;
+			error -= log->syslog_partial;
 		}
 		raw_spin_unlock_irq(&logbuf_lock);
 		break;
 	/* Size of the log buffer */
 	case SYSLOG_ACTION_SIZE_BUFFER:
-		error = log_buf_len;
+		error = log->buf_len;
 		break;
 	default:
 		error = -EINVAL;
@@ -1389,113 +1487,143 @@ static inline void printk_delay(void)
 	}
 }
 
-/*
- * Continuation lines are buffered, and not committed to the record buffer
- * until the line is complete, or a race forces it. The line fragments
- * though, are printed immediately to the consoles to ensure everything has
- * reached the console in case of a kernel crash.
- */
-static struct cont {
-	char buf[LOG_LINE_MAX];
-	size_t len;			/* length == 0 means unused buffer */
-	size_t cons;			/* bytes written to console */
-	struct task_struct *owner;	/* task of first print*/
-	u64 ts_nsec;			/* time of first print */
-	u8 level;			/* log level of first message */
-	u8 facility;			/* log level of first message */
-	enum log_flags flags;		/* prefix, newline flags */
-	bool flushed:1;			/* buffer sealed and committed */
-} cont;
-
-static void cont_flush(enum log_flags flags)
+static void cont_flush(struct log_state *log, enum log_flags flags)
 {
-	if (cont.flushed)
+	struct cont *c = &log->cont;
+
+	if (c->flushed)
 		return;
-	if (cont.len == 0)
+	if (c->len == 0)
 		return;
 
-	if (cont.cons) {
+	if (c->cons) {
 		/*
 		 * If a fragment of this line was directly flushed to the
 		 * console; wait for the console to pick up the rest of the
 		 * line. LOG_NOCONS suppresses a duplicated output.
 		 */
-		log_store(cont.facility, cont.level, flags | LOG_NOCONS,
-			  cont.ts_nsec, NULL, 0, cont.buf, cont.len);
-		cont.flags = flags;
-		cont.flushed = true;
+		log_store(log, c->facility, c->level, flags | LOG_NOCONS,
+			  c->ts_nsec, NULL, 0, c->buf, c->len);
+		c->flags = flags;
+		c->flushed = true;
 	} else {
 		/*
 		 * If no fragment of this line ever reached the console,
 		 * just submit it to the store and free the buffer.
 		 */
-		log_store(cont.facility, cont.level, flags, 0,
-			  NULL, 0, cont.buf, cont.len);
-		cont.len = 0;
+		log_store(log, c->facility, c->level, flags, 0,
+			  NULL, 0, c->buf, c->len);
+		c->len = 0;
 	}
 }
 
-static bool cont_add(int facility, int level, const char *text, size_t len)
+static bool cont_add(struct log_state *log,
+		     int facility, int level, const char *text, size_t len)
 {
-	if (cont.len && cont.flushed)
+	struct cont *c = &log->cont;
+
+	if (c->len && c->flushed)
 		return false;
 
-	if (cont.len + len > sizeof(cont.buf)) {
+	if (c->len + len > sizeof(c->buf)) {
 		/* the line gets too long, split it up in separate records */
-		cont_flush(LOG_CONT);
+		cont_flush(log, LOG_CONT);
 		return false;
 	}
 
-	if (!cont.len) {
-		cont.facility = facility;
-		cont.level = level;
-		cont.owner = current;
-		cont.ts_nsec = local_clock();
-		cont.flags = 0;
-		cont.cons = 0;
-		cont.flushed = false;
+	if (!c->len) {
+		c->facility = facility;
+		c->level = level;
+		c->owner = current;
+		c->ts_nsec = local_clock();
+		c->flags = 0;
+		c->cons = 0;
+		c->flushed = false;
 	}
 
-	memcpy(cont.buf + cont.len, text, len);
-	cont.len += len;
+	memcpy(c->buf + c->len, text, len);
+	c->len += len;
 
-	if (cont.len > (sizeof(cont.buf) * 80) / 100)
-		cont_flush(LOG_CONT);
+	if (c->len > (sizeof(c->buf) * 80) / 100)
+		cont_flush(log, LOG_CONT);
 
 	return true;
 }
 
-static size_t cont_print_text(char *text, size_t size)
+static size_t cont_print_text(struct log_state *log, char *text, size_t size)
 {
+	struct cont *c = &log->cont;
 	size_t textlen = 0;
 	size_t len;
 
-	if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
-		textlen += print_time(cont.ts_nsec, text);
+	if (c->cons == 0 && (log->console_prev & LOG_NEWLINE)) {
+		textlen += print_time(c->ts_nsec, text);
 		size -= textlen;
 	}
 
-	len = cont.len - cont.cons;
+	len = c->len - c->cons;
 	if (len > 0) {
 		if (len+1 > size)
 			len = size-1;
-		memcpy(text + textlen, cont.buf + cont.cons, len);
+		memcpy(text + textlen, c->buf + c->cons, len);
 		textlen += len;
-		cont.cons = cont.len;
+		c->cons = c->len;
 	}
 
-	if (cont.flushed) {
-		if (cont.flags & LOG_NEWLINE)
+	if (c->flushed) {
+		if (c->flags & LOG_NEWLINE)
 			text[textlen++] = '\n';
 		/* got everything, release buffer */
-		cont.len = 0;
+		c->len = 0;
 	}
 	return textlen;
 }
 
-asmlinkage int vprintk_emit(int facility, int level,
-			    const char *dict, size_t dictlen,
-			    const char *fmt, va_list args)
+#ifdef CONFIG_VE
+int ve_log_init(struct ve_struct *ve)
+{
+	struct log_state *log;
+
+	log = kzalloc(sizeof(*log), GFP_KERNEL);
+	if (!log)
+		return -ENOMEM;
+
+	init_waitqueue_head(&log->wait);
+	log->buf_len = VE_LOG_BUF_LEN;
+	/* buf will be initialized later by log_state_init() */
+
+	ve->log_state = log;
+	return 0;
+}
+EXPORT_SYMBOL(ve_log_init);
+
+void ve_log_destroy(struct ve_struct *ve)
+{
+	struct log_state *log = ve->log_state;
+
+	kfree(log->buf);
+	kfree(log);
+}
+EXPORT_SYMBOL(ve_log_destroy);
+#endif
+
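+/*
+ * VE log buffers are allocated lazily on first use, under logbuf_lock
+ * with interrupts off -- hence GFP_ATOMIC.  The host buffer is static.
+ */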
+static int log_state_init(struct log_state *log)
+{
+#ifdef CONFIG_VE
+	if (log->buf)
+		return 0;
+
+	log->buf = kzalloc(log->buf_len, GFP_ATOMIC);
+	if (!log->buf)
+		return -ENOMEM;
+#endif
+	return 0;
+}
+
+static int __vprintk_emit(struct log_state *log,
+			  int facility, int level,
+			  const char *dict, size_t dictlen,
+			  const char *fmt, va_list args)
 {
 	static int recursion_bug;
 	static char textbuf[LOG_LINE_MAX];
@@ -1508,6 +1636,8 @@ asmlinkage int vprintk_emit(int facility, int level,
 	bool in_sched = false;
 	/* cpu currently holding logbuf_lock in this function */
 	static volatile unsigned int logbuf_cpu = UINT_MAX;
+	bool need_wake = false;
+	int err;
 
 	if (level == LOGLEVEL_SCHED) {
 		level = LOGLEVEL_DEFAULT;
@@ -1544,6 +1674,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 	raw_spin_lock(&logbuf_lock);
 	logbuf_cpu = this_cpu;
 
+	err = log_state_init(log);
+	if (err) {
+		logbuf_cpu = UINT_MAX;
+		raw_spin_unlock(&logbuf_lock);
+		lockdep_on();
+		local_irq_restore(flags);
+		return err;
+	}
+
 	if (recursion_bug) {
 		static const char recursion_msg[] =
 			"BUG: recent printk recursion!";
@@ -1551,7 +1690,7 @@ asmlinkage int vprintk_emit(int facility, int level,
 		recursion_bug = 0;
 		printed_len += strlen(recursion_msg);
 		/* emit KERN_CRIT message */
-		log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
+		log_store(log, 0, 2, LOG_PREFIX|LOG_NEWLINE, 0,
 			  NULL, 0, recursion_msg, printed_len);
 	}
 
@@ -1599,12 +1738,13 @@ asmlinkage int vprintk_emit(int facility, int level,
 		 * Flush the conflicting buffer. An earlier newline was missing,
 		 * or another task also prints continuation lines.
 		 */
-		if (cont.len && (lflags & LOG_PREFIX || cont.owner != current))
-			cont_flush(LOG_NEWLINE);
+		if (log->cont.len && (lflags & LOG_PREFIX ||
+				     log->cont.owner != current))
+			cont_flush(log, LOG_NEWLINE);
 
 		/* buffer line if possible, otherwise store it right away */
-		if (!cont_add(facility, level, text, text_len))
-			log_store(facility, level, lflags | LOG_CONT, 0,
+		if (!cont_add(log, facility, level, text, text_len))
+			log_store(log, facility, level, lflags | LOG_CONT, 0,
 				  dict, dictlen, text, text_len);
 	} else {
 		bool stored = false;
@@ -1615,14 +1755,15 @@ asmlinkage int vprintk_emit(int facility, int level,
 		 * there was a race with interrupts (prefix == true) then just
 		 * flush it out and store this line separately.
 		 */
-		if (cont.len && cont.owner == current) {
+		if (log->cont.len && log->cont.owner == current) {
 			if (!(lflags & LOG_PREFIX))
-				stored = cont_add(facility, level, text, text_len);
-			cont_flush(LOG_NEWLINE);
+				stored = cont_add(log, facility, level,
+						  text, text_len);
+			cont_flush(log, LOG_NEWLINE);
 		}
 
 		if (!stored)
-			log_store(facility, level, lflags, 0,
+			log_store(log, facility, level, lflags, 0,
 				  dict, dictlen, text, text_len);
 	}
 	printed_len += text_len;
@@ -1648,14 +1789,39 @@ asmlinkage int vprintk_emit(int facility, int level,
 		 * semaphore.  The release will print out buffers and wake up
 		 * /dev/kmsg and syslog() users.
 		 */
-		if (console_trylock_for_printk())
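+		/*
+		 * A per-VE buffer has no console to flush to: just note the
+		 * new messages and wake readers once the locks are dropped.
+		 */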
+		if (log != &init_log_state) {
+			raw_spin_lock_irqsave(&logbuf_lock, flags);
+			if (log->seen_seq != log->next_seq && !oops_in_progress) {
+				log->seen_seq = log->next_seq;
+				need_wake = true;
+			}
+			logbuf_cpu = UINT_MAX;
+			raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+		} else if (console_trylock_for_printk())
 			console_unlock();
 		preempt_enable();
 		lockdep_on();
+
+		if (need_wake)
+			wake_up_interruptible(&log->wait);
 	}
 
 	return printed_len;
 }
+
+static int __vprintk(const char *fmt, va_list args)
+{
+	return __vprintk_emit(ve_log_state(), 0, -1, NULL, 0, fmt, args);
+}
+
+asmlinkage int vprintk_emit(int facility, int level,
+			    const char *dict, size_t dictlen,
+			    const char *fmt, va_list args)
+{
+	return __vprintk_emit(&init_log_state,
+			      facility, level, dict, dictlen, fmt, args);
+}
+
 EXPORT_SYMBOL(vprintk_emit);
 
 asmlinkage int vprintk(const char *fmt, va_list args)
@@ -1703,6 +1869,53 @@ EXPORT_SYMBOL_GPL(vprintk_default);
  */
 DEFINE_PER_CPU(printk_func_t, printk_func) = vprintk_default;
 
+asmlinkage int ve_vprintk(int dst, const char *fmt, va_list args)
+{
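+	/* @dst routes the message: VE0_LOG = host log, VE_LOG = VE log */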
+	va_list args2;
+	int r = 0;
+
+	va_copy(args2, args);
+	if (ve_is_super(get_exec_env()) || (dst & VE0_LOG))
+		r = vprintk(fmt, args);
+	if (!ve_is_super(get_exec_env()) && (dst & VE_LOG))
+		r = __vprintk(fmt, args2);
+
+	return r;
+}
+
+/*
+ * Do not use from scheduler code - it can lead to deadlocks.
+ */
+asmlinkage int ve_printk(int dst, const char *fmt, ...)
+{
+	va_list args;
+	int r;
+
+	va_start(args, fmt);
+	r = ve_vprintk(dst, fmt, args);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(ve_printk);
+
+asmlinkage int ve_log_printk(struct ve_struct *ve, const char *fmt, ...)
+{
+	struct log_state *log = &init_log_state;
+	va_list args;
+	int r;
+
+	if (likely(ve && ve->log_state))
+		log = ve->log_state;
+
+	va_start(args, fmt);
+	r = __vprintk_emit(log, 0, -1, NULL, 0, fmt, args);
+	va_end(args);
+
+	return r;
+}
+EXPORT_SYMBOL(ve_log_printk);
+
 /**
  * printk - print a kernel message
  * @fmt: format string
@@ -1745,28 +1958,36 @@ EXPORT_SYMBOL(printk);
 
 #define LOG_LINE_MAX		0
 #define PREFIX_MAX		0
-#define LOG_LINE_MAX 0
-static u64 syslog_seq;
-static u32 syslog_idx;
-static u64 console_seq;
-static u32 console_idx;
-static enum log_flags syslog_prev;
-static u64 log_first_seq;
-static u32 log_first_idx;
-static u64 log_next_seq;
-static enum log_flags console_prev;
-static struct cont {
+#define LOG_LINE_MAX		0
+struct cont {
 	size_t len;
 	size_t cons;
 	u8 level;
 	bool flushed:1;
-} cont;
-static struct log *log_from_idx(u32 idx) { return NULL; }
-static u32 log_next(u32 idx) { return 0; }
+};
+static struct log_state {
+	u64 syslog_seq;
+	u32 syslog_idx;
+	enum log_flags syslog_prev;
+	u64 first_seq;
+	u32 first_idx;
+	u64 next_seq;
+	u64 console_seq;
+	u32 console_idx;
+	enum log_flags console_prev;
+	u64 seen_seq;
+	struct cont cont;
+	wait_queue_head_t wait;
+} init_log_state = {
+	.wait = __WAIT_QUEUE_HEAD_INITIALIZER(init_log_state.wait),
+};
+static struct log *log_from_idx(struct log_state *log, u32 idx) { return NULL; }
+static u32 log_next(struct log_state *log, u32 idx) { return 0; }
 static void call_console_drivers(int level, const char *text, size_t len) {}
 static size_t msg_print_text(const struct log *msg, enum log_flags prev,
 			     bool syslog, char *buf, size_t size) { return 0; }
-static size_t cont_print_text(char *text, size_t size) { return 0; }
+static size_t cont_print_text(struct log_state *log,
+			      char *text, size_t size) { return 0; }
 
 #endif /* CONFIG_PRINTK */
 
@@ -2026,14 +2247,14 @@ int is_console_locked(void)
 	return console_locked;
 }
 
-static void console_cont_flush(char *text, size_t size)
+static void console_cont_flush(struct log_state *log, char *text, size_t size)
 {
 	unsigned long flags;
 	size_t len;
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
 
-	if (!cont.len)
+	if (!log->cont.len)
 		goto out;
 
 	/*
@@ -2041,13 +2262,13 @@ static void console_cont_flush(char *text, size_t size)
 	 * busy. The earlier ones need to be printed before this one, we
 	 * did not flush any fragment so far, so just let it queue up.
 	 */
-	if (console_seq < log_next_seq && !cont.cons)
+	if (log->console_seq < log->next_seq && !log->cont.cons)
 		goto out;
 
-	len = cont_print_text(text, size);
+	len = cont_print_text(log, text, size);
 	raw_spin_unlock(&logbuf_lock);
 	stop_critical_timings();
-	call_console_drivers(cont.level, text, len);
+	call_console_drivers(log->cont.level, text, len);
 	start_critical_timings();
 	local_irq_restore(flags);
 	return;
@@ -2072,9 +2293,10 @@ out:
 void console_unlock(void)
 {
 	static char text[LOG_LINE_MAX + PREFIX_MAX];
-	static u64 seen_seq;
+	struct log_state *log = &init_log_state;
 	unsigned long flags;
 	bool wake_klogd = false;
+	bool first = true;
 	bool retry;
 	unsigned cnt;
 
@@ -2086,7 +2308,7 @@ void console_unlock(void)
 	console_may_schedule = 0;
 
 	/* flush buffered message fragment immediately to console */
-	console_cont_flush(text, sizeof(text));
+	console_cont_flush(log, text, sizeof(text));
 again:
 	cnt = 5;
 	for (;;) {
@@ -2094,49 +2316,54 @@ again:
 		size_t len;
 		int level;
 
+		if (first)
+			first = false;
+		else
+			touch_all_softlockup_watchdogs();
+
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
-		if (seen_seq != log_next_seq) {
+		if (log->seen_seq != log->next_seq) {
 			wake_klogd = true;
-			seen_seq = log_next_seq;
+			log->seen_seq = log->next_seq;
 		}
 
-		if (console_seq < log_first_seq) {
+		if (log->console_seq < log->first_seq) {
 			/* messages are gone, move to first one */
-			console_seq = log_first_seq;
-			console_idx = log_first_idx;
-			console_prev = 0;
+			log->console_seq = log->first_seq;
+			log->console_idx = log->first_idx;
+			log->console_prev = 0;
 		}
 skip:
-		if (console_seq == log_next_seq)
+		if (log->console_seq == log->next_seq)
 			break;
 
 		if (--cnt == 0)
 			break;	/* Someone else printk's like crazy */
 
-		msg = log_from_idx(console_idx);
+		msg = log_from_idx(log, log->console_idx);
 		if (msg->flags & LOG_NOCONS) {
 			/*
 			 * Skip record we have buffered and already printed
 			 * directly to the console when we received it.
 			 */
-			console_idx = log_next(console_idx);
-			console_seq++;
+			log->console_idx = log_next(log, log->console_idx);
+			log->console_seq++;
 			/*
 			 * We will get here again when we register a new
 			 * CON_PRINTBUFFER console. Clear the flag so we
 			 * will properly dump everything later.
 			 */
 			msg->flags &= ~LOG_NOCONS;
-			console_prev = msg->flags;
+			log->console_prev = msg->flags;
 			goto skip;
 		}
 
 		level = msg->level;
-		len = msg_print_text(msg, console_prev, false,
+		len = msg_print_text(msg, log->console_prev, false,
 				     text, sizeof(text));
-		console_idx = log_next(console_idx);
-		console_seq++;
-		console_prev = msg->flags;
+		log->console_idx = log_next(log, log->console_idx);
+		log->console_seq++;
+		log->console_prev = msg->flags;
 		raw_spin_unlock(&logbuf_lock);
 
 		stop_critical_timings();	/* don't trace print latency */
@@ -2162,7 +2389,7 @@ skip:
 	 * flush, no worries.
 	 */
 	raw_spin_lock(&logbuf_lock);
-	retry = console_seq != log_next_seq;
+	retry = log->console_seq != log->next_seq;
 	raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
 	if (retry && console_trylock())
@@ -2177,7 +2404,7 @@ skip:
 		cnt = 9999;
 		while (--cnt != 0) {
 			cpu_relax();
-			if (console_seq == log_next_seq) {
+			if (log->console_seq == log->next_seq) {
 				/* Good, other CPU entered "for(;;)" loop */
 				goto out;
 			}
@@ -2305,6 +2532,7 @@ early_param("keep_bootcon", keep_bootcon_setup);
  */
 void register_console(struct console *newcon)
 {
+	struct log_state *log = &init_log_state;
 	int i;
 	unsigned long flags;
 	struct console *bcon = NULL;
@@ -2425,9 +2653,9 @@ void register_console(struct console *newcon)
 		 * for us.
 		 */
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
-		console_seq = syslog_seq;
-		console_idx = syslog_idx;
-		console_prev = syslog_prev;
+		log->console_seq = log->syslog_seq;
+		log->console_idx = log->syslog_idx;
+		log->console_prev = log->syslog_prev;
 		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 		/*
 		 * We're about to replay the log buffer.  Only do this to the
@@ -2540,7 +2768,7 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work)
 	}
 
 	if (pending & PRINTK_PENDING_WAKEUP)
-		wake_up_interruptible(&log_wait);
+		wake_up_interruptible(&init_log_state.wait);
 }
 
 static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
@@ -2551,7 +2779,7 @@ static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = {
 void wake_up_klogd(void)
 {
 	preempt_disable();
-	if (waitqueue_active(&log_wait)) {
+	if (waitqueue_active(&init_log_state.wait)) {
 		this_cpu_or(printk_pending, PRINTK_PENDING_WAKEUP);
 		irq_work_queue(&__get_cpu_var(wake_up_klogd_work));
 	}
@@ -2683,6 +2911,7 @@ module_param_named(always_kmsg_dump, always_kmsg_dump, bool, S_IRUGO | S_IWUSR);
  */
 void kmsg_dump(enum kmsg_dump_reason reason)
 {
+	struct log_state *log = &init_log_state;
 	struct kmsg_dumper *dumper;
 	unsigned long flags;
 
@@ -2698,10 +2927,10 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 		dumper->active = true;
 
 		raw_spin_lock_irqsave(&logbuf_lock, flags);
-		dumper->cur_seq = clear_seq;
-		dumper->cur_idx = clear_idx;
-		dumper->next_seq = log_next_seq;
-		dumper->next_idx = log_next_idx;
+		dumper->cur_seq = log->clear_seq;
+		dumper->cur_idx = log->clear_idx;
+		dumper->next_seq = log->next_seq;
+		dumper->next_idx = log->next_idx;
 		raw_spin_unlock_irqrestore(&logbuf_lock, flags);
 
 		/* invoke dumper which will iterate over records */
@@ -2735,6 +2964,7 @@ void kmsg_dump(enum kmsg_dump_reason reason)
 bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 			       char *line, size_t size, size_t *len)
 {
+	struct log_state *log = &init_log_state;
 	struct log *msg;
 	size_t l = 0;
 	bool ret = false;
@@ -2742,20 +2972,20 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog,
 	if (!dumper->active)
 		goto out;
 
-	if (dumper->cur_seq < log_first_seq) {
+	if (dumper->cur_seq < log->first_seq) {
 		/* messages are gone, move to first available one */
-		dumper->cur_seq = log_first_seq;
-		dumper->cur_idx = log_first_idx;
+		dumper->cur_seq = log->first_seq;
+		dumper->cur_idx = log->first_idx;
 	}
 
 	/* last entry */
-	if (dumper->cur_seq >= log_next_seq)
+	if (dumper->cur_seq >= log->next_seq)
 		goto out;
 
-	msg = log_from_idx(dumper->cur_idx);
+	msg = log_from_idx(log, dumper->cur_idx);
 	l = msg_print_text(msg, 0, syslog, line, size);
 
-	dumper->cur_idx = log_next(dumper->cur_idx);
+	dumper->cur_idx = log_next(log, dumper->cur_idx);
 	dumper->cur_seq++;
 	ret = true;
 out:
@@ -2817,6 +3047,7 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line);
 bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 			  char *buf, size_t size, size_t *len)
 {
+	struct log_state *log = &init_log_state;
 	unsigned long flags;
 	u64 seq;
 	u32 idx;
@@ -2830,10 +3061,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 		goto out;
 
 	raw_spin_lock_irqsave(&logbuf_lock, flags);
-	if (dumper->cur_seq < log_first_seq) {
+	if (dumper->cur_seq < log->first_seq) {
 		/* messages are gone, move to first available one */
-		dumper->cur_seq = log_first_seq;
-		dumper->cur_idx = log_first_idx;
+		dumper->cur_seq = log->first_seq;
+		dumper->cur_idx = log->first_idx;
 	}
 
 	/* last entry */
@@ -2847,10 +3078,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	idx = dumper->cur_idx;
 	prev = 0;
 	while (seq < dumper->next_seq) {
-		struct log *msg = log_from_idx(idx);
+		struct log *msg = log_from_idx(log, idx);
 
 		l += msg_print_text(msg, prev, true, NULL, 0);
-		idx = log_next(idx);
+		idx = log_next(log, idx);
 		seq++;
 		prev = msg->flags;
 	}
@@ -2860,10 +3091,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	idx = dumper->cur_idx;
 	prev = 0;
 	while (l > size && seq < dumper->next_seq) {
-		struct log *msg = log_from_idx(idx);
+		struct log *msg = log_from_idx(log, idx);
 
 		l -= msg_print_text(msg, prev, true, NULL, 0);
-		idx = log_next(idx);
+		idx = log_next(log, idx);
 		seq++;
 		prev = msg->flags;
 	}
@@ -2875,10 +3106,10 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog,
 	l = 0;
 	prev = 0;
 	while (seq < dumper->next_seq) {
-		struct log *msg = log_from_idx(idx);
+		struct log *msg = log_from_idx(log, idx);
 
 		l += msg_print_text(msg, prev, syslog, buf + l, size - l);
-		idx = log_next(idx);
+		idx = log_next(log, idx);
 		seq++;
 		prev = msg->flags;
 	}
@@ -2906,10 +3137,12 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer);
  */
 void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper)
 {
-	dumper->cur_seq = clear_seq;
-	dumper->cur_idx = clear_idx;
-	dumper->next_seq = log_next_seq;
-	dumper->next_idx = log_next_idx;
+	struct log_state *log = &init_log_state;
+
+	dumper->cur_seq = log->clear_seq;
+	dumper->cur_idx = log->clear_idx;
+	dumper->next_seq = log->next_seq;
+	dumper->next_idx = log->next_idx;
 }
 
 /**
@@ -2961,11 +3194,11 @@ void __init dump_stack_set_arch_desc(const char *fmt, ...)
  */
 void dump_stack_print_info(const char *log_lvl)
 {
-	printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
+	printk("%sCPU: %d PID: %d Comm: %.20s ve: %s %s %s %.*s %s\n",
 	       log_lvl, raw_smp_processor_id(), current->pid, current->comm,
-	       print_tainted(), init_utsname()->release,
+	       task_ve_name(current), print_tainted(), init_utsname()->release,
 	       (int)strcspn(init_utsname()->version, " "),
-	       init_utsname()->version);
+	       init_utsname()->version, VZVERSION);
 
 	if (dump_stack_arch_desc_str[0] != '\0')
 		printk("%sHardware name: %s\n",
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -229,6 +229,8 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 	 * or halting the specified task is impossible.
 	 */
 	int dumpable = 0;
+	int vps_dumpable = 0;
+
 	/* Don't let security modules deny introspection */
 	if (task == current)
 		return 0;
@@ -248,14 +250,20 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode)
 ok:
 	rcu_read_unlock();
 	smp_rmb();
-	if (task->mm)
+	if (task->mm) {
 		dumpable = get_dumpable(task->mm);
+		vps_dumpable = (task->mm->vps_dumpable == VD_PTRACE_COREDUMP);
+	}
 	rcu_read_lock();
 	if (dumpable != SUID_DUMP_USER &&
 	    !ptrace_has_cap(__task_cred(task)->user_ns, mode)) {
 		rcu_read_unlock();
 		return -EPERM;
 	}
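+	/* inside a container the target is off limits unless it opted in
+	 * with mm->vps_dumpable == VD_PTRACE_COREDUMP */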
+	if (!vps_dumpable && !ve_is_super(get_exec_env())) {
+		rcu_read_unlock();
+		return -EPERM;
+	}
 	rcu_read_unlock();
 
 	return security_ptrace_access_check(task, mode);
@@ -307,6 +315,10 @@ static int ptrace_attach(struct task_struct *task, long request,
 
 	task_lock(task);
 	retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH);
+	if (!retval) {
+		if (!task->mm || task->mm->vps_dumpable == VD_LICDATA_ACCESS)
+			retval = -EACCES;
+	}
 	task_unlock(task);
 	if (retval)
 		goto unlock_creds;
@@ -580,6 +592,19 @@ static int ptrace_setoptions(struct task_struct *child, unsigned long data)
 	if (data & ~(unsigned long)PTRACE_O_MASK)
 		return -EINVAL;
 
+	if (unlikely(data & PTRACE_O_SUSPEND_SECCOMP)) {
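+		/* suspending seccomp is reserved for CAP_SYS_ADMIN (e.g.
+		 * checkpoint/restore) and forbidden to seccomped tracers */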
+		if (!config_enabled(CONFIG_CHECKPOINT_RESTORE) ||
+		    !config_enabled(CONFIG_SECCOMP))
+			return -EINVAL;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (seccomp_mode(&current->seccomp) != SECCOMP_MODE_DISABLED ||
+		    current->ptrace & PT_SUSPEND_SECCOMP)
+			return -EPERM;
+	}
+
 	/* Avoid intermediate state when all opts are cleared */
 	flags = child->ptrace;
 	flags &= ~(PTRACE_O_MASK << PT_OPT_FLAG_SHIFT);
@@ -1007,6 +1032,11 @@ int ptrace_request(struct task_struct *child, long request,
 		break;
 	}
 #endif
+
+	case PTRACE_SECCOMP_GET_FILTER:
+		ret = seccomp_get_filter(child, addr, datavp);
+		break;
+
 	default:
 		break;
 	}
@@ -1018,6 +1048,10 @@ static struct task_struct *ptrace_get_task_struct(pid_t pid)
 {
 	struct task_struct *child;
 
+	/* ptracing init from inside a container is dangerous */
+	if (pid == 1 && !capable(CAP_SYS_ADMIN))
+		return ERR_PTR(-EPERM);
+
 	rcu_read_lock();
 	child = find_task_by_vpid(pid);
 	if (child)
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER
 CFLAGS_REMOVE_clock.o = -pg
 endif
 
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
 # needed for x86 only.  Why this used to be enabled for all architectures is beyond
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,7 @@
 #include <linux/binfmts.h>
 #include <linux/context_tracking.h>
 #include <linux/frame.h>
+#include <linux/ve.h>
 
 #include <asm/switch_to.h>
 #include <asm/tlb.h>
@@ -134,6 +135,37 @@ void update_rq_clock(struct rq *rq)
 	update_rq_clock_task(rq, delta);
 }
 
+struct kernel_stat_glob kstat_glob;
+DEFINE_SPINLOCK(kstat_glb_lock);
+EXPORT_SYMBOL(kstat_glob);
+EXPORT_SYMBOL(kstat_glb_lock);
+
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_lat);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, glob_kstat_page_in);
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, alloc_kstat_lat[KSTAT_ALLOCSTAT_NR]);
+
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_ttfp);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_cache_reap);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_shrink_icache);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_shrink_dcache);
+static DEFINE_PER_CPU(struct kstat_perf_pcpu_snap_struct, kstat_pcpu_refill_inact);
+
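+/* hook up the static per-cpu snapshot areas to the global counters */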
+void __init kstat_init(void)
+{
+	int i;
+
+	kstat_glob.sched_lat.cur = &glob_kstat_lat;
+	kstat_glob.page_in.cur = &glob_kstat_page_in;
+	for ( i = 0 ; i < KSTAT_ALLOCSTAT_NR ; i++)
+		kstat_glob.alloc_lat[i].cur = &alloc_kstat_lat[i];
+
+	kstat_glob.ttfp.cur = &kstat_pcpu_ttfp;
+	kstat_glob.cache_reap.cur = &kstat_pcpu_cache_reap;
+	kstat_glob.shrink_icache.cur = &kstat_pcpu_shrink_icache;
+	kstat_glob.shrink_dcache.cur = &kstat_pcpu_shrink_dcache;
+	kstat_glob.refill_inact.cur = &kstat_pcpu_refill_inact;
+}
+
 /*
  * Debugging: various feature bits
  */
@@ -307,6 +339,54 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 950000;
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+	unsigned int nr_cpus = 0;
+	unsigned int max_nr_cpus = num_online_cpus();
+
+	rcu_read_lock();
+	nr_cpus = task_group(p)->nr_cpus;
+	rcu_read_unlock();
+
+	if (!nr_cpus || nr_cpus > max_nr_cpus)
+		nr_cpus = max_nr_cpus;
+
+	return nr_cpus;
+}
+
+unsigned int task_vcpu_id(struct task_struct *p)
+{
+	return task_cpu(p) % task_nr_cpus(p);
+}
+
+unsigned int sysctl_sched_cpulimit_scale_cpufreq = 1;
+
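+/*
+ * Scale a cpufreq value by the current task group's cpu_rate limit so
+ * a rate-limited container sees a proportionally lower frequency.
+ */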
+unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
+{
+	unsigned long rate, max_rate;
+
+	if (!sysctl_sched_cpulimit_scale_cpufreq)
+		return freq;
+
+	rcu_read_lock();
+	rate = task_group(current)->cpu_rate;
+	rcu_read_unlock();
+
+	max_rate = num_online_vcpus() * MAX_CPU_RATE;
+	if (!rate || rate >= max_rate)
+		return freq;
+
+	return div_u64((u64)freq * rate, max_rate); /* avoid 32bit overflow */
+}
+#endif
+
+unsigned long nr_zombie = 0;	/* protected by tasklist_lock */
+EXPORT_SYMBOL(nr_zombie);
+
+atomic_t nr_dead = ATOMIC_INIT(0);
+EXPORT_SYMBOL(nr_dead);
+
 /*
  * this_rq_lock - lock this runqueue and disable interrupts.
  */
@@ -837,18 +917,48 @@ static void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 	p->sched_class->dequeue_task(rq, p, flags);
 }
 
+static inline void check_inc_sleeping(struct rq *rq, struct task_struct *t)
+{
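+	/* per-rq count of TASK_INTERRUPTIBLE sleepers; see nr_sleeping() */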
+	if (t->state == TASK_INTERRUPTIBLE)
+		rq->nr_sleeping++;
+}
+
+static inline void check_dec_sleeping(struct rq *rq, struct task_struct *t)
+{
+	if (t->state == TASK_INTERRUPTIBLE)
+		rq->nr_sleeping--;
+}
+
 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (task_contributes_to_load(p))
+	if (task_contributes_to_load(p)) {
 		rq->nr_uninterruptible--;
+		if (task_iothrottled(p))
+			rq->nr_iothrottled--;
+		task_cfs_rq(p)->nr_unint--;
+	}
+
+	check_dec_sleeping(rq, p);
 
 	enqueue_task(rq, p, flags);
 }
 
 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 {
-	if (task_contributes_to_load(p))
+	check_inc_sleeping(rq, p);
+
+#if 0 /* this is broken */
+	if (p->state == TASK_STOPPED) {
+		rq->nr_stopped++;
+	}
+#endif
+
+	if (task_contributes_to_load(p)) {
 		rq->nr_uninterruptible++;
+		if (task_iothrottled(p))
+			rq->nr_iothrottled++;
+		task_cfs_rq(p)->nr_unint++;
+	}
 
 	dequeue_task(rq, p, flags);
 }
@@ -1517,10 +1627,17 @@ static void
 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags)
 {
 #ifdef CONFIG_SMP
-	if (p->sched_contributes_to_load)
+	if (p->sched_contributes_to_load) {
 		rq->nr_uninterruptible--;
+		if (task_iothrottled(p))
+			rq->nr_iothrottled--;
+		task_cfs_rq(p)->nr_unint--;
+	}
 #endif
 
+	if (p->sched_interruptible_sleep)
+		rq->nr_sleeping--;
+
 	ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_WAKING);
 	ttwu_do_wakeup(rq, p, wake_flags);
 }
@@ -1539,6 +1656,7 @@ static int ttwu_remote(struct task_struct *p, int wake_flags)
 	rq = __task_rq_lock(p);
 	if (p->on_rq) {
 		ttwu_do_wakeup(rq, p, wake_flags);
+		p->woken_while_running = 1;
 		ret = 1;
 	}
 	__task_rq_unlock(rq);
@@ -1694,6 +1812,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 * current.
 	 */
 	smp_rmb();
+
 	if (p->on_rq && ttwu_remote(p, wake_flags))
 		goto stat;
 
@@ -1709,7 +1828,14 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
 	 */
 	smp_rmb();
 
+	if (p->in_iowait && p->sched_class->nr_iowait_dec) {
+		struct rq *rq = __task_rq_lock(p);
+		p->sched_class->nr_iowait_dec(p);
+		__task_rq_unlock(rq);
+	}
+
 	p->sched_contributes_to_load = !!task_contributes_to_load(p);
+	p->sched_interruptible_sleep = (p->state == TASK_INTERRUPTIBLE);
 	p->state = TASK_WAKING;
 
 	if (p->sched_class->task_waking)
@@ -1828,6 +1954,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
 	p->se.vruntime			= 0;
 	INIT_LIST_HEAD(&p->se.group_node);
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	p->se.boosted = 0;
+#endif
+
 #ifdef CONFIG_SCHEDSTATS
 	/* Even if schedstat is disabled, there should not be garbage */
 	p->se.statistics = &p->statistics;
@@ -2347,6 +2477,10 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	}
 
 	tick_nohz_task_switch(current);
+
+	/* kernel threads don't care about cpuid faulting */
+	if (current->mm)
+		set_cpuid_faulting(!ve_is_super(get_exec_env()));
 }
 
 #ifdef CONFIG_SMP
@@ -2392,20 +2526,18 @@ static inline void post_schedule(struct rq *rq)
 asmlinkage void schedule_tail(struct task_struct *prev)
 	__releases(rq->lock)
 {
-	struct rq *rq = this_rq();
+	struct rq *rq;
 
+#ifndef __ARCH_WANT_UNLOCKED_CTXSW
+	/* finish_task_switch() drops rq->lock and enables preemption */
+	preempt_disable();
+#endif
+	rq = this_rq();
 	finish_task_switch(rq, prev);
 
-	/*
-	 * FIXME: do we need to worry about rq being invalidated by the
-	 * task_switch?
-	 */
 	post_schedule(rq);
-
-#ifdef __ARCH_WANT_UNLOCKED_CTXSW
-	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
-#endif
+
 	if (current->set_child_tid)
 		put_user(task_pid_vnr(current), current->set_child_tid);
 }
@@ -2465,21 +2597,28 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	finish_task_switch(this_rq(), prev);
 }
 
+#define DECLARE_NR_ONLINE(varname)			\
+	unsigned long varname(void)			\
+	{						\
+		unsigned long i, sum = 0;		\
+		for_each_online_cpu(i)			\
+			sum += cpu_rq(i)->varname;	\
+		if (unlikely((long)sum < 0))		\
+			return 0;			\
+		return sum;				\
+	}						\
+	EXPORT_SYMBOL(varname);
+
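+/*
+ * DECLARE_NR_ONLINE(nr_running) expands to roughly:
+ *
+ *	unsigned long nr_running(void)
+ *	{
+ *		unsigned long i, sum = 0;
+ *		for_each_online_cpu(i)
+ *			sum += cpu_rq(i)->nr_running;
+ *		return (long)sum < 0 ? 0 : sum;
+ *	}
+ *	EXPORT_SYMBOL(nr_running);
+ *
+ * The (long) cast guards against transiently negative sums, which can be
+ * observed because the per-cpu counters are updated without global
+ * synchronization.
+ */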
 /*
  * nr_running and nr_context_switches:
  *
  * externally visible scheduler statistics: current number of runnable
  * threads, total number of context switches performed since bootup.
  */
-unsigned long nr_running(void)
-{
-	unsigned long i, sum = 0;
-
-	for_each_online_cpu(i)
-		sum += cpu_rq(i)->nr_running;
-
-	return sum;
-}
+DECLARE_NR_ONLINE(nr_running);
+DECLARE_NR_ONLINE(nr_sleeping);
+DECLARE_NR_ONLINE(nr_stopped);
+DECLARE_NR_ONLINE(nr_uninterruptible);
 
 /*
  * Check if only the current task is running on the cpu.
@@ -2520,13 +2659,12 @@ unsigned long nr_iowait_cpu(int cpu)
 	return atomic_read(&this->nr_iowait);
 }
 
-unsigned long this_cpu_load(void)
+unsigned long nr_active_cpu(void)
 {
 	struct rq *this = this_rq();
-	return this->cpu_load[0];
+	return this->nr_active;
 }
 
-
 /*
  * Global load-average calculations
  *
@@ -2595,12 +2733,21 @@ void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
 	loads[2] = (avenrun[2] + offset) << shift;
 }
 
+void get_avenrun_ve(unsigned long *loads, unsigned long offset, int shift)
+{
+	struct task_group *tg = task_group(current);
+	loads[0] = (tg->avenrun[0] + offset) << shift;
+	loads[1] = (tg->avenrun[1] + offset) << shift;
+	loads[2] = (tg->avenrun[2] + offset) << shift;
+}
+
 static long calc_load_fold_active(struct rq *this_rq)
 {
 	long nr_active, delta = 0;
 
 	nr_active = this_rq->nr_running;
 	nr_active += (long) this_rq->nr_uninterruptible;
+	nr_active -= (long) this_rq->nr_iothrottled;
 
 	if (nr_active != this_rq->calc_load_active) {
 		delta = nr_active - this_rq->calc_load_active;
@@ -2622,6 +2769,42 @@ calc_load(unsigned long load, unsigned long exp, unsigned long active)
 	return load >> FSHIFT;
 }
 
+#ifdef CONFIG_VE
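+/*
+ * Per-task-group load averages, fed by nr_running + nr_unint of the
+ * group's cfs_rqs: e.g. a container that keeps three tasks runnable
+ * converges to avenrun[0] ~= 3 * FIXED_1, which cpu_cgroup_proc_loadavg()
+ * prints as "3.00".
+ */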
+static void calc_load_ve(void)
+{
+	unsigned long flags, nr_unint, nr_active;
+	struct task_group *tg;
+	int i;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(tg, &task_groups, list) {
+		nr_active = 0;
+		for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+			nr_active += tg->cfs_rq[i]->nr_running;
+			nr_active += tg->cfs_rq[i]->nr_unint;
+#endif
+		}
+		nr_active *= FIXED_1;
+
+		tg->avenrun[0] = calc_load(tg->avenrun[0], EXP_1, nr_active);
+		tg->avenrun[1] = calc_load(tg->avenrun[1], EXP_5, nr_active);
+		tg->avenrun[2] = calc_load(tg->avenrun[2], EXP_15, nr_active);
+	}
+	rcu_read_unlock();
+
+	nr_unint = nr_uninterruptible() * FIXED_1;
+	spin_lock_irqsave(&kstat_glb_lock, flags);
+	CALC_LOAD(kstat_glob.nr_unint_avg[0], EXP_1, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[1], EXP_5, nr_unint);
+	CALC_LOAD(kstat_glob.nr_unint_avg[2], EXP_15, nr_unint);
+	spin_unlock_irqrestore(&kstat_glb_lock, flags);
+}
+#else
+#define calc_load_ve()	do { } while (0)
+#endif
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * Handle NO_HZ for the global load-average.
@@ -2882,6 +3065,8 @@ void calc_global_load(unsigned long ticks)
 
 	calc_load_update += LOAD_FREQ;
 
+	calc_load_ve();
+
 	/*
 	 * In case we idled for multiple LOAD_FREQ intervals, catch up in bulk.
 	 */
@@ -2990,6 +3175,7 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load,
 	this_rq->nr_load_updates++;
 
 	/* Update our load: */
+	this_rq->nr_active = this_rq->nr_running;
 	this_rq->cpu_load[0] = this_load; /* Fasttrack for idx 0 */
 	for (i = 1, scale = 2; i < CPU_LOAD_IDX_MAX; i++, scale += scale) {
 		unsigned long old_load, new_load;
@@ -3403,6 +3589,7 @@ static void __sched __schedule(void)
 	struct task_struct *prev, *next;
 	unsigned long *switch_count;
 	struct rq *rq;
+	int resched_next;
 	int cpu;
 
 need_resched:
@@ -3478,8 +3665,14 @@ need_resched:
 
 	post_schedule(rq);
 
+	resched_next = READ_ONCE(rq->resched_next);
+	if (resched_next) {
+		set_tsk_need_resched(current);
+		rq->resched_next = 0;
+	}
+
 	sched_preempt_enable_no_resched();
-	if (need_resched())
+	if (!resched_next && need_resched())
 		goto need_resched;
 }
 STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */
@@ -4498,7 +4691,7 @@ recheck:
 	/*
 	 * Allow unprivileged RT tasks to decrease priority:
 	 */
-	if (user && !capable(CAP_SYS_NICE)) {
+	if (user && !capable(CAP_SYS_ADMIN)) {
 		if (fair_policy(policy)) {
 			if (attr->sched_nice < TASK_NICE(p) &&
 			    !can_nice(p, attr->sched_nice))
@@ -5059,6 +5252,9 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
 	struct task_struct *p;
 	int retval;
 
+	if (!ve_is_super(get_exec_env()))
+		return 0;
+
 	rcu_read_lock();
 
 	p = find_process_by_pid(pid);
@@ -5193,6 +5389,12 @@ long sched_getaffinity(pid_t pid, struct cpumask *mask)
 	if (retval)
 		goto out_unlock;
 
+	if (!ve_is_super(get_exec_env())) {
+		cpumask_clear(mask);
+		bitmap_fill(cpumask_bits(mask), num_online_vcpus());
+		goto out_unlock;
+	}
+
 	raw_spin_lock_irqsave(&p->pi_lock, flags);
 	cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
 	raw_spin_unlock_irqrestore(&p->pi_lock, flags);
@@ -5273,23 +5475,38 @@ static inline int should_resched(void)
 	return need_resched() && !(preempt_count() & PREEMPT_ACTIVE);
 }
 
-static void __cond_resched(void)
+static void __cond_resched(bool may_throttle)
 {
 	add_preempt_count(PREEMPT_ACTIVE);
+	if (may_throttle)
+		current->may_throttle = 1;
 	__schedule();
+	if (may_throttle)
+		current->may_throttle = 0;
 	sub_preempt_count(PREEMPT_ACTIVE);
 }
 
 int __sched _cond_resched(void)
 {
 	if (should_resched()) {
-		__cond_resched();
+		__cond_resched(false);
 		return 1;
 	}
 	return 0;
 }
 EXPORT_SYMBOL(_cond_resched);
 
+int __sched _cond_resched_may_throttle(void)
+{
+	if (should_resched()) {
+		__cond_resched(true);
+		return 1;
+	}
+	return 0;
+}
+EXPORT_SYMBOL(_cond_resched_may_throttle);
+
 /*
  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
  * call schedule, and on return reacquire the lock.
@@ -5308,7 +5525,7 @@ int __cond_resched_lock(spinlock_t *lock)
 	if (spin_needbreak(lock) || resched) {
 		spin_unlock(lock);
 		if (resched)
-			__cond_resched();
+			__cond_resched(false);
 		else
 			cpu_relax();
 		ret = 1;
@@ -5324,7 +5541,7 @@ int __sched __cond_resched_softirq(void)
 
 	if (should_resched()) {
 		local_bh_enable();
-		__cond_resched();
+		__cond_resched(false);
 		local_bh_disable();
 		return 1;
 	}
@@ -5581,27 +5798,16 @@ void sched_show_task(struct task_struct *p)
 	state = p->state ? __ffs(p->state) + 1 : 0;
 	printk(KERN_INFO "%-15.15s %c", p->comm,
 		state < sizeof(stat_nam) - 1 ? stat_nam[state] : '?');
-#if BITS_PER_LONG == 32
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT " running  ");
-	else
-		printk(KERN_CONT " %08lx ", thread_saved_pc(p));
-#else
-	if (state == TASK_RUNNING)
-		printk(KERN_CONT "  running task    ");
-	else
-		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
-#endif
+	printk(KERN_CONT " %p ", p);
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	free = stack_not_used(p);
 #endif
 	rcu_read_lock();
 	ppid = task_pid_nr(rcu_dereference(p->real_parent));
-	rcu_read_unlock();
-	printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+	printk(KERN_CONT "%5lu %5d %6d %4s 0x%08lx\n", free,
 		task_pid_nr(p), ppid,
-		(unsigned long)task_thread_info(p)->flags);
-
+		task_ve_name(p), (unsigned long)task_thread_info(p)->flags);
+	rcu_read_unlock();
 	print_worker_info(KERN_INFO, p);
 	show_stack(p, NULL);
 }
@@ -5612,25 +5818,33 @@ void show_state_filter(unsigned long state_filter)
 
 #if BITS_PER_LONG == 32
 	printk(KERN_INFO
-		"  task                PC stack   pid father\n");
+		"  task          taskaddr stack   pid father veid\n");
 #else
 	printk(KERN_INFO
-		"  task                        PC stack   pid father\n");
+		"  task                  taskaddr stack   pid father veid\n");
 #endif
 	rcu_read_lock();
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
 		 * console might take a lot of time:
+		 * Also, reset softlockup watchdogs on all CPUs, because
+		 * another CPU might be blocked waiting for us to process
+		 * an IPI.
 		 */
 		touch_nmi_watchdog();
+		touch_all_softlockup_watchdogs();
 		if (!state_filter || (p->state & state_filter))
 			sched_show_task(p);
 	} while_each_thread(g, p);
 
-	touch_all_softlockup_watchdogs();
-
-#ifdef CONFIG_SCHED_DEBUG
+#if 0
+	/*
+	 * This results in soft lockups, because it writes too much data to
+	 * the console. Moreover, the information it shows is only useful
+	 * for sched debugging and can be obtained via /proc/sched_debug
+	 * anyway. So disable it.
+	 */
 	sysrq_sched_debug_show();
 #endif
 	rcu_read_unlock();
@@ -5981,6 +6195,9 @@ void idle_task_exit(void)
 	if (mm != &init_mm)
 		switch_mm(mm, &init_mm, current);
 	mmdrop(mm);
+
+	/* disable cpuid faulting when a cpu goes offline */
+	set_cpuid_faulting(false);
 }
 
 /*
@@ -6281,6 +6498,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 
 			set_rq_online(rq);
 		}
+		start_cfs_idle_time_accounting(cpu);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 
@@ -6295,6 +6513,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		}
 		migrate_tasks(cpu);
 		BUG_ON(rq->nr_running != 1); /* the migration thread */
+		stop_cfs_idle_time_accounting(cpu);
 		raw_spin_unlock_irqrestore(&rq->lock, flags);
 		break;
 
@@ -8086,6 +8305,8 @@ void __init sched_init(void)
 	init_dl_bandwidth(&def_dl_bandwidth,
 			global_rt_period(), global_rt_runtime());
 
+	root_task_group.taskstats = alloc_percpu(struct taskstats);
+
 #ifdef CONFIG_SMP
 	init_defrootdomain();
 #endif
@@ -8101,8 +8322,14 @@ void __init sched_init(void)
 	INIT_LIST_HEAD(&root_task_group.siblings);
 	autogroup_init(&init_task);
 
+	root_task_group.start_time = (struct timespec){0, 0};
+
 #endif /* CONFIG_CGROUP_SCHED */
 
+#ifdef CONFIG_CFS_CPULIMIT
+	root_task_group.topmost_limited_ancestor = &root_task_group;
+#endif
+
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
@@ -8381,6 +8608,9 @@ static void free_sched_group(struct task_group *tg)
 	free_fair_sched_group(tg);
 	free_rt_sched_group(tg);
 	autogroup_free(tg);
+	free_percpu(tg->taskstats);
+	kfree(tg->cpustat_last);
+	kfree(tg->vcpustat);
 	kfree(tg);
 }
 
@@ -8399,6 +8629,27 @@ struct task_group *sched_create_group(struct task_group *parent)
 	if (!alloc_rt_sched_group(tg, parent))
 		goto err;
 
+	tg->taskstats = alloc_percpu(struct taskstats);
+	if (!tg->taskstats)
+		goto err;
+
+	tg->cpustat_last = kcalloc(nr_cpu_ids, sizeof(struct kernel_cpustat),
+				   GFP_KERNEL);
+	if (!tg->cpustat_last)
+		goto err;
+
+	tg->vcpustat = kcalloc(nr_cpu_ids, sizeof(struct kernel_cpustat),
+			       GFP_KERNEL);
+	if (!tg->vcpustat)
+		goto err;
+
+	tg->vcpustat_last_update = ktime_set(0, 0);
+	spin_lock_init(&tg->vcpustat_lock);
+
+	/* start_time is the saved CT0 uptime at group creation */
+	do_posix_clock_monotonic_gettime(&tg->start_time);
+	monotonic_to_bootbased(&tg->start_time);
+
 	return tg;
 
 err:
@@ -8406,6 +8657,8 @@ err:
 	return ERR_PTR(-ENOMEM);
 }
 
+static void tg_update_topmost_limited_ancestor(struct task_group *tg);
+
 void sched_online_group(struct task_group *tg, struct task_group *parent)
 {
 	unsigned long flags;
@@ -8418,6 +8671,9 @@ void sched_online_group(struct task_group *tg, struct task_group *parent)
 	tg->parent = parent;
 	INIT_LIST_HEAD(&tg->children);
 	list_add_rcu(&tg->siblings, &parent->children);
+
+	tg_update_topmost_limited_ancestor(tg);
+
 	spin_unlock_irqrestore(&task_group_lock, flags);
 
 	online_fair_sched_group(tg);
@@ -8471,11 +8727,26 @@ void sched_move_task(struct task_struct *tsk)
 
 	if (on_rq)
 		dequeue_task(rq, tsk, 0);
+	else {
+		if (!(tsk->state & TASK_WAKING) && tsk->in_iowait &&
+				tsk->sched_class->nr_iowait_dec)
+			tsk->sched_class->nr_iowait_dec(tsk);
+
+		if (task_contributes_to_load(tsk))
+			task_cfs_rq(tsk)->nr_unint--;
+
+		check_dec_sleeping(rq, tsk);
+	}
+
 	if (unlikely(running))
 		tsk->sched_class->put_prev_task(rq, tsk);
 
-	tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id,
-				lockdep_is_held(&tsk->sighand->siglock)),
+	/*
+	 * All callers are synchronized by task_rq_lock(), so using RCU here
+	 * would be pointless. Thus, we pass "true" to
+	 * task_subsys_state_check() to prevent lockdep warnings.
+	 */
+	tg = container_of(task_subsys_state_check(tsk, cpu_cgroup_subsys_id, true),
 			  struct task_group, css);
 	tg = autogroup_task_group(tsk, tg);
 	tsk->sched_task_group = tg;
@@ -8491,6 +8762,16 @@ void sched_move_task(struct task_struct *tsk)
 		tsk->sched_class->set_curr_task(rq);
 	if (on_rq)
 		enqueue_task(rq, tsk, 0);
+	else {
+		if (!(tsk->state & TASK_WAKING) && tsk->in_iowait &&
+				tsk->sched_class->nr_iowait_inc)
+			tsk->sched_class->nr_iowait_inc(tsk);
+
+		if (task_contributes_to_load(tsk))
+			task_cfs_rq(tsk)->nr_unint++;
+
+		check_inc_sleeping(rq, tsk);
+	}
 
 	task_rq_unlock(rq, tsk, &flags);
 }
@@ -8908,6 +9189,11 @@ static void cpu_cgroup_css_offline(struct cgroup *cgrp)
 	sched_offline_group(tg);
 }
 
+static void cpu_cgroup_fork(struct task_struct *task, void *private)
+{
+	sched_move_task(task);
+}
+
 static int cpu_cgroup_can_attach(struct cgroup *cgrp,
 				 struct cgroup_taskset *tset)
 {
@@ -8948,6 +9234,19 @@ cpu_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
 		return;
 
 	sched_move_task(task);
+
+	if (thread_group_leader(task)) {
+		struct task_group *tg = cgroup_tg(old_cgrp);
+		struct taskstats *stats = get_cpu_ptr(tg->taskstats);
+		struct signal_struct *sig = task->signal;
+
+		if (sig->stats)
+			delayacct_add_stats(stats, sig->stats);
+		else
+			delayacct_add_tsk(stats, task);
+
+		put_cpu_ptr(stats);
+	}
 }
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -8972,7 +9271,10 @@ const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
 
 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
 
-static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+static void tg_limit_toggled(struct task_group *tg);
+
+/* must be called with cfs_constraints_mutex held */
+static int __tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 {
 	int i, ret = 0, runtime_enabled, runtime_was_enabled;
 	struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
@@ -8996,10 +9298,9 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 	if (period > max_cfs_quota_period)
 		return -EINVAL;
 
-	mutex_lock(&cfs_constraints_mutex);
 	ret = __cfs_schedulable(tg, period, quota);
 	if (ret)
-		goto out_unlock;
+		return ret;
 
 	runtime_enabled = quota != RUNTIME_INF;
 	runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
@@ -9027,15 +9328,28 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
 
 		raw_spin_lock_irq(&rq->lock);
 		cfs_rq->runtime_enabled = runtime_enabled;
-		cfs_rq->runtime_remaining = 0;
+		cfs_rq->runtime_remaining = 1;
 
 		if (cfs_rq->throttled)
 			unthrottle_cfs_rq(cfs_rq);
 		raw_spin_unlock_irq(&rq->lock);
 	}
+	if (runtime_enabled != runtime_was_enabled)
+		tg_limit_toggled(tg);
 	if (runtime_was_enabled && !runtime_enabled)
 		cfs_bandwidth_usage_dec();
-out_unlock:
+	return ret;
+}
+
+static void tg_update_cpu_limit(struct task_group *tg);
+
+static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
+{
+	int ret;
+
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __tg_set_cfs_bandwidth(tg, period, quota);
+	tg_update_cpu_limit(tg);
 	mutex_unlock(&cfs_constraints_mutex);
 
 	return ret;
@@ -9199,6 +9513,135 @@ static int cpu_stats_show(struct cgroup *cgrp, struct cftype *cft,
 
 	return 0;
 }
+
+#ifdef CONFIG_CFS_CPULIMIT
+static int __tg_update_topmost_limited_ancestor(struct task_group *tg, void *unused)
+{
+	struct task_group *parent = tg->parent;
+
+	/*
+	 * Neither the parent nor any of its ancestors is limited? Then this
+	 * task group becomes the topmost limited ancestor itself, provided
+	 * it has a limit set. Otherwise, inherit the topmost limited
+	 * ancestor from the parent.
+	 */
+	if (parent->topmost_limited_ancestor == parent &&
+	    parent->cfs_bandwidth.quota == RUNTIME_INF)
+		tg->topmost_limited_ancestor = tg;
+	else
+		tg->topmost_limited_ancestor = parent->topmost_limited_ancestor;
+	return 0;
+}
+
+static void tg_update_topmost_limited_ancestor(struct task_group *tg)
+{
+	__tg_update_topmost_limited_ancestor(tg, NULL);
+}
+
+static void tg_limit_toggled(struct task_group *tg)
+{
+	if (tg->topmost_limited_ancestor != tg) {
+		/*
+		 * This task group is not a topmost limited ancestor, so both
+		 * it and all its children must already point to their topmost
+		 * limited ancestor, and we have nothing to do.
+		 */
+		return;
+	}
+
+	/*
+	 * This task group is a topmost limited ancestor. Walk over all its
+	 * children and update their pointers to the topmost limited ancestor.
+	 */
+
+	spin_lock_irq(&task_group_lock);
+	walk_tg_tree_from(tg, __tg_update_topmost_limited_ancestor, tg_nop, NULL);
+	spin_unlock_irq(&task_group_lock);
+}
+
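+/*
+ * Derive cpu_rate back from the quota/period pair: e.g. quota = 50ms over
+ * period = 100ms yields rate = MAX_CPU_RATE / 2, i.e. half a cpu.
+ */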
+static void tg_update_cpu_limit(struct task_group *tg)
+{
+	long quota, period;
+	unsigned long rate = 0;
+
+	quota = tg_get_cfs_quota(tg);
+	period = tg_get_cfs_period(tg);
+
+	if (quota > 0 && period > 0) {
+		rate = quota * MAX_CPU_RATE / period;
+		rate = max(rate, 1UL);
+	}
+
+	tg->cpu_rate = rate;
+	tg->nr_cpus = 0;
+}
+
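+/*
+ * The inverse mapping: e.g. cpu_rate = 1.5 * MAX_CPU_RATE with the
+ * default 100ms period yields quota = 150ms, i.e. one and a half cpus
+ * worth of runtime per period.
+ */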
+static int tg_set_cpu_limit(struct task_group *tg,
+			    unsigned long cpu_rate, unsigned int nr_cpus)
+{
+	int ret;
+	unsigned long rate;
+	u64 quota = RUNTIME_INF;
+	u64 period = default_cfs_period();
+
+	rate = (cpu_rate && nr_cpus) ?
+		min_t(unsigned long, cpu_rate, nr_cpus * MAX_CPU_RATE) :
+		max_t(unsigned long, cpu_rate, nr_cpus * MAX_CPU_RATE);
+	if (rate) {
+		quota = div_u64(period * rate, MAX_CPU_RATE);
+		quota = max(quota, min_cfs_quota_period);
+	}
+
+	mutex_lock(&cfs_constraints_mutex);
+	ret = __tg_set_cfs_bandwidth(tg, period, quota);
+	if (!ret) {
+		tg->cpu_rate = cpu_rate;
+		tg->nr_cpus = nr_cpus;
+	}
+	mutex_unlock(&cfs_constraints_mutex);
+
+	return ret;
+}
+
+static u64 cpu_rate_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return cgroup_tg(cgrp)->cpu_rate;
+}
+
+static int cpu_rate_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+			      u64 rate)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (rate > num_online_cpus() * MAX_CPU_RATE)
+		rate = num_online_cpus() * MAX_CPU_RATE;
+	return tg_set_cpu_limit(tg, rate, tg->nr_cpus);
+}
+
+static u64 nr_cpus_read_u64(struct cgroup *cgrp, struct cftype *cft)
+{
+	return cgroup_tg(cgrp)->nr_cpus;
+}
+
+static int nr_cpus_write_u64(struct cgroup *cgrp, struct cftype *cftype,
+			     u64 nr_cpus)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (nr_cpus > num_online_cpus())
+		nr_cpus = num_online_cpus();
+	return tg_set_cpu_limit(tg, tg->cpu_rate, nr_cpus);
+}
+#else
+static void tg_update_topmost_limited_ancestor(struct task_group *tg)
+{
+}
+static void tg_limit_toggled(struct task_group *tg)
+{
+}
+static void tg_update_cpu_limit(struct task_group *tg)
+{
+}
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_CFS_BANDWIDTH */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
@@ -9226,6 +9669,432 @@ static u64 cpu_rt_period_read_uint(struct cgroup *cgrp, struct cftype *cft)
 }
 #endif /* CONFIG_RT_GROUP_SCHED */
 
+static void cpu_cgroup_update_stat(struct cgroup *cgrp, int i)
+{
+#if defined(CONFIG_SCHEDSTATS) && defined(CONFIG_FAIR_GROUP_SCHED)
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct sched_entity *se = tg->se[i];
+	struct kernel_cpustat *kcpustat = cpuacct_cpustat(cgrp, i);
+	u64 now = cpu_clock(i);
+	u64 delta, idle, iowait, steal, used;
+
+	/* the root_task_group has no sched entities */
+	if (tg == &root_task_group)
+		return;
+
+	iowait = se->statistics->iowait_sum;
+	idle = se->statistics->sum_sleep_runtime;
+	steal = se->statistics->wait_sum;
+	used = se->sum_exec_runtime;
+
+	if (idle > iowait)
+		idle -= iowait;
+	else
+		idle = 0;
+
+	if (se->statistics->sleep_start) {
+		delta = now - se->statistics->sleep_start;
+		if ((s64)delta > 0)
+			idle += delta;
+	} else if (se->statistics->block_start) {
+		delta = now - se->statistics->block_start;
+		if ((s64)delta > 0)
+			iowait += delta;
+	} else if (se->statistics->wait_start) {
+		delta = now - se->statistics->wait_start;
+		if ((s64)delta > 0)
+			steal += delta;
+	}
+
+	kcpustat->cpustat[CPUTIME_IDLE] =
+			max(kcpustat->cpustat[CPUTIME_IDLE],
+			    nsecs_to_cputime(idle));
+	kcpustat->cpustat[CPUTIME_IOWAIT] =
+			max(kcpustat->cpustat[CPUTIME_IOWAIT],
+			    nsecs_to_cputime(iowait));
+	kcpustat->cpustat[CPUTIME_STEAL] = nsecs_to_cputime(steal);
+	kcpustat->cpustat[CPUTIME_USED] = nsecs_to_cputime(used);
+#endif
+}
+
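+/*
+ * Example: if a vcpu must grow its usage by 30 ticks (cur < target) and
+ * the remainder holds 20 USER + 10 SYSTEM ticks, USER receives 2/3 of
+ * the delta and SYSTEM 1/3, preserving the donor's per-field proportions.
+ */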
+static void fixup_vcpustat_delta_usage(struct kernel_cpustat *cur,
+				       struct kernel_cpustat *rem, int ind,
+				       u64 cur_usage, u64 target_usage,
+				       u64 rem_usage)
+{
+	s64 scaled_val;
+	u32 scale_pct = 0;
+
+	/* distribute the delta among USER, NICE, and SYSTEM proportionally */
+	if (cur_usage < target_usage) {
+		if ((s64)rem_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * rem->cpustat[ind],
+					      rem_usage);
+	} else {
+		if ((s64)cur_usage > 0) /* sanity check to avoid div/0 */
+			scale_pct = div64_u64(100 * cur->cpustat[ind],
+					      cur_usage);
+	}
+
+	scaled_val = div_s64(scale_pct * (target_usage - cur_usage), 100);
+
+	cur->cpustat[ind] += scaled_val;
+	if ((s64)cur->cpustat[ind] < 0)
+		cur->cpustat[ind] = 0;
+
+	rem->cpustat[ind] -= scaled_val;
+	if ((s64)rem->cpustat[ind] < 0)
+		rem->cpustat[ind] = 0;
+}
+
+static void calc_vcpustat_delta_idle(struct kernel_cpustat *cur,
+				     int ind, u64 cur_idle, u64 target_idle)
+{
+	/*
+	 * Distribute target_idle between IDLE and IOWAIT proportionally to
+	 * what we initially had on this vcpu.
+	 */
+	if ((s64)cur_idle > 0) {
+		u32 scale_pct = div64_u64(100 * cur->cpustat[ind], cur_idle);
+		cur->cpustat[ind] = div_u64(scale_pct * target_idle, 100);
+	} else {
+		cur->cpustat[ind] = ind == CPUTIME_IDLE ? target_idle : 0;
+	}
+}
+
+static void fixup_vcpustat_delta(struct kernel_cpustat *cur,
+				 struct kernel_cpustat *rem,
+				 u64 max_usage)
+{
+	u64 cur_usage, target_usage, rem_usage;
+	u64 cur_idle, target_idle;
+
+	cur_usage = kernel_cpustat_total_usage(cur);
+	rem_usage = kernel_cpustat_total_usage(rem);
+
+	target_usage = min(cur_usage + rem_usage,
+			   max_usage);
+
+	if (cur_usage != target_usage) {
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_USER,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_NICE,
+				cur_usage, target_usage, rem_usage);
+		fixup_vcpustat_delta_usage(cur, rem, CPUTIME_SYSTEM,
+				cur_usage, target_usage, rem_usage);
+	}
+
+	cur_idle = kernel_cpustat_total_idle(cur);
+	target_idle = max_usage - target_usage;
+
+	if (cur_idle != target_idle) {
+		calc_vcpustat_delta_idle(cur, CPUTIME_IDLE,
+					 cur_idle, target_idle);
+		calc_vcpustat_delta_idle(cur, CPUTIME_IOWAIT,
+					 cur_idle, target_idle);
+	}
+
+	cur->cpustat[CPUTIME_USED] = target_usage;
+
+	/* do not show steal time inside ve */
+	cur->cpustat[CPUTIME_STEAL] = 0;
+}
+
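+/*
+ * Fold per-pcpu cpustat deltas into nr_vcpus virtual cpus: vcpu i
+ * aggregates pcpus j with j % nr_vcpus == i, and each vcpu's usage is
+ * clamped to max_usage, with any excess redistributed via stat_rem.
+ */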
+static void cpu_cgroup_update_vcpustat(struct cgroup *cgrp)
+{
+	int i, j;
+	int nr_vcpus;
+	int vcpu_rate;
+	ktime_t now;
+	u64 abs_delta_ns, max_usage;
+	struct kernel_cpustat stat_delta, stat_rem;
+	struct task_group *tg = cgroup_tg(cgrp);
+	int first_pass = 1;
+
+	spin_lock(&tg->vcpustat_lock);
+
+	now = ktime_get();
+	nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	vcpu_rate = DIV_ROUND_UP(tg->cpu_rate, nr_vcpus);
+	if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
+		vcpu_rate = MAX_CPU_RATE;
+
+	if (!ktime_to_ns(tg->vcpustat_last_update)) {
+		/*
+		 * On the first read, initialize vcpu i stat as a sum of
+		 * stats over pcpus j such that j % nr_vcpus == i.
+		 */
+		for (i = 0; i < nr_vcpus; i++) {
+			for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+				if (!cpu_possible(j))
+					continue;
+				kernel_cpustat_add(tg->vcpustat + i,
+						   cpuacct_cpustat(cgrp, j),
+						   tg->vcpustat + i);
+			}
+		}
+		goto out_update_last;
+	}
+
+	abs_delta_ns = ktime_to_ns(ktime_sub(now, tg->vcpustat_last_update));
+	max_usage = nsecs_to_cputime(abs_delta_ns);
+	max_usage = div_u64(max_usage * vcpu_rate, MAX_CPU_RATE);
+	/* don't update stats too often, to avoid calculation errors */
+	if (max_usage < 10)
+		goto out_unlock;
+
+	/* temporarily copy per cpu usage delta to tg->cpustat_last */
+	for_each_possible_cpu(i)
+		kernel_cpustat_sub(cpuacct_cpustat(cgrp, i),
+				   tg->cpustat_last + i,
+				   tg->cpustat_last + i);
+
+	/* proceed to calculating per vcpu delta */
+	kernel_cpustat_zero(&stat_rem);
+
+again:
+	for (i = 0; i < nr_vcpus; i++) {
+		int exceeds_max;
+
+		kernel_cpustat_zero(&stat_delta);
+		for (j = i; j < nr_cpu_ids; j += nr_vcpus) {
+			if (!cpu_possible(j))
+				continue;
+			kernel_cpustat_add(&stat_delta,
+					   tg->cpustat_last + j, &stat_delta);
+		}
+
+		exceeds_max = kernel_cpustat_total_usage(&stat_delta) >=
+								max_usage;
+		/*
+		 * On the first pass calculate delta for vcpus with usage >
+		 * max_usage in order to accumulate excess in stat_rem.
+		 *
+		 * Once the remainder is accumulated, proceed to the rest of
+		 * vcpus so that it will be distributed among them.
+		 */
+		if (exceeds_max != first_pass)
+			continue;
+
+		fixup_vcpustat_delta(&stat_delta, &stat_rem, max_usage);
+		kernel_cpustat_add(tg->vcpustat + i, &stat_delta,
+				   tg->vcpustat + i);
+	}
+
+	if (first_pass) {
+		first_pass = 0;
+		goto again;
+	}
+out_update_last:
+	for_each_possible_cpu(i)
+		tg->cpustat_last[i] = *cpuacct_cpustat(cgrp, i);
+	tg->vcpustat_last_update = now;
+out_unlock:
+	spin_unlock(&tg->vcpustat_lock);
+}
+
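+/*
+ * Per-cgroup /proc/stat. Inside a container (virt == true) the statistics
+ * are folded into nr_vcpus virtual cpus and steal time is hidden; on the
+ * host the raw per-pcpu cpuacct numbers are shown.
+ */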
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+				struct seq_file *p)
+{
+	int i;
+	unsigned long jif;
+	u64 user, nice, system, idle, iowait, steal;
+	struct timespec boottime;
+	struct task_group *tg = cgroup_tg(cgrp);
+	bool virt = !ve_is_super(get_exec_env()) && tg != &root_task_group;
+	int nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	struct kernel_cpustat *kcpustat;
+	unsigned long tg_nr_running = 0;
+	unsigned long tg_nr_iowait = 0;
+	unsigned long long tg_nr_switches = 0;
+	unsigned long tg_nr_forks = 0;
+
+	getboottime(&boottime);
+	jif = boottime.tv_sec + tg->start_time.tv_sec;
+
+	for_each_possible_cpu(i) {
+		cpu_cgroup_update_stat(cgrp, i);
+
+		/* with autogrouping these sums are inaccurate for the root task group */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		tg_nr_running += tg->cfs_rq[i]->nr_running;
+		tg_nr_iowait += tg->cfs_rq[i]->nr_iowait;
+		tg_nr_switches += tg->cfs_rq[i]->nr_switches;
+		tg_nr_forks += tg->cfs_rq[i]->nr_forks;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		tg_nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	if (virt)
+		cpu_cgroup_update_vcpustat(cgrp);
+
+	user = nice = system = idle = iowait = steal = 0;
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_possible(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i : cpuacct_cpustat(cgrp, i);
+		user += kcpustat->cpustat[CPUTIME_USER];
+		nice += kcpustat->cpustat[CPUTIME_NICE];
+		system += kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle += kcpustat->cpustat[CPUTIME_IDLE];
+		iowait += kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal += kcpustat->cpustat[CPUTIME_STEAL];
+	}
+
+	if (!ve_is_super(get_exec_env()))
+		steal = 0;
+
+	seq_printf(p, "cpu  %llu %llu %llu %llu %llu 0 0 %llu\n",
+		(unsigned long long)cputime64_to_clock_t(user),
+		(unsigned long long)cputime64_to_clock_t(nice),
+		(unsigned long long)cputime64_to_clock_t(system),
+		(unsigned long long)cputime64_to_clock_t(idle),
+		(unsigned long long)cputime64_to_clock_t(iowait),
+		virt ? 0ULL :
+		(unsigned long long)cputime64_to_clock_t(steal));
+
+	for (i = 0; i < (virt ? nr_vcpus : nr_cpu_ids); i++) {
+		if (!virt && !cpu_online(i))
+			continue;
+		kcpustat = virt ? tg->vcpustat + i : cpuacct_cpustat(cgrp, i);
+		user = kcpustat->cpustat[CPUTIME_USER];
+		nice = kcpustat->cpustat[CPUTIME_NICE];
+		system = kcpustat->cpustat[CPUTIME_SYSTEM];
+		idle = kcpustat->cpustat[CPUTIME_IDLE];
+		iowait = kcpustat->cpustat[CPUTIME_IOWAIT];
+		steal = kcpustat->cpustat[CPUTIME_STEAL];
+		if (!ve_is_super(get_exec_env()))
+			steal = 0;
+		seq_printf(p,
+			"cpu%d %llu %llu %llu %llu %llu 0 0 %llu\n",
+			i,
+			(unsigned long long)cputime64_to_clock_t(user),
+			(unsigned long long)cputime64_to_clock_t(nice),
+			(unsigned long long)cputime64_to_clock_t(system),
+			(unsigned long long)cputime64_to_clock_t(idle),
+			(unsigned long long)cputime64_to_clock_t(iowait),
+			virt ? 0ULL :
+			(unsigned long long)cputime64_to_clock_t(steal));
+	}
+	seq_printf(p, "intr 0\nswap 0 0\n");
+
+	seq_printf(p,
+		"\nctxt %llu\n"
+		"btime %lu\n"
+		"processes %lu\n"
+		"procs_running %lu\n"
+		"procs_blocked %lu\n",
+		tg_nr_switches,
+		(unsigned long)jif,
+		tg_nr_forks,
+		tg_nr_running,
+		tg_nr_iowait);
+
+	return 0;
+}
+
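+/*
+ * Per-cgroup /proc/loadavg: the FIXED_1/200 bias rounds the fixed-point
+ * average to the nearest hundredth, so avenrun[0] = 2.5 * FIXED_1 prints
+ * as "2.50".
+ */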
+int cpu_cgroup_proc_loadavg(struct cgroup *cgrp, struct cftype *cft,
+			    struct seq_file *p)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	unsigned long avnrun[3];
+	int nr_running = 0;
+	int i;
+
+	avnrun[0] = tg->avenrun[0] + FIXED_1/200;
+	avnrun[1] = tg->avenrun[1] + FIXED_1/200;
+	avnrun[2] = tg->avenrun[2] + FIXED_1/200;
+
+	for_each_possible_cpu(i) {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+		nr_running += tg->cfs_rq[i]->nr_running;
+#endif
+#ifdef CONFIG_RT_GROUP_SCHED
+		nr_running += tg->rt_rq[i]->rt_nr_running;
+#endif
+	}
+
+	seq_printf(p, "%lu.%02lu %lu.%02lu %lu.%02lu %d/%d %d\n",
+		LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+		LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+		LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
+		nr_running, cgroup_task_count(cgrp),
+		task_active_pid_ns(current)->last_pid);
+	return 0;
+}
+
+int cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	int nr_vcpus = tg->nr_cpus ?: num_online_cpus();
+	int i;
+
+	kernel_cpustat_zero(kstat);
+
+	if (tg == &root_task_group)
+		return -ENOENT;
+
+	for_each_possible_cpu(i)
+		cpu_cgroup_update_stat(cgrp, i);
+
+	cpu_cgroup_update_vcpustat(cgrp);
+
+	for (i = 0; i < nr_vcpus; i++)
+		kernel_cpustat_add(tg->vcpustat + i, kstat, kstat);
+
+	return 0;
+}
+
+int cpu_cgroup_get_avenrun(struct cgroup *cgrp, unsigned long *avenrun)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+
+	if (tg == &root_task_group)
+		return -ENOSYS;
+
+	avenrun[0] = tg->avenrun[0];
+	avenrun[1] = tg->avenrun[1];
+	avenrun[2] = tg->avenrun[2];
+
+	return 0;
+}
+
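+/*
+ * delayacct.total: aggregate delay accounting for the whole group -- the
+ * per-cpu stats of tasks that have already exited (tg->taskstats, filled
+ * in cpu_cgroup_exit()) plus the stats of every live task in the cgroup.
+ */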
+static int cpu_cgroup_delay_show(struct cgroup *cgrp, struct cftype *cft,
+				 struct cgroup_map_cb *cb)
+{
+	struct task_group *tg = cgroup_tg(cgrp);
+	struct taskstats stats;
+	struct cgroup_iter it;
+	struct task_struct *p;
+	int cpu;
+
+	memset(&stats, 0, sizeof stats);
+
+	for_each_present_cpu(cpu)
+		delayacct_add_stats(&stats, per_cpu_ptr(tg->taskstats, cpu));
+
+	cgroup_iter_start(cgrp, &it);
+	while ((p = cgroup_iter_next(cgrp, &it))) {
+		if (thread_group_leader(p) && p->signal->stats)
+			delayacct_add_stats(&stats, p->signal->stats);
+		delayacct_add_tsk(&stats, p);
+	}
+	cgroup_iter_end(cgrp, &it);
+
+	cb->fill(cb, "cpu_count", stats.cpu_count);
+	cb->fill(cb, "cpu_delay", stats.cpu_delay_total);
+	cb->fill(cb, "cpu_run_real", stats.cpu_run_real_total);
+	cb->fill(cb, "cpu_run_virtual", stats.cpu_run_virtual_total);
+	cb->fill(cb, "cpu_scaled_run_real", stats.cpu_scaled_run_real_total);
+	cb->fill(cb, "blkio_count", stats.blkio_count);
+	cb->fill(cb, "blkio_delay", stats.blkio_delay_total);
+	cb->fill(cb, "swapin_count", stats.swapin_count);
+	cb->fill(cb, "swapin_delay", stats.swapin_delay_total);
+	cb->fill(cb, "freepages_count", stats.freepages_count);
+	cb->fill(cb, "freepages_delay", stats.freepages_delay_total);
+
+	return 0;
+}
+
 static struct cftype cpu_files[] = {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	{
@@ -9250,6 +10119,18 @@ static struct cftype cpu_files[] = {
 		.read_map = cpu_stats_show,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.name = "rate",
+		.read_u64 = cpu_rate_read_u64,
+		.write_u64 = cpu_rate_write_u64,
+	},
+	{
+		.name = "nr_cpus",
+		.read_u64 = nr_cpus_read_u64,
+		.write_u64 = nr_cpus_write_u64,
+	},
+#endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	{
 		.name = "rt_runtime_us",
@@ -9262,6 +10143,18 @@ static struct cftype cpu_files[] = {
 		.write_u64 = cpu_rt_period_write_uint,
 	},
 #endif
+	{
+		.name = "proc.stat",
+		.read_seq_string = cpu_cgroup_proc_stat,
+	},
+	{
+		.name = "proc.loadavg",
+		.read_seq_string = cpu_cgroup_proc_loadavg,
+	},
+	{
+		.name = "delayacct.total",
+		.read_map = cpu_cgroup_delay_show,
+	},
 	{ }	/* terminate */
 };
 
@@ -9271,6 +10164,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
 	.css_free	= cpu_cgroup_css_free,
 	.css_online	= cpu_cgroup_css_online,
 	.css_offline	= cpu_cgroup_css_offline,
+	.fork		= cpu_cgroup_fork,
 	.can_attach	= cpu_cgroup_can_attach,
 	.attach		= cpu_cgroup_attach,
 	.exit		= cpu_cgroup_exit,
--- a/kernel/sched/cpuacct.c
+++ b/kernel/sched/cpuacct.c
@@ -286,6 +286,11 @@ void cpuacct_account_field(struct task_struct *p, int index, u64 val)
 	rcu_read_unlock();
 }
 
+struct kernel_cpustat *cpuacct_cpustat(struct cgroup *cgrp, int cpu)
+{
+	return per_cpu_ptr(cgroup_ca(cgrp)->cpustat, cpu);
+}
+
 struct cgroup_subsys cpuacct_subsys = {
 	.name		= "cpuacct",
 	.css_alloc	= cpuacct_css_alloc,
--- a/kernel/sched/cpuacct.h
+++ b/kernel/sched/cpuacct.h
@@ -3,6 +3,9 @@
 extern void cpuacct_charge(struct task_struct *tsk, u64 cputime);
 extern void cpuacct_account_field(struct task_struct *p, int index, u64 val);
 
+struct cgroup;
+extern struct kernel_cpustat *cpuacct_cpustat(struct cgroup *cgrp, int cpu);
+
 #else
 
 static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -444,6 +444,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	*ut = p->utime;
 	*st = p->stime;
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
@@ -624,6 +625,7 @@ void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 	task_cputime(p, &cputime.utime, &cputime.stime);
 	cputime_adjust(&cputime, &p->prev_cputime, ut, st);
 }
+EXPORT_SYMBOL_GPL(task_cputime_adjusted);
 
 void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
 {
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -236,6 +236,11 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
 			cfs_rq->throttle_count);
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+	SEQ_printf(m, "  .%-30s: %d\n", "nr_cpus_active",
+		   atomic_read(&cfs_rq->tg->nr_cpus_active));
+#endif
+
 	print_cfs_group_stats(m, cpu, cfs_rq->tg);
 #endif
 }
@@ -595,6 +600,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 		P(se.statistics->nr_migrations_cold);
 		P(se.statistics->nr_failed_migrations_affine);
 		P(se.statistics->nr_failed_migrations_running);
+		P(se.statistics->nr_failed_migrations_cpulimit);
 		P(se.statistics->nr_failed_migrations_hot);
 		P(se.statistics->nr_forced_migrations);
 		P(se.statistics->nr_wakeups);
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -29,6 +29,8 @@
 #include <linux/mempolicy.h>
 #include <linux/migrate.h>
 #include <linux/task_work.h>
+#include <linux/ve.h>
+#include <linux/vzstat.h>
 
 #include <trace/events/sched.h>
 
@@ -113,6 +115,10 @@ unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 #endif
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int sysctl_sched_vcpu_hotslice = 5000000UL;
+#endif
+
 /*
  * Increase the granularity value when there are more CPUs,
  * because with more CPUs the 'effective latency' as visible
@@ -248,11 +254,6 @@ static inline struct task_struct *task_of(struct sched_entity *se)
 #define for_each_sched_entity(se) \
 		for (; se; se = se->parent)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return p->se.cfs_rq;
-}
-
 /* runqueue on which this entity is (to be) queued */
 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 {
@@ -320,16 +321,6 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
 }
 
 /* return depth at which a sched entity is present in the hierarchy */
-static inline int depth_se(struct sched_entity *se)
-{
-	int depth = 0;
-
-	for_each_sched_entity(se)
-		depth++;
-
-	return depth;
-}
-
 static void
 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 {
@@ -343,8 +334,8 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 	 */
 
 	/* First walk up until both entities are at same depth */
-	se_depth = depth_se(*se);
-	pse_depth = depth_se(*pse);
+	se_depth = (*se)->depth;
+	pse_depth = (*pse)->depth;
 
 	while (se_depth > pse_depth) {
 		se_depth--;
@@ -379,11 +370,6 @@ static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 #define for_each_sched_entity(se) \
 		for (; se; se = NULL)
 
-static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
-{
-	return &task_rq(p)->cfs;
-}
-
 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 {
 	struct task_struct *p = task_of(se);
@@ -427,6 +413,141 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 
 #endif	/* CONFIG_FAIR_GROUP_SCHED */
 
+#ifdef CONFIG_CFS_BANDWIDTH
+
+#ifdef HAVE_JUMP_LABEL
+static struct static_key __cfs_bandwidth_used;
+
+static inline bool cfs_bandwidth_used(void)
+{
+	return static_key_false(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_inc(void)
+{
+	static_key_slow_inc(&__cfs_bandwidth_used);
+}
+
+void cfs_bandwidth_usage_dec(void)
+{
+	static_key_slow_dec(&__cfs_bandwidth_used);
+}
+#else /* HAVE_JUMP_LABEL */
+static bool cfs_bandwidth_used(void)
+{
+	return true;
+}
+
+void cfs_bandwidth_usage_inc(void) {}
+void cfs_bandwidth_usage_dec(void) {}
+#endif /* HAVE_JUMP_LABEL */
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return cfs_bandwidth_used() && cfs_rq->throttled;
+}
+
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return !list_empty(&cfs_rq->boosted_entities);
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return se->boosted;
+}
+
+#else /* !CONFIG_CFS_BANDWIDTH */
+
+static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int cfs_rq_has_boosted_entities(struct cfs_rq *cfs_rq)
+{
+	return 0;
+}
+
+static inline int entity_boosted(struct sched_entity *se)
+{
+	return 0;
+}
+
+#endif /* CONFIG_CFS_BANDWIDTH */
+
+#ifdef CONFIG_CFS_CPULIMIT
+static inline int cfs_rq_active(struct cfs_rq *cfs_rq)
+{
+	return cfs_rq->active;
+}
+
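+/*
+ * nr_cpus_active counts the group's cfs_rqs that currently carry load.
+ * A decrement may be postponed by sysctl_sched_vcpu_hotslice so that a
+ * briefly idle vcpu stays "active" across short sleeps instead of
+ * bouncing the group between cpus.
+ */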
+static void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+	/* if we canceled delayed dec, there is no need to do inc */
+	if (hrtimer_try_to_cancel(&cfs_rq->active_timer) != 1)
+		atomic_inc(&cfs_rq->tg->nr_cpus_active);
+	cfs_rq->active = 1;
+}
+
+static void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+	if (!cfs_rq->runtime_enabled || !sysctl_sched_vcpu_hotslice)
+		postpone = 0;
+
+	if (!postpone) {
+		cfs_rq->active = 0;
+		atomic_dec(&cfs_rq->tg->nr_cpus_active);
+	} else {
+		hrtimer_start_range_ns(&cfs_rq->active_timer,
+				ns_to_ktime(sysctl_sched_vcpu_hotslice), 0,
+				HRTIMER_MODE_REL_PINNED);
+	}
+}
+
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer)
+{
+	struct cfs_rq *cfs_rq =
+		container_of(timer, struct cfs_rq, active_timer);
+	struct rq *rq = rq_of(cfs_rq);
+	unsigned long flags;
+
+	raw_spin_lock_irqsave(&rq->lock, flags);
+	cfs_rq->active = !list_empty(&cfs_rq->tasks);
+	raw_spin_unlock_irqrestore(&rq->lock, flags);
+
+	atomic_dec(&cfs_rq->tg->nr_cpus_active);
+
+	return HRTIMER_NORESTART;
+}
+
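+/*
+ * Returns 1 if the group may spread to one more cpu, 0 if it is at its
+ * limit but target_cpu is already active for it, and -1 otherwise (the
+ * group is over its limit, or at it with target_cpu inactive).
+ */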
+static inline int check_cpulimit_spread(struct task_group *tg, int target_cpu)
+{
+	int nr_cpus_active = atomic_read(&tg->nr_cpus_active);
+	int nr_cpus_limit = DIV_ROUND_UP(tg->cpu_rate, MAX_CPU_RATE);
+
+	nr_cpus_limit = nr_cpus_limit && tg->nr_cpus ?
+		min_t(int, nr_cpus_limit, tg->nr_cpus) :
+		max_t(int, nr_cpus_limit, tg->nr_cpus);
+
+	if (!nr_cpus_limit || nr_cpus_active < nr_cpus_limit)
+		return 1;
+
+	if (nr_cpus_active > nr_cpus_limit)
+		return -1;
+
+	return cfs_rq_active(tg->cfs_rq[target_cpu]) ? 0 : -1;
+}
+#else /* !CONFIG_CFS_CPULIMIT */
+static inline void inc_nr_active_cfs_rqs(struct cfs_rq *cfs_rq)
+{
+}
+
+static inline void dec_nr_active_cfs_rqs(struct cfs_rq *cfs_rq, int postpone)
+{
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 static __always_inline
 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec);
 
@@ -727,6 +848,27 @@ static void update_curr_fair(struct rq *rq)
 	update_curr(cfs_rq_of(&rq->curr->se));
 }
 
+static void dequeue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_SCHEDSTATS
+	if (entity_is_task(se)) {
+		struct task_struct *tsk = task_of(se);
+
+		if (tsk->state & TASK_INTERRUPTIBLE)
+			se->statistics->sleep_start = rq_clock(rq_of(cfs_rq));
+		if (tsk->state & TASK_UNINTERRUPTIBLE)
+			se->statistics->block_start = rq_clock(rq_of(cfs_rq));
+		if (tsk->in_iowait)
+			cfs_rq->nr_iowait++;
+	} else if (!cfs_rq_throttled(group_cfs_rq(se))) {
+		if (group_cfs_rq(se)->nr_iowait)
+			se->statistics->block_start = rq_clock(rq_of(cfs_rq));
+		else
+			se->statistics->sleep_start = rq_clock(rq_of(cfs_rq));
+	}
+#endif
+}
+
 #ifdef CONFIG_SCHEDSTATS
 static inline void
 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -734,6 +876,25 @@ update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->statistics->wait_start, rq_clock(rq_of(cfs_rq)));
 }
 
+static inline void update_sched_lat(struct task_struct *t, u64 now)
+{
+#ifdef CONFIG_VE
+	int cpu;
+	u64 ve_wstamp;
+
+	/* safe due to runqueue lock */
+	cpu = smp_processor_id();
+	ve_wstamp = t->se.statistics->wait_start;
+
+	if (ve_wstamp && now > ve_wstamp) {
+		KSTAT_LAT_PCPU_ADD(&kstat_glob.sched_lat,
+				cpu, now - ve_wstamp);
+		KSTAT_LAT_PCPU_ADD(&t->task_ve->sched_lat_ve,
+				cpu, now - ve_wstamp);
+	}
+#endif
+}
+
 static void
 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
@@ -743,8 +904,11 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	schedstat_set(se->statistics->wait_sum, se->statistics->wait_sum +
 			rq_clock(rq_of(cfs_rq)) - se->statistics->wait_start);
 	if (entity_is_task(se)) {
-		trace_sched_stat_wait(task_of(se),
-			rq_clock(rq_of(cfs_rq)) - se->statistics->wait_start);
+		u64 now = rq_clock(rq_of(cfs_rq));
+		struct task_struct *p = task_of(se);
+
+		trace_sched_stat_wait(p, now - se->statistics->wait_start);
+		update_sched_lat(p, now);
 	}
 	schedstat_set(se->statistics->wait_start, 0);
 }
@@ -773,17 +937,8 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (se != cfs_rq->curr)
 		update_stats_wait_end(cfs_rq, se);
 
-	if (flags & DEQUEUE_SLEEP) {
-		if (entity_is_task(se)) {
-			struct task_struct *tsk = task_of(se);
-
-			if (tsk->state & TASK_INTERRUPTIBLE)
-				se->statistics->sleep_start = rq_clock(rq_of(cfs_rq));
-			if (tsk->state & TASK_UNINTERRUPTIBLE)
-				se->statistics->block_start = rq_clock(rq_of(cfs_rq));
-		}
-	}
-
+	if (flags & DEQUEUE_SLEEP)
+		dequeue_sleeper(cfs_rq, se);
 }
 #else
 static inline void
@@ -819,6 +974,106 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	se->exec_start = rq_clock_task(rq_of(cfs_rq));
 }
 
+#ifdef CONFIG_CFS_BANDWIDTH
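+/*
+ * Entity boosting: a task that has just been woken, or was preempted
+ * outside of a voluntary _cond_resched_may_throttle() point, is marked
+ * "boosted" and may keep running even when its cfs_rq has exhausted its
+ * quota, so that a throttled group cannot stall a lock holder (see the
+ * priority-inversion note in pick_next_entity()).
+ */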
+static inline void update_entity_boost(struct sched_entity *se)
+{
+	if (!entity_is_task(se)) {
+		se->boosted = cfs_rq_has_boosted_entities(group_cfs_rq(se));
+		WARN_ON(se->boosted && cfs_rq_throttled(group_cfs_rq(se)));
+	} else {
+		struct task_struct *p = task_of(se);
+
+		if (unlikely(p != current))
+			return;
+
+		if (!(preempt_count() & PREEMPT_ACTIVE)) {
+			se->boosted = sched_feat(BOOST_WAKEUPS) &&
+					p->woken_while_running;
+			p->woken_while_running = 0;
+		} else
+			se->boosted = sched_feat(BOOST_PREEMPT) &&
+				      !p->may_throttle;
+	}
+}
+
+static int check_enqueue_boost(struct rq *rq, struct task_struct *p, int flags)
+{
+	if (sched_feat(BOOST_WAKEUPS) && (flags & ENQUEUE_WAKEUP))
+		p->se.boosted = 1;
+	return p->se.boosted;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	list_add(&se->boost_node, &cfs_rq->boosted_entities);
+}
+
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se)
+{
+	if (WARN_ON(se->boost_node.next == LIST_POISON1))
+		return;
+	list_del(&se->boost_node);
+}
+
+static int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (entity_is_task(se) || !entity_boosted(se)) {
+		if (se != cfs_rq->curr)
+			__enqueue_boosted_entity(cfs_rq, se);
+		se->boosted = 1;
+		WARN_ON(!entity_is_task(se) &&
+			cfs_rq_throttled(group_cfs_rq(se)));
+		return 1;
+	} else
+		WARN_ON(cfs_rq_throttled(group_cfs_rq(se)));
+
+	return 0;
+}
+
+static int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+				  struct sched_entity *se)
+{
+	if (entity_is_task(se) ||
+	    !cfs_rq_has_boosted_entities(group_cfs_rq(se))) {
+		if (se != cfs_rq->curr)
+			__dequeue_boosted_entity(cfs_rq, se);
+		if (!entity_is_task(se))
+			se->boosted = 0;
+		return 1;
+	}
+
+	return 0;
+}
+#else
+static inline void update_entity_boost(struct sched_entity *se) {}
+
+static inline int check_enqueue_boost(struct rq *rq,
+				      struct task_struct *p, int flags)
+{
+	return 0;
+}
+
+static inline void __enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+static inline void __dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					    struct sched_entity *se) {}
+
+static inline int enqueue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+
+static inline int dequeue_boosted_entity(struct cfs_rq *cfs_rq,
+					 struct sched_entity *se)
+{
+	return 0;
+}
+#endif
+
 /**************************************************
  * Scheduling class queueing methods:
  */
@@ -1133,8 +1388,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
 	if (env->best_task)
 		put_task_struct(env->best_task);
-	if (p)
-		get_task_struct(p);
 
 	env->best_task = p;
 	env->best_imp = imp;
@@ -1156,11 +1409,30 @@ static void task_numa_compare(struct task_numa_env *env,
 	long dst_load, src_load;
 	long load;
 	long imp = (groupimp > 0) ? groupimp : taskimp;
+	bool assigned = false;
 
 	rcu_read_lock();
-	cur = ACCESS_ONCE(dst_rq->curr);
-	if (cur->pid == 0) /* idle */
+	raw_spin_lock_irq(&dst_rq->lock);
+	cur = dst_rq->curr;
+	/*
+	 * No need to move the exiting task or idle task.
+	 */
+	if ((cur->flags & PF_EXITING) || is_idle_task(cur))
 		cur = NULL;
+	else {
+		/*
+		 * The task_struct must be protected here to protect the
+		 * p->numa_faults access in the task_weight since the
+		 * numa_faults could already be freed in the following path:
+		 * finish_task_switch()
+		 *     --> put_task_struct()
+		 *         --> __put_task_struct()
+		 *             --> task_numa_free()
+		 */
+		get_task_struct(cur);
+	}
+
+	raw_spin_unlock_irq(&dst_rq->lock);
 
 	/*
 	 * "imp" is the fault differential for the source task between the
@@ -1249,9 +1521,16 @@ balance:
 		goto unlock;
 
 assign:
+	assigned = true;
 	task_numa_assign(env, cur, imp);
 unlock:
 	rcu_read_unlock();
+	/*
+	 * If dst_rq->curr was not assigned as the best task, drop the
+	 * reference acquired above.
+	 */
+	if (cur && !assigned)
+		put_task_struct(cur);
 }
 
 static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2054,6 +2333,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 		account_numa_enqueue(rq, task_of(se));
 		list_add(&se->group_node, &rq->cfs_tasks);
+		list_add(&se->cfs_rq_node, &cfs_rq->tasks);
 	}
 #endif
 	cfs_rq->nr_running++;
@@ -2068,6 +2348,7 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
 	if (entity_is_task(se)) {
 		account_numa_dequeue(rq_of(cfs_rq), task_of(se));
 		list_del_init(&se->group_node);
+		list_del_init(&se->cfs_rq_node);
 	}
 	cfs_rq->nr_running--;
 }
@@ -2678,12 +2959,13 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			se->statistics->sleep_max = delta;
 
 		se->statistics->sleep_start = 0;
-		se->statistics->sum_sleep_runtime += delta;
 
 		if (tsk) {
 			account_scheduler_latency(tsk, delta >> 10, 1);
 			trace_sched_stat_sleep(tsk, delta);
 		}
+
+		se->statistics->sum_sleep_runtime += delta;
 	}
 	if (se->statistics->block_start) {
 		u64 delta = rq_clock(rq_of(cfs_rq)) - se->statistics->block_start;
@@ -2695,7 +2977,6 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 			se->statistics->block_max = delta;
 
 		se->statistics->block_start = 0;
-		se->statistics->sum_sleep_runtime += delta;
 
 		if (tsk) {
 			if (tsk->in_iowait) {
@@ -2717,11 +2998,42 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
 						delta >> 20);
 			}
 			account_scheduler_latency(tsk, delta >> 10, 0);
-		}
+		} else
+			se->statistics->iowait_sum += delta;
+
+		se->statistics->sum_sleep_runtime += delta;
 	}
 #endif
 }
 
+void start_cfs_idle_time_accounting(int cpu)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+
+	list_for_each_entry(tg, &task_groups, list) {
+		if (tg != &root_task_group &&
+		    !tg->cfs_rq[cpu]->nr_running) {
+			se = tg->se[cpu];
+			dequeue_sleeper(cfs_rq_of(se), se);
+		}
+	}
+}
+
+void stop_cfs_idle_time_accounting(int cpu)
+{
+	struct task_group *tg;
+	struct sched_entity *se;
+
+	list_for_each_entry(tg, &task_groups, list) {
+		if (tg != &root_task_group &&
+		    !tg->cfs_rq[cpu]->nr_running) {
+			se = tg->se[cpu];
+			enqueue_sleeper(cfs_rq_of(se), se);
+		}
+	}
+}
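+
+/*
+ * The two helpers above emulate sleep/wakeup of empty group sched
+ * entities across cpu hotplug: a group with no tasks on a cpu starts
+ * accruing "sleep" (idle) time when the cpu comes online and flushes it
+ * when the cpu goes offline; see migration_call().
+ */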
+
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
 #ifdef CONFIG_SCHED_DEBUG
@@ -2767,7 +3079,7 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
 	se->vruntime = max_vruntime(se->vruntime, vruntime);
 }
 
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags);
 
 static inline void check_schedstat_required(void)
 {
@@ -2792,6 +3104,9 @@ static inline void check_schedstat_required(void)
 static void
 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 {
+	if (!cfs_rq->load.weight)
+		inc_nr_active_cfs_rqs(cfs_rq);
+
 	/*
 	 * Update the normalized vruntime before updating min_vruntime
 	 * through callig update_curr().
@@ -2824,7 +3139,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	if (cfs_rq->nr_running == 1) {
 		list_add_leaf_cfs_rq(cfs_rq);
-		check_enqueue_throttle(cfs_rq);
+		check_enqueue_throttle(cfs_rq, flags);
 	}
 }
 
@@ -2889,6 +3204,9 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 
 	clear_buddies(cfs_rq, se);
 
+	if (cfs_rq->prev == se)
+		cfs_rq->prev = NULL;
+
 	if (se != cfs_rq->curr)
 		__dequeue_entity(cfs_rq, se);
 	se->on_rq = 0;
@@ -2902,11 +3220,18 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
 	if (!(flags & DEQUEUE_SLEEP))
 		se->vruntime -= cfs_rq->min_vruntime;
 
-	/* return excess runtime on last dequeue */
-	return_cfs_rq_runtime(cfs_rq);
+	if (!cfs_rq->nr_running) {
+		/* return excess runtime on last dequeue */
+		return_cfs_rq_runtime(cfs_rq);
+		/* account switch to idle task */
+		cfs_rq->nr_switches++;
+	}
 
 	update_min_vruntime(cfs_rq);
 	update_cfs_shares(cfs_rq);
+
+	if (!cfs_rq->load.weight)
+		dec_nr_active_cfs_rqs(cfs_rq, flags & DEQUEUE_TASK_SLEEP);
 }
 
 /*
@@ -2962,10 +3287,14 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 		if (schedstat_enabled())
 			update_stats_wait_end(cfs_rq, se);
 		__dequeue_entity(cfs_rq, se);
+		if (entity_boosted(se))
+			__dequeue_boosted_entity(cfs_rq, se);
 	}
 
 	update_stats_curr_start(cfs_rq, se);
 	cfs_rq->curr = se;
+	if (cfs_rq->prev != se)
+		cfs_rq->nr_switches++;
 #ifdef CONFIG_SCHEDSTATS
 	/*
 	 * Track our maximum slice length, if the CPU's load is at
@@ -3017,6 +3346,20 @@ static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
 		se = cfs_rq->next;
 
+#ifdef CONFIG_CFS_BANDWIDTH
+	/*
+	 * Give boosted tasks a chance to finish their kernel-mode execution in
+	 * order to avoid prio inversion in case they hold a lock, but resched
+	 * them asap for the sake of fairness.
+	 */
+	if (cfs_rq->runtime_enabled && cfs_rq->runtime_remaining <= 0) {
+		if (cfs_rq_has_boosted_entities(cfs_rq))
+			se = list_first_entry(&cfs_rq->boosted_entities,
+					      struct sched_entity, boost_node);
+		rq_of(cfs_rq)->resched_next = 1;
+	}
+#endif
+
 	clear_buddies(cfs_rq, se);
 
 	return se;
@@ -3033,6 +3376,14 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 	if (prev->on_rq)
 		update_curr(cfs_rq);
 
+	update_entity_boost(prev);
+	if (entity_boosted(prev) && prev->on_rq) {
+		__enqueue_boosted_entity(cfs_rq, prev);
+		if (unlikely(cfs_rq_throttled(cfs_rq)))
+			/* prev was moved to throttled cfs_rq */
+			unthrottle_cfs_rq(cfs_rq);
+	}
+
 	/* throttle cfs_rqs exceeding runtime */
 	check_cfs_rq_runtime(cfs_rq);
 
@@ -3047,7 +3398,9 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
 		__enqueue_entity(cfs_rq, prev);
 		/* in !on_rq case, update occurred at dequeue */
 		update_entity_load_avg(prev, 1);
-	}
+		cfs_rq->prev = prev;
+	} else
+		cfs_rq->prev = NULL;
 	cfs_rq->curr = NULL;
 }
 
@@ -3094,42 +3447,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
 
 #ifdef CONFIG_CFS_BANDWIDTH
 
-#ifdef HAVE_JUMP_LABEL
-static struct static_key __cfs_bandwidth_used;
-
-static inline bool cfs_bandwidth_used(void)
-{
-	return static_key_false(&__cfs_bandwidth_used);
-}
-
-void cfs_bandwidth_usage_inc(void)
-{
-	static_key_slow_inc(&__cfs_bandwidth_used);
-}
-
-void cfs_bandwidth_usage_dec(void)
-{
-	static_key_slow_dec(&__cfs_bandwidth_used);
-}
-#else /* HAVE_JUMP_LABEL */
-static bool cfs_bandwidth_used(void)
-{
-	return true;
-}
-
-void cfs_bandwidth_usage_inc(void) {}
-void cfs_bandwidth_usage_dec(void) {}
-#endif /* HAVE_JUMP_LABEL */
-
-/*
- * default period for cfs group bandwidth.
- * default: 0.1s, units: nanoseconds
- */
-static inline u64 default_cfs_period(void)
-{
-	return 100000000ULL;
-}
-
 static inline u64 sched_cfs_bandwidth_slice(void)
 {
 	return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
@@ -3275,11 +3592,6 @@ void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec)
 	__account_cfs_rq_runtime(cfs_rq, delta_exec);
 }
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return cfs_bandwidth_used() && cfs_rq->throttled;
-}
-
 /* check whether cfs_rq, or any parent, is throttled */
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
@@ -3348,6 +3660,8 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
 	rcu_read_unlock();
 
+	cfs_rq->throttled = 1;
+
 	task_delta = cfs_rq->h_nr_running;
 	for_each_sched_entity(se) {
 		struct cfs_rq *qcfs_rq = cfs_rq_of(se);
@@ -3366,7 +3680,6 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 	if (!se)
 		rq->nr_running -= task_delta;
 
-	cfs_rq->throttled = 1;
 	cfs_rq->throttled_clock = rq_clock(rq);
 	raw_spin_lock(&cfs_b->lock);
 	list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
@@ -3661,11 +3974,36 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
  * runtime as update_curr() throttling cannot trigger until it's on-rq.
  */
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags)
 {
+	WARN_ON(cfs_rq_has_boosted_entities(cfs_rq));
+
 	if (!cfs_bandwidth_used())
 		return;
 
+	/* Synchronize hierarchical throttle counter: */
+	if (unlikely(!cfs_rq->throttle_uptodate)) {
+		struct rq *rq = rq_of(cfs_rq);
+		struct cfs_rq *pcfs_rq;
+		struct task_group *tg;
+
+		cfs_rq->throttle_uptodate = 1;
+
+		/* Get closest up-to-date node, because leaves go first: */
+		for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
+			pcfs_rq = tg->cfs_rq[cpu_of(rq)];
+			if (pcfs_rq->throttle_uptodate)
+				break;
+		}
+		if (tg) {
+			cfs_rq->throttle_count = pcfs_rq->throttle_count;
+			cfs_rq->throttled_clock_task = rq_clock_task(rq);
+		}
+	}
+
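+	/*
+	 * A boosted enqueue must not trigger throttling here: the task is
+	 * allowed to finish its kernel-mode section first (see
+	 * pick_next_entity()).
+	 */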
+	if (flags & ENQUEUE_BOOST)
+		return;
+
 	/* an active group must be handled by the update_curr()->put() path */
 	if (!cfs_rq->runtime_enabled || cfs_rq->curr)
 		return;
@@ -3713,6 +4051,9 @@ static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 	if (cfs_rq_throttled(cfs_rq))
 		return;
 
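+	/* Defer throttling while boosted entities still run in kernel mode. */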
+	if (cfs_rq_has_boosted_entities(cfs_rq))
+		return;
+
 	throttle_cfs_rq(cfs_rq);
 }
 
@@ -3766,10 +4107,17 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
 	cfs_b->slack_timer.function = sched_cfs_slack_timer;
 }
 
+static enum hrtimer_restart sched_cfs_active_timer(struct hrtimer *timer);
+
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->runtime_enabled = 0;
 	INIT_LIST_HEAD(&cfs_rq->throttled_list);
+	INIT_LIST_HEAD(&cfs_rq->boosted_entities);
+#ifdef CONFIG_CFS_CPULIMIT
+	hrtimer_init(&cfs_rq->active_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	cfs_rq->active_timer.function = sched_cfs_active_timer;
+#endif
 }
 
 /* requires cfs_b->lock, may release to reprogram timer */
@@ -3828,15 +4176,10 @@ static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
 
 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {}
 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
-static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
+static void check_enqueue_throttle(struct cfs_rq *cfs_rq, int flags) {}
 static inline void sync_throttle(struct task_group *tg, int cpu) {}
 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
 
-static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
-{
-	return 0;
-}
-
 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
 {
 	return 0;
@@ -3925,11 +4268,14 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boost = check_enqueue_boost(rq, p, flags);
 
 	for_each_sched_entity(se) {
 		if (se->on_rq)
 			break;
 		cfs_rq = cfs_rq_of(se);
+		if (boost)
+			flags |= ENQUEUE_BOOST;
 		enqueue_entity(cfs_rq, se, flags);
 
 		/*
@@ -3942,6 +4288,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running++;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
 		flags = ENQUEUE_WAKEUP;
 	}
 
@@ -3952,6 +4301,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		if (cfs_rq_throttled(cfs_rq))
 			break;
 
+		if (boost)
+			boost = enqueue_boosted_entity(cfs_rq, se);
+
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
 	}
@@ -3959,6 +4311,16 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 	if (!se) {
 		update_rq_runnable_avg(rq, rq->nr_running);
 		inc_nr_running(rq);
+	} else if (boost) {
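+		/*
+		 * The enqueue loop stopped at a throttled ancestor: propagate
+		 * the boost up the rest of the hierarchy and unthrottle it so
+		 * the boosted task can run immediately.
+		 */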
+		for_each_sched_entity(se) {
+			cfs_rq = cfs_rq_of(se);
+			if (!enqueue_boosted_entity(cfs_rq, se)) {
+				WARN_ON(throttled_hierarchy(cfs_rq));
+				break;
+			}
+			if (cfs_rq_throttled(cfs_rq))
+				unthrottle_cfs_rq(cfs_rq);
+		}
 	}
 	hrtick_update(rq);
 }
@@ -3974,8 +4336,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
 	struct cfs_rq *cfs_rq;
 	struct sched_entity *se = &p->se;
+	int boosted = entity_boosted(se);
 	int task_sleep = flags & DEQUEUE_SLEEP;
 
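+	/*
+	 * Remember whether the task itself goes to sleep: DEQUEUE_SLEEP is
+	 * also set for parent entities while walking up the hierarchy, so it
+	 * cannot carry that information by itself.
+	 */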
+	if (task_sleep)
+		flags |= DEQUEUE_TASK_SLEEP;
+
 	for_each_sched_entity(se) {
 		cfs_rq = cfs_rq_of(se);
 		dequeue_entity(cfs_rq, se, flags);
@@ -3990,6 +4356,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 			break;
 		cfs_rq->h_nr_running--;
 
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
+
 		/* Don't dequeue parent if it has other entities besides us */
 		if (cfs_rq->load.weight) {
 			/* Avoid re-evaluating load for this entity: */
@@ -4009,8 +4378,13 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 		cfs_rq = cfs_rq_of(se);
 		cfs_rq->h_nr_running--;
 
-		if (cfs_rq_throttled(cfs_rq))
+		if (cfs_rq_throttled(cfs_rq)) {
+			WARN_ON(boosted);
 			break;
+		}
+
+		if (boosted)
+			boosted = dequeue_boosted_entity(cfs_rq, se);
 
 		update_cfs_shares(cfs_rq);
 		update_entity_load_avg(se, 1);
@@ -4468,6 +4842,38 @@ done:
 	return target;
 }
 
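+/*
+ * Keep a CPU-limited task group on the CPUs it already occupies: once the
+ * group has reached its CPU limit, redirect the wakeup to a CPU where one
+ * of the group's cfs_rqs is still active.
+ */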
+static inline bool select_runnable_cpu(struct task_struct *p, int *new_cpu)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg;
+	struct sched_domain *sd;
+	int prev_cpu = task_cpu(p);
+	int cpu;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, *new_cpu) > 0)
+		return false;
+
+	if (cfs_rq_active(tg->cfs_rq[*new_cpu]))
+		return true;
+
+	if (cfs_rq_active(tg->cfs_rq[prev_cpu])) {
+		*new_cpu = prev_cpu;
+		return true;
+	}
+
+	for_each_domain(*new_cpu, sd) {
+		for_each_cpu_and(cpu, sched_domain_span(sd), &p->cpus_allowed) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				*new_cpu = cpu;
+				return true;
+			}
+		}
+	}
+#endif
+	return false;
+}
+
 /*
  * sched_balance_self: balance the current task (running on cpu) in domains
  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
@@ -4518,9 +4924,16 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
 	if (affine_sd) {
 		if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
-			prev_cpu = cpu;
+			new_cpu = cpu;
+		else
+			new_cpu = prev_cpu;
+	}
+
+	if (select_runnable_cpu(p, &new_cpu))
+		goto unlock;
 
-		new_cpu = select_idle_sibling(p, prev_cpu);
+	if (affine_sd) {
+		new_cpu = select_idle_sibling(p, new_cpu);
 		goto unlock;
 	}
 
@@ -4779,9 +5192,70 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 	if (hrtick_enabled(rq))
 		hrtick_start_fair(rq, p);
 
+	if (rq->resched_next && !entity_boosted(&p->se)) {
+		/*
+		 * The boosted tasks seem to have left the throttled cfs_rq;
+		 * pick another task.
+		 */
+		resched_task(p);
+		rq->resched_next = 0;
+	}
+
 	return p;
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_CFS_CPULIMIT)
+static int cpulimit_balance_cpu_stop(void *data);
+
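+/*
+ * Called via the pre_schedule hook when a task is switched out: if its
+ * group is spread over more CPUs than its CPU limit allows, kick the CPU
+ * stopper (cpulimit_balance_cpu_stop) to push the group's tasks to a CPU
+ * where the group is already active.
+ */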
+static inline void trigger_cpulimit_balance(struct task_struct *p)
+{
+	struct rq *this_rq;
+	struct task_group *tg;
+	int this_cpu, cpu, target_cpu = -1;
+	struct sched_domain *sd;
+
+	this_rq = rq_of(cfs_rq_of(&p->se));
+	this_cpu = cpu_of(this_rq);
+
+	if (!p->se.on_rq || this_rq->active_balance)
+		return;
+
+	tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+	if (check_cpulimit_spread(tg, this_cpu) >= 0)
+		return;
+
+	rcu_read_lock();
+	for_each_domain(this_cpu, sd) {
+		if (!(sd->flags & SD_LOAD_BALANCE))
+			continue;
+		for_each_cpu_and(cpu, sched_domain_span(sd),
+				 tsk_cpus_allowed(p)) {
+			if (cpu != this_cpu &&
+			    cfs_rq_active(tg->cfs_rq[cpu])) {
+				target_cpu = cpu;
+				goto unlock;
+			}
+		}
+	}
+unlock:
+	rcu_read_unlock();
+
+	if (target_cpu >= 0) {
+		this_rq->active_balance = 1;
+		this_rq->push_cpu = target_cpu;
+		raw_spin_unlock(&this_rq->lock);
+		stop_one_cpu_nowait(this_rq->cpu,
+				    cpulimit_balance_cpu_stop, this_rq,
+				    &this_rq->active_balance_work);
+		raw_spin_lock(&this_rq->lock);
+	}
+}
+#else
+static inline void trigger_cpulimit_balance(struct task_struct *p)
+{
+}
+#endif
+
 /*
  * Account for a descheduled task:
  */
@@ -5117,6 +5591,37 @@ static inline bool migrate_degrades_locality(struct task_struct *p,
 }
 #endif
 
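+/*
+ * Veto migrations that would spread a CPU-limited group onto a CPU it
+ * should not occupy; where possible, point the balancer (LBF_DST_PINNED,
+ * new_dst_cpu) at a CPU where the group is already active instead.
+ */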
+static inline int can_migrate_task_cpulimit(struct task_struct *p, struct lb_env *env)
+{
+#ifdef CONFIG_CFS_CPULIMIT
+	struct task_group *tg = cfs_rq_of(&p->se)->tg->topmost_limited_ancestor;
+
+	if (check_cpulimit_spread(tg, env->dst_cpu) < 0) {
+		int cpu;
+
+		schedstat_inc(p, se.statistics->nr_failed_migrations_cpulimit);
+
+		env->flags |= LBF_SOME_PINNED;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0)
+			return 0;
+
+		if (!env->dst_grpmask || (env->flags & LBF_DST_PINNED))
+			return 0;
+
+		for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
+			if (cfs_rq_active(tg->cfs_rq[cpu])) {
+				env->flags |= LBF_DST_PINNED;
+				env->new_dst_cpu = cpu;
+				break;
+			}
+		}
+		return 0;
+	}
+#endif
+	return 1;
+}
+
 /*
  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
  */
@@ -5124,6 +5629,10 @@ static
 int can_migrate_task(struct task_struct *p, struct lb_env *env)
 {
 	int tsk_cache_hot = 0;
+
+	if (!can_migrate_task_cpulimit(p, env))
+		return 0;
+
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) throttled_lb_pair, or
@@ -5316,6 +5825,158 @@ next:
 	return pulled;
 }
 
+#ifdef CONFIG_CFS_CPULIMIT
+static unsigned long entity_h_load(struct sched_entity *se);
+
+static int can_migrate_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se;
+	struct task_struct *p;
+
+	list_for_each_entry(se, &cfs_rq->tasks, cfs_rq_node) {
+		p = task_of(se);
+		if (task_curr(p) ||
+		    !cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p)))
+			return 0;
+	}
+	env->flags &= ~LBF_ALL_PINNED;
+	return 1;
+}
+
+static int move_task_group(struct cfs_rq *cfs_rq, struct lb_env *env)
+{
+	struct sched_entity *se, *tmp;
+	int moved = 0;
+
+	list_for_each_entry_safe(se, tmp, &cfs_rq->tasks, cfs_rq_node) {
+		move_task(task_of(se), env);
+		moved++;
+	}
+	return moved;
+}
+
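+/*
+ * Last-resort balancing used when every individual task is pinned: migrate
+ * whole leaf task groups, moving all tasks of a group's cfs_rq at once, so
+ * that a CPU-limited group changes CPU instead of spreading further.
+ */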
+static int move_task_groups(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+	unsigned long load;
+	int cur_pulled, pulled = 0;
+
+	if (env->imbalance <= 0)
+		return 0;
+
+	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/*
+		 * A child always goes before its parent in a leaf_cfs_rq_list.
+	 * Therefore, if we encounter a cfs_rq that still has child
+	 * cfs_rqs with tasks queued, we could not migrate those children
+	 * and so should not even try to migrate the parent.
+		 */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+
+		if (check_cpulimit_spread(tg, env->src_cpu) != 0 ||
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]))
+			continue;
+
+		load = entity_h_load(tg->se[env->src_cpu]);
+		if ((load / 2) > env->imbalance)
+			continue;
+
+		if (!can_migrate_task_group(cfs_rq, env))
+			continue;
+
+		cur_pulled = move_task_group(cfs_rq, env);
+		pulled += cur_pulled;
+		env->imbalance -= load;
+
+		env->loop += cur_pulled;
+		if (env->loop > env->loop_max)
+			break;
+
+		if (env->imbalance <= 0)
+			break;
+	}
+	return pulled;
+}
+
+static int do_cpulimit_balance(struct lb_env *env)
+{
+	struct cfs_rq *cfs_rq;
+	struct task_group *tg;
+	int pushed = 0;
+
+	for_each_leaf_cfs_rq(env->src_rq, cfs_rq) {
+		if (cfs_rq->tg == &root_task_group)
+			continue;
+		/* see move_task_groups for why we skip such groups */
+		if (cfs_rq->nr_running != cfs_rq->h_nr_running)
+			continue;
+		tg = cfs_rq->tg->topmost_limited_ancestor;
+		if (check_cpulimit_spread(tg, env->src_cpu) < 0 &&
+		    cfs_rq_active(tg->cfs_rq[env->dst_cpu]) &&
+		    can_migrate_task_group(cfs_rq, env))
+			pushed += move_task_group(cfs_rq, env);
+	}
+	return pushed;
+}
+
+static int cpulimit_balance_cpu_stop(void *data)
+{
+	struct rq *rq = data;
+	int cpu = cpu_of(rq);
+	int target_cpu = rq->push_cpu;
+	struct rq *target_rq = cpu_rq(target_cpu);
+	struct sched_domain *sd;
+
+	raw_spin_lock_irq(&rq->lock);
+
+	if (unlikely(cpu != smp_processor_id() || !rq->active_balance ||
+		     !cpu_online(target_cpu)))
+		goto out_unlock;
+
+	if (unlikely(!rq->nr_running))
+		goto out_unlock;
+
+	BUG_ON(rq == target_rq);
+
+	double_lock_balance(rq, target_rq);
+	rcu_read_lock();
+	for_each_domain(target_cpu, sd) {
+		if ((sd->flags & SD_LOAD_BALANCE) &&
+		    cpumask_test_cpu(cpu, sched_domain_span(sd)))
+			break;
+	}
+	if (likely(sd)) {
+		struct lb_env env = {
+			.sd		= sd,
+			.dst_cpu	= target_cpu,
+			.dst_rq		= target_rq,
+			.src_cpu	= cpu,
+			.src_rq		= rq,
+		};
+
+		schedstat_inc(sd, clb_count);
+
+		if (do_cpulimit_balance(&env))
+			schedstat_inc(sd, clb_pushed);
+		else
+			schedstat_inc(sd, clb_failed);
+	}
+	rcu_read_unlock();
+	double_unlock_balance(rq, target_rq);
+
+out_unlock:
+	rq->active_balance = 0;
+	raw_spin_unlock_irq(&rq->lock);
+	return 0;
+}
+#endif /* CONFIG_CFS_CPULIMIT */
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 /*
  * update tg->load_weight by folding this cpu's load_avg
@@ -5412,12 +6073,12 @@ static void update_h_load(long cpu)
 	rcu_read_unlock();
 }
 
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long entity_h_load(struct sched_entity *se)
 {
-	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct cfs_rq *cfs_rq = cfs_rq_of(se);
 	unsigned long load;
 
-	load = p->se.load.weight;
+	load = se->load.weight;
 	load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
 
 	return load;
@@ -5431,12 +6092,17 @@ static inline void update_h_load(long cpu)
 {
 }
 
-static unsigned long task_h_load(struct task_struct *p)
+static unsigned long entity_h_load(struct sched_entity *se)
 {
-	return p->se.load.weight;
+	return se->load.weight;
 }
 #endif
 
+static unsigned long task_h_load(struct task_struct *p)
+{
+	return entity_h_load(&p->se);
+}
+
 /********** Helpers for find_busiest_group ************************/
 
 enum group_type {
@@ -6190,7 +6856,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 	/* How much load to actually move to equalise the imbalance */
 	env->imbalance = min(
 		max_pull * busiest->group_power,
-		(sds->avg_load - local->avg_load) * local->group_power
+		(busiest->avg_load - local->avg_load) * local->group_power
 	) / SCHED_POWER_SCALE;
 
 	/*
@@ -6267,13 +6933,6 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 	if (local->avg_load >= busiest->avg_load)
 		goto out_balanced;
 
-	/*
-	 * Don't pull any tasks if this group is already above the domain
-	 * average load.
-	 */
-	if (local->avg_load >= sds.avg_load)
-		goto out_balanced;
-
 	if (env->idle == CPU_IDLE) {
 		/*
 		 * This cpu is idle. If the busiest group load doesn't
@@ -6520,6 +7179,17 @@ more_balance:
 		double_rq_unlock(env.dst_rq, busiest);
 		local_irq_restore(flags);
 
+#ifdef CONFIG_CFS_CPULIMIT
+		if (!ld_moved && (env.flags & LBF_ALL_PINNED)) {
+			env.loop = 0;
+			local_irq_save(flags);
+			double_rq_lock(env.dst_rq, busiest);
+			cur_ld_moved = ld_moved = move_task_groups(&env);
+			double_rq_unlock(env.dst_rq, busiest);
+			local_irq_restore(flags);
+		}
+#endif
+
 		/*
 		 * some other cpu did the load balance for us.
 		 */
@@ -6854,6 +7524,11 @@ out_unlock:
 	return 0;
 }
 
+static void pre_schedule_fair(struct rq *rq, struct task_struct *prev)
+{
+	trigger_cpulimit_balance(prev);
+}
+
 #ifdef CONFIG_NO_HZ_COMMON
 /*
  * idle load balancing details
@@ -7326,6 +8001,8 @@ static void task_fork_fair(struct task_struct *p)
 
 	se->vruntime -= cfs_rq->min_vruntime;
 
+	cfs_rq->nr_forks++;
+
 	raw_spin_unlock_irqrestore(&rq->lock, flags);
 }
 
@@ -7394,6 +8071,13 @@ static void switched_from_fair(struct rq *rq, struct task_struct *p)
  */
 static void switched_to_fair(struct rq *rq, struct task_struct *p)
 {
+#ifdef CONFIG_FAIR_GROUP_SCHED
+	/*
+	 * Since the real depth could have been changed (only the FAIR
+	 * class maintains the depth value), reset the depth properly.
+	 */
+	p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0;
+#endif
 	if (!p->se.on_rq)
 		return;
 
@@ -7429,6 +8113,7 @@ static void set_curr_task_fair(struct rq *rq)
 void init_cfs_rq(struct cfs_rq *cfs_rq)
 {
 	cfs_rq->tasks_timeline = RB_ROOT;
+	INIT_LIST_HEAD(&cfs_rq->tasks);
 	cfs_rq->min_vruntime = (u64)(-(1LL << 20));
 #ifndef CONFIG_64BIT
 	cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
@@ -7487,6 +8172,12 @@ static void task_move_group_fair(struct task_struct *p, int on_rq)
 		cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
 #endif
 	}
+
+	/*
+	 * Since the real depth could have been changed (only the FAIR
+	 * class maintains the depth value), reset the depth properly.
+	 */
+	p->se.depth = p->se.parent ? p->se.parent->depth + 1 : 0;
 }
 
 void free_fair_sched_group(struct task_group *tg)
@@ -7605,15 +8296,23 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
 	if (!se)
 		return;
 
-	if (!parent)
+	if (!parent) {
 		se->cfs_rq = &rq->cfs;
-	else
+		se->depth = 0;
+	} else {
 		se->cfs_rq = parent->my_q;
+		se->depth = parent->depth + 1;
+	}
 
 	se->my_q = cfs_rq;
 	/* guarantee group entities always have weight */
 	update_load_set(&se->load, NICE_0_LOAD);
 	se->parent = parent;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (cpu_online(cpu))
+		se->statistics->sleep_start = cpu_clock(cpu);
+#endif
 }
 
 static DEFINE_MUTEX(shares_mutex);
@@ -7682,6 +8381,69 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
 	return rr_interval;
 }
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
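+/*
+ * Per-group iowait accounting: track how many tasks of a cfs_rq sleep in
+ * io wait and fold the corresponding sleep/block intervals into the parent
+ * group entity's schedstats.
+ */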
+static void nr_iowait_dec_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = p->se.parent;
+
+	cfs_rq->nr_iowait--;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (!cfs_rq->nr_iowait && se && se->statistics->block_start) {
+		u64 delta;
+		struct rq *rq = rq_of(cfs_rq);
+
+		update_rq_clock(rq);
+
+		delta = rq->clock - se->statistics->block_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		if (unlikely(delta > se->statistics->block_max))
+			se->statistics->block_max = delta;
+
+		se->statistics->block_start = 0;
+		se->statistics->sleep_start = rq->clock;
+
+		se->statistics->iowait_sum += delta;
+		se->statistics->sum_sleep_runtime += delta;
+	}
+#endif
+}
+
+static void nr_iowait_inc_fair(struct task_struct *p)
+{
+	struct cfs_rq *cfs_rq = task_cfs_rq(p);
+	struct sched_entity *se = p->se.parent;
+
+	cfs_rq->nr_iowait++;
+
+#ifdef CONFIG_SCHEDSTATS
+	if (cfs_rq->nr_iowait && se && se->statistics->sleep_start) {
+		u64 delta;
+		struct rq *rq = rq_of(cfs_rq);
+
+		update_rq_clock(rq);
+
+		delta = rq->clock - se->statistics->sleep_start;
+
+		if ((s64)delta < 0)
+			delta = 0;
+
+		if (unlikely(delta > se->statistics->sleep_max))
+			se->statistics->sleep_max = delta;
+
+		se->statistics->sleep_start = 0;
+		se->statistics->block_start = rq->clock;
+
+		se->statistics->sum_sleep_runtime += delta;
+	}
+#endif
+}
+#endif
+
 /*
  * All the scheduling class methods:
  */
@@ -7706,6 +8468,7 @@ const struct sched_class fair_sched_class = {
 	.rq_offline		= rq_offline_fair,
 
 	.task_waking		= task_waking_fair,
+	.pre_schedule		= pre_schedule_fair,
 #endif
 
 	.set_curr_task          = set_curr_task_fair,
@@ -7722,6 +8485,8 @@ const struct sched_class fair_sched_class = {
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	.task_move_group	= task_move_group_fair,
+	.nr_iowait_inc		= nr_iowait_inc_fair,
+	.nr_iowait_dec		= nr_iowait_dec_fair,
 #endif
 };
 
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -83,3 +83,6 @@ SCHED_FEAT(NUMA_FAVOUR_HIGHER, true)
  */
 SCHED_FEAT(NUMA_RESIST_LOWER, false)
 #endif
+
+SCHED_FEAT(BOOST_WAKEUPS, true)
+SCHED_FEAT(BOOST_PREEMPT, true)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -52,7 +52,7 @@ extern __read_mostly int scheduler_running;
  * when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
  * increased costs.
  */
-#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load  */
+#if BITS_PER_LONG > 32
 # define SCHED_LOAD_RESOLUTION	10
 # define scale_load(w)		((w) << SCHED_LOAD_RESOLUTION)
 # define scale_load_down(w)	((w) >> SCHED_LOAD_RESOLUTION)
@@ -260,8 +260,26 @@ struct task_group {
 	struct autogroup *autogroup;
 #endif
 
+	struct taskstats __percpu *taskstats;
+	unsigned long avenrun[3];	/* loadavg data */
+	struct timespec start_time;
+
+	struct kernel_cpustat *cpustat_last;
+	struct kernel_cpustat *vcpustat;
+	ktime_t vcpustat_last_update;
+	spinlock_t vcpustat_lock;
+
 	struct cfs_bandwidth cfs_bandwidth;
 
+#ifdef CONFIG_CFS_CPULIMIT
+#define MAX_CPU_RATE 1024
+	unsigned long cpu_rate;
+	unsigned int nr_cpus;
+	atomic_t nr_cpus_active;
+	struct task_group *topmost_limited_ancestor; /* self if none of the
+							ancestors is limited */
+#endif
+
 #if defined(CONFIG_FAIR_GROUP_SCHED)
 	/*
 	 * Put load_avg/runnable_avg in its own cacheline to avoid
@@ -348,6 +366,9 @@ struct cfs_rq {
 	struct load_weight load;
 	unsigned int nr_running, h_nr_running;
 
+	unsigned long nr_iowait;
+	unsigned long nr_unint;
+
 	u64 exec_clock;
 	u64 min_vruntime;
 #ifndef CONFIG_64BIT
@@ -357,11 +378,16 @@ struct cfs_rq {
 	struct rb_root tasks_timeline;
 	struct rb_node *rb_leftmost;
 
+	struct list_head tasks;
+
 	/*
 	 * 'curr' points to currently running entity on this cfs_rq.
 	 * It is set to NULL otherwise (i.e when none are currently running).
 	 */
-	struct sched_entity *curr, *next, *last, *skip;
+	struct sched_entity *curr, *next, *last, *skip, *prev;
+
+	u64 nr_switches;
+	unsigned long nr_forks;
 
 #ifdef	CONFIG_SCHED_DEBUG
 	unsigned int nr_spread_over;
@@ -421,9 +447,15 @@ struct cfs_rq {
 
 	u64 throttled_clock, throttled_clock_task;
 	u64 throttled_clock_task_time;
-	int throttled, throttle_count;
+	int throttled, throttle_count, throttle_uptodate;
 	struct list_head throttled_list;
+
+	struct list_head boosted_entities;
 #endif /* CONFIG_CFS_BANDWIDTH */
+#ifdef CONFIG_CFS_CPULIMIT
+	int active;
+	struct hrtimer active_timer;
+#endif /* CONFIG_CFS_CPULIMIT */
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 };
 
@@ -570,7 +602,8 @@ struct rq {
 #ifdef CONFIG_NO_HZ_FULL
 	unsigned long last_sched_tick;
 #endif
-	int skip_clock_update;
+	signed char skip_clock_update;
+	unsigned char resched_next;
 
 	/* capture load from *all* tasks on this cpu: */
 	struct load_weight load;
@@ -592,6 +625,9 @@ struct rq {
 	struct list_head leaf_rt_rq_list;
 #endif
 
+	/* nr_running last seen in update_cpu_load() */
+	unsigned long nr_active;
+
 	/*
 	 * This is part of a global counter where only the total sum
 	 * over all CPUs matters. A task can increase this counter on
@@ -599,6 +635,10 @@ struct rq {
 	 * it on another CPU. Always updated under the runqueue lock:
 	 */
 	unsigned long nr_uninterruptible;
+	unsigned long nr_iothrottled;
+
+	unsigned long nr_sleeping;
+	unsigned long nr_stopped;
 
 	struct task_struct *curr, *idle, *stop;
 	unsigned long next_balance;
@@ -1164,8 +1204,10 @@ static const u32 prio_to_wmult[40] = {
 #define ENQUEUE_WAKING		0
 #endif
 #define ENQUEUE_REPLENISH	8
+#define ENQUEUE_BOOST		16
 
 #define DEQUEUE_SLEEP		1
+#define DEQUEUE_TASK_SLEEP	2
 
 struct sched_class {
 	const struct sched_class *next;
@@ -1216,6 +1258,8 @@ struct sched_class {
 #ifdef CONFIG_FAIR_GROUP_SCHED
 	void (*task_move_group) (struct task_struct *p, int on_rq);
 #endif
+	void (*nr_iowait_inc) (struct task_struct *p);
+	void (*nr_iowait_dec) (struct task_struct *p);
 	RH_KABI_EXTEND(void (*update_curr) (struct rq *rq))
 	RH_KABI_EXTEND(void (*task_dead) (struct task_struct *p))
 };
@@ -1619,6 +1663,32 @@ extern void init_dl_rq(struct dl_rq *dl_rq, struct rq *rq);
 extern void cfs_bandwidth_usage_inc(void);
 extern void cfs_bandwidth_usage_dec(void);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return p->se.cfs_rq;
+}
+#else
+static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
+{
+	return &task_rq(p)->cfs;
+}
+#endif
+
+#ifdef CONFIG_CFS_BANDWIDTH
+/*
+ * default period for cfs group bandwidth.
+ * default: 0.1s, units: nanoseconds
+ */
+static inline u64 default_cfs_period(void)
+{
+	return 100000000ULL;
+}
+#endif
+
+extern void start_cfs_idle_time_accounting(int cpu);
+extern void stop_cfs_idle_time_accounting(int cpu);
+
 #ifdef CONFIG_NO_HZ_COMMON
 enum rq_nohz_flag_bits {
 	NOHZ_TICK_STOPPED,
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -66,8 +66,9 @@ static int show_schedstat(struct seq_file *seq, void *v)
 				    sd->lb_nobusyg[itype]);
 			}
 			seq_printf(seq,
-				   " %u %u %u %u %u %u %u %u %u %u %u %u\n",
+				   " %u %u %u %u %u %u %u %u %u %u %u %u %u %u %u\n",
 			    sd->alb_count, sd->alb_failed, sd->alb_pushed,
+			    sd->clb_count, sd->clb_failed, sd->clb_pushed,
 			    sd->sbe_count, sd->sbe_balanced, sd->sbe_pushed,
 			    sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
 			    sd->ttwu_wake_remote, sd->ttwu_move_affine,
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -54,6 +54,9 @@
 struct seccomp_filter {
 	atomic_t usage;
 	struct seccomp_filter *prev;
+#ifdef CONFIG_VE
+	struct sock_fprog orig_prog;
+#endif
 	unsigned short len;  /* Instruction count */
 	struct sock_filter insns[];
 };
@@ -265,6 +268,16 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
 	if (copy_from_user(filter->insns, fprog->filter, fp_size))
 		goto fail;
 
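+	/*
+	 * Keep an unmodified copy of the filter program so it can be dumped
+	 * later (see seccomp_get_filter()); sk_chk_filter() below rewrites
+	 * the instructions in place.
+	 */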
+#ifdef CONFIG_VE
+	filter->orig_prog.len = fprog->len;
+	filter->orig_prog.filter = kmemdup(filter->insns, fp_size,
+					   GFP_KERNEL|__GFP_NOWARN);
+	if (!filter->orig_prog.filter) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+#endif
+
 	/* Check and rewrite the fprog via the skb checker */
 	ret = sk_chk_filter(filter->insns, filter->len);
 	if (ret)
@@ -283,6 +296,9 @@ static long seccomp_attach_filter(struct sock_fprog *fprog)
 	current->seccomp.filter = filter;
 	return 0;
 fail:
+#ifdef CONFIG_VE
+	kfree(filter->orig_prog.filter);
+#endif
 	kfree(filter);
 	return ret;
 }
@@ -332,6 +348,9 @@ void put_seccomp_filter(struct task_struct *tsk)
 	while (orig && atomic_dec_and_test(&orig->usage)) {
 		struct seccomp_filter *freeme = orig;
 		orig = orig->prev;
+#ifdef CONFIG_VE
+		kfree(freeme->orig_prog.filter);
+#endif
 		kfree(freeme);
 	}
 }
@@ -381,6 +400,10 @@ int __secure_computing(int this_syscall)
 	int *syscall;
 	u32 ret;
 
+	if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
+	    unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
+		return 0;
+
 	switch (mode) {
 	case SECCOMP_MODE_STRICT:
 		syscall = mode1_syscalls;
@@ -511,3 +534,71 @@ long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
 out:
 	return ret;
 }
+
+#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
+long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
+			void __user *data)
+{
+	struct seccomp_filter *filter;
+	long ret;
+	unsigned long count = 0;
+
+	if (!capable(CAP_SYS_ADMIN) ||
+	    current->seccomp.mode != SECCOMP_MODE_DISABLED) {
+		return -EACCES;
+	}
+
+	spin_lock_irq(&task->sighand->siglock);
+	if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	filter = task->seccomp.filter;
+	while (filter) {
+		filter = filter->prev;
+		count++;
+	}
+
+	if (filter_off >= count) {
+		ret = -ENOENT;
+		goto out;
+	}
+	count -= filter_off;
+
+	filter = task->seccomp.filter;
+	while (filter && count > 1) {
+		filter = filter->prev;
+		count--;
+	}
+
+	if (WARN_ON(count != 1 || !filter)) {
+		/* The filter tree shouldn't shrink while we're using it. */
+		ret = -ENOENT;
+		goto out;
+	}
+
+	ret = filter->len;
+	if (!data)
+		goto out;
+
+	get_seccomp_filter(task);
+	spin_unlock_irq(&task->sighand->siglock);
+
+#ifdef CONFIG_VE
+	if (copy_to_user(data, filter->orig_prog.filter,
+			 filter->orig_prog.len * sizeof(filter->orig_prog.filter[0])))
+		ret = -EFAULT;
+#else
+	if (copy_to_user(data, filter->insns, filter->len * sizeof(filter->insns[0])))
+		ret = -EFAULT;
+#endif
+
+	put_seccomp_filter(task);
+	return ret;
+
+out:
+	spin_unlock_irq(&task->sighand->siglock);
+	return ret;
+}
+#endif
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -33,6 +33,8 @@
 #include <linux/uprobes.h>
 #include <linux/compat.h>
 #include <linux/cn_proc.h>
+#include <linux/interrupt.h>
+#include <linux/ve.h>
 #define CREATE_TRACE_POINTS
 #include <trace/events/signal.h>
 
@@ -41,6 +43,7 @@
 #include <asm/unistd.h>
 #include <asm/siginfo.h>
 #include <asm/cacheflush.h>
+#include <bc/misc.h>
 #include "audit.h"	/* audit_signal_info() */
 
 /*
@@ -48,6 +51,7 @@
  */
 
 static struct kmem_cache *sigqueue_cachep;
+static inline int is_si_special(const struct siginfo *info);
 
 int print_fatal_signals __read_mostly;
 
@@ -374,6 +378,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi
 	    atomic_read(&user->sigpending) <=
 			task_rlimit(t, RLIMIT_SIGPENDING)) {
 		q = kmem_cache_alloc(sigqueue_cachep, flags);
+		if (q && ub_siginfo_charge(q, get_task_ub(t), flags)) {
+			kmem_cache_free(sigqueue_cachep, q);
+			q = NULL;
+		}
 	} else {
 		print_dropped_signal(sig);
 	}
@@ -396,6 +404,7 @@ static void __sigqueue_free(struct sigqueue *q)
 		return;
 	atomic_dec(&q->user->sigpending);
 	free_uid(q->user);
+	ub_siginfo_uncharge(q);
 	kmem_cache_free(sigqueue_cachep, q);
 }
 
@@ -581,7 +590,18 @@ still_pending:
 static int __dequeue_signal(struct sigpending *pending, sigset_t *mask,
 			siginfo_t *info)
 {
-	int sig = next_signal(pending, mask);
+	int sig = 0;
+
+	/*
+	 * SIGKILL must have priority, otherwise it is quite easy to create
+	 * an unkillable process by sending sig < SIGKILL to self.
+	 */
+	if (unlikely(sigismember(&pending->signal, SIGKILL))) {
+		if (!sigismember(mask, SIGKILL))
+			sig = SIGKILL;
+	}
+
+	if (likely(!sig))
+		sig = next_signal(pending, mask);
 
 	if (sig) {
 		if (current->notifier) {
@@ -3090,6 +3110,11 @@ COMPAT_SYSCALL_DEFINE4(rt_tgsigqueueinfo,
 }
 #endif
 
+void __weak sigaction_compat_abi(struct k_sigaction *act,
+		struct k_sigaction *oact)
+{
+}
+
 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
 	struct task_struct *t = current;
@@ -3105,6 +3130,8 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 	if (oact)
 		*oact = *k;
 
+	sigaction_compat_abi(act, oact);
+
 	if (act) {
 		sigdelsetmask(&act->sa.sa_mask,
 			      sigmask(SIGKILL) | sigmask(SIGSTOP));
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,8 @@
 #include <linux/smpboot.h>
 #include <linux/tick.h>
 
+#include <bc/beancounter.h>
+
 #define CREATE_TRACE_POINTS
 #include <trace/events/irq.h>
 
@@ -209,6 +211,7 @@ EXPORT_SYMBOL(local_bh_enable_ip);
 
 asmlinkage void __do_softirq(void)
 {
+	struct user_beancounter *ub;
 	struct softirq_action *h;
 	__u32 pending;
 	unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
@@ -238,6 +241,7 @@ restart:
 
 	h = softirq_vec;
 
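+	/*
+	 * Run softirq handlers in the host beancounter (ub0) context rather
+	 * than in whatever container context the interrupted task had.
+	 */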
+	ub = set_exec_ub(get_ub0());
 	do {
 		if (pending & 1) {
 			unsigned int vec_nr = h - softirq_vec;
@@ -262,6 +266,7 @@ restart:
 		h++;
 		pending >>= 1;
 	} while (pending);
+	(void)set_exec_ub(ub);
 
 	local_irq_disable();
 
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/utsname.h>
 #include <linux/mman.h>
+#include <linux/virtinfo.h>
 #include <linux/reboot.h>
 #include <linux/prctl.h>
 #include <linux/highuid.h>
@@ -42,6 +43,7 @@
 #include <linux/syscore_ops.h>
 #include <linux/version.h>
 #include <linux/ctype.h>
+#include <linux/ve.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -49,7 +51,6 @@
 #include <linux/user_namespace.h>
 #include <linux/binfmts.h>
 
-#include <linux/sched.h>
 #include <linux/rcupdate.h>
 #include <linux/uidgid.h>
 #include <linux/cred.h>
@@ -129,6 +130,122 @@ int C_A_D = 1;
 struct pid *cad_pid;
 EXPORT_SYMBOL(cad_pid);
 
+DEFINE_SEMAPHORE(virtinfo_sem);
+EXPORT_SYMBOL(virtinfo_sem);
+static struct vnotifier_block *virtinfo_chain[VIRT_TYPES];
+
+void __virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+	struct vnotifier_block **p;
+
+	for (p = &virtinfo_chain[type];
+	     *p != NULL && nb->priority < (*p)->priority;
+	     p = &(*p)->next);
+	nb->next = *p;
+	smp_wmb();
+	*p = nb;
+}
+
+EXPORT_SYMBOL(__virtinfo_notifier_register);
+
+void virtinfo_notifier_register(int type, struct vnotifier_block *nb)
+{
+	down(&virtinfo_sem);
+	__virtinfo_notifier_register(type, nb);
+	up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_register);
+
+struct virtinfo_cnt_struct {
+	volatile unsigned long exit[NR_CPUS];
+	volatile unsigned long entry;
+};
+static DEFINE_PER_CPU(struct virtinfo_cnt_struct, virtcnt);
+
+void virtinfo_notifier_unregister(int type, struct vnotifier_block *nb)
+{
+	struct vnotifier_block **p;
+	int entry_cpu, exit_cpu;
+	unsigned long cnt, ent;
+
+	down(&virtinfo_sem);
+	for (p = &virtinfo_chain[type]; *p != nb; p = &(*p)->next);
+	*p = nb->next;
+	smp_mb();
+
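+	/*
+	 * Wait for in-flight notifier calls to drain: each call increments
+	 * .entry on the CPU it starts on and one of the .exit counters when
+	 * it finishes, so the chain is quiescent once the per-CPU sums match.
+	 */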
+	for_each_possible_cpu(entry_cpu) {
+		while (1) {
+			cnt = 0;
+			for_each_possible_cpu(exit_cpu)
+				cnt +=
+				    per_cpu(virtcnt, entry_cpu).exit[exit_cpu];
+			smp_rmb();
+			ent = per_cpu(virtcnt, entry_cpu).entry;
+			if (cnt == ent)
+				break;
+			__set_current_state(TASK_UNINTERRUPTIBLE);
+			schedule_timeout(HZ / 100);
+		}
+	}
+
+	/* FIXME: replace virtinfo with srcu-notifier-chains */
+	rcu_barrier_sched();
+
+	up(&virtinfo_sem);
+}
+
+EXPORT_SYMBOL(virtinfo_notifier_unregister);
+
+static int do_virtinfo_notifier_call(int type, unsigned long n, void *data)
+{
+	int ret;
+	struct vnotifier_block *nb;
+
+	nb = virtinfo_chain[type];
+	ret = NOTIFY_DONE;
+	while (nb) {
+		ret = nb->notifier_call(nb, n, data, ret);
+		if (ret & NOTIFY_STOP_MASK) {
+			ret &= ~NOTIFY_STOP_MASK;
+			break;
+		}
+		nb = nb->next;
+	}
+
+	return ret;
+}
+
+int virtinfo_notifier_call(int type, unsigned long n, void *data)
+{
+	int ret;
+	int entry_cpu, exit_cpu;
+
+	entry_cpu = get_cpu();
+	per_cpu(virtcnt, entry_cpu).entry++;
+	smp_wmb();
+	put_cpu();
+
+	ret = do_virtinfo_notifier_call(type, n, data);
+
+	exit_cpu = get_cpu();
+	smp_wmb();
+	per_cpu(virtcnt, entry_cpu).exit[exit_cpu]++;
+	put_cpu();
+
+	return ret;
+}
+EXPORT_SYMBOL(virtinfo_notifier_call);
+
+int virtinfo_notifier_call_irq(int type, unsigned long n, void *data)
+{
+	if (!in_interrupt())
+		return virtinfo_notifier_call(type, n, data);
+	return do_virtinfo_notifier_call(type, n, data);
+}
+EXPORT_SYMBOL(virtinfo_notifier_call_irq);
+
 /*
  * If set, this is used for preparing the system to power off.
  */
@@ -1109,7 +1226,7 @@ SYSCALL_DEFINE0(getppid)
 	int pid;
 
 	rcu_read_lock();
-	pid = task_tgid_vnr(rcu_dereference(current->real_parent));
+	pid = ve_task_ppid_nr_ns(current, current->nsproxy->pid_ns);
 	rcu_read_unlock();
 
 	return pid;
@@ -1152,8 +1269,27 @@ void do_sys_times(struct tms *tms)
 	tms->tms_cstime = cputime_to_clock_t(cstime);
 }
 
+#ifdef CONFIG_VE
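+/*
+ * Convert an absolute timespec into clock ticks elapsed since the container
+ * was started; timestamps preceding the container start yield zero.
+ */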
+unsigned long long ve_relative_clock(struct timespec *ts)
+{
+	unsigned long long offset = 0;
+
+	if (ts->tv_sec > get_exec_env()->start_timespec.tv_sec ||
+	    (ts->tv_sec == get_exec_env()->start_timespec.tv_sec &&
+	     ts->tv_nsec >= get_exec_env()->start_timespec.tv_nsec))
+		offset = (unsigned long long)(ts->tv_sec -
+			get_exec_env()->start_timespec.tv_sec) * NSEC_PER_SEC
+			+ ts->tv_nsec - get_exec_env()->start_timespec.tv_nsec;
+	return nsec_to_clock_t(offset);
+}
+#endif
+
 SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 {
+#ifdef CONFIG_VE
+	struct timespec now;
+#endif
+
 	if (tbuf) {
 		struct tms tmp;
 
@@ -1161,8 +1297,15 @@ SYSCALL_DEFINE1(times, struct tms __user *, tbuf)
 		if (copy_to_user(tbuf, &tmp, sizeof(struct tms)))
 			return -EFAULT;
 	}
+#ifndef CONFIG_VE
 	force_successful_syscall_return();
 	return (long) jiffies_64_to_clock_t(get_jiffies_64());
+#else
+	/* Compare to calculation in fs/proc/array.c */
+	do_posix_clock_monotonic_gettime(&now);
+	force_successful_syscall_return();
+	return ve_relative_clock(&now);
+#endif
 }
 
 /*
@@ -1347,6 +1490,7 @@ out:
 }
 
 DECLARE_RWSEM(uts_sem);
+EXPORT_SYMBOL_GPL(uts_sem);
 
 #ifdef COMPAT_UTS_MACHINE
 #define override_architecture(name) \
@@ -1636,7 +1780,7 @@ int do_prlimit(struct task_struct *tsk, unsigned int resource,
 		/* Keep the capable check against init_user_ns until
 		   cgroups can contain all limits */
 		if (new_rlim->rlim_max > rlim->rlim_max &&
-				!capable(CAP_SYS_RESOURCE))
+				!ve_capable(CAP_SYS_RESOURCE))
 			retval = -EPERM;
 		if (!retval)
 			retval = security_task_setrlimit(tsk->group_leader,
@@ -1946,16 +2090,6 @@ static int prctl_set_mm_exe_file(struct mm_struct *mm, unsigned int fd)
 		fput(exe_file);
 	}
 
-	/*
-	 * The symlink can be changed only once, just to disallow arbitrary
-	 * transitions malicious software might bring in. This means one
-	 * could make a snapshot over all processes running and monitor
-	 * /proc/pid/exe changes to notice unusual activity if needed.
-	 */
-	err = -EPERM;
-	if (test_and_set_bit(MMF_EXE_FILE_CHANGED, &mm->flags))
-		goto exit;
-
 	err = 0;
 	/* set the new file, lockless */
 	get_file(exe.file);
@@ -2199,7 +2333,7 @@ static int prctl_set_mm(int opt, unsigned long addr,
 		return prctl_set_mm_map(opt, (const void __user *)addr, arg4);
 #endif
 
-	if (!capable(CAP_SYS_RESOURCE))
+	if (!ve_capable(CAP_SYS_RESOURCE))
 		return -EPERM;
 
 	if (opt == PR_SET_MM_EXE_FILE)
@@ -2636,19 +2770,35 @@ static int do_sysinfo(struct sysinfo *info)
 	unsigned long mem_total, sav_total;
 	unsigned int mem_unit, bitcount;
 	struct timespec tp;
+	struct ve_struct *ve;
 
 	memset(info, 0, sizeof(struct sysinfo));
 
+	si_meminfo(info);
+	si_swapinfo(info);
+
+	ve = get_exec_env();
+
 	get_monotonic_boottime(&tp);
 	info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-	get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+	if (ve_is_super(ve)) {
+		get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-	info->procs = nr_threads;
+		info->procs = nr_threads;
+	} else {
+		info->uptime -= ve->real_start_timespec.tv_sec;
 
-	si_meminfo(info);
-	si_swapinfo(info);
+		info->procs = nr_threads_ve(ve);
+
+		get_avenrun_ve(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
+	}
 
+#ifdef CONFIG_BEANCOUNTERS
+	if (virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, info)
+			& NOTIFY_FAIL)
+		return -ENOMSG;
+#endif
 	/*
 	 * If the sum of all the available memory (i.e. ram + swap)
 	 * is less than can be stored in a 32 bit unsigned long then
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -202,6 +202,14 @@ cond_syscall(sys_userfaultfd);
 /* performance counters: */
 cond_syscall(sys_perf_event_open);
 
+/* user-beancounters */
+cond_syscall(sys_getluid);
+cond_syscall(sys_setluid);
+cond_syscall(sys_setublimit);
+cond_syscall(sys_ubstat);
+cond_syscall(compat_sys_setublimit);
+cond_syscall(compat_sys_ubstat);
+
 /* fanotify! */
 cond_syscall(sys_fanotify_init);
 cond_syscall(sys_fanotify_mark);
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -63,6 +63,7 @@
 #include <linux/binfmts.h>
 #include <linux/sched/sysctl.h>
 #include <linux/kexec.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -91,6 +92,7 @@
 #ifdef CONFIG_LOCKUP_DETECTOR
 #include <linux/nmi.h>
 #endif
+extern unsigned relatime_interval; /* fs/inode.c */
 
 
 #if defined(CONFIG_SYSCTL)
@@ -100,10 +102,8 @@ extern int max_threads;
 extern int suid_dumpable;
 #ifdef CONFIG_COREDUMP
 extern int core_uses_pid;
-extern char core_pattern[];
 extern unsigned int core_pipe_limit;
 #endif
-extern int pid_max;
 extern int pid_max_min, pid_max_max;
 extern int percpu_pagelist_fraction;
 extern int compat_log;
@@ -113,6 +113,9 @@ extern int sysctl_nr_open_min, sysctl_nr_open_max;
 extern int sysctl_nr_trim_pages;
 #endif
 
+int ve_allow_module_load = 1;
+EXPORT_SYMBOL(ve_allow_module_load);
+
 /* Constants used for minimum and  maximum */
 #ifdef CONFIG_LOCKUP_DETECTOR
 static int sixty = 60;
@@ -170,6 +173,17 @@ extern int unaligned_dump_stack;
 extern int no_unaligned_warning;
 #endif
 
+static bool virtual_ptr(void **ptr, void *base, size_t size, void *cur);
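+/*
+ * Define a per-container wrapper for a sysctl handler: if table->data
+ * points into ve0 (the host VE), rebase it to the same offset inside the
+ * calling container's ve_struct before invoking the real handler.
+ */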
+#define sysctl_virtual(sysctl)							\
+int sysctl ## _virtual(struct ctl_table *table, int write,			\
+		        void __user *buffer, size_t *lenp, loff_t *ppos)	\
+{										\
+	struct ctl_table tmp = *table;						\
+	if (virtual_ptr(&tmp.data, &ve0, sizeof(ve0), get_exec_env()))		\
+		return sysctl(&tmp, write, buffer, lenp, ppos);			\
+	return -EINVAL;								\
+}
+
 #ifdef CONFIG_PROC_SYSCTL
 static int proc_do_cad_pid(struct ctl_table *table, int write,
 		  void __user *buffer, size_t *lenp, loff_t *ppos);
@@ -272,6 +286,20 @@ static int min_extfrag_threshold;
 static int max_extfrag_threshold = 1000;
 #endif
 
+static int proc_dointvec_pidmax(struct ctl_table *table, int write,
+		  void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	struct ctl_table tmp;
+
+	tmp = *table;
+	tmp.data = &current->nsproxy->pid_ns->pid_max;
+	return proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos);
+}
+
+#ifdef CONFIG_COREDUMP
+sysctl_virtual(proc_dostring_coredump);
+#endif
+
 static struct ctl_table kern_table[] = {
 	{
 		.procname	= "sched_child_runs_first",
@@ -457,6 +485,25 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &one,
 	},
 #endif
+#ifdef CONFIG_CFS_CPULIMIT
+	{
+		.procname	= "sched_vcpu_hotslice",
+		.data		= &sysctl_sched_vcpu_hotslice,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+	},
+	{
+		.procname	= "sched_cpulimit_scale_cpufreq",
+		.data		= &sysctl_sched_cpulimit_scale_cpufreq,
+		.maxlen		= sizeof(unsigned int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 #ifdef CONFIG_PROVE_LOCKING
 	{
 		.procname	= "prove_locking",
@@ -492,10 +539,10 @@ static struct ctl_table kern_table[] = {
 	},
 	{
 		.procname	= "core_pattern",
-		.data		= core_pattern,
+		.data		= ve0.core_pattern,
 		.maxlen		= CORENAME_MAX_SIZE,
-		.mode		= 0644,
-		.proc_handler	= proc_dostring_coredump,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dostring_coredump_virtual,
 	},
 	{
 		.procname	= "core_pipe_limit",
@@ -665,8 +712,8 @@ static struct ctl_table kern_table[] = {
 		.procname	= "hotplug",
 		.data		= &uevent_helper,
 		.maxlen		= UEVENT_HELPER_PATH_LEN,
-		.mode		= 0644,
-		.proc_handler	= proc_dostring,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dostring_immutable,
 	},
 
 #ifdef CONFIG_CHR_DEV_SG
@@ -760,10 +807,10 @@ static struct ctl_table kern_table[] = {
 #endif
 	{
 		.procname	= "pid_max",
-		.data		= &pid_max,
+		.data		= NULL,
 		.maxlen		= sizeof (int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_pidmax,
 		.extra1		= &pid_max_min,
 		.extra2		= &pid_max_max,
 	},
@@ -779,8 +826,8 @@ static struct ctl_table kern_table[] = {
 		.procname	= "printk",
 		.data		= &console_loglevel,
 		.maxlen		= 4*sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_immutable,
 	},
 	{
 		.procname	= "printk_ratelimit",
@@ -824,6 +871,17 @@ static struct ctl_table kern_table[] = {
 		.extra2		= &two,
 	},
 #endif
+#ifdef CONFIG_VE
+	{
+		.procname	= "ve_allow_module_load",
+		.data		= &ve_allow_module_load,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+#endif
 	{
 		.procname	= "ngroups_max",
 		.data		= &ngroups_max,
@@ -994,10 +1052,10 @@ static struct ctl_table kern_table[] = {
 #if defined(CONFIG_MMU)
 	{
 		.procname	= "randomize_va_space",
-		.data		= &randomize_va_space,
+		.data		= &ve0._randomize_va_space,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_virtual,
 	},
 #endif
 #if defined(CONFIG_S390) && defined(CONFIG_SMP)
@@ -1204,6 +1262,13 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec,
 	},
 	{
+		.procname	= "oom_relaxation",
+		.data		= &sysctl_oom_relaxation,
+		.maxlen		= sizeof(sysctl_oom_relaxation),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_ms_jiffies,
+	},
+	{
 		.procname	= "overcommit_ratio",
 		.data		= &sysctl_overcommit_ratio,
 		.maxlen		= sizeof(sysctl_overcommit_ratio),
@@ -1275,6 +1340,14 @@ static struct ctl_table vm_table[] = {
 		.extra1		= &zero,
 	},
 	{
+		.procname	= "dirtytime_expire_seconds",
+		.data		= &dirtytime_expire_interval,
+		.maxlen		= sizeof(dirtytime_expire_interval),
+		.mode		= 0644,
+		.proc_handler	= dirtytime_interval_handler,
+		.extra1		= &zero,
+	},
+	{
 		.procname       = "nr_pdflush_threads",
 		.mode           = 0444 /* read-only */,
 		.proc_handler   = pdflush_proc_obsolete,
@@ -1426,6 +1499,15 @@ static struct ctl_table vm_table[] = {
 		.proc_handler	= proc_dointvec,
 		.extra1		= &zero,
 	},
+	{
+		.procname	= "vfs_cache_min_ratio",
+		.data		= &sysctl_vfs_cache_min_ratio,
+		.maxlen		= sizeof(sysctl_vfs_cache_min_ratio),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
 #ifdef HAVE_ARCH_PICK_MMAP_LAYOUT
 	{
 		.procname	= "legacy_va_layout",
@@ -1478,7 +1560,7 @@ static struct ctl_table vm_table[] = {
 		.procname	= "mmap_min_addr",
 		.data		= &dac_mmap_min_addr,
 		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= mmap_min_addr_handler,
 	},
 #endif
@@ -1547,6 +1629,17 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_doulongvec_minmax,
 	},
+#ifdef CONFIG_MEMCG
+	{
+		.procname	= "force_scan_thresh",
+		.data		= &sysctl_force_scan_thresh,
+		.maxlen		= sizeof(sysctl_force_scan_thresh),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one_hundred,
+	},
+#endif
 	{ }
 };
 
@@ -1558,16 +1651,23 @@ static struct ctl_table binfmt_misc_table[] = {
 
 static struct ctl_table fs_table[] = {
 	{
+		.procname	= "relatime_interval",
+		.data		= &relatime_interval,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
 		.procname	= "inode-nr",
 		.data		= &inodes_stat,
-		.maxlen		= 2*sizeof(int),
+		.maxlen		= 2*sizeof(long),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_inodes,
 	},
 	{
 		.procname	= "inode-state",
 		.data		= &inodes_stat,
-		.maxlen		= 7*sizeof(int),
+		.maxlen		= 7*sizeof(long),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_inodes,
 	},
@@ -1597,7 +1697,7 @@ static struct ctl_table fs_table[] = {
 	{
 		.procname	= "dentry-state",
 		.data		= &dentry_stat,
-		.maxlen		= 6*sizeof(int),
+		.maxlen		= 6*sizeof(long),
 		.mode		= 0444,
 		.proc_handler	= proc_nr_dentry,
 	},
@@ -1650,17 +1750,17 @@ static struct ctl_table fs_table[] = {
 #ifdef CONFIG_AIO
 	{
 		.procname	= "aio-nr",
-		.data		= &aio_nr,
-		.maxlen		= sizeof(aio_nr),
-		.mode		= 0444,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0444 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 	{
 		.procname	= "aio-max-nr",
-		.data		= &aio_max_nr,
-		.maxlen		= sizeof(aio_max_nr),
-		.mode		= 0644,
-		.proc_handler	= proc_doulongvec_minmax,
+		.data		= &ve0.aio_max_nr,
+		.maxlen		= sizeof(unsigned long),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_doulongvec_minmax_virtual,
 	},
 #endif /* CONFIG_AIO */
 #ifdef CONFIG_INOTIFY_USER
@@ -1713,6 +1813,13 @@ static struct ctl_table fs_table[] = {
 	},
 #endif
 	{
+		.procname	= "odirect_enable",
+		.data		= &ve0.odirect_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_virtual,
+	},
+	{
 		.procname	= "pipe-max-size",
 		.data		= &pipe_max_size,
 		.maxlen		= sizeof(int),
@@ -2238,9 +2345,10 @@ int proc_dointvec_minmax(struct ctl_table *table, int write,
 
 static void validate_coredump_safety(void)
 {
+	struct ve_struct *ve = get_exec_env();
 #ifdef CONFIG_COREDUMP
 	if (suid_dumpable == SUID_DUMP_ROOT &&
-	    core_pattern[0] != '/' && core_pattern[0] != '|') {
+	    ve->core_pattern[0] != '/' && ve->core_pattern[0] != '|') {
 		printk(KERN_WARNING "Unsafe core_pattern used with "\
 			"suid_dumpable=2. Pipe handler or fully qualified "\
 			"core dump path required.\n");
@@ -2714,6 +2822,42 @@ int proc_do_large_bitmap(struct ctl_table *table, int write,
 	}
 }
 
+static bool virtual_ptr(void **ptr, void *base, size_t size, void *cur)
+{
+	unsigned long addr = (unsigned long)*ptr;
+	unsigned long base_addr = (unsigned long)base;
+
+	if (addr >= base_addr && addr < base_addr + size) {
+		*ptr = (char *)cur + (addr - base_addr);
+		return true;
+	}
+	return false;
+}
+
+sysctl_virtual(proc_dointvec);
+sysctl_virtual(proc_doulongvec_minmax);
+
+static inline bool sysctl_in_container(void)
+{
+	return !ve_is_super(get_exec_env());
+}
+
+int proc_dointvec_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (write && sysctl_in_container())
+		return 0;
+	return proc_dointvec(table, write, buffer, lenp, ppos);
+}
+
+int proc_dostring_immutable(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+	if (write && sysctl_in_container())
+		return 0;
+	return proc_dostring(table, write, buffer, lenp, ppos);
+}
+
 #else /* CONFIG_PROC_SYSCTL */
 
 int proc_dostring(struct ctl_table *table, int write,
--- a/kernel/taskstats.c
+++ b/kernel/taskstats.c
@@ -46,6 +46,7 @@ static struct genl_family family = {
 	.name		= TASKSTATS_GENL_NAME,
 	.version	= TASKSTATS_GENL_VERSION,
 	.maxattr	= TASKSTATS_CMD_ATTR_MAX,
+	.netnsok	= true,
 };
 
 static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = {
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -38,10 +38,11 @@
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
 #include <linux/irq_work.h>
-#include <linux/sched.h>
 #include <linux/sched/sysctl.h>
 #include <linux/slab.h>
 #include <linux/compat.h>
+#include <linux/virtinfo.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4269,6 +4269,8 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	struct trace_iterator *iter = filp->private_data;
 	struct trace_array *tr = iter->tr;
 	ssize_t sret;
+	size_t loops = 0;
+	enum print_line_t ret = 0;
 
 	/* return any leftover data */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -4319,7 +4321,6 @@ waitagain:
 	trace_event_read_lock();
 	trace_access_lock(iter->cpu_file);
 	while (trace_find_next_entry_inc(iter) != NULL) {
-		enum print_line_t ret;
 		int len = iter->seq.len;
 
 		ret = print_trace_line(iter);
@@ -4354,8 +4355,20 @@ waitagain:
 	 * If there was nothing to send to user, in spite of consuming trace
 	 * entries, go back to wait for more entries.
 	 */
-	if (sret == -EBUSY)
+	if (sret == -EBUSY) {
+		loops++;
+		if ((loops % 10000) == 0) {
+			WARN_ON(1);
+			printk("%zuk loops in tracing_read_pipe\n", loops / 1000);
+			printk("trace_empty(iter): %d\n", trace_empty(iter));
+			printk("iter->seq.len    : %d\n", iter->seq.len);
+			printk("iter->seq.readpos: %d\n", iter->seq.readpos);
+			printk("iter->cpu_file   : %d\n", iter->cpu_file);
+			printk("iter->lost_events: %lu\n", iter->lost_events);
+			printk("ret              : %d\n", ret);
+			printk("cnt              : %zu\n", cnt);
+		}
 		goto waitagain;
+	}
 
 out:
 	mutex_unlock(&iter->mutex);
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -1015,6 +1015,9 @@ static void parse_init(struct filter_parse_state *ps,
 
 static char infix_next(struct filter_parse_state *ps)
 {
+	if (!ps->infix.cnt)
+		return 0;
+
 	ps->infix.cnt--;
 
 	return ps->infix.string[ps->infix.tail++];
@@ -1030,6 +1033,9 @@ static char infix_peek(struct filter_parse_state *ps)
 
 static void infix_advance(struct filter_parse_state *ps)
 {
+	if (!ps->infix.cnt)
+		return;
+
 	ps->infix.cnt--;
 	ps->infix.tail++;
 }
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -66,14 +66,11 @@ EXPORT_SYMBOL_GPL(init_user_ns);
  * when changing user ID's (ie setuid() and friends).
  */
 
-#define UIDHASH_BITS	(CONFIG_BASE_SMALL ? 3 : 7)
-#define UIDHASH_SZ	(1 << UIDHASH_BITS)
 #define UIDHASH_MASK		(UIDHASH_SZ - 1)
 #define __uidhashfn(uid)	(((uid >> UIDHASH_BITS) + uid) & UIDHASH_MASK)
-#define uidhashentry(uid)	(uidhash_table + __uidhashfn((__kuid_val(uid))))
+#define uidhashentry(ns, uid)  ((ns)->uidhash_table + __uidhashfn((__kuid_val(uid))))
 
 static struct kmem_cache *uid_cachep;
-struct hlist_head uidhash_table[UIDHASH_SZ];
 
 /*
  * The uidhash_lock is mostly taken from process context, but it is
@@ -147,9 +144,10 @@ struct user_struct *find_user(kuid_t uid)
 {
 	struct user_struct *ret;
 	unsigned long flags;
+	struct user_namespace *ns = current_user_ns();
 
 	spin_lock_irqsave(&uidhash_lock, flags);
-	ret = uid_hash_find(uid, uidhashentry(uid));
+	ret = uid_hash_find(uid, uidhashentry(ns, uid));
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 	return ret;
 }
@@ -168,9 +166,9 @@ void free_uid(struct user_struct *up)
 		local_irq_restore(flags);
 }
 
-struct user_struct *alloc_uid(kuid_t uid)
+struct user_struct *alloc_uid_ns(struct user_namespace *ns, kuid_t uid)
 {
-	struct hlist_head *hashent = uidhashentry(uid);
+	struct hlist_head *hashent = uidhashentry(ns, uid);
 	struct user_struct *up, *new;
 
 	spin_lock_irq(&uidhash_lock);
@@ -208,6 +206,11 @@ out_unlock:
 	return NULL;
 }
 
+struct user_struct *alloc_uid(kuid_t uid)
+{
+	return alloc_uid_ns(current_user_ns(), uid);
+}
+
 static int __init uid_cache_init(void)
 {
 	int n;
@@ -216,11 +219,11 @@ static int __init uid_cache_init(void)
 			0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
 
 	for(n = 0; n < UIDHASH_SZ; ++n)
-		INIT_HLIST_HEAD(uidhash_table + n);
+		INIT_HLIST_HEAD(init_user_ns.uidhash_table + n);
 
 	/* Insert the root user immediately (init already runs as root) */
 	spin_lock_irq(&uidhash_lock);
-	uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
+	uid_hash_insert(&root_user, uidhashentry(&init_user_ns, GLOBAL_ROOT_UID));
 	spin_unlock_irq(&uidhash_lock);
 
 	return 0;
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -50,8 +50,10 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns)
 	cred->user_ns = user_ns;
 }
 
-/* While user namespaces remain in tech preview disable them */
-static bool enable_user_ns_creation;
+/*
+ * While user namespaces remain in tech preview in RHEL7, they are disabled
+ * by default. Virtuozzo Containers run in user namespaces, so enable them
+ * by default.
+ */
+static bool enable_user_ns_creation = true;
 module_param_named(enable, enable_user_ns_creation, bool, 0444);
 MODULE_PARM_DESC(enable, "Enable creation of new user namespaces");
 
@@ -66,17 +68,14 @@ MODULE_PARM_DESC(enable, "Enable creation of new user namespaces");
 int create_user_ns(struct cred *new)
 {
 	struct user_namespace *ns, *parent_ns = new->user_ns;
+	struct user_struct *new_user;
 	kuid_t owner = new->euid;
 	kgid_t group = new->egid;
-	int ret;
-	static int __read_mostly called_mark_tech_preview = 0;
+	int ret, i;
 
 	if (!enable_user_ns_creation)
 		return -EINVAL;
 
-	if (!called_mark_tech_preview && !xchg(&called_mark_tech_preview, 1))
-		mark_tech_preview("user namespace", NULL);
-
 	if (parent_ns->level > 32)
 		return -EUSERS;
 
@@ -86,7 +85,7 @@ int create_user_ns(struct cred *new)
 	 * by verifing that the root directory is at the root of the
 	 * mount namespace which allows all files to be accessed.
 	 */
-	if (current_chrooted())
+	if (!IS_ENABLED(CONFIG_VE) && current_chrooted())
 		return -EPERM;
 
 	/* The creator needs a mapping in the parent user namespace
@@ -107,6 +106,19 @@ int create_user_ns(struct cred *new)
 		return ret;
 	}
 
+	for (i = 0; i < UIDHASH_SZ; ++i)
+		INIT_HLIST_HEAD(ns->uidhash_table + i);
+
+	new_user = alloc_uid_ns(ns, owner);
+	if (!new_user) {
+		proc_free_inum(ns->proc_inum);
+		kmem_cache_free(user_ns_cachep, ns);
+		return -ENOMEM;
+	}
+
+	free_uid(new->user);
+	new->user = new_user;
+
 	atomic_set(&ns->count, 1);
 	/* Leave the new->user_ns reference with the new user namespace. */
 	ns->parent = parent_ns;
@@ -128,6 +140,7 @@ int create_user_ns(struct cred *new)
 #endif
 	return 0;
 }
+EXPORT_SYMBOL(create_user_ns);
 
 int unshare_userns(unsigned long unshare_flags, struct cred **new_cred)
 {
@@ -978,8 +991,8 @@ static int userns_install(struct nsproxy *nsproxy, void *ns)
 	if (user_ns == current_user_ns())
 		return -EINVAL;
 
-	/* Threaded processes may not enter a different user namespace */
-	if (atomic_read(&current->mm->mm_users) > 1)
+	/* Tasks that share a thread group must share a user namespace */
+	if (!thread_group_empty(current))
 		return -EINVAL;
 
 	if (current->fs->users != 1)
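
With the hunks above, each new user namespace gets its own uid hash, the
namespace owner is charged through alloc_uid_ns(), and creation is
reachable by any single-threaded task since enable_user_ns_creation now
defaults to true. A minimal userspace probe for this path (illustrative
only; it reaches create_user_ns() via unshare(2) with CLONE_NEWUSER):

	#define _GNU_SOURCE
	#include <sched.h>
	#include <stdio.h>

	int main(void)
	{
		if (unshare(CLONE_NEWUSER) != 0) {
			/* EINVAL here if user namespaces are disabled */
			perror("unshare(CLONE_NEWUSER)");
			return 1;
		}
		puts("entered a new user namespace");
		return 0;
	}
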
--- a/kernel/utsname.c
+++ b/kernel/utsname.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/err.h>
 #include <linux/slab.h>
+#include <linux/mm.h>
 #include <linux/user_namespace.h>
 #include <linux/proc_ns.h>
 
@@ -22,8 +23,17 @@ static struct uts_namespace *create_uts_ns(void)
 	struct uts_namespace *uts_ns;
 
 	uts_ns = kmalloc(sizeof(struct uts_namespace), GFP_KERNEL);
-	if (uts_ns)
+	if (uts_ns) {
+#ifdef CONFIG_X86
+#ifdef CONFIG_X86_64
+		memset(&uts_ns->vdso, 0, sizeof(uts_ns->vdso));
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+		memset(&uts_ns->vdso32, 0, sizeof(uts_ns->vdso32));
+#endif
+#endif
 		kref_init(&uts_ns->kref);
+	}
 	return uts_ns;
 }
 
@@ -85,6 +95,25 @@ void free_uts_ns(struct kref *kref)
 	ns = container_of(kref, struct uts_namespace, kref);
 	put_user_ns(ns->user_ns);
 	proc_free_inum(ns->proc_inum);
+#ifdef CONFIG_X86
+#ifdef CONFIG_X86_64
+	if (ns->vdso.pages) {
+		int i;
+		vunmap(ns->vdso.addr);
+		for (i = 0; i < ns->vdso.nr_pages; i++)
+			put_page(ns->vdso.pages[i]);
+		kfree(ns->vdso.pages);
+	}
+#endif
+#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
+	if (ns->vdso32.pages) {
+		int i;
+		for (i = 0; i < ns->vdso32.nr_pages; i++)
+			put_page(ns->vdso32.pages[i]);
+		kfree(ns->vdso32.pages);
+	}
+#endif
+#endif
 	kfree(ns);
 }
 
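
The teardown added to free_uts_ns() implies that per-namespace vDSO state
was added to struct uts_namespace by a companion header change. The
following declaration is only inferred from the field accesses above; the
real field and type names may differ:

	/* inferred sketch, not the actual include/linux/utsname.h hunk */
	struct uts_ns_vdso {
		struct page **pages;	/* copied vDSO pages, put_page()d */
		int nr_pages;
		void *addr;		/* 64-bit mapping, vunmap()ed */
	};

	struct uts_namespace {
		/* ... existing fields ... */
	#ifdef CONFIG_X86_64
		struct uts_ns_vdso vdso;
	#endif
	#if defined(CONFIG_X86_32) || defined(CONFIG_IA32_EMULATION)
		struct uts_ns_vdso vdso32;
	#endif
	};
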
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -14,6 +14,7 @@
 #include <linux/utsname.h>
 #include <linux/sysctl.h>
 #include <linux/wait.h>
+#include <linux/stat.h>
 
 #ifdef CONFIG_PROC_SYSCTL
 
@@ -29,6 +30,14 @@ static void *get_uts(ctl_table *table, int write)
 		down_read(&uts_sem);
 	else
 		down_write(&uts_sem);
+
+	if (table->data == &virt_utsname.release) {
+		if (uts_ns == &init_uts_ns)
+			return virt_utsname.release;
+		else
+			return uts_ns->name.release;
+	}
+
 	return which;
 }
 
@@ -92,7 +101,7 @@ static struct ctl_table uts_kern_table[] = {
 		.procname	= "hostname",
 		.data		= init_uts_ns.name.nodename,
 		.maxlen		= sizeof(init_uts_ns.name.nodename),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_do_uts_string,
 		.poll		= &hostname_poll,
 	},
@@ -100,18 +109,20 @@ static struct ctl_table uts_kern_table[] = {
 		.procname	= "domainname",
 		.data		= init_uts_ns.name.domainname,
 		.maxlen		= sizeof(init_uts_ns.name.domainname),
-		.mode		= 0644,
+		.mode		= 0644 | S_ISVTX,
 		.proc_handler	= proc_do_uts_string,
 		.poll		= &domainname_poll,
 	},
 	{}
 };
 
-static struct ctl_table uts_root_table[] = {
+static struct ctl_table uts_virt_osrelease_table[] = {
 	{
-		.procname	= "kernel",
-		.mode		= 0555,
-		.child		= uts_kern_table,
+		.procname       = "virt_osrelease",
+		.data           = virt_utsname.release,
+		.maxlen         = sizeof(virt_utsname.release),
+		.mode           = 0644,
+		.proc_handler   = &proc_do_uts_string,
 	},
 	{}
 };
@@ -129,9 +140,15 @@ void uts_proc_notify(enum uts_proc proc)
 }
 #endif
 
+static struct ctl_path uts_path[] = {
+	{ .procname = "kernel", },
+	{ }
+};
+
 static int __init utsname_sysctl_init(void)
 {
-	register_sysctl_table(uts_root_table);
+	register_sysctl_paths(uts_path, uts_kern_table);
+	register_sysctl_paths(uts_path, uts_virt_osrelease_table);
 	return 0;
 }
 
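
Since both tables are registered under the same { "kernel" } ctl_path, the
new knob appears as /proc/sys/kernel/virt_osrelease; get_uts() serves reads
of it from the global virt_utsname in the init UTS namespace and from the
caller's own uts_ns->name.release inside a container. A minimal userspace
reader, assuming the sysctl is present:

	#include <stdio.h>

	int main(void)
	{
		char buf[128];
		FILE *f = fopen("/proc/sys/kernel/virt_osrelease", "r");

		if (!f) {
			perror("fopen");
			return 1;
		}
		if (fgets(buf, sizeof(buf), f))
			printf("virt_osrelease: %s", buf);
		fclose(f);
		return 0;
	}
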
--- /dev/null
+++ b/kernel/ve/Makefile
@@ -0,0 +1,26 @@
+#
+# kernel/ve/Makefile
+#
+# Copyright (c) 2000-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-$(CONFIG_VE) = ve.o veowner.o hooks.o vzstat_core.o ve-kobject.o
+obj-$(CONFIG_VZ_WDOG) += vzwdog.o
+obj-$(CONFIG_VE_CALLS) += vzmon.o
+
+vzmon-objs = vecalls.o
+
+obj-$(CONFIG_VZ_DEV) += vzdev.o
+obj-$(CONFIG_VZ_EVENT) += vzevent.o
+
+obj-$(CONFIG_VE_NETDEV_ACCOUNTING) += vznetstat/
+
+obj-$(CONFIG_VZ_LIST) += vzlist.o
+obj-$(CONFIG_VE_CALLS) += vzstat.o
+
+obj-$(CONFIG_VZ_IOLIMIT) += vziolimit.o
+
+obj-$(CONFIG_VE_IPTABLES) += ve.o
+
+obj-y += dummy/
--- /dev/null
+++ b/kernel/ve/dummy/Makefile
@@ -0,0 +1,10 @@
+#
+# kernel/ve/dummy/Makefile
+#
+# Copyright (c) 2000-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-m += ip6_vzprivnet.o
+obj-m += ip_vzprivnet.o
+obj-m += pio_nfs.o
--- /dev/null
+++ b/kernel/ve/dummy/ip6_vzprivnet.c
@@ -0,0 +1,22 @@
+/*
+ *  kernel/ve/dummy/ip6_vzprivnet.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+int __init dummy_init(void)
+{
+	return 0;
+}
+
+void __exit dummy_exit(void)
+{
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+MODULE_LICENSE("GPL v2");
--- /dev/null
+++ b/kernel/ve/dummy/ip_vzprivnet.c
@@ -0,0 +1,22 @@
+/*
+ *  kernel/ve/dummy/ip_vzprivnet.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+int __init dummy_init(void)
+{
+	return 0;
+}
+
+void __exit dummy_exit(void)
+{
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+MODULE_LICENSE("GPL v2");
--- /dev/null
+++ b/kernel/ve/dummy/pio_nfs.c
@@ -0,0 +1,23 @@
+/*
+ *  kernel/ve/dummy/pio_nfs.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+
+int __init dummy_init(void)
+{
+	return 0;
+}
+
+void __exit dummy_exit(void)
+{
+}
+
+module_init(dummy_init);
+module_exit(dummy_exit);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/hooks.c
@@ -0,0 +1,111 @@
+/*
+ *  kernel/ve/hooks.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ve.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/ve_proto.h>
+#include <linux/module.h>
+
+static struct list_head ve_hooks[VE_MAX_CHAINS];
+static DECLARE_RWSEM(ve_hook_sem);
+
+void ve_hook_register(int chain, struct ve_hook *vh)
+{
+	struct list_head *lh;
+	struct ve_hook *tmp;
+
+	BUG_ON(chain >= VE_MAX_CHAINS);
+
+	down_write(&ve_hook_sem);
+	list_for_each(lh, &ve_hooks[chain]) {
+		tmp = list_entry(lh, struct ve_hook, list);
+		if (vh->priority < tmp->priority)
+			break;
+	}
+
+	list_add_tail(&vh->list, lh);
+	up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_register);
+
+void ve_hook_unregister(struct ve_hook *vh)
+{
+	down_write(&ve_hook_sem);
+	list_del(&vh->list);
+	up_write(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_unregister);
+
+static inline int ve_hook_init(struct ve_hook *vh, struct ve_struct *ve)
+{
+	int err;
+
+	err = 0;
+	if (vh->init != NULL && try_module_get(vh->owner)) {
+		err = vh->init(ve);
+		module_put(vh->owner);
+	}
+	return err;
+}
+
+static inline void ve_hook_fini(struct ve_hook *vh, struct ve_struct *ve)
+{
+	if (vh->fini != NULL && try_module_get(vh->owner)) {
+		vh->fini(ve);
+		module_put(vh->owner);
+	}
+}
+
+int ve_hook_iterate_init(int chain, void *ve)
+{
+	struct ve_hook *vh;
+	int err;
+
+	err = 0;
+
+	down_read(&ve_hook_sem);
+	list_for_each_entry(vh, &ve_hooks[chain], list)
+		if ((err = ve_hook_init(vh, ve)) < 0)
+			break;
+
+	if (err)
+		list_for_each_entry_continue_reverse(vh, &ve_hooks[chain], list)
+			ve_hook_fini(vh, ve);
+
+	up_read(&ve_hook_sem);
+	return err;
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_init);
+
+void ve_hook_iterate_fini(int chain, void *ve)
+{
+	struct ve_hook *vh;
+
+	down_read(&ve_hook_sem);
+	list_for_each_entry_reverse(vh, &ve_hooks[chain], list)
+		ve_hook_fini(vh, ve);
+	up_read(&ve_hook_sem);
+}
+
+EXPORT_SYMBOL(ve_hook_iterate_fini);
+
+static int __init ve_hooks_init(void)
+{
+	int i;
+
+	for (i = 0; i < VE_MAX_CHAINS; i++)
+		INIT_LIST_HEAD(&ve_hooks[i]);
+	return 0;
+}
+
+core_initcall(ve_hooks_init);
+
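
ve_hooks implements priority-ordered init/fini chains: ve_hook_register()
inserts in ascending ->priority order, ve_hook_iterate_init() runs the
->init callbacks forward (unwinding with ->fini on failure), and
ve_hook_iterate_fini() runs ->fini in reverse. A hedged usage sketch; the
exact struct ve_hook layout lives in linux/ve_proto.h and is assumed here
from the field accesses above:

	static int my_ss_init(struct ve_struct *ve)
	{
		/* allocate per-VE state for this subsystem */
		return 0;
	}

	static void my_ss_fini(struct ve_struct *ve)
	{
		/* release per-VE state */
	}

	static struct ve_hook my_ss_hook = {
		.init		= my_ss_init,
		.fini		= my_ss_fini,
		.priority	= 100,	/* lower runs earlier on init */
		.owner		= THIS_MODULE,
	};

	static int __init my_ss_module_init(void)
	{
		ve_hook_register(VE_SS_CHAIN, &my_ss_hook);
		return 0;
	}
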
--- /dev/null
+++ b/kernel/ve/ve-kobject.c
@@ -0,0 +1,48 @@
+/*
+ *  kernel/ve/ve-kobject.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ve.h>
+#include <linux/kobject_ns.h>
+
+static const struct kobj_ns_type_operations *ve_child_ns_type(struct kobject *kobj)
+{
+	return &ve_ns_type_operations;
+}
+
+static void ve_kobj_release(struct kobject *kobj)
+{
+	kfree(kobj);
+}
+
+static struct kobj_type ve_kobj_ktype = {
+	.release	= ve_kobj_release,
+	.sysfs_ops	= &kobj_sysfs_ops,
+	.child_ns_type	= ve_child_ns_type,
+};
+
+struct kobject *kobject_create_and_add_ve(const char *name, struct kobject *parent)
+{
+	struct kobject *kobj;
+	int retval;
+
+	kobj = kzalloc(sizeof(*kobj), GFP_KERNEL);
+	if (!kobj)
+		return NULL;
+
+	kobject_init(kobj, &ve_kobj_ktype);
+
+	retval = kobject_add(kobj, parent, "%s", name);
+	if (retval) {
+		printk(KERN_WARNING "%s: kobject_add error: %d\n",
+		       __func__, retval);
+		kobject_put(kobj);
+		kobj = NULL;
+	}
+	return kobj;
+}
+
+
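
kobject_create_and_add_ve() mirrors the stock kobject_create_and_add(), but
the resulting kobject carries ve_child_ns_type, so sysfs entries created
beneath it are tagged with a ve namespace and can be filtered per
container. A hedged usage sketch (the "example" directory name is
illustrative):

	static struct kobject *example_kobj;

	static int __init example_init(void)
	{
		example_kobj = kobject_create_and_add_ve("example", NULL);
		return example_kobj ? 0 : -ENOMEM;
	}
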
--- /dev/null
+++ b/kernel/ve/ve.c
@@ -0,0 +1,1578 @@
+/*
+ *  kernel/ve/ve.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * 've.c' is a helper file performing VE sub-system initialization
+ */
+
+#include <linux/delay.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
+#include <linux/init.h>
+
+#include <linux/aio.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/sys.h>
+#include <linux/kdev_t.h>
+#include <linux/termios.h>
+#include <linux/netdevice.h>
+#include <linux/utsname.h>
+#include <linux/proc_fs.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/ve_proto.h>
+#include <linux/devpts_fs.h>
+#include <linux/user_namespace.h>
+#include <linux/init_task.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/fs_struct.h>
+#include <linux/task_work.h>
+#include <linux/ctype.h>
+
+#include <uapi/linux/vzcalluser.h>
+#include <linux/venet.h>
+#include <linux/vziptable_defs.h>
+#include <net/rtnetlink.h>
+
+static struct kmem_cache *ve_cachep;
+
+unsigned long vz_rstamp = 0x37e0f59d;
+EXPORT_SYMBOL(vz_rstamp);
+
+#ifdef CONFIG_MODULES
+struct module no_module = { .state = MODULE_STATE_GOING };
+EXPORT_SYMBOL(no_module);
+#endif
+
+struct kmapset_set ve_sysfs_perms;
+
+static DEFINE_PER_CPU(struct kstat_lat_pcpu_snap_struct, ve0_lat_stats);
+
+struct ve_struct ve0 = {
+	.ve_name		= "0",
+	.start_jiffies		= INITIAL_JIFFIES,
+	RCU_POINTER_INITIALIZER(ve_ns, &init_nsproxy),
+	.ve_netns		= &init_net,
+	.is_running		= 1,
+	.is_pseudosuper		= 1,
+#ifdef CONFIG_VE_IPTABLES
+	.ipt_mask		= VE_IP_ALL,	/* everything is allowed */
+#endif
+	.features		= -1,
+	.fsync_enable		= FSYNC_FILTERED,
+	.meminfo_val		= VE_MEMINFO_SYSTEM,
+	._randomize_va_space	=
+#ifdef CONFIG_COMPAT_BRK
+					1,
+#else
+					2,
+#endif
+	.sched_lat_ve.cur	= &ve0_lat_stats,
+	.init_cred		= &init_cred,
+	.mnt_nr			= 0,
+	.netns_avail_nr		= ATOMIC_INIT(INT_MAX),
+	.netns_max_nr		= INT_MAX,
+	.netif_avail_nr		= ATOMIC_INIT(INT_MAX),
+	.netif_max_nr		= INT_MAX,
+};
+EXPORT_SYMBOL(ve0);
+
+LIST_HEAD(ve_list_head);
+DEFINE_MUTEX(ve_list_lock);
+
+int nr_ve = 1;	/* One VE always exists. Compatibility with vestat */
+EXPORT_SYMBOL(nr_ve);
+
+static DEFINE_IDR(ve_idr);
+
+struct ve_struct *get_ve(struct ve_struct *ve)
+{
+	if (ve)
+		css_get(&ve->css);
+	return ve;
+}
+EXPORT_SYMBOL(get_ve);
+
+void put_ve(struct ve_struct *ve)
+{
+	if (ve)
+		css_put(&ve->css);
+}
+EXPORT_SYMBOL(put_ve);
+
+struct cgroup_subsys_state *ve_get_init_css(struct ve_struct *ve, int subsys_id)
+{
+	struct cgroup_subsys_state *css;
+	struct task_struct *task;
+
+	rcu_read_lock();
+	task = ve->ve_ns ? ve->init_task : &init_task;
+	while (true) {
+		css = task_subsys_state(task, subsys_id);
+		if (likely(css_tryget(css)))
+			break;
+		cpu_relax();
+	}
+	rcu_read_unlock();
+	return css;
+}
+
+static int ve_list_add(struct ve_struct *ve)
+{
+	int err;
+
+	mutex_lock(&ve_list_lock);
+	err = idr_alloc(&ve_idr, ve, ve->veid, ve->veid + 1, GFP_KERNEL);
+	if (err < 0) {
+		if (err == -ENOSPC)
+			err = -EEXIST;
+		goto out;
+	}
+	list_add(&ve->ve_list, &ve_list_head);
+	nr_ve++;
+	err = 0;
+out:
+	mutex_unlock(&ve_list_lock);
+	return err;
+}
+
+static void ve_list_del(struct ve_struct *ve)
+{
+	mutex_lock(&ve_list_lock);
+	idr_remove(&ve_idr, ve->veid);
+	list_del_init(&ve->ve_list);
+	nr_ve--;
+	mutex_unlock(&ve_list_lock);
+}
+
+/* caller provides a reference to the ve-struct */
+const char *ve_name(struct ve_struct *ve)
+{
+	return ve->ve_name;
+}
+EXPORT_SYMBOL(ve_name);
+
+/* under rcu_read_lock if task != current */
+const char *task_ve_name(struct task_struct *task)
+{
+	return rcu_dereference_check(task->task_ve, task == current)->ve_name;
+}
+EXPORT_SYMBOL(task_ve_name);
+
+struct ve_struct *get_ve_by_id(envid_t veid)
+{
+	struct ve_struct *ve;
+	rcu_read_lock();
+	ve = idr_find(&ve_idr, veid);
+	if (ve && !css_tryget(&ve->css))
+		ve = NULL;
+	rcu_read_unlock();
+	return ve;
+}
+EXPORT_SYMBOL(get_ve_by_id);
+
+EXPORT_SYMBOL(ve_list_lock);
+EXPORT_SYMBOL(ve_list_head);
+
+int vz_security_family_check(struct net *net, int family, int type)
+{
+	if (ve_is_super(net->owner_ve))
+		return 0;
+
+	switch (family) {
+	case PF_UNSPEC:
+	case PF_PACKET:
+	case PF_NETLINK:
+	case PF_UNIX:
+	case PF_INET:
+	case PF_INET6:
+	case PF_PPPOX:
+	case PF_KEY:
+		return 0;
+	case PF_BRIDGE:
+		if (type)
+			switch (type) {
+				case RTM_NEWNEIGH:
+				case RTM_DELNEIGH:
+				case RTM_GETNEIGH:
+					return 0;
+			}
+	default:
+		return -EAFNOSUPPORT;
+	}
+}
+EXPORT_SYMBOL_GPL(vz_security_family_check);
+
+int vz_security_protocol_check(struct net *net, int protocol)
+{
+	if (ve_is_super(net->owner_ve))
+		return 0;
+
+	switch (protocol) {
+	case  IPPROTO_IP:
+	case  IPPROTO_ICMP:
+	case  IPPROTO_TCP:
+	case  IPPROTO_UDP:
+	case  IPPROTO_RAW:
+	case  IPPROTO_DCCP:
+	case  IPPROTO_GRE:
+	case  IPPROTO_ESP:
+	case  IPPROTO_AH:
+	case  IPPROTO_SCTP:
+		return 0;
+	default:
+		return -EAFNOSUPPORT;
+	}
+}
+EXPORT_SYMBOL_GPL(vz_security_protocol_check);
+
+/* Check if current user_ns is initial for current ve */
+bool current_user_ns_initial(void)
+{
+	struct ve_struct *ve = get_exec_env();
+	bool ret = false;
+
+	rcu_read_lock();
+	if (ve->ve_ns && ve->init_cred->user_ns == current_user_ns())
+		ret = true;
+	rcu_read_unlock();
+
+	return ret;
+}
+EXPORT_SYMBOL(current_user_ns_initial);
+
+struct user_namespace *ve_init_user_ns(void)
+{
+	struct cred *init_cred;
+
+	init_cred = get_exec_env()->init_cred;
+	return init_cred ? init_cred->user_ns : &init_user_ns;
+}
+EXPORT_SYMBOL(ve_init_user_ns);
+
+int ve_net_hide_sysctl(struct net *net)
+{
+	/*
+	 * This can happen only on VE creation, when process created VE cgroup,
+	 * and clones a child with new network namespace.
+	 */
+	if (net->owner_ve->init_cred == NULL)
+		return 0;
+
+	/*
+	 * Expose sysctl only for container's init user namespace
+	 */
+	return net->user_ns != net->owner_ve->init_cred->user_ns;
+}
+EXPORT_SYMBOL(ve_net_hide_sysctl);
+
+int nr_threads_ve(struct ve_struct *ve)
+{
+	return cgroup_task_count(ve->css.cgroup);
+}
+EXPORT_SYMBOL(nr_threads_ve);
+
+struct kthread_attach_work {
+	struct kthread_work work;
+	struct completion done;
+	struct task_struct *target;
+	int result;
+};
+
+static void kthread_attach_fn(struct kthread_work *w)
+{
+	struct kthread_attach_work *work = container_of(w,
+			struct kthread_attach_work, work);
+	struct task_struct *target = work->target;
+	struct cred *cred;
+	int err;
+
+	switch_task_namespaces(current, get_nsproxy(target->nsproxy));
+
+	err = unshare_fs_struct();
+	if (err)
+		goto out;
+	set_fs_root(current->fs, &target->fs->root);
+	set_fs_pwd(current->fs, &target->fs->root);
+
+	err = -ENOMEM;
+	cred = prepare_kernel_cred(target);
+	if (!cred)
+		goto out;
+	err = commit_creds(cred);
+	if (err)
+		goto out;
+
+	err = change_active_pid_ns(current, task_active_pid_ns(target));
+	if (err)
+		goto out;
+
+	err = cgroup_attach_task_all(target, current);
+	if (err)
+		goto out;
+out:
+	work->result = err;
+	complete(&work->done);
+}
+
+struct kthread_create_work {
+	struct kthread_work work;
+	struct kthread_create_info *info;
+};
+
+extern void create_kthread(struct kthread_create_info *create);
+
+static void kthread_create_fn(struct kthread_work *w)
+{
+	struct kthread_create_work *work = container_of(w,
+			struct kthread_create_work, work);
+
+	create_kthread(work->info);
+}
+
+static void kthread_create_queue(void *data, struct kthread_create_info *info)
+{
+	struct ve_struct *ve = data;
+	struct kthread_create_work create = {
+		KTHREAD_WORK_INIT(create.work, kthread_create_fn),
+		.info = info,
+	};
+	queue_kthread_work(&ve->ve_kthread_worker, &create.work);
+	wait_for_completion(&info->done);
+}
+
+struct task_struct *kthread_create_on_node_ve(struct ve_struct *ve,
+					int (*threadfn)(void *data),
+					void *data, int node,
+					const char namefmt[], ...)
+{
+	va_list args;
+	struct task_struct *task;
+	void (*queue)(void *data, struct kthread_create_info *info) = NULL;
+
+	if (!ve_is_super(ve))
+		queue = kthread_create_queue;
+
+	va_start(args, namefmt);
+	task = __kthread_create_on_node(queue, ve, threadfn, data,
+					node, namefmt, args);
+	va_end(args);
+	return task;
+}
+EXPORT_SYMBOL(kthread_create_on_node_ve);
+
+static int ve_start_umh(struct ve_struct *ve)
+{
+	struct task_struct *t;
+
+	init_kthread_worker(&ve->ve_umh_worker);
+	t = kthread_run_ve(ve, kthread_worker_fn, &ve->ve_umh_worker,
+			"khelper");
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	ve->ve_umh_task = t;
+	return 0;
+}
+
+static void ve_stop_umh(struct ve_struct *ve)
+{
+	flush_kthread_worker(&ve->ve_umh_worker);
+	kthread_stop(ve->ve_umh_task);
+	ve->ve_umh_task = NULL;
+}
+
+static int ve_start_kthread(struct ve_struct *ve)
+{
+	struct task_struct *t;
+	struct kthread_attach_work attach = {
+		KTHREAD_WORK_INIT(attach.work, kthread_attach_fn),
+		COMPLETION_INITIALIZER_ONSTACK(attach.done),
+		.target = current,
+	};
+
+	init_kthread_worker(&ve->ve_kthread_worker);
+	t = kthread_run(kthread_worker_fn, &ve->ve_kthread_worker,
+			"kthreadd/%s", ve_name(ve));
+	if (IS_ERR(t))
+		return PTR_ERR(t);
+
+	queue_kthread_work(&ve->ve_kthread_worker, &attach.work);
+	wait_for_completion(&attach.done);
+	if (attach.result) {
+		kthread_stop(t);
+		return attach.result;
+	}
+
+	ve->ve_kthread_task = t;
+	return 0;
+}
+
+static void ve_stop_kthread(struct ve_struct *ve)
+{
+	flush_kthread_worker(&ve->ve_kthread_worker);
+	kthread_stop(ve->ve_kthread_task);
+	ve->ve_kthread_task = NULL;
+}
+
+static void ve_grab_context(struct ve_struct *ve)
+{
+	struct task_struct *tsk = current;
+
+	get_task_struct(tsk);
+	ve->init_task = tsk;
+	ve->init_cred = (struct cred *)get_current_cred();
+	rcu_assign_pointer(ve->ve_ns, get_nsproxy(tsk->nsproxy));
+	ve->ve_netns =  get_net(ve->ve_ns->net_ns);
+	get_fs_root(tsk->fs, &ve->root_path);
+	synchronize_rcu();
+}
+
+static void ve_drop_context(struct ve_struct *ve)
+{
+	struct nsproxy *ve_ns = ve->ve_ns;
+	path_put(&ve->root_path);
+	ve->root_path.mnt = NULL;
+	ve->root_path.dentry = NULL;
+
+	put_net(ve->ve_netns);
+	ve->ve_netns = NULL;
+
+	/* Allows dereferencing init_cred and init_task if ve_ns is set */
+	rcu_assign_pointer(ve->ve_ns, NULL);
+	synchronize_rcu();
+	put_nsproxy(ve_ns);
+
+	ve_hook_iterate_fini(VE_SHUTDOWN_CHAIN, ve);
+
+	put_cred(ve->init_cred);
+	ve->init_cred = NULL;
+
+	put_task_struct(ve->init_task);
+	ve->init_task = NULL;
+
+}
+
+static const struct timespec zero_time = { };
+
+extern void cgroup_mark_ve_root(struct ve_struct *ve);
+
+/* under ve->op_sem write-lock */
+static int ve_start_container(struct ve_struct *ve)
+{
+	struct task_struct *tsk = current;
+	int err;
+
+	if (!ve->veid)
+		return -ENOENT;
+
+	if (ve->is_running || ve->ve_ns)
+		return -EBUSY;
+
+	if (tsk->task_ve != ve || !is_child_reaper(task_pid(tsk)))
+		return -ECHILD;
+
+	/*
+	 * Set up uptime for new containers only. If the container was
+	 * restored, the value is already non-zero here, set up via a
+	 * cgroup write while resuming the container.
+	 */
+	if (timespec_equal(&ve->start_timespec, &zero_time)) {
+		ve->start_timespec = tsk->start_time;
+		ve->real_start_timespec = tsk->real_start_time;
+	}
+
+	/* The value is wrong, but it is never compared to process
+	 * start times */
+	ve->start_jiffies = get_jiffies_64();
+
+	ve_grab_context(ve);
+
+	err = ve_list_add(ve);
+	if (err)
+		goto err_list;
+
+	err = ve_start_kthread(ve);
+	if (err)
+		goto err_kthread;
+
+	err = ve_start_umh(ve);
+	if (err)
+		goto err_umh;
+
+	err = ve_hook_iterate_init(VE_SS_CHAIN, ve);
+	if (err < 0)
+		goto err_iterate;
+
+	cgroup_mark_ve_root(ve);
+
+	ve->is_running = 1;
+
+	printk(KERN_INFO "CT: %s: started\n", ve_name(ve));
+
+	get_ve(ve); /* for ve_exit_ns() */
+
+	return 0;
+
+err_iterate:
+	ve_stop_umh(ve);
+err_umh:
+	ve_stop_kthread(ve);
+err_kthread:
+	ve_list_del(ve);
+err_list:
+	ve_drop_context(ve);
+	return err;
+}
+
+void ve_stop_ns(struct pid_namespace *pid_ns)
+{
+	struct ve_struct *ve = current->task_ve;
+
+	/*
+	 * current->cgroups already switched to init_css_set in cgroup_exit(),
+	 * but current->task_ve still points to our exec ve.
+	 */
+	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
+		return;
+
+	down_write(&ve->op_sem);
+	/*
+	 * Here the VE changes its state into "not running".
+	 * op_sem works as barrier for vzctl ioctls.
+	 * ve_mutex works as barrier for ve_can_attach().
+	 */
+	ve->is_running = 0;
+
+	/*
+	 * Nor can it be in the pseudosuper state
+	 * anymore; set it up again if needed.
+	 */
+	ve->is_pseudosuper = 0;
+
+	ve_stop_umh(ve);
+	/*
+	 * Stop the kernel thread, or zap_pid_ns_processes() would wait for it forever.
+	 */
+	ve_stop_kthread(ve);
+	up_write(&ve->op_sem);
+}
+
+void ve_exit_ns(struct pid_namespace *pid_ns)
+{
+	struct ve_struct *ve = current->task_ve;
+
+	/*
+	 * current->cgroups already switched to init_css_set in cgroup_exit(),
+	 * but current->task_ve still points to our exec ve.
+	 */
+	if (!ve->ve_ns || ve->ve_ns->pid_ns != pid_ns)
+		return;
+
+	/*
+	 * At this point all userspace tasks in container are dead.
+	 */
+
+	if (ve->dev_sb) {
+		deactivate_super(ve->dev_sb);
+		ve->dev_sb = NULL;
+	}
+	if (ve->devpts_sb) {
+		deactivate_super(ve->devpts_sb);
+		ve->devpts_sb = NULL;
+	}
+
+	down_write(&ve->op_sem);
+	ve_hook_iterate_fini(VE_SS_CHAIN, ve);
+
+	ve_list_del(ve);
+	ve_drop_context(ve);
+	up_write(&ve->op_sem);
+
+	printk(KERN_INFO "CT: %s: stopped\n", ve_name(ve));
+
+	put_ve(ve); /* from ve_start_container() */
+}
+
+#ifdef CONFIG_VE_IPTABLES
+static __u64 ve_setup_iptables_mask(__u64 init_mask)
+{
+	/* Remove when userspace starts supplying IPv6-related bits. */
+	init_mask &= ~VE_IP_IPTABLES6;
+	init_mask &= ~VE_IP_FILTER6;
+	init_mask &= ~VE_IP_MANGLE6;
+	init_mask &= ~VE_IP_IPTABLE_NAT_MOD;
+	init_mask &= ~VE_NF_CONNTRACK_MOD;
+
+	if (mask_ipt_allow(init_mask, VE_IP_IPTABLES))
+		init_mask |= VE_IP_IPTABLES6;
+	if (mask_ipt_allow(init_mask, VE_IP_FILTER))
+		init_mask |= VE_IP_FILTER6;
+	if (mask_ipt_allow(init_mask, VE_IP_MANGLE))
+		init_mask |= VE_IP_MANGLE6;
+	if (mask_ipt_allow(init_mask, VE_IP_NAT))
+		init_mask |= VE_IP_IPTABLE_NAT;
+	if (mask_ipt_allow(init_mask, VE_IP_CONNTRACK))
+		init_mask |= VE_NF_CONNTRACK;
+
+	return init_mask;
+}
+#endif
+
+static struct cgroup_subsys_state *ve_create(struct cgroup *cg)
+{
+	struct ve_struct *ve = &ve0;
+	int err;
+
+	if (!cg->parent)
+		goto do_init;
+
+	/* forbid nested containers */
+	if (cgroup_ve(cg->parent) != ve)
+		return ERR_PTR(-ENOTDIR);
+
+	err = -ENOMEM;
+	ve = kmem_cache_zalloc(ve_cachep, GFP_KERNEL);
+	if (!ve)
+		goto err_ve;
+
+	ve->ve_name = kstrdup(cg->dentry->d_name.name, GFP_KERNEL);
+	if (!ve->ve_name)
+		goto err_name;
+
+	ve->_randomize_va_space = ve0._randomize_va_space;
+
+	ve->features = VE_FEATURES_DEF;
+
+	ve->odirect_enable = 2;
+	ve->fsync_enable = 2;
+
+#ifdef CONFIG_VE_IPTABLES
+	ve->ipt_mask = ve_setup_iptables_mask(VE_IP_DEFAULT);
+#endif
+
+	ve->sched_lat_ve.cur = alloc_percpu(struct kstat_lat_pcpu_snap_struct);
+	if (!ve->sched_lat_ve.cur)
+		goto err_lat;
+
+	err = ve_log_init(ve);
+	if (err)
+		goto err_log;
+
+	ve->meminfo_val = VE_MEMINFO_DEFAULT;
+
+	atomic_set(&ve->netns_avail_nr, NETNS_MAX_NR_DEFAULT);
+	ve->netns_max_nr = NETNS_MAX_NR_DEFAULT;
+
+	atomic_set(&ve->netif_avail_nr, NETIF_MAX_NR_DEFAULT);
+	ve->netif_max_nr = NETIF_MAX_NR_DEFAULT;
+
+do_init:
+	init_rwsem(&ve->op_sem);
+	INIT_LIST_HEAD(&ve->devices);
+	INIT_LIST_HEAD(&ve->ve_list);
+	INIT_LIST_HEAD(&ve->devmnt_list);
+	mutex_init(&ve->devmnt_mutex);
+	kmapset_init_key(&ve->ve_sysfs_perms);
+
+#ifdef CONFIG_AIO
+	spin_lock_init(&ve->aio_nr_lock);
+	ve->aio_nr = 0;
+	ve->aio_max_nr = AIO_MAX_NR_DEFAULT;
+#endif
+	ve->mnt_nr = 0;
+
+#ifdef CONFIG_COREDUMP
+	strcpy(ve->core_pattern, "core");
+#endif
+
+	return &ve->css;
+
+err_log:
+	free_percpu(ve->sched_lat_ve.cur);
+err_lat:
+	kfree(ve->ve_name);
+err_name:
+	kmem_cache_free(ve_cachep, ve);
+err_ve:
+	return ERR_PTR(err);
+}
+
+static void ve_devmnt_free(struct ve_devmnt *devmnt)
+{
+	if (!devmnt)
+		return;
+
+	kfree(devmnt->allowed_options);
+	kfree(devmnt->hidden_options);
+	kfree(devmnt);
+}
+
+static void free_ve_devmnts(struct ve_struct *ve)
+{
+	while (!list_empty(&ve->devmnt_list)) {
+		struct ve_devmnt *devmnt;
+
+		devmnt = list_first_entry(&ve->devmnt_list, struct ve_devmnt, link);
+		list_del(&devmnt->link);
+		ve_devmnt_free(devmnt);
+	}
+}
+
+static bool ve_task_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct task_struct *task = cgroup_taskset_first(tset);
+
+	if (cgroup_taskset_size(tset) > 1) {
+		pr_err_ratelimited("ve_cgroup#%s: attach of a thread group is not supported\n",
+				cg->name->name);
+		return false;
+	}
+	if (!thread_group_leader(task)) {
+		pr_err_ratelimited("ve_cgroup#%s: only thread group leader is allowed to attach\n",
+				cg->name->name);
+		return false;
+	}
+	if (!thread_group_empty(task)) {
+		pr_err_ratelimited("ve_cgroup#%s: only single-threaded process is allowed to attach\n",
+				cg->name->name);
+		return false;
+	}
+	return true;
+}
+
+static int ve_is_attachable(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct task_struct *task = cgroup_taskset_first(tset);
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (ve->is_running)
+		return 0;
+
+	if (!ve->veid) {
+		pr_err_ratelimited("ve_cgroup#%s: container's veid is not set\n",
+				cg->name->name);
+		return -EINVAL;
+	}
+
+	if (task->flags & PF_KTHREAD) {
+		/* Paranoia check: allow attaching a kthread only if the
+		 * cgroup is not empty.
+		 * This check is required for kthreadd, which is created on CT
+		 * start.
+		 */
+		if (nr_threads_ve(ve))
+			return 0;
+		pr_err_ratelimited("ve_cgroup#%s: can't attach kthread - empty group\n",
+				cg->name->name);
+	} else {
+		/* For a generic task, only one is allowed to enter a
+		 * non-running container: init.
+		 */
+		if (nr_threads_ve(ve) == 0)
+			return 0;
+		pr_err_ratelimited("ve_cgroup#%s: can't attach more than 1 task to "
+				"non-running container\n",
+				cg->name->name);
+	}
+	return -EINVAL;
+}
+
+static void ve_destroy(struct cgroup *cg)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	kmapset_unlink(&ve->ve_sysfs_perms, &ve_sysfs_perms);
+	free_ve_devmnts(ve);
+
+	ve_log_destroy(ve);
+#if IS_ENABLED(CONFIG_BINFMT_MISC)
+	kfree(ve->binfmt_misc);
+#endif
+	free_percpu(ve->sched_lat_ve.cur);
+	kfree(ve->ve_name);
+	kmem_cache_free(ve_cachep, ve);
+}
+
+static int ve_can_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	if (!ve_task_can_attach(cg, tset))
+		return -EINVAL;
+
+	return ve_is_attachable(cg, tset);
+}
+
+static void ve_update_cpuid_faulting(void *dummy)
+{
+	set_cpuid_faulting(!ve_is_super(get_exec_env()));
+}
+
+static void ve_attach(struct cgroup *cg, struct cgroup_taskset *tset)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct task_struct *task;
+
+	cgroup_taskset_for_each(task, cg, tset) {
+		/* this prohibits ptracing of a task that entered the VE from the host system */
+		if (ve->is_running && task->mm)
+			task->mm->vps_dumpable = VD_VE_ENTER_TASK;
+
+		/* Drop OOM protection. */
+		task->signal->oom_score_adj = 0;
+		task->signal->oom_score_adj_min = 0;
+
+		/* Leave parent exec domain */
+		task->parent_exec_id--;
+
+		task->task_ve = ve;
+	}
+
+	/* Adjust cpuid faulting */
+	on_each_cpu(ve_update_cpuid_faulting, NULL, 1);
+}
+
+static int ve_state_read(struct cgroup *cg, struct cftype *cft,
+			 struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (ve->is_running)
+		seq_puts(m, "RUNNING");
+	else if (!nr_threads_ve(ve))
+		seq_puts(m, "STOPPED");
+	else if (ve->ve_ns)
+		seq_puts(m, "STOPPING");
+	else
+		seq_puts(m, "STARTING");
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
+struct ve_start_callback {
+		struct callback_head head;
+		struct ve_struct *ve;
+};
+
+static void ve_start_work(struct callback_head *head)
+{
+	struct ve_start_callback *work;
+	struct ve_struct *ve;
+	int ret;
+
+	work = container_of(head, struct ve_start_callback, head);
+	ve = work->ve;
+
+	down_write(&ve->op_sem);
+	ret = ve_start_container(ve);
+	up_write(&ve->op_sem);
+	put_ve(ve);
+	if (ret)
+		force_sig(SIGKILL, current);
+
+	kfree(work);
+}
+
+static int ve_state_write(struct cgroup *cg, struct cftype *cft,
+			  const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct ve_start_callback *work = NULL;
+	struct task_struct *tsk;
+	int ret = -EINVAL;
+	pid_t pid;
+
+	if (!strcmp(buffer, "START")) {
+		down_write(&ve->op_sem);
+		ret = ve_start_container(ve);
+		up_write(&ve->op_sem);
+
+		return ret;
+	}
+
+	ret = sscanf(buffer, "START %d", &pid);
+	if (ret != 1)
+		return -EINVAL;
+
+	work = kmalloc(sizeof(struct ve_start_callback), GFP_KERNEL);
+	if (!work)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	tsk = find_task_by_vpid(pid);
+	if (!tsk) {
+		ret = -ESRCH;
+		goto out_unlock;
+	}
+
+	init_task_work(&work->head, ve_start_work);
+
+	work->ve = get_ve(ve);
+	ret = task_work_add(tsk, &work->head, 1);
+	if (ret)
+		put_ve(ve);
+
+out_unlock:
+	rcu_read_unlock();
+	if (ret)
+		kfree(work);
+
+	return ret;
+}
+
+static u64 ve_id_read(struct cgroup *cg, struct cftype *cft)
+{
+	return cgroup_ve(cg)->veid;
+}
+
+static int ve_id_write(struct cgroup *cg, struct cftype *cft, u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	int err = 0;
+
+	if (value <= 0 || value > INT_MAX)
+		return -EINVAL;
+
+	down_write(&ve->op_sem);
+	if (ve->is_running || ve->ve_ns) {
+		if (ve->veid != value)
+			err = -EBUSY;
+	} else
+		ve->veid = value;
+	up_write(&ve->op_sem);
+	return err;
+}
+
+static void *ve_mount_opts_start(struct seq_file *m, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct ve_devmnt *devmnt;
+	loff_t pos = *ppos;
+
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(devmnt, &ve->devmnt_list, link) {
+		if (!pos--)
+			return devmnt;
+	}
+	return NULL;
+}
+
+static void *ve_mount_opts_next(struct seq_file *m, void *v, loff_t *ppos)
+{
+	struct ve_struct *ve = m->private;
+	struct ve_devmnt *devmnt = v;
+
+	(*ppos)++;
+	if (list_is_last(&devmnt->link, &ve->devmnt_list))
+		return NULL;
+	return list_entry(devmnt->link.next, struct ve_devmnt, link);
+}
+
+static void ve_mount_opts_stop(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = m->private;
+
+	mutex_unlock(&ve->devmnt_mutex);
+}
+
+static int ve_mount_opts_show(struct seq_file *m, void *v)
+{
+	struct ve_devmnt *devmnt = v;
+	dev_t dev = devmnt->dev;
+
+	seq_printf(m, "0 %u:%u;", MAJOR(dev), MINOR(dev));
+	if (devmnt->hidden_options)
+		seq_printf(m, "1 %s;", devmnt->hidden_options);
+	if (devmnt->allowed_options)
+		seq_printf(m, "2 %s;", devmnt->allowed_options);
+	seq_putc(m, '\n');
+	return 0;
+}
+
+struct seq_operations ve_mount_opts_sops = {
+	.start = ve_mount_opts_start,
+	.stop = ve_mount_opts_stop,
+	.next = ve_mount_opts_next,
+	.show = ve_mount_opts_show,
+};
+
+static int ve_mount_opts_open(struct inode *inode, struct file *file)
+{
+	struct ve_struct *ve = cgroup_ve(file->f_dentry->d_parent->d_fsdata);
+	struct seq_file *m;
+	int ret;
+
+	if (ve_is_super(ve))
+		return -ENODEV;
+
+	ret = seq_open(file, &ve_mount_opts_sops);
+	if (!ret) {
+		m = file->private_data;
+		m->private = ve;
+	}
+	return ret;
+}
+
+static ssize_t ve_mount_opts_read(struct cgroup *cgrp, struct cftype *cft,
+				  struct file *file, char __user *buf,
+				  size_t nbytes, loff_t *ppos)
+{
+	return seq_read(file, buf, nbytes, ppos);
+}
+
+static int ve_mount_opts_release(struct inode *inode, struct file *file)
+{
+	return seq_release(inode, file);
+}
+
+/*
+ * 'data' for VE_CONFIGURE_MOUNT_OPTIONS is a zero-terminated string
+ * consisting of substrings separated by MNTOPT_DELIM.
+ */
+#define MNTOPT_DELIM ';'
+
+/*
+ * Each substring has the form of "<type> <comma-separated-list-of-options>"
+ * where types are:
+ */
+enum {
+	MNTOPT_DEVICE = 0,
+	MNTOPT_HIDDEN = 1,
+	MNTOPT_ALLOWED = 2,
+};
+
+/*
+ * 'ptr' points to the first character of buffer to parse
+ * 'endp' points to the last character of buffer to parse
+ */
+static int ve_parse_mount_options(const char *ptr, const char *endp,
+				  struct ve_devmnt *devmnt)
+{
+	while (*ptr) {
+		const char *delim = strchr(ptr, MNTOPT_DELIM) ? : endp;
+		char *space = strchr(ptr, ' ');
+		int type;
+		char *options, c, s;
+		int options_size = delim - space;
+		char **opts_pp = NULL; /* where to store 'options' */
+
+		if (delim == ptr || !space || options_size <= 1 ||
+		    !isdigit(*ptr) || space > delim)
+			return -EINVAL;
+
+		if (sscanf(ptr, "%d%c", &type, &c) != 2 || c != ' ')
+			return -EINVAL;
+
+		if (type == MNTOPT_DEVICE) {
+			unsigned major, minor;
+			if (devmnt->dev)
+				return -EINVAL; /* Already set */
+			if (sscanf(space + 1, "%u%c%u%c", &major, &c,
+							  &minor, &s) != 4 ||
+			    c != ':' || s != MNTOPT_DELIM)
+				return -EINVAL;
+			devmnt->dev = MKDEV(major, minor);
+			goto next;
+		}
+
+		options = kmalloc(options_size, GFP_KERNEL);
+		if (!options)
+			return -ENOMEM;
+
+		strncpy(options, space + 1, options_size - 1);
+		options[options_size - 1] = 0;
+
+		switch (type) {
+		case MNTOPT_ALLOWED:
+			opts_pp = &devmnt->allowed_options;
+			break;
+		case MNTOPT_HIDDEN:
+			opts_pp = &devmnt->hidden_options;
+			break;
+		}
+
+		/* wrong type or already set */
+		if (!opts_pp || *opts_pp) {
+			kfree(options);
+			return -EINVAL;
+		}
+
+		*opts_pp = options;
+next:
+		if (!*delim)
+			break;
+
+		ptr = delim + 1;
+	}
+
+	if (!devmnt->dev)
+		return -EINVAL;
+	return 0;
+}
+
+static int ve_mount_opts_write(struct cgroup *cg, struct cftype *cft,
+			       const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct ve_devmnt *devmnt, *old;
+	int size, err;
+
+	size = strlen(buffer);
+	if (size <= 1)
+		return -EINVAL;
+
+	devmnt = kzalloc(sizeof(*devmnt), GFP_KERNEL);
+	if (!devmnt)
+		return -ENOMEM;
+
+	err = ve_parse_mount_options(buffer, buffer + size, devmnt);
+	if (err) {
+		ve_devmnt_free(devmnt);
+		return err;
+	}
+
+	mutex_lock(&ve->devmnt_mutex);
+	list_for_each_entry(old, &ve->devmnt_list, link) {
+		/* Delete old devmnt */
+		if (old->dev == devmnt->dev) {
+			list_del(&old->link);
+			ve_devmnt_free(old);
+			break;
+		}
+	}
+	list_add(&devmnt->link, &ve->devmnt_list);
+	mutex_unlock(&ve->devmnt_mutex);
+
+	return 0;
+}
+
+static int ve_os_release_read(struct cgroup *cg, struct cftype *cft,
+			      struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	int ret = 0;
+
+	down_read(&ve->op_sem);
+
+	if (!ve->ve_ns) {
+		ret = -ENOENT;
+		goto up_opsem;
+	}
+
+	down_read(&uts_sem);
+	seq_puts(m, ve->ve_ns->uts_ns->name.release);
+	seq_putc(m, '\n');
+	up_read(&uts_sem);
+up_opsem:
+	up_read(&ve->op_sem);
+
+	return ret;
+}
+
+static int ve_os_release_write(struct cgroup *cg, struct cftype *cft,
+			       const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	char *release;
+	int ret = 0;
+
+	down_read(&ve->op_sem);
+
+	if (!ve->ve_ns) {
+		ret = -ENOENT;
+		goto up_opsem;
+	}
+
+	down_write(&uts_sem);
+	release = ve->ve_ns->uts_ns->name.release;
+	strncpy(release, buffer, __NEW_UTS_LEN);
+	release[__NEW_UTS_LEN] = '\0';
+	up_write(&uts_sem);
+up_opsem:
+	up_read(&ve->op_sem);
+
+	return ret;
+}
+
+enum {
+	VE_CF_STATE,
+	VE_CF_FEATURES,
+	VE_CF_IPTABLES_MASK,
+	VE_CF_PSEUDOSUPER,
+	VE_CF_CLOCK_MONOTONIC,
+	VE_CF_CLOCK_BOOTBASED,
+	VE_CF_AIO_MAX_NR,
+	VE_CF_PID_MAX,
+	VE_CF_NETNS_MAX_NR,
+	VE_CF_NETNS_NR,
+	VE_CF_NETIF_MAX_NR,
+	VE_CF_NETIF_NR,
+};
+
+static int ve_ts_read(struct cgroup *cg, struct cftype *cft, struct seq_file *m)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct timespec ts, *delta;
+
+	do_posix_clock_monotonic_gettime(&ts);
+	if (cft->private == VE_CF_CLOCK_MONOTONIC) {
+		delta = &ve->start_timespec;
+	} else if (cft->private == VE_CF_CLOCK_BOOTBASED) {
+		delta = &ve->real_start_timespec;
+		monotonic_to_bootbased(&ts);
+	} else {
+		delta = &ts;
+		memset(&ts, 0, sizeof(ts));
+		WARN_ON_ONCE(1);
+	}
+
+	set_normalized_timespec(&ts, ts.tv_sec - delta->tv_sec,
+				ts.tv_nsec - delta->tv_nsec);
+	seq_printf(m, "%ld %ld", ts.tv_sec, ts.tv_nsec);
+	return 0;
+}
+
+static int ve_ts_write(struct cgroup *cg, struct cftype *cft, const char *buffer)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	struct timespec ts, delta, *target;
+
+	if (sscanf(buffer, "%ld %ld", &delta.tv_sec, &delta.tv_nsec) != 2)
+		return -EINVAL;
+
+	do_posix_clock_monotonic_gettime(&ts);
+	if (cft->private == VE_CF_CLOCK_MONOTONIC) {
+		target = &ve->start_timespec;
+	} else if (cft->private == VE_CF_CLOCK_BOOTBASED) {
+		target = &ve->real_start_timespec;
+		monotonic_to_bootbased(&ts);
+	} else {
+		WARN_ON_ONCE(1);
+		return -EINVAL;
+	}
+
+	set_normalized_timespec(target, ts.tv_sec - delta.tv_sec,
+				ts.tv_nsec - delta.tv_nsec);
+	return 0;
+}
+
+static u64 ve_read_u64(struct cgroup *cg, struct cftype *cft)
+{
+	if (cft->private == VE_CF_FEATURES)
+		return cgroup_ve(cg)->features;
+#ifdef CONFIG_VE_IPTABLES
+	else if (cft->private == VE_CF_IPTABLES_MASK)
+		return cgroup_ve(cg)->ipt_mask;
+#endif
+	else if (cft->private == VE_CF_PSEUDOSUPER)
+		return cgroup_ve(cg)->is_pseudosuper;
+	else if (cft->private == VE_CF_AIO_MAX_NR)
+		return cgroup_ve(cg)->aio_max_nr;
+	else if (cft->private == VE_CF_PID_MAX) {
+		struct ve_struct *ve = cgroup_ve(cg);
+		if (ve->ve_ns && ve->ve_ns->pid_ns)
+			return ve->ve_ns->pid_ns->pid_max;
+	} else if (cft->private == VE_CF_NETNS_MAX_NR)
+		return cgroup_ve(cg)->netns_max_nr;
+	else if (cft->private == VE_CF_NETNS_NR)
+		return atomic_read(&cgroup_ve(cg)->netns_avail_nr);
+	else if (cft->private == VE_CF_NETIF_MAX_NR)
+		return cgroup_ve(cg)->netif_max_nr;
+	else if (cft->private == VE_CF_NETIF_NR)
+		return atomic_read(&cgroup_ve(cg)->netif_avail_nr);
+	return 0;
+}
+
+/*
+ * Move the VE into the pseudosuper state, where some privileged
+ * operations, such as mounting cgroups from inside the VE context,
+ * are allowed, for example for the sake of container restore.
+ *
+ * While dropping pseudosuper privileges is allowed from any
+ * context, to set this value up one has to be the real
+ * node's owner.
+ */
+static int ve_write_pseudosuper(struct cgroup *cg,
+				struct cftype *cft,
+				u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (!ve_capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!ve_is_super(get_exec_env()) && value)
+		return -EPERM;
+
+	down_write(&ve->op_sem);
+	if (value && (ve->is_running || ve->ve_ns)) {
+		up_write(&ve->op_sem);
+		return -EBUSY;
+	}
+	ve->is_pseudosuper = value;
+	up_write(&ve->op_sem);
+
+	return 0;
+}
+
+extern int pid_max_min, pid_max_max;
+
+static int ve_write_pid_max(struct cgroup *cg,
+			    struct cftype *cft,
+			    u64 value)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+	if (!ve->ve_ns || !ve->ve_ns->pid_ns)
+		return -EBUSY;
+
+	if (pid_max_min > value ||
+	     pid_max_max < value)
+		return -EINVAL;
+
+	ve->ve_ns->pid_ns->pid_max = value;
+	return 0;
+}
+
+static int _ve_write_u64(struct cgroup *cg, struct cftype *cft,
+                         u64 value, int running)
+{
+	struct ve_struct *ve = cgroup_ve(cg);
+
+	if (!ve_is_super(get_exec_env()) &&
+	    !ve->is_pseudosuper)
+		return -EPERM;
+
+	down_write(&ve->op_sem);
+	if (!running && (ve->is_running || ve->ve_ns)) {
+		up_write(&ve->op_sem);
+		return -EBUSY;
+	}
+
+	if (cft->private == VE_CF_FEATURES)
+		ve->features = value;
+#ifdef CONFIG_VE_IPTABLES
+	else if (cft->private == VE_CF_IPTABLES_MASK)
+		ve->ipt_mask = ve_setup_iptables_mask(value);
+#endif
+	else if (cft->private == VE_CF_AIO_MAX_NR)
+		ve->aio_max_nr = value;
+	else if (cft->private == VE_CF_PID_MAX) {
+		int ret;
+		ret = ve_write_pid_max(cg, cft, value);
+		up_write(&ve->op_sem);
+		return ret;
+	} else if (cft->private == VE_CF_NETNS_MAX_NR) {
+		int delta = value - ve->netns_max_nr;
+
+		ve->netns_max_nr = value;
+		atomic_add(delta, &ve->netns_avail_nr);
+	} else if (cft->private == VE_CF_NETIF_MAX_NR) {
+		int delta = value - ve->netif_max_nr;
+
+		ve->netif_max_nr = value;
+		atomic_add(delta, &ve->netif_avail_nr);
+	}
+	up_write(&ve->op_sem);
+	return 0;
+}
+
+static int ve_write_u64(struct cgroup *cg, struct cftype *cft, u64 value)
+{
+	return _ve_write_u64(cg, cft, value, 0);
+}
+
+static int ve_write_running_u64(struct cgroup *cg, struct cftype *cft, u64 value)
+{
+	return _ve_write_u64(cg, cft, value, 1);
+}
+
+static struct cftype ve_cftypes[] = {
+	{
+		.name			= "state",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_state_read,
+		.write_string		= ve_state_write,
+		.private		= VE_CF_STATE,
+	},
+	{
+		.name			= "veid",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_id_read,
+		.write_u64		= ve_id_write,
+	},
+	{
+		.name			= "features",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_FEATURES,
+	},
+	{
+		.name			= "mount_opts",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.open			= ve_mount_opts_open,
+		.read			= ve_mount_opts_read,
+		.release		= ve_mount_opts_release,
+		.write_string		= ve_mount_opts_write,
+	},
+	{
+		.name			= "os_release",
+		.max_write_len		= __NEW_UTS_LEN + 1,
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_os_release_read,
+		.write_string		= ve_os_release_write,
+	},
+	{
+		.name			= "iptables_mask",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_IPTABLES_MASK,
+	},
+	{
+		.name			= "pseudosuper",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_pseudosuper,
+		.private		= VE_CF_PSEUDOSUPER,
+	},
+	{
+		.name			= "clock_monotonic",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_ts_read,
+		.write_string		= ve_ts_write,
+		.private		= VE_CF_CLOCK_MONOTONIC,
+	},
+	{
+		.name			= "clock_bootbased",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_seq_string	= ve_ts_read,
+		.write_string		= ve_ts_write,
+		.private		= VE_CF_CLOCK_BOOTBASED,
+	},
+	{
+		.name			= "aio_max_nr",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_AIO_MAX_NR,
+	},
+	{
+		.name			= "pid_max",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_running_u64,
+		.private		= VE_CF_PID_MAX,
+	},
+	{
+		.name			= "netns_max_nr",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_NETNS_MAX_NR,
+	},
+	{
+		.name			= "netns_avail_nr",
+		.read_u64		= ve_read_u64,
+		.private		= VE_CF_NETNS_NR,
+	},
+	{
+		.name			= "netif_max_nr",
+		.flags			= CFTYPE_NOT_ON_ROOT,
+		.read_u64		= ve_read_u64,
+		.write_u64		= ve_write_u64,
+		.private		= VE_CF_NETIF_MAX_NR,
+	},
+	{
+		.name			= "netif_avail_nr",
+		.read_u64		= ve_read_u64,
+		.private		= VE_CF_NETIF_NR,
+	},
+	{ }
+};
+
+struct cgroup_subsys ve_subsys = {
+	.name		= "ve",
+	.subsys_id	= ve_subsys_id,
+	.css_alloc	= ve_create,
+	.css_free	= ve_destroy,
+	.can_attach	= ve_can_attach,
+	.attach		= ve_attach,
+	.base_cftypes	= ve_cftypes,
+};
+EXPORT_SYMBOL(ve_subsys);
+
+static int __init ve_subsys_init(void)
+{
+	ve_cachep = KMEM_CACHE(ve_struct, SLAB_PANIC);
+	list_add(&ve0.ve_list, &ve_list_head);
+	return 0;
+}
+late_initcall(ve_subsys_init);
+
+#ifdef CONFIG_CGROUP_SCHED
+int cpu_cgroup_proc_stat(struct cgroup *cgrp, struct cftype *cft,
+			 struct seq_file *p);
+
+int ve_show_cpu_stat(struct ve_struct *ve, struct seq_file *p)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_proc_stat(css->cgroup, NULL, p);
+	css_put(css);
+	return err;
+}
+
+int cpu_cgroup_proc_loadavg(struct cgroup *cgrp, struct cftype *cft,
+			    struct seq_file *p);
+
+int ve_show_loadavg(struct ve_struct *ve, struct seq_file *p)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_proc_loadavg(css->cgroup, NULL, p);
+	css_put(css);
+	return err;
+}
+
+int cpu_cgroup_get_avenrun(struct cgroup *cgrp, unsigned long *avenrun);
+
+int ve_get_cpu_avenrun(struct ve_struct *ve, unsigned long *avenrun)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_get_avenrun(css->cgroup, avenrun);
+	css_put(css);
+	return err;
+}
+EXPORT_SYMBOL(ve_get_cpu_avenrun);
+
+int cpu_cgroup_get_stat(struct cgroup *cgrp, struct kernel_cpustat *kstat);
+
+int ve_get_cpu_stat(struct ve_struct *ve, struct kernel_cpustat *kstat)
+{
+	struct cgroup_subsys_state *css;
+	int err;
+
+	css = ve_get_init_css(ve, cpu_cgroup_subsys_id);
+	err = cpu_cgroup_get_stat(css->cgroup, kstat);
+	css_put(css);
+	return err;
+}
+EXPORT_SYMBOL(ve_get_cpu_stat);
+#endif /* CONFIG_CGROUP_SCHED */
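
Taken together, the ve cgroup files above define the userspace start
protocol: create a ve cgroup, write the numeric container ID to ve.veid,
attach the future container init (a single-threaded thread group leader,
per ve_can_attach()), and write "START" - or "START <pid>" to start on
behalf of another task - to ve.state. A hedged sketch, assuming the ve
hierarchy is mounted at /sys/fs/cgroup/ve and pid 1234 is the prepared
init task:

	#include <stdio.h>

	static int write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f)
			return -1;
		fputs(val, f);
		return fclose(f);	/* write errors surface on close */
	}

	int main(void)
	{
		/* the cgroup directory is created beforehand with mkdir(2) */
		write_str("/sys/fs/cgroup/ve/100/ve.veid", "100");
		write_str("/sys/fs/cgroup/ve/100/tasks", "1234");
		write_str("/sys/fs/cgroup/ve/100/ve.state", "START 1234");
		return 0;
	}
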
--- /dev/null
+++ b/kernel/ve/vecalls.c
@@ -0,0 +1,763 @@
+/*
+ *  kernel/ve/vecalls.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * 'vecalls.c' is a file with basic VE support. It provides basic primitives
+ * along with the initialization code.
+ */
+
+#include <linux/ve.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/errno.h>
+#include <linux/unistd.h>
+#include <linux/slab.h>
+#include <linux/sys.h>
+#include <linux/fs.h>
+#include <linux/netdevice.h>
+#include <linux/utsname.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/kernel_stat.h>
+#include <linux/module.h>
+#include <linux/rcupdate.h>
+#include <linux/mount.h>
+#include <generated/utsrelease.h>
+
+#include <linux/venet.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/device_cgroup.h>
+
+static s64 ve_get_uptime(struct ve_struct *ve)
+{
+	struct timespec uptime;
+	do_posix_clock_monotonic_gettime(&uptime);
+	monotonic_to_bootbased(&uptime);
+	uptime = timespec_sub(uptime, ve->real_start_timespec);
+	return timespec_to_ns(&uptime);
+}
+
+static int fill_cpu_stat(envid_t veid, struct vz_cpu_stat __user *buf)
+{
+	struct ve_struct *ve;
+	struct vz_cpu_stat *vstat;
+	int retval;
+	int i;
+	unsigned long tmp;
+	unsigned long avenrun[3];
+	struct kernel_cpustat kstat;
+
+	if (!ve_is_super(get_exec_env()) && (veid != get_exec_env()->veid))
+		return -EPERM;
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -ESRCH;
+
+	retval = -ENOMEM;
+	vstat = kzalloc(sizeof(*vstat), GFP_KERNEL);
+	if (!vstat)
+		goto out_put_ve;
+
+	retval = ve_get_cpu_stat(ve, &kstat);
+	if (retval)
+		goto out_free;
+
+	retval = ve_get_cpu_avenrun(ve, avenrun);
+	if (retval)
+		goto out_free;
+
+	vstat->user_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[CPUTIME_USER]);
+	vstat->nice_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[CPUTIME_NICE]);
+	vstat->system_jif += (unsigned long)cputime64_to_clock_t(kstat.cpustat[CPUTIME_SYSTEM]);
+	vstat->idle_clk += cputime_to_usecs(kstat.cpustat[CPUTIME_IDLE]) * NSEC_PER_USEC;
+
+	vstat->uptime_clk = ve_get_uptime(ve);
+
+	vstat->uptime_jif = (unsigned long)jiffies_64_to_clock_t(
+				get_jiffies_64() - ve->start_jiffies);
+	for (i = 0; i < 3; i++) {
+		tmp = avenrun[i] + (FIXED_1/200);
+		vstat->avenrun[i].val_int = LOAD_INT(tmp);
+		vstat->avenrun[i].val_frac = LOAD_FRAC(tmp);
+	}
+
+	retval = 0;
+	if (copy_to_user(buf, vstat, sizeof(*vstat)))
+		retval = -EFAULT;
+out_free:
+	kfree(vstat);
+out_put_ve:
+	put_ve(ve);
+	return retval;
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * /proc/meminfo virtualization
+ *
+ **********************************************************************
+ **********************************************************************/
+static int ve_set_meminfo(envid_t veid, unsigned long val)
+{
+#ifdef CONFIG_BEANCOUNTERS
+	struct ve_struct *ve;
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -EINVAL;
+
+	if (val == 0)
+		val = VE_MEMINFO_SYSTEM;
+	else if (val == 1)
+		val = VE_MEMINFO_DEFAULT;
+
+	ve->meminfo_val = val;
+	put_ve(ve);
+	return 0;
+#else
+	return -ENOTTY;
+#endif
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Pieces of VE network
+ *
+ **********************************************************************
+ **********************************************************************/
+
+#ifdef CONFIG_NET
+#include <asm/uaccess.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/rtnetlink.h>
+#include <net/route.h>
+#include <net/ip_fib.h>
+#endif
+
+static int ve_dev_add(envid_t veid, char *dev_name)
+{
+	struct net_device *dev;
+	struct ve_struct *dst_ve;
+	struct net *dst_net;
+	int err = -ESRCH;
+
+	dst_ve = get_ve_by_id(veid);
+	if (dst_ve == NULL)
+		goto out;
+
+	dst_net = dst_ve->ve_netns;
+
+	rtnl_lock();
+	read_lock(&dev_base_lock);
+	dev = __dev_get_by_name(&init_net, dev_name);
+	read_unlock(&dev_base_lock);
+	if (dev == NULL)
+		goto out_unlock;
+
+	err = dev_change_net_namespace(dev, dst_net, dev_name);
+out_unlock:
+	rtnl_unlock();
+	put_ve(dst_ve);
+
+	if (dev == NULL)
+		printk(KERN_WARNING "%s: device %s not found\n",
+			__func__, dev_name);
+out:
+	return err;
+}
+
+static int ve_dev_del(envid_t veid, char *dev_name)
+{
+	struct net_device *dev;
+	struct ve_struct *src_ve;
+	struct net *src_net;
+	int err = -ESRCH;
+
+	src_ve = get_ve_by_id(veid);
+	if (src_ve == NULL)
+		goto out;
+
+	src_net = src_ve->ve_netns;
+
+	rtnl_lock();
+
+	read_lock(&dev_base_lock);
+	dev = __dev_get_by_name(src_net, dev_name);
+	read_unlock(&dev_base_lock);
+	if (dev == NULL)
+		goto out_unlock;
+
+	err = dev_change_net_namespace(dev, &init_net, dev_name);
+out_unlock:
+	rtnl_unlock();
+	put_ve(src_ve);
+
+	if (dev == NULL)
+		printk(KERN_WARNING "%s: device %s not found\n",
+			__func__, dev_name);
+out:
+	return err;
+}
+
+int real_ve_dev_map(envid_t veid, int op, char *dev_name)
+{
+	if (!capable_setveid())
+		return -EPERM;
+	switch (op) {
+	case VE_NETDEV_ADD:
+		return ve_dev_add(veid, dev_name);
+	case VE_NETDEV_DEL:
+		return ve_dev_del(veid, dev_name);
+	default:
+		return -EINVAL;
+	}
+}
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * VE information via /proc
+ *
+ **********************************************************************
+ **********************************************************************/
+#ifdef CONFIG_PROC_FS
+#if BITS_PER_LONG == 32
+#define VESTAT_LINE_WIDTH (6 * 11 + 6 * 21)
+#define VESTAT_LINE_FMT "%10s %10lu %10lu %10lu %10Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %10lu\n"
+#define VESTAT_HEAD_FMT "%10s %10s %10s %10s %10s %20s %20s %20s %20s %20s %20s %10s\n"
+#else
+#define VESTAT_LINE_WIDTH (12 * 21)
+#define VESTAT_LINE_FMT "%20s %20lu %20lu %20lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20Lu %20lu\n"
+#define VESTAT_HEAD_FMT "%20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s %20s\n"
+#endif
+
+static int vestat_seq_show(struct seq_file *m, void *v)
+{
+	struct list_head *entry;
+	struct ve_struct *ve;
+	struct ve_struct *curve;
+	int ret;
+	unsigned long user_ve, nice_ve, system_ve;
+	unsigned long long uptime;
+	u64 uptime_cycles, idle_time, strv_time, used;
+	struct kernel_cpustat kstat;
+
+	entry = (struct list_head *)v;
+	ve = list_entry(entry, struct ve_struct, ve_list);
+
+	curve = get_exec_env();
+	if (entry == ve_list_head.next ||
+	    (!ve_is_super(curve) && ve == curve)) {
+		/* print header */
+		seq_printf(m, "%-*s\n",
+			VESTAT_LINE_WIDTH - 1,
+			"Version: 2.2");
+		seq_printf(m, VESTAT_HEAD_FMT, "VEID",
+					"user", "nice", "system",
+					"uptime", "idle",
+					"strv", "uptime", "used",
+					"maxlat", "totlat", "numsched");
+	}
+
+	if (ve == get_ve0())
+		return 0;
+
+	ret = ve_get_cpu_stat(ve, &kstat);
+	if (ret)
+		return ret;
+
+	strv_time = 0;
+	user_ve = cputime_to_jiffies(kstat.cpustat[CPUTIME_USER]);
+	nice_ve = cputime_to_jiffies(kstat.cpustat[CPUTIME_NICE]);
+	system_ve = cputime_to_jiffies(kstat.cpustat[CPUTIME_SYSTEM]);
+	used = cputime_to_usecs(kstat.cpustat[CPUTIME_USED]) * NSEC_PER_USEC;
+	idle_time = cputime_to_usecs(kstat.cpustat[CPUTIME_IDLE]) *
+							NSEC_PER_USEC;
+
+	uptime_cycles = ve_get_uptime(ve);
+	uptime = get_jiffies_64() - ve->start_jiffies;
+
+	seq_printf(m, VESTAT_LINE_FMT, ve_name(ve),
+				user_ve, nice_ve, system_ve,
+				(unsigned long long)uptime,
+				(unsigned long long)idle_time,
+				(unsigned long long)strv_time,
+				(unsigned long long)uptime_cycles,
+				(unsigned long long)used,
+				(unsigned long long)ve->sched_lat_ve.last.maxlat,
+				(unsigned long long)ve->sched_lat_ve.last.totlat,
+				ve->sched_lat_ve.last.count);
+	return 0;
+}
+
+void *ve_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct ve_struct *curve;
+
+	curve = get_exec_env();
+	mutex_lock(&ve_list_lock);
+	if (!ve_is_super(curve)) {
+		if (*pos != 0)
+			return NULL;
+		return &curve->ve_list;
+	}
+
+	return seq_list_start(&ve_list_head, *pos);
+}
+EXPORT_SYMBOL(ve_seq_start);
+
+void *ve_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
+	else
+		return seq_list_next(v, &ve_list_head, pos);
+}
+EXPORT_SYMBOL(ve_seq_next);
+
+void ve_seq_stop(struct seq_file *m, void *v)
+{
+	mutex_unlock(&ve_list_lock);
+}
+EXPORT_SYMBOL(ve_seq_stop);
+
+static struct seq_operations vestat_seq_op = {
+	.start	= ve_seq_start,
+	.next	= ve_seq_next,
+	.stop	= ve_seq_stop,
+	.show	= vestat_seq_show
+};
+
+static int vestat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vestat_seq_op);
+}
+
+static struct file_operations proc_vestat_operations = {
+	.open	 = vestat_open,
+	.read	 = seq_read,
+	.llseek	 = seq_lseek,
+	.release = seq_release
+};
+
+static int devperms_seq_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve = list_entry(v, struct ve_struct, ve_list);
+
+	if (m->private == (void *)0) {
+		seq_printf(m, "Version: 2.7\n");
+		m->private = (void *)-1;
+	}
+
+	if (ve_is_super(ve))
+		seq_printf(m, "%10u b 016 *:*\n%10u c 006 *:*\n", 0, 0);
+	else
+		devcgroup_seq_show_ve(ve, m);
+
+	return 0;
+}
+
+static struct seq_operations devperms_seq_op = {
+	.start  = ve_seq_start,
+	.next   = ve_seq_next,
+	.stop   = ve_seq_stop,
+	.show   = devperms_seq_show,
+};
+
+static int devperms_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &devperms_seq_op);
+}
+
+static struct file_operations proc_devperms_ops = {
+	.open           = devperms_open,
+	.read           = seq_read,
+	.llseek         = seq_lseek,
+	.release        = seq_release,
+};
+
+static int vz_version_show(struct seq_file *file, void *v)
+{
+	static const char ver[] = VZVERSION "\n";
+
+	return seq_puts(file, ver);
+}
+
+static int vz_version_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, vz_version_show, NULL);
+}
+
+static struct file_operations proc_vz_version_operations = {
+	.open    = vz_version_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = single_release,
+};
+
+/* /proc/vz/veinfo */
+
+static ve_seq_print_t veaddr_seq_print_cb;
+
+void vzmon_register_veaddr_print_cb(ve_seq_print_t cb)
+{
+	rcu_assign_pointer(veaddr_seq_print_cb, cb);
+}
+EXPORT_SYMBOL(vzmon_register_veaddr_print_cb);
+
+void vzmon_unregister_veaddr_print_cb(ve_seq_print_t cb)
+{
+	rcu_assign_pointer(veaddr_seq_print_cb, NULL);
+	synchronize_rcu();
+}
+EXPORT_SYMBOL(vzmon_unregister_veaddr_print_cb);
+
+static int veinfo_seq_show(struct seq_file *m, void *v)
+{
+	struct ve_struct *ve;
+	ve_seq_print_t veaddr_seq_print;
+
+	ve = list_entry((struct list_head *)v, struct ve_struct, ve_list);
+
+	seq_printf(m, "%10s %5u %5u", ve_name(ve), ve->class_id, nr_threads_ve(ve));
+
+	rcu_read_lock();
+	veaddr_seq_print = rcu_dereference(veaddr_seq_print_cb);
+	if (veaddr_seq_print)
+		veaddr_seq_print(m, ve);
+	rcu_read_unlock();
+
+	seq_putc(m, '\n');
+	return 0;
+}
+
+static struct seq_operations veinfo_seq_op = {
+	.start	= ve_seq_start,
+	.next	= ve_seq_next,
+	.stop	= ve_seq_stop,
+	.show	= veinfo_seq_show,
+};
+
+static int veinfo_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &veinfo_seq_op);
+}
+
+static struct file_operations proc_veinfo_operations = {
+	.open		= veinfo_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static int __init init_vecalls_proc(void)
+{
+	struct proc_dir_entry *de;
+
+	de = proc_create("vestat", S_IFREG | S_IRUSR | S_ISVTX, proc_vz_dir,
+			&proc_vestat_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make vestat proc entry\n");
+
+	de = proc_create("devperms", S_IFREG | S_IRUSR, proc_vz_dir,
+			&proc_devperms_ops);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make devperms proc entry\n");
+
+	de = proc_create("version", S_IFREG | S_IRUGO, proc_vz_dir,
+			&proc_vz_version_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make version proc entry\n");
+
+	de = proc_create("veinfo", S_IFREG | S_IRUSR | S_ISVTX, proc_vz_dir,
+			&proc_veinfo_operations);
+	if (!de)
+		printk(KERN_WARNING "VZMON: can't make veinfo proc entry\n");
+
+	return 0;
+}
+
+static void fini_vecalls_proc(void)
+{
+	remove_proc_entry("version", proc_vz_dir);
+	remove_proc_entry("devperms", proc_vz_dir);
+	remove_proc_entry("vestat", proc_vz_dir);
+	remove_proc_entry("veinfo", proc_vz_dir);
+}
+#else
+#define init_vecalls_proc()	(0)
+#define fini_vecalls_proc()	do { } while (0)
+#endif /* CONFIG_PROC_FS */
+
+static int init_ve_osrelease(struct ve_struct *ve, char *release)
+{
+	if (!release)
+		return -ENODATA;
+
+	if (strlen(release) >= sizeof(ve->ve_ns->uts_ns->name.release))
+		return -EMSGSIZE;
+
+	down_write(&uts_sem);
+	strcpy(ve->ve_ns->uts_ns->name.release, release);
+	up_write(&uts_sem);
+
+	return 0;
+}
+
+static int ve_configure(envid_t veid, unsigned int key,
+			unsigned int val, unsigned int size, char *data)
+{
+	struct ve_struct *ve;
+	int err = -ENOKEY;
+
+	if (key == VE_CONFIGURE_OPEN_TTY)
+		return vtty_open_master(veid, val);
+
+	ve = get_ve_by_id(veid);
+	if (!ve)
+		return -EINVAL;
+
+	switch (key) {
+	case VE_CONFIGURE_OS_RELEASE:
+		err = init_ve_osrelease(ve, data);
+		break;
+	}
+
+	put_ve(ve);
+	return err;
+}
+
+static int ve_configure_ioctl(struct vzctl_ve_configure *arg)
+{
+	int err;
+	struct vzctl_ve_configure s;
+	char *data = NULL;
+
+	err = -EFAULT;
+	if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+		goto out;
+	if (s.size) {
+		if (s.size > PAGE_SIZE)
+			return -EMSGSIZE;
+
+		data = kzalloc(s.size + 1, GFP_KERNEL);
+		if (unlikely(!data))
+			return -ENOMEM;
+
+		if (copy_from_user(data, (void __user *) &arg->data, s.size))
+			goto out;
+	}
+	err = ve_configure(s.veid, s.key, s.val, s.size, data);
+out:
+	kfree(data);
+	return err;
+}
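+
+/*
+ * A userspace sketch (not part of the original code) of driving
+ * VE_CONFIGURE_OS_RELEASE through the ioctl above.  "fd" is assumed to
+ * be an open descriptor of the vzctl character device, and veid 101 is
+ * made up; the struct layout follows the copy_from_user() calls above:
+ *
+ *	struct vzctl_ve_configure *cfg;
+ *	const char *rel = "some-release-string";
+ *
+ *	cfg = calloc(1, sizeof(*cfg) + strlen(rel) + 1);
+ *	cfg->veid = 101;
+ *	cfg->key  = VE_CONFIGURE_OS_RELEASE;
+ *	cfg->size = strlen(rel) + 1;
+ *	memcpy(cfg->data, rel, cfg->size);
+ *	ioctl(fd, VZCTL_VE_CONFIGURE, cfg);
+ */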
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * User ctl
+ *
+ **********************************************************************
+ **********************************************************************/
+
+int vzcalls_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err;
+
+	err = -ENOTTY;
+	switch (cmd) {
+	    case VZCTL_MARK_ENV_TO_DOWN: {
+		        /* Compatibility issue */
+		        err = 0;
+		}
+		break;
+#ifdef CONFIG_INET
+	    case VZCTL_VE_NETDEV: {
+			struct vzctl_ve_netdev d;
+			char *s;
+			err = -EFAULT;
+			if (copy_from_user(&d, (void __user *)arg, sizeof(d)))
+				break;
+			err = -ENOMEM;
+			s = kmalloc(IFNAMSIZ+1, GFP_KERNEL);
+			if (s == NULL)
+				break;
+			err = -EFAULT;
+			if (strncpy_from_user(s, d.dev_name, IFNAMSIZ) > 0) {
+				s[IFNAMSIZ] = 0;
+				err = real_ve_dev_map(d.veid, d.op, s);
+			}
+			kfree(s);
+		}
+		break;
+#endif
+	    case VZCTL_ENV_CREATE: {
+			err = -ENOTSUPP;
+		}
+		break;
+	    case VZCTL_ENV_CREATE_DATA: {
+			err = -ENOTSUPP;
+		}
+		break;
+	    case VZCTL_GET_CPU_STAT: {
+			struct vzctl_cpustatctl s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = fill_cpu_stat(s.veid, s.cpustat);
+		}
+		break;
+	    case VZCTL_VE_MEMINFO: {
+			struct vzctl_ve_meminfo s;
+			err = -EFAULT;
+			if (copy_from_user(&s, (void __user *)arg, sizeof(s)))
+				break;
+			err = ve_set_meminfo(s.veid, s.val);
+		}
+		break;
+	    case VZCTL_VE_CONFIGURE:
+		err = ve_configure_ioctl((struct vzctl_ve_configure *)arg);
+		break;
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+int compat_vzcalls_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	int err;
+
+	switch (cmd) {
+	case VZCTL_GET_CPU_STAT: {
+		/* FIXME: no compat conversion yet; falls through */
+	}
+	case VZCTL_COMPAT_ENV_CREATE_DATA: {
+		struct compat_vzctl_env_create_data cs;
+		struct vzctl_env_create_data __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void __user *)arg, sizeof(cs)))
+			break;
+
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(cs.flags, &s->flags) ||
+		    put_user(cs.class_id, &s->class_id) ||
+		    put_user(compat_ptr(cs.data), &s->data) ||
+		    put_user(cs.datalen, &s->datalen))
+			break;
+		err = vzcalls_ioctl(file, VZCTL_ENV_CREATE_DATA,
+						(unsigned long)s);
+		break;
+	}
+#ifdef CONFIG_NET
+	case VZCTL_COMPAT_VE_NETDEV: {
+		struct compat_vzctl_ve_netdev cs;
+		struct vzctl_ve_netdev __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void __user *)arg, sizeof(cs)))
+			break;
+
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(cs.op, &s->op) ||
+		    put_user(compat_ptr(cs.dev_name), &s->dev_name))
+			break;
+		err = vzcalls_ioctl(file, VZCTL_VE_NETDEV, (unsigned long)s);
+		break;
+	}
+#endif
+	case VZCTL_COMPAT_VE_MEMINFO: {
+		struct compat_vzctl_ve_meminfo cs;
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void __user *)arg, sizeof(cs)))
+			break;
+		err = ve_set_meminfo(cs.veid, cs.val);
+		break;
+	}
+	default:
+		err = vzcalls_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo vzcalls = {
+	.type		= VZCTLTYPE,
+	.ioctl		= vzcalls_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_vzcalls_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+
+/**********************************************************************
+ **********************************************************************
+ *
+ * Init/exit stuff
+ *
+ **********************************************************************
+ **********************************************************************/
+
+static inline __init int init_vecalls_ioctls(void)
+{
+	vzioctl_register(&vzcalls);
+	return 0;
+}
+
+static inline void fini_vecalls_ioctls(void)
+{
+	vzioctl_unregister(&vzcalls);
+}
+
+static int __init vecalls_init(void)
+{
+	int err;
+
+	err = init_vecalls_proc();
+	if (err < 0)
+		goto out_proc;
+
+	err = init_vecalls_ioctls();
+	if (err < 0)
+		goto out_ioctls;
+
+	/*
+	 * This can also be dereferenced safely: a VE that has not
+	 * been freed holds a reference on the module.
+	 */
+
+	return 0;
+
+out_ioctls:
+	fini_vecalls_proc();
+out_proc:
+	return err;
+}
+
+static void __exit vecalls_exit(void)
+{
+	fini_vecalls_ioctls();
+	fini_vecalls_proc();
+}
+
+MODULE_AUTHOR("SWsoft <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo Control");
+MODULE_LICENSE("GPL v2");
+
+module_init(vecalls_init)
+module_exit(vecalls_exit)
--- /dev/null
+++ b/kernel/ve/veowner.c
@@ -0,0 +1,140 @@
+/*
+ *  kernel/ve/veowner.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/ipc.h>
+#include <linux/fs_struct.h>
+#include <linux/fs.h>
+#include <linux/proc_fs.h>
+#include <linux/file.h>
+#include <linux/mm.h>
+#include <linux/delay.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/list.h>
+#include <linux/inetdevice.h>
+#include <linux/pid_namespace.h>
+#include <linux/xattr.h>
+#include <asm/io.h>
+
+#include <net/tcp.h>
+
+/*
+ * ------------------------------------------------------------------------
+ * proc entries
+ * ------------------------------------------------------------------------
+ */
+
+#ifdef CONFIG_PROC_FS
+struct proc_dir_entry *proc_vz_dir;
+EXPORT_SYMBOL(proc_vz_dir);
+
+static int proc_fairsched_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static ssize_t proc_fairsched_read(struct file *file, char __user *buf,
+				   size_t size, loff_t *ppos)
+{
+	return 0;
+}
+
+static struct file_operations proc_fairsched_operations = {
+	.open		= proc_fairsched_open,
+	.read		= proc_fairsched_read,
+	.llseek		= noop_llseek,
+};
+
+static void prepare_proc(void)
+{
+	proc_vz_dir = proc_mkdir_mode("vz", S_ISVTX | S_IRUGO | S_IXUGO, NULL);
+	if (!proc_vz_dir)
+		panic("Can't create /proc/vz dir\n");
+
+	/*
+	 * Legacy files. They are not really needed and should be removed
+	 * sooner or later, but leave the stubs for now as they may be
+	 * required by userspace.
+	 */
+
+	proc_mkdir_mode("fairsched", 0, proc_vz_dir);
+
+	proc_create("fairsched", S_ISVTX, NULL, &proc_fairsched_operations);
+	proc_create("fairsched2", S_ISVTX, NULL, &proc_fairsched_operations);
+}
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * OpenVZ sysctl
+ * ------------------------------------------------------------------------
+ */
+
+/*
+ * Operations on a large number of mount points can take a lot of time.
+ * These operations take the global lock namespace_sem, so they can affect
+ * other containers. Let us allow no more than sysctl_ve_mount_nr mount
+ * points per VE.
+ */
+unsigned int sysctl_ve_mount_nr = 4096;
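+
+/*
+ * For illustration (not part of the original patch): with the fs_path
+ * table below, this knob is exposed as /proc/sys/fs/ve-mount-nr, so
+ * the limit can be raised from the host, e.g.:
+ *
+ *	# echo 8192 > /proc/sys/fs/ve-mount-nr
+ */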
+
+static struct ctl_table vz_fs_table[] = {
+	{
+		.procname	= "fsync-enable",
+		.data		= &ve0.fsync_enable,
+		.maxlen		= sizeof(int),
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= &proc_dointvec_virtual,
+	},
+	{
+		.procname       = "ve-mount-nr",
+		.data           = &sysctl_ve_mount_nr,
+		.maxlen         = sizeof(sysctl_ve_mount_nr),
+		.mode           = 0644,
+		.proc_handler   = proc_dointvec,
+	},
+	{ 0 }
+};
+
+static struct ctl_path fs_path[] = {
+	{ .procname = "fs", },
+	{ }
+};
+
+static void prepare_sysctl(void)
+{
+	register_sysctl_paths(fs_path, vz_fs_table);
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * XXX init_ve_system
+ * ------------------------------------------------------------------------
+ */
+
+void init_ve_system(void)
+{
+	struct task_struct *init_entry;
+	struct ve_struct *ve;
+	struct path root;
+
+	ve = get_ve0();
+
+	init_entry = init_pid_ns.child_reaper;
+
+	get_fs_root(init_entry->fs, &root);
+	ve->root_path = root;
+
+#ifdef CONFIG_PROC_FS
+	prepare_proc();
+#endif
+	prepare_sysctl();
+
+	kobj_ns_type_register(&ve_ns_type_operations);
+}
--- /dev/null
+++ b/kernel/ve/vzdev.c
@@ -0,0 +1,151 @@
+/*
+ *  kernel/ve/vzdev.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/vzctl.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <uapi/linux/vzcalluser.h>
+#include <asm/uaccess.h>
+#include <asm/pgalloc.h>
+#include <linux/device.h>
+
+#define VZCTL_MAJOR 126
+#define VZCTL_NAME "vzctl"
+
+MODULE_AUTHOR("SWsoft <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo Interface");
+MODULE_LICENSE("GPL v2");
+
+static LIST_HEAD(ioctls);
+static DEFINE_SPINLOCK(ioctl_lock);
+
+static struct vzioctlinfo *vzctl_get_handler(unsigned int cmd)
+{
+	struct vzioctlinfo *h;
+
+	spin_lock(&ioctl_lock);
+	list_for_each_entry(h, &ioctls, list) {
+		if (h->type == _IOC_TYPE(cmd))
+			goto found;
+	}
+	h = NULL;
+found:
+	if (h && !try_module_get(h->owner))
+		h = NULL;
+	spin_unlock(&ioctl_lock);
+	return h;
+}
+
+static void vzctl_put_handler(struct vzioctlinfo *h)
+{
+	if (!h)
+		return;
+
+	module_put(h->owner);
+}
+
+long vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct vzioctlinfo *h;
+	int err;
+
+	err = -ENOTTY;
+	h = vzctl_get_handler(cmd);
+	if (h && h->ioctl)
+		err = (*h->ioctl)(file, cmd, arg);
+	vzctl_put_handler(h);
+
+	return err;
+}
+
+long compat_vzctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct vzioctlinfo *h;
+	int err;
+
+	err = -ENOIOCTLCMD;
+	h = vzctl_get_handler(cmd);
+	if (h && h->compat_ioctl)
+		err = (*h->compat_ioctl)(file, cmd, arg);
+	vzctl_put_handler(h);
+
+	return err;
+}
+
+void vzioctl_register(struct vzioctlinfo *inf)
+{
+	spin_lock(&ioctl_lock);
+	list_add(&inf->list, &ioctls);
+	spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_register);
+
+void vzioctl_unregister(struct vzioctlinfo *inf)
+{
+	spin_lock(&ioctl_lock);
+	list_del_init(&inf->list);
+	spin_unlock(&ioctl_lock);
+}
+EXPORT_SYMBOL(vzioctl_unregister);
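+
+/*
+ * A minimal usage sketch (not part of the original code): a module
+ * that handles its own VZCTL ioctl type registers a vzioctlinfo once
+ * at init and unregisters it at exit; dispatch and module refcounting
+ * are then done by vzctl_get_handler()/vzctl_put_handler() above.
+ * "my_calls" and "my_ioctl" are hypothetical names:
+ *
+ *	static struct vzioctlinfo my_calls = {
+ *		.type	= VZCTLTYPE,	// _IOC_TYPE() of the handled cmds
+ *		.ioctl	= my_ioctl,
+ *		.owner	= THIS_MODULE,
+ *	};
+ *
+ *	vzioctl_register(&my_calls);	// in module_init()
+ *	vzioctl_unregister(&my_calls);	// in module_exit()
+ */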
+
+/*
+ * Init/exit stuff.
+ */
+static struct file_operations vzctl_fops = {
+	.owner		= THIS_MODULE,
+	.unlocked_ioctl	= vzctl_ioctl,
+	.compat_ioctl	= compat_vzctl_ioctl,
+};
+
+static struct class *vzctl_class;
+
+static void __exit vzctl_exit(void)
+{
+	device_destroy(vzctl_class, MKDEV(VZCTL_MAJOR, 0));
+	class_destroy(vzctl_class);
+	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+}
+
+static int __init vzctl_init(void)
+{
+	int ret;
+	struct device *class_err;
+
+	ret = register_chrdev(VZCTL_MAJOR, VZCTL_NAME, &vzctl_fops);
+	if (ret < 0)
+		goto out;
+
+	vzctl_class = class_create(THIS_MODULE, "vzctl");
+	if (IS_ERR(vzctl_class)) {
+		ret = PTR_ERR(vzctl_class);
+		goto out_cleandev;
+	}
+
+	class_err = device_create(vzctl_class, NULL,
+			MKDEV(VZCTL_MAJOR, 0), NULL, VZCTL_NAME);
+	if (IS_ERR(class_err)) {
+		ret = PTR_ERR(class_err);
+		goto out_rmclass;
+	}
+
+	goto out;
+
+out_rmclass:
+	class_destroy(vzctl_class);
+out_cleandev:
+	unregister_chrdev(VZCTL_MAJOR, VZCTL_NAME);
+out:
+	return ret;
+}
+
+module_init(vzctl_init);
+module_exit(vzctl_exit);
--- /dev/null
+++ b/kernel/ve/vzevent.c
@@ -0,0 +1,144 @@
+/*
+ *  kernel/ve/vzevent.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/skbuff.h>
+#include <net/sock.h>
+#include <linux/netlink.h>
+#include <linux/errno.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/vzevent.h>
+#include <linux/pid_namespace.h>
+
+#define NETLINK_UEVENT	31
+#define VZ_EVGRP_ALL	0x01
+
+static int reboot_event;
+module_param(reboot_event, int, 0644);
+MODULE_PARM_DESC(reboot_event, "Enable reboot events");
+
+/*
+ * NOTE: the original idea was to send events via kobject_uevent(),
+ * but it turned out to have negative consequences, such as starting
+ * /sbin/hotplug, which tries to react to our events in an inadequate
+ * manner.
+ */
+
+static struct sock *vzev_sock;
+
+static char *action_to_string(int action)
+{
+	switch (action) {
+	case VE_EVENT_MOUNT:
+		return "ve-mount";
+	case VE_EVENT_UMOUNT:
+		return "ve-umount";
+	case VE_EVENT_START:
+		return "ve-start";
+	case VE_EVENT_STOP:
+		return "ve-stop";
+	case VE_EVENT_REBOOT:
+		return "ve-reboot";
+	default:
+		return NULL;
+	}
+}
+
+static int do_vzevent_send(int event, char *msg, int len)
+{
+	struct sk_buff *skb;
+	char *buf, *action;
+	int alen;
+
+	action = action_to_string(event);
+	if (!action)
+		return -EINVAL;
+
+	alen = strlen(action);
+
+	skb = alloc_skb(len + 1 + alen, GFP_KERNEL);
+	if (!skb)
+		return -ENOMEM;
+
+	buf = skb_put(skb, len + 1 + alen);
+	memcpy(buf, action, alen);
+	buf[alen] = '@';
+	memcpy(buf + alen + 1, msg, len);
+	(void)netlink_broadcast(vzev_sock, skb, 0, VZ_EVGRP_ALL, GFP_KERNEL);
+	return 0;
+}
+
+int vzevent_send(int event, const char *attrs_fmt, ...)
+{
+	va_list args;
+	int len, err;
+	char *page;
+
+	err = -ENOMEM;
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		goto out;
+
+	va_start(args, attrs_fmt);
+	len = vscnprintf(page, PAGE_SIZE, attrs_fmt, args);
+	va_end(args);
+
+	err = do_vzevent_send(event, page, len);
+	free_page((unsigned long)page);
+out:
+	return err;
+}
+EXPORT_SYMBOL(vzevent_send);
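+
+/*
+ * For illustration (not part of the original code): do_vzevent_send()
+ * above broadcasts plain-text datagrams of the form "<action>@<attrs>"
+ * (e.g. "ve-start@101") on netlink protocol NETLINK_UEVENT (31),
+ * multicast group VZ_EVGRP_ALL.  A userspace listener would bind
+ * roughly as follows:
+ *
+ *	int s = socket(AF_NETLINK, SOCK_DGRAM, 31);
+ *	struct sockaddr_nl sa = {
+ *		.nl_family = AF_NETLINK,
+ *		.nl_groups = 0x01,	// VZ_EVGRP_ALL
+ *	};
+ *	bind(s, (struct sockaddr *)&sa, sizeof(sa));
+ *	// then recv() messages such as "ve-stop@<name>"
+ */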
+
+static int ve_start(void *data)
+{
+	struct ve_struct *ve;
+
+	ve = (struct ve_struct *)data;
+	vzevent_send(VE_EVENT_START, "%s", ve_name(ve));
+	return 0;
+}
+
+static void ve_stop(void *data)
+{
+	struct ve_struct *ve = data;
+	int event = VE_EVENT_STOP;
+
+	if (ve->ve_ns->pid_ns->reboot == SIGHUP && reboot_event)
+		event = VE_EVENT_REBOOT;
+
+	vzevent_send(event, "%s", ve_name(ve));
+}
+
+static struct ve_hook ve_start_stop_hook = {
+	.init		= ve_start,
+	.fini		= ve_stop,
+	.owner		= THIS_MODULE,
+	.priority	= HOOK_PRIO_AFTERALL,
+};
+
+static int __init init_vzevent(void)
+{
+	vzev_sock = netlink_kernel_create(&init_net, NETLINK_UEVENT, NULL);
+	if (vzev_sock == NULL)
+		return -ENOMEM;
+	ve_hook_register(VE_SS_CHAIN, &ve_start_stop_hook);
+	return 0;
+}
+
+static void __exit exit_vzevent(void)
+{
+	ve_hook_unregister(&ve_start_stop_hook);
+	netlink_kernel_release(vzev_sock);
+}
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
+
+module_init(init_vzevent);
+module_exit(exit_vzevent);
--- /dev/null
+++ b/kernel/ve/vziolimit.c
@@ -0,0 +1,519 @@
+/*
+ *  kernel/ve/vziolimit.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/virtinfo.h>
+#include <linux/vzctl.h>
+#include <linux/vziolimit.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <asm/uaccess.h>
+#include <bc/beancounter.h>
+
+struct throttle {
+	unsigned speed;		/* maximum speed, units per second */
+	unsigned burst;		/* maximum burst, units */
+	unsigned latency;	/* maximum wait delay, jiffies */
+	unsigned remain;	/* units/HZ */
+	unsigned long time;	/* wall time in jiffies */
+	long long state;	/* current state in units */
+};
+
+enum {
+	UB_CGROUP_IOLIMIT_SPEED		= 0,
+	UB_CGROUP_IOLIMIT_BURST		= 1,
+	UB_CGROUP_IOLIMIT_LATENCY	= 2,
+	UB_CGROUP_IOPSLIMIT_SPEED	= 3,
+	UB_CGROUP_IOPSLIMIT_BURST	= 4,
+	UB_CGROUP_IOPSLIMIT_LATENCY	= 5,
+};
+
+/**
+ * throttle_setup - set throttler initial state, externally serialized
+ * @th:		throttle to initialize
+ * @speed:	maximum speed (1/sec)
+ * @burst:	maximum burst chunk
+ * @latency:	maximum timeout (ms)
+ */
+static void throttle_setup(struct throttle *th, unsigned speed,
+		unsigned burst, unsigned latency)
+{
+	th->time = jiffies;
+	th->burst = burst;
+	th->latency = msecs_to_jiffies(latency);
+	wmb();
+	th->speed = speed;
+}
+
+/* externally serialized */
+static void throttle_charge(struct throttle *th, long long charge)
+{
+	unsigned long time, now = jiffies;
+	long long step, ceiling = charge + th->burst;
+
+	if (time_before(th->time, now)) {
+		step = (u64)th->speed * (now - th->time);
+		do_div(step, HZ);
+		step += th->state;
+		/* feed throttler as much as we can */
+		if (step <= ceiling)
+			th->state = step;
+		else if (th->state < ceiling)
+			th->state = ceiling;
+		th->time = now;
+	}
+
+	if (charge > th->state) {
+		charge -= th->state;
+		step = charge * HZ;
+		if (do_div(step, th->speed))
+			step++;
+		time = th->time + step;
+		/* limit maximum latency */
+		if (time_after(time, now + th->latency))
+			time = now + th->latency;
+		th->time = time;
+		step *= th->speed;
+		step += th->remain;
+		th->remain = do_div(step, HZ);
+		th->state += step;
+	}
+}
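+
+/*
+ * A worked example of the token-bucket arithmetic above (illustration
+ * only, numbers made up): with speed = 100 units/sec, burst = 50 and
+ * HZ = 1000, one idle second refills th->state by 100 units, capped at
+ * charge + burst.  If a charge of 120 arrives while th->state is 20,
+ * the deficit is 100 units, so th->time is pushed forward by
+ * 100 * HZ / speed = 1000 jiffies (bounded by th->latency), and
+ * th->state is advanced by what that refill will produce, keeping the
+ * sub-jiffy remainder in th->remain.
+ */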
+
+/* lockless */
+static unsigned long throttle_timeout(struct throttle *th, unsigned long now)
+{
+	unsigned long time;
+
+	if (!th->speed)
+		return 0;
+	rmb();
+	time = th->time;
+	if (time_before(time, now))
+		return 0;
+	return min(time - now, (unsigned long)th->latency);
+}
+
+struct iolimit {
+	struct throttle throttle;
+	struct throttle iops;
+	wait_queue_head_t wq;
+};
+
+static void iolimit_wait(struct iolimit *iolimit, unsigned long timeout)
+{
+	DEFINE_WAIT(wait);
+
+	do {
+		prepare_to_wait(&iolimit->wq, &wait,
+				TASK_KILLABLE | __TASK_IOTHROTTLED);
+		timeout = schedule_timeout(timeout);
+		if (fatal_signal_pending(current))
+			break;
+		if (unlikely(timeout))
+			timeout = min(throttle_timeout(&iolimit->throttle,
+						jiffies), timeout);
+	} while (timeout);
+	finish_wait(&iolimit->wq, &wait);
+}
+
+static unsigned long iolimit_timeout(struct iolimit *iolimit)
+{
+	unsigned long now = jiffies;
+
+	return max(throttle_timeout(&iolimit->throttle, now),
+			throttle_timeout(&iolimit->iops, now));
+}
+
+static void iolimit_balance_dirty(struct iolimit *iolimit,
+				  struct user_beancounter *ub,
+				  unsigned long write_chunk)
+{
+	struct throttle *th = &iolimit->throttle;
+	unsigned long flags, dirty, state;
+
+	if (!th->speed)
+		return;
+
+	/* may be non-atomic on i386, but that's OK: this is just a hint */
+	state = th->state >> PAGE_SHIFT;
+	dirty = ub_stat_get(ub, dirty_pages) + write_chunk;
+	/* protect against ub-stat percpu drift */
+	if (dirty + UB_STAT_BATCH * num_possible_cpus() < state)
+		return;
+	/* get the exact value for smooth throttling */
+	dirty = ub_stat_get_exact(ub, dirty_pages) + write_chunk;
+	if (dirty < state)
+		return;
+
+	spin_lock_irqsave(&ub->ub_lock, flags);
+	/* precharge dirty pages */
+	throttle_charge(th, (long long)dirty << PAGE_SHIFT);
+	spin_unlock_irqrestore(&ub->ub_lock, flags);
+}
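+
+/*
+ * For illustration (not part of the original code): on a 4K-page
+ * system, th->state >> PAGE_SHIFT is the current throttle budget in
+ * pages.  The cheap per-CPU dirty-page estimate is trusted while it is
+ * below that budget minus UB_STAT_BATCH * num_possible_cpus() of
+ * drift; only past that point do we pay for the exact counter and
+ * precharge the dirty pages against the throttle under ub_lock.
+ */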
+
+static int iolimit_virtinfo(struct vnotifier_block *nb,
+		unsigned long cmd, void *arg, int old_ret)
+{
+	struct user_beancounter *ub = get_exec_ub();
+	struct iolimit *iolimit = ub->iolimit;
+	unsigned long flags, timeout;
+	struct request_queue *q;
+
+	if (!iolimit)
+		return old_ret;
+
+	if (!iolimit->throttle.speed && !iolimit->iops.speed)
+		return NOTIFY_OK;
+
+	switch (cmd) {
+		case VIRTINFO_IO_ACCOUNT:
+			if (!iolimit->throttle.speed)
+				break;
+			spin_lock_irqsave(&ub->ub_lock, flags);
+			if (iolimit->throttle.speed) {
+				long long charge = *(size_t *)arg;
+
+				throttle_charge(&iolimit->throttle, charge);
+				iolimit->throttle.state -= charge;
+			}
+			spin_unlock_irqrestore(&ub->ub_lock, flags);
+			break;
+		case VIRTINFO_IO_FUSE_REQ:
+		case VIRTINFO_IO_OP_ACCOUNT:
+
+			if (!iolimit->iops.speed)
+				break;
+
+			q = (struct request_queue *) arg;
+			if (q)
+				blk_add_trace_msg(q, "vziolimit iops ub:%s speed:%d remain:%d ",
+						  ub->ub_name,iolimit->iops.speed,
+						  iolimit->iops.remain);
+
+			spin_lock_irqsave(&ub->ub_lock, flags);
+			if (iolimit->iops.speed) {
+				throttle_charge(&iolimit->iops, 1);
+				/*
+				 * Writeback doesn't use last iops from stash
+				 * to avoid choking future sync operations.
+				 */
+				if (iolimit->iops.state > 1 ||
+				    !(current->flags & PF_SWAPWRITE))
+					iolimit->iops.state--;
+			}
+			spin_unlock_irqrestore(&ub->ub_lock, flags);
+			break;
+		case VIRTINFO_IO_PREPARE:
+		case VIRTINFO_IO_JOURNAL:
+
+			if (current->flags & PF_SWAPWRITE)
+				break;
+
+			timeout = iolimit_timeout(iolimit);
+			q = (struct request_queue *) arg;
+			if (q)
+				blk_add_trace_msg(q, "vziolimit sleep ub:%s speed:%ld ",
+						  ub->ub_name, timeout);
+
+			if (timeout && !fatal_signal_pending(current))
+				iolimit_wait(iolimit, timeout);
+			break;
+		case VIRTINFO_IO_READAHEAD:
+		case VIRTINFO_IO_CONGESTION:
+			timeout = iolimit_timeout(iolimit);
+			if (timeout)
+				return NOTIFY_FAIL;
+			break;
+		case VIRTINFO_IO_BALANCE_DIRTY:
+			iolimit_balance_dirty(iolimit, ub, (unsigned long)arg);
+			break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct vnotifier_block iolimit_virtinfo_nb = {
+	.notifier_call = iolimit_virtinfo,
+};
+
+
+static void throttle_state(struct user_beancounter *ub,
+		struct throttle *throttle, struct iolimit_state *state)
+{
+	spin_lock_irq(&ub->ub_lock);
+	state->speed = throttle->speed;
+	state->burst = throttle->burst;
+	state->latency = jiffies_to_msecs(throttle->latency);
+	spin_unlock_irq(&ub->ub_lock);
+}
+
+static struct iolimit *iolimit_get(struct user_beancounter *ub)
+{
+	struct iolimit *iolimit = ub->iolimit;
+
+	if (iolimit)
+		return iolimit;
+
+	iolimit = kzalloc(sizeof(struct iolimit), GFP_KERNEL);
+	if (!iolimit)
+		return NULL;
+	init_waitqueue_head(&iolimit->wq);
+
+	spin_lock_irq(&ub->ub_lock);
+	if (ub->iolimit) {
+		kfree(iolimit);
+		iolimit = ub->iolimit;
+	} else
+		ub->iolimit = iolimit;
+	spin_unlock_irq(&ub->ub_lock);
+
+	return iolimit;
+}
+
+static int iolimit_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct user_beancounter *ub;
+	struct iolimit *iolimit;
+	struct iolimit_state state;
+	int err;
+
+	if (cmd != VZCTL_SET_IOLIMIT && cmd != VZCTL_GET_IOLIMIT &&
+	    cmd != VZCTL_SET_IOPSLIMIT && cmd != VZCTL_GET_IOPSLIMIT)
+		return -ENOTTY;
+
+	if (copy_from_user(&state, (void __user *)arg, sizeof(state)))
+		return -EFAULT;
+
+	ub = get_beancounter_byuid(state.id, 0);
+	if (!ub)
+		return -ENOENT;
+
+	iolimit = ub->iolimit;
+
+	switch (cmd) {
+		case VZCTL_SET_IOLIMIT:
+			iolimit = iolimit_get(ub);
+			err = -ENOMEM;
+			if (!iolimit)
+				break;
+			spin_lock_irq(&ub->ub_lock);
+			throttle_setup(&iolimit->throttle, state.speed,
+					state.burst, state.latency);
+			spin_unlock_irq(&ub->ub_lock);
+			wake_up_all(&iolimit->wq);
+			err = 0;
+			break;
+		case VZCTL_SET_IOPSLIMIT:
+			iolimit = iolimit_get(ub);
+			err = -ENOMEM;
+			if (!iolimit)
+				break;
+			spin_lock_irq(&ub->ub_lock);
+			throttle_setup(&iolimit->iops, state.speed,
+					state.burst, state.latency);
+			spin_unlock_irq(&ub->ub_lock);
+			wake_up_all(&iolimit->wq);
+			err = 0;
+			break;
+		case VZCTL_GET_IOLIMIT:
+			err = -ENXIO;
+			if (!iolimit)
+				break;
+			throttle_state(ub, &iolimit->throttle, &state);
+			err = -EFAULT;
+			if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+				break;
+			err = 0;
+			break;
+		case VZCTL_GET_IOPSLIMIT:
+			err = -ENXIO;
+			if (!iolimit)
+				break;
+			throttle_state(ub, &iolimit->iops, &state);
+			err = -EFAULT;
+			if (copy_to_user((void __user *)arg, &state, sizeof(state)))
+				break;
+			err = 0;
+			break;
+		default:
+			err = -ENOTTY;
+	}
+
+	put_beancounter(ub);
+	return err;
+}
+
+static struct vzioctlinfo iolimit_vzioctl = {
+	.type		= VZIOLIMITTYPE,
+	.ioctl		= iolimit_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= iolimit_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
+
+static ssize_t iolimit_cgroup_read(struct cgroup *cg, struct cftype *cft,
+			      struct file *file, char __user *buf,
+			      size_t nbytes, loff_t *ppos)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct iolimit *iolimit = ub->iolimit;
+	unsigned long val = 0;
+	int len;
+	char str[32];
+
+	if (!iolimit)
+		goto out;
+
+	spin_lock_irq(&ub->ub_lock);
+	switch (cft->private) {
+	case UB_CGROUP_IOLIMIT_SPEED:
+		val = iolimit->throttle.speed;
+		break;
+	case UB_CGROUP_IOLIMIT_BURST:
+		val = iolimit->throttle.burst;
+		break;
+	case UB_CGROUP_IOLIMIT_LATENCY:
+		val = iolimit->throttle.latency;
+		break;
+
+	case UB_CGROUP_IOPSLIMIT_SPEED:
+		val = iolimit->iops.speed;
+		break;
+	case UB_CGROUP_IOPSLIMIT_BURST:
+		val = iolimit->iops.burst;
+		break;
+	case UB_CGROUP_IOPSLIMIT_LATENCY:
+		val = iolimit->iops.latency;
+		break;
+	default:
+		BUG();
+	}
+	spin_unlock_irq(&ub->ub_lock);
+out:
+	len = scnprintf(str, sizeof(str), "%lu\n", val);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int iolimit_cgroup_write_u64(struct cgroup *cg, struct cftype *cft, u64 val)
+{
+	struct user_beancounter *ub = cgroup_ub(cg);
+	struct iolimit *iolimit;
+
+	iolimit = iolimit_get(ub);
+	if (!iolimit)
+		return -ENOMEM;
+
+	spin_lock_irq(&ub->ub_lock);
+	iolimit->throttle.time = iolimit->iops.time = jiffies;
+
+	switch (cft->private) {
+	case UB_CGROUP_IOLIMIT_SPEED:
+		wmb();
+		iolimit->throttle.speed = val;
+		break;
+	case UB_CGROUP_IOPSLIMIT_SPEED:
+		wmb();
+		iolimit->iops.speed = val;
+		break;
+	case UB_CGROUP_IOLIMIT_BURST:
+		iolimit->throttle.burst = val;
+		break;
+	case UB_CGROUP_IOLIMIT_LATENCY:
+		iolimit->throttle.latency = val;
+		break;
+	case UB_CGROUP_IOPSLIMIT_BURST:
+		iolimit->iops.burst = val;
+		break;
+	case UB_CGROUP_IOPSLIMIT_LATENCY:
+		iolimit->iops.latency = val;
+		break;
+	default:
+		BUG();
+	}
+	wake_up_all(&iolimit->wq);
+	spin_unlock_irq(&ub->ub_lock);
+	return 0;
+}
+
+static struct cftype vziolimit_cftypes[] = {
+	{
+		.name = "iolimit.speed",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOLIMIT_SPEED,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iolimit.burst",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOLIMIT_BURST,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iolimit.latency",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOLIMIT_LATENCY,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+
+	{
+		.name = "iopslimit.speed",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOPSLIMIT_SPEED,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iopslimit.burst",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOPSLIMIT_BURST,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{
+		.name = "iopslimit.latency",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.private = UB_CGROUP_IOPSLIMIT_LATENCY,
+		.read = iolimit_cgroup_read,
+		.write_u64 = iolimit_cgroup_write_u64,
+	},
+	{ }
+};
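+
+/*
+ * Usage sketch (not part of the original code), assuming the ub cgroup
+ * hierarchy is mounted at /sys/fs/cgroup/beancounter: the files above
+ * make the limits settable per beancounter, e.g. to cap a container at
+ * 10 MB/s with a 1 MB burst and a 3 s latency ceiling:
+ *
+ *	# cd /sys/fs/cgroup/beancounter/<ctid>
+ *	# echo 10485760 > iolimit.speed
+ *	# echo 1048576 > iolimit.burst
+ *	# echo 3000 > iolimit.latency
+ */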
+
+static int __init iolimit_init(void)
+{
+	int err;
+	virtinfo_notifier_register(VITYPE_IO, &iolimit_virtinfo_nb);
+	vzioctl_register(&iolimit_vzioctl);
+	err = cgroup_add_cftypes(&ub_subsys, vziolimit_cftypes);
+	if (err)
+		goto err_cgroup;
+	return 0;
+
+err_cgroup:
+	vzioctl_unregister(&iolimit_vzioctl);
+	virtinfo_notifier_unregister(VITYPE_IO, &iolimit_virtinfo_nb);
+	return err;
+}
+
+static void __exit iolimit_exit(void)
+{
+	cgroup_rm_cftypes(&ub_subsys, vziolimit_cftypes);
+	vzioctl_unregister(&iolimit_vzioctl);
+	virtinfo_notifier_unregister(VITYPE_IO, &iolimit_virtinfo_nb);
+}
+
+module_init(iolimit_init)
+module_exit(iolimit_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vzlist.c
@@ -0,0 +1,303 @@
+/*
+ *  kernel/ve/vzlist.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/in.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/ve.h>
+#include <linux/venet.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzlist.h>
+#include <linux/vmalloc.h>
+#include <linux/ve_proto.h>
+#include <linux/veip.h>
+#include <linux/uaccess.h>
+#include <linux/pid_namespace.h>
+
+static DEFINE_SEMAPHORE(vzlist_sem);
+
+static int get_veids(struct vzlist_veidctl *s)
+{
+	int ret;
+	int ves;
+	unsigned long size;
+	envid_t *buf;
+	struct ve_struct *ve;
+
+	ves = nr_ve + 1;
+	if (!s->num || s->id == NULL)
+		return ves;
+
+	down(&vzlist_sem);
+again:
+	size = (ves + 20)*sizeof(envid_t);
+	ret = -ENOMEM;
+	buf = vmalloc(size);
+	if (!buf)
+		goto out_oom;
+
+	ves = 0;
+	mutex_lock(&ve_list_lock);
+	for_each_ve(ve) {
+		if (size >= (ves + 1)*sizeof(envid_t))
+			buf[ves] = ve->veid;
+		ves++;
+	}
+	mutex_unlock(&ve_list_lock);
+
+	ret = ves;
+	if (ves > s->num)
+		goto out;
+	if (size < ves*sizeof(envid_t)) {
+		vfree(buf);
+		goto again;
+	}
+	if (copy_to_user(s->id, buf, ves*sizeof(envid_t)))
+		ret = -EFAULT;
+	/* success */
+out:
+	vfree(buf);
+out_oom:
+	up(&vzlist_sem);
+	return ret;
+}
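+
+/*
+ * For illustration (not part of the original code): get_veids()
+ * implements a two-call protocol.  Userspace may first call
+ * VZCTL_GET_VEIDS with num == 0 (or id == NULL) to learn the current
+ * number of VEs, then allocate an envid_t array and call again; a
+ * return value larger than s->num means the set grew in between and
+ * the call should be retried with a bigger array.
+ */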
+
+static int get_vepids(struct vzlist_vepidctl *s)
+{
+	int ret;
+	int tasks = 0;
+	unsigned long size;
+	envid_t *buf;
+	struct ve_struct *ve;
+	struct task_struct *tsk;
+	struct pid_namespace *ns;
+	int nr;
+
+	ret = -ESRCH;
+	ve = get_ve_by_id(s->veid);
+	if (!ve)
+		goto out_no_ve;
+	ns = ve->ve_ns->pid_ns;
+
+	down(&vzlist_sem);
+again:
+	size = (tasks + 512)*(2*sizeof(pid_t));
+	ret = -ENOMEM;
+	buf = vmalloc(size);
+	if (!buf)
+		goto out_oom;
+
+	tasks = 0;
+	read_lock(&tasklist_lock);
+	nr = next_pidmap(ns, 0);
+	while (nr > 0) {
+		rcu_read_lock();
+
+		tsk = pid_task(find_pid_ns(nr, ns), PIDTYPE_PID);
+		if (tsk) {
+			if (size >= (tasks + 1)*(2*sizeof(pid_t))) {
+				buf[2*tasks] = tsk->pid;
+				buf[2*tasks + 1] = task_pid_nr_ns(tsk, ns);
+			}
+			tasks++;
+		}
+
+		rcu_read_unlock();
+		nr = next_pidmap(ns, nr);
+	}
+	read_unlock(&tasklist_lock);
+
+	ret = tasks;
+	if (tasks > s->num || !tasks)
+		goto out;
+	if (size < tasks*(2*sizeof(pid_t))) {
+		vfree(buf);
+		goto again;
+	}
+	if (copy_to_user(s->pid, buf, tasks*(2*sizeof(pid_t))))
+		ret = -EFAULT;
+	/* success */
+out:
+	vfree(buf);
+out_oom:
+	up(&vzlist_sem);
+	put_ve(ve);
+out_no_ve:
+	return ret;
+}
+
+static int get_veips(struct vzlist_veipctl *s, unsigned int cmd)
+{
+	int ret;
+	int ips;
+	unsigned long size;
+	u32 *buf, *pos;
+	struct ve_struct *ve;
+	struct veip_struct *veip;
+	struct ip_entry_struct *entry;
+	struct ve_addr_struct *addr;
+
+	ret = -ESRCH;
+	ve = get_ve_by_id(s->veid);
+	if (!ve)
+		goto out_no_ve;
+
+	size = PAGE_SIZE;
+	down(&vzlist_sem);
+again:
+	ret = -ENOMEM;
+	buf = vmalloc(size);
+	if (!buf)
+		goto out_oom;
+
+	ips = 0;
+#if defined(CONFIG_VE_NETDEV) || defined(CONFIG_VE_NETDEV_MODULE)
+	rcu_read_lock();
+	veip = ACCESS_ONCE(ve->veip);
+	if (veip == NULL)
+		goto noip;
+
+	pos = buf;
+	list_for_each_entry_rcu(entry, &veip->ip_lh, ve_list) {
+		if (entry->active_env == NULL)
+			continue;
+
+		addr = &entry->addr;
+
+		if (cmd == VZCTL_GET_VEIPS && addr->family == AF_INET) {
+			if (size >= (ips + 1) * sizeof(addr->key[3])) {
+				pos[0] = addr->key[3];
+				pos++;
+			}
+			ips++;
+		}
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		if (cmd == VZCTL_GET_VEIP6S && addr->family == AF_INET6) {
+			if (size >= (ips + 1) * sizeof(addr->key)) {
+				memcpy(pos, addr->key, sizeof(addr->key));
+				pos += 4;
+			}
+			ips++;
+		}
+#endif
+	}
+noip:
+	rcu_read_unlock();
+#endif
+
+	ret = ips;
+	if (ips > s->num)
+		goto out;
+
+	if (cmd == VZCTL_GET_VEIPS) {
+		if (size < ips * sizeof(u32)) {
+			size = ips * sizeof(u32);
+			vfree(buf);
+			goto again;
+		}
+		if (copy_to_user(s->ip, buf, ips * sizeof(u32)))
+			ret = -EFAULT;
+	}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+	else {
+		if (size < ips * sizeof(u32) * 4) {
+			size = ips * sizeof(u32) * 4;
+			vfree(buf);
+			goto again;
+		}
+		if (copy_to_user(s->ip, buf, ips * sizeof(u32) * 4))
+			ret = -EFAULT;
+	}
+#endif
+	/* success */
+out:
+	vfree(buf);
+out_oom:
+	up(&vzlist_sem);
+	put_ve(ve);
+out_no_ve:
+	return ret;
+}
+
+static int vzlist_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	int err = -ENOTTY;
+	void __user *argp = (void __user *)arg;
+
+	switch (cmd) {
+	case VZCTL_GET_VEIDS: {
+			struct vzlist_veidctl s;
+
+			if (arg) {
+				err = -EFAULT;
+				if (copy_from_user(&s, argp, sizeof(s)))
+					break;
+				err = get_veids(&s);
+			} else
+				err = nr_ve;
+		}
+		break;
+	case VZCTL_GET_VEPIDS: {
+			struct vzlist_vepidctl s;
+
+			err = -EFAULT;
+			if (copy_from_user(&s, argp, sizeof(s)))
+				break;
+			err = get_vepids(&s);
+		}
+		break;
+	case VZCTL_GET_VEIP6S:
+	case VZCTL_GET_VEIPS: {
+			struct vzlist_veipctl s;
+
+			err = -EFAULT;
+			if (copy_from_user(&s, argp, sizeof(s)))
+				break;
+			err = get_veips(&s, cmd);
+		}
+		break;
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+static int vzlist_ioctl_compat(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	/* do we need this? */
+	return -ENOTTY;
+}
+#endif
+
+static struct vzioctlinfo vzid_calls = {
+	.type		= VZLISTTYPE,
+	.ioctl		= vzlist_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= vzlist_ioctl_compat,
+#endif
+	.owner		= THIS_MODULE
+};
+
+static int __init init_vzlist(void)
+{
+	vzioctl_register(&vzid_calls);
+	return 0;
+}
+
+static void __exit exit_vzlist(void)
+{
+	vzioctl_unregister(&vzid_calls);
+}
+
+module_init(init_vzlist);
+module_exit(exit_vzlist);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vznetstat/Makefile
@@ -0,0 +1,8 @@
+#
+# kernel/ve/vznetstat/Makefile
+#
+# Copyright (c) 2005-2008 SWsoft
+# Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+#
+
+obj-$(CONFIG_VE_NETDEV_ACCOUNTING) += vznetstat.o ip_vznetstat.o ip6_vznetstat.o
--- /dev/null
+++ b/kernel/ve/vznetstat/ip6_vznetstat.c
@@ -0,0 +1,102 @@
+/*
+ *  kernel/ve/vznetstat/ip6_vznetstat.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Networking statistics for IPv6
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/if.h>
+#include <linux/netdevice.h>
+#include <linux/vznetstat.h>
+
+static unsigned int
+venet_acct_in_hook_v6(const struct nf_hook_ops *hook,
+		      struct sk_buff *skb,
+		      const struct net_device *in,
+		      const struct net_device *out,
+		      const struct nf_hook_state *state)
+{
+	int res = NF_ACCEPT;
+
+	if (in->flags & IFF_LOOPBACK)
+		goto out;
+
+	venet_acct_classify_add_incoming(in->nd_net->owner_ve->stat, skb);
+out:
+	return res;
+}
+
+static unsigned int
+venet_acct_out_hook_v6(const struct nf_hook_ops *hook,
+		    struct sk_buff *skb,
+		    const struct net_device *in,
+		    const struct net_device *out,
+		    const struct nf_hook_state *state)
+{
+	int res = NF_ACCEPT;
+
+	if (out->flags & IFF_LOOPBACK)
+		goto out;
+
+	skb->protocol = __constant_htons(ETH_P_IPV6);
+	venet_acct_classify_add_outgoing(out->nd_net->owner_ve->stat, skb);
+out:
+	return res;
+}
+
+static struct nf_hook_ops venet_acct_in_ops = {
+	.hook		= venet_acct_in_hook_v6,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET6,
+	.hooknum	= NF_INET_LOCAL_IN,
+	.priority	= NF_IP6_PRI_FIRST,
+};
+
+static struct nf_hook_ops venet_acct_out_ops = {
+	.hook		= venet_acct_out_hook_v6,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET6,
+	.hooknum	= NF_INET_LOCAL_OUT,
+	.priority	= NF_IP6_PRI_LAST,
+};
+
+int __init ip6_venetstat_init(void)
+{
+	int ret;
+
+	ret = nf_register_hook(&venet_acct_in_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_register_hook(&venet_acct_out_ops);
+	if (ret < 0) {
+		nf_unregister_hook(&venet_acct_in_ops);
+		return ret;
+	}
+
+	return 0;
+}
+
+void __exit ip6_venetstat_exit(void)
+{
+	nf_unregister_hook(&venet_acct_out_ops);
+	nf_unregister_hook(&venet_acct_in_ops);
+}
+
+module_init(ip6_venetstat_init);
+module_exit(ip6_venetstat_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vznetstat/ip_vznetstat.c
@@ -0,0 +1,170 @@
+/*
+ *  kernel/ve/vznetstat/ip_vznetstat.c
+ *
+ *  Copyright (c) 2004-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Networking statistics for IPv4.
+ */
+
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <net/ip.h>
+#include <linux/module.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/venet.h>
+#include <linux/vznetstat.h>
+
+#define VZNS_DEBUG 0
+
+static unsigned int venet_acct_in_hook(const struct nf_hook_ops *hook,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       const struct nf_hook_state *state)
+{
+	int res;
+
+	res = NF_ACCEPT;
+
+	/* Skip loopback dev */
+	if (in == dev_net(in)->loopback_dev)
+		goto out;
+
+#if VZNS_DEBUG
+	printk("%s: in %s, out %s, size %d, in->owner_env=%s\n",
+		 __func__, in ? in->name : NULL, out ? out->name : NULL,
+		 venet_acct_skb_size(skb),
+		 in ? in->nd_net->owner_ve->ve_name : "NULL");
+#endif
+
+	/*
+	 * Basically, pskb_may_pull() isn't necessary here, because it's done
+	 * in ip_rcv() before calling NF_IP_PRE_ROUTING NF_HOOK, but let's
+	 * have some insurance for the future.
+	 */
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+		goto out_hdr_error;
+
+	venet_acct_classify_add_incoming(in->nd_net->owner_ve->stat, skb);
+
+out:
+	return res;
+
+out_hdr_error:
+	if (net_ratelimit())
+		printk("%s: IN accounting: IP header error\n", in->name);
+	res = NF_DROP;
+	goto out;
+}
+
+static unsigned int venet_acct_out_hook(const struct nf_hook_ops *hook,
+				        struct sk_buff *skb,
+				        const struct net_device *in,
+				        const struct net_device *out,
+				        const struct nf_hook_state *state)
+{
+	int res;
+
+	res = NF_ACCEPT;
+
+	/* Skip loopback dev */
+	if (out == dev_net(out)->loopback_dev)
+		goto out;
+
+	/* Paranoia */
+	if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
+		goto out_hdr_error;
+
+#if VZNS_DEBUG
+	printk("%s: in %s, out %s, size %d, out->owner_env=%s\n",
+		 __func__, in ? in->name : NULL, out ? out->name : NULL,
+		 venet_acct_skb_size(skb),
+		 out ? out->nd_net->owner_ve->ve_name : "NULL");
+#endif
+
+	/*
+	 * Basically, kproxy accounts packets entering it with
+	 * kp_account_check_in() and packets leaving it with
+	 * kp_account_check_out(), in both directions, from VE and to VE.
+	 *
+	 * So, for packets outgoing from a VE, kp_account_check_in() at the
+	 * kproxy entrance subtracts the packet from accounting, and then
+	 * kp_account_check_out() adds it back. Thus, we need not worry
+	 * about double accounting here.
+	 *
+	 * All of kproxy's accounting cannot be moved into this module,
+	 * since the traffic volume between kproxy and the outside world is
+	 * a bit different from the traffic volume between the VE and kproxy.
+	 */
+	skb->protocol = __constant_htons(ETH_P_IP);
+	venet_acct_classify_add_outgoing(out->nd_net->owner_ve->stat, skb);
+
+out:
+	return res;
+
+out_hdr_error:
+	if (net_ratelimit())
+		printk("%s: OUT accounting: IP header error\n", out->name);
+	res = NF_DROP;
+	goto out;
+}
+
+static struct nf_hook_ops venet_acct_in_ops = {
+	.hook		= venet_acct_in_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_INET_LOCAL_IN,
+	.priority	= NF_IP_PRI_FIRST,
+};
+
+static struct nf_hook_ops venet_acct_out_ops = {
+	.hook		= venet_acct_out_hook,
+	.owner		= THIS_MODULE,
+	.pf		= PF_INET,
+	.hooknum	= NF_INET_LOCAL_OUT,
+	.priority	= NF_IP_PRI_LAST,
+};
+
+int __init ip_venetstat_init(void)
+{
+	int ret;
+
+	ret = nf_register_hook(&venet_acct_in_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_register_hook(&venet_acct_out_ops);
+	if (ret < 0) {
+		nf_unregister_hook(&venet_acct_in_ops);
+		return ret;
+	}
+
+	return 0;
+}
+
+void __exit ip_venetstat_exit(void)
+{
+	nf_unregister_hook(&venet_acct_out_ops);
+	nf_unregister_hook(&venet_acct_in_ops);
+}
+
+#if defined(MODULE) && defined(VZ_AUDIT)
+VZ_AUDIT;
+#endif
+module_init(ip_venetstat_init);
+module_exit(ip_venetstat_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vznetstat/vznetstat.c
@@ -0,0 +1,1170 @@
+/*
+ *  kernel/ve/vznetstat/vznetstat.c
+ *
+ *  Copyright (c) 2004-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * Networking statistics.
+ * Traffic classes support.
+ * Persistent storage (independent of the VE struct lifetime).
+ */
+
+#include <linux/sched.h>
+#include <linux/skbuff.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <asm/uaccess.h>
+#include <net/ip.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+
+#include <linux/ve.h>
+#include <linux/venet.h>
+#include <linux/vznetstat.h>
+#include <linux/vzctl.h>
+#include <uapi/linux/vzctl_netstat.h>
+#include <uapi/linux/vzcalluser.h>
+
+/*
+ * ---------------------------------------------------------------------------
+ * Traffic classes storage
+ * ---------------------------------------------------------------------------
+ */
+
+static int stat_num = 0;
+static DEFINE_RWLOCK(tc_lock);
+
+struct class_info_set {
+	unsigned int len;
+	union {
+		struct vz_tc_class_info info_v4[0];
+		struct vz_tc_class_info_v6 info_v6[0];
+		char data[0];
+	};
+};
+
+static struct class_info_set *info_v4 = NULL;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static struct class_info_set *info_v6 = NULL;
+#endif
+
+/* v6: whether the classes being set are IPv6 or IPv4 */
+static int venet_acct_set_classes(const void __user *user_info, int length, int v6)
+{
+	struct class_info_set *info, *old;
+	int size;
+	int err, i;
+
+	if (v6)
+		size = sizeof(struct vz_tc_class_info_v6);
+	else
+		size = sizeof(struct vz_tc_class_info);
+
+	info = kmalloc(sizeof(struct class_info_set) + size * length, GFP_KERNEL);
+	if (info == NULL)
+		return -ENOMEM;
+
+	err = -EFAULT;
+	info->len = length;
+	if (copy_from_user(info->data, user_info, size * length))
+		goto out_free;
+
+	/* Verify incoming data */
+	err = -EINVAL;
+	for (i = 0; i < length; i++) {
+		unsigned int cid;
+
+		if (v6)
+			cid = info->info_v6[i].cid;
+		else
+			cid = info->info_v4[i].cid;
+
+		if (cid >= TC_CLASS_MAX)
+			goto out_free;
+	}
+
+	rcu_read_lock();
+	if (v6) {
+		old = rcu_dereference(info_v6);
+		rcu_assign_pointer(info_v6, info);
+	} else {
+		old = rcu_dereference(info_v4);
+		rcu_assign_pointer(info_v4, info);
+	}
+	rcu_read_unlock();
+
+	synchronize_net();
+	/*
+	 * IMPORTANT: the statistics collected so far are deliberately
+	 * not reset here.
+	 */
+	kfree(old);
+	return 0;
+
+out_free:
+	kfree(info);
+	return err;
+}
+
+/* all records */
+static int venet_acct_get_classes(void __user *ret, int length, int v6)
+{
+	void *info;
+	struct class_info_set *rinfo;
+	int len, err;
+	unsigned int size;
+
+	if (v6)
+		size = sizeof(struct vz_tc_class_info_v6);
+	else
+		size = sizeof(struct vz_tc_class_info);
+
+	/* allocated up front: no GFP_KERNEL allocation under the lock below */
+	info = kmalloc(size * length, GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	rcu_read_lock();
+	if (v6)
+		rinfo = rcu_dereference(info_v6);
+	else
+		rinfo = rcu_dereference(info_v4);
+
+	len = min(length, (int)rinfo->len);
+	memcpy(info, rinfo->data, size * len);
+	rcu_read_unlock();
+
+	err = -EFAULT;
+	if (!copy_to_user(ret, info, size * len))
+		err = len;
+	kfree(info);
+	return err;
+}
+
+static inline int class_info_len(int v6)
+{
+	int ret = 0;
+	struct class_info_set *info;
+
+	rcu_read_lock();
+	if (v6)
+		info = rcu_dereference(info_v6);
+	else
+		info = rcu_dereference(info_v4);
+
+	if (info)
+		ret = info->len;
+	rcu_read_unlock();
+
+	return ret;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * Persistent statistics storage
+ * ---------------------------------------------------------------------------
+ */
+
+/* The hash does not need to be good right now; it is only used by user space */
+#define STAT_HASH_LEN	128
+
+static struct list_head stat_hash_list[STAT_HASH_LEN];
+static int stat_hash(envid_t veid)
+{
+	return veid & (STAT_HASH_LEN - 1);
+}
+
+/* tc_lock is taken by the caller! */
+static inline struct venet_stat *__find(envid_t veid)
+{
+	int hash;
+	struct venet_stat *ptr;
+
+	hash = stat_hash(veid);
+	list_for_each_entry(ptr, stat_hash_list + hash, list) {
+		if (ptr->veid == veid)
+			return ptr;
+	}
+	return NULL;
+}
+
+static struct venet_stat *next_stat(int *hash, struct venet_stat *item)
+{
+	struct list_head *ptr;
+
+	ptr = item != NULL ? &item->list : (stat_hash_list + *hash);
+	while (*hash < STAT_HASH_LEN) {
+		if (ptr->next != stat_hash_list + *hash)
+			return list_entry(ptr->next, struct venet_stat, list);
+		(*hash)++;
+		ptr = stat_hash_list + *hash;
+	}
+	return NULL;
+}
+
+struct venet_stat *venet_acct_find_create_stat(envid_t veid)
+{
+	struct venet_stat *ptr;
+	unsigned long flags;
+	struct venet_stat *stat;
+
+	read_lock(&tc_lock);
+	ptr = __find(veid);
+	if (ptr != NULL) {
+		venet_acct_get_stat(ptr);
+		read_unlock(&tc_lock);
+		return ptr;
+	}
+	read_unlock(&tc_lock);
+
+	ptr = kzalloc(sizeof(struct venet_stat), GFP_KERNEL);
+	if (ptr == NULL)
+		goto out;
+	ptr->veid = veid;
+
+	ptr->ipv4_stat = alloc_percpu(struct acct_stat);
+	if (ptr->ipv4_stat == NULL)
+		goto out_free;
+
+	ptr->ipv6_stat = alloc_percpu(struct acct_stat);
+	if (ptr->ipv6_stat == NULL)
+		goto out_free_v4;
+
+	write_lock_irqsave(&tc_lock, flags);
+	stat = __find(veid);
+	if (stat != NULL) {
+		free_percpu(ptr->ipv6_stat);
+		free_percpu(ptr->ipv4_stat);
+		kfree(ptr);
+		ptr = stat;
+	} else {
+		list_add(&ptr->list, stat_hash_list + stat_hash(veid));
+		stat_num++;
+	}
+	venet_acct_get_stat(ptr);
+	write_unlock_irqrestore(&tc_lock, flags);
+	return ptr;
+
+out_free_v4:
+	free_percpu(ptr->ipv4_stat);
+out_free:
+	kfree(ptr);
+out:
+	return NULL;
+}
+
+struct venet_stat *venet_acct_find_stat(envid_t veid)
+{
+	struct venet_stat *ptr;
+
+	read_lock(&tc_lock);
+	ptr = __find(veid);
+	if (ptr != NULL)
+		venet_acct_get_stat(ptr);
+	read_unlock(&tc_lock);
+	return ptr;
+}
+
+void venet_acct_put_stat(struct venet_stat *stat)
+{
+	if (stat == NULL)
+		return;
+	atomic_dec(&stat->users);
+}
+
+static inline struct acct_stat *
+__choose_acct(struct venet_stat *stat, int v6)
+{
+	if (v6)
+		return stat->ipv6_stat;
+	else
+		return stat->ipv4_stat;
+}
+
+/*
+ * v6: whether IPv6 or IPv4 statistics are requested
+ * returns an array of counters, indexed by traffic class
+ */
+static int venet_acct_get_ve_stat(struct vzctl_tc_get_stat *data, int v6)
+{
+	struct venet_stat *stat;
+	void *buf;
+	u64 *incoming, *outgoing;
+	u32 *incoming_pkt, *outgoing_pkt;
+	int err, size, cpu;
+	struct acct_stat *acct;
+
+	if (data->length < 0 || data->length > TC_CLASS_MAX)
+		return -EINVAL;
+
+	buf = kzalloc(2 * TC_CLASS_MAX * (sizeof(u64) + sizeof(u32)), GFP_KERNEL);
+	if (buf == NULL)
+		return -ENOMEM;
+
+	incoming = buf;
+	outgoing = incoming + TC_CLASS_MAX;
+	incoming_pkt = (u32 *)(outgoing + TC_CLASS_MAX);
+	outgoing_pkt = incoming_pkt + TC_CLASS_MAX;
+
+	read_lock(&tc_lock);
+	err = -ESRCH;
+	stat = __find(data->veid);
+	if (stat == NULL)
+		goto out_unlock;
+
+	acct = __choose_acct(stat, v6);
+
+	for_each_possible_cpu(cpu) {
+		struct acct_stat *stat;
+		int i;
+
+		stat = per_cpu_ptr(acct, cpu);
+		for (i = 0; i < data->length; i++) {
+			incoming[i] += stat->cnt[i][ACCT_IN].bytes;
+			outgoing[i] += stat->cnt[i][ACCT_OUT].bytes;
+			incoming_pkt[i] += stat->cnt[i][ACCT_IN].pkts;
+			outgoing_pkt[i] += stat->cnt[i][ACCT_OUT].pkts;
+		}
+	}
+
+	read_unlock(&tc_lock);
+
+	err = -EFAULT;
+	size = data->length * sizeof(u64);
+	if (copy_to_user(data->incoming, incoming, size))
+		goto out_free;
+	if (copy_to_user(data->outgoing, outgoing, size))
+		goto out_free;
+	size = data->length * sizeof(u32);
+	if (copy_to_user(data->incoming_pkt, incoming_pkt, size))
+		goto out_free;
+	if (copy_to_user(data->outgoing_pkt, outgoing_pkt, size))
+		goto out_free;
+
+	err = data->length;
+
+out_free:
+	kfree(buf);
+	return err;
+
+out_unlock:
+	read_unlock(&tc_lock);
+	goto out_free;
+}
+
+static int __tc_destroy_stat(struct venet_stat *stat)
+{
+	if (atomic_read(&stat->users))
+		return -EBUSY;
+	stat_num--;
+	list_del(&stat->list);
+	free_percpu(stat->ipv6_stat);
+	free_percpu(stat->ipv4_stat);
+	kfree(stat);
+	return 0;
+}
+
+/* cleans up the counter and removes it from memory if the VE no longer exists */
+static int venet_acct_destroy_stat(envid_t veid)
+{
+	struct venet_stat *stat;
+	int err;
+
+	err = -ESRCH;
+	write_lock_irq(&tc_lock);
+	stat = __find(veid);
+	if (stat != NULL)
+		err = __tc_destroy_stat(stat);
+	write_unlock_irq(&tc_lock);
+	return err;
+}
+
+static void venet_acct_destroy_all_stat(void)
+{
+	int hash;
+	struct list_head *ptr, *tmp;
+
+	write_lock_irq(&tc_lock);
+	for (hash = 0; hash < STAT_HASH_LEN; hash++) {
+		list_for_each_safe(ptr, tmp, stat_hash_list + hash)
+			__tc_destroy_stat(list_entry(ptr,
+						struct venet_stat, list));
+	}
+	write_unlock_irq(&tc_lock);
+}
+
+static DEFINE_MUTEX(req_mutex);
+static struct venet_stat *req_stat;
+
+static void zero_venet_stat(struct venet_stat *stat, unsigned cpu)
+{
+	struct acct_stat *acct;
+
+	acct = per_cpu_ptr(stat->ipv4_stat, cpu);
+	memset(acct, 0, sizeof(*acct));
+	acct = per_cpu_ptr(stat->ipv6_stat, cpu);
+	memset(acct, 0, sizeof(*acct));
+}
+
+static void clear_one_percpu_statistics(struct work_struct *dummy)
+{
+	unsigned cpu, this_cpu = get_cpu();
+
+	zero_venet_stat(req_stat, this_cpu);
+
+	if (cpumask_first(cpu_online_mask) != this_cpu)
+		goto out;
+
+	/* First cpu clears statistics on all offline cpus */
+	for_each_possible_cpu(cpu)
+		if (!cpu_online(cpu))
+			zero_venet_stat(req_stat, cpu);
+out:
+	put_cpu();
+}
+
+/* Clear VE's statistics */
+static int venet_acct_clear_stat(envid_t veid)
+{
+	int ret = -EINTR;
+
+	if (mutex_lock_interruptible(&req_mutex))
+		goto out;
+
+	req_stat = venet_acct_find_stat(veid);
+	if (!req_stat) {
+		ret = -ESRCH;
+		goto unlock;
+	}
+
+	ret = schedule_on_each_cpu(clear_one_percpu_statistics);
+
+	venet_acct_put_stat(req_stat);
+unlock:
+	mutex_unlock(&req_mutex);
+out:
+	return ret;
+}
+
+static void clear_all_percpu_statistics(struct work_struct *dummy)
+{
+	unsigned cpu, this_cpu = smp_processor_id();
+	struct venet_stat *stat = NULL;
+	int other = 0, hash = 0;
+
+	/*
+	 * Some cpus may be offline, and schedule_on_each_cpu()
+	 * does not create a work on them.
+	 * Work on the first online CPU clears their statistics.
+	 * Hotplug is disabled by schedule_on_each_cpu().
+	 */
+	if (cpumask_first(cpu_online_mask) == this_cpu)
+		other = 1;
+
+	read_lock(&tc_lock);
+
+	while ((stat = next_stat(&hash, stat)) != NULL) {
+		zero_venet_stat(stat, this_cpu);
+
+		if (!other)
+			continue;
+
+		/* Clear statistics on not active cpus */
+		for_each_possible_cpu(cpu)
+			if (!cpu_online(cpu))
+				zero_venet_stat(stat, cpu);
+	}
+
+	read_unlock(&tc_lock);
+}
+
+/* Clear all present statistics */
+static int venet_acct_clear_all_stat(void)
+{
+	int ret = -EINTR;
+
+	if (mutex_lock_interruptible(&req_mutex))
+		goto out;
+
+	ret = schedule_on_each_cpu(clear_all_percpu_statistics);
+
+	mutex_unlock(&req_mutex);
+out:
+	return ret;
+}
+
+static int venet_acct_get_stat_list(envid_t *__list, int length)
+{
+	int hash;
+	struct venet_stat *ptr;
+	int i, err;
+	envid_t *list;
+
+	if (length <= 0)
+		return -EINVAL;
+
+	list = kmalloc(sizeof(envid_t) * length, GFP_KERNEL);
+	if (list == NULL)
+		return -ENOMEM;
+
+	i = 0;
+	read_lock(&tc_lock);
+	for (hash = 0; hash < STAT_HASH_LEN; hash++) {
+		list_for_each_entry(ptr, stat_hash_list + hash, list) {
+			list[i++] = ptr->veid;
+			if (i == length)
+				break;
+		}
+	}
+	read_unlock(&tc_lock);
+
+	err = -EFAULT;
+	if (!copy_to_user(__list, list, sizeof(envid_t) * i))
+		err = i;
+	kfree(list);
+	return err;
+}
+
+static int venet_acct_get_base(envid_t veid)
+{
+	int err = -ESRCH;
+	struct venet_stat *ptr;
+
+	read_lock(&tc_lock);
+	ptr = __find(veid);
+	if (ptr != NULL)
+		err = ptr->base;
+	read_unlock(&tc_lock);
+	return err;
+}
+
+static int __check_base(__u16 base)
+{
+	int hash;
+	struct venet_stat *stat;
+
+	hash = 0;
+	stat = NULL;
+	while ((stat = next_stat(&hash, stat)) != NULL) {
+		if (stat->base == 0 || stat->base != base)
+			continue;
+		return 1;
+	}
+	return 0;
+}
+
+static int venet_acct_set_base(envid_t veid, __u16 base)
+{
+	static __u16 rover = 1;
+	int err, pos;
+	struct venet_stat *stat;
+
+	stat = venet_acct_find_create_stat(veid);
+	if (stat == NULL)
+		return -ENOMEM;
+
+	write_lock_irq(&tc_lock);
+	if (base != 0)
+		goto done;
+
+	err = -ERANGE;
+	pos = rover;
+	do {
+		rover++;
+		if (rover == 0)
+			rover = 1;
+		if (__check_base(rover))
+			continue;
+		base = rover;
+done:
+		err = base;
+		stat->base = base;
+		break;
+	} while (pos != rover);
+
+	write_unlock_irq(&tc_lock);
+	venet_acct_put_stat(stat);
+	return err;
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * Accounting engine
+ * ---------------------------------------------------------------------------
+ */
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+static int match_v6_class(const __u32 *addr, struct vz_tc_class_info_v6 *class)
+{
+	return !(
+			((addr[0] & class->mask[0]) ^ class->addr[0]) |
+			((addr[1] & class->mask[1]) ^ class->addr[1]) |
+			((addr[2] & class->mask[2]) ^ class->addr[2]) |
+			((addr[3] & class->mask[3]) ^ class->addr[3])
+		);
+}
+
+static noinline int venet_acct_classify_v6(struct sk_buff *skb, int dir)
+{
+	int i, ret = 0;
+	struct class_info_set *info;
+	const __u32 *addr;
+
+	if (dir == ACCT_IN)
+		addr = ipv6_hdr(skb)->saddr.s6_addr32;
+	else
+		addr = ipv6_hdr(skb)->daddr.s6_addr32;
+
+	rcu_read_lock();
+	info = rcu_dereference(info_v6);
+	if (info == NULL)
+		goto out_unlock;
+
+	for (i = info->len - 1; i >= 0; i--) {
+		if (match_v6_class(addr, &info->info_v6[i])) {
+			ret = info->info_v6[i].cid;
+			break;
+		}
+	}
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
+#else
+#define venet_acct_classify_v6(skb, dir)	(0)
+#endif
+
+static int __venet_acct_classify(__u32 daddr)
+{
+	int ret, i;
+	struct class_info_set *info;
+
+	ret = 0;
+	rcu_read_lock();
+	info = rcu_dereference(info_v4);
+	if (info == NULL)
+		goto out_unlock;
+	for (i = info->len - 1; i >= 0; i--) {
+		if ((daddr & info->info_v4[i].mask) == info->info_v4[i].addr) {
+			ret = info->info_v4[i].cid;
+			break;
+		}
+	}
+out_unlock:
+	rcu_read_unlock();
+	return ret;
+}
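+
+/*
+ * Illustration (table contents made up, not from this patch): with a
+ * single entry { .addr = 10.0.0.0, .mask = 255.0.0.0, .cid = 2 }, a
+ * packet for 10.1.2.3 is accounted to class 2 and anything else to the
+ * default class 0.  The table is scanned backwards, so later (typically
+ * more specific) entries take precedence.
+ */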
+
+static int venet_acct_classify(struct sk_buff *skb, int dir)
+{
+	__u32 addr;
+
+	if (dir == ACCT_IN)
+		addr = ip_hdr(skb)->saddr;
+	else
+		addr = ip_hdr(skb)->daddr;
+
+	return __venet_acct_classify(addr);
+}
+
+static void __do_acct_one(struct acct_stat *acct, int class, int dir, int size)
+{
+	int cpu;
+	struct acct_counter *cnt;
+
+	cpu = get_cpu();
+
+	acct = per_cpu_ptr(acct, cpu);
+	cnt = &acct->cnt[class][dir];
+
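+	/*
+	 * A negative size rolls a previously accounted packet back,
+	 * see venet_acct_classify_sub_outgoing() below.
+	 */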
+	cnt->bytes += size;
+	if (size > 0)
+		cnt->pkts++;
+	else
+		cnt->pkts--;
+
+	put_cpu();
+}
+
+static int acct_one_skb(struct venet_stat *stat, struct sk_buff *skb, int dir, int size)
+{
+	int class;
+	struct acct_stat *acct;
+
+	switch (skb->protocol) {
+	case __constant_htons(ETH_P_IP):
+		class = venet_acct_classify(skb, dir);
+		acct = stat->ipv4_stat;
+		break;
+	case __constant_htons(ETH_P_IPV6):
+		class = venet_acct_classify_v6(skb, dir);
+		acct = stat->ipv6_stat;
+		break;
+	default:
+		return 0;
+	}
+
+	__do_acct_one(acct, class, dir, size);
+
+	return class;
+}
+
+void venet_acct_classify_add_incoming(struct venet_stat *stat, struct sk_buff *skb)
+{
+	acct_one_skb(stat, skb, ACCT_IN, venet_acct_skb_size(skb));
+}
+
+static inline void venet_acct_mark(struct venet_stat *stat,
+	       struct sk_buff *skb, int class)
+{
+#ifdef CONFIG_NETFILTER
+	if (stat->base == 0)	/* compatibility mode */
+		skb->mark = class + stat->veid*2*TC_CLASS_MAX;
+	else
+		skb->mark = class + stat->base*TC_CLASS_MAX;
+#endif
+}
+
+/* FIXME: hard header accounting */
+void venet_acct_classify_add_outgoing(struct venet_stat *stat, struct sk_buff *skb)
+{
+	int class;
+
+	class = acct_one_skb(stat, skb, ACCT_OUT, venet_acct_skb_size(skb));
+	/* Do not forget to mark skb for traffic shaper */
+	venet_acct_mark(stat, skb, class);
+}
+
+void venet_acct_classify_sub_outgoing(struct venet_stat *stat, struct sk_buff *skb)
+{
+	int class;
+
+	class = acct_one_skb(stat, skb, ACCT_OUT, -venet_acct_skb_size(skb));
+	/* Do not forget to mark skb for traffic shaper */
+	venet_acct_mark(stat, skb, class);
+}
+
+void venet_acct_classify_add_incoming_plain(struct venet_stat *stat,
+		struct ve_addr_struct *src_addr, int data_size)
+{
+	int class;
+
+	class = __venet_acct_classify(src_addr->key[3]);
+	__do_acct_one(stat->ipv4_stat, class, ACCT_IN, data_size);
+}
+
+void venet_acct_classify_add_outgoing_plain(struct venet_stat *stat,
+		struct ve_addr_struct *dst_addr, int data_size)
+{
+	int class;
+
+	class = __venet_acct_classify(dst_addr->key[3]);
+	__do_acct_one(stat->ipv4_stat, class, ACCT_OUT, data_size);
+}
+
+/*
+ * ---------------------------------------------------------------------------
+ * IOCTL interface for user
+ * ---------------------------------------------------------------------------
+ */
+
+static int venet_acct_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	int err;
+	struct vzctl_tc_classes		tcl;
+	struct vzctl_tc_classes_v6	tcl_v6;
+	struct vzctl_tc_get_stat 	tcnt;
+	struct vzctl_tc_get_stat_list	tcsl;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	err = -ENOTTY;
+	switch (cmd) {
+		case VZCTL_TC_MAX_CLASS:
+			err = TC_CLASS_MAX;
+			break;
+		case VZCTL_TC_CLASS_NUM:
+			err = class_info_len(0);
+			break;
+		case VZCTL_TC_SET_CLASS_TABLE:
+			err = -EFAULT;
+			if (copy_from_user(&tcl, (void *)arg, sizeof(tcl)))
+				break;
+			err = venet_acct_set_classes(tcl.info, tcl.length, 0);
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case VZCTL_TC_CLASS_NUM_V6:
+			err = class_info_len(1);
+			break;
+		case VZCTL_TC_SET_CLASS_TABLE_V6:
+			err = -EFAULT;
+			if (copy_from_user(&tcl_v6, (void *)arg, sizeof(tcl_v6)))
+				break;
+			err = venet_acct_set_classes(tcl_v6.info, tcl_v6.length, 1);
+			break;
+#endif
+		case VZCTL_TC_GET_CLASS_TABLE:
+			err = -EFAULT;
+			if (copy_from_user(&tcl, (void *)arg, sizeof(tcl)))
+				break;
+			err = venet_acct_get_classes(tcl.info, tcl.length, 0);
+			break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+		case VZCTL_TC_GET_CLASS_TABLE_V6:
+			err = -EFAULT;
+			if (copy_from_user(&tcl_v6, (void *)arg, sizeof(tcl_v6)))
+				break;
+			err = venet_acct_get_classes(tcl_v6.info, tcl_v6.length, 1);
+			break;
+#endif
+
+		case VZCTL_TC_STAT_NUM:
+			err = stat_num;
+			break;
+		case VZCTL_TC_GET_STAT_LIST:
+			err = -EFAULT;
+			if (copy_from_user(&tcsl, (void *)arg, sizeof(tcsl)))
+				break;
+			err = venet_acct_get_stat_list(tcsl.list, tcsl.length);
+			break;
+		case VZCTL_TC_GET_STAT:
+		case VZCTL_TC_GET_STAT_V6:
+			err = -EFAULT;
+			if (copy_from_user(&tcnt, (void *)arg, sizeof(tcnt)))
+				break;
+			err = venet_acct_get_ve_stat(&tcnt, cmd == VZCTL_TC_GET_STAT_V6);
+			break;
+		case VZCTL_TC_DESTROY_STAT:
+			err = venet_acct_destroy_stat(arg);
+			break;
+		case VZCTL_TC_DESTROY_ALL_STAT:
+			err = 0;
+			venet_acct_destroy_all_stat();
+			break;
+		case VZCTL_TC_CLEAR_STAT:
+			err = venet_acct_clear_stat(arg);
+			break;
+		case VZCTL_TC_CLEAR_ALL_STAT:
+			err = venet_acct_clear_all_stat();
+			break;
+
+		case VZCTL_TC_GET_BASE:
+			err = venet_acct_get_base(arg);
+			break;
+		case VZCTL_TC_SET_BASE:
+		{
+			struct vzctl_tc_set_base tcb;
+			err = -EFAULT;
+			if (copy_from_user(&tcb, (void *)arg, sizeof(tcb)))
+				break;
+			err = venet_acct_set_base(tcb.veid, tcb.base);
+			break;
+		}
+	}
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+static int compat_venet_acct_ioctl(struct file *file, unsigned int cmd,
+		unsigned long arg)
+{
+	int err;
+
+	if (!capable_setveid())
+		return -EPERM;
+
+	switch (cmd) {
+	case COMPAT_VZCTL_TC_GET_STAT: {
+		struct compat_vzctl_tc_get_stat cs;
+		struct vzctl_tc_get_stat __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		if (put_user(cs.veid, &s->veid) ||
+		    put_user(compat_ptr(cs.incoming), &s->incoming) ||
+		    put_user(compat_ptr(cs.outgoing), &s->outgoing) ||
+		    put_user(compat_ptr(cs.incoming_pkt), &s->incoming_pkt) ||
+		    put_user(compat_ptr(cs.outgoing_pkt), &s->outgoing_pkt) ||
+		    put_user(cs.length, &s->length))
+			break;
+
+		err = venet_acct_ioctl(file, VZCTL_TC_GET_STAT,
+				(unsigned long)s);
+		break;
+	}
+	case COMPAT_VZCTL_TC_GET_STAT_LIST: {
+		struct compat_vzctl_tc_get_stat_list cs;
+		struct vzctl_tc_get_stat_list __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		if (put_user(compat_ptr(cs.list), &s->list) ||
+		    put_user(cs.length, &s->length))
+			break;
+
+		err = venet_acct_ioctl(file, VZCTL_TC_GET_STAT_LIST,
+				(unsigned long)s);
+		break;
+	}
+	case COMPAT_VZCTL_TC_SET_CLASS_TABLE:
+	case COMPAT_VZCTL_TC_GET_CLASS_TABLE: {
+		struct compat_vzctl_tc_classes cs;
+		struct vzctl_tc_classes __user *s;
+
+		s = compat_alloc_user_space(sizeof(*s));
+
+		err = -EFAULT;
+		if (copy_from_user(&cs, (void *)arg, sizeof(cs)))
+			break;
+		if (put_user(compat_ptr(cs.info), &s->info) ||
+		    put_user(cs.length, &s->length))
+			break;
+
+		err = venet_acct_ioctl(file,
+				cmd == COMPAT_VZCTL_TC_GET_CLASS_TABLE ?
+					VZCTL_TC_GET_CLASS_TABLE :
+					VZCTL_TC_SET_CLASS_TABLE,
+				(unsigned long)s);
+		break;
+	}
+	default:
+		/* remaining commands take layout-compatible arguments */
+		err = venet_acct_ioctl(file, cmd, arg);
+		break;
+	}
+	return err;
+}
+#endif
+
+static struct vzioctlinfo tc_ioctl_info = {
+	.type 		= VZTCCTLTYPE,
+	.ioctl		= venet_acct_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= compat_venet_acct_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+};
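+
+/*
+ * Hedged userspace sketch: the device path and where the VZCTL_TC_*
+ * definitions come from are assumptions about a typical OpenVZ install,
+ * not something this file defines:
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *	#include <sys/ioctl.h>
+ *
+ *	int fd = open("/dev/vzctl", O_RDONLY);
+ *	int max_class = ioctl(fd, VZCTL_TC_MAX_CLASS, 0);
+ *	int base = ioctl(fd, VZCTL_TC_GET_BASE, (unsigned long)veid);
+ *	close(fd);
+ */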
+
+/*
+ * ---------------------------------------------------------------------------
+ * /proc interface for user
+ * ---------------------------------------------------------------------------
+ */
+
+static char seq_buffer[1024];
+static DEFINE_SPINLOCK(seq_buffer_lock);
+
+static int stat_seq_show_common(struct seq_file *m, void *v, int v6)
+{
+	struct venet_stat *ptr = (struct venet_stat *)v;
+	struct acct_stat *acct = __choose_acct(ptr, v6);
+	int i;
+
+	spin_lock(&seq_buffer_lock);
+	*seq_buffer = 0;
+	for (i = 0; i < TC_CLASS_MAX; i++) {
+		u64 incoming = 0;
+		u64 outgoing = 0;
+		int cpu;
+
+		for_each_possible_cpu(cpu) {
+			struct acct_stat *stat;
+
+			stat = per_cpu_ptr(acct, cpu);
+			incoming += stat->cnt[i][ACCT_IN].bytes;
+			outgoing += stat->cnt[i][ACCT_OUT].bytes;
+		}
+
+		sprintf(seq_buffer + strlen(seq_buffer), " %20Lu/%20Lu",
+				incoming, outgoing);
+	}
+
+	seq_printf(m, "%u %s\n", ptr->veid, seq_buffer);
+	spin_unlock(&seq_buffer_lock);
+	return 0;
+}
+
+static int stat_seq_show_v4(struct seq_file *m, void *v)
+{
+	return stat_seq_show_common(m, v, 0);
+}
+
+static int stat_seq_show_v6(struct seq_file *m, void *v)
+{
+	return stat_seq_show_common(m, v, 1);
+}
+
+static void *stat_seq_start(struct seq_file *m, loff_t *pos)
+{
+	struct venet_stat *stat;
+	int hash;
+	loff_t l;
+
+	/*
+	 * Take tc_lock unconditionally: ->stop() is always called and
+	 * drops it even when we return NULL here.
+	 */
+	read_lock(&tc_lock);
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
+
+	hash = 0;
+	stat = NULL;
+	stat = next_stat(&hash, stat);
+	for (l = *pos; stat && l > 0; l--)
+		stat = next_stat(&hash, stat);
+	return stat;
+}
+
+static void *stat_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct venet_stat *ptr = (struct venet_stat *)v;
+	int hash;
+
+	if (!ve_is_super(get_exec_env()))
+		return NULL;
+	hash = stat_hash(ptr->veid);
+	(*pos)++;
+	return next_stat(&hash, ptr);
+}
+
+static void stat_seq_stop(struct seq_file *m, void *v)
+{
+	read_unlock(&tc_lock);
+}
+
+static struct seq_operations stat_seq_op = {
+	.start	= stat_seq_start,
+	.next	= stat_seq_next,
+	.stop	= stat_seq_stop,
+	.show	= stat_seq_show_v4,
+};
+
+static struct seq_operations stat_v6_seq_op = {
+	.start	= stat_seq_start,
+	.next	= stat_seq_next,
+	.stop	= stat_seq_stop,
+	.show	= stat_seq_show_v6,
+};
+
+static int stat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &stat_seq_op);
+}
+
+static int stat_v6_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &stat_v6_seq_op);
+}
+
+static struct file_operations proc_venetstat_operations = {
+	.owner		= THIS_MODULE,
+	.open		= stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+static struct file_operations proc_venetstat_v6_operations = {
+	.owner		= THIS_MODULE,
+	.open		= stat_v6_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
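+
+/*
+ * Illustrative /proc/vz/venetstat line, reconstructed from the format
+ * strings above (numbers made up): the VE id followed by one
+ * "incoming/outgoing" byte pair per traffic class, each counter
+ * printed %20Lu wide:
+ *
+ *	101             12345678/            87654321 ...
+ */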
+
+static int __net_init net_init_acct(struct net *net)
+{
+	struct ve_struct *ve = net->owner_ve;
+
+	if (!ve->stat) {
+		ve->stat = venet_acct_find_create_stat(ve->veid);
+		if (!ve->stat)
+			return -ENOMEM;
+	} else
+		venet_acct_get_stat(ve->stat);
+
+	return 0;
+}
+
+static void __net_exit net_exit_acct(struct net *net)
+{
+	struct ve_struct *ve = net->owner_ve;
+
+	if (ve->stat) {
+		venet_acct_put_stat(ve->stat);
+		if (ve->ve_netns == net)
+			ve->stat = NULL;
+	}
+}
+
+static struct pernet_operations __net_initdata net_acct_ops = {
+	.init	= net_init_acct,
+	.exit	= net_exit_acct,
+};
+
+int __init venetstat_init(void)
+{
+	int i, ret;
+#ifdef CONFIG_PROC_FS
+	struct proc_dir_entry *de;
+#endif
+
+	for (i = 0; i < STAT_HASH_LEN; i++)
+		INIT_LIST_HEAD(stat_hash_list + i);
+
+	ret = register_pernet_subsys(&net_acct_ops);
+	if (ret)
+		return ret;
+
+#ifdef CONFIG_PROC_FS
+	de = proc_create("venetstat", S_IFREG|S_IRUSR, proc_vz_dir,
+			&proc_venetstat_operations);
+	if (de == NULL)
+		printk(KERN_WARNING "VENET: can't make venetstat proc entry\n");
+
+	de = proc_create("venetstat_v6", S_IFREG|S_IRUSR, proc_vz_dir,
+			&proc_venetstat_v6_operations);
+	if (de == NULL)
+		printk(KERN_WARNING "VENET: can't make venetstat_v6 proc entry\n");
+
+#endif
+	vzioctl_register(&tc_ioctl_info);
+	return 0;
+}
+
+void __exit venetstat_exit(void)
+{
+	unregister_pernet_subsys(&net_acct_ops);
+	vzioctl_unregister(&tc_ioctl_info);
+	venet_acct_destroy_all_stat();
+
+#ifdef CONFIG_PROC_FS
+	remove_proc_entry("venetstat_v6", proc_vz_dir);
+	remove_proc_entry("venetstat", proc_vz_dir);
+#endif
+	kfree(info_v4);
+	kfree(info_v6);
+}
+
+module_init(venetstat_init);
+module_exit(venetstat_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
+
+EXPORT_SYMBOL(venet_acct_find_create_stat);
+EXPORT_SYMBOL(venet_acct_find_stat);
+EXPORT_SYMBOL(venet_acct_put_stat);
+EXPORT_SYMBOL(venet_acct_classify);
+EXPORT_SYMBOL(venet_acct_classify_add_outgoing);
+EXPORT_SYMBOL(venet_acct_classify_sub_outgoing);
+EXPORT_SYMBOL(venet_acct_classify_add_incoming);
+EXPORT_SYMBOL(venet_acct_classify_add_incoming_plain);
+EXPORT_SYMBOL(venet_acct_classify_add_outgoing_plain);
--- /dev/null
+++ b/kernel/ve/vzstat.c
@@ -0,0 +1,763 @@
+/*
+ *  kernel/ve/vzstat.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/mm.h>
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/interrupt.h>
+#include <linux/mmzone.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <linux/vzstat.h>
+
+/* local variables */
+static struct task_struct *vzstat_thread_tsk;
+
+static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
+	"alocatomic:",
+	"aloclow:",
+	"alochigh:",
+	"aloclowmp:",
+	"alochighmp:"
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * Kernel protection: kernel code checksumming
+ * ------------------------------------------------------------------------
+ */
+#ifdef CONFIG_VE_KERNEL_CSUM
+
+#ifdef __x86_64__
+/* skip init_level4_pgt */
+#define KERNEL_PROT_START	((unsigned long)(&_stext) + 0x2000)
+#else
+#define KERNEL_PROT_START	((unsigned long)(&_stext))
+#endif
+#define KERNEL_PROT_END		((unsigned long)(&_etext))
+#define CSALIGN(value, size)	((value + (size - 1)) & ~(size - 1))
+
+void kernel_text_csum_check(void)
+{
+#define CSUM_NR	2
+	static unsigned long text_csum[CSUM_NR], text_csumed, csum_time;
+	unsigned long start, end, ptr, csum[CSUM_NR];
+	int i;
+
+	if (jiffies - csum_time < 60*HZ)
+		return;
+
+	csum_time = jiffies;
+	for (i = 0; i < CSUM_NR; i++)
+		csum[i] = 0;
+	start = CSALIGN(KERNEL_PROT_START, sizeof(csum[0]));
+	end = CSALIGN(KERNEL_PROT_END, sizeof(csum[0]));
+
+	for (ptr = start; ptr < end; ptr += sizeof(csum[0])) {
+		unsigned long w = *(unsigned long *)ptr;
+
+		csum[0] += w;
+		csum[1] = (csum[1] ^ w) + ((csum[1] << 1) + (csum[1] >> 31));
+		cond_resched();
+	}
+
+	if (!text_csumed) {
+		for (i = 0; i < CSUM_NR; i++)
+			text_csum[i] = csum[i];
+		text_csumed = 1;
+		return;
+	}
+	for (i = 0; i < CSUM_NR; i++)
+		if (text_csum[i] != csum[i]) {
+			printk(KERN_EMERG "Kernel checksum %d changed "
+				"(csum%d=%08lx, onboot csum%d=%08lx)\n",
+				i, i, csum[i], i, text_csum[i]);
+			kernel_text_csum_broken++;
+		}
+}
+
+#endif
+
+/*
+ * ------------------------------------------------------------------------
+ * Latency update and show functions
+ * ------------------------------------------------------------------------
+ */
+static inline u64 get_task_lat(struct task_struct *t, u64 now)
+{
+	u64 wstamp;
+
+	wstamp = t->se.statistics->wait_start;
+	if (wstamp && now > wstamp && now - wstamp < (1ULL << 63))
+		return now - wstamp;
+	return 0;
+}
+
+static void update_max_sched_latency_snap(void)
+{
+	struct task_struct *t, *g;
+	u64 now, max, tmp;
+	struct kstat_lat_pcpu_struct *st;
+
+	max = 0;
+	read_lock(&tasklist_lock);
+	now = ktime_to_ns(ktime_get());
+	do_each_thread(g, t) {
+		if (likely(t->state != TASK_RUNNING))
+			continue;
+
+		tmp = get_task_lat(t, now);
+		if (max < tmp)
+			max = tmp;
+		st = &t->task_ve->sched_lat_ve;
+		if (st->max_snap < tmp)
+			st->max_snap = tmp;
+	} while_each_thread(g, t);
+	read_unlock(&tasklist_lock);
+	kstat_glob.sched_lat.max_snap = max;
+}
+
+static void update_schedule_latency(void)
+{
+	/*
+	 * global scheduling latency is updated in schedule() and
+	 * update_max_sched_latency_snap(). The latter function guarantees
+	 * that tasks which do not receive CPU time are still accounted in
+	 * scheduling latency
+	 */
+	update_max_sched_latency_snap();
+
+	spin_lock_irq(&kstat_glb_lock);
+	KSTAT_LAT_PCPU_UPDATE(&kstat_glob.sched_lat);
+	spin_unlock_irq(&kstat_glb_lock);
+	/* Note: per-VE latency is updated in update_venum() */
+}
+
+static void update_alloc_latency(void)
+{
+	int i;
+
+	spin_lock_irq(&kstat_glb_lock);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		KSTAT_LAT_PCPU_UPDATE(&kstat_glob.alloc_lat[i]);
+	KSTAT_LAT_UPDATE(&kstat_glob.swap_in);
+	KSTAT_LAT_PCPU_UPDATE(&kstat_glob.page_in);
+	spin_unlock_irq(&kstat_glb_lock);
+}
+
+static void lastlat_seq_show(struct seq_file *m,
+		const char *name,
+		struct kstat_lat_snap_struct *snap)
+{
+	seq_printf(m, "%-11s %20Lu %20Lu %20lu\n", name,
+			snap->maxlat, snap->totlat, snap->count);
+}
+
+static void avglat_seq_show(struct seq_file *m,
+		const char *name,
+		u64 *avg)
+{
+	seq_printf(m, "%-11s %20Lu %20Lu %20Lu\n", name,
+			avg[0], avg[1], avg[2]);
+}
+
+static int latency_seq_show(struct seq_file *m, void *v)
+{
+	int i;
+
+	if (!v)
+		return 0;
+
+	seq_puts(m, "Version: 2.5\n");
+
+	seq_puts(m, "\nLatencies:\n");
+	seq_printf(m, "%-11s %20s %20s %20s\n",
+			"Type", "Lat", "Total_lat", "Calls");
+	lastlat_seq_show(m, "scheduling:", &kstat_glob.sched_lat.last);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		lastlat_seq_show(m, alloc_descr[i],
+				&kstat_glob.alloc_lat[i].last);
+	lastlat_seq_show(m, "swap_in:", &kstat_glob.swap_in.last);
+	lastlat_seq_show(m, "page_in:", &kstat_glob.page_in.last);
+
+	seq_puts(m, "\nAverages:\n");
+	seq_printf(m, "%-11s %20s %20s %20s\n",
+			"Type", "Avg1", "Avg5", "Avg15");
+	avglat_seq_show(m, "scheduling:", kstat_glob.sched_lat.avg);
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		avglat_seq_show(m, alloc_descr[i],
+				kstat_glob.alloc_lat[i].avg);
+	avglat_seq_show(m, "swap_in:", kstat_glob.swap_in.avg);
+	avglat_seq_show(m, "page_in:", kstat_glob.page_in.avg);
+
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * General system info: processes, memory, VE
+ * ------------------------------------------------------------------------
+ */
+static void update_memory(void)
+{
+	pg_data_t *pgdat;
+	struct zone *zone;
+	struct kstat_zone_avg *zone_avg;
+	unsigned type;
+	unsigned long nr_free, nr_active, nr_inactive;
+	unsigned present;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		present = 0;
+		nr_free = 0;
+		nr_active = 0;
+		nr_inactive = 0;
+
+		for_each_online_pgdat (pgdat) {
+			zone = pgdat->node_zones + type;
+			if (!zone->present_pages)
+				continue;
+
+			present++;
+			nr_free += zone_page_state(zone, NR_FREE_PAGES);
+			nr_active += zone_page_state(zone, NR_ACTIVE_ANON) +
+				zone_page_state(zone, NR_ACTIVE_FILE);
+			nr_inactive += zone_page_state(zone, NR_INACTIVE_ANON) +
+				zone_page_state(zone, NR_INACTIVE_FILE);
+		}
+
+		if (!present)
+			continue;
+
+		zone_avg = &kstat_glob.zone_avg[type];
+
+		CALC_LOAD(zone_avg->free_pages_avg[0], EXP_1, nr_free);
+		CALC_LOAD(zone_avg->free_pages_avg[1], EXP_5, nr_free);
+		CALC_LOAD(zone_avg->free_pages_avg[2], EXP_15, nr_free);
+
+		CALC_LOAD(zone_avg->nr_active_avg[0], EXP_1, nr_active);
+		CALC_LOAD(zone_avg->nr_active_avg[1], EXP_5, nr_active);
+		CALC_LOAD(zone_avg->nr_active_avg[2], EXP_15, nr_active);
+
+		CALC_LOAD(zone_avg->nr_inactive_avg[0], EXP_1, nr_inactive);
+		CALC_LOAD(zone_avg->nr_inactive_avg[1], EXP_5, nr_inactive);
+		CALC_LOAD(zone_avg->nr_inactive_avg[2], EXP_15, nr_inactive);
+	}
+}
+
+static void mem_avg_show(struct seq_file *m, void *v)
+{
+	unsigned type;
+	pg_data_t *pgdat;
+	struct zone *zone;
+	struct kstat_zone_avg *zone_avg;
+	unsigned present;
+	int zone_id;
+
+	zone_id = 0;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		present = 0;
+
+		for_each_online_pgdat (pgdat) {
+			zone = pgdat->node_zones + type;
+			if (zone->present_pages) {
+				present++;
+				break;
+			}
+		}
+		if (!present)
+			continue;
+
+		zone_avg = &kstat_glob.zone_avg[type];
+		seq_printf(m, "ZONE%u %s averages: "
+			"active %lu %lu %lu, "
+			"inactive %lu %lu %lu, "
+			"free %lu %lu %lu\n",
+			zone_id++,
+			zone->name,
+			zone_avg->nr_active_avg[0],
+			zone_avg->nr_active_avg[1],
+			zone_avg->nr_active_avg[2],
+			zone_avg->nr_inactive_avg[0],
+			zone_avg->nr_inactive_avg[1],
+			zone_avg->nr_inactive_avg[2],
+			zone_avg->free_pages_avg[0],
+			zone_avg->free_pages_avg[1],
+			zone_avg->free_pages_avg[2]);
+	}
+}
+
+static void update_venum(void)
+{
+	struct ve_struct *ve;
+
+	mutex_lock(&ve_list_lock);
+	spin_lock_irq(&kstat_glb_lock);
+	for_each_ve(ve)
+		/* max_snap is already set in update_schedule_latency */
+		KSTAT_LAT_PCPU_UPDATE(&ve->sched_lat_ve);
+	spin_unlock_irq(&kstat_glb_lock);
+	mutex_unlock(&ve_list_lock);
+}
+
+static void task_counts_seq_show(struct seq_file *m, void *v)
+{
+	unsigned long _nr_running, _nr_sleeping, _nr_unint,
+				_nr_zombie, _nr_dead, _nr_stopped;
+	unsigned long avg[3];
+
+	_nr_running = nr_running();
+	_nr_unint = nr_uninterruptible();
+	_nr_sleeping = nr_sleeping();
+	_nr_zombie = nr_zombie;
+	_nr_dead = atomic_read(&nr_dead);
+	_nr_stopped = nr_stopped();
+
+	spin_lock_irq(&kstat_glb_lock);
+	memcpy(avg, kstat_glob.nr_unint_avg, sizeof(avg));
+	spin_unlock_irq(&kstat_glb_lock);
+
+	seq_printf(m, "VEs: %d\n", nr_ve);
+	seq_printf(m, "Processes: R %lu, S %lu, D %lu, "
+		"Z %lu, T %lu, X %lu\n",
+			_nr_running,
+			_nr_sleeping,
+			_nr_unint,
+			_nr_zombie,
+			_nr_stopped,
+			_nr_dead);
+	seq_printf(m, "Processes avg: unint %lu %lu %lu\n",
+			avg[0] >> FSHIFT, avg[1] >> FSHIFT, avg[2] >> FSHIFT);
+}
+
+static void cycles_per_jiffy_show(struct seq_file *m, void *v)
+{
+	/* Now all time slices are measured in nanoseconds */
+	seq_printf(m, "cycles_per_jiffy: %llu\n", ((u64) jiffies_to_usecs(1)) * 1000);
+}
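+
+/*
+ * Worked example: jiffies_to_usecs(1) is 10^6/HZ, so the value printed
+ * above is 10^9/HZ, i.e. nanoseconds per jiffy; with HZ == 1000 the
+ * file reports "cycles_per_jiffy: 1000000".
+ */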
+
+static void jiffies_per_second_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "jiffies_per_second: %u\n", HZ);
+}
+
+static void kernel_text_csum_seq_show(struct seq_file *m, void *v)
+{
+	seq_printf(m, "kernel_text_csum_broken: %d\n", 0);
+}
+
+static void swap_cache_seq_show(struct seq_file *m, void *v)
+{
+	struct swap_cache_info_struct *swpcache;
+	extern struct swap_cache_info_struct swap_cache_info;
+
+	swpcache = &swap_cache_info;
+	seq_printf(m, "Swap cache: add %lu, del %lu, find %lu/%lu\n",
+			swpcache->add_total,
+			swpcache->del_total,
+			swpcache->find_success,
+			swpcache->find_total);
+}
+
+/*
+ * Declare a special structure to store summarized statistics. 'struct zone'
+ * itself is not used because of its tremendous size.
+ */
+struct zonestat {
+	const char *name;
+	unsigned long free_pages;
+	unsigned long nr_free[MAX_ORDER];
+	unsigned long pages_min;
+	unsigned long pages_low;
+	unsigned long pages_high;
+	unsigned long nr_active;
+	unsigned long nr_inactive;
+	unsigned long present_pages;
+};
+
+/*
+ * Show information about all memory zones.
+ */
+static void mem_free_areas_show_zonestat(struct seq_file *m,
+						struct zonestat *zstat)
+{
+	unsigned int order;
+	unsigned type;
+
+	for (type = 0; type < MAX_NR_ZONES; type++) {
+		struct zonestat *zone = &zstat[type];
+
+		if (!zone->name)
+			continue;
+
+		/* Skip empty zones */
+		if (!zone->present_pages)
+			continue;
+
+		seq_printf(m, "%s free %lu (", zone->name, zone->free_pages);
+		for (order = 0; order < MAX_ORDER; order++)
+			seq_printf(m, "%lu*%lu ", zone->nr_free[order],
+								1UL << order);
+
+		seq_printf(m, ") min %lu low %lu high %lu "
+			"active %lu inactive %lu size %lu\n",
+				zone->pages_min,
+				zone->pages_low,
+				zone->pages_high,
+				zone->nr_active,
+				zone->nr_inactive,
+				zone->present_pages);
+	}
+}
+
+/*
+ * Scan all registered pgdat's (i.e. memory nodes) and summarize
+ * values for identical zones.
+ */
+static void mem_free_areas_show(struct seq_file *m, void *v)
+{
+	pg_data_t *pgdat;
+	struct zonestat zones[MAX_NR_ZONES];
+	struct zonestat *zdst;
+	struct zone *zsrc;
+	int type, order;
+
+	memset(zones, 0, sizeof(zones));
+
+	for_each_online_pgdat (pgdat) {
+		for (type = 0; type < MAX_NR_ZONES; type++) {
+			unsigned long flags;
+
+			zdst = &zones[type];
+			zsrc = pgdat->node_zones + type;
+			if (!zsrc || !zsrc->name)
+				continue;
+
+			if (!zdst->name)
+				zdst->name = zsrc->name;
+			else if (strcmp(zsrc->name, zdst->name))
+				/* This shouldn't happen! */
+				printk(KERN_WARNING "names mismatch for "
+					"zone %d: %s != %s\n",
+					type, zsrc->name, zdst->name);
+
+			spin_lock_irqsave(&zsrc->lock, flags);
+			for (order = 0; order < MAX_ORDER; order++)
+				zdst->nr_free[order] += zsrc->free_area[order].nr_free;
+			spin_unlock_irqrestore(&zsrc->lock, flags);
+
+			zdst->nr_active     += zone_page_state(zsrc, NR_ACTIVE_ANON) +
+						zone_page_state(zsrc, NR_ACTIVE_FILE);
+			zdst->nr_inactive   += zone_page_state(zsrc, NR_INACTIVE_ANON) +
+						zone_page_state(zsrc, NR_INACTIVE_FILE);
+			zdst->pages_min     += min_wmark_pages(zsrc);
+			zdst->pages_low     += low_wmark_pages(zsrc);
+			zdst->pages_high    += high_wmark_pages(zsrc);
+			zdst->present_pages += zsrc->present_pages;
+			zdst->free_pages    += zone_page_state(zsrc, NR_FREE_PAGES);
+		}
+	}
+	mem_free_areas_show_zonestat(m, zones);
+}
+
+static void mem_fails_show(struct seq_file *m, void *v)
+{
+	int i, cpu;
+	unsigned long alloc_fails[KSTAT_ALLOCSTAT_NR];
+
+	memset(alloc_fails, 0, sizeof(alloc_fails));
+	for_each_online_cpu(cpu)
+		for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+			alloc_fails[i] += kstat_glob.alloc_fails[cpu][i];
+
+	seq_puts(m, "\nMemory fails:\n");
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		seq_printf(m, "%-11s %20lu\n", alloc_descr[i],
+				alloc_fails[i]);
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * Memory management profiling
+ * ------------------------------------------------------------------------
+ */
+static void KSTAT_PERF_UPDATE(struct kstat_perf_pcpu_struct *p)
+{
+	unsigned i, cpu;
+	struct kstat_perf_pcpu_snap_struct snap, *cur;
+
+	memset(&p->last, 0, sizeof(p->last));
+	for_each_online_cpu(cpu) {
+		cur = per_cpu_ptr(p->cur, cpu);
+		do {
+			i = read_seqcount_begin(&cur->lock);
+			memcpy(&snap, cur, sizeof(snap));
+		} while (read_seqcount_retry(&cur->lock, i));
+
+		if (p->last.wall_maxdur < snap.wall_maxdur)
+			p->last.wall_maxdur = snap.wall_maxdur;
+		if (p->last.cpu_maxdur < snap.cpu_maxdur)
+			p->last.cpu_maxdur = snap.cpu_maxdur;
+		cur->wall_maxdur = cur->cpu_maxdur = 0;
+
+		p->last.count += snap.count;
+		p->last.wall_tottime += snap.wall_tottime;
+		p->last.cpu_tottime += snap.cpu_tottime;
+	}
+}
+
+static void update_mmperf(void)
+{
+	KSTAT_PERF_UPDATE(&kstat_glob.ttfp);
+	KSTAT_PERF_UPDATE(&kstat_glob.cache_reap);
+	KSTAT_PERF_UPDATE(&kstat_glob.refill_inact);
+	KSTAT_PERF_UPDATE(&kstat_glob.shrink_icache);
+	KSTAT_PERF_UPDATE(&kstat_glob.shrink_dcache);
+}
+
+static void perf_seq_show(struct seq_file *m,
+		const char *name,
+		struct kstat_perf_pcpu_struct *p)
+{
+	seq_printf(m, "%-14s %10lu %20Lu %20Lu %20Lu %20Lu\n",
+			name,
+			p->last.count,
+			p->last.cpu_maxdur,
+			p->last.wall_maxdur,
+			p->last.cpu_tottime,
+			p->last.wall_tottime);
+}
+
+static int mmperf_seq_show(struct seq_file *m, void *v)
+{
+	if (!v)
+		return 0;
+	seq_puts(m, "Version: 2.5.1\n");
+	seq_printf(m, "%-14s %10s %20s %20s %20s %20s\n",
+			"Type",
+			"Count",
+			"CPU_max_dur",
+			"Wall_max_dur",
+			"CPU_tot_time",
+			"Wall_tot_time");
+	perf_seq_show(m, "ttfp:", &kstat_glob.ttfp);
+	perf_seq_show(m, "cache_reap:", &kstat_glob.cache_reap);
+	perf_seq_show(m, "refill_inact:", &kstat_glob.refill_inact);
+	perf_seq_show(m, "shrink_icache:", &kstat_glob.shrink_icache);
+	perf_seq_show(m, "shrink_dcache:", &kstat_glob.shrink_dcache);
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * Main loop
+ * ------------------------------------------------------------------------
+ */
+static int vzstat_mon_loop(void *data)
+{
+	while (1) {
+		try_to_freeze();
+#ifdef CONFIG_VE_KERNEL_CSUM
+		kernel_text_csum_check();
+#endif
+		update_alloc_latency();
+		update_schedule_latency();
+		update_memory();
+		update_venum();
+		update_mmperf();
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		schedule_timeout(LOAD_FREQ);
+	}
+	return 0;
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * default sequential files methods
+ * ------------------------------------------------------------------------
+ */
+static void *empty_seq_start(struct seq_file *m, loff_t *pos)
+{
+	if (*pos == 0)
+		return (void *)1;
+	else
+		return NULL;
+}
+
+static void *empty_seq_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	return NULL;
+}
+
+static void empty_seq_stop(struct seq_file *m, void *v)
+{
+}
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/latency sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static struct seq_operations latency_seq_op = {
+	.start	= empty_seq_start,
+	.next	= empty_seq_next,
+	.stop	= empty_seq_stop,
+	.show	= latency_seq_show,
+};
+
+static int latency_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &latency_seq_op);
+}
+
+static struct file_operations proc_latency_operations = {
+	.open = latency_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/stats sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static int stats_seq_show(struct seq_file *m, void *v)
+{
+	if (!v)
+		return 0;
+	seq_puts(m, "Version: 2.6\n");
+	cycles_per_jiffy_show(m, v);
+	jiffies_per_second_show(m, v);
+	seq_puts(m, "\nLoad info:\n");
+	task_counts_seq_show(m, v);
+	seq_puts(m, "\nMemory info:\n");
+	kernel_text_csum_seq_show(m, v);
+	swap_cache_seq_show(m, v);
+	mem_free_areas_show(m, v);
+	mem_avg_show(m, v);
+	mem_fails_show(m, v);
+	return 0;
+}
+
+static struct seq_operations stats_seq_op = {
+	.start	= empty_seq_start,
+	.next	= empty_seq_next,
+	.stop	= empty_seq_stop,
+	.show	= stats_seq_show,
+};
+
+static int stats_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &stats_seq_op);
+}
+
+static struct file_operations proc_stats_operations = {
+	.open = stats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * /proc/vz/mmperf sequential file methods
+ * ------------------------------------------------------------------------
+ */
+static struct seq_operations mmperf_seq_op = {
+	.start	= empty_seq_start,
+	.next	= empty_seq_next,
+	.stop	= empty_seq_stop,
+	.show	= mmperf_seq_show,
+};
+
+static int mmperf_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &mmperf_seq_op);
+}
+
+static struct file_operations proc_mmperf_operations = {
+	.open = mmperf_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+	.owner = THIS_MODULE,
+};
+
+/*
+ * ------------------------------------------------------------------------
+ * module init/exit code
+ * ------------------------------------------------------------------------
+ */
+
+int __init vzstat_mon_init(void)
+{
+	struct proc_dir_entry *entry;
+
+	entry = proc_create("latency", S_IRUGO, proc_vz_dir, &proc_latency_operations);
+	if (entry == NULL) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_lat;
+	}
+
+	entry = proc_create("stats", S_IRUGO, proc_vz_dir, &proc_stats_operations);
+	if (!entry) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_stat;
+	}
+
+	entry = proc_create("mmperf", S_IRUGO, proc_vz_dir, &proc_mmperf_operations);
+	if (!entry) {
+		printk(KERN_WARNING "VZSTAT: can't make proc entry\n");
+		goto fail_perf;
+	}
+
+	vzstat_thread_tsk = kthread_run(vzstat_mon_loop, NULL, "vzstat");
+	if (IS_ERR(vzstat_thread_tsk))
+		goto fail_thread;
+
+	printk(KERN_INFO "VZSTAT: initialized successfully\n");
+
+	return 0;
+
+fail_thread:
+	remove_proc_entry("mmperf", proc_vz_dir);
+fail_perf:
+	remove_proc_entry("stats", proc_vz_dir);
+fail_stat:
+	remove_proc_entry("latency", proc_vz_dir);
+fail_lat:
+	return -EBUSY;
+}
+
+void __exit vzstat_mon_exit(void)
+{
+	kthread_stop(vzstat_thread_tsk);
+
+	remove_proc_entry("mmperf", proc_vz_dir);
+	remove_proc_entry("stats", proc_vz_dir);
+	remove_proc_entry("latency", proc_vz_dir);
+}
+
+module_init(vzstat_mon_init);
+module_exit(vzstat_mon_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- /dev/null
+++ b/kernel/ve/vzstat_core.c
@@ -0,0 +1,122 @@
+/*
+ *  kernel/ve/vzstat_core.c
+ *
+ *  Copyright (c) 2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/sched.h>
+#include <linux/vzstat.h>
+
+void KSTAT_PERF_ADD(struct kstat_perf_pcpu_struct *ptr, u64 real_time, u64 cpu_time)
+{
+	struct kstat_perf_pcpu_snap_struct *cur = get_cpu_ptr(ptr->cur);
+
+	write_seqcount_begin(&cur->lock);
+	cur->count++;
+	if (cur->wall_maxdur < real_time)
+		cur->wall_maxdur = real_time;
+	cur->wall_tottime += real_time;
+	if (cur->cpu_maxdur < cpu_time)
+		cur->cpu_maxdur = cpu_time;
+	cur->cpu_tottime += cpu_time;
+	write_seqcount_end(&cur->lock);
+	put_cpu_ptr(cur);
+}
+
+/*
+ * Add another statistics reading.
+ * Serialization is the caller's due.
+ */
+void KSTAT_LAT_ADD(struct kstat_lat_struct *p,
+		u64 dur)
+{
+	p->cur.count++;
+	if (p->cur.maxlat < dur)
+		p->cur.maxlat = dur;
+	p->cur.totlat += dur;
+}
+
+/*
+ * Must be called with interrupts disabled so that no locks or seqcounts
+ * are taken while this seqcount is write-held, which avoids the
+ * following 3-way deadlock:
+ *
+ * timer interrupt:
+ *	write_seqlock(&xtime_lock);
+ *	 spin_lock_irqsave(&kstat_glb_lock);
+ *
+ * update_schedule_latency():
+ *	spin_lock_irq(&kstat_glb_lock);
+ *	 read_seqcount_begin(&cur->lock)
+ *
+ * some-interrupt during KSTAT_LAT_PCPU_ADD()
+ *   KSTAT_LAT_PCPU_ADD()
+ *    write_seqcount_begin(&cur->lock);
+ *     <interrupt>
+ *      ktime_get()
+ *       read_seqcount_begin(&xtime_lock);
+ */
+void KSTAT_LAT_PCPU_ADD(struct kstat_lat_pcpu_struct *p, int cpu,
+		u64 dur)
+{
+	struct kstat_lat_pcpu_snap_struct *cur;
+
+	cur = per_cpu_ptr(p->cur, cpu);
+	write_seqcount_begin(&cur->lock);
+	cur->count++;
+	if (cur->maxlat < dur)
+		cur->maxlat = dur;
+	cur->totlat += dur;
+	write_seqcount_end(&cur->lock);
+}
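+
+/*
+ * Hedged caller sketch (kstat_glob.page_in is updated elsewhere in this
+ * patch series; the operation being timed is hypothetical):
+ *
+ *	unsigned long flags;
+ *	u64 start = ktime_to_ns(ktime_get());
+ *
+ *	... the operation being measured ...
+ *
+ *	local_irq_save(flags);
+ *	KSTAT_LAT_PCPU_ADD(&kstat_glob.page_in, smp_processor_id(),
+ *			   ktime_to_ns(ktime_get()) - start);
+ *	local_irq_restore(flags);
+ */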
+
+/*
+ * Move current statistics to last, clear last.
+ * Serialization is the caller's due.
+ */
+void KSTAT_LAT_UPDATE(struct kstat_lat_struct *p)
+{
+	u64 m;
+	memcpy(&p->last, &p->cur, sizeof(p->last));
+	p->cur.maxlat = 0;
+	m = p->last.maxlat;
+	CALC_LOAD(p->avg[0], EXP_1, m);
+	CALC_LOAD(p->avg[1], EXP_5, m);
+	CALC_LOAD(p->avg[2], EXP_15, m);
+}
+EXPORT_SYMBOL(KSTAT_LAT_UPDATE);
+
+void KSTAT_LAT_PCPU_UPDATE(struct kstat_lat_pcpu_struct *p)
+{
+	unsigned i, cpu;
+	struct kstat_lat_pcpu_snap_struct snap, *cur;
+	u64 m;
+
+	memset(&p->last, 0, sizeof(p->last));
+	for_each_online_cpu(cpu) {
+		cur = per_cpu_ptr(p->cur, cpu);
+		do {
+			i = read_seqcount_begin(&cur->lock);
+			memcpy(&snap, cur, sizeof(snap));
+		} while (read_seqcount_retry(&cur->lock, i));
+		/*
+		 * The read above and this reset of maxlat are not atomic,
+		 * but that is OK: it happens rarely, and losing a couple
+		 * of peaks is not essential. xemul
+		 */
+		cur->maxlat = 0;
+
+		p->last.count += snap.count;
+		p->last.totlat += snap.totlat;
+		if (p->last.maxlat < snap.maxlat)
+			p->last.maxlat = snap.maxlat;
+	}
+
+	m = (p->last.maxlat > p->max_snap ? p->last.maxlat : p->max_snap);
+	CALC_LOAD(p->avg[0], EXP_1, m);
+	CALC_LOAD(p->avg[1], EXP_5, m);
+	CALC_LOAD(p->avg[2], EXP_15, m);
+	/* reset max_snap to calculate it correctly next time */
+	p->max_snap = 0;
+}
+EXPORT_SYMBOL(KSTAT_LAT_PCPU_UPDATE);
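+
+/*
+ * For reference, CALC_LOAD used above is the kernel's standard
+ * fixed-point exponential moving average (as defined in <linux/sched.h>
+ * on kernels of this vintage), with EXP_1/EXP_5/EXP_15 the per-LOAD_FREQ
+ * decay factors for the 1/5/15-minute averages:
+ *
+ *	#define FSHIFT		11
+ *	#define FIXED_1		(1 << FSHIFT)
+ *	#define CALC_LOAD(load, exp, n)		\
+ *		load *= exp;			\
+ *		load += n * (FIXED_1 - exp);	\
+ *		load >>= FSHIFT;
+ *
+ * i.e. avg = avg * e + m * (1 - e), with e = exp / FIXED_1.
+ */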
--- /dev/null
+++ b/kernel/ve/vzwdog.c
@@ -0,0 +1,353 @@
+/*
+ *  kernel/ve/vzwdog.c
+ *
+ *  Copyright (c) 2000-2008 SWsoft
+ *  Copyright (c) 2009-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/list.h>
+#include <linux/ctype.h>
+#include <linux/kobject.h>
+#include <linux/genhd.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kernel_stat.h>
+#include <linux/errno.h>
+#include <linux/suspend.h>
+#include <linux/ve.h>
+#include <linux/vzstat.h>
+#include <asm/uaccess.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+/* Stuff regarding the kernel thread polling VE validity */
+static int sleep_timeout = 60;
+static struct task_struct *wdog_thread_tsk;
+
+static struct file *intr_file;
+static char page[PAGE_SIZE];
+
+static void parse_irq_list(int len)
+{
+	int i, k, skip;
+	for (i = 0; i < len; ) {
+		k = i;
+		while (i < len && page[i] != '\n' && page[i] != ':')
+			i++;
+		skip = 0;
+		if (i < len && page[i] != '\n') {
+			i++; /* skip ':' */
+			while (i < len && (page[i] == ' ' || page[i] == '0'))
+				i++;
+			skip = (i < len && (page[i] < '0' || page[i] > '9'));
+			while (i < len && page[i] != '\n')
+				i++;
+		}
+		if (!skip)
+			printk("%.*s\n", i - k, page + k);
+		if (i < len)
+			i++; /* skip '\n' */
+	}
+}
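+
+/*
+ * Illustration (made-up /proc/interrupts excerpt): a header line has no
+ * ':' and is always printed; a counter line is skipped only when every
+ * character after the ':' up to the first non-{space,'0'} is zero:
+ *
+ *	           CPU0
+ *	  0:  1234567  IO-APIC  timer	<- printed
+ *	 25:        0  PCI-MSI  eth2	<- skipped
+ */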
+
+static void show_irq_list(void)
+{
+	mm_segment_t fs;
+	int r;
+
+	fs = get_fs();
+	set_fs(KERNEL_DS);
+	vfs_llseek(intr_file, 0, 0);
+	r = vfs_read(intr_file, (void __user *)page, sizeof(page),
+			&intr_file->f_pos);
+	set_fs(fs);
+
+	if (r > 0)
+		parse_irq_list(r);
+}
+
+static u64 max_sched_lat;
+static u64 max_alloc_lat[KSTAT_ALLOCSTAT_NR];
+
+static void update_max_alloc_latency(void)
+{
+	int i;
+
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++)
+		max_alloc_lat[i] = max(max_alloc_lat[i],
+				kstat_glob.alloc_lat[i].last.maxlat);
+}
+
+static void update_max_schedule_latency(void)
+{
+	max_sched_lat = max(max_sched_lat, kstat_glob.sched_lat.last.maxlat);
+}
+
+static void update_max_latencies(void)
+{
+	spin_lock_irq(&kstat_glb_lock);
+	update_max_alloc_latency();
+	update_max_schedule_latency();
+	spin_unlock_irq(&kstat_glb_lock);
+}
+
+static void reset_max_latencies(void)
+{
+	max_sched_lat = 0;
+	memset(max_alloc_lat, 0, sizeof(max_alloc_lat));
+}
+
+static void show_alloc_latency(void)
+{
+	static const char *alloc_descr[KSTAT_ALLOCSTAT_NR] = {
+		"A0",
+		"L0",
+		"H0",
+		"L1",
+		"H1"
+	};
+	int i;
+
+	printk("lat: ");
+	for (i = 0; i < KSTAT_ALLOCSTAT_NR; i++) {
+		struct kstat_lat_pcpu_struct *p;
+		u64 maxlat, avg0, avg1, avg2;
+
+		p = &kstat_glob.alloc_lat[i];
+		spin_lock_irq(&kstat_glb_lock);
+		maxlat = p->last.maxlat;
+		avg0 = p->avg[0];
+		avg1 = p->avg[1];
+		avg2 = p->avg[2];
+		spin_unlock_irq(&kstat_glb_lock);
+
+		printk("%s %Lu %Lu (%Lu %Lu %Lu)",
+				alloc_descr[i],
+				(unsigned long long)max_alloc_lat[i],
+				(unsigned long long)maxlat,
+				(unsigned long long)avg0,
+				(unsigned long long)avg1,
+				(unsigned long long)avg2);
+	}
+	printk("\n");
+}
+
+static void show_schedule_latency(void)
+{
+	struct kstat_lat_pcpu_struct *p;
+	cycles_t maxlat, totlat, avg0, avg1, avg2;
+	unsigned long count;
+
+	p = &kstat_glob.sched_lat;
+	spin_lock_irq(&kstat_glb_lock);
+	maxlat = p->last.maxlat;
+	totlat = p->last.totlat;
+	count = p->last.count;
+	avg0 = p->avg[0];
+	avg1 = p->avg[1];
+	avg2 = p->avg[2];
+	spin_unlock_irq(&kstat_glb_lock);
+
+	printk("sched lat: %Lu/%Lu/%Lu/%lu (%Lu %Lu %Lu)\n",
+			(unsigned long long)max_sched_lat,
+			(unsigned long long)maxlat,
+			(unsigned long long)totlat,
+			count,
+			(unsigned long long)avg0,
+			(unsigned long long)avg1,
+			(unsigned long long)avg2);
+}
+
+static void show_header(void)
+{
+	struct timeval tv;
+
+	do_gettimeofday(&tv);
+	preempt_disable();
+	printk("*** VZWDOG 1.14: time %lu.%06lu uptime %Lu CPU %d ***\n",
+			tv.tv_sec, (long)tv.tv_usec,
+			(unsigned long long)get_jiffies_64(),
+			smp_processor_id());
+	printk("*** jiffies_per_second %u ***\n", HZ);
+	preempt_enable();
+}
+
+static void show_pgdatinfo(void)
+{
+	pg_data_t *pgdat;
+
+	printk("pgdat:");
+	for_each_online_pgdat(pgdat) {
+		printk(" %d: %lu,%lu,%lu",
+				pgdat->node_id,
+				pgdat->node_start_pfn,
+				pgdat->node_present_pages,
+				pgdat->node_spanned_pages);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+		printk(",%p", pgdat->node_mem_map);
+#endif
+	}
+	printk("\n");
+}
+
+static int show_partitions_io(struct gendisk *gp)
+{
+	struct disk_part_iter piter;
+	struct hd_struct *hd;
+	char buf[BDEVNAME_SIZE];
+	int cpu;
+
+	/*
+	if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
+		seq_puts(seqf,	"major minor name"
+				"     rio rmerge rsect ruse wio wmerge "
+				"wsect wuse running use aveq"
+				"\n\n");
+	*/
+
+	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
+	while ((hd = disk_part_iter_next(&piter))) {
+		cpu = part_stat_lock();
+		part_round_stats(cpu, hd);
+		part_stat_unlock();
+		printk("%4d %7d %s %lu %lu %llu "
+			   "%u %lu %lu %llu %u %u %u %u\n",
+			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
+			   disk_name(gp, hd->partno, buf),
+			   part_stat_read(hd, ios[0]),
+			   part_stat_read(hd, merges[0]),
+			   (unsigned long long)part_stat_read(hd, sectors[0]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[0])),
+			   part_stat_read(hd, ios[1]),
+			   part_stat_read(hd, merges[1]),
+			   (unsigned long long)part_stat_read(hd, sectors[1]),
+			   jiffies_to_msecs(part_stat_read(hd, ticks[1])),
+			   part_in_flight(hd),
+			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
+			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
+			);
+	}
+	disk_part_iter_exit(&piter);
+
+	return 0;
+}
+
+static int show_one_disk_io(struct device *dev, void *x)
+{
+	char *name;
+	char buf[BDEVNAME_SIZE];
+	struct gendisk *gd;
+
+	if (dev->type != &disk_type)
+		return 0;
+
+	gd = dev_to_disk(dev);
+
+	name = disk_name(gd, 0, buf);
+	if ((strlen(name) > 4) && (strncmp(name, "loop", 4) == 0) &&
+			isdigit(name[4]))
+		return 0;
+
+	if ((strlen(name) > 3) && (strncmp(name, "ram", 3) == 0) &&
+			isdigit(name[3]))
+		return 0;
+
+	show_partitions_io(gd);
+
+	return 0;
+}
+
+static void show_diskio(void)
+{
+	printk("disk_io: ");
+	class_for_each_device(&block_class, NULL, NULL, show_one_disk_io);
+	printk("\n");
+}
+
+static void show_nrprocs(void)
+{
+	unsigned long _nr_running, _nr_sleeping,
+			_nr_unint, _nr_zombie, _nr_dead, _nr_stopped;
+
+	_nr_running = nr_running();
+	_nr_unint = nr_uninterruptible();
+	_nr_sleeping = nr_sleeping();
+	_nr_zombie = nr_zombie;
+	_nr_dead = atomic_read(&nr_dead);
+	_nr_stopped = nr_stopped();
+
+	printk("VEnum: %d, proc R %lu, S %lu, D %lu, "
+		"Z %lu, X %lu, T %lu (tot %d)\n",
+		nr_ve,	_nr_running, _nr_sleeping, _nr_unint,
+		_nr_zombie, _nr_dead, _nr_stopped, nr_threads);
+}
+
+static void wdog_print(void)
+{
+	show_header();
+	show_irq_list();
+	show_pgdatinfo();
+	show_mem(SHOW_MEM_FILTER_NODES);
+	show_diskio();
+	show_schedule_latency();
+	show_alloc_latency();
+	show_nrprocs();
+}
+
+static int wdog_loop(void *data)
+{
+	unsigned long next_print;
+	long timeout;
+
+	next_print = jiffies;
+	while (1) {
+		update_max_latencies();
+		if (time_is_before_eq_jiffies(next_print)) {
+			wdog_print();
+			reset_max_latencies();
+			next_print = jiffies + sleep_timeout * HZ;
+		}
+		try_to_freeze();
+
+		set_current_state(TASK_UNINTERRUPTIBLE);
+		if (kthread_should_stop()) {
+			__set_current_state(TASK_RUNNING);
+			break;
+		}
+		timeout = clamp_t(long, next_print - jiffies, 0, LOAD_FREQ);
+		schedule_timeout(timeout);
+	}
+	return 0;
+}
+
+static int __init wdog_init(void)
+{
+	struct file *file;
+
+	file = filp_open("/proc/interrupts", 0, 0);
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+	intr_file = file;
+
+	wdog_thread_tsk = kthread_run(wdog_loop, NULL, "vzwdog");
+	if (IS_ERR(wdog_thread_tsk)) {
+		filp_close(intr_file, NULL);
+		return -EBUSY;
+	}
+	return 0;
+}
+
+static void __exit wdog_exit(void)
+{
+	kthread_stop(wdog_thread_tsk);
+	filp_close(intr_file, NULL);
+}
+
+module_param(sleep_timeout, int, 0660);
+MODULE_AUTHOR("SWsoft <devel@openvz.org>");
+MODULE_DESCRIPTION("Virtuozzo WDOG");
+MODULE_LICENSE("GPL v2");
+
+module_init(wdog_init)
+module_exit(wdog_exit)
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -3059,6 +3059,7 @@ int schedule_on_each_cpu(work_func_t func)
 	free_percpu(works);
 	return 0;
 }
+EXPORT_SYMBOL_GPL(schedule_on_each_cpu);
 
 /**
  * flush_scheduled_work - ensure that any scheduled work has run to completion.
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -445,6 +445,14 @@ config MPILIB
 	  It is used to implement RSA digital signature verification,
 	  which is used by IMA/EVM digital signature extension.
 
+config MPILIB_EXTRA
+	bool
+	depends on MPILIB
+	help
+	  Additional sources for the multiprecision maths library from GnuPG.
+	  This code is not needed for RSA digital signature verification,
+	  but can be compiled in if required.
+
 config SIGNATURE
 	tristate
 	depends on KEYS && CRYPTO
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -174,6 +174,27 @@ config DEBUG_KERNEL
 	  Say Y here if you are developing drivers or trying to debug and
 	  identify kernel problems.
 
+config ARCH_HAS_KCOV
+	bool
+	help
+	  KCOV does not have any arch-specific code, but currently it is enabled
+	  only for x86_64. KCOV requires testing on other archs, and most
+	  likely needs instrumentation disabled for some early boot code.
+
+config KCOV
+	bool "Code coverage for fuzzing"
+	depends on ARCH_HAS_KCOV
+	select DEBUG_FS
+	help
+	  KCOV exposes kernel code coverage information in a form suitable
+	  for coverage-guided fuzzing (randomized testing).
+
+	  If RANDOMIZE_BASE is enabled, PC values will not be stable across
+	  different machines and across reboots. If you need stable PC values,
+	  disable RANDOMIZE_BASE.
+
+	  For more details, see Documentation/kcov.txt.
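+
+#	For reference, a condensed version of the usage sketch from
+#	Documentation/kcov.txt (error handling elided; constants as
+#	defined there):
+#
+#	#define KCOV_INIT_TRACE	_IOR('c', 1, unsigned long)
+#	#define KCOV_ENABLE	_IO('c', 100)
+#	#define KCOV_DISABLE	_IO('c', 101)
+#	#define COVER_SIZE	(64 << 10)
+#
+#	int fd = open("/sys/kernel/debug/kcov", O_RDWR);
+#	unsigned long *cover;
+#
+#	ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE);
+#	cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
+#		     PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+#	ioctl(fd, KCOV_ENABLE, 0);
+#	read(-1, NULL, 0);		/* the syscall under test */
+#	/* cover[0] holds the number of PCs recorded in cover[1..] */
+#	ioctl(fd, KCOV_DISABLE, 0);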
+
 config DEBUG_SHIRQ
 	bool "Debug shared IRQ handlers"
 	depends on DEBUG_KERNEL && GENERIC_HARDIRQS
@@ -1540,6 +1561,8 @@ source "lib/Kconfig.kgdb"
 
 source "lib/Kconfig.kmemcheck"
 
+source "lib/Kconfig.kasan"
+
 config TEST_STRING_HELPERS
 	tristate "Test functions located in the string_helpers module at runtime"
 
--- /dev/null
+++ b/lib/Kconfig.kasan
@@ -0,0 +1,54 @@
+config HAVE_ARCH_KASAN
+	bool
+
+if HAVE_ARCH_KASAN
+
+config KASAN
+	bool "KASan: runtime memory debugger"
+	depends on SLUB_DEBUG
+	select CONSTRUCTORS
+	help
+	  Enables KASan (kernel address sanitizer) - a runtime memory
+	  debugger designed to find out-of-bounds accesses and
+	  use-after-free bugs. This is strictly a debugging feature; it
+	  consumes about 1/8 of available memory and causes roughly a 3x
+	  performance slowdown. For better error detection enable
+	  CONFIG_STACKTRACE and add slub_debug=U to the boot cmdline.
+
+config KASAN_SHADOW_OFFSET
+	hex
+	default 0xdffffc0000000000 if X86_64
+
+choice
+	prompt "Instrumentation type"
+	depends on KASAN
+	default KASAN_OUTLINE
+
+config KASAN_OUTLINE
+	bool "Outline instrumentation"
+	help
+	  Before every memory access the compiler inserts a call to
+	  __asan_load*/__asan_store*. These functions check the shadow
+	  memory. This is slower than inline instrumentation, but it
+	  doesn't bloat the kernel's .text section as much as inline
+	  instrumentation does.
+
+config KASAN_INLINE
+	bool "Inline instrumentation"
+	help
+	  The compiler directly inserts code that checks the shadow memory
+	  before memory accesses. This is faster than outline
+	  instrumentation (in some workloads it gives about a 2x boost),
+	  but it makes the kernel's .text section much bigger.
+
+endchoice
+
+config TEST_KASAN
+	tristate "Module for testing kasan for bug detection"
+	depends on m && KASAN
+	help
+	  This is a test module that performs various nasty operations,
+	  such as out-of-bounds and use-after-free accesses. It is useful
+	  for testing kernel debugging features like the kernel address
+	  sanitizer.
+
+endif
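+
+# For illustration, the kind of bug the TEST_KASAN module provokes and
+# KASan reports (sketch modeled on lib/test_kasan.c; sizes made up):
+#
+#	char *ptr = kmalloc(17, GFP_KERNEL);
+#
+#	ptr[17] = 'x';	/* out-of-bounds write  -> KASan report */
+#	kfree(ptr);
+#	*ptr = 'y';	/* use-after-free write -> KASan report */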
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -7,6 +7,18 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS)
 KBUILD_CFLAGS = $(subst -pg,,$(ORIG_CFLAGS))
 endif
 
+# These files are disabled because they produce lots of non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. For example,
+# rbtree can be global and individual rotations don't correlate with inputs.
+KCOV_INSTRUMENT_string.o := n
+KCOV_INSTRUMENT_rbtree.o := n
+KCOV_INSTRUMENT_list_debug.o := n
+KCOV_INSTRUMENT_debugobjects.o := n
+KCOV_INSTRUMENT_dynamic_debug.o := n
+# Kernel does not boot if we instrument this file as it uses custom calling
+# convention (see CONFIG_ARCH_HWEIGHT_CFLAGS).
+KCOV_INSTRUMENT_hweight.o := n
+
 lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o timerqueue.o\
 	 idr.o int_sqrt.o extable.o \
@@ -32,6 +44,9 @@ obj-$(CONFIG_TEST_STRING_HELPERS) += test-string_helpers.o
 obj-y += kstrtox.o
 obj-$(CONFIG_TEST_KSTRTOX) += test-kstrtox.o
 obj-$(CONFIG_TEST_RHASHTABLE) += test_rhashtable.o
+obj-$(CONFIG_TEST_KASAN) += test_kasan.o
+
+obj-y += kmapset.o
 
 ifeq ($(CONFIG_DEBUG_KOBJECT),y)
 CFLAGS_kobject.o += -DDEBUG
--- a/lib/debug_locks.c
+++ b/lib/debug_locks.c
@@ -42,6 +42,13 @@ int debug_locks_off(void)
 			console_verbose();
 			return 1;
 		}
+
+		/*
+		 * Taint the kernel so that tests can easily detect that a
+		 * lockdep-related problem was reported.
+		 */
+		add_taint(TAINT_CRAP, LOCKDEP_STILL_OK);
 	}
 	return 0;
 }
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -250,7 +250,7 @@ static int sub_alloc(struct idr *idp, int *starting_id, struct idr_layer **pa,
 			id = (id | ((1 << (IDR_BITS * l)) - 1)) + 1;
 
 			/* if already at the top layer, we need to grow */
-			if (id >= 1 << (idp->layers * IDR_BITS)) {
+			if (id > idr_max(idp->layers)) {
 				*starting_id = id;
 				return -EAGAIN;
 			}
@@ -524,9 +524,7 @@ EXPORT_SYMBOL(idr_alloc_cyclic);
 
 static void idr_remove_warning(int id)
 {
-	printk(KERN_WARNING
-		"idr_remove called for id=%d which is not allocated.\n", id);
-	dump_stack();
+	WARN(1, "idr_remove called for id=%d which is not allocated.\n", id);
 }
 
 static void sub_remove(struct idr *idp, int shift, int id)
@@ -832,12 +830,10 @@ void *idr_replace(struct idr *idp, void *ptr, int id)
 	if (!p)
 		return ERR_PTR(-EINVAL);
 
-	n = (p->layer+1) * IDR_BITS;
-
-	if (id >= (1 << n))
+	if (id > idr_max(p->layer + 1))
 		return ERR_PTR(-EINVAL);
 
-	n -= IDR_BITS;
+	n = p->layer * IDR_BITS;
 	while ((n > 0) && p) {
 		p = p->ary[(id >> n) & IDR_MASK];
 		n -= IDR_BITS;
@@ -1077,8 +1073,7 @@ void ida_remove(struct ida *ida, int id)
 	return;
 
  err:
-	printk(KERN_WARNING
-	       "ida_remove called for id=%d which is not allocated.\n", id);
+	WARN(1, "ida_remove called for id=%d which is not allocated.\n", id);
 }
 EXPORT_SYMBOL(ida_remove);
 
--- /dev/null
+++ b/lib/kmapset.c
@@ -0,0 +1,339 @@
+/*
+ *  lib/kmapset.c
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/kmapset.h>
+#include <linux/slab.h>
+#include <linux/hash.h>
+
+struct kmapset_map *kmapset_new(struct kmapset_set *set)
+{
+	struct kmapset_map *map;
+
+	map = kmalloc(sizeof(struct kmapset_map), GFP_KERNEL);
+	if (!map)
+		return NULL;
+	kmapset_init_map(map, set);
+	return map;
+}
+
+static void kmapset_free(struct kmapset_map *map)
+{
+	struct kmapset_link *link;
+	struct hlist_node *next;
+
+	hlist_for_each_entry_safe(link, next, &map->links, map_link)
+		kfree_rcu(link, rcu_head);
+	kfree_rcu(map, rcu_head);
+}
+
+static long kmapset_cmp(struct kmapset_map *map_a, struct kmapset_map *map_b)
+{
+	struct kmapset_link *link_a, *link_b;
+
+	if (map_a->hash != map_b->hash)
+		return map_a->hash - map_b->hash;
+
+	if (map_a->size != map_b->size)
+		return map_a->size - map_b->size;
+
+	link_a = hlist_entry(map_a->links.first,
+			struct kmapset_link, map_link);
+	link_b = hlist_entry(map_b->links.first,
+			struct kmapset_link, map_link);
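+	/*
+	 * Lock-step walk over both lists (their lengths are equal, see
+	 * the size check above).  hlist_entry() applied to a NULL node
+	 * pointer yields a pointer whose &->map_link is NULL again, so
+	 * this condition terminates at the end of the lists.
+	 */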
+	while (&link_a->map_link) {
+		if (link_a->key != link_b->key)
+			return (long)link_a->key - (long)link_b->key;
+		if (link_a->value != link_b->value)
+			return link_a->value - link_b->value;
+		link_a = list_entry(link_a->map_link.next,
+				struct kmapset_link, map_link);
+		link_b = list_entry(link_b->map_link.next,
+				struct kmapset_link, map_link);
+	}
+
+	return map_a->default_value - map_b->default_value;
+}
+
+static inline bool kmapset_hashed(struct kmapset_map *map)
+{
+	return !RB_EMPTY_NODE(&map->node);
+}
+
+static bool kmapset_hash(struct kmapset_map *map, struct kmapset_map **old)
+{
+	struct rb_node **p = &map->set->tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct kmapset_map *cur;
+	struct kmapset_link *link;
+	long diff;
+
+	map->hash = hash_long(map->default_value, BITS_PER_LONG);
+	hlist_for_each_entry(link, &map->links, map_link)
+		map->hash ^= hash_ptr(link->key, BITS_PER_LONG) *
+			     hash_long(link->value, BITS_PER_LONG);
+
+	while (*p) {
+		parent = *p;
+		cur = rb_entry(parent, struct kmapset_map, node);
+		diff = kmapset_cmp(map, cur);
+		if (diff < 0)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+		if (!diff && old) {
+			*old = cur;
+			return true;
+		}
+	}
+	rb_link_node(&map->node, parent, p);
+	rb_insert_color(&map->node, &map->set->tree);
+	return false;
+}
+
+static void kmapset_unhash(struct kmapset_map *map)
+{
+	rb_erase(&map->node, &map->set->tree);
+	RB_CLEAR_NODE(&map->node);
+}
+
+static void kmapset_rehash(struct kmapset_map *map)
+{
+	if (kmapset_hashed(map)) {
+		kmapset_unhash(map);
+		kmapset_hash(map, NULL);
+	}
+}
+
+struct kmapset_map *kmapset_get(struct kmapset_map *map)
+{
+	if (map)
+		kref_get(&map->kref);
+	return map;
+}
+
+static void kmapset_release(struct kref *kref)
+{
+	struct kmapset_map *map = container_of(kref, struct kmapset_map, kref);
+	struct kmapset_set *set = map->set;
+	struct kmapset_link *link;
+
+	if (kmapset_hashed(map))
+		kmapset_unhash(map);
+	hlist_for_each_entry(link, &map->links, map_link)
+		hlist_del(&link->key_link);
+	mutex_unlock(&set->mutex);
+
+	kmapset_free(map);
+}
+
+void kmapset_put(struct kmapset_map *map)
+{
+	if (map)
+		kref_put_mutex(&map->kref, kmapset_release, &map->set->mutex);
+}
+
+/*
+ * kmapset_commit - hash a new map into the set, or look up an existing copy
+ *
+ * After committing, the map must stay immutable.
+ */
+struct kmapset_map *kmapset_commit(struct kmapset_map *map)
+{
+	struct kmapset_set *set = map->set;
+	struct kmapset_map *ret = map;
+
+	kmapset_lock(set);
+	if (kmapset_hash(map, &ret)) {
+		kmapset_get(ret);
+		kmapset_release(&map->kref);
+	} else
+		kmapset_unlock(set);
+
+	return ret;
+}
+
+/*
+ * kmapset_copy - copy content of one set to another
+ */
+static int kmapset_copy(struct kmapset_map *dst, struct kmapset_map *src)
+{
+	struct kmapset_set *set = src->set;
+	struct kmapset_link *old_link, *new_link;
+	struct hlist_node *next;
+	int i;
+
+	for (i = src->size; i; i--) {
+		new_link = kmalloc(sizeof(struct kmapset_link), GFP_KERNEL);
+		if (!new_link)
+			return -ENOMEM;
+		hlist_add_head(&new_link->map_link, &dst->links);
+	}
+
+	kmapset_lock(set);
+	dst->default_value = src->default_value;
+	new_link = hlist_entry(dst->links.first, struct kmapset_link, map_link);
+	hlist_for_each_entry(old_link, &src->links, map_link) {
+		new_link->key = old_link->key;
+		new_link->value = old_link->value;
+		new_link->map = dst;
+		dst->size++;
+		hlist_add_head(&new_link->key_link, &new_link->key->links);
+		new_link = hlist_entry(new_link->map_link.next,
+				struct kmapset_link, map_link);
+	}
+	kmapset_unlock(set);
+
+	/* free any surplus preallocated links; as in kmapset_cmp(), the
+	 * walk terminates once &new_link->map_link becomes NULL */
+	while (&new_link->map_link) {
+		next = new_link->map_link.next;
+		hlist_del(&new_link->map_link);
+		kfree(new_link);
+		new_link = hlist_entry(next, struct kmapset_link, map_link);
+	}
+
+	return 0;
+}
+
+struct kmapset_map *kmapset_dup(struct kmapset_map *map)
+{
+	struct kmapset_map *new;
+
+	new = kmapset_new(map->set);
+	if (!new)
+		return NULL;
+
+	if (kmapset_copy(new, map)) {
+		kmapset_free(new);
+		return NULL;
+	}
+
+	return new;
+}
+
+/*
+ * kmapset_value - lookup link object for given key
+ *
+ * requires kmapset_lock or rcu_read_lock
+ */
+struct kmapset_link *
+kmapset_lookup(struct kmapset_map *map, struct kmapset_key *key)
+{
+	struct kmapset_link *link;
+
+	hlist_for_each_entry_rcu(link, &map->links, map_link) {
+		if (link->key == key)
+			return link;
+		if (link->key > key)
+			break;
+	}
+	return NULL;
+}
+
+/*
+ * kmapset_get_value - retrieve value for given key
+ */
+unsigned long
+kmapset_get_value(struct kmapset_map *map, struct kmapset_key *key)
+{
+	struct kmapset_link *link;
+	unsigned long value;
+
+	rcu_read_lock();
+	link = kmapset_lookup(map, key);
+	value = link ? link->value : map->default_value;
+	rcu_read_unlock();
+	return value;
+}
+
+int kmapset_set_value(struct kmapset_map *map,
+		struct kmapset_key *key, unsigned long value)
+{
+	struct kmapset_set *set = map->set;
+	struct kmapset_link *new_link, *old_link, *last_link = NULL;
+
+	new_link = kmalloc(sizeof(struct kmapset_link), GFP_KERNEL);
+	if (!new_link)
+		return -ENOMEM;
+
+	new_link->key = key;
+	new_link->value = value;
+	new_link->map = map;
+
+	kmapset_lock(set);
+	if (hlist_empty(&map->links)) {
+		hlist_add_head_rcu(&new_link->map_link, &map->links);
+	} else {
+		hlist_for_each_entry(old_link, &map->links, map_link) {
+			last_link = old_link;
+			if (old_link->key < key)
+				continue;
+			if (old_link->key == key) {
+				old_link->value = value;
+				kfree(new_link);
+				goto out;
+			}
+			hlist_add_before_rcu(&new_link->map_link,
+					     &old_link->map_link);
+			goto add;
+		}
+		hlist_add_after_rcu(&last_link->map_link, &new_link->map_link);
+	}
+add:
+	hlist_add_head(&new_link->key_link, &new_link->key->links);
+	map->size++;
+out:
+	kmapset_unlock(set);
+
+	return 0;
+}
+
+bool kmapset_del_value(struct kmapset_map *map, struct kmapset_key *key)
+{
+	struct kmapset_set *set = map->set;
+	struct kmapset_link *link;
+	bool ret = false;
+
+	kmapset_lock(set);
+	link = kmapset_lookup(map, key);
+	if (link) {
+		hlist_del_rcu(&link->map_link);
+		hlist_del(&link->key_link);
+		kfree_rcu(link, rcu_head);
+		ret = true;
+	}
+	kmapset_unlock(set);
+	return ret;
+}
+
+void kmapset_set_default(struct kmapset_map *map, unsigned long value)
+{
+	struct kmapset_set *set = map->set;
+
+	kmapset_lock(set);
+	map->default_value = value;
+	kmapset_unlock(set);
+}
+
+/*
+ * kmapset_unlink - unlink key from all maps in set
+ */
+void kmapset_unlink(struct kmapset_key *key, struct kmapset_set *set)
+{
+	struct kmapset_link *link;
+	struct kmapset_map *map;
+	struct hlist_node *next;
+
+	kmapset_lock(set);
+	hlist_for_each_entry_safe(link, next, &key->links, key_link) {
+		map = link->map;
+		hlist_del(&link->key_link);
+		hlist_del_rcu(&link->map_link);
+		map->size--;
+		kfree_rcu(link, rcu_head);
+		kmapset_rehash(map);
+	}
+	kmapset_unlock(set);
+}
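
A minimal usage sketch of the interface above, illustrative only:
kmapset_init_set() and kmapset_init_key() are assumed names for the
initializers in the kmapset header, which this patch does not show here.

	static struct kmapset_set fruit_set;	/* kmapset_init_set() assumed */
	static struct kmapset_key apple;	/* kmapset_init_key() assumed */

	static int kmapset_example(void)
	{
		struct kmapset_map *map, *shared;

		map = kmapset_new(&fruit_set);	/* private, mutable map */
		if (!map)
			return -ENOMEM;
		kmapset_set_default(map, 1);
		if (kmapset_set_value(map, &apple, 42)) {
			kmapset_put(map);
			return -ENOMEM;
		}
		/* dedup: returns either map itself (now hashed) or an
		 * existing identical map with a reference taken */
		shared = kmapset_commit(map);
		WARN_ON(kmapset_get_value(shared, &apple) != 42);
		kmapset_put(shared);
		return 0;
	}
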
--- a/lib/kobject.c
+++ b/lib/kobject.c
@@ -635,6 +635,7 @@ struct kobject *kobject_create(void)
 	kobject_init(kobj, &dynamic_kobj_ktype);
 	return kobj;
 }
+EXPORT_SYMBOL(kobject_create);
 
 /**
  * kobject_create_and_add - create a struct kobject dynamically and register it with sysfs
--- a/lib/kobject_uevent.c
+++ b/lib/kobject_uevent.c
@@ -27,6 +27,7 @@
 #include <net/sock.h>
 #include <net/net_namespace.h>
 
+#include <linux/ve.h>
 
 u64 uevent_seqnum;
 char uevent_helper[UEVENT_HELPER_PATH_LEN] = CONFIG_UEVENT_HELPER_PATH;
@@ -128,7 +129,7 @@ static int kobj_usermode_filter(struct kobject *kobj)
  * Returns 0 if kobject_uevent_env() is completed with success or the
  * corresponding error when it fails.
  */
-int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
+int kobject_uevent_env_one(struct kobject *kobj, enum kobject_action action,
 		       char *envp_ext[])
 {
 	struct kobj_uevent_env *env;
@@ -246,7 +247,7 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 
 	mutex_lock(&uevent_sock_mutex);
 	/* we will send an event, so request a new sequence number */
-	retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++uevent_seqnum);
+	retval = add_uevent_var(env, "SEQNUM=%llu", (unsigned long long)++ve_uevent_seqnum);
 	if (retval) {
 		mutex_unlock(&uevent_sock_mutex);
 		goto exit;
@@ -262,6 +263,9 @@ int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
 		if (!netlink_has_listeners(uevent_sock, 1))
 			continue;
 
+		if (sock_net(uevent_sock)->owner_ve != get_exec_env())
+			continue;
+
 		/* allocate message with the maximum possible size */
 		len = strlen(action_string) + strlen(devpath) + 2;
 		skb = alloc_skb(len + env->buflen, GFP_KERNEL);
@@ -319,6 +323,12 @@ exit:
 }
 EXPORT_SYMBOL_GPL(kobject_uevent_env);
 
+int kobject_uevent_env(struct kobject *kobj, enum kobject_action action,
+			char *envp_ext[])
+{
+	return kobject_uevent_env_one(kobj, action, envp_ext);
+}
+
 /**
  * kobject_uevent - notify userspace by sending an uevent
  *
--- a/lib/mpi/Makefile
+++ b/lib/mpi/Makefile
@@ -20,3 +20,14 @@ mpi-y = \
 	mpih-mul.o			\
 	mpi-pow.o			\
 	mpiutil.o
+
+mpi-$(CONFIG_MPILIB_EXTRA) += \
+	mpi-add.o			\
+	mpi-div.o			\
+	mpi-cmp.o			\
+	mpi-gcd.o			\
+	mpi-inline.o			\
+	mpi-inv.o			\
+	mpi-mpow.o			\
+	mpi-mul.o			\
+	mpi-scan.o
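
These extra objects build only when CONFIG_MPILIB_EXTRA is set; the Kconfig
symbol itself is assumed to be introduced elsewhere in this series:

	CONFIG_MPILIB_EXTRA=y
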
--- /dev/null
+++ b/lib/mpi/generic_mpi-asm-defs.h
@@ -0,0 +1,4 @@
+/* This file defines some basic constants for the MPI machinery.  We
+ * need to define the types on a per-CPU basis, so it is done with
+ * this file here.  */
+#define BYTES_PER_MPI_LIMB  (SIZEOF_UNSIGNED_LONG)
--- /dev/null
+++ b/lib/mpi/mpi-add.c
@@ -0,0 +1,234 @@
+/* mpi-add.c  -  MPI functions
+ *	Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Add the unsigned integer V to the mpi-integer U and store the
+ * result in W. U and V may be the same.
+ */
+int mpi_add_ui(MPI w, const MPI u, unsigned long v)
+{
+	mpi_ptr_t wp, up;
+	mpi_size_t usize, wsize;
+	int usign, wsign;
+
+	usize = u->nlimbs;
+	usign = u->sign;
+	wsign = 0;
+
+	/* If there is not enough space for W (and a possible carry), grow it.  */
+	wsize = usize + 1;
+	if (w->alloced < wsize)
+		if (mpi_resize(w, wsize) < 0)
+			return -ENOMEM;
+
+	/* These must be after realloc (U may be the same as W).  */
+	up = u->d;
+	wp = w->d;
+
+	if (!usize) {		/* simple */
+		wp[0] = v;
+		wsize = v ? 1 : 0;
+	} else if (!usign) {	/* mpi is not negative */
+		mpi_limb_t cy;
+		cy = mpihelp_add_1(wp, up, usize, v);
+		wp[usize] = cy;
+		wsize = usize + cy;
+	} else {		/* The signs are different.  Need exact comparison to determine
+				 * which operand to subtract from which.  */
+		if (usize == 1 && up[0] < v) {
+			wp[0] = v - up[0];
+			wsize = 1;
+		} else {
+			mpihelp_sub_1(wp, up, usize, v);
+			/* Size can decrease with at most one limb. */
+			wsize = usize - (wp[usize - 1] == 0);
+			wsign = 1;
+		}
+	}
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+int mpi_add(MPI w, MPI u, MPI v)
+{
+	mpi_ptr_t wp, up, vp;
+	mpi_size_t usize, vsize, wsize;
+	int usign, vsign, wsign;
+
+	if (u->nlimbs < v->nlimbs) {	/* Swap U and V. */
+		usize = v->nlimbs;
+		usign = v->sign;
+		vsize = u->nlimbs;
+		vsign = u->sign;
+		wsize = usize + 1;
+		if (RESIZE_IF_NEEDED(w, wsize) < 0)
+			return -ENOMEM;
+		/* These must be after realloc (u or v may be the same as w).  */
+		up = v->d;
+		vp = u->d;
+	} else {
+		usize = u->nlimbs;
+		usign = u->sign;
+		vsize = v->nlimbs;
+		vsign = v->sign;
+		wsize = usize + 1;
+		if (RESIZE_IF_NEEDED(w, wsize) < 0)
+			return -ENOMEM;
+		/* These must be after realloc (u or v may be the same as w).  */
+		up = u->d;
+		vp = v->d;
+	}
+	wp = w->d;
+	wsign = 0;
+
+	if (!vsize) {		/* simple */
+		MPN_COPY(wp, up, usize);
+		wsize = usize;
+		wsign = usign;
+	} else if (usign != vsign) {	/* different sign */
+		/* This test is right since USIZE >= VSIZE */
+		if (usize != vsize) {
+			mpihelp_sub(wp, up, usize, vp, vsize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			wsign = usign;
+		} else if (mpihelp_cmp(up, vp, usize) < 0) {
+			mpihelp_sub_n(wp, vp, up, usize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			if (!usign)
+				wsign = 1;
+		} else {
+			mpihelp_sub_n(wp, up, vp, usize);
+			wsize = usize;
+			MPN_NORMALIZE(wp, wsize);
+			if (usign)
+				wsign = 1;
+		}
+	} else {		/* U and V have same sign. Add them. */
+		mpi_limb_t cy = mpihelp_add(wp, up, usize, vp, vsize);
+		wp[usize] = cy;
+		wsize = usize + cy;
+		if (usign)
+			wsign = 1;
+	}
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+/****************
+ * Subtract the unsigned integer V from the mpi-integer U and store the
+ * result in W.
+ */
+int mpi_sub_ui(MPI w, MPI u, unsigned long v)
+{
+	mpi_ptr_t wp, up;
+	mpi_size_t usize, wsize;
+	int usign, wsign;
+
+	usize = u->nlimbs;
+	usign = u->sign;
+	wsign = 0;
+
+	/* If there is not enough space for W (and a possible carry), grow it.  */
+	wsize = usize + 1;
+	if (w->alloced < wsize)
+		if (mpi_resize(w, wsize) < 0)
+			return -ENOMEM;
+
+	/* These must be after realloc (U may be the same as W).  */
+	up = u->d;
+	wp = w->d;
+
+	if (!usize) {		/* simple */
+		wp[0] = v;
+		wsize = v ? 1 : 0;
+		wsign = 1;
+	} else if (usign) {	/* mpi and v are negative */
+		mpi_limb_t cy;
+		cy = mpihelp_add_1(wp, up, usize, v);
+		wp[usize] = cy;
+		wsize = usize + cy;
+	} else {		/* The signs are different.  Need exact comparison to determine
+				 * which operand to subtract from which.  */
+		if (usize == 1 && up[0] < v) {
+			wp[0] = v - up[0];
+			wsize = 1;
+			wsign = 1;
+		} else {
+			mpihelp_sub_1(wp, up, usize, v);
+			/* Size can decrease with at most one limb. */
+			wsize = usize - (wp[usize - 1] == 0);
+		}
+	}
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+int mpi_sub(MPI w, MPI u, MPI v)
+{
+	int rc;
+
+	if (w == v) {
+		MPI vv;
+		if (mpi_copy(&vv, v) < 0)
+			return -ENOMEM;
+		vv->sign = !vv->sign;
+		rc = mpi_add(w, u, vv);
+		mpi_free(vv);
+	} else {
+		/* fixme: this is not thread-save (we temp. modify v) */
+		v->sign = !v->sign;
+		rc = mpi_add(w, u, v);
+		v->sign = !v->sign;
+	}
+	return rc;
+}
+
+int mpi_addm(MPI w, MPI u, MPI v, MPI m)
+{
+	if (mpi_add(w, u, v) < 0 || mpi_fdiv_r(w, w, m) < 0)
+		return -ENOMEM;
+	return 0;
+}
+
+int mpi_subm(MPI w, MPI u, MPI v, MPI m)
+{
+	if (mpi_sub(w, u, v) < 0 || mpi_fdiv_r(w, w, m) < 0)
+		return -ENOMEM;
+	return 0;
+}
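
The mixed-sign branches above follow schoolbook arithmetic: the smaller
magnitude is subtracted from the larger, and the result takes the larger
operand's sign. A sketch (not a test) exercising that path, using the
mpi_fromstr() helper added later in this patch:

	static int mpi_add_sign_demo(void)
	{
		MPI u = mpi_alloc(0), v = mpi_alloc(0), w = mpi_alloc(0);
		int rc = -ENOMEM;

		if (!u || !v || !w)
			goto out;
		rc = mpi_fromstr(u, "-0x20");
		if (!rc)
			rc = mpi_fromstr(v, "0x08");
		if (!rc)
			rc = mpi_add(w, u, v);	/* w == -0x18, w->sign == 1 */
	out:
		mpi_free(u);
		mpi_free(v);
		mpi_free(w);
		return rc;
	}
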
--- a/lib/mpi/mpi-bit.c
+++ b/lib/mpi/mpi-bit.c
@@ -54,3 +54,165 @@ unsigned mpi_get_nbits(MPI a)
 	return n;
 }
 EXPORT_SYMBOL_GPL(mpi_get_nbits);
+
+/****************
+ * Test whether bit N is set.
+ */
+int mpi_test_bit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+	mpi_limb_t limb;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs)
+		return 0;	/* too far left: this is a 0 */
+	limb = a->d[limbno];
+	return (limb & (A_LIMB_1 << bitno)) ? 1 : 0;
+}
+
+/****************
+ * Set bit N of A.
+ */
+int mpi_set_bit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs) {	/* resize */
+		if (a->alloced < limbno + 1)
+			if (mpi_resize(a, limbno + 1) < 0)
+				return -ENOMEM;
+		a->nlimbs = limbno + 1;
+	}
+	a->d[limbno] |= (A_LIMB_1 << bitno);
+	return 0;
+}
+
+/****************
+ * Set bit N of A and clear all bits above
+ */
+int mpi_set_highbit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs) {	/* resize */
+		if (a->alloced < limbno + 1)
+			if (mpi_resize(a, limbno + 1) < 0)
+				return -ENOMEM;
+		a->nlimbs = limbno + 1;
+	}
+	a->d[limbno] |= (A_LIMB_1 << bitno);
+	for (bitno++; bitno < BITS_PER_MPI_LIMB; bitno++)
+		a->d[limbno] &= ~(A_LIMB_1 << bitno);
+	a->nlimbs = limbno + 1;
+	return 0;
+}
+
+/****************
+ * clear bit N of A and all bits above
+ */
+void mpi_clear_highbit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs)
+		return;		/* not allocated, so no need to clear bits :-) */
+
+	for (; bitno < BITS_PER_MPI_LIMB; bitno++)
+		a->d[limbno] &= ~(A_LIMB_1 << bitno);
+	a->nlimbs = limbno + 1;
+}
+
+/****************
+ * Clear bit N of A.
+ */
+void mpi_clear_bit(MPI a, unsigned n)
+{
+	unsigned limbno, bitno;
+
+	limbno = n / BITS_PER_MPI_LIMB;
+	bitno = n % BITS_PER_MPI_LIMB;
+
+	if (limbno >= a->nlimbs)
+		return;		/* don't need to clear this bit, it's too far to the left */
+	a->d[limbno] &= ~(A_LIMB_1 << bitno);
+}
+
+/****************
+ * Shift A by N bits to the right
+ * FIXME: should use alloc_limb if X and A are same.
+ */
+int mpi_rshift(MPI x, MPI a, unsigned n)
+{
+	mpi_ptr_t xp;
+	mpi_size_t xsize;
+
+	xsize = a->nlimbs;
+	x->sign = a->sign;
+	if (RESIZE_IF_NEEDED(x, (size_t) xsize) < 0)
+		return -ENOMEM;
+	xp = x->d;
+
+	if (xsize) {
+		mpihelp_rshift(xp, a->d, xsize, n);
+		MPN_NORMALIZE(xp, xsize);
+	}
+	x->nlimbs = xsize;
+	return 0;
+}
+
+/****************
+ * Shift A by COUNT limbs to the left
+ * This is used only within the MPI library
+ */
+int mpi_lshift_limbs(MPI a, unsigned int count)
+{
+	const int n = a->nlimbs;
+	mpi_ptr_t ap;
+	int i;
+
+	if (!count || !n)
+		return 0;
+
+	if (RESIZE_IF_NEEDED(a, n + count) < 0)
+		return -ENOMEM;
+
+	ap = a->d;
+	for (i = n - 1; i >= 0; i--)
+		ap[i + count] = ap[i];
+	for (i = 0; i < count; i++)
+		ap[i] = 0;
+	a->nlimbs += count;
+	return 0;
+}
+
+/****************
+ * Shift A by COUNT limbs to the right
+ * This is used only within the MPI library
+ */
+void mpi_rshift_limbs(MPI a, unsigned int count)
+{
+	mpi_ptr_t ap = a->d;
+	mpi_size_t n = a->nlimbs;
+	unsigned int i;
+
+	if (count >= n) {
+		a->nlimbs = 0;
+		return;
+	}
+
+	for (i = 0; i < n - count; i++)
+		ap[i] = ap[i + count];
+	ap[i] = 0;
+	a->nlimbs -= count;
+}
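
All of the bit helpers above share one addressing scheme: a bit index n
splits into a limb index and an in-limb offset, e.g. with 64-bit limbs:

	limbno = n / BITS_PER_MPI_LIMB;		/* bit 70 -> limb 1 */
	bitno  = n % BITS_PER_MPI_LIMB;		/* bit 70 -> bit 6  */
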
--- /dev/null
+++ b/lib/mpi/mpi-div.c
@@ -0,0 +1,339 @@
+/* mpi-div.c  -  MPI functions
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include <linux/string.h>
+#include "mpi-internal.h"
+#include "longlong.h"
+
+int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor)
+{
+	int rc = -ENOMEM;
+	int divisor_sign = divisor->sign;
+	MPI temp_divisor = NULL;
+
+	/* We need the original value of the divisor after the remainder has
+	 * been preliminarily calculated.  We have to copy it to temporary
+	 * space if it is the same variable as REM.  */
+	if (rem == divisor) {
+		if (mpi_copy(&temp_divisor, divisor) < 0)
+			goto nomem;
+		divisor = temp_divisor;
+	}
+
+	if (mpi_tdiv_qr(NULL, rem, dividend, divisor) < 0)
+		goto nomem;
+	if (((divisor_sign ? 1 : 0) ^ (dividend->sign ? 1 : 0)) && rem->nlimbs)
+		if (mpi_add(rem, rem, divisor) < 0)
+			goto nomem;
+
+	rc = 0;
+
+nomem:
+	if (temp_divisor)
+		mpi_free(temp_divisor);
+	return rc;
+}
+EXPORT_SYMBOL(mpi_fdiv_r);
+
+/****************
+ * Division rounding the quotient towards -infinity.
+ * The remainder gets the same sign as the denominator.
+ * rem is optional
+ */
+
+ulong mpi_fdiv_r_ui(MPI rem, MPI dividend, ulong divisor)
+{
+	mpi_limb_t rlimb;
+
+	rlimb = mpihelp_mod_1(dividend->d, dividend->nlimbs, divisor);
+	if (rlimb && dividend->sign)
+		rlimb = divisor - rlimb;
+
+	if (rem) {
+		rem->d[0] = rlimb;
+		rem->nlimbs = rlimb ? 1 : 0;
+	}
+	return rlimb;
+}
+
+int mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor)
+{
+	int rc;
+	MPI tmp = mpi_alloc(mpi_get_nlimbs(quot));
+
+	if (!tmp)
+		return -ENOMEM;
+	rc = mpi_fdiv_qr(quot, tmp, dividend, divisor);
+	mpi_free(tmp);
+	return rc;
+}
+
+int mpi_fdiv_qr(MPI quot, MPI rem, MPI dividend, MPI divisor)
+{
+	int divisor_sign = divisor->sign;
+	MPI temp_divisor = NULL;
+
+	if (quot == divisor || rem == divisor) {
+		if (mpi_copy(&temp_divisor, divisor) < 0)
+			return -ENOMEM;
+		divisor = temp_divisor;
+	}
+
+	if (mpi_tdiv_qr(quot, rem, dividend, divisor) < 0)
+		goto nomem;
+
+	if ((divisor_sign ^ dividend->sign) && rem->nlimbs) {
+		if (mpi_sub_ui(quot, quot, 1) < 0)
+			goto nomem;
+		if (mpi_add(rem, rem, divisor) < 0)
+			goto nomem;
+	}
+
+	if (temp_divisor)
+		mpi_free(temp_divisor);
+
+	return 0;
+
+nomem:
+	mpi_free(temp_divisor);
+	return -ENOMEM;
+}
+
+/* If den == quot, den needs temporary storage.
+ * If den == rem, den needs temporary storage.
+ * If num == quot, num needs temporary storage.
+ * If den has temporary storage, it can be normalized while being copied,
+ *   i.e no extra storage should be allocated.
+ */
+
+int mpi_tdiv_r(MPI rem, MPI num, MPI den)
+{
+	return mpi_tdiv_qr(NULL, rem, num, den);
+}
+
+int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den)
+{
+	int rc = -ENOMEM;
+	mpi_ptr_t np, dp;
+	mpi_ptr_t qp, rp;
+	mpi_size_t nsize = num->nlimbs;
+	mpi_size_t dsize = den->nlimbs;
+	mpi_size_t qsize, rsize;
+	mpi_size_t sign_remainder = num->sign;
+	mpi_size_t sign_quotient = num->sign ^ den->sign;
+	unsigned normalization_steps;
+	mpi_limb_t q_limb;
+	mpi_ptr_t marker[5];
+	int markidx = 0;
+
+	if (!dsize)
+		return -EINVAL;
+
+	memset(marker, 0, sizeof(marker));
+
+	/* Ensure space is enough for quotient and remainder.
+	 * We need space for an extra limb in the remainder, because it's
+	 * up-shifted (normalized) below.  */
+	rsize = nsize + 1;
+	if (mpi_resize(rem, rsize) < 0)
+		goto nomem;
+
+	qsize = rsize - dsize;	/* qsize cannot be bigger than this.  */
+	if (qsize <= 0) {
+		if (num != rem) {
+			rem->nlimbs = num->nlimbs;
+			rem->sign = num->sign;
+			MPN_COPY(rem->d, num->d, nsize);
+		}
+		if (quot) {
+			/* This needs to follow the assignment to rem, in case the
+			 * numerator and quotient are the same.  */
+			quot->nlimbs = 0;
+			quot->sign = 0;
+		}
+		return 0;
+	}
+
+	if (quot)
+		if (mpi_resize(quot, qsize) < 0)
+			goto nomem;
+
+	/* Read pointers here, when reallocation is finished.  */
+	np = num->d;
+	dp = den->d;
+	rp = rem->d;
+
+	/* Optimize division by a single-limb divisor.  */
+	if (dsize == 1) {
+		mpi_limb_t rlimb;
+		if (quot) {
+			qp = quot->d;
+			rlimb = mpihelp_divmod_1(qp, np, nsize, dp[0]);
+			qsize -= qp[qsize - 1] == 0;
+			quot->nlimbs = qsize;
+			quot->sign = sign_quotient;
+		} else
+			rlimb = mpihelp_mod_1(np, nsize, dp[0]);
+		rp[0] = rlimb;
+		rsize = rlimb != 0 ? 1 : 0;
+		rem->nlimbs = rsize;
+		rem->sign = sign_remainder;
+		return 0;
+	}
+
+	if (quot) {
+		qp = quot->d;
+		/* Make sure QP and NP point to different objects.  Otherwise the
+		 * numerator would be gradually overwritten by the quotient limbs.  */
+		if (qp == np) {	/* Copy NP object to temporary space.  */
+			np = marker[markidx++] = mpi_alloc_limb_space(nsize);
+			if (!np)
+				goto nomem;
+			MPN_COPY(np, qp, nsize);
+		}
+	} else			/* Put quotient at top of remainder. */
+		qp = rp + dsize;
+
+	normalization_steps = count_leading_zeros(dp[dsize - 1]);
+
+	/* Normalize the denominator, i.e. make its most significant bit set by
+	 * shifting it NORMALIZATION_STEPS bits to the left.  Also shift the
+	 * numerator the same number of steps (to keep the quotient the same!).
+	 */
+	if (normalization_steps) {
+		mpi_ptr_t tp;
+		mpi_limb_t nlimb;
+
+		/* Shift up the denominator setting the most significant bit of
+		 * the most significant word.  Use temporary storage not to clobber
+		 * the original contents of the denominator.  */
+		tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+		if (!tp)
+			goto nomem;
+		mpihelp_lshift(tp, dp, dsize, normalization_steps);
+		dp = tp;
+
+		/* Shift up the numerator, possibly introducing a new most
+		 * significant word.  Move the shifted numerator in the remainder
+		 * meanwhile.  */
+		nlimb = mpihelp_lshift(rp, np, nsize, normalization_steps);
+		if (nlimb) {
+			rp[nsize] = nlimb;
+			rsize = nsize + 1;
+		} else
+			rsize = nsize;
+	} else {
+		/* The denominator is already normalized, as required.  Copy it to
+		 * temporary space if it overlaps with the quotient or remainder.  */
+		if (dp == rp || (quot && (dp == qp))) {
+			mpi_ptr_t tp;
+
+			tp = marker[markidx++] = mpi_alloc_limb_space(dsize);
+			if (!tp)
+				goto nomem;
+			MPN_COPY(tp, dp, dsize);
+			dp = tp;
+		}
+
+		/* Move the numerator to the remainder.  */
+		if (rp != np)
+			MPN_COPY(rp, np, nsize);
+
+		rsize = nsize;
+	}
+
+	q_limb = mpihelp_divrem(qp, 0, rp, rsize, dp, dsize);
+
+	if (quot) {
+		qsize = rsize - dsize;
+		if (q_limb) {
+			qp[qsize] = q_limb;
+			qsize += 1;
+		}
+
+		quot->nlimbs = qsize;
+		quot->sign = sign_quotient;
+	}
+
+	rsize = dsize;
+	MPN_NORMALIZE(rp, rsize);
+
+	if (normalization_steps && rsize) {
+		mpihelp_rshift(rp, rp, rsize, normalization_steps);
+		rsize -= rp[rsize - 1] == 0 ? 1 : 0;
+	}
+
+	rem->nlimbs = rsize;
+	rem->sign = sign_remainder;
+
+	rc = 0;
+nomem:
+	while (markidx)
+		mpi_free_limb_space(marker[--markidx]);
+	return rc;
+}
+
+int mpi_tdiv_q_2exp(MPI w, MPI u, unsigned count)
+{
+	mpi_size_t usize, wsize;
+	mpi_size_t limb_cnt;
+
+	usize = u->nlimbs;
+	limb_cnt = count / BITS_PER_MPI_LIMB;
+	wsize = usize - limb_cnt;
+	if (limb_cnt >= usize)
+		w->nlimbs = 0;
+	else {
+		mpi_ptr_t wp;
+		mpi_ptr_t up;
+
+		if (RESIZE_IF_NEEDED(w, wsize) < 0)
+			return -ENOMEM;
+		wp = w->d;
+		up = u->d;
+
+		count %= BITS_PER_MPI_LIMB;
+		if (count) {
+			mpihelp_rshift(wp, up + limb_cnt, wsize, count);
+			wsize -= !wp[wsize - 1];
+		} else {
+			MPN_COPY_INCR(wp, up + limb_cnt, wsize);
+		}
+
+		w->nlimbs = wsize;
+	}
+	return 0;
+}
+
+/****************
+ * Check whether dividend is divisible by divisor
+ * (note: divisor must fit into a limb)
+ */
+int mpi_divisible_ui(MPI dividend, ulong divisor)
+{
+	return !mpihelp_mod_1(dividend->d, dividend->nlimbs, divisor);
+}
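
The fdiv family rounds the quotient toward minus infinity, so the remainder
takes the divisor's sign. A worked example of the correction step in
mpi_fdiv_qr(), for dividend -7 and divisor 2:

	truncating mpi_tdiv_qr():  q = -3, r = -1
	signs differ, r != 0, so:  q -= 1  ->  q = -4
	                           r += 2  ->  r =  1

and indeed -7 == -4 * 2 + 1.
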
--- /dev/null
+++ b/lib/mpi/mpi-gcd.c
@@ -0,0 +1,59 @@
+/* mpi-gcd.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Find the greatest common divisor G of A and B.
+ * Return: 1 if the GCD is 1, 0 otherwise, -ENOMEM on allocation failure
+ */
+int mpi_gcd(MPI g, const MPI xa, const MPI xb)
+{
+	MPI a = NULL, b = NULL;
+
+	if (mpi_copy(&a, xa) < 0)
+		goto nomem;
+
+	if (mpi_copy(&b, xb) < 0)
+		goto nomem;
+
+	/* TAOCP Vol II, 4.5.2, Algorithm A */
+	a->sign = 0;
+	b->sign = 0;
+	while (mpi_cmp_ui(b, 0)) {
+		if (mpi_fdiv_r(g, a, b) < 0)	/* g used as temporary variable */
+			goto nomem;
+		if (mpi_set(a, b) < 0)
+			goto nomem;
+		if (mpi_set(b, g) < 0)
+			goto nomem;
+	}
+	if (mpi_set(g, a) < 0)
+		goto nomem;
+
+	mpi_free(a);
+	mpi_free(b);
+	return !mpi_cmp_ui(g, 1);
+
+nomem:
+	mpi_free(a);
+	mpi_free(b);
+	return -ENOMEM;
+}
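
A sketch of the return-value convention, using mpi_alloc_set_ui() from the
mpiutil.c hunk later in this patch:

	MPI g = mpi_alloc(0);
	MPI a = mpi_alloc_set_ui(12);
	MPI b = mpi_alloc_set_ui(18);

	if (g && a && b) {
		int coprime = mpi_gcd(g, a, b);
		/* g == 6 and coprime == 0 here; for (8, 15) the result
		 * would be g == 1 and coprime == 1 */
	}
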
--- /dev/null
+++ b/lib/mpi/mpi-inline.c
@@ -0,0 +1,31 @@
+/* mpi-inline.c
+ * Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+/* put the inline functions as real functions into the lib */
+#define G10_MPI_INLINE_DECL
+
+#include "mpi-internal.h"
+
+/* always include the header because it is only
+ * included by mpi-internal if __GCC__ is defined but we
+ * need it here in all cases and the above definition
+ * of the macro allows us to do so
+ */
+#include "mpi-inline.h"
--- /dev/null
+++ b/lib/mpi/mpi-inv.c
@@ -0,0 +1,188 @@
+/* mpi-inv.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+
+/****************
+ * Calculate the multiplicative inverse X of A mod N
+ * That is: Find the solution x for
+ *		1 = (a*x) mod n
+ */
+int mpi_invm(MPI x, const MPI a, const MPI n)
+{
+	/* Extended Euclid's algorithm (See TAOCP Vol II, 4.5.2, Alg X)
+	 * modified according to Michael Penk's solution for Exercise 35
+	 * with further enhancement */
+	MPI u = NULL, v = NULL;
+	MPI u1 = NULL, u2 = NULL, u3 = NULL;
+	MPI v1 = NULL, v2 = NULL, v3 = NULL;
+	MPI t1 = NULL, t2 = NULL, t3 = NULL;
+	unsigned k;
+	int sign;
+	int odd = 0;
+	int rc = -ENOMEM;
+
+	if (mpi_copy(&u, a) < 0)
+		goto cleanup;
+	if (mpi_copy(&v, n) < 0)
+		goto cleanup;
+
+	for (k = 0; !mpi_test_bit(u, 0) && !mpi_test_bit(v, 0); k++) {
+		if (mpi_rshift(u, u, 1) < 0)
+			goto cleanup;
+		if (mpi_rshift(v, v, 1) < 0)
+			goto cleanup;
+	}
+	odd = mpi_test_bit(v, 0);
+
+	u1 = mpi_alloc_set_ui(1);
+	if (!u1)
+		goto cleanup;
+	if (!odd) {
+		u2 = mpi_alloc_set_ui(0);
+		if (!u2)
+			goto cleanup;
+	}
+	if (mpi_copy(&u3, u) < 0)
+		goto cleanup;
+	if (mpi_copy(&v1, v) < 0)
+		goto cleanup;
+	if (!odd) {
+		v2 = mpi_alloc(mpi_get_nlimbs(u));
+		if (!v2)
+			goto cleanup;
+		if (mpi_sub(v2, u1, u) < 0)
+			goto cleanup;	/* U is used as const 1 */
+	}
+	if (mpi_copy(&v3, v) < 0)
+		goto cleanup;
+	if (mpi_test_bit(u, 0)) {	/* u is odd */
+		t1 = mpi_alloc_set_ui(0);
+		if (!t1)
+			goto cleanup;
+		if (!odd) {
+			t2 = mpi_alloc_set_ui(1);
+			if (!t2)
+				goto cleanup;
+			t2->sign = 1;
+		}
+		if (mpi_copy(&t3, v) < 0)
+			goto cleanup;
+		t3->sign = !t3->sign;
+		goto Y4;
+	} else {
+		t1 = mpi_alloc_set_ui(1);
+		if (!t1)
+			goto cleanup;
+		if (!odd) {
+			t2 = mpi_alloc_set_ui(0);
+			if (!t2)
+				goto cleanup;
+		}
+		if (mpi_copy(&t3, u) < 0)
+			goto cleanup;
+	}
+	do {
+		do {
+			if (!odd) {
+				if (mpi_test_bit(t1, 0) || mpi_test_bit(t2, 0)) {	/* one is odd */
+					if (mpi_add(t1, t1, v) < 0)
+						goto cleanup;
+					if (mpi_sub(t2, t2, u) < 0)
+						goto cleanup;
+				}
+				if (mpi_rshift(t1, t1, 1) < 0)
+					goto cleanup;
+				if (mpi_rshift(t2, t2, 1) < 0)
+					goto cleanup;
+				if (mpi_rshift(t3, t3, 1) < 0)
+					goto cleanup;
+			} else {
+				if (mpi_test_bit(t1, 0))
+					if (mpi_add(t1, t1, v) < 0)
+						goto cleanup;
+				if (mpi_rshift(t1, t1, 1) < 0)
+					goto cleanup;
+				if (mpi_rshift(t3, t3, 1) < 0)
+					goto cleanup;
+			}
+Y4:
+			;
+		} while (!mpi_test_bit(t3, 0));	/* while t3 is even */
+
+		if (!t3->sign) {
+			if (mpi_set(u1, t1) < 0)
+				goto cleanup;
+			if (!odd)
+				if (mpi_set(u2, t2) < 0)
+					goto cleanup;
+			if (mpi_set(u3, t3) < 0)
+				goto cleanup;
+		} else {
+			if (mpi_sub(v1, v, t1) < 0)
+				goto cleanup;
+			sign = u->sign;
+			u->sign = !u->sign;
+			if (!odd)
+				if (mpi_sub(v2, u, t2) < 0)
+					goto cleanup;
+			u->sign = sign;
+			sign = t3->sign;
+			t3->sign = !t3->sign;
+			if (mpi_set(v3, t3) < 0)
+				goto cleanup;
+			t3->sign = sign;
+		}
+		if (mpi_sub(t1, u1, v1) < 0)
+			goto cleanup;
+		if (!odd)
+			if (mpi_sub(t2, u2, v2) < 0)
+				goto cleanup;
+		if (mpi_sub(t3, u3, v3) < 0)
+			goto cleanup;
+		if (t1->sign) {
+			if (mpi_add(t1, t1, v) < 0)
+				goto cleanup;
+			if (!odd)
+				if (mpi_sub(t2, t2, u) < 0)
+					goto cleanup;
+		}
+	} while (mpi_cmp_ui(t3, 0));	/* while t3 != 0 */
+	/* mpi_lshift( u3, k ); */
+	rc = mpi_set(x, u1);
+
+cleanup:
+	mpi_free(u1);
+	mpi_free(v1);
+	mpi_free(t1);
+	if (!odd) {
+		mpi_free(u2);
+		mpi_free(v2);
+		mpi_free(t2);
+	}
+	mpi_free(u3);
+	mpi_free(v3);
+	mpi_free(t3);
+
+	mpi_free(u);
+	mpi_free(v);
+	return rc;
+}
+EXPORT_SYMBOL(mpi_invm);
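
A sketch of the expected result, assuming the caller guarantees
gcd(a, n) == 1 as required for an inverse to exist:

	MPI x = mpi_alloc(0);
	MPI a = mpi_alloc_set_ui(3);
	MPI n = mpi_alloc_set_ui(11);

	if (x && a && n && !mpi_invm(x, a, n))
		/* x == 4, since 3 * 4 == 12 == 1 (mod 11) */;
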
--- /dev/null
+++ b/lib/mpi/mpi-mpow.c
@@ -0,0 +1,135 @@
+/* mpi-mpow.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+static int build_index(const MPI *exparray, int k, int i, int t)
+{
+	int j, bitno;
+	int index = 0;
+
+	bitno = t - i;
+	for (j = k - 1; j >= 0; j--) {
+		index <<= 1;
+		if (mpi_test_bit(exparray[j], bitno))
+			index |= 1;
+	}
+	return index;
+}
+
+/****************
+ * RES = ((BASE[0] ^ EXP[0]) * (BASE[1] ^ EXP[1]) * ...) mod M
+ */
+int mpi_mulpowm(MPI res, MPI *basearray, MPI *exparray, MPI m)
+{
+	int rc = -ENOMEM;
+	int k;			/* number of elements */
+	int t;			/* bit size of largest exponent */
+	int i, j, idx;
+	MPI *G = NULL;		/* table with precomputed values of size 2^k */
+	MPI tmp = NULL;
+
+	for (k = 0; basearray[k]; k++)
+		;
+	if (!k) {
+		pr_emerg("mpi_mulpowm: assert(k) failed\n");
+		BUG();
+	}
+	for (t = 0, i = 0; (tmp = exparray[i]); i++) {
+		j = mpi_get_nbits(tmp);
+		if (j > t)
+			t = j;
+	}
+	if (i != k) {
+		pr_emerg("mpi_mulpowm: assert(i==k) failed\n");
+		BUG();
+	}
+	if (!t) {
+		pr_emerg("mpi_mulpowm: assert(t) failed\n");
+		BUG();
+	}
+	if (k >= 10) {
+		pr_emerg("mpi_mulpowm: assert(k<10) failed\n");
+		BUG();
+	}
+
+	G = kzalloc((1 << k) * sizeof *G, GFP_KERNEL);
+	if (!G)
+		goto err_out;
+
+	/* and calculate */
+	tmp = mpi_alloc(mpi_get_nlimbs(m) + 1);
+	if (!tmp)
+		goto nomem;
+	if (mpi_set_ui(res, 1) < 0)
+		goto nomem;
+	for (i = 1; i <= t; i++) {
+		if (mpi_mulm(tmp, res, res, m) < 0)
+			goto nomem;
+		idx = build_index(exparray, k, i, t);
+		if (!(idx >= 0 && idx < (1 << k))) {
+			pr_emerg("mpi_mulpowm: assert(idx >= 0 && idx < (1<<k)) failed\n");
+			BUG();
+		}
+		if (!G[idx]) {
+			if (!idx) {
+				G[0] = mpi_alloc_set_ui(1);
+				if (!G[0])
+					goto nomem;
+			} else {
+				for (j = 0; j < k; j++) {
+					if ((idx & (1 << j))) {
+						if (!G[idx]) {
+							if (mpi_copy
+							    (&G[idx],
+							     basearray[j]) < 0)
+								goto nomem;
+						} else {
+							if (mpi_mulm
+							    (G[idx], G[idx],
+							     basearray[j],
+							     m) < 0)
+								goto nomem;
+						}
+					}
+				}
+				if (!G[idx]) {
+					G[idx] = mpi_alloc(0);
+					if (!G[idx])
+						goto nomem;
+				}
+			}
+		}
+		if (mpi_mulm(res, tmp, G[idx], m) < 0)
+			goto nomem;
+	}
+
+	rc = 0;
+nomem:
+	/* cleanup */
+	mpi_free(tmp);
+	for (i = 0; i < (1 << k); i++)
+		mpi_free(G[i]);
+	kfree(G);
+err_out:
+	return rc;
+}
+EXPORT_SYMBOL(mpi_mulpowm);
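
build_index() implements simultaneous exponentiation: at every bit position
it gathers one bit from each exponent into a k-bit column that selects a
lazily built product of bases. Worked through for k == 2, t == 3,
e0 == 0b101, e1 == 0b011 (idx == (e1 bit << 1) | e0 bit):

	i = 1, bitno = 2: bits (0, 1) -> idx 1 -> G[1] == base0
	i = 2, bitno = 1: bits (1, 0) -> idx 2 -> G[2] == base1
	i = 3, bitno = 0: bits (1, 1) -> idx 3 -> G[3] == base0 * base1 mod m

so at most 2^k - 1 nontrivial table entries are ever built.
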
--- /dev/null
+++ b/lib/mpi/mpi-mul.c
@@ -0,0 +1,195 @@
+/* mpi-mul.c  -  MPI functions
+ *	Copyright (C) 1994, 1996 Free Software Foundation, Inc.
+ *	Copyright (C) 1998, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ *
+ * Note: This code is heavily based on the GNU MP Library.
+ *	 Actually it's the same code with only minor changes in the
+ *	 way the data is stored; this is to support the abstraction
+ *	 of an optional secure memory allocation which may be used
+ *	 to avoid revealing of sensitive data due to paging etc.
+ *	 The GNU MP Library itself is published under the LGPL;
+ *	 however I decided to publish this code under the plain GPL.
+ */
+
+#include "mpi-internal.h"
+
+int mpi_mul_ui(MPI prod, MPI mult, unsigned long small_mult)
+{
+	mpi_size_t size, prod_size;
+	mpi_ptr_t prod_ptr;
+	mpi_limb_t cy;
+	int sign;
+
+	size = mult->nlimbs;
+	sign = mult->sign;
+
+	if (!size || !small_mult) {
+		prod->nlimbs = 0;
+		prod->sign = 0;
+		return 0;
+	}
+
+	prod_size = size + 1;
+	if (prod->alloced < prod_size)
+		if (mpi_resize(prod, prod_size) < 0)
+			return -ENOMEM;
+	prod_ptr = prod->d;
+
+	cy = mpihelp_mul_1(prod_ptr, mult->d, size, (mpi_limb_t) small_mult);
+	if (cy)
+		prod_ptr[size++] = cy;
+	prod->nlimbs = size;
+	prod->sign = sign;
+	return 0;
+}
+
+int mpi_mul_2exp(MPI w, MPI u, unsigned long cnt)
+{
+	mpi_size_t usize, wsize, limb_cnt;
+	mpi_ptr_t wp;
+	mpi_limb_t wlimb;
+	int usign, wsign;
+
+	usize = u->nlimbs;
+	usign = u->sign;
+
+	if (!usize) {
+		w->nlimbs = 0;
+		w->sign = 0;
+		return 0;
+	}
+
+	limb_cnt = cnt / BITS_PER_MPI_LIMB;
+	wsize = usize + limb_cnt + 1;
+	if (w->alloced < wsize)
+		if (mpi_resize(w, wsize) < 0)
+			return -ENOMEM;
+	wp = w->d;
+	wsize = usize + limb_cnt;
+	wsign = usign;
+
+	cnt %= BITS_PER_MPI_LIMB;
+	if (cnt) {
+		wlimb = mpihelp_lshift(wp + limb_cnt, u->d, usize, cnt);
+		if (wlimb) {
+			wp[wsize] = wlimb;
+			wsize++;
+		}
+	} else {
+		MPN_COPY_DECR(wp + limb_cnt, u->d, usize);
+	}
+
+	/* Zero all whole limbs at low end.  Do it here and not before calling
+	 * mpn_lshift, not to lose for U == W.  */
+	MPN_ZERO(wp, limb_cnt);
+
+	w->nlimbs = wsize;
+	w->sign = wsign;
+	return 0;
+}
+
+int mpi_mul(MPI w, MPI u, MPI v)
+{
+	int rc = -ENOMEM;
+	mpi_size_t usize, vsize, wsize;
+	mpi_ptr_t up, vp, wp;
+	mpi_limb_t cy;
+	int usign, vsign, sign_product;
+	int assign_wp = 0;
+	mpi_ptr_t tmp_limb = NULL;
+
+	if (u->nlimbs < v->nlimbs) {	/* Swap U and V. */
+		usize = v->nlimbs;
+		usign = v->sign;
+		up = v->d;
+		vsize = u->nlimbs;
+		vsign = u->sign;
+		vp = u->d;
+	} else {
+		usize = u->nlimbs;
+		usign = u->sign;
+		up = u->d;
+		vsize = v->nlimbs;
+		vsign = v->sign;
+		vp = v->d;
+	}
+	sign_product = usign ^ vsign;
+	wp = w->d;
+
+	/* Ensure W has space enough to store the result.  */
+	wsize = usize + vsize;
+	if (w->alloced < (size_t) wsize) {
+		if (wp == up || wp == vp) {
+			wp = mpi_alloc_limb_space(wsize);
+			if (!wp)
+				goto nomem;
+			assign_wp = 1;
+		} else {
+			if (mpi_resize(w, wsize) < 0)
+				goto nomem;
+			wp = w->d;
+		}
+	} else {		/* Make U and V not overlap with W.      */
+		if (wp == up) {
+			/* W and U are identical.  Allocate temporary space for U.      */
+			up = tmp_limb = mpi_alloc_limb_space(usize);
+			if (!up)
+				goto nomem;
+			/* Is V identical too?  Keep it identical with U.  */
+			if (wp == vp)
+				vp = up;
+			/* Copy to the temporary space.  */
+			MPN_COPY(up, wp, usize);
+		} else if (wp == vp) {
+			/* W and V are identical.  Allocate temporary space for V.      */
+			vp = tmp_limb = mpi_alloc_limb_space(vsize);
+			if (!vp)
+				goto nomem;
+			/* Copy to the temporary space.  */
+			MPN_COPY(vp, wp, vsize);
+		}
+	}
+
+	if (!vsize)
+		wsize = 0;
+	else {
+		if (mpihelp_mul(wp, up, usize, vp, vsize, &cy) < 0)
+			goto nomem;
+		wsize -= cy ? 0 : 1;
+	}
+
+	if (assign_wp)
+		mpi_assign_limb_space(w, wp, wsize);
+
+	w->nlimbs = wsize;
+	w->sign = sign_product;
+	rc = 0;
+nomem:
+	if (tmp_limb)
+		mpi_free_limb_space(tmp_limb);
+	return rc;
+}
+
+int mpi_mulm(MPI w, MPI u, MPI v, MPI m)
+{
+	if (mpi_mul(w, u, v) < 0)
+		return -ENOMEM;
+	return mpi_fdiv_r(w, w, m);
+}
+EXPORT_SYMBOL(mpi_mulm);
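
mpi_mulm() forms the full product first and then reduces with the floor
division above, e.g. for u == 7, v == 9, m == 10:

	w = (7 * 9) mod 10 = 63 mod 10 = 3
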
--- /dev/null
+++ b/lib/mpi/mpi-scan.c
@@ -0,0 +1,136 @@
+/* mpi-scan.c  -  MPI functions
+ * Copyright (C) 1998, 1999, 2000, 2001 Free Software Foundation, Inc.
+ *
+ * This file is part of GnuPG.
+ *
+ * GnuPG is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * GnuPG is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
+ */
+
+#include "mpi-internal.h"
+#include "longlong.h"
+
+/****************
+ * Scan through an mpi and return byte for byte. a -1 is returned to indicate
+ * the end of the mpi. Scanning is done from the lsb to the msb, returned
+ * values are in the range of 0 .. 255.
+ *
+ * FIXME: This code is VERY ugly!
+ */
+int mpi_getbyte(const MPI a, unsigned idx)
+{
+	int i, j;
+	unsigned n;
+	mpi_ptr_t ap;
+	mpi_limb_t limb;
+
+	ap = a->d;
+	for (n = 0, i = 0; i < a->nlimbs; i++) {
+		limb = ap[i];
+		for (j = 0; j < BYTES_PER_MPI_LIMB; j++, n++)
+			if (n == idx)
+				return (limb >> j * 8) & 0xff;
+	}
+	return -1;
+}
+
+/****************
+ * Put a value at position IDX into A. idx counts from lsb to msb
+ */
+void mpi_putbyte(MPI a, unsigned idx, int xc)
+{
+	int i, j;
+	unsigned n;
+	mpi_ptr_t ap;
+	mpi_limb_t limb, c;
+
+	c = xc & 0xff;
+	ap = a->d;
+	for (n = 0, i = 0; i < a->alloced; i++) {
+		limb = ap[i];
+		for (j = 0; j < BYTES_PER_MPI_LIMB; j++, n++)
+			if (n == idx) {
+#if BYTES_PER_MPI_LIMB == 4
+				if (j == 0)
+					limb = (limb & 0xffffff00) | c;
+				else if (j == 1)
+					limb = (limb & 0xffff00ff) | (c << 8);
+				else if (j == 2)
+					limb = (limb & 0xff00ffff) | (c << 16);
+				else
+					limb = (limb & 0x00ffffff) | (c << 24);
+#elif BYTES_PER_MPI_LIMB == 8
+				if (j == 0)
+					limb = (limb & 0xffffffffffffff00) | c;
+				else if (j == 1)
+					limb = (limb & 0xffffffffffff00ff)
+						| (c << 8);
+				else if (j == 2)
+					limb = (limb & 0xffffffffff00ffff)
+						| (c << 16);
+				else if (j == 3)
+					limb = (limb & 0xffffffff00ffffff)
+						| (c << 24);
+				else if (j == 4)
+					limb = (limb & 0xffffff00ffffffff)
+						| (c << 32);
+				else if (j == 5)
+					limb = (limb & 0xffff00ffffffffff)
+						| (c << 40);
+				else if (j == 6)
+					limb = (limb & 0xff00ffffffffffff)
+						| (c << 48);
+				else
+					limb = (limb & 0x00ffffffffffffff)
+						| (c << 56);
+#else
+#error please enhance this function, it's ugly - i know.
+#endif
+				if (a->nlimbs <= i)
+					a->nlimbs = i + 1;
+				ap[i] = limb;
+				return;
+			}
+	}
+	log_bug("index out of range\n");
+}
+
+/****************
+ * Count the number of zerobits at the low end of A
+ */
+unsigned mpi_trailing_zeros(const MPI a)
+{
+	unsigned n, count = 0;
+
+	for (n = 0; n < a->nlimbs; n++) {
+		if (a->d[n]) {
+			unsigned nn;
+			mpi_limb_t alimb = a->d[n];
+
+			nn = count_trailing_zeros(alimb);
+			count += nn;
+			break;
+		}
+		count += BITS_PER_MPI_LIMB;
+	}
+	return count;
+}
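
Byte indexing in these scan helpers runs from the least significant end.
For a single-limb a == 0x0102:

	mpi_getbyte(a, 0) == 0x02
	mpi_getbyte(a, 1) == 0x01
	indices 2 .. BYTES_PER_MPI_LIMB - 1 read as 0x00
	anything past the last limb returns -1
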
--- a/lib/mpi/mpicoder.c
+++ b/lib/mpi/mpicoder.c
@@ -128,6 +128,81 @@ leave:
 }
 EXPORT_SYMBOL_GPL(mpi_read_from_buffer);
 
+/****************
+ * Make an mpi from a character string.
+ */
+int mpi_fromstr(MPI val, const char *str)
+{
+	int hexmode = 0, sign = 0, prepend_zero = 0, i, j, c, c1, c2;
+	unsigned nbits, nbytes, nlimbs;
+	mpi_limb_t a;
+
+	if (*str == '-') {
+		sign = 1;
+		str++;
+	}
+	if (*str == '0' && str[1] == 'x')
+		hexmode = 1;
+	else
+		return -EINVAL;	/* other bases are not yet supported */
+	str += 2;
+
+	nbits = strlen(str) * 4;
+	if (nbits % 8)
+		prepend_zero = 1;
+	nbytes = (nbits + 7) / 8;
+	nlimbs = (nbytes + BYTES_PER_MPI_LIMB - 1) / BYTES_PER_MPI_LIMB;
+	if (val->alloced < nlimbs)
+		if (mpi_resize(val, nlimbs) < 0)
+			return -ENOMEM;
+	i = BYTES_PER_MPI_LIMB - nbytes % BYTES_PER_MPI_LIMB;
+	i %= BYTES_PER_MPI_LIMB;
+	j = val->nlimbs = nlimbs;
+	val->sign = sign;
+	for (; j > 0; j--) {
+		a = 0;
+		for (; i < BYTES_PER_MPI_LIMB; i++) {
+			if (prepend_zero) {
+				c1 = '0';
+				prepend_zero = 0;
+			} else
+				c1 = *str++;
+			assert(c1);
+			c2 = *str++;
+			assert(c2);
+			if (c1 >= '0' && c1 <= '9')
+				c = c1 - '0';
+			else if (c1 >= 'a' && c1 <= 'f')
+				c = c1 - 'a' + 10;
+			else if (c1 >= 'A' && c1 <= 'F')
+				c = c1 - 'A' + 10;
+			else {
+				mpi_clear(val);
+				return -EINVAL;
+			}
+			c <<= 4;
+			if (c2 >= '0' && c2 <= '9')
+				c |= c2 - '0';
+			else if (c2 >= 'a' && c2 <= 'f')
+				c |= c2 - 'a' + 10;
+			else if (c2 >= 'A' && c2 <= 'F')
+				c |= c2 - 'A' + 10;
+			else {
+				mpi_clear(val);
+				return -EINVAL;
+			}
+			a <<= 8;
+			a |= c;
+		}
+		i = 0;
+
+		val->d[j - 1] = a;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(mpi_fromstr);
+
 /**
  * mpi_read_buffer() - read MPI to a bufer provided by user (msb first)
  *
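
mpi_fromstr() above accepts only hex with a "0x" prefix, optionally preceded
by '-'; any other base is rejected with -EINVAL. A usage sketch:

	MPI m = mpi_alloc(0);

	if (m && !mpi_fromstr(m, "0x1a2b"))
		/* m now holds 0x1a2b; "1a2b" without the prefix fails */;
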
--- a/lib/mpi/mpih-div.c
+++ b/lib/mpi/mpih-div.c
@@ -37,6 +37,159 @@
 #define UDIV_TIME UMUL_TIME
 #endif
 
+/* FIXME: We should be using invert_limb (or invert_normalized_limb)
+ * here (not udiv_qrnnd).
+ */
+
+mpi_limb_t
+mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+	      mpi_limb_t divisor_limb)
+{
+	mpi_size_t i;
+	mpi_limb_t n1, n0, r;
+	int dummy;
+
+	/* Botch: Should this be handled at all?  Rely on callers?  */
+	if (!dividend_size)
+		return 0;
+
+	/* If multiplication is much faster than division, and the
+	 * dividend is large, pre-invert the divisor, and use
+	 * only multiplications in the inner loop.
+	 *
+	 * This test should be read:
+	 *   Does it ever help to use udiv_qrnnd_preinv?
+	 *     && Does what we save compensate for the inversion overhead?
+	 */
+	if (UDIV_TIME > (2 * UMUL_TIME + 6)
+	    && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+		int normalization_steps;
+
+		normalization_steps = count_leading_zeros(divisor_limb);
+		if (normalization_steps) {
+			mpi_limb_t divisor_limb_inverted;
+
+			divisor_limb <<= normalization_steps;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 *
+			 * Special case for DIVISOR_LIMB == 100...000.
+			 */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			n1 = dividend_ptr[dividend_size - 1];
+			r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+			/* Possible optimization:
+			 * if (r == 0
+			 * && divisor_limb > ((n1 << normalization_steps)
+			 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+			 * ...one division less...
+			 */
+			for (i = dividend_size - 2; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(dummy, r, r,
+						  ((n1 << normalization_steps)
+						   | (n0 >>
+						      (BITS_PER_MPI_LIMB -
+						       normalization_steps))),
+						  divisor_limb,
+						  divisor_limb_inverted);
+				n1 = n0;
+			}
+			UDIV_QRNND_PREINV(dummy, r, r,
+					  n1 << normalization_steps,
+					  divisor_limb, divisor_limb_inverted);
+			return r >> normalization_steps;
+		} else {
+			mpi_limb_t divisor_limb_inverted;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 *
+			 * Special case for DIVISOR_LIMB == 100...000.
+			 */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			i = dividend_size - 1;
+			r = dividend_ptr[i];
+
+			if (r >= divisor_limb)
+				r = 0;
+			else
+				i--;
+
+			for (; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(dummy, r, r,
+						  n0, divisor_limb,
+						  divisor_limb_inverted);
+			}
+			return r;
+		}
+	} else {
+		if (UDIV_NEEDS_NORMALIZATION) {
+			int normalization_steps;
+
+			normalization_steps = count_leading_zeros(divisor_limb);
+			if (normalization_steps) {
+				divisor_limb <<= normalization_steps;
+
+				n1 = dividend_ptr[dividend_size - 1];
+				r = n1 >> (BITS_PER_MPI_LIMB -
+					   normalization_steps);
+
+				/* Possible optimization:
+				 * if (r == 0
+				 * && divisor_limb > ((n1 << normalization_steps)
+				 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+				 * ...one division less...
+				 */
+				for (i = dividend_size - 2; i >= 0; i--) {
+					n0 = dividend_ptr[i];
+					udiv_qrnnd(dummy, r, r,
+						   ((n1 << normalization_steps)
+						    | (n0 >>
+						       (BITS_PER_MPI_LIMB -
+							normalization_steps))),
+						   divisor_limb);
+					n1 = n0;
+				}
+				udiv_qrnnd(dummy, r, r,
+					   n1 << normalization_steps,
+					   divisor_limb);
+				return r >> normalization_steps;
+			}
+		}
+		/* No normalization needed, either because udiv_qrnnd doesn't require
+		 * it, or because DIVISOR_LIMB is already normalized.  */
+		i = dividend_size - 1;
+		r = dividend_ptr[i];
+
+		if (r >= divisor_limb)
+			r = 0;
+		else
+			i--;
+
+		for (; i >= 0; i--) {
+			n0 = dividend_ptr[i];
+			udiv_qrnnd(dummy, r, r, n0, divisor_limb);
+		}
+		return r;
+	}
+}
+
 /* Divide num (NP/NSIZE) by den (DP/DSIZE) and write
  * the NSIZE-DSIZE least significant quotient limbs at QP
  * and the DSIZE long remainder at NP.	If QEXTRA_LIMBS is
@@ -234,3 +387,159 @@ q_test:
 
 	return most_significant_q_limb;
 }
+
+/****************
+ * Divide (DIVIDEND_PTR,,DIVIDEND_SIZE) by DIVISOR_LIMB.
+ * Write DIVIDEND_SIZE limbs of quotient at QUOT_PTR.
+ * Return the single-limb remainder.
+ * There are no constraints on the value of the divisor.
+ *
+ * QUOT_PTR and DIVIDEND_PTR might point to the same limb.
+ */
+
+mpi_limb_t
+mpihelp_divmod_1(mpi_ptr_t quot_ptr,
+		 mpi_ptr_t dividend_ptr, mpi_size_t dividend_size,
+		 mpi_limb_t divisor_limb)
+{
+	mpi_size_t i;
+	mpi_limb_t n1, n0, r;
+	int dummy;
+
+	if (!dividend_size)
+		return 0;
+
+	/* If multiplication is much faster than division, and the
+	 * dividend is large, pre-invert the divisor, and use
+	 * only multiplications in the inner loop.
+	 *
+	 * This test should be read:
+	 * Does it ever help to use udiv_qrnnd_preinv?
+	 * && Does what we save compensate for the inversion overhead?
+	 */
+	if (UDIV_TIME > (2 * UMUL_TIME + 6)
+	    && (UDIV_TIME - (2 * UMUL_TIME + 6)) * dividend_size > UDIV_TIME) {
+		int normalization_steps;
+
+		normalization_steps = count_leading_zeros(divisor_limb);
+		if (normalization_steps) {
+			mpi_limb_t divisor_limb_inverted;
+
+			divisor_limb <<= normalization_steps;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 */
+			/* Special case for DIVISOR_LIMB == 100...000.  */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			n1 = dividend_ptr[dividend_size - 1];
+			r = n1 >> (BITS_PER_MPI_LIMB - normalization_steps);
+
+			/* Possible optimization:
+			 * if (r == 0
+			 * && divisor_limb > ((n1 << normalization_steps)
+			 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+			 * ...one division less...
+			 */
+			for (i = dividend_size - 2; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(quot_ptr[i + 1], r, r,
+						  ((n1 << normalization_steps)
+						   | (n0 >>
+						      (BITS_PER_MPI_LIMB -
+						       normalization_steps))),
+						  divisor_limb,
+						  divisor_limb_inverted);
+				n1 = n0;
+			}
+			UDIV_QRNND_PREINV(quot_ptr[0], r, r,
+					  n1 << normalization_steps,
+					  divisor_limb, divisor_limb_inverted);
+			return r >> normalization_steps;
+		} else {
+			mpi_limb_t divisor_limb_inverted;
+
+			/* Compute (2**2N - 2**N * DIVISOR_LIMB) / DIVISOR_LIMB.  The
+			 * result is a (N+1)-bit approximation to 1/DIVISOR_LIMB, with the
+			 * most significant bit (with weight 2**N) implicit.
+			 */
+			/* Special case for DIVISOR_LIMB == 100...000.  */
+			if (!(divisor_limb << 1))
+				divisor_limb_inverted = ~(mpi_limb_t) 0;
+			else
+				udiv_qrnnd(divisor_limb_inverted, dummy,
+					   -divisor_limb, 0, divisor_limb);
+
+			i = dividend_size - 1;
+			r = dividend_ptr[i];
+
+			if (r >= divisor_limb)
+				r = 0;
+			else
+				quot_ptr[i--] = 0;
+
+			for (; i >= 0; i--) {
+				n0 = dividend_ptr[i];
+				UDIV_QRNND_PREINV(quot_ptr[i], r, r,
+						  n0, divisor_limb,
+						  divisor_limb_inverted);
+			}
+			return r;
+		}
+	} else {
+		if (UDIV_NEEDS_NORMALIZATION) {
+			int normalization_steps;
+
+			normalization_steps = count_leading_zeros(divisor_limb);
+			if (normalization_steps) {
+				divisor_limb <<= normalization_steps;
+
+				n1 = dividend_ptr[dividend_size - 1];
+				r = n1 >> (BITS_PER_MPI_LIMB -
+					   normalization_steps);
+
+				/* Possible optimization:
+				 * if (r == 0
+				 * && divisor_limb > ((n1 << normalization_steps)
+				 *                 | (dividend_ptr[dividend_size - 2] >> ...)))
+				 * ...one division less...
+				 */
+				for (i = dividend_size - 2; i >= 0; i--) {
+					n0 = dividend_ptr[i];
+					udiv_qrnnd(quot_ptr[i + 1], r, r,
+						   ((n1 << normalization_steps)
+						    | (n0 >>
+						       (BITS_PER_MPI_LIMB -
+							normalization_steps))),
+						   divisor_limb);
+					n1 = n0;
+				}
+				udiv_qrnnd(quot_ptr[0], r, r,
+					   n1 << normalization_steps,
+					   divisor_limb);
+				return r >> normalization_steps;
+			}
+		}
+		/* No normalization needed, either because udiv_qrnnd doesn't require
+		 * it, or because DIVISOR_LIMB is already normalized.  */
+		i = dividend_size - 1;
+		r = dividend_ptr[i];
+
+		if (r >= divisor_limb)
+			r = 0;
+		else
+			quot_ptr[i--] = 0;
+
+		for (; i >= 0; i--) {
+			n0 = dividend_ptr[i];
+			udiv_qrnnd(quot_ptr[i], r, r, n0, divisor_limb);
+		}
+		return r;
+	}
+}
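
Both helpers above use the same pre-inversion to trade each hardware
division for two multiplications: for a normalized divisor d (top bit set)
and N-bit limbs, the cached value is inv = floor((2^(2N) - 2^N * d) / d).
A userspace illustration with 32-bit limbs (not kernel code; the
d == 0x80000000 special case mirrors the one in the code):

	#include <stdint.h>

	/* inv approximates 2^64 / d - 2^32, with the implicit top bit */
	static uint32_t invert_limb32(uint32_t d)
	{
		if ((uint32_t)(d << 1) == 0)	/* d == 0x80000000 */
			return ~(uint32_t)0;
		return (uint32_t)((~(uint64_t)0 - ((uint64_t)d << 32)) / d);
	}
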
--- a/lib/mpi/mpih-mul.c
+++ b/lib/mpi/mpih-mul.c
@@ -330,6 +330,36 @@ mpih_sqr_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size, mpi_ptr_t tspace)
 	}
 }
 
+/* This should be made into an inline function in gmp.h.  */
+int mpihelp_mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size)
+{
+	if (up == vp) {
+		if (size < KARATSUBA_THRESHOLD)
+			mpih_sqr_n_basecase(prodp, up, size);
+		else {
+			mpi_ptr_t tspace;
+			tspace = mpi_alloc_limb_space(2 * size);
+			if (!tspace)
+				return -ENOMEM;
+			mpih_sqr_n(prodp, up, size, tspace);
+			mpi_free_limb_space(tspace);
+		}
+	} else {
+		if (size < KARATSUBA_THRESHOLD)
+			mul_n_basecase(prodp, up, vp, size);
+		else {
+			mpi_ptr_t tspace;
+			tspace = mpi_alloc_limb_space(2 * size);
+			if (!tspace)
+				return -ENOMEM;
+			mul_n(prodp, up, vp, size, tspace);
+			mpi_free_limb_space(tspace);
+		}
+	}
+
+	return 0;
+}
+
 int
 mpihelp_mul_karatsuba_case(mpi_ptr_t prodp,
 			   mpi_ptr_t up, mpi_size_t usize,
--- a/lib/mpi/mpiutil.c
+++ b/lib/mpi/mpiutil.c
@@ -106,6 +106,13 @@ int mpi_resize(MPI a, unsigned nlimbs)
 	return 0;
 }
 
+void mpi_clear(MPI a)
+{
+	a->nlimbs = 0;
+	a->nbits = 0;
+	a->flags = 0;
+}
+
 void mpi_free(MPI a)
 {
 	if (!a)
@@ -122,5 +129,86 @@ void mpi_free(MPI a)
 }
 EXPORT_SYMBOL_GPL(mpi_free);
 
+/****************
+ * Note: This copy function should not interpret the MPI
+ *	 but copy it transparently.
+ */
+int mpi_copy(MPI *copied, const MPI a)
+{
+	size_t i;
+	MPI b;
+
+	*copied = NULL;
+
+	if (a) {
+		b = mpi_alloc(a->nlimbs);
+		if (!b)
+			return -ENOMEM;
+
+		b->nlimbs = a->nlimbs;
+		b->sign = a->sign;
+		b->flags = a->flags;
+		b->nbits = a->nbits;
+
+		for (i = 0; i < b->nlimbs; i++)
+			b->d[i] = a->d[i];
+
+		*copied = b;
+	}
+
+	return 0;
+}
+
+int mpi_set(MPI w, const MPI u)
+{
+	mpi_ptr_t wp, up;
+	mpi_size_t usize = u->nlimbs;
+	int usign = u->sign;
+
+	if (RESIZE_IF_NEEDED(w, (size_t) usize) < 0)
+		return -ENOMEM;
+
+	wp = w->d;
+	up = u->d;
+	MPN_COPY(wp, up, usize);
+	w->nlimbs = usize;
+	w->nbits = u->nbits;
+	w->flags = u->flags;
+	w->sign = usign;
+	return 0;
+}
+
+int mpi_set_ui(MPI w, unsigned long u)
+{
+	if (RESIZE_IF_NEEDED(w, 1) < 0)
+		return -ENOMEM;
+	w->d[0] = u;
+	w->nlimbs = u ? 1 : 0;
+	w->sign = 0;
+	w->nbits = 0;
+	w->flags = 0;
+	return 0;
+}
+
+MPI mpi_alloc_set_ui(unsigned long u)
+{
+	MPI w = mpi_alloc(1);
+	if (!w)
+		return w;
+	w->d[0] = u;
+	w->nlimbs = u ? 1 : 0;
+	w->sign = 0;
+	return w;
+}
+
+void mpi_swap(MPI a, MPI b)
+{
+	struct gcry_mpi tmp;
+
+	tmp = *a;
+	*a = *b;
+	*b = tmp;
+}
+
 MODULE_DESCRIPTION("Multiprecision maths library");
 MODULE_LICENSE("GPL");
--- a/lib/nlattr.c
+++ b/lib/nlattr.c
@@ -201,8 +201,9 @@ int nla_parse(struct nlattr **tb, int maxtype, const struct nlattr *head,
 	}
 
 	if (unlikely(rem > 0))
-		pr_warn_ratelimited("netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
-				    rem, current->comm);
+		ve_pr_warn_ratelimited(VE_LOG,
+			"netlink: %d bytes leftover after parsing attributes in process `%s'.\n",
+			rem, current->comm);
 
 	err = 0;
 errout:
--- a/lib/radix-tree.c
+++ b/lib/radix-tree.c
@@ -34,7 +34,6 @@
 #include <linux/rcupdate.h>
 #include <linux/hardirq.h>		/* in_interrupt() */
 
-
 /*
  * The height_to_maxindex array needs to be one deeper than the maximum
  * path as height 0 holds only 1 entry.
@@ -111,9 +110,15 @@ static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag
 	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
 }
 
+static inline void root_tag_move_all_to_prev(struct radix_tree_root *root)
+{
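+	/* Copy every live root tag into its "prev" slot; the live root tag
+	 * bits themselves are dropped since they are not OR'ed back in. */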
+	root->gfp_mask = (root->gfp_mask & __GFP_BITS_MASK) |
+		(root->gfp_mask & RADIX_ROOT_TAG_MASK) << RADIX_TREE_MAX_TAGS;
+}
+
 static inline void root_tag_clear_all(struct radix_tree_root *root)
 {
-	root->gfp_mask &= __GFP_BITS_MASK;
+	root->gfp_mask &= (__force gfp_t)~RADIX_ROOT_TAG_MASK;
 }
 
 static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
@@ -121,6 +126,27 @@ static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
 	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
 }
 
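+/*
+ * "prev" tags record, per tag, the state an item had just before the last
+ * radix_tree_tag_set/clear or radix_tree_delete.  They are kept in
+ * root->gfp_mask RADIX_TREE_MAX_TAGS bits above the root tag bits; the
+ * BUILD_BUG_ON in prev_tag_get checks that both sets still fit alongside
+ * the gfp bits in 32 bits.
+ */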
+static inline void prev_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask |= (1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
+static inline void prev_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask &= ~(1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
+static inline void prev_tag_clear_all(struct radix_tree_root *root)
+{
+	root->gfp_mask &= __GFP_BITS_MASK | RADIX_ROOT_TAG_MASK;
+}
+
+static inline int prev_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	BUILD_BUG_ON(__GFP_BITS_SHIFT + RADIX_TREE_MAX_TAGS * 2 > 32);
+	return root->gfp_mask & (1 << (tag + RADIX_TREE_MAX_TAGS + __GFP_BITS_SHIFT));
+}
+
 /*
  * Returns 1 if any slot in the node has this tag set.
  * Otherwise returns 0.
@@ -583,6 +609,8 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 {
 	unsigned int height, shift;
 	struct radix_tree_node *slot;
+	int prev = 0; /* suppress warning */
+	int right_prev = radix_tree_tag_get(root, index, tag);
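+	/* right_prev is the expected old state of the tag, cross-checked
+	 * below against the state observed while walking the tree. */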
 
 	height = root->height;
 	BUG_ON(index > radix_tree_maxindex(height));
@@ -590,11 +618,15 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 	slot = indirect_to_ptr(root->rnode);
 	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
 
+	if (!height)
+		prev = root_tag_get(root, tag);
+
 	while (height > 0) {
 		int offset;
 
 		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
-		if (!tag_get(slot, tag, offset))
+		prev = tag_get(slot, tag, offset);
+		if (!prev)
 			tag_set(slot, tag, offset);
 		slot = slot->slots[offset];
 		BUG_ON(slot == NULL);
@@ -602,6 +634,13 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
 		height--;
 	}
 
+	if (prev)
+		prev_tag_set(root, tag);
+	else
+		prev_tag_clear(root, tag);
+
+	BUG_ON(!prev != !right_prev);
+
 	/* set the root's tag bit */
 	if (slot && !root_tag_get(root, tag))
 		root_tag_set(root, tag);
@@ -631,6 +670,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 	struct radix_tree_node *slot = NULL;
 	unsigned int height, shift;
 	int uninitialized_var(offset);
+	int prev = 0; /* suppress warning */
+	int right_prev = radix_tree_tag_get(root, index, tag);
 
 	height = root->height;
 	if (index > radix_tree_maxindex(height))
@@ -639,6 +680,13 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 	shift = height * RADIX_TREE_MAP_SHIFT;
 	slot = indirect_to_ptr(root->rnode);
 
+	if (!height) {
+		prev = root_tag_get(root, tag);
+		if (prev)
+			root_tag_clear(root, tag);
+		goto out;
+	}
+
 	while (shift) {
 		if (slot == NULL)
 			goto out;
@@ -653,7 +701,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		goto out;
 
 	while (node) {
-		if (!tag_get(node, tag, offset))
+		prev = tag_get(node, tag, offset);
+		if (!prev)
 			goto out;
 		tag_clear(node, tag, offset);
 		if (any_tag_set(node, tag))
@@ -669,10 +718,27 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
 		root_tag_clear(root, tag);
 
 out:
+	if (prev)
+		prev_tag_set(root, tag);
+	else
+		prev_tag_clear(root, tag);
+
+	BUG_ON(!prev != !right_prev);
+
 	return slot;
 }
 EXPORT_SYMBOL(radix_tree_tag_clear);
 
+void __radix_tree_root_tag_move_all_to_prev(struct radix_tree_root *root)
+{
+	root_tag_move_all_to_prev(root);
+}
+
+void __radix_tree_prev_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	prev_tag_clear(root, tag);
+}
+
 /**
  * radix_tree_tag_get - get a tag on a radix tree node
  * @root:		radix tree root
@@ -1369,18 +1435,22 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	void **slot;
 	void *entry;
 	int tag;
+	int right_prev[RADIX_TREE_MAX_TAGS] = {0,};
 
 	entry = __radix_tree_lookup(root, index, &node, &slot);
 	if (!entry)
-		return NULL;
+		goto out_none;
 
 	if (item && entry != item)
-		return NULL;
+		goto out_none;
+
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		right_prev[tag] = radix_tree_tag_get(root, index, tag);
 
 	if (!node) {
-		root_tag_clear_all(root);
+		root_tag_move_all_to_prev(root);
 		root->rnode = NULL;
-		return entry;
+		goto out;
 	}
 
 	offset = index & RADIX_TREE_MAP_MASK;
@@ -1392,6 +1462,8 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 		if (tag_get(node, tag, offset))
 			radix_tree_tag_clear(root, index, tag);
+		else
+			prev_tag_clear(root, tag);
 	}
 
 	node->slots[offset] = NULL;
@@ -1399,7 +1471,13 @@ void *radix_tree_delete_item(struct radix_tree_root *root,
 
 	__radix_tree_delete_node(root, node);
 
+out:
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++)
+		BUG_ON(!right_prev[tag] != !prev_tag_get(root, tag));
 	return entry;
+out_none:
+	prev_tag_clear_all(root);
+	goto out;
 }
 EXPORT_SYMBOL(radix_tree_delete_item);
 
@@ -1429,6 +1507,19 @@ int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
 }
 EXPORT_SYMBOL(radix_tree_tagged);
 
+/**
+ *	radix_tree_prev_tag_get - get the previous tag state of the last
+ *			changed item.  Only valid right after
+ *			radix_tree_tag_set/clear for the tag that was changed,
+ *			or after radix_tree_delete for any tag
+ *	@root:		radix tree root
+ *	@tag:		tag to test
+ */
+int radix_tree_prev_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return prev_tag_get(root, tag);
+}
+EXPORT_SYMBOL(radix_tree_prev_tag_get);
+
 static void
 radix_tree_node_ctor(void *arg)
 {
--- a/lib/sha1.c
+++ b/lib/sha1.c
@@ -198,3 +198,4 @@ void sha_init(__u32 *buf)
 	buf[3] = 0x10325476;
 	buf[4] = 0xc3d2e1f0;
 }
+EXPORT_SYMBOL(sha_init);
--- a/lib/show_mem.c
+++ b/lib/show_mem.c
@@ -8,6 +8,7 @@
 #include <linux/mm.h>
 #include <linux/nmi.h>
 #include <linux/quicklist.h>
+#include <linux/module.h>
 
 void show_mem(unsigned int filter)
 {
@@ -47,3 +48,4 @@ void show_mem(unsigned int filter)
 		quicklist_total_size());
 #endif
 }
+EXPORT_SYMBOL(show_mem);
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -1,5 +1,6 @@
 #include <linux/module.h>
 #include <linux/uaccess.h>
+#include <linux/kasan-checks.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 
@@ -106,6 +107,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
 	src_addr = (unsigned long)src;
 	if (likely(src_addr < max_addr)) {
 		unsigned long max = max_addr - src_addr;
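+		/*
+		 * Let KASAN check the whole destination buffer up front, so
+		 * an undersized 'dst' is reported even if the copy ends early.
+		 */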
+		kasan_check_write(dst, count);
 		return do_strncpy_from_user(dst, src, count, max);
 	}
 	return -EFAULT;
--- /dev/null
+++ b/lib/test_kasan.c
@@ -0,0 +1,277 @@
+/*
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) "kasan test: %s " fmt, __func__
+
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/module.h>
+
+static noinline void __init kmalloc_oob_right(void)
+{
+	char *ptr;
+	size_t size = 123;
+
+	pr_info("out-of-bounds to right\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr[size] = 'x';
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_left(void)
+{
+	char *ptr;
+	size_t size = 15;
+
+	pr_info("out-of-bounds to left\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	*ptr = *(ptr - 1);
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_node_oob_right(void)
+{
+	char *ptr;
+	size_t size = 4096;
+
+	pr_info("kmalloc_node(): out-of-bounds to right\n");
+	ptr = kmalloc_node(size, GFP_KERNEL, 0);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr[size] = 0;
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_large_oob_right(void)
+{
+	char *ptr;
+	size_t size = KMALLOC_MAX_CACHE_SIZE + 10;
+
+	pr_info("kmalloc large allocation: out-of-bounds to right\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr[size] = 0;
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_oob_krealloc_more(void)
+{
+	char *ptr1, *ptr2;
+	size_t size1 = 17;
+	size_t size2 = 19;
+
+	pr_info("out-of-bounds after krealloc more\n");
+	ptr1 = kmalloc(size1, GFP_KERNEL);
+	ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+	if (!ptr1 || !ptr2) {
+		pr_err("Allocation failed\n");
+		kfree(ptr1);
+		return;
+	}
+
+	ptr2[size2] = 'x';
+	kfree(ptr2);
+}
+
+static noinline void __init kmalloc_oob_krealloc_less(void)
+{
+	char *ptr1, *ptr2;
+	size_t size1 = 17;
+	size_t size2 = 15;
+
+	pr_info("out-of-bounds after krealloc less\n");
+	ptr1 = kmalloc(size1, GFP_KERNEL);
+	ptr2 = krealloc(ptr1, size2, GFP_KERNEL);
+	if (!ptr1 || !ptr2) {
+		pr_err("Allocation failed\n");
+		kfree(ptr1);
+		return;
+	}
+	ptr2[size1] = 'x';
+	kfree(ptr2);
+}
+
+static noinline void __init kmalloc_oob_16(void)
+{
+	struct {
+		u64 words[2];
+	} *ptr1, *ptr2;
+
+	pr_info("kmalloc out-of-bounds for 16-bytes access\n");
+	ptr1 = kmalloc(sizeof(*ptr1) - 3, GFP_KERNEL);
+	ptr2 = kmalloc(sizeof(*ptr2), GFP_KERNEL);
+	if (!ptr1 || !ptr2) {
+		pr_err("Allocation failed\n");
+		kfree(ptr1);
+		kfree(ptr2);
+		return;
+	}
+	*ptr1 = *ptr2;
+	kfree(ptr1);
+	kfree(ptr2);
+}
+
+static noinline void __init kmalloc_oob_in_memset(void)
+{
+	char *ptr;
+	size_t size = 666;
+
+	pr_info("out-of-bounds in memset\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	memset(ptr, 0, size + 5);
+	kfree(ptr);
+}
+
+static noinline void __init kmalloc_uaf(void)
+{
+	char *ptr;
+	size_t size = 10;
+
+	pr_info("use-after-free\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	kfree(ptr);
+	*(ptr + 8) = 'x';
+}
+
+static noinline void __init kmalloc_uaf_memset(void)
+{
+	char *ptr;
+	size_t size = 33;
+
+	pr_info("use-after-free in memset\n");
+	ptr = kmalloc(size, GFP_KERNEL);
+	if (!ptr) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	kfree(ptr);
+	memset(ptr, 0, size);
+}
+
+static noinline void __init kmalloc_uaf2(void)
+{
+	char *ptr1, *ptr2;
+	size_t size = 43;
+
+	pr_info("use-after-free after another kmalloc\n");
+	ptr1 = kmalloc(size, GFP_KERNEL);
+	if (!ptr1) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	kfree(ptr1);
+	ptr2 = kmalloc(size, GFP_KERNEL);
+	if (!ptr2) {
+		pr_err("Allocation failed\n");
+		return;
+	}
+
+	ptr1[40] = 'x';
+	kfree(ptr2);
+}
+
+static noinline void __init kmem_cache_oob(void)
+{
+	char *p;
+	size_t size = 200;
+	struct kmem_cache *cache = kmem_cache_create("test_cache",
+						size, 0,
+						0, NULL);
+	if (!cache) {
+		pr_err("Cache allocation failed\n");
+		return;
+	}
+	pr_info("out-of-bounds in kmem_cache_alloc\n");
+	p = kmem_cache_alloc(cache, GFP_KERNEL);
+	if (!p) {
+		pr_err("Allocation failed\n");
+		kmem_cache_destroy(cache);
+		return;
+	}
+
+	*p = p[size];
+	kmem_cache_free(cache, p);
+	kmem_cache_destroy(cache);
+}
+
+static char global_array[10];
+
+static noinline void __init kasan_global_oob(void)
+{
+	volatile int i = 3;
+	char *p = &global_array[ARRAY_SIZE(global_array) + i];
+
+	pr_info("out-of-bounds global variable\n");
+	*(volatile char *)p;
+}
+
+static noinline void __init kasan_stack_oob(void)
+{
+	char stack_array[10];
+	volatile int i = 0;
+	char *p = &stack_array[ARRAY_SIZE(stack_array) + i];
+
+	pr_info("out-of-bounds on stack\n");
+	*(volatile char *)p;
+}
+
+static int __init kmalloc_tests_init(void)
+{
+	kmalloc_oob_right();
+	kmalloc_oob_left();
+	kmalloc_node_oob_right();
+	kmalloc_large_oob_right();
+	kmalloc_oob_krealloc_more();
+	kmalloc_oob_krealloc_less();
+	kmalloc_oob_16();
+	kmalloc_oob_in_memset();
+	kmalloc_uaf();
+	kmalloc_uaf_memset();
+	kmalloc_uaf2();
+	kmem_cache_oob();
+	kasan_stack_oob();
+	kasan_global_oob();
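+	/*
+	 * Fail the module load on purpose: the loader then unloads us again,
+	 * so the tests can be rerun with a fresh insmod.
+	 */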
+	return -EAGAIN;
+}
+
+module_init(kmalloc_tests_init);
+MODULE_LICENSE("GPL");
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -558,6 +558,37 @@ config MEM_SOFT_DIRTY
 
 	  See Documentation/vm/soft-dirty.txt for more details.
 
+config TCACHE
+	bool "Transcendent file cache"
+	depends on CLEANCACHE
+	default n
+	help
+	  Transcendent file cache is a simple backend for cleancache, which
+	  stores reclaimed pages in memory without any modifications. It is
+	  only worth enabling if used along with memory cgroups in order to
+	  cache pages which were reclaimed on local pressure.
+
+config TSWAP
+	bool "Transcendent swap cache"
+	depends on FRONTSWAP
+	default n
+	help
+	  Transcendent swap cache is a simple backend for frontswap, which
+	  stores reclaimed pages in memory without any modifications. It is
+	  only worth enabling if used along with memory cgroups in order to
+	  cache pages which were reclaimed on local pressure.
+
+config IDLE_PAGE_TRACKING
+	bool "Enable idle page tracking"
+	depends on SYSFS && MMU && 64BIT
+	help
+	  This feature allows estimating the number of user pages that have
+	  not been touched during a given period of time. This information can
+	  be useful to tune memory cgroup limits and/or for job placement
+	  within a compute cluster.
+
+	  See Documentation/vm/idle_page_tracking.txt for more details.
+
 config ZSWAP
 	bool "Compressed cache for swap pages (EXPERIMENTAL)"
 	depends on FRONTSWAP && CRYPTO=y
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -2,8 +2,26 @@
 # Makefile for the linux memory manager.
 #
 
+KASAN_SANITIZE_slab_common.o := n
+KASAN_SANITIZE_slub.o := n
+
+# These files are disabled because they produce non-interesting and/or
+# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
+# free pages, or a task is migrated between nodes.
+KCOV_INSTRUMENT_slab_common.o := n
+KCOV_INSTRUMENT_slob.o := n
+KCOV_INSTRUMENT_slab.o := n
+KCOV_INSTRUMENT_slub.o := n
+KCOV_INSTRUMENT_page_alloc.o := n
+KCOV_INSTRUMENT_debug-pagealloc.o := n
+KCOV_INSTRUMENT_kmemleak.o := n
+KCOV_INSTRUMENT_kmemcheck.o := n
+KCOV_INSTRUMENT_memcontrol.o := n
+KCOV_INSTRUMENT_mmzone.o := n
+KCOV_INSTRUMENT_vmstat.o := n
+
 mmu-y			:= nommu.o
-mmu-$(CONFIG_MMU)	:= fremap.o gup.o highmem.o madvise.o memory.o mincore.o \
+mmu-$(CONFIG_MMU)	:= gup.o highmem.o madvise.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
 			   vmalloc.o pagewalk.o pgtable-generic.o
 
@@ -17,7 +35,8 @@ obj-y			:= filemap.o mempool.o oom_kill.o fadvise.o \
 			   util.o mmzone.o vmstat.o backing-dev.o \
 			   mm_init.o mmu_context.o percpu.o slab_common.o \
 			   compaction.o \
-			   interval_tree.o list_lru.o workingset.o $(mmu-y)
+			   interval_tree.o list_lru.o workingset.o oom_group.o \
+			   iov-iter.o $(mmu-y)
 
 obj-y += init-mm.o
 
@@ -45,6 +64,7 @@ obj-$(CONFIG_PAGE_POISONING) += debug-pagealloc.o
 obj-$(CONFIG_SLAB) += slab.o
 obj-$(CONFIG_SLUB) += slub.o
 obj-$(CONFIG_KMEMCHECK) += kmemcheck.o
+obj-$(CONFIG_KASAN)	+= kasan/
 obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_MIGRATION) += migrate.o
@@ -67,3 +87,6 @@ obj-$(CONFIG_ZBUD)	+= zbud.o
 obj-$(CONFIG_ZSMALLOC)	+= zsmalloc.o
 obj-$(CONFIG_USERFAULTFD) += userfaultfd.o
 obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
+obj-$(CONFIG_TCACHE) += tcache.o
+obj-$(CONFIG_TSWAP) += tswap.o
+obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -69,10 +69,10 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 	unsigned long background_thresh;
 	unsigned long dirty_thresh;
 	unsigned long bdi_thresh;
-	unsigned long nr_dirty, nr_io, nr_more_io;
+	unsigned long nr_dirty, nr_io, nr_more_io, nr_dirty_time;
 	struct inode *inode;
 
-	nr_dirty = nr_io = nr_more_io = 0;
+	nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0;
 	spin_lock(&wb->list_lock);
 	list_for_each_entry(inode, &wb->b_dirty, i_wb_list)
 		nr_dirty++;
@@ -80,6 +80,9 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		nr_io++;
 	list_for_each_entry(inode, &wb->b_more_io, i_wb_list)
 		nr_more_io++;
+	list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list)
+		if (inode->i_state & I_DIRTY_TIME)
+			nr_dirty_time++;
 	spin_unlock(&wb->list_lock);
 
 	global_dirty_limits(&background_thresh, &dirty_thresh);
@@ -98,6 +101,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   "b_dirty:            %10lu\n"
 		   "b_io:               %10lu\n"
 		   "b_more_io:          %10lu\n"
+		   "b_dirty_time:       %10lu\n"
 		   "bdi_list:           %10u\n"
 		   "state:              %10lx\n",
 		   (unsigned long) K(bdi_stat(bdi, BDI_WRITEBACK)),
@@ -111,6 +115,7 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
 		   nr_dirty,
 		   nr_io,
 		   nr_more_io,
+		   nr_dirty_time,
 		   !list_empty(&bdi->bdi_list), bdi->state);
 #undef K
 
@@ -184,43 +189,52 @@ static ssize_t name##_show(struct device *dev,				\
 
 BDI_SHOW(read_ahead_kb, K(bdi->ra_pages))
 
-static ssize_t min_ratio_store(struct device *dev,
-		struct device_attribute *attr, const char *buf, size_t count)
+static inline ssize_t generic_uint_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count,
+		int (*set_func)(struct backing_dev_info *, unsigned int))
 {
 	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	unsigned int ratio;
+	unsigned int val;
 	ssize_t ret;
 
-	ret = kstrtouint(buf, 10, &ratio);
+	ret = kstrtouint(buf, 10, &val);
 	if (ret < 0)
 		return ret;
 
-	ret = bdi_set_min_ratio(bdi, ratio);
+	ret = set_func(bdi, val);
 	if (!ret)
 		ret = count;
 
 	return ret;
 }
+
+static ssize_t min_ratio_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_ratio);
+}
 BDI_SHOW(min_ratio, bdi->min_ratio)
 
 static ssize_t max_ratio_store(struct device *dev,
 		struct device_attribute *attr, const char *buf, size_t count)
 {
-	struct backing_dev_info *bdi = dev_get_drvdata(dev);
-	unsigned int ratio;
-	ssize_t ret;
-
-	ret = kstrtouint(buf, 10, &ratio);
-	if (ret < 0)
-		return ret;
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_ratio);
+}
+BDI_SHOW(max_ratio, bdi->max_ratio)
 
-	ret = bdi_set_max_ratio(bdi, ratio);
-	if (!ret)
-		ret = count;
+static ssize_t min_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_min_dirty);
+}
+BDI_SHOW(min_dirty_pages, bdi->min_dirty_pages)
 
-	return ret;
+static ssize_t max_dirty_pages_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t count)
+{
+	return generic_uint_store(dev, attr, buf, count, bdi_set_max_dirty);
 }
-BDI_SHOW(max_ratio, bdi->max_ratio)
+BDI_SHOW(max_dirty_pages, bdi->max_dirty_pages)
 
 static ssize_t stable_pages_required_show(struct device *dev,
 					  struct device_attribute *attr,
@@ -236,6 +250,8 @@ static struct device_attribute bdi_dev_attrs[] = {
 	__ATTR_RW(read_ahead_kb),
 	__ATTR_RW(min_ratio),
 	__ATTR_RW(max_ratio),
+	__ATTR_RW(min_dirty_pages),
+	__ATTR_RW(max_dirty_pages),
 	__ATTR_RO(stable_pages_required),
 	__ATTR_NULL,
 };
@@ -427,6 +443,7 @@ static void bdi_wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi)
 	INIT_LIST_HEAD(&wb->b_dirty);
 	INIT_LIST_HEAD(&wb->b_io);
 	INIT_LIST_HEAD(&wb->b_more_io);
+	INIT_LIST_HEAD(&wb->b_dirty_time);
 	spin_lock_init(&wb->list_lock);
 	INIT_DELAYED_WORK(&wb->dwork, bdi_writeback_workfn);
 }
@@ -445,9 +462,12 @@ int bdi_init(struct backing_dev_info *bdi)
 	bdi->min_ratio = 0;
 	bdi->max_ratio = 100;
 	bdi->max_prop_frac = FPROP_FRAC_BASE;
+	bdi->min_dirty_pages = 0;
+	bdi->max_dirty_pages = 0;
 	spin_lock_init(&bdi->wb_lock);
 	INIT_LIST_HEAD(&bdi->bdi_list);
 	INIT_LIST_HEAD(&bdi->work_list);
+	init_waitqueue_head(&bdi->cong_waitq);
 
 	bdi_wb_init(&bdi->wb, bdi);
 
--- a/mm/cleancache.c
+++ b/mm/cleancache.c
@@ -15,11 +15,12 @@
 #include <linux/fs.h>
 #include <linux/exportfs.h>
 #include <linux/mm.h>
+#include <linux/memcontrol.h>
 #include <linux/debugfs.h>
 #include <linux/cleancache.h>
 
 /*
- * cleancache_ops is set by cleancache_ops_register to contain the pointers
+ * cleancache_ops is set by cleancache_register_ops to contain the pointers
  * to the cleancache "backend" implementation functions.
  */
 static struct cleancache_ops *cleancache_ops __read_mostly;
@@ -34,145 +35,107 @@ static u64 cleancache_failed_gets;
 static u64 cleancache_puts;
 static u64 cleancache_invalidates;
 
-/*
- * When no backend is registered all calls to init_fs and init_shared_fs
- * are registered and fake poolids (FAKE_FS_POOLID_OFFSET or
- * FAKE_SHARED_FS_POOLID_OFFSET, plus offset in the respective array
- * [shared_|]fs_poolid_map) are given to the respective super block
- * (sb->cleancache_poolid) and no tmem_pools are created. When a backend
- * registers with cleancache the previous calls to init_fs and init_shared_fs
- * are executed to create tmem_pools and set the respective poolids. While no
- * backend is registered all "puts", "gets" and "flushes" are ignored or failed.
- */
-#define MAX_INITIALIZABLE_FS 32
-#define FAKE_FS_POOLID_OFFSET 1000
-#define FAKE_SHARED_FS_POOLID_OFFSET 2000
-
-#define FS_NO_BACKEND (-1)
-#define FS_UNKNOWN (-2)
-static int fs_poolid_map[MAX_INITIALIZABLE_FS];
-static int shared_fs_poolid_map[MAX_INITIALIZABLE_FS];
-static char *uuids[MAX_INITIALIZABLE_FS];
-/*
- * Mutex for the [shared_|]fs_poolid_map to guard against multiple threads
- * invoking umount (and ending in __cleancache_invalidate_fs) and also multiple
- * threads calling mount (and ending up in __cleancache_init_[shared|]fs).
- */
-static DEFINE_MUTEX(poolid_mutex);
-/*
- * When set to false (default) all calls to the cleancache functions, except
- * the __cleancache_invalidate_fs and __cleancache_init_[shared|]fs are guarded
- * by the if (!cleancache_ops) return. This means multiple threads (from
- * different filesystems) will be checking cleancache_ops. The usage of a
- * bool instead of a atomic_t or a bool guarded by a spinlock is OK - we are
- * OK if the time between the backend's have been initialized (and
- * cleancache_ops has been set to not NULL) and when the filesystems start
- * actually calling the backends. The inverse (when unloading) is obviously
- * not good - but this shim does not do that (yet).
- */
-
-/*
- * The backends and filesystems work all asynchronously. This is b/c the
- * backends can be built as modules.
- * The usual sequence of events is:
- *	a) mount /	-> __cleancache_init_fs is called. We set the
- *		[shared_|]fs_poolid_map and uuids for.
- *
- *	b). user does I/Os -> we call the rest of __cleancache_* functions
- *		which return immediately as cleancache_ops is false.
- *
- *	c). modprobe zcache -> cleancache_register_ops. We init the backend
- *		and set cleancache_ops to true, and for any fs_poolid_map
- *		(which is set by __cleancache_init_fs) we initialize the poolid.
- *
- *	d). user does I/Os -> now that cleancache_ops is true all the
- *		__cleancache_* functions can call the backend. They all check
- *		that fs_poolid_map is valid and if so invoke the backend.
- *
- *	e). umount /	-> __cleancache_invalidate_fs, the fs_poolid_map is
- *		reset (which is the second check in the __cleancache_* ops
- *		to call the backend).
- *
- * The sequence of event could also be c), followed by a), and d). and e). The
- * c) would not happen anymore. There is also the chance of c), and one thread
- * doing a) + d), and another doing e). For that case we depend on the
- * filesystem calling __cleancache_invalidate_fs in the proper sequence (so
- * that it handles all I/Os before it invalidates the fs (which is last part
- * of unmounting process).
- *
- * Note: The acute reader will notice that there is no "rmmod zcache" case.
- * This is b/c the functionality for that is not yet implemented and when
- * done, will require some extra locking not yet devised.
- */
+static void cleancache_register_ops_sb(struct super_block *sb, void *unused)
+{
+	switch (sb->cleancache_poolid) {
+	case CLEANCACHE_NO_BACKEND:
+		__cleancache_init_fs(sb);
+		break;
+	case CLEANCACHE_NO_BACKEND_SHARED:
+		__cleancache_init_shared_fs(sb);
+		break;
+	}
+}
 
 /*
- * Register operations for cleancache, returning previous thus allowing
- * detection of multiple backends and possible nesting.
+ * Register operations for cleancache. Returns 0 on success.
  */
-struct cleancache_ops *cleancache_register_ops(struct cleancache_ops *ops)
+int cleancache_register_ops(struct cleancache_ops *ops)
 {
-	struct cleancache_ops *old = cleancache_ops;
-	int i;
+	if (cmpxchg(&cleancache_ops, NULL, ops))
+		return -EBUSY;
 
-	mutex_lock(&poolid_mutex);
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		if (fs_poolid_map[i] == FS_NO_BACKEND)
-			fs_poolid_map[i] = ops->init_fs(PAGE_SIZE);
-		if (shared_fs_poolid_map[i] == FS_NO_BACKEND)
-			shared_fs_poolid_map[i] = ops->init_shared_fs
-					(uuids[i], PAGE_SIZE);
-	}
 	/*
-	 * We MUST set cleancache_ops _after_ we have called the backends
-	 * init_fs or init_shared_fs functions. Otherwise the compiler might
-	 * re-order where cleancache_ops is set in this function.
+	 * A cleancache backend can be built as a module and hence loaded after
+	 * a cleancache enabled filesystem has called cleancache_init_fs. To
+	 * handle such a scenario, here we call ->init_fs or ->init_shared_fs
+	 * for each active super block. To differentiate between local and
+	 * shared filesystems, we temporarily initialize sb->cleancache_poolid
+	 * to CLEANCACHE_NO_BACKEND or CLEANCACHE_NO_BACKEND_SHARED
+	 * respectively in case there is no backend registered at the time
+	 * cleancache_init_fs or cleancache_init_shared_fs is called.
+	 *
+	 * Since filesystems can be mounted concurrently with cleancache
+	 * backend registration, we have to be careful to guarantee that every
+	 * cleancache enabled filesystem that was mounted by the time
+	 * cleancache_register_ops was called has got, and every one mounted
+	 * later will get, a cleancache_poolid. This is assured by the
+	 * following statements tied together:
+	 *
+	 * a) iterate_supers skips only those super blocks that have started
+	 *    ->kill_sb
+	 *
+	 * b) if iterate_supers encounters a super block that has not finished
+	 *    ->mount yet, it waits until it is finished
+	 *
+	 * c) cleancache_init_fs is called from ->mount and
+	 *    cleancache_invalidate_fs is called from ->kill_sb
+	 *
+	 * d) we call iterate_supers after cleancache_ops has been set
+	 *
+	 * From a) it follows that if iterate_supers skips a super block, then
+	 * either the super block is already dead, in which case we do not need
+	 * to bother initializing cleancache for it, or it was mounted after we
+	 * initiated iterate_supers. In the latter case, it must have seen
+	 * cleancache_ops set according to d) and initialized cleancache from
+	 * ->mount by itself according to c). This proves that we call
+	 * ->init_fs at least once for each active super block.
+	 *
+	 * From b) and c) it follows that if iterate_supers encounters a super
+	 * block that has already started ->init_fs, it will wait until ->mount
+	 * and hence ->init_fs has finished, then check cleancache_poolid, see
+	 * that it has already been set and therefore do nothing. This proves
+	 * that we call ->init_fs no more than once for each super block.
+	 *
+	 * Combined together, the last two paragraphs prove the function
+	 * correctness.
+	 *
+	 * Note that various cleancache callbacks may proceed before this
+	 * function is called or even concurrently with it, but since
+	 * CLEANCACHE_NO_BACKEND is negative, they will all result in a noop
+	 * until the corresponding ->init_fs has been actually called and
+	 * cleancache_ops has been set.
 	 */
-	barrier();
-	cleancache_ops = ops;
-	mutex_unlock(&poolid_mutex);
-	return old;
+	iterate_supers(cleancache_register_ops_sb, NULL);
+	return 0;
 }
 EXPORT_SYMBOL(cleancache_register_ops);
 
 /* Called by a cleancache-enabled filesystem at time of mount */
 void __cleancache_init_fs(struct super_block *sb)
 {
-	int i;
+	int pool_id = CLEANCACHE_NO_BACKEND;
 
-	mutex_lock(&poolid_mutex);
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		if (fs_poolid_map[i] == FS_UNKNOWN) {
-			sb->cleancache_poolid = i + FAKE_FS_POOLID_OFFSET;
-			if (cleancache_ops)
-				fs_poolid_map[i] = cleancache_ops->init_fs(PAGE_SIZE);
-			else
-				fs_poolid_map[i] = FS_NO_BACKEND;
-			break;
-		}
+	if (cleancache_ops) {
+		pool_id = cleancache_ops->init_fs(PAGE_SIZE);
+		if (pool_id < 0)
+			pool_id = CLEANCACHE_NO_POOL;
 	}
-	mutex_unlock(&poolid_mutex);
+	sb->cleancache_poolid = pool_id;
 }
 EXPORT_SYMBOL(__cleancache_init_fs);
 
 /* Called by a cleancache-enabled clustered filesystem at time of mount */
-void __cleancache_init_shared_fs(char *uuid, struct super_block *sb)
+void __cleancache_init_shared_fs(struct super_block *sb)
 {
-	int i;
+	int pool_id = CLEANCACHE_NO_BACKEND_SHARED;
 
-	mutex_lock(&poolid_mutex);
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		if (shared_fs_poolid_map[i] == FS_UNKNOWN) {
-			sb->cleancache_poolid = i + FAKE_SHARED_FS_POOLID_OFFSET;
-			uuids[i] = uuid;
-			if (cleancache_ops)
-				shared_fs_poolid_map[i] = cleancache_ops->init_shared_fs
-						(uuid, PAGE_SIZE);
-			else
-				shared_fs_poolid_map[i] = FS_NO_BACKEND;
-			break;
-		}
+	if (cleancache_ops) {
+		pool_id = cleancache_ops->init_shared_fs(sb->s_uuid, PAGE_SIZE);
+		if (pool_id < 0)
+			pool_id = CLEANCACHE_NO_POOL;
 	}
-	mutex_unlock(&poolid_mutex);
+	sb->cleancache_poolid = pool_id;
 }
 EXPORT_SYMBOL(__cleancache_init_shared_fs);
 
@@ -202,19 +165,6 @@ static int cleancache_get_key(struct inode *inode,
 }
 
 /*
- * Returns a pool_id that is associated with a given fake poolid.
- */
-static int get_poolid_from_fake(int fake_pool_id)
-{
-	if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET)
-		return shared_fs_poolid_map[fake_pool_id -
-			FAKE_SHARED_FS_POOLID_OFFSET];
-	else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET)
-		return fs_poolid_map[fake_pool_id - FAKE_FS_POOLID_OFFSET];
-	return FS_NO_BACKEND;
-}
-
-/*
  * "Get" data from cleancache associated with the poolid/inode/index
  * that were specified when the data was put to cleanache and, if
  * successful, use it to fill the specified page with data and return 0.
@@ -229,7 +179,6 @@ int __cleancache_get_page(struct page *page)
 {
 	int ret = -1;
 	int pool_id;
-	int fake_pool_id;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
 	if (!cleancache_ops) {
@@ -238,17 +187,14 @@ int __cleancache_get_page(struct page *page)
 	}
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
-	if (fake_pool_id < 0)
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
+	if (pool_id < 0)
 		goto out;
-	pool_id = get_poolid_from_fake(fake_pool_id);
 
 	if (cleancache_get_key(page->mapping->host, &key) < 0)
 		goto out;
 
-	if (pool_id >= 0)
-		ret = cleancache_ops->get_page(pool_id,
-				key, page->index, page);
+	ret = cleancache_ops->get_page(pool_id, key, page->index, page);
 	if (ret == 0)
 		cleancache_succ_gets++;
 	else
@@ -271,7 +217,6 @@ EXPORT_SYMBOL(__cleancache_get_page);
 void __cleancache_put_page(struct page *page)
 {
 	int pool_id;
-	int fake_pool_id;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
 	if (!cleancache_ops) {
@@ -280,16 +225,16 @@ void __cleancache_put_page(struct page *page)
 	}
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
-	fake_pool_id = page->mapping->host->i_sb->cleancache_poolid;
-	if (fake_pool_id < 0)
-		return;
-
-	pool_id = get_poolid_from_fake(fake_pool_id);
-
+	pool_id = page->mapping->host->i_sb->cleancache_poolid;
 	if (pool_id >= 0 &&
 		cleancache_get_key(page->mapping->host, &key) >= 0) {
-		cleancache_ops->put_page(pool_id, key, page->index, page);
-		cleancache_puts++;
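+		/*
+		 * If the page's memcg has cleancache disabled, drop any stale
+		 * copy of the page instead of storing a new one.
+		 */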
+		if (!mem_cgroup_cleancache_disabled(page)) {
+			cleancache_ops->put_page(pool_id, key,
+						 page->index, page);
+			cleancache_puts++;
+		} else
+			cleancache_ops->invalidate_page(pool_id, key,
+							page->index);
 	}
 }
 EXPORT_SYMBOL(__cleancache_put_page);
@@ -306,18 +251,13 @@ void __cleancache_invalidate_page(struct address_space *mapping,
 					struct page *page)
 {
 	/* careful... page->mapping is NULL sometimes when this is called */
-	int pool_id;
-	int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
 	if (!cleancache_ops)
 		return;
 
-	if (fake_pool_id >= 0) {
-		pool_id = get_poolid_from_fake(fake_pool_id);
-		if (pool_id < 0)
-			return;
-
+	if (pool_id >= 0) {
 		VM_BUG_ON_PAGE(!PageLocked(page), page);
 		if (cleancache_get_key(mapping->host, &key) >= 0) {
 			cleancache_ops->invalidate_page(pool_id,
@@ -339,18 +279,12 @@ EXPORT_SYMBOL(__cleancache_invalidate_page);
  */
 void __cleancache_invalidate_inode(struct address_space *mapping)
 {
-	int pool_id;
-	int fake_pool_id = mapping->host->i_sb->cleancache_poolid;
+	int pool_id = mapping->host->i_sb->cleancache_poolid;
 	struct cleancache_filekey key = { .u.key = { 0 } };
 
 	if (!cleancache_ops)
 		return;
 
-	if (fake_pool_id < 0)
-		return;
-
-	pool_id = get_poolid_from_fake(fake_pool_id);
-
 	if (pool_id >= 0 && cleancache_get_key(mapping->host, &key) >= 0)
 		cleancache_ops->invalidate_inode(pool_id, key);
 }
@@ -363,32 +297,18 @@ EXPORT_SYMBOL(__cleancache_invalidate_inode);
  */
 void __cleancache_invalidate_fs(struct super_block *sb)
 {
-	int index;
-	int fake_pool_id = sb->cleancache_poolid;
-	int old_poolid = fake_pool_id;
+	int pool_id;
 
-	mutex_lock(&poolid_mutex);
-	if (fake_pool_id >= FAKE_SHARED_FS_POOLID_OFFSET) {
-		index = fake_pool_id - FAKE_SHARED_FS_POOLID_OFFSET;
-		old_poolid = shared_fs_poolid_map[index];
-		shared_fs_poolid_map[index] = FS_UNKNOWN;
-		uuids[index] = NULL;
-	} else if (fake_pool_id >= FAKE_FS_POOLID_OFFSET) {
-		index = fake_pool_id - FAKE_FS_POOLID_OFFSET;
-		old_poolid = fs_poolid_map[index];
-		fs_poolid_map[index] = FS_UNKNOWN;
-	}
-	sb->cleancache_poolid = -1;
-	if (cleancache_ops)
-		cleancache_ops->invalidate_fs(old_poolid);
-	mutex_unlock(&poolid_mutex);
+	pool_id = sb->cleancache_poolid;
+	sb->cleancache_poolid = CLEANCACHE_NO_POOL;
+
+	if (cleancache_ops && pool_id >= 0)
+		cleancache_ops->invalidate_fs(pool_id);
 }
 EXPORT_SYMBOL(__cleancache_invalidate_fs);
 
 static int __init init_cleancache(void)
 {
-	int i;
-
 #ifdef CONFIG_DEBUG_FS
 	struct dentry *root = debugfs_create_dir("cleancache", NULL);
 	if (root == NULL)
@@ -400,10 +320,6 @@ static int __init init_cleancache(void)
 	debugfs_create_u64("invalidates", S_IRUGO,
 				root, &cleancache_invalidates);
 #endif
-	for (i = 0; i < MAX_INITIALIZABLE_FS; i++) {
-		fs_poolid_map[i] = FS_UNKNOWN;
-		shared_fs_poolid_map[i] = FS_UNKNOWN;
-	}
 	return 0;
 }
 module_init(init_cleancache)
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -16,6 +16,7 @@
 #include <linux/sysfs.h>
 #include <linux/balloon_compaction.h>
 #include <linux/page-isolation.h>
+#include <linux/kasan.h>
 #include "internal.h"
 
 #ifdef CONFIG_COMPACTION
@@ -59,6 +60,7 @@ static void map_pages(struct list_head *list)
 	list_for_each_entry(page, list, lru) {
 		arch_alloc_page(page, 0);
 		kernel_map_pages(page, 1, 1);
+		kasan_alloc_pages(page, 0);
 	}
 }
 
@@ -431,6 +433,10 @@ static bool too_many_isolated(struct zone *zone)
 	isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
 					zone_page_state(zone, NR_ISOLATED_ANON);
 
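+	/*
+	 * The cheap zone counters above can overshoot because of pending
+	 * per-cpu deltas; before deciding to throttle, re-read them with the
+	 * slower but exact _snapshot variants.
+	 */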
+	if (isolated > (inactive + active) / 2)
+		isolated = zone_page_state_snapshot(zone, NR_ISOLATED_FILE) +
+			   zone_page_state_snapshot(zone, NR_ISOLATED_ANON);
+
 	return isolated > (inactive + active) / 2;
 }
 
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -7,6 +7,7 @@
  *		Initial version.
  */
 
+#include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/file.h>
 #include <linux/fs.h>
@@ -21,15 +22,51 @@
 
 #include <asm/unistd.h>
 
+static void fadvise_deactivate(struct address_space *mapping,
+		pgoff_t start, pgoff_t end)
+{
+	struct pagevec pvec;
+	pgoff_t index = start;
+	int i;
+
+	if (start > end)
+		return;
+
+	/*
+	 * Note: this function may get called on a shmem/tmpfs mapping:
+	 * pagevec_lookup() might then return 0 prematurely (because it
+	 * got a gangful of swap entries); but it's hardly worth worrying
+	 * about - it can rarely have anything to free from such a mapping
+	 * (most pages are dirty), and already skips over any difficulties.
+	 */
+
+	pagevec_init(&pvec, 0);
+	while (index <= end && pagevec_lookup(&pvec, mapping, index,
+			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) {
+		for (i = 0; i < pagevec_count(&pvec); i++) {
+			struct page *page = pvec.pages[i];
+
+			/* We rely upon deletion not changing page->index */
+			index = page->index;
+			if (index > end)
+				break;
+
+			deactivate_page(page);
+		}
+		pagevec_release(&pvec);
+		cond_resched();
+		index++;
+	}
+}
+
 /*
  * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
  * deactivate the pages and clear PG_Referenced.
  */
-SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
 {
-	struct fd f = fdget(fd);
 	struct inode *inode;
-	struct address_space *mapping;
+	struct address_space *mapping = file->f_mapping;
 	struct backing_dev_info *bdi;
 	loff_t endbyte;			/* inclusive */
 	pgoff_t start_index;
@@ -37,20 +74,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 	unsigned long nrpages;
 	int ret = 0;
 
-	if (!f.file)
-		return -EBADF;
-
-	inode = file_inode(f.file);
-	if (S_ISFIFO(inode->i_mode)) {
-		ret = -ESPIPE;
-		goto out;
-	}
-
-	mapping = f.file->f_mapping;
-	if (!mapping || len < 0) {
-		ret = -EINVAL;
-		goto out;
-	}
+	inode = file_inode(file);
 
 	if (IS_DAX(inode)) {
 		switch (advice) {
@@ -60,6 +84,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		case POSIX_FADV_WILLNEED:
 		case POSIX_FADV_NOREUSE:
 		case POSIX_FADV_DONTNEED:
+		case FADV_DEACTIVATE:
 			/* no bad return value, but ignore advice */
 			break;
 		default:
@@ -79,21 +104,21 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 
 	switch (advice) {
 	case POSIX_FADV_NORMAL:
-		f.file->f_ra.ra_pages = bdi->ra_pages;
-		spin_lock(&f.file->f_lock);
-		f.file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&f.file->f_lock);
+		file->f_ra.ra_pages = bdi->ra_pages;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_RANDOM:
-		spin_lock(&f.file->f_lock);
-		f.file->f_mode |= FMODE_RANDOM;
-		spin_unlock(&f.file->f_lock);
+		spin_lock(&file->f_lock);
+		file->f_mode |= FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_SEQUENTIAL:
-		f.file->f_ra.ra_pages = bdi->ra_pages * 2;
-		spin_lock(&f.file->f_lock);
-		f.file->f_mode &= ~FMODE_RANDOM;
-		spin_unlock(&f.file->f_lock);
+		file->f_ra.ra_pages = bdi->ra_pages * 2;
+		spin_lock(&file->f_lock);
+		file->f_mode &= ~FMODE_RANDOM;
+		spin_unlock(&file->f_lock);
 		break;
 	case POSIX_FADV_WILLNEED:
 		/* First and last PARTIAL page! */
@@ -109,7 +134,7 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 		 * Ignore return value because fadvise() shall return
 		 * success even if filesystem can't retrieve a hint,
 		 */
-		force_page_cache_readahead(mapping, f.file, start_index,
+		force_page_cache_readahead(mapping, file, start_index,
 					   nrpages);
 		break;
 	case POSIX_FADV_NOREUSE:
@@ -140,11 +165,43 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
 			}
 		}
 		break;
+	case FADV_DEACTIVATE:
+		start_index = (offset + (PAGE_CACHE_SIZE - 1)) >> PAGE_CACHE_SHIFT;
+		end_index = (endbyte >> PAGE_CACHE_SHIFT);
+		fadvise_deactivate(mapping, start_index, end_index);
+		break;
 	default:
 		ret = -EINVAL;
 	}
 out:
-	fdput(f);
+	return ret;
+}
+EXPORT_SYMBOL(generic_fadvise);
+
+SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice)
+{
+	struct file *file = fget(fd);
+	int (*fadvise)(struct file *, loff_t, loff_t, int) = generic_fadvise;
+	int ret = 0;
+
+	if (!file)
+		return -EBADF;
+
+	if (S_ISFIFO(file->f_path.dentry->d_inode->i_mode)) {
+		ret = -ESPIPE;
+		goto out;
+	}
+
+	if (!file->f_mapping || len < 0) {
+		ret = -EINVAL;
+		goto out;
+	}
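+	/* Prefer the filesystem's own ->fadvise hook when one is provided. */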
+	if (file->f_op && file->f_op->fadvise)
+		fadvise = file->f_op->fadvise;
+
+	ret = fadvise(file, offset, len, advice);
+out:
+	fput(file);
 	return ret;
 }
 
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/cpuset.h>
 #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */
+#include <linux/hugetlb.h>
 #include <linux/memcontrol.h>
 #include <linux/cleancache.h>
 #include "internal.h"
@@ -46,6 +47,9 @@
 
 #include <asm/mman.h>
 
+#include <linux/virtinfo.h>
+#include <bc/io_acct.h>
+
 /*
  * Shared mappings implemented 30.11.1994. It's not fully working yet,
  * though.
@@ -135,7 +139,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
 
 	if (!node) {
 		/* Clear direct pointer tags in root node */
-		mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
+		__radix_tree_root_tag_move_all_to_prev(&mapping->page_tree);
 		radix_tree_replace_slot(slot, shadow);
 		return;
 	}
@@ -146,6 +150,8 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
 		if (test_bit(offset, node->tags[tag]))
 			radix_tree_tag_clear(&mapping->page_tree, index, tag);
+		else
+			__radix_tree_prev_tag_clear(&mapping->page_tree, tag);
 	}
 
 	/* Delete page, swap shadow entry */
@@ -167,7 +173,7 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	if (!workingset_node_pages(node) &&
 	    list_empty(&node->private_list)) {
 		node->private_data = mapping;
-		workingset_remember_node(node);
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
 	}
 }
 
@@ -192,6 +198,15 @@ void __delete_from_page_cache(struct page *page, void *shadow)
 		cleancache_invalidate_page(mapping, page);
 
 	page_cache_tree_delete(mapping, page, shadow);
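+	/*
+	 * page_cache_tree_delete() latched the page's old radix-tree tags;
+	 * use them to roll back the per-beancounter dirty and writeback
+	 * accounting for this mapping.
+	 */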
+	if (mapping_cap_account_dirty(mapping) &&
+			radix_tree_prev_tag_get(&mapping->page_tree,
+				PAGECACHE_TAG_DIRTY))
+		ub_io_account_cancel(mapping);
+
+	if (mapping_cap_account_writeback(mapping) &&
+			radix_tree_prev_tag_get(&mapping->page_tree,
+				PAGECACHE_TAG_WRITEBACK))
+		ub_io_writeback_dec(mapping);
 
 	page->mapping = NULL;
 	/* Leave page->index set: truncation lookup relies upon it */
@@ -233,7 +248,6 @@ void delete_from_page_cache(struct page *page)
 	spin_lock_irq(&mapping->tree_lock);
 	__delete_from_page_cache(page, NULL);
 	spin_unlock_irq(&mapping->tree_lock);
-	mem_cgroup_uncharge_cache_page(page);
 
 	if (freepage)
 		freepage(page);
@@ -530,8 +544,7 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
 		if (PageSwapBacked(new))
 			__inc_zone_page_state(new, NR_SHMEM);
 		spin_unlock_irq(&mapping->tree_lock);
-		/* mem_cgroup codes must not be called under tree_lock */
-		mem_cgroup_replace_page_cache(old, new);
+		mem_cgroup_migrate(old, new, true);
 		radix_tree_preload_end();
 		if (freepage)
 			freepage(old);
@@ -582,7 +595,8 @@ static int page_cache_tree_insert(struct address_space *mapping,
 		 * mapping->tree_lock.
 		 */
 		if (!list_empty(&node->private_list))
-			workingset_forget_node(node);
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
 	}
 	return 0;
 }
@@ -592,15 +606,19 @@ static int __add_to_page_cache_locked(struct page *page,
 				      pgoff_t offset, gfp_t gfp_mask,
 				      void **shadowp)
 {
+	int huge = PageHuge(page);
+	struct mem_cgroup *memcg;
 	int error;
 
 	VM_BUG_ON(!PageLocked(page));
 	VM_BUG_ON(PageSwapBacked(page));
 
-	error = mem_cgroup_cache_charge(page, current->mm,
-					gfp_mask & GFP_RECLAIM_MASK);
-	if (error)
-		goto out;
+	if (!huge) {
+		error = mem_cgroup_try_charge(page, current->mm, gfp_mask,
+					&memcg);
+		if (error)
+			return error;
+	}
 
 	error = radix_tree_maybe_preload(gfp_mask & ~__GFP_HIGHMEM);
 	if (error == 0) {
@@ -613,18 +631,20 @@ static int __add_to_page_cache_locked(struct page *page,
 		if (likely(!error)) {
 			__inc_zone_page_state(page, NR_FILE_PAGES);
 			spin_unlock_irq(&mapping->tree_lock);
+			if (!huge)
+				mem_cgroup_commit_charge(page, memcg, false);
 			trace_mm_filemap_add_to_page_cache(page);
 		} else {
 			page->mapping = NULL;
 			/* Leave page->index set: truncation relies upon it */
 			spin_unlock_irq(&mapping->tree_lock);
-			mem_cgroup_uncharge_cache_page(page);
+			if (!huge)
+				mem_cgroup_cancel_charge(page, memcg);
 			page_cache_release(page);
 		}
 		radix_tree_preload_end();
-	} else
-		mem_cgroup_uncharge_cache_page(page);
-out:
+	} else if (!huge)
+		mem_cgroup_cancel_charge(page, memcg);
 	return error;
 }
 
@@ -1562,162 +1582,6 @@ static void shrink_readahead_size_eio(struct file *filp,
 	ra->ra_pages /= 4;
 }
 
-size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
-	                 struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-	void *kaddr, *from;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	if (!fault_in_pages_writeable(buf, copy)) {
-		kaddr = kmap_atomic(page);
-		from = kaddr + offset;
-
-		/* first chunk, usually the only one */
-		left = __copy_to_user_inatomic(buf, from, copy);
-		copy -= left;
-		skip += copy;
-		from += copy;
-		bytes -= copy;
-
-		while (unlikely(!left && bytes)) {
-			iov++;
-			buf = iov->iov_base;
-			copy = min(bytes, iov->iov_len);
-			left = __copy_to_user_inatomic(buf, from, copy);
-			copy -= left;
-			skip = copy;
-			from += copy;
-			bytes -= copy;
-		}
-		if (likely(!bytes)) {
-			kunmap_atomic(kaddr);
-			goto done;
-		}
-		offset = from - kaddr;
-		buf += copy;
-		kunmap_atomic(kaddr);
-		copy = min(bytes, iov->iov_len - skip);
-	}
-	/* Too bad - revert to non-atomic kmap */
-	kaddr = kmap(page);
-	from = kaddr + offset;
-	left = __copy_to_user(buf, from, copy);
-	copy -= left;
-	skip += copy;
-	from += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_to_user(buf, from, copy);
-		copy -= left;
-		skip = copy;
-		from += copy;
-		bytes -= copy;
-	}
-	kunmap(page);
-done:
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_to_iter);
-
-size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
-			   struct iov_iter *i)
-{
-	size_t skip, copy, left, wanted;
-	const struct iovec *iov;
-	char __user *buf;
-	void *kaddr, *to;
-
-	if (unlikely(bytes > i->count))
-		bytes = i->count;
-
-	if (unlikely(!bytes))
-		return 0;
-
-	wanted = bytes;
-	iov = i->iov;
-	skip = i->iov_offset;
-	buf = iov->iov_base + skip;
-	copy = min(bytes, iov->iov_len - skip);
-
-	if (!fault_in_pages_readable(buf, copy)) {
-		kaddr = kmap_atomic(page);
-		to = kaddr + offset;
-
-		/* first chunk, usually the only one */
-		left = __copy_from_user_inatomic(to, buf, copy);
-		copy -= left;
-		skip += copy;
-		to += copy;
-		bytes -= copy;
-
-		while (unlikely(!left && bytes)) {
-			iov++;
-			buf = iov->iov_base;
-			copy = min(bytes, iov->iov_len);
-			left = __copy_from_user_inatomic(to, buf, copy);
-			copy -= left;
-			skip = copy;
-			to += copy;
-			bytes -= copy;
-		}
-		if (likely(!bytes)) {
-			kunmap_atomic(kaddr);
-			goto done;
-		}
-		offset = to - kaddr;
-		buf += copy;
-		kunmap_atomic(kaddr);
-		copy = min(bytes, iov->iov_len - skip);
-	}
-	/* Too bad - revert to non-atomic kmap */
-	kaddr = kmap(page);
-	to = kaddr + offset;
-	left = __copy_from_user(to, buf, copy);
-	copy -= left;
-	skip += copy;
-	to += copy;
-	bytes -= copy;
-	while (unlikely(!left && bytes)) {
-		iov++;
-		buf = iov->iov_base;
-		copy = min(bytes, iov->iov_len);
-		left = __copy_from_user(to, buf, copy);
-		copy -= left;
-		skip = copy;
-		to += copy;
-		bytes -= copy;
-	}
-	kunmap(page);
-done:
-	i->count -= wanted - bytes;
-	i->nr_segs -= iov - i->iov;
-	i->iov = iov;
-	i->iov_offset = skip;
-	return wanted - bytes;
-}
-EXPORT_SYMBOL(copy_page_from_iter);
-
 /**
  * do_generic_file_read - generic file read routine
  * @filp:	the file to read
@@ -1759,6 +1623,12 @@ static void do_generic_file_read(struct file *filp, loff_t *ppos,
 		cond_resched();
 find_page:
 		page = find_get_page(mapping, index);
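+		/* Consult the peer file's page cache before falling back to
+		 * readahead. */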
+		if (!page && mapping->i_peer_file) {
+			page = pick_peer_page(mapping, index, ra,
+					      last_index - index);
+			if (page)
+				goto page_ok;
+		}
 		if (!page) {
 			page_cache_sync_readahead(mapping,
 					ra, filp,
@@ -1851,6 +1721,8 @@ page_ok:
 		goto out;
 
 page_not_up_to_date:
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 		/* Get exclusive access to the page ... */
 		error = lock_page_killable(page);
 		if (unlikely(error))
@@ -1918,6 +1790,8 @@ readpage_error:
 		goto out;
 
 no_cached_page:
+		virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 		/*
 		 * Ok, it wasn't cached, so we need to create a new
 		 * page..
@@ -2025,31 +1899,60 @@ int generic_segment_checks(const struct iovec *iov,
 }
 EXPORT_SYMBOL(generic_segment_checks);
 
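+/*
+ * Dispatch the mapping's direct I/O method based on the flavour of the
+ * iov_iter: plain iovecs, a bio_vec array, or a single page.
+ */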
+static ssize_t mapping_direct_IO(struct address_space *mapping, int rw,
+			         struct kiocb *iocb, struct iov_iter *iter,
+			         loff_t pos)
+{
+	if (iov_iter_has_iovec(iter))
+		return mapping->a_ops->direct_IO(rw, iocb, iov_iter_iovec(iter),
+						 pos, iter->nr_segs);
+	else if (iov_iter_has_bvec(iter))
+		return mapping->a_ops->direct_IO_bvec(rw, iocb,
+						      iov_iter_bvec(iter), pos,
+						      iter->nr_segs);
+	else if (iov_iter_has_page(iter))
+		return mapping->a_ops->direct_IO_page(rw, iocb,
+						      iov_iter_page(iter), pos);
+	else
+		BUG();
+}
+
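+/*
+ * read_descriptor actor for do_generic_file_read(): copies page contents
+ * into the iov_iter carried in desc->arg.data and accounts the progress.
+ */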
+static int file_read_iter_actor(read_descriptor_t *desc, struct page *page,
+				unsigned long offset, unsigned long size)
+{
+	struct iov_iter *iter = desc->arg.data;
+	unsigned long copied = 0;
+
+	if (size > desc->count)
+		size = desc->count;
+
+	copied = iov_iter_copy_to_user(page, iter, offset, size);
+	if (copied < size)
+		desc->error = -EFAULT;
+
+	iov_iter_advance(iter, copied);
+	desc->count -= copied;
+	desc->written += copied;
+
+	return copied;
+}
+
 /**
- * generic_file_aio_read - generic filesystem read routine
+ * generic_file_read_iter - generic filesystem read routine
  * @iocb:	kernel I/O control block
- * @iov:	io vector request
- * @nr_segs:	number of segments in the iovec
+ * @iter:	iov_iter holding the destination buffers
  * @pos:	current file position
- *
- * This is the "read()" routine for all filesystems
- * that can use the page cache directly.
  */
 ssize_t
-generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos)
+generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter, loff_t pos)
 {
 	struct file *filp = iocb->ki_filp;
-	ssize_t retval;
-	unsigned long seg = 0;
-	size_t count;
+	read_descriptor_t desc;
+	ssize_t retval = 0;
+	size_t count = iov_iter_count(iter);
 	loff_t *ppos = &iocb->ki_pos;
 
-	count = 0;
-	retval = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
-	if (retval)
-		return retval;
-
 	if (io_is_direct(filp)) {
 		loff_t size;
 		struct address_space *mapping;
@@ -2060,30 +1963,28 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (!count)
 			goto out; /* skip atime */
 		size = i_size_read(inode);
-		if (pos < size) {
-			retval = filemap_write_and_wait_range(mapping, pos,
-					pos + iov_length(iov, nr_segs) - 1);
-			if (!retval) {
-				retval = mapping->a_ops->direct_IO(READ, iocb,
-							iov, pos, nr_segs);
-			}
-			if (retval > 0) {
-				*ppos = pos + retval;
-				count -= retval;
-			}
+		retval = filemap_write_and_wait_range(mapping, pos,
+				pos + count - 1);
+		if (!retval) {
+			retval = mapping_direct_IO(mapping, READ,
+						   iocb, iter, pos);
+		}
+		if (retval > 0) {
+			*ppos = pos + retval;
+			count -= retval;
+		}
 
-			/*
-			 * Btrfs can have a short DIO read if we encounter
-			 * compressed extents, so if there was an error, or if
-			 * we've already read everything we wanted to, or if
-			 * there was a short read because we hit EOF, go ahead
-			 * and return.  Otherwise fallthrough to buffered io for
-			 * the rest of the read.
-			 */
-			if (retval < 0 || !count || *ppos >= size) {
-				file_accessed(filp);
-				goto out;
-			}
+		/*
+		 * Btrfs can have a short DIO read if we encounter
+		 * compressed extents, so if there was an error, or if
+		 * we've already read everything we wanted to, or if
+		 * there was a short read because we hit EOF, go ahead
+		 * and return.  Otherwise fallthrough to buffered io for
+		 * the rest of the read.
+		 */
+		if (retval < 0 || !count || *ppos >= size) {
+			file_accessed(filp);
+			goto out;
 		}
 
 		/*
@@ -2096,42 +1997,49 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		}
 	}
 
-	count = retval;
-	for (seg = 0; seg < nr_segs; seg++) {
-		read_descriptor_t desc;
-		loff_t offset = 0;
+	iov_iter_advance(iter, retval);
 
-		/*
-		 * If we did a short DIO read we need to skip the section of the
-		 * iov that we've already read data into.
-		 */
-		if (count) {
-			if (count > iov[seg].iov_len) {
-				count -= iov[seg].iov_len;
-				continue;
-			}
-			offset = count;
-			count = 0;
-		}
+	desc.written = 0;
+	desc.arg.data = iter;
+	desc.count = count;
+	desc.error = 0;
+	do_generic_file_read(filp, ppos, &desc, file_read_iter_actor);
 
-		desc.written = 0;
-		desc.arg.buf = iov[seg].iov_base + offset;
-		desc.count = iov[seg].iov_len - offset;
-		if (desc.count == 0)
-			continue;
-		desc.error = 0;
-		do_generic_file_read(filp, ppos, &desc, file_read_actor);
-		retval += desc.written;
-		if (desc.error) {
-			retval = retval ?: desc.error;
-			break;
-		}
-		if (desc.count > 0)
-			break;
-	}
+	retval += desc.written;
+	if (desc.error && !retval)
+		retval = desc.error;
 out:
 	return retval;
 }
+EXPORT_SYMBOL(generic_file_read_iter);
+
+/**
+ * generic_file_aio_read - generic filesystem read routine
+ * @iocb:	kernel I/O control block
+ * @iov:	io vector request
+ * @nr_segs:	number of segments in the iovec
+ * @pos:	current file position
+ *
+ * This is the "read()" routine for all filesystems
+ * that can use the page cache directly.
+ */
+ssize_t
+generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos)
+{
+	struct iov_iter iter;
+	int ret;
+	size_t count;
+
+	count = 0;
+	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
+	if (ret)
+		return ret;
+
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+	return generic_file_read_iter(iocb, &iter, pos);
+}
 EXPORT_SYMBOL(generic_file_aio_read);
 
 #ifdef CONFIG_MMU
@@ -2149,6 +2057,8 @@ static int page_cache_read(struct file *file, pgoff_t offset)
 	struct page *page; 
 	int ret;
 
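+	/* notify virtinfo listeners that I/O is about to be issued */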
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	do {
 		page = page_cache_alloc_cold(mapping);
 		if (!page)
@@ -2266,12 +2176,24 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 	 * Do we have something in the page cache already?
 	 */
 	page = find_get_page(mapping, offset);
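+	/* nothing cached locally; try the peer file's page cache */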
+	if (!page && mapping->i_peer_file) {
+		page = pick_peer_page(mapping, offset, ra, ra->ra_pages);
+		if (page) {
+			vmf->page = page;
+			return 0; /* unlocked page */
+		}
+	}
 	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
 		/*
 		 * We found the page, so try async readahead before
 		 * waiting for the lock.
 		 */
 		do_async_mmap_readahead(vma, ra, file, page, offset);
+
+		if (unlikely(!PageUptodate(page)))
+			virtinfo_notifier_call(VITYPE_IO,
+					VIRTINFO_IO_PREPARE, NULL);
+
 	} else if (!page) {
 		/* No page in the page cache at all */
 		do_sync_mmap_readahead(vma, ra, file, offset);
@@ -2397,7 +2319,6 @@ EXPORT_SYMBOL(filemap_page_mkwrite);
 const struct vm_operations_struct generic_file_vm_ops = {
 	.fault		= filemap_fault,
 	.page_mkwrite	= filemap_page_mkwrite,
-	.remap_pages	= generic_file_remap_pages,
 };
 
 /* This is used for a general mmap of a disk file */
@@ -2582,150 +2503,6 @@ struct page *read_cache_page(struct address_space *mapping,
 }
 EXPORT_SYMBOL(read_cache_page);
 
-static size_t __iovec_copy_from_user_inatomic(char *vaddr,
-			const struct iovec *iov, size_t base, size_t bytes)
-{
-	size_t copied = 0, left = 0;
-
-	while (bytes) {
-		char __user *buf = iov->iov_base + base;
-		int copy = min(bytes, iov->iov_len - base);
-
-		base = 0;
-		left = __copy_from_user_inatomic(vaddr, buf, copy);
-		copied += copy;
-		bytes -= copy;
-		vaddr += copy;
-		iov++;
-
-		if (unlikely(left))
-			break;
-	}
-	return copied - left;
-}
-
-/*
- * Copy as much as we can into the page and return the number of bytes which
- * were successfully copied.  If a fault is encountered then return the number of
- * bytes which were copied.
- */
-size_t iov_iter_copy_from_user_atomic(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	BUG_ON(!in_atomic());
-	kaddr = kmap_atomic(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap_atomic(kaddr);
-
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user_atomic);
-
-/*
- * This has the same sideeffects and return value as
- * iov_iter_copy_from_user_atomic().
- * The difference is that it attempts to resolve faults.
- * Page must not be locked.
- */
-size_t iov_iter_copy_from_user(struct page *page,
-		struct iov_iter *i, unsigned long offset, size_t bytes)
-{
-	char *kaddr;
-	size_t copied;
-
-	kaddr = kmap(page);
-	if (likely(i->nr_segs == 1)) {
-		int left;
-		char __user *buf = i->iov->iov_base + i->iov_offset;
-		left = __copy_from_user(kaddr + offset, buf, bytes);
-		copied = bytes - left;
-	} else {
-		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
-						i->iov, i->iov_offset, bytes);
-	}
-	kunmap(page);
-	return copied;
-}
-EXPORT_SYMBOL(iov_iter_copy_from_user);
-
-void iov_iter_advance(struct iov_iter *i, size_t bytes)
-{
-	BUG_ON(i->count < bytes);
-
-	if (likely(i->nr_segs == 1)) {
-		i->iov_offset += bytes;
-		i->count -= bytes;
-	} else {
-		const struct iovec *iov = i->iov;
-		size_t base = i->iov_offset;
-		unsigned long nr_segs = i->nr_segs;
-
-		/*
-		 * The !iov->iov_len check ensures we skip over unlikely
-		 * zero-length segments (without overruning the iovec).
-		 */
-		while (bytes || unlikely(i->count && !iov->iov_len)) {
-			int copy;
-
-			copy = min(bytes, iov->iov_len - base);
-			BUG_ON(!i->count || i->count < copy);
-			i->count -= copy;
-			bytes -= copy;
-			base += copy;
-			if (iov->iov_len == base) {
-				iov++;
-				nr_segs--;
-				base = 0;
-			}
-		}
-		i->iov = iov;
-		i->iov_offset = base;
-		i->nr_segs = nr_segs;
-	}
-}
-EXPORT_SYMBOL(iov_iter_advance);
-
-/*
- * Fault in the first iovec of the given iov_iter, to a maximum length
- * of bytes. Returns 0 on success, or non-zero if the memory could not be
- * accessed (ie. because it is an invalid address).
- *
- * writev-intensive code may want this to prefault several iovecs -- that
- * would be possible (callers must not rely on the fact that _only_ the
- * first iovec will be faulted with the current implementation).
- */
-int iov_iter_fault_in_readable(struct iov_iter *i, size_t bytes)
-{
-	char __user *buf = i->iov->iov_base + i->iov_offset;
-	bytes = min(bytes, i->iov->iov_len - i->iov_offset);
-	return fault_in_pages_readable(buf, bytes);
-}
-EXPORT_SYMBOL(iov_iter_fault_in_readable);
-
-/*
- * Return the count of just the current iov_iter segment.
- */
-size_t iov_iter_single_seg_count(const struct iov_iter *i)
-{
-	const struct iovec *iov = i->iov;
-	if (i->nr_segs == 1)
-		return i->count;
-	else
-		return min(i->count, iov->iov_len - i->iov_offset);
-}
-EXPORT_SYMBOL(iov_iter_single_seg_count);
-
 /*
  * Performs necessary checks before doing a write
  *
@@ -2831,9 +2608,8 @@ int pagecache_write_end(struct file *file, struct address_space *mapping,
 EXPORT_SYMBOL(pagecache_write_end);
 
 ssize_t
-generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, size_t ocount)
+generic_file_direct_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+		loff_t pos, loff_t *ppos, size_t count)
 {
 	struct file	*file = iocb->ki_filp;
 	struct address_space *mapping = file->f_mapping;
@@ -2842,10 +2618,13 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	size_t		write_len;
 	pgoff_t		end;
 
-	if (count != ocount)
-		*nr_segs = iov_shorten((struct iovec *)iov, *nr_segs, count);
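+	/* the caller may ask to write less than the iter holds; trim it */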
+	if (count != iov_iter_count(iter)) {
+		written = iov_iter_shorten(iter, count);
+		if (written)
+			goto out;
+	}
 
-	write_len = iov_length(iov, *nr_segs);
+	write_len = count;
 	end = (pos + write_len - 1) >> PAGE_CACHE_SHIFT;
 
 	written = filemap_write_and_wait_range(mapping, pos, pos + write_len - 1);
@@ -2858,21 +2637,19 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * about to write.  We do this *before* the write so that we can return
 	 * without clobbering -EIOCBQUEUED from ->direct_IO().
 	 */
-	if (mapping->nrpages) {
-		written = invalidate_inode_pages2_range(mapping,
-					pos >> PAGE_CACHE_SHIFT, end);
-		/*
-		 * If a page can not be invalidated, return 0 to fall back
-		 * to buffered write.
-		 */
-		if (written) {
-			if (written == -EBUSY)
-				return 0;
-			goto out;
-		}
+	written = invalidate_inode_pages2_range(mapping,
+						pos >> PAGE_CACHE_SHIFT, end);
+	/*
+	 * If a page can not be invalidated, return 0 to fall back
+	 * to buffered write.
+	 */
+	if (written) {
+		if (written == -EBUSY)
+			return 0;
+		goto out;
 	}
 
-	written = mapping->a_ops->direct_IO(WRITE, iocb, iov, pos, *nr_segs);
+	written = mapping_direct_IO(mapping, WRITE, iocb, iter, pos);
 
 	/*
 	 * Finally, try again to invalidate clean pages which might have been
@@ -2882,10 +2659,8 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * so we don't support it 100%.  If this invalidation
 	 * fails, tough, the write still worked...
 	 */
-	if (mapping->nrpages) {
-		invalidate_inode_pages2_range(mapping,
-					      pos >> PAGE_CACHE_SHIFT, end);
-	}
+	invalidate_inode_pages2_range(mapping,
+				pos >> PAGE_CACHE_SHIFT, end);
 
 	if (written > 0) {
 		pos += written;
@@ -2898,6 +2673,23 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 out:
 	return written;
 }
+EXPORT_SYMBOL(generic_file_direct_write_iter);
+
+ssize_t
+generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long *nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, size_t ocount)
+{
+	struct iov_iter iter;
+	ssize_t ret;
+
+	iov_iter_init(&iter, iov, *nr_segs, ocount, 0);
+	ret = generic_file_direct_write_iter(iocb, &iter, pos, ppos, count);
+	/* generic_file_direct_write_iter() might have shortened the vec */
+	if (*nr_segs != iter.nr_segs)
+		*nr_segs = iter.nr_segs;
+	return ret;
+}
 EXPORT_SYMBOL(generic_file_direct_write);
 
 /*
@@ -3031,16 +2823,15 @@ again:
 }
 
 ssize_t
-generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
-		unsigned long nr_segs, loff_t pos, loff_t *ppos,
-		size_t count, ssize_t written)
+generic_file_buffered_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+		loff_t pos, loff_t *ppos, ssize_t written)
 {
 	struct file *file = iocb->ki_filp;
 	ssize_t status;
-	struct iov_iter i;
 
-	iov_iter_init(&i, iov, nr_segs, count, written);
-	status = generic_perform_write(file, &i, pos);
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
+	status = generic_perform_write(file, iter, pos);
 
 	if (likely(status >= 0)) {
 		written += status;
@@ -3049,13 +2840,24 @@ generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
 	
 	return written ? written : status;
 }
+EXPORT_SYMBOL(generic_file_buffered_write_iter);
+
+ssize_t
+generic_file_buffered_write(struct kiocb *iocb, const struct iovec *iov,
+		unsigned long nr_segs, loff_t pos, loff_t *ppos,
+		size_t count, ssize_t written)
+{
+	struct iov_iter iter;
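+
+	/* the final argument starts the iterator past bytes already written */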
+	iov_iter_init(&iter, iov, nr_segs, count, written);
+	return generic_file_buffered_write_iter(iocb, &iter, pos, ppos,
+						written);
+}
 EXPORT_SYMBOL(generic_file_buffered_write);
 
 /**
  * __generic_file_aio_write - write data to a file
  * @iocb:	IO state structure (file, offset, etc.)
- * @iov:	vector with data to write
- * @nr_segs:	number of segments in the vector
+ * @iter:	iov_iter specifying memory to write
  * @ppos:	position where to write
  *
  * This function does all the work needed for actually writing data to a
@@ -3070,24 +2872,18 @@ EXPORT_SYMBOL(generic_file_buffered_write);
  * A caller has to handle it. This is mainly due to the fact that we want to
  * avoid syncing under i_mutex.
  */
-ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
-				 unsigned long nr_segs, loff_t *ppos)
+ssize_t __generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+				  loff_t *ppos)
 {
 	struct file *file = iocb->ki_filp;
 	struct address_space * mapping = file->f_mapping;
-	size_t ocount;		/* original count */
 	size_t count;		/* after file limit checks */
 	struct inode 	*inode = mapping->host;
 	loff_t		pos;
 	ssize_t		written;
 	ssize_t		err;
 
-	ocount = 0;
-	err = generic_segment_checks(iov, &nr_segs, &ocount, VERIFY_READ);
-	if (err)
-		return err;
-
-	count = ocount;
+	count = iov_iter_count(iter);
 	pos = *ppos;
 
 	/* We can write back this queue in page reclaim */
@@ -3113,8 +2909,8 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 		loff_t endbyte;
 		ssize_t written_buffered;
 
-		written = generic_file_direct_write(iocb, iov, &nr_segs, pos,
-							ppos, count, ocount);
+		written = generic_file_direct_write_iter(iocb, iter, pos,
+							 ppos, count);
 		/*
 		 * If the write stopped short of completing, fall back to
 		 * buffered writes.  Some filesystems do this for writes to
@@ -3127,9 +2923,9 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 
 		pos += written;
 		count -= written;
-		written_buffered = generic_file_buffered_write(iocb, iov,
-						nr_segs, pos, ppos, count,
-						written);
+		iov_iter_advance(iter, written);
+		written_buffered = generic_file_buffered_write_iter(iocb, iter,
+						pos, ppos, written);
 		/*
 		 * If generic_file_buffered_write() retuned a synchronous error
 		 * then we want to return the number of bytes which were
@@ -3161,13 +2957,57 @@ ssize_t __generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
 			 */
 		}
 	} else {
-		written = generic_file_buffered_write(iocb, iov, nr_segs,
-				pos, ppos, count, written);
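+		/* file-limit checks may have reduced count; resync the iter */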
+		iter->count = count;
+		written = generic_file_buffered_write_iter(iocb, iter,
+				pos, ppos, written);
 	}
 out:
 	current->backing_dev_info = NULL;
 	return written ? written : err;
 }
+EXPORT_SYMBOL(__generic_file_write_iter);
+
+ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *iter,
+			        loff_t pos)
+{
+	struct file *file = iocb->ki_filp;
+	struct inode *inode = file->f_mapping->host;
+	ssize_t ret;
+
+	mutex_lock(&inode->i_mutex);
+	ret = __generic_file_write_iter(iocb, iter, &iocb->ki_pos);
+	mutex_unlock(&inode->i_mutex);
+
+	if (ret > 0 || ret == -EIOCBQUEUED) {
+		ssize_t err;
+
+		err = generic_write_sync(file, pos, ret);
+		if (err < 0 && ret > 0)
+			ret = err;
+	}
+	return ret;
+}
+EXPORT_SYMBOL(generic_file_write_iter);
+
+ssize_t
+__generic_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
+			 unsigned long nr_segs, loff_t *ppos)
+{
+	struct iov_iter iter;
+	size_t count;
+	int ret;
+
+	count = 0;
+	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
+	if (ret)
+		goto out;
+
+	iov_iter_init(&iter, iov, nr_segs, count, 0);
+
+	ret = __generic_file_write_iter(iocb, &iter, ppos);
+out:
+	return ret;
+}
 EXPORT_SYMBOL(__generic_file_aio_write);
 
 /**
--- /dev/null
+++ b/mm/filemap_xip.c
@@ -0,0 +1,483 @@
+/*
+ *	linux/mm/filemap_xip.c
+ *
+ * Copyright (C) 2005 IBM Corporation
+ * Author: Carsten Otte <cotte@de.ibm.com>
+ *
+ * derived from linux/mm/filemap.c - Copyright (C) Linus Torvalds
+ *
+ */
+
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/export.h>
+#include <linux/uio.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/sched.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/gfp.h>
+#include <asm/tlbflush.h>
+#include <asm/io.h>
+#include <bc/vmpages.h>
+
+/*
+ * We do use our own empty page to avoid interference with other users
+ * of ZERO_PAGE(), such as /dev/zero
+ */
+static DEFINE_MUTEX(xip_sparse_mutex);
+static seqcount_t xip_sparse_seq = SEQCNT_ZERO;
+static struct page *__xip_sparse_page;
+
+/* called under xip_sparse_mutex */
+static struct page *xip_sparse_page(void)
+{
+	if (!__xip_sparse_page) {
+		struct page *page = alloc_page(GFP_HIGHUSER | __GFP_ZERO);
+
+		if (page)
+			__xip_sparse_page = page;
+	}
+	return __xip_sparse_page;
+}
+
+/*
+ * This is a file read routine for execute in place files, and uses
+ * the mapping->a_ops->get_xip_mem() function for the actual low-level
+ * stuff.
+ *
+ * Note the struct file* is not used at all.  It may be NULL.
+ */
+static ssize_t
+do_xip_mapping_read(struct address_space *mapping,
+		    struct file_ra_state *_ra,
+		    struct file *filp,
+		    char __user *buf,
+		    size_t len,
+		    loff_t *ppos)
+{
+	struct inode *inode = mapping->host;
+	pgoff_t index, end_index;
+	unsigned long offset;
+	loff_t isize, pos;
+	size_t copied = 0, error = 0;
+
+	BUG_ON(!mapping->a_ops->get_xip_mem);
+
+	pos = *ppos;
+	index = pos >> PAGE_CACHE_SHIFT;
+	offset = pos & ~PAGE_CACHE_MASK;
+
+	isize = i_size_read(inode);
+	if (!isize)
+		goto out;
+
+	end_index = (isize - 1) >> PAGE_CACHE_SHIFT;
+	do {
+		unsigned long nr, left;
+		void *xip_mem;
+		unsigned long xip_pfn;
+		int zero = 0;
+
+		/* nr is the maximum number of bytes to copy from this page */
+		nr = PAGE_CACHE_SIZE;
+		if (index >= end_index) {
+			if (index > end_index)
+				goto out;
+			nr = ((isize - 1) & ~PAGE_CACHE_MASK) + 1;
+			if (nr <= offset)
+				goto out;
+		}
+		nr = nr - offset;
+		if (nr > len - copied)
+			nr = len - copied;
+
+		error = mapping->a_ops->get_xip_mem(mapping, index, 0,
+							&xip_mem, &xip_pfn);
+		if (unlikely(error)) {
+			if (error == -ENODATA) {
+				/* sparse */
+				zero = 1;
+			} else
+				goto out;
+		}
+
+		/* If users can be writing to this page using arbitrary
+		 * virtual addresses, take care about potential aliasing
+		 * before reading the page on the kernel side.
+		 */
+		if (mapping_writably_mapped(mapping))
+			/* address based flush */ ;
+
+		/*
+		 * Ok, we have the mem, so now we can copy it to user space...
+		 *
+		 * The actor routine returns how many bytes were actually used..
+		 * NOTE! This may not be the same as how much of a user buffer
+		 * we filled up (we may be padding etc), so we can only update
+		 * "pos" here (the actor routine has to update the user buffer
+		 * pointers and the remaining count).
+		 */
+		if (!zero)
+			left = __copy_to_user(buf+copied, xip_mem+offset, nr);
+		else
+			left = __clear_user(buf + copied, nr);
+
+		if (left) {
+			error = -EFAULT;
+			goto out;
+		}
+
+		copied += (nr - left);
+		offset += (nr - left);
+		index += offset >> PAGE_CACHE_SHIFT;
+		offset &= ~PAGE_CACHE_MASK;
+	} while (copied < len);
+
+out:
+	*ppos = pos + copied;
+	if (filp)
+		file_accessed(filp);
+
+	return (copied ? copied : error);
+}
+
+ssize_t
+xip_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos)
+{
+	if (!access_ok(VERIFY_WRITE, buf, len))
+		return -EFAULT;
+
+	return do_xip_mapping_read(filp->f_mapping, &filp->f_ra, filp,
+			    buf, len, ppos);
+}
+EXPORT_SYMBOL_GPL(xip_file_read);
+
+/*
+ * __xip_unmap is invoked from xip_unmap and
+ * xip_write
+ *
+ * This function walks all vmas of the address_space and unmaps the
+ * __xip_sparse_page when found at pgoff.
+ */
+static void
+__xip_unmap(struct address_space *mapping, unsigned long pgoff)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	unsigned long address;
+	pte_t *pte;
+	pte_t pteval;
+	spinlock_t *ptl;
+	struct page *page;
+	unsigned count;
+	int locked = 0;
+
+	count = read_seqcount_begin(&xip_sparse_seq);
+
+	page = __xip_sparse_page;
+	if (!page)
+		return;
+
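+	/*
+	 * An unlocked walk may race with a fault installing the sparse
+	 * page: if xip_sparse_seq changed, walk again while holding
+	 * xip_sparse_mutex to serialize against xip_file_fault().
+	 */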
+retry:
+	mutex_lock(&mapping->i_mmap_mutex);
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
+		mm = vma->vm_mm;
+		address = vma->vm_start +
+			((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
+		BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+		pte = page_check_address(page, mm, address, &ptl, 1);
+		if (pte) {
+			/* Nuke the page table entry. */
+			flush_cache_page(vma, address, pte_pfn(*pte));
+			pteval = ptep_clear_flush(vma, address, pte);
+			page_remove_rmap(page);
+			dec_mm_counter(mm, MM_FILEPAGES);
+			BUG_ON(pte_dirty(pteval));
+			pte_unmap_unlock(pte, ptl);
+			/* must invalidate_page _before_ freeing the page */
+			mmu_notifier_invalidate_page(mm, address);
+			page_cache_release(page);
+		}
+	}
+	mutex_unlock(&mapping->i_mmap_mutex);
+
+	if (locked) {
+		mutex_unlock(&xip_sparse_mutex);
+	} else if (read_seqcount_retry(&xip_sparse_seq, count)) {
+		mutex_lock(&xip_sparse_mutex);
+		locked = 1;
+		goto retry;
+	}
+}
+
+/*
+ * xip_fault() is invoked via the vma operations vector for a
+ * mapped memory region to read in file data during a page fault.
+ *
+ * This function is derived from filemap_fault, but used for execute in place
+ */
+static int xip_file_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct file *file = vma->vm_file;
+	struct address_space *mapping = file->f_mapping;
+	struct inode *inode = mapping->host;
+	pgoff_t size;
+	void *xip_mem;
+	unsigned long xip_pfn;
+	struct page *page;
+	int error;
+
+	/* XXX: are VM_FAULT_ codes OK? */
+again:
+	size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+	if (vmf->pgoff >= size)
+		return VM_FAULT_SIGBUS;
+
+	error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+						&xip_mem, &xip_pfn);
+	if (likely(!error))
+		goto found;
+	if (error != -ENODATA)
+		return VM_FAULT_OOM;
+
+	/* sparse block */
+	if ((vma->vm_flags & (VM_WRITE | VM_MAYWRITE)) &&
+	    (vma->vm_flags & (VM_SHARED | VM_MAYSHARE)) &&
+	    (!(mapping->host->i_sb->s_flags & MS_RDONLY))) {
+		int err;
+
+		/* maybe shared writable, allocate new block */
+		mutex_lock(&xip_sparse_mutex);
+		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 1,
+							&xip_mem, &xip_pfn);
+		mutex_unlock(&xip_sparse_mutex);
+		if (error)
+			return VM_FAULT_SIGBUS;
+		/* unmap sparse mappings at pgoff from all other vmas */
+		__xip_unmap(mapping, vmf->pgoff);
+
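+		/*
+		 * Reached directly when get_xip_mem() succeeded above, or
+		 * after a new block was allocated for a shared writable
+		 * mapping.
+		 */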
+found:
+		err = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address,
+							xip_pfn);
+		if (err == -ENOMEM)
+			return VM_FAULT_OOM;
+		/*
+		 * err == -EBUSY is fine, we've raced against another thread
+		 * that faulted-in the same page
+		 */
+		if (err != -EBUSY)
+			BUG_ON(err);
+		return VM_FAULT_NOPAGE;
+	} else {
+		int err, ret = VM_FAULT_OOM;
+
+		mutex_lock(&xip_sparse_mutex);
+		write_seqcount_begin(&xip_sparse_seq);
+		error = mapping->a_ops->get_xip_mem(mapping, vmf->pgoff, 0,
+							&xip_mem, &xip_pfn);
+		if (unlikely(!error)) {
+			write_seqcount_end(&xip_sparse_seq);
+			mutex_unlock(&xip_sparse_mutex);
+			goto again;
+		}
+		if (error != -ENODATA)
+			goto out;
+		/* not shared and writable, use xip_sparse_page() */
+		page = xip_sparse_page();
+		if (!page)
+			goto out;
+		err = vm_insert_page(vma, (unsigned long)vmf->virtual_address,
+							page);
+		if (err == -ENOMEM)
+			goto out;
+
+		ret = VM_FAULT_NOPAGE;
+out:
+		write_seqcount_end(&xip_sparse_seq);
+		mutex_unlock(&xip_sparse_mutex);
+
+		return ret;
+	}
+}
+
+static const struct vm_operations_struct xip_file_vm_ops = {
+	.fault	= xip_file_fault,
+	.page_mkwrite	= filemap_page_mkwrite,
+};
+
+int xip_file_mmap(struct file *file, struct vm_area_struct *vma)
+{
+	BUG_ON(!file->f_mapping->a_ops->get_xip_mem);
+
+	file_accessed(file);
+	vma->vm_ops = &xip_file_vm_ops;
+	vma->vm_flags |= VM_MIXEDMAP;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xip_file_mmap);
+
+static ssize_t
+__xip_file_write(struct file *filp, const char __user *buf,
+		  size_t count, loff_t pos, loff_t *ppos)
+{
+	struct address_space *mapping = filp->f_mapping;
+	const struct address_space_operations *a_ops = mapping->a_ops;
+	struct inode 	*inode = mapping->host;
+	long		status = 0;
+	size_t		bytes;
+	ssize_t		written = 0;
+
+	BUG_ON(!mapping->a_ops->get_xip_mem);
+
+	do {
+		unsigned long index;
+		unsigned long offset;
+		size_t copied;
+		void *xip_mem;
+		unsigned long xip_pfn;
+
+		offset = (pos & (PAGE_CACHE_SIZE - 1)); /* Within page */
+		index = pos >> PAGE_CACHE_SHIFT;
+		bytes = PAGE_CACHE_SIZE - offset;
+		if (bytes > count)
+			bytes = count;
+
+		status = a_ops->get_xip_mem(mapping, index, 0,
+						&xip_mem, &xip_pfn);
+		if (status == -ENODATA) {
+			/* allocate a new block, then unmap the sparse page */
+			mutex_lock(&xip_sparse_mutex);
+			status = a_ops->get_xip_mem(mapping, index, 1,
+							&xip_mem, &xip_pfn);
+			mutex_unlock(&xip_sparse_mutex);
+			if (!status)
+				/* unmap page at pgoff from all other vmas */
+				__xip_unmap(mapping, index);
+		}
+
+		if (status)
+			break;
+
+		copied = bytes -
+			__copy_from_user_nocache(xip_mem + offset, buf, bytes);
+
+		if (likely(copied > 0)) {
+			status = copied;
+
+			if (status >= 0) {
+				written += status;
+				count -= status;
+				pos += status;
+				buf += status;
+			}
+		}
+		if (unlikely(copied != bytes))
+			if (status >= 0)
+				status = -EFAULT;
+		if (status < 0)
+			break;
+	} while (count);
+	*ppos = pos;
+	/*
+	 * No need to use i_size_read() here, the i_size
+	 * cannot change under us because we hold i_mutex.
+	 */
+	if (pos > inode->i_size) {
+		i_size_write(inode, pos);
+		mark_inode_dirty(inode);
+	}
+
+	return written ? written : status;
+}
+
+ssize_t
+xip_file_write(struct file *filp, const char __user *buf, size_t len,
+	       loff_t *ppos)
+{
+	struct address_space *mapping = filp->f_mapping;
+	struct inode *inode = mapping->host;
+	size_t count;
+	loff_t pos;
+	ssize_t ret;
+
+	mutex_lock(&inode->i_mutex);
+
+	if (!access_ok(VERIFY_READ, buf, len)) {
+		ret = -EFAULT;
+		goto out_up;
+	}
+
+	pos = *ppos;
+	count = len;
+
+	/* We can write back this queue in page reclaim */
+	current->backing_dev_info = mapping->backing_dev_info;
+
+	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(inode->i_mode));
+	if (ret)
+		goto out_backing;
+	if (count == 0)
+		goto out_backing;
+
+	ret = file_remove_suid(filp);
+	if (ret)
+		goto out_backing;
+
+	ret = file_update_time(filp);
+	if (ret)
+		goto out_backing;
+
+	ret = __xip_file_write(filp, buf, count, pos, ppos);
+
+ out_backing:
+	current->backing_dev_info = NULL;
+ out_up:
+	mutex_unlock(&inode->i_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(xip_file_write);
+
+/*
+ * truncate a page used for execute in place
+ * functionality is analogous to block_truncate_page() but uses
+ * get_xip_mem() to get the page instead of the page cache
+ */
+int
+xip_truncate_page(struct address_space *mapping, loff_t from)
+{
+	pgoff_t index = from >> PAGE_CACHE_SHIFT;
+	unsigned offset = from & (PAGE_CACHE_SIZE-1);
+	unsigned blocksize;
+	unsigned length;
+	void *xip_mem;
+	unsigned long xip_pfn;
+	int err;
+
+	BUG_ON(!mapping->a_ops->get_xip_mem);
+
+	blocksize = 1 << mapping->host->i_blkbits;
+	length = offset & (blocksize - 1);
+
+	/* Block boundary? Nothing to do */
+	if (!length)
+		return 0;
+
+	length = blocksize - length;
+
+	err = mapping->a_ops->get_xip_mem(mapping, index, 0,
+						&xip_mem, &xip_pfn);
+	if (unlikely(err)) {
+		if (err == -ENODATA)
+			/* Hole? No need to truncate */
+			return 0;
+		else
+			return err;
+	}
+	memset(xip_mem + offset, 0, length);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(xip_truncate_page);
--- a/mm/fremap.c
+++ /dev/null
@@ -1,263 +0,0 @@
-/*
- *   linux/mm/fremap.c
- * 
- * Explicit pagetable population and nonlinear (random) mappings support.
- *
- * started by Ingo Molnar, Copyright (C) 2002, 2003
- */
-#include <linux/export.h>
-#include <linux/backing-dev.h>
-#include <linux/mm.h>
-#include <linux/swap.h>
-#include <linux/file.h>
-#include <linux/mman.h>
-#include <linux/pagemap.h>
-#include <linux/swapops.h>
-#include <linux/rmap.h>
-#include <linux/syscalls.h>
-#include <linux/mmu_notifier.h>
-
-#include <asm/mmu_context.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
-#include "internal.h"
-
-static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-			unsigned long addr, pte_t *ptep)
-{
-	pte_t pte = *ptep;
-
-	if (pte_present(pte)) {
-		struct page *page;
-
-		flush_cache_page(vma, addr, pte_pfn(pte));
-		pte = ptep_clear_flush_notify(vma, addr, ptep);
-		page = vm_normal_page(vma, addr, pte);
-		if (page) {
-			if (pte_dirty(pte))
-				set_page_dirty(page);
-			page_remove_rmap(page);
-			page_cache_release(page);
-			update_hiwater_rss(mm);
-			dec_mm_counter(mm, mm_counter_file(page));
-		}
-	} else {
-		if (!pte_file(pte))
-			free_swap_and_cache(pte_to_swp_entry(pte));
-		pte_clear_not_present_full(mm, addr, ptep, 0);
-	}
-}
-
-/*
- * Install a file pte to a given virtual memory address, release any
- * previously existing mapping.
- */
-static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long addr, unsigned long pgoff, pgprot_t prot)
-{
-	int err = -ENOMEM;
-	pte_t *pte, ptfile;
-	spinlock_t *ptl;
-
-	pte = get_locked_pte(mm, addr, &ptl);
-	if (!pte)
-		goto out;
-
-	ptfile = pgoff_to_pte(pgoff);
-
-	if (!pte_none(*pte))
-		zap_pte(mm, vma, addr, pte);
-
-	set_pte_at(mm, addr, pte, pte_file_mksoft_dirty(ptfile));
-	/*
-	 * We don't need to run update_mmu_cache() here because the "file pte"
-	 * being installed by install_file_pte() is not a real pte - it's a
-	 * non-present entry (like a swap entry), noting what file offset should
-	 * be mapped there when there's a fault (in a non-linear vma where
-	 * that's not obvious).
-	 */
-	pte_unmap_unlock(pte, ptl);
-	err = 0;
-out:
-	return err;
-}
-
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
-			     unsigned long size, pgoff_t pgoff)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	int err;
-
-	do {
-		err = install_file_pte(mm, vma, addr, pgoff, vma->vm_page_prot);
-		if (err)
-			return err;
-
-		size -= PAGE_SIZE;
-		addr += PAGE_SIZE;
-		pgoff++;
-	} while (size);
-
-	return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
-/**
- * sys_remap_file_pages - remap arbitrary pages of an existing VM_SHARED vma
- * @start: start of the remapped virtual memory range
- * @size: size of the remapped virtual memory range
- * @prot: new protection bits of the range (see NOTE)
- * @pgoff: to-be-mapped page of the backing store file
- * @flags: 0 or MAP_NONBLOCKED - the later will cause no IO.
- *
- * sys_remap_file_pages remaps arbitrary pages of an existing VM_SHARED vma
- * (shared backing store file).
- *
- * This syscall works purely via pagetables, so it's the most efficient
- * way to map the same (large) file into a given virtual window. Unlike
- * mmap()/mremap() it does not create any new vmas. The new mappings are
- * also safe across swapout.
- *
- * NOTE: the @prot parameter right now is ignored (but must be zero),
- * and the vma's default protection is used. Arbitrary protections
- * might be implemented in the future.
- */
-SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
-		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
-{
-	struct mm_struct *mm = current->mm;
-	struct address_space *mapping;
-	struct vm_area_struct *vma;
-	int err = -EINVAL;
-	int has_write_lock = 0;
-	vm_flags_t vm_flags = 0;
-
-	if (prot)
-		return err;
-	/*
-	 * Sanitize the syscall parameters:
-	 */
-	start = start & PAGE_MASK;
-	size = size & PAGE_MASK;
-
-	/* Does the address range wrap, or is the span zero-sized? */
-	if (start + size <= start)
-		return err;
-
-	/* Does pgoff wrap? */
-	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
-		return err;
-
-	/* Can we represent this offset inside this architecture's pte's? */
-#if PTE_FILE_MAX_BITS < BITS_PER_LONG
-	if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
-		return err;
-#endif
-
-	/* We need down_write() to change vma->vm_flags. */
-	down_read(&mm->mmap_sem);
- retry:
-	vma = find_vma(mm, start);
-
-	/*
-	 * Make sure the vma is shared, that it supports prefaulting,
-	 * and that the remapped range is valid and fully within
-	 * the single existing vma.
-	 */
-	if (!vma || !(vma->vm_flags & VM_SHARED))
-		goto out;
-
-	if (!vma->vm_ops || !vma->vm_ops->remap_pages)
-		goto out;
-
-	if (start < vma->vm_start || start + size > vma->vm_end)
-		goto out;
-
-	/* Must set VM_NONLINEAR before any pages are populated. */
-	if (!(vma->vm_flags & VM_NONLINEAR)) {
-		/*
-		 * vm_private_data is used as a swapout cursor
-		 * in a VM_NONLINEAR vma.
-		 */
-		if (vma->vm_private_data)
-			goto out;
-
-		/* Don't need a nonlinear mapping, exit success */
-		if (pgoff == linear_page_index(vma, start)) {
-			err = 0;
-			goto out;
-		}
-
-		if (!has_write_lock) {
-get_write_lock:
-			up_read(&mm->mmap_sem);
-			down_write(&mm->mmap_sem);
-			has_write_lock = 1;
-			goto retry;
-		}
-		mapping = vma->vm_file->f_mapping;
-		/*
-		 * page_mkclean doesn't work on nonlinear vmas, so if
-		 * dirty pages need to be accounted, emulate with linear
-		 * vmas.
-		 */
-		if (mapping_cap_account_dirty(mapping)) {
-			unsigned long addr;
-			struct file *file = get_file(vma->vm_file);
-			/* mmap_region may free vma; grab the info now */
-			vm_flags = vma->vm_flags;
-
-			addr = mmap_region(file, start, size, vm_flags, pgoff);
-			fput(file);
-			if (IS_ERR_VALUE(addr)) {
-				err = addr;
-			} else {
-				BUG_ON(addr != start);
-				err = 0;
-			}
-			goto out_freed;
-		}
-		mutex_lock(&mapping->i_mmap_mutex);
-		flush_dcache_mmap_lock(mapping);
-		vma->vm_flags |= VM_NONLINEAR;
-		vma_interval_tree_remove(vma, &mapping->i_mmap);
-		vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-		flush_dcache_mmap_unlock(mapping);
-		mutex_unlock(&mapping->i_mmap_mutex);
-	}
-
-	if (vma->vm_flags & VM_LOCKED) {
-		/*
-		 * drop PG_Mlocked flag for over-mapped range
-		 */
-		if (!has_write_lock)
-			goto get_write_lock;
-		vm_flags = vma->vm_flags;
-		munlock_vma_pages_range(vma, start, start + size);
-		vma->vm_flags = vm_flags;
-	}
-
-	mmu_notifier_invalidate_range_start(mm, start, start + size);
-	err = vma->vm_ops->remap_pages(vma, start, size, pgoff);
-	mmu_notifier_invalidate_range_end(mm, start, start + size);
-
-	/*
-	 * We can't clear VM_NONLINEAR because we'd have to do
-	 * it after ->populate completes, and that would prevent
-	 * downgrading the lock.  (Locks can't be upgraded).
-	 */
-
-out:
-	if (vma)
-		vm_flags = vma->vm_flags;
-out_freed:
-	if (likely(!has_write_lock))
-		up_read(&mm->mmap_sem);
-	else
-		up_write(&mm->mmap_sem);
-	if (!err && ((vm_flags & VM_LOCKED) || !(flags & MAP_NONBLOCK)))
-		mm_populate(start, size);
-
-	return err;
-}
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -244,8 +244,10 @@ int __frontswap_store(struct page *page)
 		  the (older) page from frontswap
 		 */
 		inc_frontswap_failed_stores();
-		if (dup)
+		if (dup) {
 			__frontswap_clear(sis, offset);
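+			/* the backend may still hold the old copy; drop it too */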
+			frontswap_ops->invalidate_page(type, offset);
+		}
 	}
 	if (frontswap_writethrough_enabled)
 		/* report failure so swap also writes to swap device */
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -158,7 +158,7 @@ split_fallthrough:
 		 */
 		if (likely(!(flags & FOLL_MIGRATION)))
 			goto no_page;
-		if (pte_none(pte) || pte_file(pte))
+		if (pte_none(pte))
 			goto no_page;
 		entry = pte_to_swp_entry(pte);
 		if (!is_migration_entry(entry))
@@ -221,7 +221,7 @@ split_fallthrough:
 	if (flags & FOLL_TOUCH) {
 		if ((flags & FOLL_WRITE) &&
 		    !pte_dirty(pte) && !PageDirty(page))
-			set_page_dirty(page);
+			set_page_dirty_mm(page, mm);
 		/*
 		 * pte_mkyoung() would be more correct here, but atomic care
 		 * is needed to avoid losing the dirty bit: it is easier to use
@@ -276,12 +276,6 @@ no_page_table:
 	return page;
 }
 
-static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
-{
-	return stack_guard_page_start(vma, addr) ||
-	       stack_guard_page_end(vma, addr+PAGE_SIZE);
-}
-
 /**
  * __get_user_pages() - pin user pages in memory
  * @tsk:	task_struct of target task
@@ -449,11 +443,6 @@ long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				int ret;
 				unsigned int fault_flags = 0;
 
-				/* For mlock, just skip the stack guard page. */
-				if (foll_flags & FOLL_MLOCK) {
-					if (stack_guard_page(vma, start))
-						goto next_page;
-				}
 				if (foll_flags & FOLL_WRITE)
 					fault_flags |= FAULT_FLAG_WRITE;
 				if (nonblocking)
--- a/mm/hmm_migrate.c
+++ b/mm/hmm_migrate.c
@@ -255,7 +255,6 @@ static int hmm_migrate_unmap(struct vm_area_struct *vma,
 			pte = swp_entry_to_pte(swap);
 		set_pte_at(vma->vm_mm, addr, ptep, pte);
 		hmm_migrate_get(migrate);
-		BUG_ON(pte_file(*ptep));
 
 	} while (ptep++, gtep++, addr += PAGE_SIZE, addr != end);
 	arch_leave_lazy_mmu_mode();
@@ -286,7 +285,7 @@ static int hmm_migrate_isolate(struct vm_area_struct *vma,
 		pte_t pte = *ptep;
 		swp_entry_t swap;
 
-		if (pte_present(pte) || pte_file(pte))
+		if (pte_present(pte))
 			continue;
 
 		swap = pte_to_swp_entry(pte);
@@ -348,11 +347,10 @@ static int hmm_migrate_check(struct vm_area_struct *vma,
 
 	do {
 		struct page *spage, *dpage;
-		struct mem_cgroup *memcg;
 		pte_t pte = *ptep;
 		swp_entry_t swap;
 
-		if (pte_present(pte) || pte_file(pte))
+		if (pte_present(pte))
 			continue;
 
 		swap = pte_to_swp_entry(pte);
@@ -404,8 +402,6 @@ static int hmm_migrate_check(struct vm_area_struct *vma,
 		if (PageSwapBacked(spage))
 			SetPageSwapBacked(dpage);
 
-		mem_cgroup_prepare_migration(spage, dpage, &memcg);
-		dpage->mapping = (void *)memcg;
 		*gtep = hmm_entry_set_memcg(*gtep);
 	} while (ptep++, gtep++, addr += PAGE_SIZE, addr != end);
 
@@ -484,7 +480,7 @@ static int hmm_migrate_finalize(struct vm_area_struct *vma,
 		pte_t pte = *ptep;
 		swp_entry_t swap;
 
-		if (pte_present(pte) || pte_file(pte))
+		if (pte_present(pte))
 			continue;
 
 		swap = pte_to_swp_entry(pte);
@@ -521,7 +517,7 @@ static int hmm_migrate_cleanup(struct vm_area_struct *vma,
 		pte_t pte = *ptep;
 		swp_entry_t swap;
 
-		if (pte_present(pte) || pte_file(pte))
+		if (pte_present(pte))
 			continue;
 
 		swap = pte_to_swp_entry(pte);
@@ -531,15 +527,6 @@ static int hmm_migrate_cleanup(struct vm_area_struct *vma,
 		VM_BUG_ON(!spage);
 
 		dpage = hmm_entry_to_page(*gtep);
-		if (hmm_entry_is_memcg(*gtep)) {
-			struct mem_cgroup *memcg;
-
-			memcg = (struct mem_cgroup *)dpage->mapping;
-			dpage->mapping = NULL;
-			mem_cgroup_end_migration(memcg, spage, dpage,
-					         hmm_entry_is_migrate(*gtep));
-		}
-
 		if (hmm_entry_is_migrate(*gtep)) {
 			VM_BUG_ON(!spage);
 
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -24,6 +24,7 @@
 #include <linux/migrate.h>
 #include <linux/hashtable.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -59,11 +60,10 @@ static DEFINE_MUTEX(khugepaged_mutex);
 static DEFINE_SPINLOCK(khugepaged_mm_lock);
 static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
 /*
- * default collapse hugepages if there is at least one pte mapped like
- * it would have happened if the vma was large enough during page
- * fault.
+ * default collapse hugepages if at least 1/4th of the ptes are mapped,
+ * to avoid memory footprint growth due to fragmentation
  */
-static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR-1;
+static unsigned int khugepaged_max_ptes_none __read_mostly = HPAGE_PMD_NR * 3 / 4;
 
 static int khugepaged(void *none);
 static int khugepaged_slab_init(void);
@@ -204,24 +204,29 @@ void put_huge_zero_page(void)
 	BUG_ON(atomic_dec_and_test(&huge_zero_refcount));
 }
 
-static int shrink_huge_zero_page(struct shrinker *shrink,
-		struct shrink_control *sc)
+static unsigned long shrink_huge_zero_page_count(struct shrinker *shrink,
+					struct shrink_control *sc)
 {
-	if (!sc->nr_to_scan)
-		/* we can free zero page only if last reference remains */
-		return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+	/* we can free zero page only if last reference remains */
+	return atomic_read(&huge_zero_refcount) == 1 ? HPAGE_PMD_NR : 0;
+}
 
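+/* drop the huge zero page if ours is the last remaining reference */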
+static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
 	if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) {
 		struct page *zero_page = xchg(&huge_zero_page, NULL);
 		BUG_ON(zero_page == NULL);
 		__free_pages(zero_page, compound_order(zero_page));
+		return HPAGE_PMD_NR;
 	}
 
 	return 0;
 }
 
 static struct shrinker huge_zero_page_shrinker = {
-	.shrink = shrink_huge_zero_page,
+	.count_objects = shrink_huge_zero_page_count,
+	.scan_objects = shrink_huge_zero_page_scan,
 	.seeks = DEFAULT_SEEKS,
 };
 
@@ -697,13 +702,14 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 					unsigned long address, pmd_t *pmd,
 					struct page *page, unsigned int flags)
 {
+	struct mem_cgroup *memcg;
 	pgtable_t pgtable;
 	spinlock_t *ptl;
 	unsigned long haddr = address & HPAGE_PMD_MASK;
 
 	VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-	if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+	if (unlikely(mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg))) {
 		put_page(page);
 		count_vm_event(THP_FAULT_FALLBACK);
 		return VM_FAULT_FALLBACK;
@@ -711,7 +717,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
 	pgtable = pte_alloc_one(mm, haddr);
 	if (unlikely(!pgtable)) {
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		put_page(page);
 		return VM_FAULT_OOM;
 	}
@@ -727,7 +733,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 	ptl = pmd_lock(mm, pmd);
 	if (unlikely(!pmd_none(*pmd))) {
 		spin_unlock(ptl);
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		put_page(page);
 		pte_free(mm, pgtable);
 	} else {
@@ -738,7 +744,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 			int ret;
 
 			spin_unlock(ptl);
-			mem_cgroup_uncharge_page(page);
+			mem_cgroup_cancel_charge(page, memcg);
 			put_page(page);
 			pte_free(mm, pgtable);
 			ret = handle_userfault(vma, address, flags,
@@ -750,6 +756,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 		init_trans_huge_mmu_gather_count(page);
 		entry = mk_huge_pmd(page, vma);
 		page_add_new_anon_rmap(page, vma, haddr);
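+		/* the page is mapped; commit the deferred memcg charge */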
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
 		pgtable_trans_huge_deposit(mm, pmd, pgtable);
 		set_pmd_at(mm, haddr, pmd, entry);
 		add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1005,6 +1013,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 		struct vm_area_struct *vma, unsigned long address,
 		pmd_t *pmd, pmd_t orig_pmd, unsigned long haddr)
 {
+	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pgtable_t pgtable;
 	pmd_t _pmd;
@@ -1019,7 +1028,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 		goto out;
 	}
 
-	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) {
+	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
 		put_page(page);
 		ret |= VM_FAULT_OOM;
 		goto out;
@@ -1048,6 +1057,8 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm,
 			entry = mk_pte(page, vma->vm_page_prot);
 			entry = maybe_mkwrite(pte_mkdirty(entry), vma);
 			page_add_new_anon_rmap(page, vma, haddr);
+			mem_cgroup_commit_charge(page, memcg, false);
+			lru_cache_add_active_or_unevictable(page, vma);
 		} else {
 			entry = pfn_pte(my_zero_pfn(haddr), vma->vm_page_prot);
 			entry = pte_mkspecial(entry);
@@ -1071,7 +1082,7 @@ out:
 out_free_page:
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_cancel_charge(page, memcg);
 	put_page(page);
 	goto out;
 }
@@ -1083,6 +1094,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					struct page *page,
 					unsigned long haddr)
 {
+	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pgtable_t pgtable;
 	pmd_t _pmd = {0};
@@ -1103,20 +1115,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 					       __GFP_OTHER_NODE,
 					       vma, address, page_to_nid(page));
 		if (unlikely(!pages[i] ||
-			     mem_cgroup_newpage_charge(pages[i], mm,
-						       GFP_KERNEL))) {
+			     mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL,
+						   &memcg))) {
 			if (pages[i])
 				put_page(pages[i]);
-			mem_cgroup_uncharge_start();
 			while (--i >= 0) {
-				mem_cgroup_uncharge_page(pages[i]);
+				memcg = (void *)page_private(pages[i]);
+				set_page_private(pages[i], 0);
+				mem_cgroup_cancel_charge(pages[i], memcg);
 				put_page(pages[i]);
 			}
-			mem_cgroup_uncharge_end();
 			kfree(pages);
 			ret |= VM_FAULT_OOM;
 			goto out;
 		}
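+		/* stash the memcg in page_private until the page is mapped */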
+		set_page_private(pages[i], (unsigned long)memcg);
 	}
 
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
@@ -1145,7 +1158,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm,
 		pte_t *pte, entry;
 		entry = mk_pte(pages[i], vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+		memcg = (void *)page_private(pages[i]);
+		set_page_private(pages[i], 0);
 		page_add_new_anon_rmap(pages[i], vma, haddr);
+		mem_cgroup_commit_charge(pages[i], memcg, false);
+		lru_cache_add_active_or_unevictable(pages[i], vma);
 		pte = pte_offset_map(&_pmd, haddr);
 		VM_BUG_ON(!pte_none(*pte));
 		set_pte_at(mm, haddr, pte, entry);
@@ -1169,12 +1186,12 @@ out:
 out_free_pages:
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	mem_cgroup_uncharge_start();
 	for (i = 0; i < HPAGE_PMD_NR; i++) {
-		mem_cgroup_uncharge_page(pages[i]);
+		memcg = (void *)page_private(pages[i]);
+		set_page_private(pages[i], 0);
+		mem_cgroup_cancel_charge(pages[i], memcg);
 		put_page(pages[i]);
 	}
-	mem_cgroup_uncharge_end();
 	kfree(pages);
 	goto out;
 }
@@ -1185,6 +1202,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	int ret = 0;
 	struct page *page = NULL, *new_page;
+	struct mem_cgroup *memcg;
 	unsigned long haddr;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
@@ -1236,7 +1254,8 @@ alloc:
 		goto out;
 	}
 
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+	if (unlikely(mem_cgroup_try_charge(new_page, mm,
+					   GFP_TRANSHUGE, &memcg))) {
 		put_page(new_page);
 		if (page) {
 			split_huge_page(page);
@@ -1265,7 +1284,7 @@ alloc:
 		put_page(page);
 	if (unlikely(!pmd_same(*pmd, orig_pmd))) {
 		spin_unlock(ptl);
-		mem_cgroup_uncharge_page(new_page);
+		mem_cgroup_cancel_charge(new_page, memcg);
 		put_page(new_page);
 		goto out_mn;
 	} else {
@@ -1274,6 +1293,8 @@ alloc:
 		entry = mk_huge_pmd(new_page, vma);
 		pmdp_clear_flush_notify(vma, haddr, pmd);
 		page_add_new_anon_rmap(new_page, vma, haddr);
+		mem_cgroup_commit_charge(new_page, memcg, false);
+		lru_cache_add_active_or_unevictable(new_page, vma);
 		set_pmd_at(mm, haddr, pmd, entry);
 		update_mmu_cache_pmd(vma, address, pmd);
 		if (!page) {
@@ -1827,6 +1848,11 @@ static void __split_huge_page_refcount(struct page *page,
 		/* clear PageTail before overwriting first_page */
 		smp_wmb();
 
+		if (page_is_young(page))
+			set_page_young(page_tail);
+		if (page_is_idle(page))
+			set_page_idle(page_tail);
+
 		/*
 		 * __split_huge_page_splitting() already set the
 		 * splitting bit in all pmd that could map this
@@ -2301,7 +2327,8 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
 		VM_BUG_ON_PAGE(PageLRU(page), page);
 
 		/* If there is no mapped pte young don't collapse the page */
-		if (pte_young(pteval) || PageReferenced(page) ||
+		if (pte_young(pteval) ||
+		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
@@ -2503,6 +2530,7 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spinlock_t *pmd_ptl, *pte_ptl;
 	int isolated;
 	unsigned long hstart, hend;
+	struct mem_cgroup *memcg;
 	unsigned long mmun_start;	/* For mmu_notifiers */
 	unsigned long mmun_end;		/* For mmu_notifiers */
 
@@ -2513,7 +2541,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	if (!new_page)
 		return;
 
-	if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+	if (unlikely(mem_cgroup_try_charge(new_page, mm,
+					   GFP_TRANSHUGE, &memcg)))
 		return;
 
 	/*
@@ -2601,6 +2630,8 @@ static void collapse_huge_page(struct mm_struct *mm,
 	spin_lock(pmd_ptl);
 	BUG_ON(!pmd_none(*pmd));
 	page_add_new_anon_rmap(new_page, vma, address);
+	mem_cgroup_commit_charge(new_page, memcg, false);
+	lru_cache_add_active_or_unevictable(new_page, vma);
 	pgtable_trans_huge_deposit(mm, pmd, pgtable);
 	set_pmd_at(mm, address, pmd, _pmd);
 	update_mmu_cache_pmd(vma, address, pmd);
@@ -2614,7 +2645,7 @@ out_up_write:
 	return;
 
 out:
-	mem_cgroup_uncharge_page(new_page);
+	mem_cgroup_cancel_charge(new_page, memcg);
 	goto out_up_write;
 }
 
@@ -2668,7 +2699,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
 		/* cannot use mapcount: can't collapse if there's a gup pin */
 		if (page_count(page) != 1)
 			goto out_unmap;
-		if (pte_young(pteval) || PageReferenced(page) ||
+		if (pte_young(pteval) ||
+		    page_is_young(page) || PageReferenced(page) ||
 		    mmu_notifier_test_young(vma->vm_mm, address))
 			referenced = 1;
 	}
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -4,6 +4,7 @@
 #include <linux/spinlock.h>
 #include <linux/list.h>
 #include <linux/cpumask.h>
+#include <linux/module.h>
 
 #include <linux/atomic.h>
 #include <asm/pgtable.h>
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -86,6 +86,7 @@ extern unsigned long highest_memmap_pfn;
  */
 extern int isolate_lru_page(struct page *page);
 extern void putback_lru_page(struct page *page);
+extern bool zone_reclaimable(struct zone *zone);
 
 /*
  * in mm/rmap.c:
@@ -166,8 +167,13 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
 #ifdef CONFIG_MMU
 extern long __mlock_vma_pages_range(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, int *nonblocking);
-extern void munlock_vma_pages_range(struct vm_area_struct *vma,
-			unsigned long start, unsigned long end);
+extern void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end, int acct);
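+/* common case: munlock the range with accounting enabled (acct == 1) */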
+static inline void munlock_vma_pages_range(struct vm_area_struct *vma,
+			unsigned long start, unsigned long end)
+{
+	__munlock_vma_pages_range(vma, start, end, 1);
+}
 static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
 {
 	munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end);
--- a/mm/interval_tree.c
+++ b/mm/interval_tree.c
@@ -21,8 +21,8 @@ static inline unsigned long vma_last_pgoff(struct vm_area_struct *v)
 	return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1;
 }
 
-INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.linear.rb,
-		     unsigned long, shared.linear.rb_subtree_last,
+INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb,
+		     unsigned long, shared.rb_subtree_last,
 		     vma_start_pgoff, vma_last_pgoff,, vma_interval_tree)
 
 /* Insert node immediately after prev in the interval tree */
@@ -36,26 +36,26 @@ void vma_interval_tree_insert_after(struct vm_area_struct *node,
 
 	VM_BUG_ON(vma_start_pgoff(node) != vma_start_pgoff(prev));
 
-	if (!prev->shared.linear.rb.rb_right) {
+	if (!prev->shared.rb.rb_right) {
 		parent = prev;
-		link = &prev->shared.linear.rb.rb_right;
+		link = &prev->shared.rb.rb_right;
 	} else {
-		parent = rb_entry(prev->shared.linear.rb.rb_right,
-				  struct vm_area_struct, shared.linear.rb);
-		if (parent->shared.linear.rb_subtree_last < last)
-			parent->shared.linear.rb_subtree_last = last;
-		while (parent->shared.linear.rb.rb_left) {
-			parent = rb_entry(parent->shared.linear.rb.rb_left,
-				struct vm_area_struct, shared.linear.rb);
-			if (parent->shared.linear.rb_subtree_last < last)
-				parent->shared.linear.rb_subtree_last = last;
+		parent = rb_entry(prev->shared.rb.rb_right,
+				  struct vm_area_struct, shared.rb);
+		if (parent->shared.rb_subtree_last < last)
+			parent->shared.rb_subtree_last = last;
+		while (parent->shared.rb.rb_left) {
+			parent = rb_entry(parent->shared.rb.rb_left,
+				struct vm_area_struct, shared.rb);
+			if (parent->shared.rb_subtree_last < last)
+				parent->shared.rb_subtree_last = last;
 		}
-		link = &parent->shared.linear.rb.rb_left;
+		link = &parent->shared.rb.rb_left;
 	}
 
-	node->shared.linear.rb_subtree_last = last;
-	rb_link_node(&node->shared.linear.rb, &parent->shared.linear.rb, link);
-	rb_insert_augmented(&node->shared.linear.rb, root,
+	node->shared.rb_subtree_last = last;
+	rb_link_node(&node->shared.rb, &parent->shared.rb, link);
+	rb_insert_augmented(&node->shared.rb, root,
 			    &vma_interval_tree_augment);
 }
 
--- /dev/null
+++ b/mm/iov-iter.c
@@ -0,0 +1,481 @@
+/*
+ *  mm/iov-iter.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/uio.h>
+#include <linux/hardirq.h>
+#include <linux/highmem.h>
+#include <linux/pagemap.h>
+#include <linux/bio.h>
+
+static size_t __iovec_copy_to_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);
+
+		base = 0;
+		left = __copy_to_user_inatomic(buf, vaddr, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left))
+			break;
+	}
+	return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_to_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_to_user_inatomic(buf, kaddr + offset, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr);
+
+	return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_to_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_to_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = copy_to_user(buf, kaddr + offset, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_to_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+
+static size_t __iovec_copy_from_user_inatomic(char *vaddr,
+			const struct iovec *iov, size_t base, size_t bytes)
+{
+	size_t copied = 0, left = 0;
+
+	while (bytes) {
+		char __user *buf = iov->iov_base + base;
+		int copy = min(bytes, iov->iov_len - base);
+
+		base = 0;
+		left = __copy_from_user_inatomic(vaddr, buf, copy);
+		copied += copy;
+		bytes -= copy;
+		vaddr += copy;
+		iov++;
+
+		if (unlikely(left))
+			break;
+	}
+	return copied - left;
+}
+
+/*
+ * Copy as much as we can into the page and return the number of bytes which
+ * were successfully copied.  If a fault is encountered then return the number of
+ * bytes which were copied.
+ */
+static size_t ii_iovec_copy_from_user_atomic(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	BUG_ON(!in_atomic());
+	kaddr = kmap_atomic(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_from_user_inatomic(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap_atomic(kaddr);
+
+	return copied;
+}
+
+/*
+ * This has the same side effects and return value as
+ * ii_iovec_copy_from_user_atomic().
+ * The difference is that it attempts to resolve faults.
+ * Page must not be locked.
+ */
+static size_t ii_iovec_copy_from_user(struct page *page,
+		struct iov_iter *i, unsigned long offset, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char *kaddr;
+	size_t copied;
+
+	kaddr = kmap(page);
+	if (likely(i->nr_segs == 1)) {
+		int left;
+		char __user *buf = iov->iov_base + i->iov_offset;
+		left = __copy_from_user(kaddr + offset, buf, bytes);
+		copied = bytes - left;
+	} else {
+		copied = __iovec_copy_from_user_inatomic(kaddr + offset,
+						iov, i->iov_offset, bytes);
+	}
+	kunmap(page);
+	return copied;
+}
+
+static void ii_iovec_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+
+	if (likely(i->nr_segs == 1)) {
+		i->iov_offset += bytes;
+		i->count -= bytes;
+	} else {
+		struct iovec *iov = (struct iovec *)i->data;
+		size_t base = i->iov_offset;
+		unsigned long nr_segs = i->nr_segs;
+
+		/*
+		 * The !iov->iov_len check ensures we skip over unlikely
+		 * zero-length segments (without overrunning the iovec).
+		 */
+		while (bytes || unlikely(i->count && !iov->iov_len)) {
+			int copy;
+
+			copy = min(bytes, iov->iov_len - base);
+			BUG_ON(!i->count || i->count < copy);
+			i->count -= copy;
+			bytes -= copy;
+			base += copy;
+			if (iov->iov_len == base) {
+				iov++;
+				nr_segs--;
+				base = 0;
+			}
+		}
+		i->data = (unsigned long)iov;
+		i->iov_offset = base;
+		i->nr_segs = nr_segs;
+	}
+}
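+
+/*
+ * Worked example (illustration only): with segments of 4 and 6 bytes and
+ * iov_offset == 0, advancing by 7 consumes all of the first segment
+ * (iov++, nr_segs--) and 3 bytes of the second, leaving iov_offset == 3
+ * and count reduced by 7.
+ */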
+
+/*
+ * Fault in the first iovec of the given iov_iter, to a maximum length
+ * of bytes. Returns 0 on success, or non-zero if the memory could not be
+ * accessed (i.e. because it is an invalid address).
+ *
+ * writev-intensive code may want this to prefault several iovecs -- that
+ * would be possible (callers must not rely on the fact that _only_ the
+ * first iovec will be faulted with the current implementation).
+ */
+static int ii_iovec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	char __user *buf = iov->iov_base + i->iov_offset;
+	bytes = min(bytes, iov->iov_len - i->iov_offset);
+	return fault_in_pages_readable(buf, bytes);
+}
+
+/*
+ * Return the count of just the current iov_iter segment.
+ */
+static size_t ii_iovec_single_seg_count(const struct iov_iter *i)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, iov->iov_len - i->iov_offset);
+}
+
+static int ii_iovec_shorten(struct iov_iter *i, size_t count)
+{
+	struct iovec *iov = (struct iovec *)i->data;
+	i->nr_segs = iov_shorten(iov, i->nr_segs, count);
+	return 0;
+}
+
+struct iov_iter_ops ii_iovec_ops = {
+	.ii_copy_to_user_atomic = ii_iovec_copy_to_user_atomic,
+	.ii_copy_to_user = ii_iovec_copy_to_user,
+	.ii_copy_from_user_atomic = ii_iovec_copy_from_user_atomic,
+	.ii_copy_from_user = ii_iovec_copy_from_user,
+	.ii_advance = ii_iovec_advance,
+	.ii_fault_in_readable = ii_iovec_fault_in_readable,
+	.ii_single_seg_count = ii_iovec_single_seg_count,
+	.ii_shorten = ii_iovec_shorten,
+};
+EXPORT_SYMBOL(ii_iovec_ops);
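+
+/*
+ * Usage sketch (illustration only; the 'ops' member name is an
+ * assumption here -- the other fields appear in the helpers above):
+ *
+ *	struct iov_iter i = {
+ *		.ops		= &ii_iovec_ops,
+ *		.data		= (unsigned long)iov,
+ *		.nr_segs	= nr_segs,
+ *		.iov_offset	= 0,
+ *		.count		= count,
+ *	};
+ *
+ *	copied = i.ops->ii_copy_from_user_atomic(page, &i, offset, bytes);
+ */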
+
+/*
+ * As an easily verifiable first pass, we implement all the methods that
+ * copy data to and from bvec pages with one function.  We implement it
+ * all with kmap_atomic().
+ */
+static size_t bvec_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+				    unsigned long page_offset, size_t bytes,
+				    int topage)
+{
+	struct bio_vec *bvec = (struct bio_vec *)iter->data;
+	size_t bvec_offset = iter->iov_offset;
+	size_t remaining = bytes;
+	void *bvec_map;
+	void *page_map;
+	size_t copy;
+
+	page_map = kmap_atomic(page);
+
+	BUG_ON(bytes > iter->count);
+	while (remaining) {
+		BUG_ON(bvec->bv_len == 0);
+		BUG_ON(bvec_offset >= bvec->bv_len);
+		copy = min(remaining, bvec->bv_len - bvec_offset);
+		bvec_map = kmap_atomic(bvec->bv_page);
+		if (topage)
+			memcpy(page_map + page_offset,
+			       bvec_map + bvec->bv_offset + bvec_offset,
+			       copy);
+		else
+			memcpy(bvec_map + bvec->bv_offset + bvec_offset,
+			       page_map + page_offset,
+			       copy);
+		kunmap_atomic(bvec_map);
+		remaining -= copy;
+		bvec_offset += copy;
+		page_offset += copy;
+		if (bvec_offset == bvec->bv_len) {
+			bvec_offset = 0;
+			bvec++;
+		}
+	}
+
+	kunmap_atomic(page_map);
+
+	return bytes;
+}
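+
+/*
+ * Note on the mappings above: kmap_atomic() slots nest in LIFO order,
+ * so the inner bvec mapping must be (and is) released before the outer
+ * page mapping.
+ */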
+
+size_t ii_bvec_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_to_user(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_bvec_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+				     unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_bvec_copy_from_user(struct page *page, struct iov_iter *i,
+			      unsigned long offset, size_t bytes)
+{
+	return bvec_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+/*
+ * bio_vecs have a stricter structure than iovecs that might have
+ * come from userspace.  There are no zero-length bio_vec elements.
+ */
+void ii_bvec_advance(struct iov_iter *i, size_t bytes)
+{
+	struct bio_vec *bvec = (struct bio_vec *)i->data;
+	size_t offset = i->iov_offset;
+	size_t delta;
+
+	BUG_ON(i->count < bytes);
+	while (bytes) {
+		BUG_ON(bvec->bv_len == 0);
+		BUG_ON(bvec->bv_len <= offset);
+		delta = min(bytes, bvec->bv_len - offset);
+		offset += delta;
+		i->count -= delta;
+		bytes -= delta;
+		if (offset == bvec->bv_len) {
+			bvec++;
+			offset = 0;
+		}
+	}
+
+	i->data = (unsigned long)bvec;
+	i->iov_offset = offset;
+}
+
+/*
+ * pages pointed to by bio_vecs are always pinned.
+ */
+int ii_bvec_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return 0;
+}
+
+size_t ii_bvec_single_seg_count(const struct iov_iter *i)
+{
+	const struct bio_vec *bvec = (struct bio_vec *)i->data;
+	if (i->nr_segs == 1)
+		return i->count;
+	else
+		return min(i->count, bvec->bv_len - i->iov_offset);
+}
+
+static int ii_bvec_shorten(struct iov_iter *i, size_t count)
+{
+	return -EINVAL;
+}
+
+struct iov_iter_ops ii_bvec_ops = {
+	.ii_copy_to_user_atomic = ii_bvec_copy_to_user_atomic,
+	.ii_copy_to_user = ii_bvec_copy_to_user,
+	.ii_copy_from_user_atomic = ii_bvec_copy_from_user_atomic,
+	.ii_copy_from_user = ii_bvec_copy_from_user,
+	.ii_advance = ii_bvec_advance,
+	.ii_fault_in_readable = ii_bvec_fault_in_readable,
+	.ii_single_seg_count = ii_bvec_single_seg_count,
+	.ii_shorten = ii_bvec_shorten,
+};
+EXPORT_SYMBOL(ii_bvec_ops);
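+
+/*
+ * With ii_bvec_ops, code that already holds pinned kernel pages in a
+ * bio_vec array (a loop-style block driver, for instance) can feed them
+ * through the same iov_iter interfaces as user iovecs; since the pages
+ * are pinned, the "user" copies never fault and ii_bvec_shorten() is
+ * simply refused with -EINVAL.
+ */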
+
+/* Functions that operate on a single page */
+
+static size_t page_copy_tofrom_page(struct iov_iter *iter, struct page *page,
+				    unsigned long page_offset, size_t bytes,
+				    int topage)
+{
+	struct page *ipage = (struct page *)iter->data;
+	size_t ipage_offset = iter->iov_offset;
+	void *ipage_map;
+	void *page_map;
+
+	BUG_ON(bytes > iter->count);
+	BUG_ON(bytes > PAGE_SIZE - ipage_offset);
+	BUG_ON(ipage_offset >= PAGE_SIZE);
+
+	page_map = kmap_atomic(page);
+	ipage_map = kmap_atomic(ipage);
+
+	if (topage)
+		memcpy(page_map + page_offset,
+		       ipage_map + ipage_offset,
+		       bytes);
+	else
+		memcpy(ipage_map + ipage_offset,
+		       page_map + page_offset,
+		       bytes);
+
+	kunmap_atomic(ipage_map);
+	kunmap_atomic(page_map);
+
+	return bytes;
+}
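+
+/*
+ * The BUG_ONs above guarantee that a single-page iterator never copies
+ * past the end of its one backing page: bytes is bounded both by the
+ * remaining count and by PAGE_SIZE - ipage_offset.
+ */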
+
+size_t ii_page_copy_to_user_atomic(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_to_user(struct page *page, struct iov_iter *i,
+				   unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 0);
+}
+size_t ii_page_copy_from_user_atomic(struct page *page, struct iov_iter *i,
+				     unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+size_t ii_page_copy_from_user(struct page *page, struct iov_iter *i,
+			      unsigned long offset, size_t bytes)
+{
+	return page_copy_tofrom_page(i, page, offset, bytes, 1);
+}
+
+void ii_page_advance(struct iov_iter *i, size_t bytes)
+{
+	BUG_ON(i->count < bytes);
+	BUG_ON(i->iov_offset >= PAGE_SIZE);
+	BUG_ON(bytes > PAGE_SIZE - i->iov_offset);
+
+	i->iov_offset += bytes;
+	i->count      -= bytes;
+}
+
+/*
+ * The single page backing this iterator is always pinned.
+ */
+int ii_page_fault_in_readable(struct iov_iter *i, size_t bytes)
+{
+	return 0;
+}
+
+size_t ii_page_single_seg_count(const struct iov_iter *i)
+{
+	BUG_ON(i->nr_segs != 1);
+
+	return i->count;
+}
+
+static int ii_page_shorten(struct iov_iter *i, size_t count)
+{
+	return -EINVAL;
+}
+
+struct iov_iter_ops ii_page_ops = {
+	.ii_copy_to_user_atomic = ii_page_copy_to_user_atomic,
+	.ii_copy_to_user = ii_page_copy_to_user,
+	.ii_copy_from_user_atomic = ii_page_copy_from_user_atomic,
+	.ii_copy_from_user = ii_page_copy_from_user,
+	.ii_advance = ii_page_advance,
+	.ii_fault_in_readable = ii_page_fault_in_readable,
+	.ii_single_seg_count = ii_page_single_seg_count,
+	.ii_shorten = ii_page_shorten,
+};
+EXPORT_SYMBOL(ii_page_ops);
--- /dev/null
+++ b/mm/kasan/Makefile
@@ -0,0 +1,10 @@
+KASAN_SANITIZE := n
+
+KCOV_INSTRUMENT := n
+
+CFLAGS_REMOVE_kasan.o = -pg
+# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
+# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
+CFLAGS_kasan.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector)
+
+obj-y := kasan.o report.o
--- /dev/null
+++ b/mm/kasan/kasan.c
@@ -0,0 +1,549 @@
+/*
+ * This file contains shadow memory manipulation code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some of code borrowed from https://github.com/xairy/linux by
+ *        Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define DISABLE_BRANCH_PROFILING
+
+#include <linux/export.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/memblock.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+#include <linux/kasan.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'address'.
+ * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
+ */
+static void kasan_poison_shadow(const void *address, size_t size, u8 value)
+{
+	void *shadow_start, *shadow_end;
+
+	shadow_start = kasan_mem_to_shadow(address);
+	shadow_end = kasan_mem_to_shadow(address + size);
+
+	memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void kasan_unpoison_shadow(const void *address, size_t size)
+{
+	kasan_poison_shadow(address, size, 0);
+
+	if (size & KASAN_SHADOW_MASK) {
+		u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+		*shadow = size & KASAN_SHADOW_MASK;
+	}
+}
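+
+/*
+ * Worked example (illustration only, KASAN_SHADOW_SCALE_SIZE == 8):
+ * unpoisoning 13 bytes at an 8-byte-aligned address zeroes one full
+ * shadow byte for the first 8-byte granule, then writes 13 & 7 == 5
+ * into the next shadow byte, meaning "only the first 5 bytes of this
+ * granule are accessible".
+ */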
+
+/*
+ * All functions below are always inlined so that the compiler can
+ * perform better optimizations in each of __asan_loadX/__asan_storeX
+ * depending on the memory access size X.
+ */
+
+static __always_inline bool memory_is_poisoned_1(unsigned long addr)
+{
+	s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(shadow_value)) {
+		s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+		return unlikely(last_accessible_byte >= shadow_value);
+	}
+
+	return false;
+}
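+
+/*
+ * Worked example (illustration only): if the shadow byte is 5, bytes
+ * 0-4 of the granule are accessible.  For a 1-byte access,
+ * last_accessible_byte == addr & 7, and the access is reported iff
+ * that offset is >= 5.
+ */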
+
+static __always_inline bool memory_is_poisoned_2(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 1))
+			return true;
+
+		if (likely(((addr + 1) & KASAN_SHADOW_MASK) != 0))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_4(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 3))
+			return true;
+
+		if (likely(((addr + 3) & KASAN_SHADOW_MASK) >= 3))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_8(unsigned long addr)
+{
+	u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		if (memory_is_poisoned_1(addr + 7))
+			return true;
+
+		if (likely(((addr + 7) & KASAN_SHADOW_MASK) >= 7))
+			return false;
+
+		return unlikely(*(u8 *)shadow_addr);
+	}
+
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned_16(unsigned long addr)
+{
+	u32 *shadow_addr = (u32 *)kasan_mem_to_shadow((void *)addr);
+
+	if (unlikely(*shadow_addr)) {
+		u16 shadow_first_bytes = *(u16 *)shadow_addr;
+		s8 last_byte = (addr + 15) & KASAN_SHADOW_MASK;
+
+		if (unlikely(shadow_first_bytes))
+			return true;
+
+		if (likely(!last_byte))
+			return false;
+
+		return memory_is_poisoned_1(addr + 15);
+	}
+
+	return false;
+}
+
+static __always_inline unsigned long bytes_is_zero(const u8 *start,
+					size_t size)
+{
+	while (size) {
+		if (unlikely(*start))
+			return (unsigned long)start;
+		start++;
+		size--;
+	}
+
+	return 0;
+}
+
+static __always_inline unsigned long memory_is_zero(const void *start,
+						const void *end)
+{
+	unsigned int words;
+	unsigned long ret;
+	unsigned int prefix = (unsigned long)start % 8;
+
+	if (end - start <= 16)
+		return bytes_is_zero(start, end - start);
+
+	if (prefix) {
+		prefix = 8 - prefix;
+		ret = bytes_is_zero(start, prefix);
+		if (unlikely(ret))
+			return ret;
+		start += prefix;
+	}
+
+	words = (end - start) / 8;
+	while (words) {
+		if (unlikely(*(u64 *)start))
+			return bytes_is_zero(start, 8);
+		start += 8;
+		words--;
+	}
+
+	return bytes_is_zero(start, (end - start) % 8);
+}
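+
+/*
+ * Worked example (illustration only): scanning 27 bytes starting at an
+ * address with start % 8 == 5 checks a 3-byte prefix byte-wise, then
+ * (27 - 3) / 8 == 3 full words, then a 0-byte tail; the address of the
+ * first non-zero byte is returned, or 0 if every byte is zero.
+ */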
+
+static __always_inline bool memory_is_poisoned_n(unsigned long addr,
+						size_t size)
+{
+	unsigned long ret;
+
+	ret = memory_is_zero(kasan_mem_to_shadow((void *)addr),
+			kasan_mem_to_shadow((void *)addr + size - 1) + 1);
+
+	if (unlikely(ret)) {
+		unsigned long last_byte = addr + size - 1;
+		s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
+
+		if (unlikely(ret != (unsigned long)last_shadow ||
+			((last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+			return true;
+	}
+	return false;
+}
+
+static __always_inline bool memory_is_poisoned(unsigned long addr, size_t size)
+{
+	if (__builtin_constant_p(size)) {
+		switch (size) {
+		case 1:
+			return memory_is_poisoned_1(addr);
+		case 2:
+			return memory_is_poisoned_2(addr);
+		case 4:
+			return memory_is_poisoned_4(addr);
+		case 8:
+			return memory_is_poisoned_8(addr);
+		case 16:
+			return memory_is_poisoned_16(addr);
+		default:
+			BUILD_BUG();
+		}
+	}
+
+	return memory_is_poisoned_n(addr, size);
+}
+
+static __always_inline void check_memory_region_inline(unsigned long addr,
+						size_t size, bool write,
+						unsigned long ret_ip)
+{
+	if (unlikely(size == 0))
+		return;
+
+	if (unlikely((void *)addr <
+		kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
+		kasan_report(addr, size, write, ret_ip);
+		return;
+	}
+
+	if (likely(!memory_is_poisoned(addr, size)))
+		return;
+
+	kasan_report(addr, size, write, ret_ip);
+}
+
+static void check_memory_region(unsigned long addr,
+				size_t size, bool write,
+				unsigned long ret_ip)
+{
+	check_memory_region_inline(addr, size, write, ret_ip);
+}
+
+void kasan_check_read(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_read);
+
+void kasan_check_write(const void *p, unsigned int size)
+{
+	check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+	check_memory_region((unsigned long)addr, len, true, _RET_IP_);
+
+	return __memset(addr, c, len);
+}
+
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+	check_memory_region((unsigned long)src, len, false, _RET_IP_);
+	check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+	return __memmove(dest, src, len);
+}
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+	check_memory_region((unsigned long)src, len, false, _RET_IP_);
+	check_memory_region((unsigned long)dest, len, true, _RET_IP_);
+
+	return __memcpy(dest, src, len);
+}
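+
+/*
+ * The interceptors above take over memset/memmove/memcpy so that calls
+ * the compiler emits on its own (as well as explicit callers) get
+ * checked; the actual copying is still done by the arch's __mem*
+ * variants.
+ */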
+
+void kasan_alloc_pages(struct page *page, unsigned int order)
+{
+	if (likely(!PageHighMem(page)))
+		kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+}
+
+void kasan_free_pages(struct page *page, unsigned int order)
+{
+	if (likely(!PageHighMem(page)))
+		kasan_poison_shadow(page_address(page),
+				PAGE_SIZE << order,
+				KASAN_FREE_PAGE);
+}
+
+void kasan_poison_slab(struct page *page)
+{
+	kasan_poison_shadow(page_address(page),
+			PAGE_SIZE << compound_order(page),
+			KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+{
+	kasan_unpoison_shadow(object, cache->object_size);
+}
+
+void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+{
+	kasan_poison_shadow(object,
+			round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
+			KASAN_KMALLOC_REDZONE);
+}
+
+void kasan_slab_alloc(struct kmem_cache *cache, void *object)
+{
+	kasan_kmalloc(cache, object, cache->object_size);
+}
+
+void kasan_slab_free(struct kmem_cache *cache, void *object)
+{
+	unsigned long size = cache->object_size;
+	unsigned long rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+
+	/* RCU slabs could be legally used after free within the RCU period */
+	if (unlikely(cache->flags & SLAB_DESTROY_BY_RCU))
+		return;
+
+	kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+}
+
+void kasan_kmalloc(struct kmem_cache *cache, const void *object, size_t size)
+{
+	unsigned long redzone_start;
+	unsigned long redzone_end;
+
+	if (unlikely(object == NULL))
+		return;
+
+	redzone_start = round_up((unsigned long)(object + size),
+				KASAN_SHADOW_SCALE_SIZE);
+	redzone_end = round_up((unsigned long)object + cache->object_size,
+				KASAN_SHADOW_SCALE_SIZE);
+
+	kasan_unpoison_shadow(object, size);
+	kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+		KASAN_KMALLOC_REDZONE);
+}
+EXPORT_SYMBOL(kasan_kmalloc);
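+
+/*
+ * Worked example (illustration only, KASAN_SHADOW_SCALE_SIZE == 8):
+ * for a cache with object_size == 32 and a kmalloc request of 13 bytes,
+ * redzone_start == object + 16 and redzone_end == object + 32, so
+ * [object, object + 13) is unpoisoned and [object + 16, object + 32)
+ * is marked KASAN_KMALLOC_REDZONE.  Offsets 13-15 are covered by the
+ * partial-granule encoding in kasan_unpoison_shadow().
+ */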
+
+void kasan_kmalloc_large(const void *ptr, size_t size)
+{
+	struct page *page;
+	unsigned long redzone_start;
+	unsigned long redzone_end;
+
+	if (unlikely(ptr == NULL))
+		return;
+
+	page = virt_to_page(ptr);
+	redzone_start = round_up((unsigned long)(ptr + size),
+				KASAN_SHADOW_SCALE_SIZE);
+	redzone_end = (unsigned long)ptr + (PAGE_SIZE << compound_order(page));
+
+	kasan_unpoison_shadow(ptr, size);
+	kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
+		KASAN_PAGE_REDZONE);
+}
+
+void kasan_krealloc(const void *object, size_t size)
+{
+	struct page *page;
+
+	if (unlikely(object == ZERO_SIZE_PTR))
+		return;
+
+	page = virt_to_head_page(object);
+
+	if (unlikely(!PageSlab(page)))
+		kasan_kmalloc_large(object, size);
+	else
+		kasan_kmalloc(page->slab_cache, object, size);
+}
+
+void kasan_kfree(void *ptr)
+{
+	struct page *page;
+
+	page = virt_to_head_page(ptr);
+
+	if (unlikely(!PageSlab(page)))
+		kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+				KASAN_FREE_PAGE);
+	else
+		kasan_slab_free(page->slab_cache, ptr);
+}
+
+void kasan_kfree_large(const void *ptr)
+{
+	struct page *page = virt_to_page(ptr);
+
+	kasan_poison_shadow(ptr, PAGE_SIZE << compound_order(page),
+			KASAN_FREE_PAGE);
+}
+
+int kasan_module_alloc(void *addr, size_t size)
+{
+	void *ret;
+	size_t shadow_size;
+	unsigned long shadow_start;
+
+	shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+	shadow_size = round_up(size >> KASAN_SHADOW_SCALE_SHIFT,
+			PAGE_SIZE);
+
+	if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+		return -EINVAL;
+
+	ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+			shadow_start + shadow_size,
+			GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
+			PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+			__builtin_return_address(0));
+
+	if (ret) {
+		find_vm_area(addr)->flags |= VM_KASAN;
+		kmemleak_ignore(ret);
+		return 0;
+	}
+
+	return -ENOMEM;
+}
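+
+/*
+ * Example (illustration only, KASAN_SHADOW_SCALE_SHIFT == 3): a 1MB
+ * module mapping needs round_up(1MB >> 3, PAGE_SIZE) == 128KB of
+ * shadow, vmalloc'ed at the fixed address that shadows the module's
+ * own address range.
+ */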
+
+void kasan_free_shadow(const struct vm_struct *vm)
+{
+	if (vm->flags & VM_KASAN)
+		vfree(kasan_mem_to_shadow(vm->addr));
+}
+
+static void register_global(struct kasan_global *global)
+{
+	size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+
+	kasan_unpoison_shadow(global->beg, global->size);
+
+	kasan_poison_shadow(global->beg + aligned_size,
+		global->size_with_redzone - aligned_size,
+		KASAN_GLOBAL_REDZONE);
+}
+
+void __asan_register_globals(struct kasan_global *globals, size_t size)
+{
+	int i;
+
+	for (i = 0; i < size; i++)
+		register_global(&globals[i]);
+}
+EXPORT_SYMBOL(__asan_register_globals);
+
+void __asan_unregister_globals(struct kasan_global *globals, size_t size)
+{
+}
+EXPORT_SYMBOL(__asan_unregister_globals);
+
+#define DEFINE_ASAN_LOAD_STORE(size)					\
+	void __asan_load##size(unsigned long addr)			\
+	{								\
+		check_memory_region_inline(addr, size, false, _RET_IP_);\
+	}								\
+	EXPORT_SYMBOL(__asan_load##size);				\
+	__alias(__asan_load##size)					\
+	void __asan_load##size##_noabort(unsigned long);		\
+	EXPORT_SYMBOL(__asan_load##size##_noabort);			\
+	void __asan_store##size(unsigned long addr)			\
+	{								\
+		check_memory_region_inline(addr, size, true, _RET_IP_);	\
+	}								\
+	EXPORT_SYMBOL(__asan_store##size);				\
+	__alias(__asan_store##size)					\
+	void __asan_store##size##_noabort(unsigned long);		\
+	EXPORT_SYMBOL(__asan_store##size##_noabort)
+
+DEFINE_ASAN_LOAD_STORE(1);
+DEFINE_ASAN_LOAD_STORE(2);
+DEFINE_ASAN_LOAD_STORE(4);
+DEFINE_ASAN_LOAD_STORE(8);
+DEFINE_ASAN_LOAD_STORE(16);
+
+void __asan_loadN(unsigned long addr, size_t size)
+{
+	check_memory_region(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_loadN);
+
+__alias(__asan_loadN)
+void __asan_loadN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_loadN_noabort);
+
+void __asan_storeN(unsigned long addr, size_t size)
+{
+	check_memory_region(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_storeN);
+
+__alias(__asan_storeN)
+void __asan_storeN_noabort(unsigned long, size_t);
+EXPORT_SYMBOL(__asan_storeN_noabort);
+
+/* to shut up compiler complaints */
+void __asan_handle_no_return(void) {}
+EXPORT_SYMBOL(__asan_handle_no_return);
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static int kasan_mem_notifier(struct notifier_block *nb,
+			unsigned long action, void *data)
+{
+	return (action == MEM_GOING_ONLINE) ? NOTIFY_BAD : NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+	pr_err("WARNING: KASan doesn't support memory hot-add\n");
+	pr_err("Memory hot-add will be disabled\n");
+
+	hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+	return 0;
+}
+
+module_init(kasan_memhotplug_init);
+#endif
--- /dev/null
+++ b/mm/kasan/kasan.h
@@ -0,0 +1,72 @@
+#ifndef __MM_KASAN_KASAN_H
+#define __MM_KASAN_KASAN_H
+
+#include <linux/kasan.h>
+
+#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#define KASAN_SHADOW_MASK       (KASAN_SHADOW_SCALE_SIZE - 1)
+
+#define KASAN_FREE_PAGE         0xFF  /* page was freed */
+#define KASAN_PAGE_REDZONE      0xFE  /* redzone for kmalloc_large allocations */
+#define KASAN_KMALLOC_REDZONE   0xFC  /* redzone inside slub object */
+#define KASAN_KMALLOC_FREE      0xFB  /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_GLOBAL_REDZONE    0xFA  /* redzone for global variable */
+
+/*
+ * Stack redzone shadow values
+ * (These are part of the compiler's ABI; don't change them.)
+ */
+#define KASAN_STACK_LEFT        0xF1
+#define KASAN_STACK_MID         0xF2
+#define KASAN_STACK_RIGHT       0xF3
+#define KASAN_STACK_PARTIAL     0xF4
+
+/* Don't break randconfig/all*config builds */
+#ifndef KASAN_ABI_VERSION
+#define KASAN_ABI_VERSION 1
+#endif
+
+struct kasan_access_info {
+	const void *access_addr;
+	const void *first_bad_addr;
+	size_t access_size;
+	bool is_write;
+	unsigned long ip;
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_source_location {
+	const char *filename;
+	int line_no;
+	int column_no;
+};
+
+/* The layout of this struct is dictated by the compiler */
+struct kasan_global {
+	const void *beg;		/* Address of the beginning of the global variable. */
+	size_t size;			/* Size of the global variable. */
+	size_t size_with_redzone;	/* Size of the variable + size of the red zone, 32-byte aligned */
+	const void *name;
+	const void *module_name;	/* Name of the module where the global variable is declared. */
+	unsigned long has_dynamic_init;	/* This is needed for C++ */
+#if KASAN_ABI_VERSION >= 4
+	struct kasan_source_location *location;
+#endif
+};
+
+static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
+{
+	return (void *)(((unsigned long)shadow_addr - KASAN_SHADOW_OFFSET)
+		<< KASAN_SHADOW_SCALE_SHIFT);
+}
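+
+/*
+ * This is the inverse of kasan_mem_to_shadow() from <linux/kasan.h>,
+ * which computes:
+ *
+ *	shadow = ((unsigned long)addr >> KASAN_SHADOW_SCALE_SHIFT)
+ *			+ KASAN_SHADOW_OFFSET;
+ *
+ * so one shadow byte describes one KASAN_SHADOW_SCALE_SIZE-byte granule.
+ */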
+
+static inline bool kasan_enabled(void)
+{
+	return !current->kasan_depth;
+}
+
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip);
+
+#endif
--- /dev/null
+++ b/mm/kasan/report.c
@@ -0,0 +1,285 @@
+/*
+ * This file contains error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <a.ryabinin@samsung.com>
+ *
+ * Some of code borrowed from https://github.com/xairy/linux by
+ *        Andrey Konovalov <adech.fo@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+/* Shadow layout customization. */
+#define SHADOW_BYTES_PER_BLOCK 1
+#define SHADOW_BLOCKS_PER_ROW 16
+#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
+#define SHADOW_ROWS_AROUND_ADDR 2
+
+static const void *find_first_bad_addr(const void *addr, size_t size)
+{
+	u8 shadow_val = *(u8 *)kasan_mem_to_shadow(addr);
+	const void *first_bad_addr = addr;
+
+	while (!shadow_val && first_bad_addr < addr + size) {
+		first_bad_addr += KASAN_SHADOW_SCALE_SIZE;
+		shadow_val = *(u8 *)kasan_mem_to_shadow(first_bad_addr);
+	}
+	return first_bad_addr;
+}
+
+static void print_error_description(struct kasan_access_info *info)
+{
+	const char *bug_type = "unknown crash";
+	u8 shadow_val;
+
+	info->first_bad_addr = find_first_bad_addr(info->access_addr,
+						info->access_size);
+
+	shadow_val = *(u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+	switch (shadow_val) {
+	case KASAN_FREE_PAGE:
+	case KASAN_KMALLOC_FREE:
+		bug_type = "use after free";
+		break;
+	case KASAN_PAGE_REDZONE:
+	case KASAN_KMALLOC_REDZONE:
+	case KASAN_GLOBAL_REDZONE:
+	case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
+		bug_type = "out of bounds access";
+		break;
+	case KASAN_STACK_LEFT:
+	case KASAN_STACK_MID:
+	case KASAN_STACK_RIGHT:
+	case KASAN_STACK_PARTIAL:
+		bug_type = "out of bounds on stack";
+		break;
+	}
+
+	pr_err("BUG: KASan: %s in %pS at addr %p\n",
+		bug_type, (void *)info->ip,
+		info->access_addr);
+	pr_err("%s of size %zu by task %s/%d\n",
+		info->is_write ? "Write" : "Read",
+		info->access_size, current->comm, task_pid_nr(current));
+}
+
+static inline bool kernel_or_module_addr(const void *addr)
+{
+	return (addr >= (void *)_stext && addr < (void *)_end)
+		|| (addr >= (void *)MODULES_VADDR
+			&& addr < (void *)MODULES_END);
+}
+
+static inline bool init_task_stack_addr(const void *addr)
+{
+	return addr >= (void *)&init_thread_union.stack &&
+		(addr <= (void *)&init_thread_union.stack +
+			sizeof(init_thread_union.stack));
+}
+
+static void print_address_description(struct kasan_access_info *info)
+{
+	const void *addr = info->access_addr;
+
+	if ((addr >= (void *)PAGE_OFFSET) &&
+		(addr < high_memory)) {
+		struct page *page = virt_to_head_page(addr);
+
+		if (PageSlab(page)) {
+			void *object;
+			struct kmem_cache *cache = page->slab_cache;
+			void *last_object;
+
+			object = virt_to_obj(cache, page_address(page), addr);
+			last_object = page_address(page) +
+				page->objects * cache->size;
+
+			if (unlikely(object > last_object))
+				object = last_object; /* we hit into padding */
+
+			object_err(cache, page, object,
+				"kasan: bad access detected");
+			return;
+		}
+		dump_page(page, "kasan: bad access detected");
+	}
+
+	if (kernel_or_module_addr(addr)) {
+		if (!init_task_stack_addr(addr))
+			pr_err("Address belongs to variable %pS\n", addr);
+	}
+
+	dump_stack();
+}
+
+static bool row_is_guilty(const void *row, const void *guilty)
+{
+	return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+}
+
+static int shadow_pointer_offset(const void *row, const void *shadow)
+{
+	/* The length of ">ff00ff00ff00ff00: " is
+	 *    3 + (BITS_PER_LONG/8)*2 chars.
+	 */
+	return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
+		(shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+}
+
+static void print_shadow_for_address(const void *addr)
+{
+	int i;
+	const void *shadow = kasan_mem_to_shadow(addr);
+	const void *shadow_row;
+
+	shadow_row = (void *)round_down((unsigned long)shadow,
+					SHADOW_BYTES_PER_ROW)
+		- SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+
+	pr_err("Memory state around the buggy address:\n");
+
+	for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
+		const void *kaddr = kasan_shadow_to_mem(shadow_row);
+		char buffer[4 + (BITS_PER_LONG/8)*2];
+
+		snprintf(buffer, sizeof(buffer),
+			(i == 0) ? ">%p: " : " %p: ", kaddr);
+
+		kasan_disable_current();
+		print_hex_dump(KERN_ERR, buffer,
+			DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
+			shadow_row, SHADOW_BYTES_PER_ROW, 0);
+		kasan_enable_current();
+
+		if (row_is_guilty(shadow_row, shadow))
+			pr_err("%*c\n",
+				shadow_pointer_offset(shadow_row, shadow),
+				'^');
+
+		shadow_row += SHADOW_BYTES_PER_ROW;
+	}
+}
+
+static DEFINE_SPINLOCK(report_lock);
+
+static void kasan_report_error(struct kasan_access_info *info)
+{
+	unsigned long flags;
+	const char *bug_type;
+
+	spin_lock_irqsave(&report_lock, flags);
+	pr_err("================================="
+		"=================================\n");
+	if (info->access_addr <
+			kasan_shadow_to_mem((void *)KASAN_SHADOW_START)) {
+		if ((unsigned long)info->access_addr < PAGE_SIZE)
+			bug_type = "null-ptr-deref";
+		else if ((unsigned long)info->access_addr < TASK_SIZE)
+			bug_type = "user-memory-access";
+		else
+			bug_type = "wild-memory-access";
+		pr_err("BUG: KASan: %s on address %p\n",
+			bug_type, info->access_addr);
+		pr_err("%s of size %zu by task %s/%d\n",
+			info->is_write ? "Write" : "Read",
+			info->access_size, current->comm,
+			task_pid_nr(current));
+		dump_stack();
+	} else {
+		print_error_description(info);
+		print_address_description(info);
+		print_shadow_for_address(info->first_bad_addr);
+	}
+	pr_err("================================="
+		"=================================\n");
+	add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE);
+	spin_unlock_irqrestore(&report_lock, flags);
+}
+
+static bool print_till_death;
+static int __init kasan_setup(char *arg)
+{
+	print_till_death = true;
+	return 1;	/* __setup handlers return 1 when the option is consumed */
+}
+__setup("kasan_print_till_death", kasan_setup);
+
+void kasan_report(unsigned long addr, size_t size,
+		bool is_write, unsigned long ip)
+{
+	struct kasan_access_info info;
+	static bool reported;
+
+	if (likely(!kasan_enabled()))
+		return;
+
+	if (likely(!print_till_death)) {
+		if (reported)
+			return;
+		reported = true;
+	}
+	info.access_addr = (void *)addr;
+	info.access_size = size;
+	info.is_write = is_write;
+	info.ip = ip;
+
+	kasan_report_error(&info);
+}
+
+#define DEFINE_ASAN_REPORT_LOAD(size)                     \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{                                                         \
+	kasan_report(addr, size, false, _RET_IP_);	  \
+}                                                         \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size)                     \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{                                                          \
+	kasan_report(addr, size, true, _RET_IP_);	   \
+}                                                          \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+	kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+	kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
--- a/mm/kmemcheck.c
+++ b/mm/kmemcheck.c
@@ -2,6 +2,7 @@
 #include <linux/mm_types.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
+#include "slab.h"
 #include <linux/kmemcheck.h>
 
 void kmemcheck_alloc_shadow(struct page *page, int order, gfp_t flags, int node)
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -98,6 +98,7 @@
 #include <asm/processor.h>
 #include <linux/atomic.h>
 
+#include <linux/kasan.h>
 #include <linux/kmemcheck.h>
 #include <linux/kmemleak.h>
 #include <linux/memory_hotplug.h>
@@ -1077,7 +1078,10 @@ static bool update_checksum(struct kmemleak_object *object)
 	if (!kmemcheck_is_obj_initialized(object->pointer, object->size))
 		return false;
 
+	kasan_disable_current();
 	object->checksum = crc32(0, (void *)object->pointer, object->size);
+	kasan_enable_current();
+
 	return object->checksum != old_csum;
 }
 
@@ -1128,7 +1132,9 @@ static void scan_block(void *_start, void *_end,
 						  BYTES_PER_POINTER))
 			continue;
 
+		kasan_disable_current();
 		pointer = *ptr;
+		kasan_enable_current();
 
 		object = find_and_get_object(pointer, 1);
 		if (!object)
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -126,12 +126,9 @@ struct ksm_scan {
  * struct stable_node - node of the stable rbtree
  * @node: rb node of this ksm page in the stable tree
  * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list
- * @hlist_dup: linked into the stable_node->hlist with a stable_node chain
  * @list: linked into migrate_nodes, pending placement in the proper node tree
  * @hlist: hlist head of rmap_items using this ksm page
  * @kpfn: page frame number of this ksm page (perhaps temporarily on wrong nid)
- * @chain_prune_time: time of the last full garbage collection
- * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN
  * @nid: NUMA node id of stable tree in which linked (may not match kpfn)
  */
 struct stable_node {
@@ -139,24 +136,11 @@ struct stable_node {
 		struct rb_node node;	/* when node of stable tree */
 		struct {		/* when listed for migration */
 			struct list_head *head;
-			struct {
-				struct hlist_node hlist_dup;
-				struct list_head list;
-			};
+			struct list_head list;
 		};
 	};
 	struct hlist_head hlist;
-	union {
-		unsigned long kpfn;
-		unsigned long chain_prune_time;
-	};
-	/*
-	 * STABLE_NODE_CHAIN can be any negative number in
-	 * rmap_hlist_len negative range, but better not -1 to be able
-	 * to reliably detect underflows.
-	 */
-#define STABLE_NODE_CHAIN -1024
-	int rmap_hlist_len;
+	unsigned long kpfn;
 #ifdef CONFIG_NUMA
 	int nid;
 #endif
@@ -206,7 +190,6 @@ static struct rb_root *root_unstable_tree = one_unstable_tree;
 
 /* Recently migrated nodes of stable tree, pending proper placement */
 static LIST_HEAD(migrate_nodes);
-#define STABLE_NODE_DUP_HEAD ((struct list_head *)&migrate_nodes.prev)
 
 #define MM_SLOTS_HASH_BITS 10
 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -234,18 +217,6 @@ static unsigned long ksm_pages_unshared;
 /* The number of rmap_items in use: to calculate pages_volatile */
 static unsigned long ksm_rmap_items;
 
-/* The number of stable_node chains */
-static unsigned long ksm_stable_node_chains;
-
-/* The number of stable_node dups linked to the stable_node chains */
-static unsigned long ksm_stable_node_dups;
-
-/* Delay in pruning stale stable_node_dups in the stable_node_chains */
-static int ksm_stable_node_chains_prune_millisecs = 2000;
-
-/* Maximum number of page slots sharing a stable node */
-static int ksm_max_page_sharing = 256;
-
 /* Number of pages ksmd should scan in one batch */
 static unsigned int ksm_thread_pages_to_scan = 100;
 
@@ -308,44 +279,6 @@ static void __init ksm_slab_free(void)
 	mm_slot_cache = NULL;
 }
 
-static __always_inline bool is_stable_node_chain(struct stable_node *chain)
-{
-	return chain->rmap_hlist_len == STABLE_NODE_CHAIN;
-}
-
-static __always_inline bool is_stable_node_dup(struct stable_node *dup)
-{
-	return dup->head == STABLE_NODE_DUP_HEAD;
-}
-
-static inline void stable_node_chain_add_dup(struct stable_node *dup,
-					     struct stable_node *chain)
-{
-	VM_BUG_ON(is_stable_node_dup(dup));
-	dup->head = STABLE_NODE_DUP_HEAD;
-	VM_BUG_ON(!is_stable_node_chain(chain));
-	hlist_add_head(&dup->hlist_dup, &chain->hlist);
-	ksm_stable_node_dups++;
-}
-
-static inline void __stable_node_dup_del(struct stable_node *dup)
-{
-	hlist_del(&dup->hlist_dup);
-	ksm_stable_node_dups--;
-}
-
-static inline void stable_node_dup_del(struct stable_node *dup)
-{
-	VM_BUG_ON(is_stable_node_chain(dup));
-	if (is_stable_node_dup(dup))
-		__stable_node_dup_del(dup);
-	else
-		rb_erase(&dup->node, root_stable_tree + NUMA(dup->nid));
-#ifdef CONFIG_DEBUG_VM
-	dup->head = NULL;
-#endif
-}
-
 static inline struct rmap_item *alloc_rmap_item(void)
 {
 	struct rmap_item *rmap_item;
@@ -370,8 +303,6 @@ static inline struct stable_node *alloc_stable_node(void)
 
 static inline void free_stable_node(struct stable_node *stable_node)
 {
-	VM_BUG_ON(stable_node->rmap_hlist_len &&
-		  !is_stable_node_chain(stable_node));
 	kmem_cache_free(stable_node_cache, stable_node);
 }
 
@@ -562,80 +493,25 @@ static inline int get_kpfn_nid(unsigned long kpfn)
 	return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn));
 }
 
-static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
-						   struct rb_root *root)
-{
-	struct stable_node *chain = alloc_stable_node();
-	VM_BUG_ON(is_stable_node_chain(dup));
-	if (likely(chain)) {
-		INIT_HLIST_HEAD(&chain->hlist);
-		chain->chain_prune_time = jiffies;
-		chain->rmap_hlist_len = STABLE_NODE_CHAIN;
-#if defined(CONFIG_DEBUG_VM) && defined(CONFIG_NUMA)
-		chain->nid = -1; /* debug */
-#endif
-		ksm_stable_node_chains++;
-
-		/*
-		 * Put the stable node chain in the first dimension of
-		 * the stable tree and at the same time remove the old
-		 * stable node.
-		 */
-		rb_replace_node(&dup->node, &chain->node, root);
-
-		/*
-		 * Move the old stable node to the second dimension
-		 * queued in the hlist_dup. The invariant is that all
-		 * dup stable_nodes in the chain->hlist point to pages
-		 * that are wrprotected and have the exact same
-		 * content.
-		 */
-		stable_node_chain_add_dup(dup, chain);
-	}
-	return chain;
-}
-
-static inline void free_stable_node_chain(struct stable_node *chain,
-					  struct rb_root *root)
-{
-	rb_erase(&chain->node, root);
-	free_stable_node(chain);
-	ksm_stable_node_chains--;
-}
-
 static void remove_node_from_stable_tree(struct stable_node *stable_node)
 {
 	struct rmap_item *rmap_item;
 
-	/* check it's not STABLE_NODE_CHAIN or negative */
-	BUG_ON(stable_node->rmap_hlist_len < 0);
-
 	hlist_for_each_entry(rmap_item, &stable_node->hlist, hlist) {
 		if (rmap_item->hlist.next)
 			ksm_pages_sharing--;
 		else
 			ksm_pages_shared--;
-		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
-		stable_node->rmap_hlist_len--;
 		put_anon_vma(rmap_item->anon_vma);
 		rmap_item->address &= PAGE_MASK;
 		cond_resched();
 	}
 
-	/*
-	 * We need the second aligned pointer of the migrate_nodes
-	 * list_head to stay clear from the rb_parent_color union
-	 * (aligned and different than any node) and also different
-	 * from &migrate_nodes. This will verify that future list.h changes
-	 * don't break STABLE_NODE_DUP_HEAD.
-	 */
-	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD <= &migrate_nodes);
-	BUILD_BUG_ON(STABLE_NODE_DUP_HEAD >= &migrate_nodes + 1);
-
 	if (stable_node->head == &migrate_nodes)
 		list_del(&stable_node->list);
 	else
-		stable_node_dup_del(stable_node);
+		rb_erase(&stable_node->node,
+			 root_stable_tree + NUMA(stable_node->nid));
 	free_stable_node(stable_node);
 }
 
@@ -754,8 +630,6 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item)
 			ksm_pages_sharing--;
 		else
 			ksm_pages_shared--;
-		VM_BUG_ON(stable_node->rmap_hlist_len <= 0);
-		stable_node->rmap_hlist_len--;
 
 		put_anon_vma(rmap_item->anon_vma);
 		rmap_item->address &= PAGE_MASK;
@@ -864,32 +738,6 @@ static int remove_stable_node(struct stable_node *stable_node)
 	return err;
 }
 
-static int remove_stable_node_chain(struct stable_node *stable_node,
-				    struct rb_root *root)
-{
-	struct stable_node *dup;
-	struct hlist_node *hlist_safe;
-
-	if (!is_stable_node_chain(stable_node)) {
-		VM_BUG_ON(is_stable_node_dup(stable_node));
-		if (remove_stable_node(stable_node))
-			return true;
-		else
-			return false;
-	}
-
-	hlist_for_each_entry_safe(dup, hlist_safe,
-				  &stable_node->hlist, hlist_dup) {
-		VM_BUG_ON(!is_stable_node_dup(dup));
-		if (remove_stable_node(dup))
-			return true;
-		cond_resched();
-	}
-	BUG_ON(!hlist_empty(&stable_node->hlist));
-	free_stable_node_chain(stable_node, root);
-	return false;
-}
-
 static int remove_all_stable_nodes(void)
 {
 	struct stable_node *stable_node;
@@ -901,8 +749,7 @@ static int remove_all_stable_nodes(void)
 		while (root_stable_tree[nid].rb_node) {
 			stable_node = rb_entry(root_stable_tree[nid].rb_node,
 						struct stable_node, node);
-			if (remove_stable_node_chain(stable_node,
-						     root_stable_tree + nid)) {
+			if (remove_stable_node(stable_node)) {
 				err = -EBUSY;
 				break;	/* proceed to next nid */
 			}
@@ -1298,163 +1145,6 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item,
 	return err ? NULL : page;
 }
 
-static __always_inline
-bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset)
-{
-	VM_BUG_ON(stable_node->rmap_hlist_len < 0);
-	/*
-	 * Check that at least one mapping still exists, otherwise
-	 * there's no much point to merge and share with this
-	 * stable_node, as the underlying tree_page of the other
-	 * sharer is going to be freed soon.
-	 */
-	return stable_node->rmap_hlist_len &&
-		stable_node->rmap_hlist_len + offset < ksm_max_page_sharing;
-}
-
-static __always_inline
-bool is_page_sharing_candidate(struct stable_node *stable_node)
-{
-	return __is_page_sharing_candidate(stable_node, 0);
-}
-
-static struct stable_node *stable_node_dup(struct stable_node *stable_node,
-					   struct page **tree_page,
-					   struct rb_root *root,
-					   bool prune_stale_stable_nodes)
-{
-	struct stable_node *dup, *found = NULL;
-	struct hlist_node *hlist_safe;
-	struct page *_tree_page;
-	int nr = 0;
-	int found_rmap_hlist_len;
-
-	if (!prune_stale_stable_nodes ||
-	    time_before(jiffies, stable_node->chain_prune_time +
-			msecs_to_jiffies(
-				ksm_stable_node_chains_prune_millisecs)))
-		prune_stale_stable_nodes = false;
-	else
-		stable_node->chain_prune_time = jiffies;
-
-	hlist_for_each_entry_safe(dup, hlist_safe,
-				  &stable_node->hlist, hlist_dup) {
-		cond_resched();
-		/*
-		 * We must walk all stable_node_dup to prune the stale
-		 * stable nodes during lookup.
-		 *
-		 * get_ksm_page can drop the nodes from the
-		 * stable_node->hlist if they point to freed pages
-		 * (that's why we do a _safe walk). The "dup"
-		 * stable_node parameter itself will be freed from
-		 * under us if it returns NULL.
-		 */
-		_tree_page = get_ksm_page(dup, false);
-		if (!_tree_page)
-			continue;
-		nr += 1;
-		if (is_page_sharing_candidate(dup)) {
-			if (!found ||
-			    dup->rmap_hlist_len > found_rmap_hlist_len) {
-				if (found)
-					put_page(*tree_page);
-				found = dup;
-				found_rmap_hlist_len = found->rmap_hlist_len;
-				*tree_page = _tree_page;
-
-				if (!prune_stale_stable_nodes)
-					break;
-				/* skip put_page */
-				continue;
-			}
-		}
-		put_page(_tree_page);
-	}
-
-	/*
-	 * nr is relevant only if prune_stale_stable_nodes is true,
-	 * otherwise we may break the loop at nr == 1 even if there
-	 * are multiple entries.
-	 */
-	if (prune_stale_stable_nodes && found) {
-		if (nr == 1) {
-			/*
-			 * If there's not just one entry it would
-			 * corrupt memory, better BUG_ON. In KSM
-			 * context with no lock held it's not even
-			 * fatal.
-			 */
-			BUG_ON(stable_node->hlist.first->next);
-
-			/*
-			 * There's just one entry and it is below the
-			 * deduplication limit so drop the chain.
-			 */
-			rb_replace_node(&stable_node->node, &found->node,
-					root);
-			free_stable_node(stable_node);
-			ksm_stable_node_chains--;
-			ksm_stable_node_dups--;
-		} else if (__is_page_sharing_candidate(found, 1)) {
-			/*
-			 * Refile our candidate at the head
-			 * after the prune if our candidate
-			 * can accept one more future sharing
-			 * in addition to the one underway.
-			 */
-			hlist_del(&found->hlist_dup);
-			hlist_add_head(&found->hlist_dup,
-				       &stable_node->hlist);
-		}
-	}
-
-	return found;
-}
-
-static struct stable_node *stable_node_dup_any(struct stable_node *stable_node,
-					       struct rb_root *root)
-{
-	if (!is_stable_node_chain(stable_node))
-		return stable_node;
-	if (hlist_empty(&stable_node->hlist)) {
-		free_stable_node_chain(stable_node, root);
-		return NULL;
-	}
-	return hlist_entry(stable_node->hlist.first,
-			   typeof(*stable_node), hlist_dup);
-}
-
-static struct stable_node *__stable_node_chain(struct stable_node *stable_node,
-					       struct page **tree_page,
-					       struct rb_root *root,
-					       bool prune_stale_stable_nodes)
-{
-	if (!is_stable_node_chain(stable_node)) {
-		if (is_page_sharing_candidate(stable_node)) {
-			*tree_page = get_ksm_page(stable_node, false);
-			return stable_node;
-		}
-		return NULL;
-	}
-	return stable_node_dup(stable_node, tree_page, root,
-			       prune_stale_stable_nodes);
-}
-
-static __always_inline struct stable_node *chain_prune(struct stable_node *s_n,
-						       struct page **t_p,
-						       struct rb_root *root)
-{
-	return __stable_node_chain(s_n, t_p, root, true);
-}
-
-static __always_inline struct stable_node *chain(struct stable_node *s_n,
-						 struct page **t_p,
-						 struct rb_root *root)
-{
-	return __stable_node_chain(s_n, t_p, root, false);
-}
-
 /*
  * stable_tree_search - search for page inside the stable tree
  *
@@ -1470,7 +1160,7 @@ static struct page *stable_tree_search(struct page *page)
 	struct rb_root *root;
 	struct rb_node **new;
 	struct rb_node *parent;
-	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
+	struct stable_node *stable_node;
 	struct stable_node *page_node;
 
 	page_node = page_stable_node(page);
@@ -1492,32 +1182,7 @@ again:
 
 		cond_resched();
 		stable_node = rb_entry(*new, struct stable_node, node);
-		stable_node_any = NULL;
-		stable_node_dup = chain_prune(stable_node, &tree_page, root);
-		if (!stable_node_dup) {
-			/*
-			 * Either all stable_node dups were full in
-			 * this stable_node chain, or this chain was
-			 * empty and should be rb_erased.
-			 */
-			stable_node_any = stable_node_dup_any(stable_node,
-							      root);
-			if (!stable_node_any) {
-				/* rb_erase just run */
-				goto again;
-			}
-			/*
-			 * Take any of the stable_node dups page of
-			 * this stable_node chain to let the tree walk
-			 * continue. All KSM pages belonging to the
-			 * stable_node dups in a stable_node chain
-			 * have the same content and they're
-			 * wrprotected at all times. Any will work
-			 * fine to continue the walk.
-			 */
-			tree_page = get_ksm_page(stable_node_any, false);
-		}
-		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
+		tree_page = get_ksm_page(stable_node, false);
 		if (!tree_page) {
 			/*
 			 * If we walked over a stale stable_node,
@@ -1540,34 +1205,6 @@ again:
 		else if (ret > 0)
 			new = &parent->rb_right;
 		else {
-			if (page_node) {
-				VM_BUG_ON(page_node->head != &migrate_nodes);
-				/*
-				 * Test if the migrated page should be merged
-				 * into a stable node dup. If the mapcount is
-				 * 1 we can migrate it with another KSM page
-				 * without adding it to the chain.
-				 */
-				if (page_mapcount(page) > 1)
-					goto chain_append;
-			}
-
-			if (!stable_node_dup) {
-				/*
-				 * If the stable_node is a chain and
-				 * we got a payload match in memcmp
-				 * but we cannot merge the scanned
-				 * page in any of the existing
-				 * stable_node dups because they're
-				 * all full, we need to wait the
-				 * scanned page to find itself a match
-				 * in the unstable tree to create a
-				 * brand new KSM page to add later to
-				 * the dups of this stable_node.
-				 */
-				return NULL;
-			}
-
 			/*
 			 * Lock and unlock the stable_node's page (which
 			 * might already have been migrated) so that page
@@ -1575,21 +1212,23 @@ again:
 			 * It would be more elegant to return stable_node
 			 * than kpage, but that involves more changes.
 			 */
-			tree_page = get_ksm_page(stable_node_dup, true);
-			if (unlikely(!tree_page))
-				/*
-				 * The tree may have been rebalanced,
-				 * so re-evaluate parent and new.
-				 */
-				goto again;
-			unlock_page(tree_page);
-
-			if (get_kpfn_nid(stable_node_dup->kpfn) !=
-			    NUMA(stable_node_dup->nid)) {
-				put_page(tree_page);
-				goto replace;
+			tree_page = get_ksm_page(stable_node, true);
+			if (tree_page) {
+				unlock_page(tree_page);
+				if (get_kpfn_nid(stable_node->kpfn) !=
+						NUMA(stable_node->nid)) {
+					put_page(tree_page);
+					goto replace;
+				}
+				return tree_page;
 			}
-			return tree_page;
+			/*
+			 * There is now a place for page_node, but the tree may
+			 * have been rebalanced, so re-evaluate parent and new.
+			 */
+			if (page_node)
+				goto again;
+			return NULL;
 		}
 	}
 
@@ -1600,72 +1239,22 @@ again:
 	DO_NUMA(page_node->nid = nid);
 	rb_link_node(&page_node->node, parent, new);
 	rb_insert_color(&page_node->node, root);
-out:
-	if (is_page_sharing_candidate(page_node)) {
-		get_page(page);
-		return page;
-	} else
-		return NULL;
+	get_page(page);
+	return page;
 
 replace:
-	if (stable_node_dup == stable_node) {
-		/* there is no chain */
-		if (page_node) {
-			VM_BUG_ON(page_node->head != &migrate_nodes);
-			list_del(&page_node->list);
-			DO_NUMA(page_node->nid = nid);
-			rb_replace_node(&stable_node->node, &page_node->node,
-					root);
-			if (is_page_sharing_candidate(page_node))
-				get_page(page);
-			else
-				page = NULL;
-		} else {
-			rb_erase(&stable_node->node, root);
-			page = NULL;
-		}
+	if (page_node) {
+		list_del(&page_node->list);
+		DO_NUMA(page_node->nid = nid);
+		rb_replace_node(&stable_node->node, &page_node->node, root);
+		get_page(page);
 	} else {
-		VM_BUG_ON(!is_stable_node_chain(stable_node));
-		__stable_node_dup_del(stable_node_dup);
-		if (page_node) {
-			VM_BUG_ON(page_node->head != &migrate_nodes);
-			list_del(&page_node->list);
-			DO_NUMA(page_node->nid = nid);
-			stable_node_chain_add_dup(page_node, stable_node);
-			if (is_page_sharing_candidate(page_node))
-				get_page(page);
-			else
-				page = NULL;
-		} else {
-			page = NULL;
-		}
+		rb_erase(&stable_node->node, root);
+		page = NULL;
 	}
-	stable_node_dup->head = &migrate_nodes;
-	list_add(&stable_node_dup->list, stable_node_dup->head);
+	stable_node->head = &migrate_nodes;
+	list_add(&stable_node->list, stable_node->head);
 	return page;
-
-chain_append:
-	/* stable_node_dup could be null if it reached the limit */
-	if (!stable_node_dup)
-		stable_node_dup = stable_node_any;
-	if (stable_node_dup == stable_node) {
-		/* chain is missing so create it */
-		stable_node = alloc_stable_node_chain(stable_node_dup,
-						      root);
-		if (!stable_node)
-			return NULL;
-	}
-	/*
-	 * Add this stable_node dup that was
-	 * migrated to the stable_node chain
-	 * of the current nid for this page
-	 * content.
-	 */
-	VM_BUG_ON(page_node->head != &migrate_nodes);
-	list_del(&page_node->list);
-	DO_NUMA(page_node->nid = nid);
-	stable_node_chain_add_dup(page_node, stable_node);
-	goto out;
 }
 
 /*
@@ -1682,8 +1271,7 @@ static struct stable_node *stable_tree_insert(struct page *kpage)
 	struct rb_root *root;
 	struct rb_node **new;
 	struct rb_node *parent;
-	struct stable_node *stable_node, *stable_node_dup, *stable_node_any;
-	bool need_chain = false;
+	struct stable_node *stable_node;
 
 	kpfn = page_to_pfn(kpage);
 	nid = get_kpfn_nid(kpfn);
@@ -1698,32 +1286,7 @@ again:
 
 		cond_resched();
 		stable_node = rb_entry(*new, struct stable_node, node);
-		stable_node_any = NULL;
-		stable_node_dup = chain(stable_node, &tree_page, root);
-		if (!stable_node_dup) {
-			/*
-			 * Either all stable_node dups were full in
-			 * this stable_node chain, or this chain was
-			 * empty and should be rb_erased.
-			 */
-			stable_node_any = stable_node_dup_any(stable_node,
-							      root);
-			if (!stable_node_any) {
-				/* rb_erase just run */
-				goto again;
-			}
-			/*
-			 * Take any of the stable_node dups page of
-			 * this stable_node chain to let the tree walk
-			 * continue. All KSM pages belonging to the
-			 * stable_node dups in a stable_node chain
-			 * have the same content and they're
-			 * wrprotected at all times. Any will work
-			 * fine to continue the walk.
-			 */
-			tree_page = get_ksm_page(stable_node_any, false);
-		}
-		VM_BUG_ON(!stable_node_dup ^ !!stable_node_any);
+		tree_page = get_ksm_page(stable_node, false);
 		if (!tree_page) {
 			/*
 			 * If we walked over a stale stable_node,
@@ -1746,37 +1309,27 @@ again:
 		else if (ret > 0)
 			new = &parent->rb_right;
 		else {
-			need_chain = true;
-			break;
+			/*
+			 * It is not a bug that stable_tree_search() didn't
+			 * find this node: because at that time our page was
+			 * not yet write-protected, so may have changed since.
+			 */
+			return NULL;
 		}
 	}
 
-	stable_node_dup = alloc_stable_node();
-	if (!stable_node_dup)
+	stable_node = alloc_stable_node();
+	if (!stable_node)
 		return NULL;
 
-	INIT_HLIST_HEAD(&stable_node_dup->hlist);
-	stable_node_dup->kpfn = kpfn;
-	set_page_stable_node(kpage, stable_node_dup);
-	stable_node_dup->rmap_hlist_len = 0;
-	DO_NUMA(stable_node_dup->nid = nid);
-	if (!need_chain) {
-		rb_link_node(&stable_node_dup->node, parent, new);
-		rb_insert_color(&stable_node_dup->node, root);
-	} else {
-		if (!is_stable_node_chain(stable_node)) {
-			struct stable_node *orig = stable_node;
-			/* chain is missing so create it */
-			stable_node = alloc_stable_node_chain(orig, root);
-			if (!stable_node) {
-				free_stable_node(stable_node_dup);
-				return NULL;
-			}
-		}
-		stable_node_chain_add_dup(stable_node_dup, stable_node);
-	}
+	INIT_HLIST_HEAD(&stable_node->hlist);
+	stable_node->kpfn = kpfn;
+	set_page_stable_node(kpage, stable_node);
+	DO_NUMA(stable_node->nid = nid);
+	rb_link_node(&stable_node->node, parent, new);
+	rb_insert_color(&stable_node->node, root);
 
-	return stable_node_dup;
+	return stable_node;
 }
 
 /*
@@ -1866,27 +1419,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item,
  * the same ksm page.
  */
 static void stable_tree_append(struct rmap_item *rmap_item,
-			       struct stable_node *stable_node,
-			       bool max_page_sharing_bypass)
+			       struct stable_node *stable_node)
 {
-	/*
-	 * rmap won't find this mapping if we don't insert the
-	 * rmap_item in the right stable_node
-	 * duplicate. page_migration could break later if rmap breaks,
-	 * so we can as well crash here. We really need to check for
-	 * rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
-	 * for other negative values as an undeflow if detected here
-	 * for the first time (and not when decreasing rmap_hlist_len)
-	 * would be sign of memory corruption in the stable_node.
-	 */
-	BUG_ON(stable_node->rmap_hlist_len < 0);
-
-	stable_node->rmap_hlist_len++;
-	if (!max_page_sharing_bypass)
-		/* possibly non fatal but unexpected overflow, only warn */
-		WARN_ON_ONCE(stable_node->rmap_hlist_len >
-			     ksm_max_page_sharing);
-
 	rmap_item->head = stable_node;
 	rmap_item->address |= STABLE_FLAG;
 	hlist_add_head(&rmap_item->hlist, &stable_node->hlist);
@@ -1914,26 +1448,19 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 	struct page *kpage;
 	unsigned int checksum;
 	int err;
-	bool max_page_sharing_bypass = false;
 
 	stable_node = page_stable_node(page);
 	if (stable_node) {
 		if (stable_node->head != &migrate_nodes &&
-		    get_kpfn_nid(READ_ONCE(stable_node->kpfn)) !=
-		    NUMA(stable_node->nid)) {
-			stable_node_dup_del(stable_node);
+		    get_kpfn_nid(stable_node->kpfn) != NUMA(stable_node->nid)) {
+			rb_erase(&stable_node->node,
+				 root_stable_tree + NUMA(stable_node->nid));
 			stable_node->head = &migrate_nodes;
 			list_add(&stable_node->list, stable_node->head);
 		}
 		if (stable_node->head != &migrate_nodes &&
 		    rmap_item->head == stable_node)
 			return;
-		/*
-		 * If it's a KSM fork, allow it to go over the sharing limit
-		 * without warnings.
-		 */
-		if (!is_page_sharing_candidate(stable_node))
-			max_page_sharing_bypass = true;
 	}
 
 	/* We first start with searching the page inside the stable tree */
@@ -1953,8 +1480,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 			 * add its rmap_item to the stable tree.
 			 */
 			lock_page(kpage);
-			stable_tree_append(rmap_item, page_stable_node(kpage),
-					   max_page_sharing_bypass);
+			stable_tree_append(rmap_item, page_stable_node(kpage));
 			unlock_page(kpage);
 		}
 		put_page(kpage);
@@ -1987,10 +1513,8 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
 			lock_page(kpage);
 			stable_node = stable_tree_insert(kpage);
 			if (stable_node) {
-				stable_tree_append(tree_rmap_item, stable_node,
-						   false);
-				stable_tree_append(rmap_item, stable_node,
-						   false);
+				stable_tree_append(tree_rmap_item, stable_node);
+				stable_tree_append(rmap_item, stable_node);
 			}
 			unlock_page(kpage);
 
@@ -2259,7 +1783,7 @@ int ksm_madvise(struct vm_area_struct *vma, unsigned long start,
 		 */
 		if (*vm_flags & (VM_MERGEABLE | VM_SHARED  | VM_MAYSHARE   |
 				 VM_PFNMAP    | VM_IO      | VM_DONTEXPAND |
-				 VM_HUGETLB | VM_NONLINEAR | VM_MIXEDMAP))
+				 VM_HUGETLB | VM_MIXEDMAP))
 			return 0;		/* just ignore the advice */
 
 #ifdef VM_SAO
@@ -2390,6 +1914,7 @@ struct page *ksm_might_need_to_copy(struct page *page,
 		return page;		/* let do_swap_page report the error */
 
 	new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+
 	if (new_page) {
 		copy_user_highpage(new_page, page, address, vma);
 
@@ -2514,8 +2039,7 @@ out:
 }
 
 #ifdef CONFIG_MIGRATION
-int rmap_walk_ksm(struct page *page, int (*rmap_one)(struct page *,
-		  struct vm_area_struct *, unsigned long, void *), void *arg)
+int rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct stable_node *stable_node;
 	struct rmap_item *rmap_item;
@@ -2550,11 +2074,19 @@ again:
 			if ((rmap_item->mm == vma->vm_mm) == search_new_forks)
 				continue;
 
-			ret = rmap_one(page, vma, rmap_item->address, arg);
+			if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+				continue;
+
+			ret = rwc->rmap_one(page, vma,
+					rmap_item->address, rwc->arg);
 			if (ret != SWAP_AGAIN) {
 				anon_vma_unlock_read(anon_vma);
 				goto out;
 			}
+			if (rwc->done && rwc->done(page)) {
+				anon_vma_unlock_read(anon_vma);
+				goto out;
+			}
 		}
 		anon_vma_unlock_read(anon_vma);
 	}
@@ -2599,48 +2131,6 @@ static void wait_while_offlining(void)
 	}
 }
 
-static bool stable_node_dup_remove_range(struct stable_node *stable_node,
-					 unsigned long start_pfn,
-					 unsigned long end_pfn)
-{
-	if (stable_node->kpfn >= start_pfn &&
-	    stable_node->kpfn < end_pfn) {
-		/*
-		 * Don't get_ksm_page, page has already gone:
-		 * which is why we keep kpfn instead of page*
-		 */
-		remove_node_from_stable_tree(stable_node);
-		return true;
-	}
-	return false;
-}
-
-static bool stable_node_chain_remove_range(struct stable_node *stable_node,
-					   unsigned long start_pfn,
-					   unsigned long end_pfn,
-					   struct rb_root *root)
-{
-	struct stable_node *dup;
-	struct hlist_node *hlist_safe;
-
-	if (!is_stable_node_chain(stable_node)) {
-		VM_BUG_ON(is_stable_node_dup(stable_node));
-		return stable_node_dup_remove_range(stable_node, start_pfn,
-						    end_pfn);
-	}
-
-	hlist_for_each_entry_safe(dup, hlist_safe,
-				  &stable_node->hlist, hlist_dup) {
-		VM_BUG_ON(!is_stable_node_dup(dup));
-		stable_node_dup_remove_range(dup, start_pfn, end_pfn);
-	}
-	if (hlist_empty(&stable_node->hlist)) {
-		free_stable_node_chain(stable_node, root);
-		return true; /* notify caller that tree was rebalanced */
-	} else
-		return false;
-}
-
 static void ksm_check_stable_tree(unsigned long start_pfn,
 				  unsigned long end_pfn)
 {
@@ -2653,12 +2143,15 @@ static void ksm_check_stable_tree(unsigned long start_pfn,
 		node = rb_first(root_stable_tree + nid);
 		while (node) {
 			stable_node = rb_entry(node, struct stable_node, node);
-			if (stable_node_chain_remove_range(stable_node,
-							   start_pfn, end_pfn,
-							   root_stable_tree +
-							   nid))
+			if (stable_node->kpfn >= start_pfn &&
+			    stable_node->kpfn < end_pfn) {
+				/*
+				 * Don't get_ksm_page, page has already gone:
+				 * which is why we keep kpfn instead of page*
+				 */
+				remove_node_from_stable_tree(stable_node);
 				node = rb_first(root_stable_tree + nid);
-			else
+			} else
 				node = rb_next(node);
 			cond_resched();
 		}
@@ -2883,47 +2376,6 @@ static ssize_t merge_across_nodes_store(struct kobject *kobj,
 KSM_ATTR(merge_across_nodes);
 #endif
 
-static ssize_t max_page_sharing_show(struct kobject *kobj,
-				     struct kobj_attribute *attr, char *buf)
-{
-	return sprintf(buf, "%u\n", ksm_max_page_sharing);
-}
-
-static ssize_t max_page_sharing_store(struct kobject *kobj,
-				      struct kobj_attribute *attr,
-				      const char *buf, size_t count)
-{
-	int err;
-	int knob;
-
-	err = kstrtoint(buf, 10, &knob);
-	if (err)
-		return err;
-	/*
-	 * When a KSM page is created it is shared by 2 mappings. This
-	 * being a signed comparison, it implicitly verifies it's not
-	 * negative.
-	 */
-	if (knob < 2)
-		return -EINVAL;
-
-	if (READ_ONCE(ksm_max_page_sharing) == knob)
-		return count;
-
-	mutex_lock(&ksm_thread_mutex);
-	wait_while_offlining();
-	if (ksm_max_page_sharing != knob) {
-		if (ksm_pages_shared || remove_all_stable_nodes())
-			err = -EBUSY;
-		else
-			ksm_max_page_sharing = knob;
-	}
-	mutex_unlock(&ksm_thread_mutex);
-
-	return err ? err : count;
-}
-KSM_ATTR(max_page_sharing);
-
 static ssize_t pages_shared_show(struct kobject *kobj,
 				 struct kobj_attribute *attr, char *buf)
 {
@@ -2962,46 +2414,6 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
 }
 KSM_ATTR_RO(pages_volatile);
 
-static ssize_t stable_node_dups_show(struct kobject *kobj,
-				     struct kobj_attribute *attr, char *buf)
-{
-	return sprintf(buf, "%lu\n", ksm_stable_node_dups);
-}
-KSM_ATTR_RO(stable_node_dups);
-
-static ssize_t stable_node_chains_show(struct kobject *kobj,
-				       struct kobj_attribute *attr, char *buf)
-{
-	return sprintf(buf, "%lu\n", ksm_stable_node_chains);
-}
-KSM_ATTR_RO(stable_node_chains);
-
-static ssize_t
-stable_node_chains_prune_millisecs_show(struct kobject *kobj,
-					struct kobj_attribute *attr,
-					char *buf)
-{
-	return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
-}
-
-static ssize_t
-stable_node_chains_prune_millisecs_store(struct kobject *kobj,
-					 struct kobj_attribute *attr,
-					 const char *buf, size_t count)
-{
-	unsigned long msecs;
-	int err;
-
-	err = kstrtoul(buf, 10, &msecs);
-	if (err || msecs > UINT_MAX)
-		return -EINVAL;
-
-	ksm_stable_node_chains_prune_millisecs = msecs;
-
-	return count;
-}
-KSM_ATTR(stable_node_chains_prune_millisecs);
-
 static ssize_t full_scans_show(struct kobject *kobj,
 			       struct kobj_attribute *attr, char *buf)
 {
@@ -3021,10 +2433,6 @@ static struct attribute *ksm_attrs[] = {
 #ifdef CONFIG_NUMA
 	&merge_across_nodes_attr.attr,
 #endif
-	&max_page_sharing_attr.attr,
-	&stable_node_chains_attr.attr,
-	&stable_node_dups_attr.attr,
-	&stable_node_chains_prune_millisecs_attr.attr,
 	NULL,
 };
 
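
The rmap_walk_ksm() hunk above converts KSM to the rmap_walk_control
interface: instead of a bare callback-plus-cookie pair, the walker now
receives a structure bundling rmap_one with the optional invalid_vma
filter and done predicate.  A minimal caller-side sketch follows; the
callback names are illustrative only, and the field types are inferred
from the calls visible in the hunk:

	static int my_rmap_one(struct page *page, struct vm_area_struct *vma,
			       unsigned long address, void *arg)
	{
		/* Examine one mapping of @page; SWAP_AGAIN keeps walking. */
		return SWAP_AGAIN;
	}

	static int my_done(struct page *page)
	{
		return 0;	/* non-zero stops the walk early */
	}

	struct rmap_walk_control rwc = {
		.rmap_one	= my_rmap_one,
		.arg		= NULL,
		.done		= my_done,	/* optional, may be NULL */
		.invalid_vma	= NULL,		/* optional VMA filter */
	};

	rmap_walk_ksm(page, &rwc);

Because invalid_vma and done are checked for NULL before being called,
existing walkers only need to fill in rmap_one and arg.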
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -9,18 +9,100 @@
 #include <linux/mm.h>
 #include <linux/list_lru.h>
 #include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/memcontrol.h>
+
+#ifdef CONFIG_MEMCG_KMEM
+static LIST_HEAD(list_lrus);
+static DEFINE_MUTEX(list_lrus_mutex);
+
+static void list_lru_register(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_add(&lru->list, &list_lrus);
+	mutex_unlock(&list_lrus_mutex);
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+	mutex_lock(&list_lrus_mutex);
+	list_del(&lru->list);
+	mutex_unlock(&list_lrus_mutex);
+}
+#else
+static void list_lru_register(struct list_lru *lru)
+{
+}
+
+static void list_lru_unregister(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+#ifdef CONFIG_MEMCG_KMEM
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	return !!lru->node[0].memcg_lrus;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	/*
+	 * The lock protects the array of per cgroup lists from relocation
+	 * (see memcg_update_list_lru_node).
+	 */
+	lockdep_assert_held(&nlru->lock);
+	if (nlru->memcg_lrus && idx >= 0)
+		return nlru->memcg_lrus->lru[idx];
+
+	return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	struct mem_cgroup *memcg;
+
+	if (!nlru->memcg_lrus)
+		return &nlru->lru;
+
+	memcg = mem_cgroup_from_kmem(ptr);
+	if (!memcg)
+		return &nlru->lru;
+
+	return list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
+}
+#else
+static inline bool list_lru_memcg_aware(struct list_lru *lru)
+{
+	return false;
+}
+
+static inline struct list_lru_one *
+list_lru_from_memcg_idx(struct list_lru_node *nlru, int idx)
+{
+	return &nlru->lru;
+}
+
+static inline struct list_lru_one *
+list_lru_from_kmem(struct list_lru_node *nlru, void *ptr)
+{
+	return &nlru->lru;
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 bool list_lru_add(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
-	WARN_ON_ONCE(nlru->nr_items < 0);
+	l = list_lru_from_kmem(nlru, item);
 	if (list_empty(item)) {
-		list_add_tail(item, &nlru->list);
-		if (nlru->nr_items++ == 0)
-			node_set(nid, lru->active_nodes);
+		list_add_tail(item, &l->list);
+		l->nr_items++;
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -33,13 +115,13 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 {
 	int nid = page_to_nid(virt_to_page(item));
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 
 	spin_lock(&nlru->lock);
+	l = list_lru_from_kmem(nlru, item);
 	if (!list_empty(item)) {
 		list_del_init(item);
-		if (--nlru->nr_items == 0)
-			node_clear(nid, lru->active_nodes);
-		WARN_ON_ONCE(nlru->nr_items < 0);
+		l->nr_items--;
 		spin_unlock(&nlru->lock);
 		return true;
 	}
@@ -48,33 +130,72 @@ bool list_lru_del(struct list_lru *lru, struct list_head *item)
 }
 EXPORT_SYMBOL_GPL(list_lru_del);
 
-unsigned long
-list_lru_count_node(struct list_lru *lru, int nid)
+void list_lru_isolate(struct list_lru_one *list, struct list_head *item)
+{
+	list_del_init(item);
+	list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate);
+
+void list_lru_isolate_move(struct list_lru_one *list, struct list_head *item,
+			   struct list_head *head)
+{
+	list_move(item, head);
+	list->nr_items--;
+}
+EXPORT_SYMBOL_GPL(list_lru_isolate_move);
+
+static unsigned long __list_lru_count_one(struct list_lru *lru,
+					  int nid, int memcg_idx)
 {
-	unsigned long count = 0;
 	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
+	unsigned long count;
 
 	spin_lock(&nlru->lock);
-	WARN_ON_ONCE(nlru->nr_items < 0);
-	count += nlru->nr_items;
+	l = list_lru_from_memcg_idx(nlru, memcg_idx);
+	count = l->nr_items;
 	spin_unlock(&nlru->lock);
 
 	return count;
 }
+
+unsigned long list_lru_count_one(struct list_lru *lru,
+				 int nid, struct mem_cgroup *memcg)
+{
+	return __list_lru_count_one(lru, nid, memcg_cache_id(memcg));
+}
+EXPORT_SYMBOL_GPL(list_lru_count_one);
+
+unsigned long list_lru_count_node(struct list_lru *lru, int nid)
+{
+	long count = 0;
+	int memcg_idx;
+
+	count += __list_lru_count_one(lru, nid, -1);
+	if (list_lru_memcg_aware(lru)) {
+		for_each_memcg_cache_index(memcg_idx)
+			count += __list_lru_count_one(lru, nid, memcg_idx);
+	}
+	return count;
+}
 EXPORT_SYMBOL_GPL(list_lru_count_node);
 
-unsigned long
-list_lru_walk_node(struct list_lru *lru, int nid, list_lru_walk_cb isolate,
-		   void *cb_arg, unsigned long *nr_to_walk)
+static unsigned long
+__list_lru_walk_one(struct list_lru *lru, int nid, int memcg_idx,
+		    list_lru_walk_cb isolate, void *cb_arg,
+		    unsigned long *nr_to_walk)
 {
 
-	struct list_lru_node	*nlru = &lru->node[nid];
+	struct list_lru_node *nlru = &lru->node[nid];
+	struct list_lru_one *l;
 	struct list_head *item, *n;
 	unsigned long isolated = 0;
 
 	spin_lock(&nlru->lock);
+	l = list_lru_from_memcg_idx(nlru, memcg_idx);
 restart:
-	list_for_each_safe(item, n, &nlru->list) {
+	list_for_each_safe(item, n, &l->list) {
 		enum lru_status ret;
 
 		/*
@@ -85,14 +206,11 @@ restart:
 			break;
 		--*nr_to_walk;
 
-		ret = isolate(item, &nlru->lock, cb_arg);
+		ret = isolate(item, l, &nlru->lock, cb_arg);
 		switch (ret) {
 		case LRU_REMOVED_RETRY:
 			assert_spin_locked(&nlru->lock);
 		case LRU_REMOVED:
-			if (--nlru->nr_items == 0)
-				node_clear(nid, lru->active_nodes);
-			WARN_ON_ONCE(nlru->nr_items < 0);
 			isolated++;
 			/*
 			 * If the lru lock has been dropped, our list
@@ -103,7 +221,7 @@ restart:
 				goto restart;
 			break;
 		case LRU_ROTATE:
-			list_move_tail(item, &nlru->list);
+			list_move_tail(item, &l->list);
 			break;
 		case LRU_SKIP:
 			break;
@@ -122,31 +240,322 @@ restart:
 	spin_unlock(&nlru->lock);
 	return isolated;
 }
+
+unsigned long
+list_lru_walk_one(struct list_lru *lru, int nid, struct mem_cgroup *memcg,
+		  list_lru_walk_cb isolate, void *cb_arg,
+		  unsigned long *nr_to_walk)
+{
+	return __list_lru_walk_one(lru, nid, memcg_cache_id(memcg),
+				   isolate, cb_arg, nr_to_walk);
+}
+EXPORT_SYMBOL_GPL(list_lru_walk_one);
+
+unsigned long list_lru_walk_node(struct list_lru *lru, int nid,
+				 list_lru_walk_cb isolate, void *cb_arg,
+				 unsigned long *nr_to_walk)
+{
+	long isolated = 0;
+	int memcg_idx;
+
+	isolated += __list_lru_walk_one(lru, nid, -1, isolate, cb_arg,
+					nr_to_walk);
+	if (*nr_to_walk > 0 && list_lru_memcg_aware(lru)) {
+		for_each_memcg_cache_index(memcg_idx) {
+			isolated += __list_lru_walk_one(lru, nid, memcg_idx,
+						isolate, cb_arg, nr_to_walk);
+			if (*nr_to_walk <= 0)
+				break;
+		}
+	}
+	return isolated;
+}
 EXPORT_SYMBOL_GPL(list_lru_walk_node);
 
-int list_lru_init_key(struct list_lru *lru, struct lock_class_key *key)
+static void init_one_lru(struct list_lru_one *l)
+{
+	INIT_LIST_HEAD(&l->list);
+	l->nr_items = 0;
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+static void __memcg_destroy_list_lru_node(struct list_lru_memcg *memcg_lrus,
+					  int begin, int end)
+{
+	int i;
+
+	for (i = begin; i < end; i++)
+		kfree(memcg_lrus->lru[i]);
+}
+
+static int __memcg_init_list_lru_node(struct list_lru_memcg *memcg_lrus,
+				      int begin, int end)
+{
+	int i;
+
+	for (i = begin; i < end; i++) {
+		struct list_lru_one *l;
+
+		l = kmalloc(sizeof(struct list_lru_one), GFP_KERNEL);
+		if (!l)
+			goto fail;
+
+		init_one_lru(l);
+		memcg_lrus->lru[i] = l;
+	}
+	return 0;
+fail:
+	__memcg_destroy_list_lru_node(memcg_lrus, begin, i);
+	return -ENOMEM;
+}
+
+static int memcg_init_list_lru_node(struct list_lru_node *nlru)
+{
+	int size = memcg_nr_cache_ids;
+
+	nlru->memcg_lrus = kmalloc(size * sizeof(void *), GFP_KERNEL);
+	if (!nlru->memcg_lrus)
+		return -ENOMEM;
+
+	if (__memcg_init_list_lru_node(nlru->memcg_lrus, 0, size)) {
+		kfree(nlru->memcg_lrus);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
+{
+	__memcg_destroy_list_lru_node(nlru->memcg_lrus, 0, memcg_nr_cache_ids);
+	kfree(nlru->memcg_lrus);
+}
+
+static int memcg_update_list_lru_node(struct list_lru_node *nlru,
+				      int old_size, int new_size)
+{
+	struct list_lru_memcg *old, *new;
+
+	BUG_ON(old_size > new_size);
+
+	old = nlru->memcg_lrus;
+	new = kmalloc(new_size * sizeof(void *), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	if (__memcg_init_list_lru_node(new, old_size, new_size)) {
+		kfree(new);
+		return -ENOMEM;
+	}
+
+	memcpy(new, old, old_size * sizeof(void *));
+
+	/*
+	 * The lock guarantees that we won't race with a reader
+	 * (see list_lru_from_memcg_idx).
+	 *
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+	nlru->memcg_lrus = new;
+	spin_unlock_irq(&nlru->lock);
+
+	kfree(old);
+	return 0;
+}
+
+static void memcg_cancel_update_list_lru_node(struct list_lru_node *nlru,
+					      int old_size, int new_size)
+{
+	/* do not bother shrinking the array back to the old size, because we
+	 * cannot handle allocation failures here */
+	__memcg_destroy_list_lru_node(nlru->memcg_lrus, old_size, new_size);
+}
+
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	int i;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (!memcg_aware)
+			lru->node[i].memcg_lrus = NULL;
+		else if (memcg_init_list_lru_node(&lru->node[i]))
+			goto fail;
+	}
+	return 0;
+fail:
+	for (i = i - 1; i >= 0; i--)
+		memcg_destroy_list_lru_node(&lru->node[i]);
+	return -ENOMEM;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_destroy_list_lru_node(&lru->node[i]);
+}
+
+static int memcg_update_list_lru(struct list_lru *lru,
+				 int old_size, int new_size)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return 0;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		if (memcg_update_list_lru_node(&lru->node[i],
+					       old_size, new_size))
+			goto fail;
+	}
+	return 0;
+fail:
+	for (i = i - 1; i >= 0; i--)
+		memcg_cancel_update_list_lru_node(&lru->node[i],
+						  old_size, new_size);
+	return -ENOMEM;
+}
+
+static void memcg_cancel_update_list_lru(struct list_lru *lru,
+					 int old_size, int new_size)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_cancel_update_list_lru_node(&lru->node[i],
+						  old_size, new_size);
+}
+
+int memcg_update_all_list_lrus(int new_size)
+{
+	int ret = 0;
+	struct list_lru *lru;
+	int old_size = memcg_nr_cache_ids;
+
+	mutex_lock(&list_lrus_mutex);
+	list_for_each_entry(lru, &list_lrus, list) {
+		ret = memcg_update_list_lru(lru, old_size, new_size);
+		if (ret)
+			goto fail;
+	}
+out:
+	mutex_unlock(&list_lrus_mutex);
+	return ret;
+fail:
+	list_for_each_entry_continue_reverse(lru, &list_lrus, list)
+		memcg_cancel_update_list_lru(lru, old_size, new_size);
+	goto out;
+}
+
+static void memcg_drain_list_lru_node(struct list_lru_node *nlru,
+				      int src_idx, int dst_idx)
+{
+	struct list_lru_one *src, *dst;
+
+	/*
+	 * Since list_lru_{add,del} may be called under an IRQ-safe lock,
+	 * we have to use IRQ-safe primitives here to avoid deadlock.
+	 */
+	spin_lock_irq(&nlru->lock);
+
+	src = list_lru_from_memcg_idx(nlru, src_idx);
+	dst = list_lru_from_memcg_idx(nlru, dst_idx);
+
+	list_splice_init(&src->list, &dst->list);
+	dst->nr_items += src->nr_items;
+	src->nr_items = 0;
+
+	spin_unlock_irq(&nlru->lock);
+}
+
+static void memcg_drain_list_lru(struct list_lru *lru,
+				 int src_idx, int dst_idx)
+{
+	int i;
+
+	if (!list_lru_memcg_aware(lru))
+		return;
+
+	for (i = 0; i < nr_node_ids; i++)
+		memcg_drain_list_lru_node(&lru->node[i], src_idx, dst_idx);
+}
+
+void memcg_drain_all_list_lrus(int src_idx, int dst_idx)
+{
+	struct list_lru *lru;
+
+	mutex_lock(&list_lrus_mutex);
+	list_for_each_entry(lru, &list_lrus, list)
+		memcg_drain_list_lru(lru, src_idx, dst_idx);
+	mutex_unlock(&list_lrus_mutex);
+}
+#else
+static int memcg_init_list_lru(struct list_lru *lru, bool memcg_aware)
+{
+	return 0;
+}
+
+static void memcg_destroy_list_lru(struct list_lru *lru)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+int __list_lru_init(struct list_lru *lru, bool memcg_aware,
+		    struct lock_class_key *key)
 {
 	int i;
 	size_t size = sizeof(*lru->node) * nr_node_ids;
+	int err = -ENOMEM;
+
+	memcg_get_cache_ids();
 
 	lru->node = kzalloc(size, GFP_KERNEL);
 	if (!lru->node)
-		return -ENOMEM;
+		goto out;
 
-	nodes_clear(lru->active_nodes);
 	for (i = 0; i < nr_node_ids; i++) {
 		spin_lock_init(&lru->node[i].lock);
 		if (key)
 			lockdep_set_class(&lru->node[i].lock, key);
-		INIT_LIST_HEAD(&lru->node[i].list);
-		lru->node[i].nr_items = 0;
+		init_one_lru(&lru->node[i].lru);
 	}
-	return 0;
+
+	err = memcg_init_list_lru(lru, memcg_aware);
+	if (err) {
+		kfree(lru->node);
+		goto out;
+	}
+
+	list_lru_register(lru);
+out:
+	memcg_put_cache_ids();
+	return err;
 }
-EXPORT_SYMBOL_GPL(list_lru_init_key);
+EXPORT_SYMBOL_GPL(__list_lru_init);
 
 void list_lru_destroy(struct list_lru *lru)
 {
+	/* Already destroyed or not yet initialized? */
+	if (!lru->node)
+		return;
+
+	memcg_get_cache_ids();
+
+	list_lru_unregister(lru);
+
+	memcg_destroy_list_lru(lru);
 	kfree(lru->node);
+	lru->node = NULL;
+
+	memcg_put_cache_ids();
 }
 EXPORT_SYMBOL_GPL(list_lru_destroy);
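
Taken together, the list_lru conversion above splits each per-node list
into per-memcg sublists selected by list_lru_from_kmem(), with walkers
operating on one sublist at a time.  A minimal usage sketch follows; it
assumes __list_lru_init() is called directly (in-tree users would go
through a header wrapper), and the callback name is illustrative:

	static enum lru_status my_isolate(struct list_head *item,
					  struct list_lru_one *l,
					  spinlock_t *lock, void *cb_arg)
	{
		/* Unlink under nlru->lock and report the item reclaimed. */
		list_lru_isolate(l, item);
		return LRU_REMOVED;
	}

	static struct list_lru my_lru;

	static int my_setup(void)
	{
		/* true makes the LRU memcg aware on CONFIG_MEMCG_KMEM. */
		return __list_lru_init(&my_lru, true, NULL);
	}

	static void my_scan(struct list_head *item, struct mem_cgroup *memcg)
	{
		unsigned long nr_to_walk = 32;

		list_lru_add(&my_lru, item);
		list_lru_walk_one(&my_lru, numa_node_id(), memcg,
				  my_isolate, NULL, &nr_to_walk);
	}

Note that the isolate callback must use list_lru_isolate() or
list_lru_isolate_move() rather than plain list_del_init(), so that the
per-sublist nr_items counter stays in sync.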
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -156,7 +156,7 @@ static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start,
 		pte = *(orig_pte + ((index - start) / PAGE_SIZE));
 		pte_unmap_unlock(orig_pte, ptl);
 
-		if (pte_present(pte) || pte_none(pte) || pte_file(pte))
+		if (pte_present(pte) || pte_none(pte))
 			continue;
 		entry = pte_to_swp_entry(pte);
 		if (unlikely(non_swap_entry(entry)))
@@ -279,14 +279,7 @@ static long madvise_dontneed(struct vm_area_struct * vma,
 	if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP))
 		return -EINVAL;
 
-	if (unlikely(vma->vm_flags & VM_NONLINEAR)) {
-		struct zap_details details = {
-			.nonlinear_vma = vma,
-			.last_index = ULONG_MAX,
-		};
-		zap_page_range(vma, start, end - start, &details);
-	} else
-		zap_page_range(vma, start, end - start, NULL);
+	zap_page_range(vma, start, end - start, NULL);
 	return 0;
 }
 
@@ -307,7 +300,7 @@ static long madvise_remove(struct vm_area_struct *vma,
 
 	*prev = NULL;	/* tell sys_madvise we drop mmap_sem */
 
-	if (vma->vm_flags & (VM_LOCKED|VM_NONLINEAR|VM_HUGETLB))
+	if (vma->vm_flags & (VM_LOCKED | VM_HUGETLB))
 		return -EINVAL;
 
 	f = vma->vm_file;
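
With VM_NONLINEAR gone, MADV_DONTNEED above always takes the plain
zap_page_range() path.  The userspace-visible semantics are unchanged
and can be demonstrated with nothing but standard calls; a runnable
sketch for a private anonymous mapping:

	#include <sys/mman.h>
	#include <assert.h>
	#include <stddef.h>

	int main(void)
	{
		size_t len = 4096;
		char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		assert(p != MAP_FAILED);
		p[0] = 42;			/* populate the page */
		madvise(p, len, MADV_DONTNEED);	/* zap it */
		assert(p[0] == 0);		/* refaults as zero-fill */
		return 0;
	}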
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -48,16 +48,19 @@
 #include <linux/sort.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
-#include <linux/vmalloc.h>
 #include <linux/vmpressure.h>
 #include <linux/mm_inline.h>
 #include <linux/page_cgroup.h>
 #include <linux/cpu.h>
 #include <linux/oom.h>
+#include <linux/virtinfo.h>
+#include <linux/migrate.h>
 #include "internal.h"
 #include <net/sock.h>
 #include <net/ip.h>
 #include <net/tcp_memcontrol.h>
+#include <net/udp_memcontrol.h>
+#include "slab.h"
 
 #include <asm/uaccess.h>
 
@@ -96,6 +99,9 @@ enum mem_cgroup_stat_index {
 	MEM_CGROUP_STAT_RSS,		/* # of pages charged as anon rss */
 	MEM_CGROUP_STAT_RSS_HUGE,	/* # of pages charged as anon huge */
 	MEM_CGROUP_STAT_FILE_MAPPED,	/* # of pages charged as file rss */
+	MEM_CGROUP_STAT_SHMEM,		/* # of charged shmem pages */
+	MEM_CGROUP_STAT_SLAB_RECLAIMABLE, /* # of reclaimable slab pages */
+	MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE, /* # of unreclaimable slab pages */
 	MEM_CGROUP_STAT_SWAP,		/* # of pages, swapped out */
 	MEM_CGROUP_STAT_NSTATS,
 };
@@ -105,12 +111,17 @@ static const char * const mem_cgroup_stat_names[] = {
 	"rss",
 	"rss_huge",
 	"mapped_file",
+	"shmem",
+	"slab_reclaimable",
+	"slab_unreclaimable",
 	"swap",
 };
 
 enum mem_cgroup_events_index {
 	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
 	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
+	MEM_CGROUP_EVENTS_PSWPIN,	/* # of pages swapped in */
+	MEM_CGROUP_EVENTS_PSWPOUT,	/* # of pages swapped out */
 	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
 	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
 	MEM_CGROUP_EVENTS_NSTATS,
@@ -119,6 +130,8 @@ enum mem_cgroup_events_index {
 static const char * const mem_cgroup_events_names[] = {
 	"pgpgin",
 	"pgpgout",
+	"pswpin",
+	"pswpout",
 	"pgfault",
 	"pgmajfault",
 };
@@ -267,46 +280,46 @@ struct mem_cgroup {
 
 	unsigned long soft_limit;
 
+	/* Normal memory consumption range */
+	unsigned long low;
+	unsigned long high;
+
 	/* vmpressure notifications */
 	struct vmpressure vmpressure;
 
-	union {
-		/*
-		 * the counter to account for mem+swap usage.
-		 */
-		struct page_counter memsw;
-		/*
-		 * rcu_freeing is used only when freeing struct mem_cgroup,
-		 * so put it into a union to avoid wasting more memory.
-		 * It must be disjoint from the css field.  It could be
-		 * in a union with the res field, but res plays a much
-		 * larger part in mem_cgroup life than memsw, and might
-		 * be of interest, even at time of free, when debugging.
-		 * So share rcu_head with the less interesting memsw.
-		 */
-		struct rcu_head rcu_freeing;
-		/*
-		 * We also need some space for a worker in deferred freeing.
-		 * By the time we call it, rcu_freeing is no longer in use.
-		 */
-		struct work_struct work_freeing;
-	};
 	/*
 	 * the counter to account for kernel memory usage.
 	 */
 	struct page_counter kmem;
+	struct page_counter memsw;
+	/*
+	 * the counter to account for dcache usage.
+	 *
+	 * Never limited, only needed for showing stats. We could use a per cpu
+	 * counter if we did not have to report max usage.
+	 */
+	struct page_counter dcache;
+
+	/* beancounter-related stats */
+	unsigned long long swap_max;
+	atomic_long_t mem_failcnt;
+	atomic_long_t swap_failcnt;
+	atomic_long_t oom_kill_cnt;
+
+	struct oom_context oom_ctx;
+	unsigned long oom_guarantee;
+
 	/*
 	 * Should the accounting and control be hierarchical, per subtree?
 	 */
 	bool use_hierarchy;
+	bool is_offline;
 	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */
 
 	bool		oom_lock;
 	atomic_t	under_oom;
 	atomic_t	oom_wakeups;
 
-	atomic_t	refcnt;
-
 	int	swappiness;
 	/* OOM-Killer disable */
 	int		oom_kill_disable;
@@ -314,6 +327,20 @@ struct mem_cgroup {
 	/* set when res.limit == memsw.limit */
 	bool		memsw_is_minimum;
 
+#ifdef CONFIG_CLEANCACHE
+	/*
+	 * cleancache_disabled_toggle: toggled by writing to
+	 * memory.disable_cleancache
+	 *
+	 * cleancache_disabled: set iff cleancache_disabled_toggle is
+	 * set in this cgroup or any of its ascendants; controls whether
+	 * cleancache callback is called when a page is evicted from
+	 * this cgroup
+	 */
+	bool cleancache_disabled_toggle;
+	bool cleancache_disabled;
+#endif
+
 	/* protect arrays of thresholds */
 	struct mutex thresholds_lock;
 
@@ -341,24 +368,18 @@ struct mem_cgroup {
 	 * percpu counter.
 	 */
 	struct mem_cgroup_stat_cpu __percpu *stat;
-	/*
-	 * used when a cpu is offlined or other synchronizations
-	 * See mem_cgroup_read_stat().
-	 */
-	struct mem_cgroup_stat_cpu nocpu_base;
 	spinlock_t pcp_counter_lock;
 
 	atomic_t	dead_count;
 #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
 	struct tcp_memcontrol tcp_mem;
+	struct udp_memcontrol udp_mem;
 #endif
 #if defined(CONFIG_MEMCG_KMEM)
-	/* analogous to slab_common's slab_caches list. per-memcg */
-	struct list_head memcg_slab_caches;
-	/* Not a spinlock, we can take a lot of time walking the list */
-	struct mutex slab_caches_mutex;
-        /* Index in the kmem_cache->memcg_params->memcg_caches array */
+	/* Index in the kmem_cache->memcg_params.memcg_caches array */
 	int kmemcg_id;
+	/* List of memcgs sharing the same kmemcg_id */
+	struct list_head kmemcg_sharers;
 #endif
 
 	int last_scanned_node;
@@ -378,48 +399,27 @@ struct mem_cgroup {
 	struct mem_cgroup_lru_info info;
 };
 
-static size_t memcg_size(void)
-{
-	return sizeof(struct mem_cgroup) +
-		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
-}
-
 /* internal only representation about the status of kmem accounting. */
 enum {
-	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
-	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
+	KMEM_ACCOUNTED_ACTIVE, /* accounted by this cgroup itself */
+	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled */
 	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
 };
 
-/* We account when limit is on, but only after call sites are patched */
-#define KMEM_ACCOUNTED_MASK \
-		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))
-
 #ifdef CONFIG_MEMCG_KMEM
-static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
-{
-	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
-}
-
-static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
+bool memcg_kmem_is_active(struct mem_cgroup *memcg)
 {
 	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
 }
 
-static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
-{
-	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
-static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
-{
-	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
-}
-
 static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
 {
-	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
-		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
+	/*
+	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
+	 * will call css_put() if it sees the memcg is dead.
+	 */
+	smp_wmb();
+	set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
 }
 
 static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
@@ -510,9 +510,6 @@ enum res_type {
  */
 static DEFINE_MUTEX(memcg_create_mutex);
 
-static void mem_cgroup_get(struct mem_cgroup *memcg);
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
 static inline
 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
 {
@@ -563,15 +560,15 @@ void sock_update_memcg(struct sock *sk)
 		 */
 		if (sk->sk_cgrp) {
 			BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
-			mem_cgroup_get(sk->sk_cgrp->memcg);
+			css_get(&sk->sk_cgrp->memcg->css);
 			return;
 		}
 
 		rcu_read_lock();
 		memcg = mem_cgroup_from_task(current);
 		cg_proto = sk->sk_prot->proto_cgroup(memcg);
-		if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
-			mem_cgroup_get(memcg);
+		if (!mem_cgroup_is_root(memcg) &&
+		    memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
 			sk->sk_cgrp = cg_proto;
 		}
 		rcu_read_unlock();
@@ -585,7 +582,7 @@ void sock_release_memcg(struct sock *sk)
 		struct mem_cgroup *memcg;
 		WARN_ON(!sk->sk_cgrp->memcg);
 		memcg = sk->sk_cgrp->memcg;
-		mem_cgroup_put(memcg);
+		css_put(&sk->sk_cgrp->memcg->css);
 	}
 }
 
@@ -598,11 +595,21 @@ struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg)
 }
 EXPORT_SYMBOL(tcp_proto_cgroup);
 
+struct cg_proto *udp_proto_cgroup(struct mem_cgroup *memcg)
+{
+	if (!memcg || mem_cgroup_is_root(memcg))
+		return NULL;
+
+	return &memcg->udp_mem.cg_proto;
+}
+EXPORT_SYMBOL(udp_proto_cgroup);
+
 static void disarm_sock_keys(struct mem_cgroup *memcg)
 {
-	if (!memcg_proto_activated(&memcg->tcp_mem.cg_proto))
-		return;
-	static_key_slow_dec(&memcg_socket_limit_enabled);
+	if (memcg_proto_activated(&memcg->tcp_mem.cg_proto))
+		static_key_slow_dec(&memcg_socket_limit_enabled);
+	if (memcg_proto_activated(&memcg->udp_mem.cg_proto))
+		static_key_slow_dec(&memcg_socket_limit_enabled);
 }
 #else
 static void disarm_sock_keys(struct mem_cgroup *memcg)
@@ -612,7 +619,7 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
 
 #ifdef CONFIG_MEMCG_KMEM
 /*
- * This will be the memcg's index in each cache's ->memcg_params->memcg_caches.
+ * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  * There are two main reasons for not using the css_id for this:
  *  1) this works better in sparse environments, where we have a lot of memcgs,
  *     but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -624,12 +631,24 @@ static void disarm_sock_keys(struct mem_cgroup *memcg)
  *     css_id. Having a separate index prevents us from messing with the cgroup
  *     core for this
  *
- * The current size of the caches array is stored in
- * memcg_limited_groups_array_size.  It will double each time we have to
- * increase it.
+ * The current size of the caches array is stored in memcg_nr_cache_ids. It
+ * will double each time we have to increase it.
  */
-static DEFINE_IDA(kmem_limited_groups);
-int memcg_limited_groups_array_size;
+static DEFINE_IDA(memcg_cache_ida);
+int memcg_nr_cache_ids;
+
+/* Protects memcg_nr_cache_ids */
+static DECLARE_RWSEM(memcg_cache_ids_sem);
+
+void memcg_get_cache_ids(void)
+{
+	down_read(&memcg_cache_ids_sem);
+}
+
+void memcg_put_cache_ids(void)
+{
+	up_read(&memcg_cache_ids_sem);
+}
 
 /*
  * MIN_SIZE is different than 1, because we would like to avoid going through
@@ -657,10 +676,8 @@ EXPORT_SYMBOL(memcg_kmem_enabled_key);
 
 static void disarm_kmem_keys(struct mem_cgroup *memcg)
 {
-	if (memcg_kmem_is_active(memcg)) {
+	if (test_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags))
 		static_key_slow_dec(&memcg_kmem_enabled_key);
-		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
-	}
 	/*
 	 * This check can't live in kmem destruction function,
 	 * since the charges will outlive the cgroup
@@ -693,6 +710,32 @@ struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
 	return &memcg->css;
 }
 
+/**
+ * page_cgroup_ino - return inode number of the memcg a page is charged to
+ * @page: the page
+ *
+ * Look up the memory cgroup @page is charged to and return its inode number or
+ * 0 if @page is not charged to any cgroup. It is safe to call this function
+ * without holding a reference to @page.
+ *
+ * Note, this function is inherently racy, because there is nothing to prevent
+ * the cgroup inode from getting torn down and potentially reallocated a moment
+ * after page_cgroup_ino() returns, so it only should be used by callers that
+ * do not care (such as procfs interfaces).
+ */
+ino_t page_cgroup_ino(struct page *page)
+{
+	struct page_cgroup *pc;
+	unsigned long ino = 0;
+
+	pc = lookup_page_cgroup(page);
+	if (likely(PageCgroupUsed(pc)))
+		ino = pc->mem_cgroup->css.cgroup->dentry->d_inode->i_ino;
+	return ino;
+}
+
 static struct mem_cgroup_per_zone *
 page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
 {
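
page_cgroup_ino() above is deliberately lock-free and best-effort, which
makes it suitable only for observers that tolerate staleness.  A sketch
of such a consumer, iterating a pfn range the way a procfs dumper might
(the function itself is hypothetical, not part of this patch):

	static void dump_page_cgroups(unsigned long start_pfn,
				      unsigned long end_pfn)
	{
		unsigned long pfn;

		for (pfn = start_pfn; pfn < end_pfn; pfn++) {
			ino_t ino;

			if (!pfn_valid(pfn))
				continue;
			ino = page_cgroup_ino(pfn_to_page(pfn));
			if (ino)	/* 0 means the page is not charged */
				pr_info("pfn %lu -> memcg inode %lu\n",
					pfn, (unsigned long)ino);
		}
	}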
@@ -767,9 +810,11 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
 				struct mem_cgroup_per_zone *mz,
 				struct mem_cgroup_tree_per_zone *mctz)
 {
-	spin_lock(&mctz->lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mctz->lock, flags);
 	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
-	spin_unlock(&mctz->lock);
+	spin_unlock_irqrestore(&mctz->lock, flags);
 }
 
 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
@@ -805,7 +850,9 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 		 * mem is over its softlimit.
 		 */
 		if (excess || mz->on_tree) {
-			spin_lock(&mctz->lock);
+			unsigned long flags;
+
+			spin_lock_irqsave(&mctz->lock, flags);
 			/* if on-tree, remove it */
 			if (mz->on_tree)
 				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
@@ -814,7 +861,7 @@ static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 			 * If excess is 0, no tree ops.
 			 */
 			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
-			spin_unlock(&mctz->lock);
+			spin_unlock_irqrestore(&mctz->lock, flags);
 		}
 	}
 }
@@ -865,19 +912,21 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
 {
 	struct mem_cgroup_per_zone *mz;
 
-	spin_lock(&mctz->lock);
+	spin_lock_irq(&mctz->lock);
 	mz = __mem_cgroup_largest_soft_limit_node(mctz);
-	spin_unlock(&mctz->lock);
+	spin_unlock_irq(&mctz->lock);
 	return mz;
 }
 
 /*
+ * Return page count for single (non recursive) @memcg.
+ *
  * Implementation Note: reading percpu statistics for memcg.
  *
  * Both of vmstat[] and percpu_counter has threshold and do periodic
  * synchronization to implement "quick" read. There are trade-off between
  * reading cost and precision of value. Then, we may have a chance to implement
- * a periodic synchronizion of counter in memcg's counter.
+ * a periodic synchronization of counter in memcg's counter.
  *
  * But this _read() function is used for user interface now. The user accounts
  * memory usage by memory cgroup and he _always_ requires exact value because
@@ -887,32 +936,59 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
  *
  * If there are kernel internal actions which can make use of some not-exact
  * value, and reading all cpu value can be performance bottleneck in some
- * common workload, threashold and synchonization as vmstat[] should be
+ * common workload, threshold and synchronization as vmstat[] should be
  * implemented.
  */
-static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
-				 enum mem_cgroup_stat_index idx)
+static unsigned long
+mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 {
 	long val = 0;
 	int cpu;
 
-	get_online_cpus();
-	for_each_online_cpu(cpu)
+	/* Per-cpu values can be negative, use a signed accumulator */
+	for_each_possible_cpu(cpu)
 		val += per_cpu(memcg->stat->count[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&memcg->pcp_counter_lock);
-	val += memcg->nocpu_base.count[idx];
-	spin_unlock(&memcg->pcp_counter_lock);
-#endif
-	put_online_cpus();
+	/*
+	 * Summing races with updates, so val may be negative.  Avoid exposing
+	 * transient negative values.
+	 */
+	if (val < 0)
+		val = 0;
 	return val;
 }
 
-static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
-					 bool charge)
+static void mem_cgroup_update_swap_max(struct mem_cgroup *memcg)
 {
-	int val = (charge) ? 1 : -1;
-	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+	long long swap;
+
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		swap = page_counter_read(&memcg->memsw) -
+			page_counter_read(&memcg->memory);
+
+		/* This is racy, but we don't have to be absolutely precise */
+		if (swap > (long long)memcg->swap_max)
+			memcg->swap_max = swap;
+	}
+}
+
+static void mem_cgroup_inc_failcnt(struct mem_cgroup *memcg,
+				   gfp_t gfp_mask, unsigned int nr_pages)
+{
+	unsigned long margin = 0;
+	unsigned long count;
+	unsigned long limit;
+
+	if (gfp_mask & __GFP_NOWARN)
+		return;
+
+	atomic_long_inc(&memcg->mem_failcnt);
+	count = page_counter_read(&memcg->memsw);
+	limit = ACCESS_ONCE(memcg->memsw.limit);
+	if (count < limit)
+		margin = limit - count;
+
+	if (do_swap_account && margin < nr_pages)
+		atomic_long_inc(&memcg->swap_failcnt);
 }
 
 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
@@ -921,19 +997,14 @@ static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 	unsigned long val = 0;
 	int cpu;
 
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		val += per_cpu(memcg->stat->events[idx], cpu);
-#ifdef CONFIG_HOTPLUG_CPU
-	spin_lock(&memcg->pcp_counter_lock);
-	val += memcg->nocpu_base.events[idx];
-	spin_unlock(&memcg->pcp_counter_lock);
-#endif
 	return val;
 }
 
 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 					 struct page *page,
-					 bool anon, int nr_pages)
+					 int nr_pages)
 {
 	preempt_disable();
 
@@ -941,12 +1012,16 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 	 * counted as CACHE even if it's on ANON LRU.
 	 */
-	if (anon)
+	if (PageAnon(page))
 		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 				nr_pages);
-	else
+	else {
 		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 				nr_pages);
+		if (PageSwapBacked(page))
+			__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SHMEM],
+				       nr_pages);
+	}
 
 	if (PageTransHuge(page))
 		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
@@ -1050,7 +1125,6 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
  */
 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 {
-	preempt_disable();
 	/* threshold event is triggered in finer grain than soft limit */
 	if (unlikely(mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_THRESH))) {
@@ -1063,8 +1137,6 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		do_numainfo = mem_cgroup_event_ratelimit(memcg,
 						MEM_CGROUP_TARGET_NUMAINFO);
 #endif
-		preempt_enable();
-
 		mem_cgroup_threshold(memcg);
 		if (unlikely(do_softlimit))
 			mem_cgroup_update_tree(memcg, page);
@@ -1072,8 +1144,7 @@ static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 		if (unlikely(do_numainfo))
 			atomic_inc(&memcg->numainfo_events);
 #endif
-	} else
-		preempt_enable();
+	}
 }
 
 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
@@ -1095,22 +1166,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 	return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
 }
 
-struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *memcg = NULL;
 
-	if (!mm)
-		return NULL;
-	/*
-	 * Because we have no locks, mm->owner's may be being moved to other
-	 * cgroup. We use css_tryget() here even if this looks
-	 * pessimistic (rather than adding locks here).
-	 */
 	rcu_read_lock();
 	do {
-		memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-		if (unlikely(!memcg))
-			break;
+		/*
+		 * Page cache insertions can happen without an
+		 * actual mm context, e.g. during disk probing
+		 * on boot, loopback IO, acct() writes etc.
+		 */
+		if (unlikely(!mm))
+			memcg = root_mem_cgroup;
+		else {
+			memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+			if (unlikely(!memcg))
+				memcg = root_mem_cgroup;
+		}
 	} while (!css_tryget(&memcg->css));
 	rcu_read_unlock();
 	return memcg;
@@ -1296,6 +1369,19 @@ void mem_cgroup_iter_break(struct mem_cgroup *root,
 	     iter != NULL;				\
 	     iter = mem_cgroup_iter(NULL, iter, NULL))
 
+void mem_cgroup_get_nr_pages(struct mem_cgroup *memcg, int nid,
+			     unsigned long *pages)
+{
+	struct mem_cgroup *iter;
+	int i;
+
+	for_each_mem_cgroup_tree(iter, memcg) {
+		for (i = 0; i < NR_LRU_LISTS; i++)
+			pages[i] += mem_cgroup_node_nr_lru_pages(iter, nid,
+								 BIT(i));
+	}
+}
+
 void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
 {
 	struct mem_cgroup *memcg;
@@ -1353,20 +1439,6 @@ out:
 	return lruvec;
 }
 
-/*
- * Following LRU functions are allowed to be used without PCG_LOCK.
- * Operations are called by routine of global LRU independently from memcg.
- * What we have to take care of here is validness of pc->mem_cgroup.
- *
- * Changes to pc->mem_cgroup happens when
- * 1. charge
- * 2. moving account
- * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
- * It is added to LRU before charge.
- * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
- * When moving account, the page is not on LRU. It's isolated.
- */
-
 /**
  * mem_cgroup_page_lruvec - return lruvec for adding an lru page
  * @page: the page
@@ -1469,7 +1541,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 
 	p = find_lock_task_mm(task);
 	if (p) {
-		curr = try_get_mem_cgroup_from_mm(p->mm);
+		curr = get_mem_cgroup_from_mm(p->mm);
 		task_unlock(p);
 	} else {
 		/*
@@ -1483,8 +1555,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 			css_get(&curr->css);
 		task_unlock(task);
 	}
-	if (!curr)
-		return 0;
 	/*
 	 * We should check use_hierarchy of "memcg" not "curr". Because checking
 	 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -1515,6 +1585,156 @@ int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec)
 	return inactive * inactive_ratio < active;
 }
 
+bool mem_cgroup_dcache_is_low(struct mem_cgroup *memcg)
+{
+	unsigned long anon, file, dcache;
+
+	anon = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_RSS);
+	file = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	dcache = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
+
+	return dcache / sysctl_vfs_cache_min_ratio <
+			(anon + file + dcache) / 100;
+}
+
+/**
+ * mem_cgroup_low - check if memory consumption is below the normal range
+ * @root: the highest ancestor to consider
+ * @memcg: the memory cgroup to check
+ *
+ * Returns %true if memory consumption of @memcg, and that of all
+ * configurable ancestors up to @root, is below the normal range.
+ */
+bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return false;
+
+	/*
+	 * The toplevel group doesn't have a configurable range, so
+	 * it's never low when looked at directly, and it is not
+	 * considered an ancestor when assessing the hierarchy.
+	 */
+
+	if (memcg == root_mem_cgroup)
+		return false;
+
+	if (page_counter_read(&memcg->memory) >= memcg->low)
+		return false;
+
+	/*
+	 * XXX: It's OK to set memory.low for a cgroup to infinity. This might
+	 * be useful if no tasks are supposed to run inside the cgroup itself,
+	 * but only in its sub-cgroups (e.g. /machine.slice). In this case
+	 * protection against memory pressure originating on upper levels will
+	 * be guarded solely by memory.low configuration in sub-cgroups.
+	 *
+	 * However, in the current implementation, in contrast to mainstream,
+	 * charges can appear in a cgroup even if there's no tasks in it - they
+	 * can be reparented from a dead sub-cgroup. If the cgroup has
+	 * memory.low set to inf, such reparented charges will not get
+	 * reclaimed normally on memory pressure, resulting in performance
+	 * degradation in other cgroups. To avoid that, let's ignore memory.low
+	 * for cgroups w/o tasks.
+	 */
+	if (cgroup_task_count(memcg->css.cgroup) == 0)
+		return false;
+
+	while (memcg != root) {
+		memcg = parent_mem_cgroup(memcg);
+		if (!memcg)
+			break;
+
+		if (memcg == root_mem_cgroup)
+			break;
+
+		if (page_counter_read(&memcg->memory) >= memcg->low)
+			return false;
+	}
+	return true;
+}
+
+#ifdef CONFIG_CLEANCACHE
+bool mem_cgroup_cleancache_disabled(struct page *page)
+{
+	struct page_cgroup *pc;
+	bool ret = false;
+
+	if (mem_cgroup_disabled())
+		return false;
+
+	pc = lookup_page_cgroup(page);
+	if (likely(PageCgroupUsed(pc)))
+		ret = pc->mem_cgroup->cleancache_disabled;
+	return ret;
+}
+#endif
+
+void mem_cgroup_note_oom_kill(struct mem_cgroup *root_memcg,
+			      struct task_struct *task)
+{
+	struct mem_cgroup *memcg, *memcg_to_put;
+	struct task_struct *p;
+
+	if (!root_memcg)
+		root_memcg = root_mem_cgroup;
+
+	p = find_lock_task_mm(task);
+	if (p) {
+		memcg = get_mem_cgroup_from_mm(p->mm);
+		task_unlock(p);
+	} else {
+		rcu_read_lock();
+		memcg = mem_cgroup_from_task(task);
+		css_get(&memcg->css);
+		rcu_read_unlock();
+	}
+	memcg_to_put = memcg;
+	if (!memcg || !mem_cgroup_same_or_subtree(root_memcg, memcg))
+		memcg = root_memcg;
+
+	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
+		atomic_long_inc(&memcg->oom_kill_cnt);
+		if (memcg == root_memcg)
+			break;
+	}
+
+	if (memcg_to_put)
+		css_put(&memcg_to_put->css);
+}
+
+struct oom_context *mem_cgroup_oom_context(struct mem_cgroup *memcg)
+{
+	if (mem_cgroup_disabled())
+		return &global_oom_ctx;
+	if (!memcg)
+		memcg = root_mem_cgroup;
+	return &memcg->oom_ctx;
+}
+
+unsigned long mem_cgroup_overdraft(struct mem_cgroup *memcg)
+{
+	unsigned long long guarantee, usage;
+
+	if (mem_cgroup_disabled() || mem_cgroup_is_root(memcg))
+		return 0;
+
+	guarantee = ACCESS_ONCE(memcg->oom_guarantee);
+	usage = page_counter_read(&memcg->memsw);
+	return div64_u64(1000 * usage, guarantee + 1);
+}
+
+unsigned long mem_cgroup_total_pages(struct mem_cgroup *memcg, bool swap)
+{
+	unsigned long limit;
+
+	limit = swap ? memcg->memsw.limit : memcg->memory.limit;
+	return min_t(unsigned long, PAGE_COUNTER_MAX, limit);
+}
+
 #define mem_cgroup_from_counter(counter, member)	\
 	container_of(counter, struct mem_cgroup, member)
 
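
mem_cgroup_low() above is meant to be consulted from reclaim: a cgroup
whose usage, and that of every configurable ancestor up to the reclaim
root, sits below memory.low should normally be skipped.  A hedged sketch
of such an iteration; the sc->may_thrash escape hatch is an assumption
about the caller (mirroring how mainline consumes this check), not part
of this patch:

	struct mem_cgroup *memcg;

	memcg = mem_cgroup_iter(root, NULL, NULL);
	do {
		/* Spare cgroups below their normal range unless every
		 * other cgroup has already been squeezed dry. */
		if (mem_cgroup_low(root, memcg) && !sc->may_thrash)
			continue;

		/* ... shrink this memcg's LRU lists ... */
	} while ((memcg = mem_cgroup_iter(root, memcg, NULL)));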
@@ -1679,13 +1899,13 @@ static void move_unlock_mem_cgroup(struct mem_cgroup *memcg,
  */
 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 {
-	struct cgroup *task_cgrp;
-	struct cgroup *mem_cgrp;
 	/*
-	 * Need a buffer in BSS, can't rely on allocations. The code relies
-	 * on the assumption that OOM is serialized for memory controller.
-	 * If this assumption is broken, revisit this code.
+	 * protects memcg_name and makes sure that parallel ooms do not
+	 * interleave
 	 */
+	static DEFINE_MUTEX(oom_info_lock);
+	struct cgroup *task_cgrp;
+	struct cgroup *mem_cgrp;
 	static char memcg_name[PATH_MAX];
 	int ret;
 	struct mem_cgroup *iter;
@@ -1694,6 +1914,7 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
 	if (!p)
 		return;
 
+	mutex_lock(&oom_info_lock);
 	rcu_read_lock();
 
 	mem_cgrp = memcg->css.cgroup;
@@ -1749,7 +1970,7 @@ done:
 		for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 			if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 				continue;
-			pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i],
+			pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
 				K(mem_cgroup_read_stat(iter, i)));
 		}
 
@@ -1759,6 +1980,7 @@ done:
 
 		pr_cont("\n");
 	}
+	mutex_unlock(&oom_info_lock);
 }
 
 /*
@@ -1796,9 +2018,11 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 				     int order)
 {
 	struct mem_cgroup *iter;
+	unsigned long max_overdraft = 0;
 	unsigned long chosen_points = 0;
 	unsigned long totalpages;
-	unsigned int points = 0;
+	unsigned long overdraft;
+	unsigned long points = 0;
 	struct task_struct *chosen = NULL;
 
 	/*
@@ -1807,7 +2031,7 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * quickly exit and free its memory.
 	 */
 	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
-		set_thread_flag(TIF_MEMDIE);
+		mark_oom_victim(current);
 		return;
 	}
 
@@ -1820,32 +2044,27 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 		cgroup_iter_start(cgroup, &it);
 		while ((task = cgroup_iter_next(cgroup, &it))) {
-			switch (oom_scan_process_thread(task, totalpages, NULL,
-							false)) {
+			switch (oom_scan_process_thread(task, NULL)) {
 			case OOM_SCAN_SELECT:
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
 				chosen_points = ULONG_MAX;
+				max_overdraft = ULONG_MAX;
 				get_task_struct(chosen);
 				/* fall through */
 			case OOM_SCAN_CONTINUE:
 				continue;
-			case OOM_SCAN_ABORT:
-				cgroup_iter_end(cgroup, &it);
-				mem_cgroup_iter_break(memcg, iter);
-				if (chosen)
-					put_task_struct(chosen);
-				return;
 			case OOM_SCAN_OK:
 				break;
 			};
-			points = oom_badness(task, memcg, NULL, totalpages);
-			if (points > chosen_points) {
+			points = oom_badness(task, memcg, NULL, totalpages,
+					     &overdraft);
+			if (oom_worse(points, overdraft, &chosen_points,
+				      &max_overdraft)) {
 				if (chosen)
 					put_task_struct(chosen);
 				chosen = task;
-				chosen_points = points;
 				get_task_struct(chosen);
 			}
 		}
@@ -1854,8 +2073,8 @@ static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
 
 	if (!chosen)
 		return;
-	points = chosen_points * 1000 / totalpages;
-	oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg,
+	oom_kill_process(chosen, gfp_mask, order, chosen_points, max_overdraft,
+			 totalpages, memcg,
 			 NULL, "Memory cgroup out of memory");
 }
 
@@ -1875,7 +2094,11 @@ static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg,
 	for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) {
 		if (loop)
 			drain_all_stock_async(memcg);
-		total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap);
+		total += try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+						      gfp_mask, noswap);
+		if (test_thread_flag(TIF_MEMDIE) ||
+		    fatal_signal_pending(current))
+			return 1;
 		/*
 		 * Allow limit shrinkers, which are triggered directly
 		 * by userspace, to catch signals and stop reclaim
@@ -2085,60 +2308,6 @@ static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
 	return total;
 }
 
-static DEFINE_SPINLOCK(memcg_oom_lock);
-
-/*
- * Check OOM-Killer is already running under our hierarchy.
- * If someone is running, return false.
- */
-static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter, *failed = NULL;
-
-	spin_lock(&memcg_oom_lock);
-
-	for_each_mem_cgroup_tree(iter, memcg) {
-		if (iter->oom_lock) {
-			/*
-			 * this subtree of our hierarchy is already locked
-			 * so we cannot give a lock.
-			 */
-			failed = iter;
-			mem_cgroup_iter_break(memcg, iter);
-			break;
-		} else
-			iter->oom_lock = true;
-	}
-
-	if (failed) {
-		/*
-		 * OK, we failed to lock the whole subtree so we have
-		 * to clean up what we set up to the failing subtree
-		 */
-		for_each_mem_cgroup_tree(iter, memcg) {
-			if (iter == failed) {
-				mem_cgroup_iter_break(memcg, iter);
-				break;
-			}
-			iter->oom_lock = false;
-		}
-	}
-
-	spin_unlock(&memcg_oom_lock);
-
-	return !failed;
-}
-
-static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
-{
-	struct mem_cgroup *iter;
-
-	spin_lock(&memcg_oom_lock);
-	for_each_mem_cgroup_tree(iter, memcg)
-		iter->oom_lock = false;
-	spin_unlock(&memcg_oom_lock);
-}
-
 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup *iter;
@@ -2200,10 +2369,27 @@ static void memcg_oom_recover(struct mem_cgroup *memcg)
 		memcg_wakeup_oom(memcg);
 }
 
-static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+static void memcg_wait_oom_recover(struct mem_cgroup *memcg)
 {
-	if (!current->memcg_oom.may_oom)
-		return;
+	struct oom_wait_info owait;
+
+	owait.memcg = memcg;
+	owait.wait.flags = 0;
+	owait.wait.func = memcg_oom_wake_function;
+	owait.wait.private = current;
+	INIT_LIST_HEAD(&owait.wait.task_list);
+
+	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
+	schedule();
+	finish_wait(&memcg_oom_waitq, &owait.wait);
+
+	memcg_wakeup_oom(memcg);
+}
+
+static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
+{
+	if (!current->memcg_oom.may_oom)
+		return;
 	/*
 	 * We are in the middle of the charge context here, so we
 	 * don't want to block when potentially sitting on a callstack
@@ -2244,8 +2430,6 @@ static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
 bool mem_cgroup_oom_synchronize(bool handle)
 {
 	struct mem_cgroup *memcg = current->memcg_oom.memcg;
-	struct oom_wait_info owait;
-	bool locked;
 
 	/* OOM is global, do not handle */
 	if (!memcg)
@@ -2254,40 +2438,19 @@ bool mem_cgroup_oom_synchronize(bool handle)
 	if (!handle)
 		goto cleanup;
 
-	owait.memcg = memcg;
-	owait.wait.flags = 0;
-	owait.wait.func = memcg_oom_wake_function;
-	owait.wait.private = current;
-	INIT_LIST_HEAD(&owait.wait.task_list);
-
-	prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
 	mem_cgroup_mark_under_oom(memcg);
-
-	locked = mem_cgroup_oom_trylock(memcg);
-
-	if (locked)
+	if (oom_trylock(memcg)) {
 		mem_cgroup_oom_notify(memcg);
-
-	if (locked && !memcg->oom_kill_disable) {
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
-		mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask,
-					 current->memcg_oom.order);
-	} else {
-		schedule();
-		mem_cgroup_unmark_under_oom(memcg);
-		finish_wait(&memcg_oom_waitq, &owait.wait);
+		if (memcg->oom_kill_disable)
+			memcg_wait_oom_recover(memcg);
+		else
+			mem_cgroup_out_of_memory(memcg,
+						 current->memcg_oom.gfp_mask,
+						 current->memcg_oom.order);
+		oom_unlock(memcg);
 	}
+	mem_cgroup_unmark_under_oom(memcg);
 
-	if (locked) {
-		mem_cgroup_oom_unlock(memcg);
-		/*
-		 * There is no guarantee that an OOM-lock contender
-		 * sees the wakeups triggered by the OOM kill
-		 * uncharges.  Wake any sleepers explicitely.
-		 */
-		memcg_oom_recover(memcg);
-	}
 cleanup:
 	current->memcg_oom.memcg = NULL;
 	css_put(&memcg->css);
@@ -2300,22 +2463,14 @@ cleanup:
  *
  * Notes: Race condition
  *
- * We usually use page_cgroup_lock() for accessing page_cgroup member but
- * it tends to be costly. But considering some conditions, we doesn't need
- * to do so _always_.
- *
- * Considering "charge", lock_page_cgroup() is not required because all
- * file-stat operations happen after a page is attached to radix-tree. There
- * are no race with "charge".
+ * Charging occurs during page instantiation, while the page is
+ * unmapped and locked in page migration, or while the page table is
+ * locked in THP migration.  No race is possible.
  *
- * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup
- * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even
- * if there are race with "uncharge". Statistics itself is properly handled
- * by flags.
+ * Uncharge happens only to pages with zero references, so no race is
+ * possible.
  *
- * Considering "move", this is an only case we see a race. To make the race
- * small, we check mm->moving_account and detect there are possibility of race
- * If there is, we take a lock.
+ * Charge moving between groups is protected by checking the memcg's
+ * moving_account counter and taking the move_lock in the slowpath.
  */
 
 void __mem_cgroup_begin_update_page_stat(struct page *page,
@@ -2547,37 +2702,12 @@ static void drain_all_stock_sync(struct mem_cgroup *root_memcg)
 	mutex_unlock(&percpu_charge_mutex);
 }
 
-/*
- * This function drains percpu counter value from DEAD cpu and
- * move it to local cpu. Note that this function can be preempted.
- */
-static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu)
-{
-	int i;
-
-	spin_lock(&memcg->pcp_counter_lock);
-	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-		long x = per_cpu(memcg->stat->count[i], cpu);
-
-		per_cpu(memcg->stat->count[i], cpu) = 0;
-		memcg->nocpu_base.count[i] += x;
-	}
-	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
-		unsigned long x = per_cpu(memcg->stat->events[i], cpu);
-
-		per_cpu(memcg->stat->events[i], cpu) = 0;
-		memcg->nocpu_base.events[i] += x;
-	}
-	spin_unlock(&memcg->pcp_counter_lock);
-}
-
 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 					unsigned long action,
 					void *hcpu)
 {
 	int cpu = (unsigned long)hcpu;
 	struct memcg_stock_pcp *stock;
-	struct mem_cgroup *iter;
 
 	if (action == CPU_ONLINE)
 		return NOTIFY_OK;
@@ -2585,62 +2715,84 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
 		return NOTIFY_OK;
 
-	for_each_mem_cgroup(iter)
-		mem_cgroup_drain_pcp_counter(iter, cpu);
-
 	stock = &per_cpu(memcg_stock, cpu);
 	drain_stock(stock);
 	return NOTIFY_OK;
 }
 
-
-/* See __mem_cgroup_try_charge() for details */
-enum {
-	CHARGE_OK,		/* success */
-	CHARGE_RETRY,		/* need to retry but retry is not bad */
-	CHARGE_NOMEM,		/* we can't do more. return -ENOMEM */
-	CHARGE_WOULDBLOCK,	/* GFP_WAIT wasn't set and no enough res. */
-};
-
-static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-				unsigned int nr_pages, unsigned int min_pages,
-				bool invoke_oom)
+/**
+ * mem_cgroup_try_charge - try charging a memcg
+ * @memcg: memcg to charge
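+ * @gfp_mask: reclaim mode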
+ * @nr_pages: number of pages to charge
+ *
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
+ */
+static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
+		      unsigned int nr_pages)
 {
+	unsigned int batch = max(CHARGE_BATCH, nr_pages);
+	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 	struct mem_cgroup *mem_over_limit;
 	struct page_counter *counter;
+	unsigned long nr_reclaimed;
 	unsigned long flags = 0;
-	int ret;
 
-	ret = page_counter_try_charge(&memcg->memory, nr_pages, &counter);
+	if (mem_cgroup_is_root(memcg))
+		goto done;
+retry:
+	if (consume_stock(memcg, nr_pages))
+		goto done;
 
-	if (likely(!ret)) {
+	if (!page_counter_try_charge(&memcg->memory, batch, &counter)) {
 		if (!do_swap_account)
-			return CHARGE_OK;
-		ret = page_counter_try_charge(&memcg->memsw, nr_pages, &counter);
-		if (likely(!ret))
-			return CHARGE_OK;
-
-		page_counter_uncharge(&memcg->memory, nr_pages);
+			goto done_restock;
+		if (!page_counter_try_charge(&memcg->memsw, batch, &counter))
+			goto done_restock;
+		page_counter_uncharge(&memcg->memory, batch);
 		mem_over_limit = mem_cgroup_from_counter(counter, memsw);
 		flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 	} else
 		mem_over_limit = mem_cgroup_from_counter(counter, memory);
+
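+	/*
+	 * The charge was batched to refill the per-cpu stock; if even
+	 * the batch does not fit, retry with the actual request size
+	 * before resorting to reclaim.
+	 */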
+	if (batch > nr_pages) {
+		batch = nr_pages;
+		goto retry;
+	}
+
+	/*
+	 * Unlike in global OOM situations, memcg is not in a physical
+	 * memory shortage.  Allow dying and OOM-killed tasks to
+	 * bypass the last charges so that they can exit quickly and
+	 * free their memory.
+	 */
+	if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+		     fatal_signal_pending(current) ||
+		     current->flags & PF_EXITING))
+		goto bypass;
+
 	/*
-	 * Never reclaim on behalf of optional batching, retry with a
-	 * single page instead.
+	 * Prevent unbounded recursion when reclaim operations need to
+	 * allocate memory. This might exceed the limits temporarily,
+	 * but we prefer facilitating memory reclaim and getting back
+	 * under the limit over triggering OOM kills in these cases.
 	 */
-	if (nr_pages > min_pages)
-		return CHARGE_RETRY;
+	if (unlikely(current->flags & PF_MEMALLOC))
+		goto bypass;
+
+	if (unlikely(task_in_memcg_oom(current)))
+		goto nomem;
 
 	if (!(gfp_mask & __GFP_WAIT))
-		return CHARGE_WOULDBLOCK;
+		goto nomem;
 
-	if (gfp_mask & __GFP_NORETRY)
-		return CHARGE_NOMEM;
+	nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
 
-	ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
-	if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
-		return CHARGE_RETRY;
+	if (mem_cgroup_margin(mem_over_limit) >= batch)
+		goto retry;
+
+	if (gfp_mask & __GFP_NORETRY)
+		goto nomem;
 	/*
 	 * Even though the limit is exceeded at this point, reclaim
 	 * may have been able to free some pages.  Retry the charge
@@ -2650,192 +2802,43 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
 	 * unlikely to succeed so close to the limit, and we fall back
 	 * to regular pages anyway in case of failure.
 	 */
-	if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
-		return CHARGE_RETRY;
-
+	if (nr_reclaimed && batch <= (1 << PAGE_ALLOC_COSTLY_ORDER))
+		goto retry;
 	/*
 	 * At task move, charge accounts can be doubly counted. So, it's
 	 * better to wait until the end of task_move if something is going on.
 	 */
 	if (mem_cgroup_wait_acct_move(mem_over_limit))
-		return CHARGE_RETRY;
-
-	if (invoke_oom)
-		mem_cgroup_oom(mem_over_limit, gfp_mask,
-			       get_order(nr_pages * PAGE_SIZE));
-
-	return CHARGE_NOMEM;
-}
+		goto retry;
 
-/*
- * __mem_cgroup_try_charge() does
- * 1. detect memcg to be charged against from passed *mm and *ptr,
- * 2. update page_counter
- * 3. call memory reclaim if necessary.
- *
- * In some special case, if the task is fatal, fatal_signal_pending() or
- * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
- * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
- * as possible without any hazards. 2: all pages should have a valid
- * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
- * pointer, that is treated as a charge to root_mem_cgroup.
- *
- * So __mem_cgroup_try_charge() will return
- *  0       ...  on success, filling *ptr with a valid memcg pointer.
- *  -ENOMEM ...  charge failure because of resource limits.
- *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
- *
- * Unlike the exported interface, an "oom" parameter is added. if oom==true,
- * the oom-killer can be invoked.
- */
-static int __mem_cgroup_try_charge(struct mm_struct *mm,
-				   gfp_t gfp_mask,
-				   unsigned int nr_pages,
-				   struct mem_cgroup **ptr,
-				   bool oom)
-{
-	unsigned int batch = max(CHARGE_BATCH, nr_pages);
-	int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct mem_cgroup *memcg = NULL;
-	int ret;
+	if (nr_retries--)
+		goto retry;
 
-	/*
-	 * Unlike gloval-vm's OOM-kill, we're not in memory shortage
-	 * in system level. So, allow to go ahead dying process in addition to
-	 * MEMDIE process.
-	 */
-	if (unlikely(test_thread_flag(TIF_MEMDIE)
-		     || fatal_signal_pending(current)))
+	if (gfp_mask & __GFP_NOFAIL)
 		goto bypass;
 
-	/*
-	 * Prevent unbounded recursion when reclaim operations need to
-	 * allocate memory. This might exceed the limits temporarily,
-	 * but we prefer facilitating memory reclaim and getting back
-	 * under the limit over triggering OOM kills in these cases.
-	 */
-	if (unlikely(current->flags & PF_MEMALLOC))
+	if (fatal_signal_pending(current))
 		goto bypass;
 
-	if (unlikely(task_in_memcg_oom(current)))
-		goto nomem;
-
-	if (gfp_mask & __GFP_NOFAIL)
-		oom = false;
-
-	/*
-	 * We always charge the cgroup the mm_struct belongs to.
-	 * The mm_struct's mem_cgroup changes on task migration if the
-	 * thread group leader migrates. It's possible that mm is not
-	 * set, if so charge the root memcg (happens for pagecache usage).
-	 */
-	if (!*ptr && !mm)
-		*ptr = root_mem_cgroup;
-again:
-	if (*ptr) { /* css should be a valid one */
-		memcg = *ptr;
-		if (mem_cgroup_is_root(memcg))
-			goto done;
-		if (consume_stock(memcg, nr_pages))
-			goto done;
-		css_get(&memcg->css);
-	} else {
-		struct task_struct *p;
-
-		rcu_read_lock();
-		p = rcu_dereference(mm->owner);
-		/*
-		 * Because we don't have task_lock(), "p" can exit.
-		 * In that case, "memcg" can point to root or p can be NULL with
-		 * race with swapoff. Then, we have small risk of mis-accouning.
-		 * But such kind of mis-account by race always happens because
-		 * we don't have cgroup_mutex(). It's overkill and we allo that
-		 * small race, here.
-		 * (*) swapoff at el will charge against mm-struct not against
-		 * task-struct. So, mm->owner can be NULL.
-		 */
-		memcg = mem_cgroup_from_task(p);
-		if (!memcg)
-			memcg = root_mem_cgroup;
-		if (mem_cgroup_is_root(memcg)) {
-			rcu_read_unlock();
-			goto done;
-		}
-		if (consume_stock(memcg, nr_pages)) {
-			/*
-			 * It seems dagerous to access memcg without css_get().
-			 * But considering how consume_stok works, it's not
-			 * necessary. If consume_stock success, some charges
-			 * from this memcg are cached on this cpu. So, we
-			 * don't need to call css_get()/css_tryget() before
-			 * calling consume_stock().
-			 */
-			rcu_read_unlock();
-			goto done;
-		}
-		/* after here, we may be blocked. we need to get refcnt */
-		if (!css_tryget(&memcg->css)) {
-			rcu_read_unlock();
-			goto again;
-		}
-		rcu_read_unlock();
-	}
-
-	do {
-		bool invoke_oom = oom && !nr_oom_retries;
+	mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch * PAGE_SIZE));
 
-		/* If killed, bypass charge */
-		if (fatal_signal_pending(current)) {
-			css_put(&memcg->css);
-			goto bypass;
-		}
+nomem:
+	mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
 
-		ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
-					   nr_pages, invoke_oom);
-		switch (ret) {
-		case CHARGE_OK:
-			break;
-		case CHARGE_RETRY: /* not in OOM situation but retry */
-			batch = nr_pages;
-			css_put(&memcg->css);
-			memcg = NULL;
-			goto again;
-		case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
-			css_put(&memcg->css);
-			goto nomem;
-		case CHARGE_NOMEM: /* OOM routine works */
-			if (!oom || invoke_oom) {
-				css_put(&memcg->css);
-				goto nomem;
-			}
-			nr_oom_retries--;
-			break;
-		}
-	} while (ret != CHARGE_OK);
+	if (!(gfp_mask & __GFP_NOFAIL))
+		return -ENOMEM;
+bypass:
+	return -EINTR;
 
+done_restock:
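+	/*
+	 * The charge overshot the request in order to refill the
+	 * per-cpu stock; cache the surplus so subsequent charges can
+	 * be served without touching the page counters.
+	 */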
 	if (batch > nr_pages)
 		refill_stock(memcg, batch - nr_pages);
-	css_put(&memcg->css);
 done:
-	*ptr = memcg;
 	return 0;
-nomem:
-	if (!(gfp_mask & __GFP_NOFAIL)) {
-		*ptr = NULL;
-		return -ENOMEM;
-	}
-bypass:
-	*ptr = root_mem_cgroup;
-	return -EINTR;
 }
 
-/*
- * Somemtimes we have to undo a charge we got by try_charge().
- * This function is for that and do uncharge, put css's refcnt.
- * gotten by try_charge().
- */
-static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
-				       unsigned int nr_pages)
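+/* Undo a successful try_charge(). */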
+static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
 {
 	if (!mem_cgroup_is_root(memcg)) {
 		page_counter_uncharge(&memcg->memory, nr_pages);
@@ -2863,6 +2866,16 @@ static struct mem_cgroup *mem_cgroup_lookup(unsigned short id)
 	return mem_cgroup_from_css(css);
 }
 
+/*
+ * try_get_mem_cgroup_from_page - look up page's memcg association
+ * @page: the page
+ *
+ * Look up, get a css reference, and return the memcg that owns @page.
+ *
+ * The page must be locked to prevent racing with swap-in and page
+ * cache charges.  If coming from an unlocked page table, the caller
+ * must ensure the page is on the LRU or this can race with charging.
+ */
 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 {
 	struct mem_cgroup *memcg = NULL;
@@ -2873,7 +2886,6 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
 		if (memcg && !css_tryget(&memcg->css))
@@ -2887,23 +2899,46 @@ struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
 			memcg = NULL;
 		rcu_read_unlock();
 	}
-	unlock_page_cgroup(pc);
 	return memcg;
 }
 
-static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
-				       struct page *page,
-				       unsigned int nr_pages,
-				       enum charge_type ctype,
-				       bool lrucare)
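+/*
+ * Isolate @page from its LRU list, if it is on one, and hold
+ * zone->lru_lock until the matching unlock_page_lru() call.
+ * *isolated tells the caller whether the page needs to be put back.
+ */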
+static void lock_page_lru(struct page *page, int *isolated)
+{
+	struct zone *zone = page_zone(page);
+
+	spin_lock_irq(&zone->lru_lock);
+	if (PageLRU(page)) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+		ClearPageLRU(page);
+		del_page_from_lru_list(page, lruvec, page_lru(page));
+		*isolated = 1;
+	} else
+		*isolated = 0;
+}
+
+static void unlock_page_lru(struct page *page, int isolated)
+{
+	struct zone *zone = page_zone(page);
+
+	if (isolated) {
+		struct lruvec *lruvec;
+
+		lruvec = mem_cgroup_page_lruvec(page, zone);
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+		SetPageLRU(page);
+		add_page_to_lru_list(page, lruvec, page_lru(page));
+	}
+	spin_unlock_irq(&zone->lru_lock);
+}
+
+static void commit_charge(struct page *page, struct mem_cgroup *memcg,
+			  unsigned int nr_pages, bool lrucare)
 {
 	struct page_cgroup *pc = lookup_page_cgroup(page);
-	struct zone *uninitialized_var(zone);
-	struct lruvec *lruvec;
-	bool was_on_lru = false;
-	bool anon;
+	int isolated;
 
-	lock_page_cgroup(pc);
 	VM_BUG_ON_PAGE(PageCgroupUsed(pc), page);
 	/*
 	 * we don't need page_cgroup_lock about tail pages, becase they are not
@@ -2914,165 +2949,194 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
 	 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
 	 * may already be on some other mem_cgroup's LRU.  Take care of it.
 	 */
-	if (lrucare) {
-		zone = page_zone(page);
-		spin_lock_irq(&zone->lru_lock);
-		if (PageLRU(page)) {
-			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
-			ClearPageLRU(page);
-			del_page_from_lru_list(page, lruvec, page_lru(page));
-			was_on_lru = true;
-		}
-	}
+	if (lrucare)
+		lock_page_lru(page, &isolated);
 
-	pc->mem_cgroup = memcg;
 	/*
-	 * We access a page_cgroup asynchronously without lock_page_cgroup().
-	 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup
-	 * is accessed after testing USED bit. To make pc->mem_cgroup visible
-	 * before USED bit, we need memory barrier here.
-	 * See mem_cgroup_add_lru_list(), etc.
- 	 */
-	smp_wmb();
-	SetPageCgroupUsed(pc);
-
-	if (lrucare) {
-		if (was_on_lru) {
-			lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup);
-			VM_BUG_ON_PAGE(PageLRU(page), page);
-			SetPageLRU(page);
-			add_page_to_lru_list(page, lruvec, page_lru(page));
-		}
-		spin_unlock_irq(&zone->lru_lock);
-	}
-
-	if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON)
-		anon = true;
-	else
-		anon = false;
+	 * Nobody should be changing or seriously looking at
+	 * pc->mem_cgroup and pc->flags at this point:
+	 *
+	 * - the page is uncharged
+	 *
+	 * - the page is off-LRU
+	 *
+	 * - an anonymous fault has exclusive page access, except for
+	 *   a locked page table
+	 *
+	 * - a page cache insertion, a swapin fault, or a migration
+	 *   have the page locked
+	 */
+	pc->mem_cgroup = memcg;
+	pc->flags = PCG_USED | PCG_MEM | (do_swap_account ? PCG_MEMSW : 0);
 
-	mem_cgroup_charge_statistics(memcg, page, anon, nr_pages);
-	unlock_page_cgroup(pc);
+	if (lrucare)
+		unlock_page_lru(page, isolated);
 
+	local_irq_disable();
+	mem_cgroup_charge_statistics(memcg, page, nr_pages);
 	/*
 	 * "charge_statistics" updated event counter. Then, check it.
 	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
 	 * if they exceeds softlimit.
 	 */
 	memcg_check_events(memcg, page);
+	local_irq_enable();
 }
 
 #ifdef CONFIG_MEMCG_KMEM
-static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg)
-{
-	return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) &&
-		(memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK);
-}
-
-/*
- * This is a bit cumbersome, but it is rarely used and avoids a backpointer
- * in the memcg_cache_params struct.
- */
-static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p)
-{
-	struct kmem_cache *cachep;
 
-	VM_BUG_ON(p->is_root_cache);
-	cachep = p->root_cache;
-	return cachep->memcg_params->memcg_caches[memcg_cache_id(p->memcg)];
-}
+static DEFINE_MUTEX(activate_kmem_mutex);
 
 #ifdef CONFIG_SLABINFO
 static int mem_cgroup_slabinfo_read(struct cgroup *cont, struct cftype *cft,
 					struct seq_file *m)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
-	struct memcg_cache_params *params;
-
-	if (!memcg_can_account_kmem(memcg))
-		return -EIO;
+	loff_t pos = 0;
+	void *p;
 
-	print_slabinfo_header(m);
-
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_for_each_entry(params, &memcg->memcg_slab_caches, list)
-		cache_show(memcg_params_to_cache(params), m);
-	mutex_unlock(&memcg->slab_caches_mutex);
+	for (p = slab_start(m, &pos); p; p = slab_next(m, p, &pos))
+		memcg_slab_show(memcg, m, p);
+	slab_stop(m, p);
 
 	return 0;
 }
 #endif
 
-static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
+int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
 			     unsigned long nr_pages)
 {
 	struct page_counter *counter;
-	struct mem_cgroup *_memcg;
 	int ret = 0;
-	bool may_oom;
-
-	ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
-	if (ret < 0)
-		return ret;
-
-	/*
-	 * Conditions under which we can wait for the oom_killer. Those are
-	 * the same conditions tested by the core page allocator
-	 */
-	may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
-	_memcg = memcg;
-	ret = __mem_cgroup_try_charge(NULL, gfp, nr_pages, &_memcg, may_oom);
 
+	ret = try_charge(memcg, gfp, nr_pages);
 	if (ret == -EINTR)  {
 		/*
-		 * __mem_cgroup_try_charge() chosed to bypass to root due to
-		 * OOM kill or fatal signal.  Since our only options are to
-		 * either fail the allocation or charge it to this cgroup, do
-		 * it as a temporary condition. But we can't fail. From a
-		 * kmem/slab perspective, the cache has already been selected,
-		 * by mem_cgroup_kmem_get_cache(), so it is too late to change
+		 * try_charge() chose to bypass to root due to OOM kill or
+		 * fatal signal.  Since our only options are to either fail
+		 * the allocation or charge it to this cgroup, do it as a
+		 * temporary condition. But we can't fail. From a kmem/slab
+		 * perspective, the cache has already been selected, by
+		 * mem_cgroup_kmem_get_cache(), so it is too late to change
 		 * our minds.
 		 *
 		 * This condition will only trigger if the task entered
-		 * memcg_charge_kmem in a sane state, but was OOM-killed during
-		 * __mem_cgroup_try_charge() above. Tasks that were already
-		 * dying when the allocation triggers should have been already
+		 * memcg_charge_kmem in a sane state, but was OOM-killed
+		 * during try_charge() above. Tasks that were already dying
+		 * when the allocation triggers should have been already
 		 * directed to the root cgroup in memcontrol.h
 		 */
 		page_counter_charge(&memcg->memory, nr_pages);
 		if (do_swap_account)
 			page_counter_charge(&memcg->memsw, nr_pages);
 		ret = 0;
-	} else if (ret)
-		page_counter_uncharge(&memcg->kmem, nr_pages);
+	}
+
+	if (ret)
+		return ret;
+
+	/*
+	 * When a cgroup is destroyed, all user memory pages get recharged to
+	 * the parent cgroup. Recharging is done by mem_cgroup_reparent_charges,
+	 * which keeps looping until res <= kmem. This is supposed to guarantee
+	 * that by the time the cgroup is released, no pages are charged to it.
+	 *
+	 * If kmem were charged before res or uncharged after, kmem might
+	 * become greater than res for a short period of time even if there
+	 * were still user memory pages charged to the cgroup. In this case
+	 * mem_cgroup_reparent_charges would give up prematurely, and the
+	 * cgroup could be released even though there were still pages charged
+	 * to it. Uncharging such a page would then trigger a kernel panic.
+	 *
+	 * To prevent this from happening, kmem must be charged after res and
+	 * uncharged before res.
+	 */
+	ret = page_counter_try_charge(&memcg->kmem, nr_pages, &counter);
+	if (ret) {
+		page_counter_uncharge(&memcg->memory, nr_pages);
+		if (do_swap_account)
+			page_counter_uncharge(&memcg->memsw, nr_pages);
+	}
 
 	return ret;
 }
 
-static void memcg_uncharge_kmem(struct mem_cgroup *memcg,
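+/*
+ * Charge @nr_pages to @memcg without any possibility of failure,
+ * which may push the counters over their limits.  Callers must be
+ * able to tolerate the temporary excess.
+ */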
+void memcg_charge_kmem_nofail(struct mem_cgroup *memcg, unsigned long nr_pages)
+{
+	page_counter_charge(&memcg->memory, nr_pages);
+	if (do_swap_account)
+		page_counter_charge(&memcg->memsw, nr_pages);
+
+	/* kmem must be charged after res - see memcg_charge_kmem() */
+	page_counter_charge(&memcg->kmem, nr_pages);
+}
+
+void memcg_uncharge_kmem(struct mem_cgroup *memcg,
 				unsigned long nr_pages)
 {
+	u64 kmem;
+
+	kmem = page_counter_uncharge(&memcg->kmem, nr_pages);
+
 	page_counter_uncharge(&memcg->memory, nr_pages);
 	if (do_swap_account)
 		page_counter_uncharge(&memcg->memsw, nr_pages);
 
 	/* Not down to 0 */
-	if (page_counter_uncharge(&memcg->kmem, nr_pages))
+	if (kmem)
 		return;
 
+	/*
+	 * Release the reference taken in memcg_deactivate_kmem in case
+	 * this last uncharge races with the offlining code or outlives
+	 * the memcg itself.
+	 *
+	 * The memory barrier imposed by test&clear is paired with the
+	 * explicit one in memcg_kmem_mark_dead().
+	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 
-void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
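+/*
+ * Charge slab pages to the memcg that owns the cache, and account
+ * them as reclaimable or unreclaimable slab depending on
+ * SLAB_RECLAIM_ACCOUNT.
+ */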
+int __memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, unsigned int nr_pages)
 {
-	if (!memcg)
-		return;
+	struct mem_cgroup *memcg;
+	int idx;
+	int ret;
 
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches);
-	mutex_unlock(&memcg->slab_caches_mutex);
+	VM_BUG_ON(is_root_cache(s));
+	memcg = s->memcg_params.memcg;
+
+	ret = memcg_charge_kmem(memcg, gfp, nr_pages);
+	if (ret)
+		return ret;
+	if (s->flags & SLAB_RECLAIM_ACCOUNT) {
+		page_counter_charge(&memcg->dcache, nr_pages);
+		idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
+	} else
+		idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
+
+	this_cpu_add(memcg->stat->count[idx], nr_pages);
+	return 0;
+}
+
+void __memcg_uncharge_slab(struct kmem_cache *s, unsigned int nr_pages)
+{
+	struct mem_cgroup *memcg;
+	int idx;
+
+	VM_BUG_ON(is_root_cache(s));
+	memcg = s->memcg_params.memcg;
+
+	memcg_uncharge_kmem(memcg, nr_pages);
+	if (s->flags & SLAB_RECLAIM_ACCOUNT) {
+		page_counter_uncharge(&memcg->dcache, nr_pages);
+		idx = MEM_CGROUP_STAT_SLAB_RECLAIMABLE;
+	} else
+		idx = MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE;
+
+	this_cpu_sub(memcg->stat->count[idx], nr_pages);
 }
 
 /*
@@ -3085,177 +3149,49 @@ int memcg_cache_id(struct mem_cgroup *memcg)
 	return memcg ? memcg->kmemcg_id : -1;
 }
 
-/*
- * This ends up being protected by the set_limit mutex, during normal
- * operation, because that is its main call site.
- *
- * But when we create a new cache, we can call this as well if its parent
- * is kmem-limited. That will have to hold set_limit_mutex as well.
- */
-int memcg_update_cache_sizes(struct mem_cgroup *memcg)
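+/*
+ * Allocate a cache id for a new memcg, growing the memcg_caches
+ * arrays and the list_lru arrays if the id does not fit them yet.
+ */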
+static int memcg_alloc_cache_id(void)
 {
-	int num, ret;
+	int id, size;
+	int err;
 
-	num = ida_simple_get(&kmem_limited_groups,
-				0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
-	if (num < 0)
-		return num;
-	/*
-	 * After this point, kmem_accounted (that we test atomically in
-	 * the beginning of this conditional), is no longer 0. This
-	 * guarantees only one process will set the following boolean
-	 * to true. We don't need test_and_set because we're protected
-	 * by the set_limit_mutex anyway.
-	 */
-	memcg_kmem_set_activated(memcg);
+	id = ida_simple_get(&memcg_cache_ida,
+			    0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
+	if (id < 0)
+		return id;
 
-	ret = memcg_update_all_caches(num+1);
-	if (ret) {
-		ida_simple_remove(&kmem_limited_groups, num);
-		memcg_kmem_clear_activated(memcg);
-		return ret;
-	}
+	if (id < memcg_nr_cache_ids)
+		return id;
 
-	memcg->kmemcg_id = num;
-	INIT_LIST_HEAD(&memcg->memcg_slab_caches);
-	mutex_init(&memcg->slab_caches_mutex);
-	return 0;
-}
-
-static size_t memcg_caches_array_size(int num_groups)
-{
-	ssize_t size;
-	if (num_groups <= 0)
-		return 0;
+	/*
+	 * There's no space for the new id in the memcg_caches arrays,
+	 * so we have to grow them.
+	 */
+	down_write(&memcg_cache_ids_sem);
 
-	size = 2 * num_groups;
+	size = 2 * (id + 1);
 	if (size < MEMCG_CACHES_MIN_SIZE)
 		size = MEMCG_CACHES_MIN_SIZE;
 	else if (size > MEMCG_CACHES_MAX_SIZE)
 		size = MEMCG_CACHES_MAX_SIZE;
 
-	return size;
-}
-
-/*
- * We should update the current array size iff all caches updates succeed. This
- * can only be done from the slab side. The slab mutex needs to be held when
- * calling this.
- */
-void memcg_update_array_size(int num)
-{
-	if (num > memcg_limited_groups_array_size)
-		memcg_limited_groups_array_size = memcg_caches_array_size(num);
-}
-
-static void kmem_cache_destroy_work_func(struct work_struct *w);
-
-int memcg_update_cache_size(struct kmem_cache *s, int num_groups)
-{
-	struct memcg_cache_params *cur_params = s->memcg_params;
-
-	VM_BUG_ON(s->memcg_params && !s->memcg_params->is_root_cache);
-
-	if (num_groups > memcg_limited_groups_array_size) {
-		int i;
-		ssize_t size = memcg_caches_array_size(num_groups);
-
-		size *= sizeof(void *);
-		size += sizeof(struct memcg_cache_params);
-
-		s->memcg_params = kzalloc(size, GFP_KERNEL);
-		if (!s->memcg_params) {
-			s->memcg_params = cur_params;
-			return -ENOMEM;
-		}
-
-		s->memcg_params->is_root_cache = true;
+	err = memcg_update_all_caches(size);
+	if (!err)
+		err = memcg_update_all_list_lrus(size);
+	if (!err)
+		memcg_nr_cache_ids = size;
 
-		/*
-		 * There is the chance it will be bigger than
-		 * memcg_limited_groups_array_size, if we failed an allocation
-		 * in a cache, in which case all caches updated before it, will
-		 * have a bigger array.
-		 *
-		 * But if that is the case, the data after
-		 * memcg_limited_groups_array_size is certainly unused
-		 */
-		for (i = 0; i < memcg_limited_groups_array_size; i++) {
-			if (!cur_params->memcg_caches[i])
-				continue;
-			s->memcg_params->memcg_caches[i] =
-						cur_params->memcg_caches[i];
-		}
+	up_write(&memcg_cache_ids_sem);
 
-		/*
-		 * Ideally, we would wait until all caches succeed, and only
-		 * then free the old one. But this is not worth the extra
-		 * pointer per-cache we'd have to have for this.
-		 *
-		 * It is not a big deal if some caches are left with a size
-		 * bigger than the others. And all updates will reset this
-		 * anyway.
-		 */
-		kfree(cur_params);
+	if (err) {
+		ida_simple_remove(&memcg_cache_ida, id);
+		return err;
 	}
-	return 0;
+	return id;
 }
 
-int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s,
-			 struct kmem_cache *root_cache)
+static void memcg_free_cache_id(int id)
 {
-	size_t size = sizeof(struct memcg_cache_params);
-
-	if (!memcg_kmem_enabled())
-		return 0;
-
-	if (!memcg)
-		size += memcg_limited_groups_array_size * sizeof(void *);
-
-	s->memcg_params = kzalloc(size, GFP_KERNEL);
-	if (!s->memcg_params)
-		return -ENOMEM;
-
-	if (memcg) {
-		s->memcg_params->memcg = memcg;
-		s->memcg_params->root_cache = root_cache;
-		INIT_WORK(&s->memcg_params->destroy,
-				kmem_cache_destroy_work_func);
-	} else
-		s->memcg_params->is_root_cache = true;
-
-	return 0;
-}
-
-void memcg_release_cache(struct kmem_cache *s)
-{
-	struct kmem_cache *root;
-	struct mem_cgroup *memcg;
-	int id;
-
-	/*
-	 * This happens, for instance, when a root cache goes away before we
-	 * add any memcg.
-	 */
-	if (!s->memcg_params)
-		return;
-
-	if (s->memcg_params->is_root_cache)
-		goto out;
-
-	memcg = s->memcg_params->memcg;
-	id  = memcg_cache_id(memcg);
-
-	root = s->memcg_params->root_cache;
-	root->memcg_params->memcg_caches[id] = NULL;
-
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_del(&s->memcg_params->list);
-	mutex_unlock(&memcg->slab_caches_mutex);
-
-	mem_cgroup_put(memcg);
-out:
-	kfree(s->memcg_params);
+	ida_simple_remove(&memcg_cache_ida, id);
 }
 
 /*
@@ -3277,274 +3213,56 @@ out:
  * memcg_kmem_skip_account. So we enclose anything that might allocate memory
  * inside the following two functions.
  */
-static inline void memcg_stop_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account++;
-}
-
-static inline void memcg_resume_kmem_account(void)
-{
-	VM_BUG_ON(!current->mm);
-	current->memcg_kmem_skip_account--;
-}
-
-static void kmem_cache_destroy_work_func(struct work_struct *w)
-{
-	struct kmem_cache *cachep;
-	struct memcg_cache_params *p;
-
-	p = container_of(w, struct memcg_cache_params, destroy);
-
-	cachep = memcg_params_to_cache(p);
-
-	/*
-	 * If we get down to 0 after shrink, we could delete right away.
-	 * However, memcg_release_pages() already puts us back in the workqueue
-	 * in that case. If we proceed deleting, we'll get a dangling
-	 * reference, and removing the object from the workqueue in that case
-	 * is unnecessary complication. We are not a fast path.
-	 *
-	 * Note that this case is fundamentally different from racing with
-	 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
-	 * kmem_cache_shrink, not only we would be reinserting a dead cache
-	 * into the queue, but doing so from inside the worker racing to
-	 * destroy it.
-	 *
-	 * So if we aren't down to zero, we'll just schedule a worker and try
-	 * again
-	 */
-	if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
-		kmem_cache_shrink(cachep);
-		if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
-			return;
-	} else
-		kmem_cache_destroy(cachep);
-}
-
-void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
-{
-	if (!cachep->memcg_params->dead)
-		return;
-
-	/*
-	 * There are many ways in which we can get here.
-	 *
-	 * We can get to a memory-pressure situation while the delayed work is
-	 * still pending to run. The vmscan shrinkers can then release all
-	 * cache memory and get us to destruction. If this is the case, we'll
-	 * be executed twice, which is a bug (the second time will execute over
-	 * bogus data). In this case, cancelling the work should be fine.
-	 *
-	 * But we can also get here from the worker itself, if
-	 * kmem_cache_shrink is enough to shake all the remaining objects and
-	 * get the page count to 0. In this case, we'll deadlock if we try to
-	 * cancel the work (the worker runs with an internal lock held, which
-	 * is the same lock we would hold for cancel_work_sync().)
-	 *
-	 * Since we can't possibly know who got us here, just refrain from
-	 * running if there is already work pending
-	 */
-	if (work_pending(&cachep->memcg_params->destroy))
-		return;
-	/*
-	 * We have to defer the actual destroying to a workqueue, because
-	 * we might currently be in a context that cannot sleep.
-	 */
-	schedule_work(&cachep->memcg_params->destroy);
-}
-
-/*
- * This lock protects updaters, not readers. We want readers to be as fast as
- * they can, and they will either see NULL or a valid cache value. Our model
- * allow them to see NULL, in which case the root memcg will be selected.
- *
- * We need this lock because multiple allocations to the same cache from a non
- * will span more than one worker. Only one of them can create the cache.
- */
-static DEFINE_MUTEX(memcg_cache_mutex);
-
-/*
- * Called with memcg_cache_mutex held
- */
-static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
-					 struct kmem_cache *s)
-{
-	struct kmem_cache *new;
-	static char *tmp_name = NULL;
-
-	lockdep_assert_held(&memcg_cache_mutex);
-
-	/*
-	 * kmem_cache_create_memcg duplicates the given name and
-	 * cgroup_name for this name requires RCU context.
-	 * This static temporary buffer is used to prevent from
-	 * pointless shortliving allocation.
-	 */
-	if (!tmp_name) {
-		tmp_name = kmalloc(PATH_MAX, GFP_KERNEL);
-		if (!tmp_name)
-			return NULL;
-	}
-
-	rcu_read_lock();
-	snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name,
-			 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup));
-	rcu_read_unlock();
-
-	new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align,
-				      (s->flags & ~SLAB_PANIC), s->ctor, s);
-
-	if (new)
-		new->allocflags |= __GFP_KMEMCG;
-
-	return new;
-}
-
-static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
-						  struct kmem_cache *cachep)
-{
-	struct kmem_cache *new_cachep;
-	int idx;
-
-	BUG_ON(!memcg_can_account_kmem(memcg));
-
-	idx = memcg_cache_id(memcg);
-
-	mutex_lock(&memcg_cache_mutex);
-	new_cachep = cachep->memcg_params->memcg_caches[idx];
-	if (new_cachep)
-		goto out;
-
-	new_cachep = kmem_cache_dup(memcg, cachep);
-	if (new_cachep == NULL) {
-		new_cachep = cachep;
-		goto out;
-	}
-
-	mem_cgroup_get(memcg);
-	atomic_set(&new_cachep->memcg_params->nr_pages , 0);
-
-	cachep->memcg_params->memcg_caches[idx] = new_cachep;
-	/*
-	 * the readers won't lock, make sure everybody sees the updated value,
-	 * so they won't put stuff in the queue again for no reason
-	 */
-	wmb();
-out:
-	mutex_unlock(&memcg_cache_mutex);
-	return new_cachep;
-}
 
 static DEFINE_MUTEX(memcg_limit_mutex);
 
-void kmem_cache_destroy_memcg_children(struct kmem_cache *s)
-{
-	struct kmem_cache *c;
-	int i;
-
-	if (!s->memcg_params)
-		return;
-	if (!s->memcg_params->is_root_cache)
-		return;
-
-	/*
-	 * If the cache is being destroyed, we trust that there is no one else
-	 * requesting objects from it. Even if there are, the sanity checks in
-	 * kmem_cache_destroy should caught this ill-case.
-	 *
-	 * Still, we don't want anyone else freeing memcg_caches under our
-	 * noses, which can happen if a new memcg comes to life. As usual,
-	 * we'll take the memcg_limit_mutex to protect ourselves against this.
-	 */
-	mutex_lock(&memcg_limit_mutex);
-	for (i = 0; i < memcg_limited_groups_array_size; i++) {
-		c = s->memcg_params->memcg_caches[i];
-		if (!c)
-			continue;
-
-		/*
-		 * We will now manually delete the caches, so to avoid races
-		 * we need to cancel all pending destruction workers and
-		 * proceed with destruction ourselves.
-		 *
-		 * kmem_cache_destroy() will call kmem_cache_shrink internally,
-		 * and that could spawn the workers again: it is likely that
-		 * the cache still have active pages until this very moment.
-		 * This would lead us back to mem_cgroup_destroy_cache.
-		 *
-		 * But that will not execute at all if the "dead" flag is not
-		 * set, so flip it down to guarantee we are in control.
-		 */
-		c->memcg_params->dead = false;
-		cancel_work_sync(&c->memcg_params->destroy);
-		kmem_cache_destroy(c);
-	}
-	mutex_unlock(&memcg_limit_mutex);
-}
-
-struct create_work {
+struct memcg_kmem_cache_create_work {
 	struct mem_cgroup *memcg;
 	struct kmem_cache *cachep;
 	struct work_struct work;
 };
 
-static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+static void memcg_kmem_cache_create_func(struct work_struct *w)
 {
-	struct kmem_cache *cachep;
-	struct memcg_cache_params *params;
+	struct memcg_kmem_cache_create_work *cw =
+		container_of(w, struct memcg_kmem_cache_create_work, work);
+	struct mem_cgroup *memcg = cw->memcg;
+	struct kmem_cache *cachep = cw->cachep;
 
-	if (!memcg_kmem_is_active(memcg))
-		return;
-
-	mutex_lock(&memcg->slab_caches_mutex);
-	list_for_each_entry(params, &memcg->memcg_slab_caches, list) {
-		cachep = memcg_params_to_cache(params);
-		cachep->memcg_params->dead = true;
-		schedule_work(&cachep->memcg_params->destroy);
-	}
-	mutex_unlock(&memcg->slab_caches_mutex);
-}
-
-static void memcg_create_cache_work_func(struct work_struct *w)
-{
-	struct create_work *cw;
+	memcg_create_kmem_cache(memcg, cachep);
 
-	cw = container_of(w, struct create_work, work);
-	memcg_create_kmem_cache(cw->memcg, cw->cachep);
-	/* Drop the reference gotten when we enqueued. */
-	css_put(&cw->memcg->css);
+	css_put(&memcg->css);
 	kfree(cw);
 }
 
 /*
  * Enqueue the creation of a per-memcg kmem_cache.
  */
-static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg,
-					 struct kmem_cache *cachep)
+static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+					       struct kmem_cache *cachep)
 {
-	struct create_work *cw;
+	struct memcg_kmem_cache_create_work *cw;
 
-	cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT);
-	if (cw == NULL) {
-		css_put(&memcg->css);
+	cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
+	if (!cw)
 		return;
-	}
+
+	css_get(&memcg->css);
 
 	cw->memcg = memcg;
 	cw->cachep = cachep;
+	INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
 
-	INIT_WORK(&cw->work, memcg_create_cache_work_func);
 	schedule_work(&cw->work);
 }
 
-static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
-				       struct kmem_cache *cachep)
+static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
+					     struct kmem_cache *cachep)
 {
 	/*
 	 * We need to stop accounting when we kmalloc, because if the
 	 * corresponding kmalloc cache is not yet created, the first allocation
-	 * in __memcg_create_cache_enqueue will recurse.
+	 * in __memcg_schedule_kmem_cache_create will recurse.
 	 *
 	 * However, it is better to enclose the whole function. Depending on
 	 * the debugging options enabled, INIT_WORK(), for instance, can
@@ -3553,9 +3271,10 @@ static void memcg_create_cache_enqueue(struct mem_cgroup *memcg,
 	 * the safest choice is to do it like this, wrapping the whole function.
 	 */
 	memcg_stop_kmem_account();
-	__memcg_create_cache_enqueue(memcg, cachep);
+	__memcg_schedule_kmem_cache_create(memcg, cachep);
 	memcg_resume_kmem_account();
 }
+
 /*
  * Return the kmem_cache we're supposed to use for a slab allocation.
  * We try to use the current memcg's version of the cache.
@@ -3573,36 +3292,27 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 					  gfp_t gfp)
 {
 	struct mem_cgroup *memcg;
-	int idx;
+	struct kmem_cache *memcg_cachep;
 
-	VM_BUG_ON(!cachep->memcg_params);
-	VM_BUG_ON(!cachep->memcg_params->is_root_cache);
+	VM_BUG_ON(!is_root_cache(cachep));
 
-	if (!current->mm || current->memcg_kmem_skip_account)
-		return cachep;
+	if (cachep->flags & SLAB_ACCOUNT)
+		gfp |= __GFP_ACCOUNT;
 
-	rcu_read_lock();
-	memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner));
+	if (!(gfp & __GFP_ACCOUNT))
+		return cachep;
 
-	if (!memcg_can_account_kmem(memcg))
-		goto out;
+	if (!current->mm || current->memcg_kmem_skip_account)
+		return cachep;
 
-	idx = memcg_cache_id(memcg);
+	memcg = get_mem_cgroup_from_mm(current->mm);
 
-	/*
-	 * barrier to mare sure we're always seeing the up to date value.  The
-	 * code updating memcg_caches will issue a write barrier to match this.
-	 */
-	read_barrier_depends();
-	if (likely(cachep->memcg_params->memcg_caches[idx])) {
-		cachep = cachep->memcg_params->memcg_caches[idx];
+	if (!memcg_kmem_is_active(memcg))
 		goto out;
-	}
 
-	/* The corresponding put will be done in the workqueue. */
-	if (!css_tryget(&memcg->css))
-		goto out;
-	rcu_read_unlock();
+	memcg_cachep = cache_from_memcg_idx(cachep, memcg_cache_id(memcg));
+	if (likely(memcg_cachep))
+		return memcg_cachep;
 
 	/*
 	 * If we are in a safe context (can wait, and not in interrupt
@@ -3612,23 +3322,23 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
 	 *
 	 * However, there are some clashes that can arrive from locking.
 	 * For instance, because we acquire the slab_mutex while doing
-	 * kmem_cache_dup, this means no further allocation could happen
-	 * with the slab_mutex held.
-	 *
-	 * Also, because cache creation issue get_online_cpus(), this
-	 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex,
-	 * that ends up reversed during cpu hotplug. (cpuset allocates
-	 * a bunch of GFP_KERNEL memory during cpuup). Due to all that,
-	 * better to defer everything.
+	 * memcg_create_kmem_cache, this means no further allocation
+	 * could happen with the slab_mutex held. So it's better to
+	 * defer everything.
 	 */
-	memcg_create_cache_enqueue(memcg, cachep);
-	return cachep;
+	memcg_schedule_kmem_cache_create(memcg, cachep);
 out:
-	rcu_read_unlock();
+	css_put(&memcg->css);
 	return cachep;
 }
 EXPORT_SYMBOL(__memcg_kmem_get_cache);
 
+void __memcg_kmem_put_cache(struct kmem_cache *cachep)
+{
+	if (!is_root_cache(cachep))
+		css_put(&cachep->memcg_params.memcg->css);
+}
+
 /*
  * We need to verify if the allocation against current->mm->owner's memcg is
  * possible for the given order. But the page is not allocated yet, so we'll
@@ -3644,53 +3354,61 @@ EXPORT_SYMBOL(__memcg_kmem_get_cache);
  * Returning true means the allocation is possible.
  */
 bool
-__memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
+__memcg_kmem_newpage_charge(struct page *page, gfp_t gfp, int order)
 {
+	struct page_cgroup *pc;
 	struct mem_cgroup *memcg;
 	int ret;
 
-	*_memcg = NULL;
-	memcg = try_get_mem_cgroup_from_mm(current->mm);
-
 	/*
-	 * very rare case described in mem_cgroup_from_task. Unfortunately there
-	 * isn't much we can do without complicating this too much, and it would
-	 * be gfp-dependent anyway. Just let it go
+	 * Disabling accounting is only relevant for some specific memcg
+	 * internal allocations. Therefore we would initially not have such
+	 * a check here, since direct calls to the page allocator that are
+	 * accounted to kmemcg (alloc_kmem_pages and friends) only happen
+	 * outside memcg core. We are mostly concerned with cache allocations,
+	 * and by having this test at memcg_kmem_get_cache, we are already able
+	 * to relay the allocation to the root cache and bypass the memcg cache
+	 * altogether.
+	 *
+	 * There is one exception, though: the SLUB allocator does not create
+	 * large order caches, but rather services large kmallocs directly from
+	 * the page allocator. Therefore, the following sequence when backed by
+	 * the SLUB allocator:
+	 *
+	 * 	memcg_stop_kmem_account();
+	 * 	kmalloc(<large_number>)
+	 * 	memcg_resume_kmem_account();
+	 *
+	 * would effectively ignore the fact that we should skip accounting,
+	 * since it will drive us directly to this function without passing
+	 * through the cache selector memcg_kmem_get_cache. Such large
+	 * allocations are extremely rare but can happen, for instance, for the
+	 * cache arrays. We bring this test here.
 	 */
-	if (unlikely(!memcg))
+	if (!current->mm || current->memcg_kmem_skip_account)
 		return true;
 
-	if (!memcg_can_account_kmem(memcg)) {
+	memcg = get_mem_cgroup_from_mm(current->mm);
+
+	if (!memcg_kmem_is_active(memcg)) {
 		css_put(&memcg->css);
 		return true;
 	}
 
 	ret = memcg_charge_kmem(memcg, gfp, 1 << order);
-	if (!ret)
-		*_memcg = memcg;
-
-	css_put(&memcg->css);
-	return (ret == 0);
-}
-
-void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg,
-			      int order)
-{
-	struct page_cgroup *pc;
-
-	VM_BUG_ON(mem_cgroup_is_root(memcg));
-
-	/* The page allocation failed. Revert */
-	if (!page) {
-		memcg_uncharge_kmem(memcg, 1 << order);
-		return;
+	if (ret) {
+		css_put(&memcg->css);
+		return false;
 	}
 
 	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
 	pc->mem_cgroup = memcg;
-	SetPageCgroupUsed(pc);
-	unlock_page_cgroup(pc);
+        pc->flags = PCG_USED;
+
+	__SetPageKmemcg(page);
+
+	css_put(&memcg->css);
+	return true;
 }
 
 void __memcg_kmem_uncharge_pages(struct page *page, int order)
@@ -3707,12 +3425,10 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 	if (!PageCgroupUsed(pc))
 		return;
 
-	lock_page_cgroup(pc);
 	if (PageCgroupUsed(pc)) {
 		memcg = pc->mem_cgroup;
-		ClearPageCgroupUsed(pc);
+		pc->flags = 0;
 	}
-	unlock_page_cgroup(pc);
 
 	/*
 	 * We trust that only if there is a memcg associated with the page, it
@@ -3723,16 +3439,34 @@ void __memcg_kmem_uncharge_pages(struct page *page, int order)
 
 	VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
 	memcg_uncharge_kmem(memcg, 1 << order);
+
+	__ClearPageKmemcg(page);
 }
-#else
-static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg)
+
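+/*
+ * Return the memcg a kmem-accounted object is charged to: slab
+ * objects are resolved through their cache, direct page allocations
+ * through their page_cgroup.  Returns NULL for unaccounted objects.
+ */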
+struct mem_cgroup *__mem_cgroup_from_kmem(void *ptr)
 {
+	struct mem_cgroup *memcg = NULL;
+	struct page_cgroup *pc;
+	struct kmem_cache *cachep;
+	struct page *page;
+
+	page = virt_to_head_page(ptr);
+	if (PageSlab(page)) {
+		cachep = page->slab_cache;
+		if (!is_root_cache(cachep))
+			memcg = cachep->memcg_params.memcg;
+	} else {
+		pc = lookup_page_cgroup(page);
+		if (PageCgroupUsed(pc))
+			memcg = pc->mem_cgroup;
+	}
+
+	return memcg;
 }
 #endif /* CONFIG_MEMCG_KMEM */
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 
-#define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION)
 /*
  * Because tail pages are not marked as "used", set it. We're under
  * zone->lru_lock, 'splitting on pmd' and compound_lock.
@@ -3753,8 +3487,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
 	for (i = 1; i < HPAGE_PMD_NR; i++) {
 		pc = head_pc + i;
 		pc->mem_cgroup = memcg;
-		smp_wmb();/* see __commit_charge() */
-		pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
+		pc->flags = head_pc->flags;
 	}
 	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 		       HPAGE_PMD_NR);
@@ -3784,7 +3517,6 @@ static int mem_cgroup_move_account(struct page *page,
 {
 	unsigned long flags;
 	int ret;
-	bool anon = PageAnon(page);
 
 	VM_BUG_ON(from == to);
 	VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -3798,35 +3530,47 @@ static int mem_cgroup_move_account(struct page *page,
 	if (nr_pages > 1 && !PageTransHuge(page))
 		goto out;
 
-	lock_page_cgroup(pc);
+	/*
+	 * Prevent mem_cgroup_migrate() from looking at pc->mem_cgroup
+	 * of its source page while we change it: page migration takes
+	 * both pages off the LRU, but page cache replacement doesn't.
+	 */
+	if (!trylock_page(page))
+		goto out;
 
 	ret = -EINVAL;
 	if (!PageCgroupUsed(pc) || pc->mem_cgroup != from)
-		goto unlock;
+		goto out_unlock;
 
 	move_lock_mem_cgroup(from, &flags);
 
-	if (!anon && page_mapped(page)) {
+	if (!PageAnon(page) && page_mapped(page)) {
 		/* Update mapped_file data for mem_cgroup */
 		preempt_disable();
 		__this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		__this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
 		preempt_enable();
 	}
-	mem_cgroup_charge_statistics(from, page, anon, -nr_pages);
+
+	/*
+	 * It is safe to change pc->mem_cgroup here because the page
+	 * is referenced, charged, and isolated - we can't race with
+	 * uncharging, charging, migration, or LRU putback.
+	 */
 
 	/* caller should have done css_get */
 	pc->mem_cgroup = to;
-	mem_cgroup_charge_statistics(to, page, anon, nr_pages);
 	move_unlock_mem_cgroup(from, &flags);
 	ret = 0;
-unlock:
-	unlock_page_cgroup(pc);
-	/*
-	 * check events
-	 */
+
+	local_irq_disable();
+	mem_cgroup_charge_statistics(to, page, nr_pages);
 	memcg_check_events(to, page);
+	mem_cgroup_charge_statistics(from, page, -nr_pages);
 	memcg_check_events(from, page);
+	local_irq_enable();
+out_unlock:
+	unlock_page(page);
 out:
 	return ret;
 }
@@ -3901,463 +3645,14 @@ out:
 	return ret;
 }
 
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
- */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask, enum charge_type ctype)
-{
-	struct mem_cgroup *memcg = NULL;
-	unsigned int nr_pages = 1;
-	bool oom = true;
-	int ret;
-
-	if (PageTransHuge(page)) {
-		nr_pages <<= compound_order(page);
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-		/*
-		 * Never OOM-kill a process for a huge page.  The
-		 * fault handler will fall back to regular pages.
-		 */
-		oom = false;
-	}
-
-	ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
-	if (ret == -ENOMEM)
-		return ret;
-	__mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
-	return 0;
-}
-
-int mem_cgroup_newpage_charge(struct page *page,
-			      struct mm_struct *mm, gfp_t gfp_mask)
-{
-	if (mem_cgroup_disabled())
-		return 0;
-	VM_BUG_ON_PAGE(page_mapped(page), page);
-	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
-	VM_BUG_ON(!mm);
-	return mem_cgroup_charge_common(page, mm, gfp_mask,
-					MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
-/*
- * While swap-in, try_charge -> commit or cancel, the page is locked.
- * And when try_charge() successfully returns, one refcnt to memcg without
- * struct page_cgroup is acquired. This refcnt will be consumed by
- * "commit()" or removed by "cancel()"
- */
-static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
-					  struct page *page,
-					  gfp_t mask,
-					  struct mem_cgroup **memcgp)
-{
-	struct mem_cgroup *memcg;
-	struct page_cgroup *pc;
-	int ret;
-
-	pc = lookup_page_cgroup(page);
-	/*
-	 * Every swap fault against a single page tries to charge the
-	 * page, bail as early as possible.  shmem_unuse() encounters
-	 * already charged pages, too.  The USED bit is protected by
-	 * the page lock, which serializes swap cache removal, which
-	 * in turn serializes uncharging.
-	 */
-	if (PageCgroupUsed(pc))
-		return 0;
-	if (!do_swap_account)
-		goto charge_cur_mm;
-	memcg = try_get_mem_cgroup_from_page(page);
-	if (!memcg)
-		goto charge_cur_mm;
-	*memcgp = memcg;
-	ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true);
-	css_put(&memcg->css);
-	if (ret == -EINTR)
-		ret = 0;
-	return ret;
-charge_cur_mm:
-	ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true);
-	if (ret == -EINTR)
-		ret = 0;
-	return ret;
-}
-
-int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
-				 gfp_t gfp_mask, struct mem_cgroup **memcgp)
-{
-	*memcgp = NULL;
-	if (mem_cgroup_disabled())
-		return 0;
-	/*
-	 * A racing thread's fault, or swapoff, may have already
-	 * updated the pte, and even removed page from swap cache: in
-	 * those cases unuse_pte()'s pte_same() test will fail; but
-	 * there's also a KSM case which does need to charge the page.
-	 */
-	if (!PageSwapCache(page)) {
-		int ret;
-
-		ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true);
-		if (ret == -EINTR)
-			ret = 0;
-		return ret;
-	}
-	return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp);
-}
-
-void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg)
-{
-	if (mem_cgroup_disabled())
-		return;
-	if (!memcg)
-		return;
-	__mem_cgroup_cancel_charge(memcg, 1);
-}
-
-static void
-__mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg,
-					enum charge_type ctype)
-{
-	if (mem_cgroup_disabled())
-		return;
-	if (!memcg)
-		return;
-
-	__mem_cgroup_commit_charge(memcg, page, 1, ctype, true);
-	/*
-	 * Now swap is on-memory. This means this page may be
-	 * counted both as mem and swap....double count.
-	 * Fix it by uncharging from memsw. Basically, this SwapCache is stable
-	 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page()
-	 * may call delete_from_swap_cache() before reach here.
-	 */
-	if (do_swap_account && PageSwapCache(page)) {
-		swp_entry_t ent = {.val = page_private(page)};
-		mem_cgroup_uncharge_swap(ent);
-	}
-}
-
-void mem_cgroup_commit_charge_swapin(struct page *page,
-				     struct mem_cgroup *memcg)
-{
-	__mem_cgroup_commit_charge_swapin(page, memcg,
-					  MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
-int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
-				gfp_t gfp_mask)
-{
-	struct mem_cgroup *memcg = NULL;
-	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
-	int ret;
-
-	if (mem_cgroup_disabled())
-		return 0;
-	if (PageCompound(page))
-		return 0;
-
-	if (!PageSwapCache(page))
-		ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
-	else { /* page is swapcache/shmem */
-		ret = __mem_cgroup_try_charge_swapin(mm, page,
-						     gfp_mask, &memcg);
-		if (!ret)
-			__mem_cgroup_commit_charge_swapin(page, memcg, type);
-	}
-	return ret;
-}
-
-static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg,
-				   unsigned int nr_pages,
-				   const enum charge_type ctype)
-{
-	struct memcg_batch_info *batch = NULL;
-	bool uncharge_memsw = true;
-
-	/* If swapout, usage of swap doesn't decrease */
-	if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
-		uncharge_memsw = false;
-
-	batch = &current->memcg_batch;
-	/*
-	 * In usual, we do css_get() when we remember memcg pointer.
-	 * But in this case, we keep res->usage until end of a series of
-	 * uncharges. Then, it's ok to ignore memcg's refcnt.
-	 */
-	if (!batch->memcg)
-		batch->memcg = memcg;
-	/*
-	 * do_batch > 0 when unmapping pages or inode invalidate/truncate.
-	 * In those cases, all pages freed continuously can be expected to be in
-	 * the same cgroup and we have chance to coalesce uncharges.
-	 * But we do uncharge one by one if this is killed by OOM(TIF_MEMDIE)
-	 * because we want to do uncharge as soon as possible.
-	 */
-
-	if (!batch->do_batch || test_thread_flag(TIF_MEMDIE))
-		goto direct_uncharge;
-
-	if (nr_pages > 1)
-		goto direct_uncharge;
-
-	/*
-	 * In typical case, batch->memcg == mem. This means we can
-	 * merge a series of uncharges to an uncharge of page_counter.
-	 * If not, we uncharge page_counter ony by one.
-	 */
-	if (batch->memcg != memcg)
-		goto direct_uncharge;
-	/* remember freed charge and uncharge it later */
-	batch->nr_pages++;
-	if (uncharge_memsw)
-		batch->memsw_nr_pages++;
-	return;
-direct_uncharge:
-	page_counter_uncharge(&memcg->memory, nr_pages);
-	if (uncharge_memsw)
-		page_counter_uncharge(&memcg->memsw, nr_pages);
-	if (unlikely(batch->memcg != memcg))
-		memcg_oom_recover(memcg);
-}
-
-/*
- * uncharge if !page_mapped(page)
- */
-static struct mem_cgroup *
-__mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
-			     bool end_migration)
-{
-	struct mem_cgroup *memcg = NULL;
-	unsigned int nr_pages = 1;
-	struct page_cgroup *pc;
-	bool anon;
-
-	if (mem_cgroup_disabled())
-		return NULL;
-
-	if (PageTransHuge(page)) {
-		nr_pages <<= compound_order(page);
-		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-	}
-	/*
-	 * Check if our page_cgroup is valid
-	 */
-	pc = lookup_page_cgroup(page);
-	if (unlikely(!PageCgroupUsed(pc)))
-		return NULL;
-
-	lock_page_cgroup(pc);
-
-	memcg = pc->mem_cgroup;
-
-	if (!PageCgroupUsed(pc))
-		goto unlock_out;
-
-	anon = PageAnon(page);
-
-	switch (ctype) {
-	case MEM_CGROUP_CHARGE_TYPE_ANON:
-		/*
-		 * Generally PageAnon tells if it's the anon statistics to be
-		 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is
-		 * used before page reached the stage of being marked PageAnon.
-		 */
-		anon = true;
-		/* fallthrough */
-	case MEM_CGROUP_CHARGE_TYPE_DROP:
-		/* See mem_cgroup_prepare_migration() */
-		if (page_mapped(page))
-			goto unlock_out;
-		/*
-		 * Pages under migration may not be uncharged.  But
-		 * end_migration() /must/ be the one uncharging the
-		 * unused post-migration page and so it has to call
-		 * here with the migration bit still set.  See the
-		 * page_counter handling below.
-		 */
-		if (!end_migration && PageCgroupMigration(pc))
-			goto unlock_out;
-		break;
-	case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
-		if (!PageAnon(page)) {	/* Shared memory */
-			if (page->mapping && !page_is_file_cache(page))
-				goto unlock_out;
-		} else if (page_mapped(page)) /* Anon */
-				goto unlock_out;
-		break;
-	default:
-		break;
-	}
-
-	mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages);
-
-	ClearPageCgroupUsed(pc);
-	/*
-	 * pc->mem_cgroup is not cleared here. It will be accessed when it's
-	 * freed from LRU. This is safe because an uncharged page is expected not
-	 * to be reused (it is freed soon). The exception is SwapCache, which is
-	 * handled by special functions.
-	 */
-
-	unlock_page_cgroup(pc);
-	/*
-	 * even after unlock, we have memcg->memory.usage here and this memcg
-	 * will never be freed.
-	 */
-	memcg_check_events(memcg, page);
-	if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
-		mem_cgroup_swap_statistics(memcg, true);
-		mem_cgroup_get(memcg);
-	}
-	/*
-	 * Migration does not charge the page_counter for the
-	 * replacement page, so leave it alone when phasing out the
-	 * page that is unused after the migration.
-	 */
-	if (!end_migration && !mem_cgroup_is_root(memcg))
-		mem_cgroup_do_uncharge(memcg, nr_pages, ctype);
-
-	return memcg;
-
-unlock_out:
-	unlock_page_cgroup(pc);
-	return NULL;
-}
-
-void mem_cgroup_uncharge_page(struct page *page)
-{
-	/* early check. */
-	if (page_mapped(page))
-		return;
-	VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
-	/*
-	 * If the page is in swap cache, uncharge should be deferred
-	 * to the swap path, which also properly accounts swap usage
-	 * and handles memcg lifetime.
-	 *
-	 * Note that this check is not stable and reclaim may add the
-	 * page to swap cache at any time after this.  However, if the
-	 * page is not in swap cache by the time page->mapcount hits
-	 * 0, there won't be any page table references to the swap
-	 * slot, and reclaim will free it and not actually write the
-	 * page to disk.
-	 */
-	if (PageSwapCache(page))
-		return;
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false);
-}
-
-void mem_cgroup_uncharge_cache_page(struct page *page)
-{
-	VM_BUG_ON_PAGE(page_mapped(page), page);
-	VM_BUG_ON_PAGE(page->mapping, page);
-	__mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false);
-}
-
-/*
- * Batch_start/batch_end is called in unmap_page_range/invalidate/truncate.
- * In those cases, pages are freed continuously and can be expected to be
- * in the same memcg. Each of these calls itself limits the number of
- * pages freed at once, so uncharge_start/end() is called properly.
- * This may be called multiple (typically 2) times in a context.
- */
-
-void mem_cgroup_uncharge_start(void)
-{
-	current->memcg_batch.do_batch++;
-	/* We can do nest. */
-	if (current->memcg_batch.do_batch == 1) {
-		current->memcg_batch.memcg = NULL;
-		current->memcg_batch.nr_pages = 0;
-		current->memcg_batch.memsw_nr_pages = 0;
-	}
-}
-
-void mem_cgroup_uncharge_end(void)
-{
-	struct memcg_batch_info *batch = &current->memcg_batch;
-
-	if (!batch->do_batch)
-		return;
-
-	batch->do_batch--;
-	if (batch->do_batch) /* If stacked, do nothing. */
-		return;
-
-	if (!batch->memcg)
-		return;
-	/*
-	 * This "batch->memcg" is valid without any css_get/put etc...
-	 * because we hide charges behind us.
-	 */
-	if (batch->nr_pages)
-		page_counter_uncharge(&batch->memcg->memory, batch->nr_pages);
-	if (batch->memsw_nr_pages)
-		page_counter_uncharge(&batch->memcg->memsw, batch->memsw_nr_pages);
-	memcg_oom_recover(batch->memcg);
-	/* forget this pointer (for sanity check) */
-	batch->memcg = NULL;
-}
-
-#ifdef CONFIG_SWAP
-/*
- * Called after __delete_from_swap_cache(); drops the "page" account.
- * memcg information is recorded in the swap_cgroup of "ent".
- */
-void
-mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
-{
-	struct mem_cgroup *memcg;
-	int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT;
-
-	if (!swapout) /* this was a swap cache but the swap is unused ! */
-		ctype = MEM_CGROUP_CHARGE_TYPE_DROP;
-
-	memcg = __mem_cgroup_uncharge_common(page, ctype, false);
-
-	/*
-	 * record memcg information; if swapout && memcg != NULL,
-	 * mem_cgroup_get() was called in uncharge().
-	 */
-	if (do_swap_account && swapout && memcg)
-		swap_cgroup_record(ent, css_id(&memcg->css));
-}
-#endif
-
-#ifdef CONFIG_MEMCG_SWAP
-/*
- * called from swap_entry_free(). remove record in swap_cgroup and
- * uncharge "memsw" account.
- */
-void mem_cgroup_uncharge_swap(swp_entry_t ent)
-{
-	struct mem_cgroup *memcg;
-	unsigned short id;
-
-	if (!do_swap_account)
-		return;
-
-	id = swap_cgroup_record(ent, 0);
-	rcu_read_lock();
-	memcg = mem_cgroup_lookup(id);
-	if (memcg) {
-		/*
-		 * We uncharge this because swap is freed.
-		 * This memcg can be an obsolete one. We avoid calling css_tryget
-		 */
-		if (!mem_cgroup_is_root(memcg))
-			page_counter_uncharge(&memcg->memsw, 1);
-		mem_cgroup_swap_statistics(memcg, false);
-		mem_cgroup_put(memcg);
-	}
-	rcu_read_unlock();
-}
-
+#ifdef CONFIG_MEMCG_SWAP
+static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
+					 bool charge)
+{
+	int val = (charge) ? 1 : -1;
+	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
+}
+
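+/*
+ * Illustrative sketch (not part of this patch): the SWAP counter updated
+ * above is typically summed over the hierarchy when reported, e.g.:
+ *
+ *	unsigned long swap = 0;
+ *	struct mem_cgroup *iter;
+ *
+ *	for_each_mem_cgroup_tree(iter, memcg)
+ *		swap += mem_cgroup_read_stat(iter, MEM_CGROUP_STAT_SWAP);
+ */
+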
 /**
  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
  * @entry: swap entry to be moved
@@ -4387,11 +3682,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
 		 * This function is only called from task migration context now.
 		 * It postpones page_counter and refcount handling till the end
 		 * of task migration(mem_cgroup_clear_mc()) for performance
-		 * improvement. But we cannot postpone mem_cgroup_get(to)
-		 * because if the process that has been moved to @to does
-		 * swap-in, the refcount of @to might be decreased to 0.
+		 * improvement. But we cannot postpone css_get(to) because if
+		 * the process that has been moved to @to does swap-in, the
+		 * refcount of @to might be decreased to 0.
+		 *
+		 * We are in the attach() phase, so the cgroup is guaranteed
+		 * to be alive, hence we can simply call css_get().
 		 */
-		mem_cgroup_get(to);
+		css_get(&to->css);
 		return 0;
 	}
 	return -EINVAL;
@@ -4404,175 +3702,6 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
 }
 #endif
 
-/*
- * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
- * page belongs to.
- */
-void mem_cgroup_prepare_migration(struct page *page, struct page *newpage,
-				  struct mem_cgroup **memcgp)
-{
-	struct mem_cgroup *memcg = NULL;
-	unsigned int nr_pages = 1;
-	struct page_cgroup *pc;
-	enum charge_type ctype;
-
-	*memcgp = NULL;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	if (PageTransHuge(page))
-		nr_pages <<= compound_order(page);
-
-	pc = lookup_page_cgroup(page);
-	lock_page_cgroup(pc);
-	if (PageCgroupUsed(pc)) {
-		memcg = pc->mem_cgroup;
-		css_get(&memcg->css);
-		/*
-		 * At migrating an anonymous page, its mapcount goes down
-		 * to 0 and uncharge() will be called. But, even if it's fully
-		 * unmapped, migration may fail and this page has to be
-		 * charged again. We set MIGRATION flag here and delay uncharge
-		 * until end_migration() is called
-		 *
-		 * Corner Case Thinking
-		 * A)
-		 * When the old page was mapped as Anon and it's unmapped and
-		 * freed while migration was ongoing.
-		 * If unmap finds the old page, its uncharge() will be delayed
-		 * until end_migration(). If unmap finds a new page, it's
-		 * uncharged when its mapcount goes from 1 to 0. If unmap code
-		 * finds a swap_migration_entry, the new page will not be mapped
-		 * and end_migration() will find it (mapcount == 0).
-		 *
-		 * B)
-		 * When the old page was mapped but migration fails, the kernel
-		 * remaps it. A charge for it is kept by MIGRATION flag even
-		 * if mapcount goes down to 0. We can do remap successfully
-		 * without charging it again.
-		 *
-		 * C)
-		 * The "old" page is under lock_page() until the end of
-		 * migration, so, the old page itself will not be swapped-out.
-		 * If the new page is swapped out before end_migration, our
-		 * hook to usual swap-out path will catch the event.
-		 */
-		if (PageAnon(page))
-			SetPageCgroupMigration(pc);
-	}
-	unlock_page_cgroup(pc);
-	/*
-	 * If the page is not charged at this point,
-	 * we return here.
-	 */
-	if (!memcg)
-		return;
-
-	*memcgp = memcg;
-	/*
-	 * We charge new page before it's used/mapped. So, even if unlock_page()
-	 * is called before end_migration, we can catch all events on this new
-	 * page. In the case new page is migrated but not remapped, new page's
-	 * mapcount will be finally 0 and we call uncharge in end_migration().
-	 */
-	if (PageAnon(page))
-		ctype = MEM_CGROUP_CHARGE_TYPE_ANON;
-	else
-		ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
-	/*
-	 * The page is committed to the memcg, but it's not actually
-	 * charged to the page_counter since we plan on replacing the
-	 * old one and only one page is going to be left afterwards.
-	 */
-	__mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false);
-}
-
-/* remove redundant charge if migration failed*/
-void mem_cgroup_end_migration(struct mem_cgroup *memcg,
-	struct page *oldpage, struct page *newpage, bool migration_ok)
-{
-	struct page *used, *unused;
-	struct page_cgroup *pc;
-	bool anon;
-
-	if (!memcg)
-		return;
-
-	if (!migration_ok) {
-		used = oldpage;
-		unused = newpage;
-	} else {
-		used = newpage;
-		unused = oldpage;
-	}
-	anon = PageAnon(used);
-	__mem_cgroup_uncharge_common(unused,
-				     anon ? MEM_CGROUP_CHARGE_TYPE_ANON
-				     : MEM_CGROUP_CHARGE_TYPE_CACHE,
-				     true);
-	css_put(&memcg->css);
-	/*
-	 * We disallowed uncharge of pages under migration because mapcount
-	 * of the page goes down to zero, temporarily.
-	 * Clear the flag and check whether the page should be charged.
-	 */
-	pc = lookup_page_cgroup(oldpage);
-	lock_page_cgroup(pc);
-	ClearPageCgroupMigration(pc);
-	unlock_page_cgroup(pc);
-
-	/*
-	 * If a page is a file cache, radix-tree replacement is very atomic
-	 * and we can skip this check. When it was an Anon page, its mapcount
-	 * goes down to 0. But because we added the MIGRATION flag, it's not
-	 * uncharged yet. There are several cases, but the page->mapcount check
-	 * and USED bit check in mem_cgroup_uncharge_page() will do enough
-	 * check. (see prepare_charge() also)
-	 */
-	if (anon)
-		mem_cgroup_uncharge_page(used);
-}
-
-/*
- * At replace page cache, newpage is not under any memcg but it's on
- * LRU. So, this function doesn't touch page_counter but handles LRU
- * in correct way. Both pages are locked so we cannot race with uncharge.
- */
-void mem_cgroup_replace_page_cache(struct page *oldpage,
-				  struct page *newpage)
-{
-	struct mem_cgroup *memcg = NULL;
-	struct page_cgroup *pc;
-	enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE;
-
-	if (mem_cgroup_disabled())
-		return;
-
-	pc = lookup_page_cgroup(oldpage);
-	/* fix accounting on old pages */
-	lock_page_cgroup(pc);
-	if (PageCgroupUsed(pc)) {
-		memcg = pc->mem_cgroup;
-		mem_cgroup_charge_statistics(memcg, oldpage, false, -1);
-		ClearPageCgroupUsed(pc);
-	}
-	unlock_page_cgroup(pc);
-
-	/*
-	 * When called from shmem_replace_page(), in some cases the
-	 * oldpage has already been charged, and in some cases not.
-	 */
-	if (!memcg)
-		return;
-	/*
-	 * Even if newpage->mapping was NULL before starting replacement,
-	 * the newpage may be on LRU(or pagevec for LRU) already. We lock
-	 * LRU while we overwrite pc->mem_cgroup.
-	 */
-	__mem_cgroup_commit_charge(memcg, newpage, 1, type, true);
-}
-
 #ifdef CONFIG_DEBUG_VM
 static struct page_cgroup *lookup_page_cgroup_used(struct page *page)
 {
@@ -4766,7 +3895,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 						    gfp_mask, &nr_scanned);
 		nr_reclaimed += reclaimed;
 		*total_scanned += nr_scanned;
-		spin_lock(&mctz->lock);
+		spin_lock_irq(&mctz->lock);
 
 		/*
 		 * If we failed to reclaim anything from this memory cgroup
@@ -4806,7 +3935,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
 		 */
 		/* If excess == 0, no tree ops */
 		__mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess);
-		spin_unlock(&mctz->lock);
+		spin_unlock_irq(&mctz->lock);
 		css_put(&mz->memcg->css);
 		loop++;
 		/*
@@ -4974,8 +4103,8 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
 		if (signal_pending(current))
 			return -EINTR;
 
-		progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL,
-						false);
+		progress = try_to_free_mem_cgroup_pages(memcg, SWAP_CLUSTER_MAX,
+							GFP_KERNEL, false);
 		if (!progress) {
 			nr_retries--;
 			/* maybe some writeback is necessary */
@@ -5049,18 +4178,15 @@ out:
 }
 
 
-static unsigned long tree_stat(struct mem_cgroup *memcg,
-			       enum mem_cgroup_stat_index idx)
+static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg,
+					       enum mem_cgroup_stat_index idx)
 {
 	struct mem_cgroup *iter;
-	long val = 0;
+	unsigned long val = 0;
 
-	/* Per-cpu values can be negative, use a signed accumulator */
 	for_each_mem_cgroup_tree(iter, memcg)
 		val += mem_cgroup_read_stat(iter, idx);
 
-	if (val < 0) /* race ? */
-		val = 0;
 	return val;
 }
 
@@ -5068,18 +4194,60 @@ static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
 {
 	u64 val;
 
-	if (mem_cgroup_is_root(memcg)) {
-		val = tree_stat(memcg, MEM_CGROUP_STAT_CACHE);
-		val += tree_stat(memcg, MEM_CGROUP_STAT_RSS);
-		if (swap)
-			val += tree_stat(memcg, MEM_CGROUP_STAT_SWAP);
-	} else {
+	if (!mem_cgroup_is_root(memcg)) {
 		if (!swap)
-			val = page_counter_read(&memcg->memory);
+			return page_counter_read(&memcg->memory);
 		else
-			val = page_counter_read(&memcg->memsw);
+			return page_counter_read(&memcg->memsw);
 	}
-	return val << PAGE_SHIFT;
+
+	/*
+	 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS
+	 * as well as in MEM_CGROUP_STAT_RSS_HUGE.
+	 */
+	val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS);
+
+	if (swap)
+		val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP);
+
+	return val;
+}
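+
+/*
+ * Note the unit change: mem_cgroup_usage() now returns pages, so callers
+ * scale the result themselves (illustrative, cf. mem_cgroup_read() below):
+ *
+ *	u64 bytes = mem_cgroup_usage(memcg, false) * PAGE_SIZE;
+ */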
+
+void mem_cgroup_fill_meminfo(struct mem_cgroup *memcg, struct meminfo *mi)
+{
+	int nid;
+
+	memset(&mi->pages, 0, sizeof(mi->pages));
+	for_each_online_node(nid)
+		mem_cgroup_get_nr_pages(memcg, nid, mi->pages);
+
+	mi->slab_reclaimable = mem_cgroup_recursive_stat(memcg,
+					MEM_CGROUP_STAT_SLAB_RECLAIMABLE);
+	mi->slab_unreclaimable = mem_cgroup_recursive_stat(memcg,
+					MEM_CGROUP_STAT_SLAB_UNRECLAIMABLE);
+	mi->cached = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+	mi->shmem = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+}
+
+int mem_cgroup_enough_memory(struct mem_cgroup *memcg, long pages)
+{
+	long free;
+
+	/* unused memory */
+	free = memcg->memsw.limit - page_counter_read(&memcg->memsw);
+
+	/* reclaimable slabs */
+	free += page_counter_read(&memcg->dcache);
+
+	/* assume file cache is reclaimable */
+	free += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE);
+
+	/*
+	 * Do not count shmem pages: they cannot be purged,
+	 * only swapped out.
+	 */
+	free -= mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SHMEM);
+
+	return free < pages ? -ENOMEM : 0;
 }
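+
+/*
+ * Illustrative caller-side check (hypothetical call site, not part of
+ * this patch):
+ *
+ *	err = mem_cgroup_enough_memory(memcg, nr_pages);
+ *	if (err)
+ *		return err;	(-ENOMEM: not enough reclaimable headroom)
+ */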
 
 enum {
@@ -5117,9 +4285,9 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 	switch (MEMFILE_ATTR(cft->private)) {
 	case RES_USAGE:
 		if (counter == &memcg->memory)
-			val = mem_cgroup_usage(memcg, false);
+			val = mem_cgroup_usage(memcg, false) * PAGE_SIZE;
 		else if (counter == &memcg->memsw)
-			val = mem_cgroup_usage(memcg, true);
+			val = mem_cgroup_usage(memcg, true) * PAGE_SIZE;
 		else
 			val = (u64)page_counter_read(counter) * PAGE_SIZE;
 		break;
@@ -5143,11 +4311,17 @@ static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
 }
 
-static int memcg_update_kmem_limit(struct cgroup *cont, unsigned long limit)
-{
-	int ret = -EINVAL;
 #ifdef CONFIG_MEMCG_KMEM
-	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+/* should be called with activate_kmem_mutex held */
+static int __memcg_activate_kmem(struct mem_cgroup *memcg,
+				 unsigned long nr_pages)
+{
+	int err = 0;
+	int memcg_id;
+
+	if (memcg_kmem_is_active(memcg))
+		return 0;
+
 	/*
 	 * For simplicity, we won't allow this to be disabled.  It also can't
 	 * be changed if the cgroup has children already, or if tasks had
@@ -5161,80 +4335,99 @@ static int memcg_update_kmem_limit(struct cgroup *cont, unsigned long limit)
 	 * of course permitted.
 	 */
 	mutex_lock(&memcg_create_mutex);
-	mutex_lock(&memcg_limit_mutex);
-	if (!memcg->kmem_account_flags && limit != PAGE_COUNTER_MAX) {
-		if (cgroup_task_count(cont) || memcg_has_children(memcg)) {
-			ret = -EBUSY;
-			goto out;
-		}
-		ret = page_counter_limit(&memcg->kmem, limit);
-		VM_BUG_ON(ret);
+	if (cgroup_task_count(memcg->css.cgroup) || memcg_has_children(memcg))
+		err = -EBUSY;
+	mutex_unlock(&memcg_create_mutex);
+	if (err)
+		goto out;
 
-		ret = memcg_update_cache_sizes(memcg);
-		if (ret) {
-			page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
-			goto out;
-		}
-		static_key_slow_inc(&memcg_kmem_enabled_key);
-		/*
-		 * setting the active bit after the inc will guarantee no one
-		 * starts accounting before all call sites are patched
-		 */
-		memcg_kmem_set_active(memcg);
+	memcg_id = memcg_alloc_cache_id();
+	if (memcg_id < 0) {
+		err = memcg_id;
+		goto out;
+	}
 
-		/*
-		 * kmem charges can outlive the cgroup. In the case of slab
-		 * pages, for instance, a page contain objects from various
-		 * processes, so it is unfeasible to migrate them away. We
-		 * need to reference count the memcg because of that.
-		 */
-		mem_cgroup_get(memcg);
-	} else
-		ret = page_counter_limit(&memcg->kmem, limit);
+	/*
+	 * We couldn't have accounted to this cgroup, because it hasn't been
+	 * activated yet, so this should succeed.
+	 */
+	err = page_counter_limit(&memcg->kmem, nr_pages);
+	VM_BUG_ON(err);
+
+	static_key_slow_inc(&memcg_kmem_enabled_key);
+	/*
+	 * A memory cgroup is considered kmem-active as soon as it gets
+	 * kmemcg_id. Setting the id after enabling the static branch
+	 * guarantees that no one starts accounting before all call sites
+	 * are patched.
+	 */
+	memcg->kmemcg_id = memcg_id;
+	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
 out:
+	return err;
+}
+
+static int memcg_activate_kmem(struct mem_cgroup *memcg,
+			       unsigned long nr_pages)
+{
+	int ret;
+
+	mutex_lock(&activate_kmem_mutex);
+	ret = __memcg_activate_kmem(memcg, nr_pages);
+	mutex_unlock(&activate_kmem_mutex);
+	return ret;
+}
+
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+				   unsigned long nr_pages)
+{
+	int ret;
+
+	mutex_lock(&memcg_limit_mutex);
+	if (!memcg_kmem_is_active(memcg))
+		ret = memcg_activate_kmem(memcg, nr_pages);
+	else
+		ret = page_counter_limit(&memcg->kmem, nr_pages);
 	mutex_unlock(&memcg_limit_mutex);
-	mutex_unlock(&memcg_create_mutex);
-#endif
 	return ret;
 }
 
-#ifdef CONFIG_MEMCG_KMEM
+static bool do_kmem_account = true;
+
+static int __init enable_kmem_account(char *s)
+{
+	if (!strcmp(s, "1"))
+		do_kmem_account = true;
+	else if (!strcmp(s, "0"))
+		do_kmem_account = false;
+	return 1;
+}
+__setup("kmemaccount=", enable_kmem_account);
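+
+/*
+ * Example (illustrative): booting with "kmemaccount=0" on the kernel
+ * command line keeps kmem accounting from being activated for new
+ * cgroups via memcg_propagate_kmem(); "kmemaccount=1" restores the
+ * default.
+ */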
+
 static int memcg_propagate_kmem(struct mem_cgroup *memcg)
 {
 	int ret = 0;
 	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-	if (!parent)
-		goto out;
 
-	memcg->kmem_account_flags = parent->kmem_account_flags;
-	/*
-	 * When that happens, we need to disable the static branch only on those
-	 * memcgs that enabled it. To achieve this, we would be forced to
-	 * complicate the code by keeping track of which memcgs were the ones
-	 * that actually enabled limits, and which ones got it from its
-	 * parents.
-	 *
-	 * It is a lot simpler just to do static_key_slow_inc() on every child
-	 * that is accounted.
-	 */
-	if (!memcg_kmem_is_active(memcg))
-		goto out;
+	if (!parent)
+		return 0;
 
+	mutex_lock(&activate_kmem_mutex);
 	/*
-	 * destroy(), called if we fail, will issue static_key_slow_inc() and
-	 * mem_cgroup_put() if kmem is enabled. We have to either call them
-	 * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
-	 * this more consistent, since it always leads to the same destroy path
+	 * If the parent cgroup is not kmem-active now, it cannot be activated
+	 * after this point, because it has at least one child already.
 	 */
-	mem_cgroup_get(memcg);
-	static_key_slow_inc(&memcg_kmem_enabled_key);
-
-	mutex_lock(&memcg_limit_mutex);
-	ret = memcg_update_cache_sizes(memcg);
-	mutex_unlock(&memcg_limit_mutex);
-out:
+	if (do_kmem_account || memcg_kmem_is_active(parent))
+		ret = __memcg_activate_kmem(memcg, PAGE_COUNTER_MAX);
+	mutex_unlock(&activate_kmem_mutex);
 	return ret;
 }
+#else
+static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
+				   unsigned long long val)
+{
+	return -EINVAL;
+}
 #endif /* CONFIG_MEMCG_KMEM */
 
 /*
@@ -5252,31 +4445,145 @@ static int mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 	if (ret)
 		return ret;
 
-	switch (MEMFILE_ATTR(cft->private)) {
-	case RES_LIMIT:
-		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
-			ret = -EINVAL;
-			break;
-		}
-		switch (MEMFILE_TYPE(cft->private)) {
-		case _MEM:
-			ret = mem_cgroup_resize_limit(memcg, nr_pages);
-			break;
-		case _MEMSWAP:
-			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
-			break;
-		case _KMEM:
-			ret = memcg_update_kmem_limit(cont, nr_pages);
-			break;
-		}
-		break;
-	case RES_SOFT_LIMIT:
-		memcg->soft_limit = nr_pages;
-		ret = 0;
-		break;
+	switch (MEMFILE_ATTR(cft->private)) {
+	case RES_LIMIT:
+		if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
+			ret = -EINVAL;
+			break;
+		}
+		switch (MEMFILE_TYPE(cft->private)) {
+		case _MEM:
+			ret = mem_cgroup_resize_limit(memcg, nr_pages);
+			break;
+		case _MEMSWAP:
+			ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
+			break;
+		case _KMEM:
+			ret = memcg_update_kmem_limit(memcg, nr_pages);
+			break;
+		}
+		break;
+	case RES_SOFT_LIMIT:
+		memcg->soft_limit = nr_pages;
+		ret = 0;
+		break;
+	}
+	return ret;
+}
+
+static ssize_t mem_cgroup_low_read(struct cgroup *cont, struct cftype *cft,
+				   struct file *file, char __user *buf,
+				   size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n",
+			((unsigned long long)memcg->low) << PAGE_SHIFT);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_low_write(struct cgroup *cont, struct cftype *cft,
+				const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages;
+	int ret;
+
+	ret = page_counter_memparse(buffer, &nr_pages);
+	if (ret)
+		return ret;
+
+	memcg->low = nr_pages;
+	return 0;
+}
+
+static ssize_t mem_cgroup_high_read(struct cgroup *cont, struct cftype *cft,
+				    struct file *file, char __user *buf,
+				    size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n",
+			((unsigned long long)memcg->high) << PAGE_SHIFT);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_high_write(struct cgroup *cont, struct cftype *cft,
+				 const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages, usage;
+	int ret;
+
+	ret = page_counter_memparse(buffer, &nr_pages);
+	if (ret)
+		return ret;
+
+	memcg->high = nr_pages;
+
+	usage = page_counter_read(&memcg->memory);
+	if (usage > nr_pages)
+		try_to_free_mem_cgroup_pages(memcg, usage - nr_pages,
+					     GFP_KERNEL, false);
+	return 0;
+}
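+
+/*
+ * Illustrative interface usage (cgroup mount point assumed; values are
+ * parsed by page_counter_memparse(), so memparse suffixes like "M" work):
+ *
+ *	# echo 512M > /sys/fs/cgroup/memory/grp/memory.low
+ *	# echo 1G > /sys/fs/cgroup/memory/grp/memory.high
+ *
+ * Writing memory.high below the current usage triggers direct reclaim of
+ * the excess, as implemented above.
+ */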
+
+static ssize_t mem_cgroup_oom_guarantee_read(struct cgroup *cont,
+		struct cftype *cft, struct file *file, char __user *buf,
+		size_t nbytes, loff_t *ppos)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	char str[64];
+	int len;
+
+	len = scnprintf(str, sizeof(str), "%llu\n",
+			((unsigned long long)memcg->oom_guarantee) << PAGE_SHIFT);
+	return simple_read_from_buffer(buf, nbytes, ppos, str, len);
+}
+
+static int mem_cgroup_oom_guarantee_write(struct cgroup *cont,
+		struct cftype *cft, const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages;
+	int ret;
+
+	ret = page_counter_memparse(buffer, &nr_pages);
+	if (ret)
+		return ret;
+
+	memcg->oom_guarantee = nr_pages;
+	return 0;
+}
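+
+/*
+ * Illustrative usage (path assumed; the guarantee acts as an OOM
+ * protection threshold, cf. the UB_OOMGUARPAGES mapping below):
+ *
+ *	# echo 256M > /sys/fs/cgroup/memory/ct1/memory.oom_guarantee
+ */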
+
+#ifdef CONFIG_CLEANCACHE
+static u64 mem_cgroup_disable_cleancache_read(struct cgroup *cgrp,
+					      struct cftype *cft)
+{
+	return mem_cgroup_from_cont(cgrp)->cleancache_disabled_toggle;
+}
+
+static int mem_cgroup_disable_cleancache_write(struct cgroup *cgrp,
+					       struct cftype *cft, u64 val)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp);
+	struct mem_cgroup *iter, *parent;
+
+	mutex_lock(&memcg_create_mutex);
+	memcg->cleancache_disabled_toggle = !!val;
+	for_each_mem_cgroup_tree(iter, memcg) {
+		parent = parent_mem_cgroup(iter);
+		iter->cleancache_disabled = parent->cleancache_disabled ||
+					iter->cleancache_disabled_toggle;
 	}
-	return ret;
+	mutex_unlock(&memcg_create_mutex);
+	return 0;
 }
+#endif
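+
+/*
+ * Illustrative semantics (path assumed): disabling cleancache on a parent
+ * disables it for the entire subtree, regardless of the children's own
+ * toggles:
+ *
+ *	# echo 1 > /sys/fs/cgroup/memory/parent/memory.disable_cleancache
+ *
+ * A child's cleancache_disabled is then forced on, even though its own
+ * toggle may still read 0.
+ */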
 
 static int mem_cgroup_reset(struct cgroup *cont, unsigned int event)
 {
@@ -5343,6 +4650,144 @@ static int mem_cgroup_move_charge_write(struct cgroup *cgrp,
 }
 #endif
 
+#ifdef CONFIG_BEANCOUNTERS
+
+#include <bc/beancounter.h>
+
+void mem_cgroup_sync_beancounter(struct mem_cgroup *memcg,
+				 struct user_beancounter *ub)
+{
+	struct mem_cgroup *mi;
+	unsigned long lim, held, maxheld;
+	volatile struct ubparm *k, *d, *p, *s, *o;
+
+	k = &ub->ub_parms[UB_KMEMSIZE];
+	d = &ub->ub_parms[UB_DCACHESIZE];
+	p = &ub->ub_parms[UB_PHYSPAGES];
+	s = &ub->ub_parms[UB_SWAPPAGES];
+	o = &ub->ub_parms[UB_OOMGUARPAGES];
+
+	p->held	= page_counter_read(&memcg->memory);
+	p->maxheld = memcg->memory.watermark;
+	p->failcnt = atomic_long_read(&memcg->mem_failcnt);
+	lim = memcg->memory.limit;
+	lim = lim >= PAGE_COUNTER_MAX ? UB_MAXVALUE :
+		min_t(unsigned long, lim, UB_MAXVALUE);
+	p->barrier = p->limit = lim;
+
+	/* TODO: oddly, kmemsize is reported in bytes here, not pages; verify. */
+	k->held = page_counter_read(&memcg->kmem) << PAGE_SHIFT;
+	k->maxheld = memcg->kmem.watermark << PAGE_SHIFT;
+	k->failcnt = memcg->kmem.failcnt << PAGE_SHIFT;
+	lim = memcg->kmem.limit << PAGE_SHIFT;
+	lim = lim >= (PAGE_COUNTER_MAX << PAGE_SHIFT) ? UB_MAXVALUE :
+		min_t(unsigned long long, lim, UB_MAXVALUE);
+	k->barrier = k->limit = lim;
+
+	d->held = page_counter_read(&memcg->dcache) << PAGE_SHIFT;
+	d->maxheld = memcg->dcache.watermark << PAGE_SHIFT;
+	d->failcnt = 0;
+	d->barrier = d->limit = UB_MAXVALUE;
+
+	held = page_counter_read(&memcg->memsw) -
+		page_counter_read(&memcg->memory);
+	maxheld = memcg->swap_max;
+	s->failcnt = atomic_long_read(&memcg->swap_failcnt);
+	lim = memcg->memsw.limit;
+	lim = lim >= PAGE_COUNTER_MAX ? UB_MAXVALUE :
+		min_t(unsigned long long, lim, UB_MAXVALUE);
+	if (lim != UB_MAXVALUE)
+		lim -= p->limit;
+	s->barrier = s->limit = lim;
+
+	/*
+	 * Due to global reclaim, memory.memsw.usage can be greater than
+	 * (memory.memsw.limit - memory.limit).
+	 */
+	s->held = min(held, lim);
+	s->maxheld = min(maxheld, lim);
+
+	o->held = page_counter_read(&memcg->memsw);
+	o->maxheld = memcg->memsw.watermark;
+	o->failcnt = atomic_long_read(&memcg->oom_kill_cnt);
+	lim = memcg->oom_guarantee;
+	lim = lim >= PAGE_COUNTER_MAX ? UB_MAXVALUE :
+		min_t(unsigned long long, lim >> PAGE_SHIFT, UB_MAXVALUE);
+	o->barrier = o->limit = lim;
+
+	ub->swapin = 0;
+	ub->swapout = 0;
+	for_each_mem_cgroup_tree(mi, memcg) {
+		ub->swapin += mem_cgroup_read_events(mi, MEM_CGROUP_EVENTS_PSWPIN);
+		ub->swapout += mem_cgroup_read_events(mi, MEM_CGROUP_EVENTS_PSWPOUT);
+	}
+}
+
+int mem_cgroup_apply_beancounter(struct mem_cgroup *memcg,
+				 struct user_beancounter *ub)
+{
+	unsigned long long mem, memsw, mem_old, memsw_old, oomguar;
+	int ret = 0;
+
+	if (mem_cgroup_is_root(memcg))
+		return -EPERM;
+
+	mem = ub->ub_parms[UB_PHYSPAGES].limit;
+	memsw = ub->ub_parms[UB_SWAPPAGES].limit;
+	if (memsw < PAGE_COUNTER_MAX - mem)
+		memsw += mem;
+	else
+		memsw = PAGE_COUNTER_MAX;
+
+	oomguar = ub->ub_parms[UB_OOMGUARPAGES].barrier;
+
+	if (ub->ub_parms[UB_KMEMSIZE].limit != UB_MAXVALUE)
+		pr_warn_once("ub: kmemsize limit is deprecated\n");
+	if (ub->ub_parms[UB_DCACHESIZE].limit != UB_MAXVALUE)
+		pr_warn_once("ub: dcachesize limit is deprecated\n");
+
+	/* activate kmem accounting */
+	ret = memcg_update_kmem_limit(memcg, PAGE_COUNTER_MAX);
+	if (ret)
+		goto out;
+
+	/* try change mem+swap before changing mem limit */
+	if (memcg->memsw.limit != memsw)
+		(void)mem_cgroup_resize_memsw_limit(memcg, memsw);
+
+	if (memcg->memory.limit != mem) {
+		ret = mem_cgroup_resize_limit(memcg, mem);
+		if (ret)
+			goto out;
+	}
+
+	mem_old = memcg->memory.limit;
+	memsw_old = memcg->memsw.limit;
+
+	if (mem != mem_old) {
+		/* first, reset memsw limit since it cannot be < mem limit */
+		if (memsw_old < PAGE_COUNTER_MAX) {
+			memsw_old = PAGE_COUNTER_MAX;
+			ret = mem_cgroup_resize_memsw_limit(memcg, memsw_old);
+			if (ret)
+				goto out;
+		}
+		ret = mem_cgroup_resize_limit(memcg, mem);
+		if (ret)
+			goto out;
+	}
+
+	if (memsw != memsw_old) {
+		ret = mem_cgroup_resize_memsw_limit(memcg, memsw);
+		if (ret)
+			goto out;
+	}
+
+	memcg->oom_guarantee = oomguar;
+out:
+	return ret;
+}
+
+#endif /* CONFIG_BEANCOUNTERS */
+
 #ifdef CONFIG_NUMA
 static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 				      struct seq_file *m)
@@ -5388,6 +4833,236 @@ static int memcg_numa_stat_show(struct cgroup *cont, struct cftype *cft,
 	seq_putc(m, '\n');
 	return 0;
 }
+
+/*
+ * memcg_numa_migrate_new_page() private argument. @target_nodes specifies the
+ * set of nodes to allocate pages from. @current_node is the currently
+ * preferred node; it gets rotated after each allocation.
+ */
+struct memcg_numa_migrate_struct {
+	nodemask_t *target_nodes;
+	int current_node;
+};
+
+/*
+ * Used as an argument for migrate_pages(). Allocated pages are spread evenly
+ * among destination nodes.
+ */
+static struct page *memcg_numa_migrate_new_page(struct page *page,
+				unsigned long private, int **result)
+{
+	struct memcg_numa_migrate_struct *ms = (void *)private;
+	gfp_t gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_NORETRY | __GFP_NOWARN;
+
+	ms->current_node = next_node(ms->current_node, *ms->target_nodes);
+	if (ms->current_node >= MAX_NUMNODES) {
+		ms->current_node = first_node(*ms->target_nodes);
+		VM_BUG_ON(ms->current_node >= MAX_NUMNODES);
+	}
+
+	return __alloc_pages_nodemask(gfp_mask, 0,
+			node_zonelist(ms->current_node, gfp_mask),
+			ms->target_nodes);
+}
+
+/*
+ * Isolate at most @nr_to_scan pages from @lruvec for further migration and
+ * store them in @dst. Returns the number of pages scanned; a return value
+ * of 0 means that @lruvec is empty.
+ */
+static long memcg_numa_isolate_pages(struct lruvec *lruvec, enum lru_list lru,
+				     long nr_to_scan, struct list_head *dst)
+{
+	struct list_head *src = &lruvec->lists[lru];
+	struct zone *zone = lruvec_zone(lruvec);
+	struct page *page;
+	long scanned = 0, taken = 0;
+
+	spin_lock_irq(&zone->lru_lock);
+	while (!list_empty(src) && scanned < nr_to_scan && taken < nr_to_scan) {
+		int nr_pages;
+		page = list_last_entry(src, struct page, lru);
+
+		scanned++;
+
+		switch (__isolate_lru_page(page, ISOLATE_ASYNC_MIGRATE)) {
+		case 0:
+			nr_pages = hpage_nr_pages(page);
+			mem_cgroup_update_lru_size(lruvec, lru, -nr_pages);
+			list_move(&page->lru, dst);
+			taken += nr_pages;
+			break;
+
+		case -EBUSY:
+			list_move(&page->lru, src);
+			continue;
+
+		default:
+			BUG();
+		}
+	}
+	__mod_zone_page_state(zone, NR_LRU_BASE + lru, -taken);
+	__mod_zone_page_state(zone, NR_ISOLATED_ANON + is_file_lru(lru), taken);
+	spin_unlock_irq(&zone->lru_lock);
+
+	list_for_each_entry(page, dst, lru) {
+		if (PageTransHuge(page) && split_huge_page_to_list(page, dst)) {
+			list_del(&page->lru);
+			mod_zone_page_state(zone, NR_ISOLATED_ANON,
+					HPAGE_PMD_NR);
+			putback_lru_page(page);
+		}
+	}
+
+	return scanned;
+}
+
+static long __memcg_numa_migrate_pages(struct lruvec *lruvec, enum lru_list lru,
+				       nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct memcg_numa_migrate_struct ms = {
+		.target_nodes = target_nodes,
+		.current_node = -1,
+	};
+	LIST_HEAD(pages);
+	long total_scanned = 0;
+
+	/*
+	 * If no limit on the maximal number of migrated pages is specified,
+	 * assume the caller wants to migrate them all.
+	 */
+	if (nr_to_scan < 0)
+		nr_to_scan = mem_cgroup_get_lru_size(lruvec, lru);
+
+	while (total_scanned < nr_to_scan) {
+		int ret;
+		long scanned;
+
+		scanned = memcg_numa_isolate_pages(lruvec, lru,
+						   SWAP_CLUSTER_MAX, &pages);
+		if (!scanned)
+			break;
+
+		ret = migrate_pages(&pages, memcg_numa_migrate_new_page,
+				    (unsigned long)&ms, MIGRATE_ASYNC,
+				    MR_SYSCALL);
+		putback_lru_pages(&pages);
+		if (ret < 0)
+			return ret;
+
+		if (signal_pending(current))
+			return -EINTR;
+
+		total_scanned += scanned;
+	}
+
+	return total_scanned;
+}
+
+/*
+ * Migrate at most @nr_to_scan pages accounted to @memcg to @target_nodes.
+ * Pages are spread evenly among destination nodes. If @nr_to_scan is <= 0,
+ * then the function will attempt to migrate all pages accounted to @memcg.
+ */
+static int memcg_numa_migrate_pages(struct mem_cgroup *memcg,
+				    nodemask_t *target_nodes, long nr_to_scan)
+{
+	struct mem_cgroup *mi;
+	long total_scanned = 0, scanned;
+
+again:
+	scanned = 0;
+	for_each_mem_cgroup_tree(mi, memcg) {
+		struct zone *zone;
+
+		for_each_populated_zone(zone) {
+			struct lruvec *lruvec;
+			enum lru_list lru;
+
+			if (node_isset(zone_to_nid(zone), *target_nodes))
+				continue;
+
+			lruvec = mem_cgroup_zone_lruvec(zone, mi);
+			/*
+			 * For the sake of simplicity, do not attempt to migrate
+			 * unevictable pages. It should be fine as long as there
+			 * aren't too many of them, which is usually true.
+			 */
+			for_each_evictable_lru(lru) {
+				long ret = __memcg_numa_migrate_pages(lruvec,
+						lru, target_nodes,
+						nr_to_scan > 0 ?
+						SWAP_CLUSTER_MAX : -1);
+				if (ret < 0) {
+					mem_cgroup_iter_break(memcg, mi);
+					return ret;
+				}
+				scanned += ret;
+			}
+		}
+	}
+
+	total_scanned += scanned;
+
+	/*
+	 * Retry only if we made progress in the previous iteration.
+	 */
+	if (nr_to_scan > 0 && scanned > 0 && total_scanned < nr_to_scan)
+		goto again;
+
+	return 0;
+}
+
+/*
+ * The format of memory.numa_migrate is
+ *
+ *   NODELIST[ MAX_SCAN]
+ *
+ * where NODELIST is a comma-separated list of ranges N1-N2 specifying the set
+ * of nodes to migrate pages of this cgroup to, and the optional MAX_SCAN
+ * imposes a limit on the number of pages that can be migrated in one go.
+ *
+ * The call may be interrupted by a signal, in which case -EINTR is returned.
+ */
+static int memcg_numa_migrate_write(struct cgroup *cont,
+		struct cftype *cft, const char *buf)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	NODEMASK_ALLOC(nodemask_t, target_nodes, GFP_KERNEL);
+	const char *nodes_str = buf, *nr_str;
+	long nr_to_scan = -1;
+	int ret = -ENOMEM;
+
+	if (!target_nodes)
+		goto out;
+
+	nr_str = strchr(buf, ' ');
+	if (nr_str) {
+		nodes_str = kstrndup(buf, nr_str - buf, GFP_KERNEL);
+		if (!nodes_str)
+			goto out;
+		nr_str += 1;
+	}
+
+	ret = nodelist_parse(nodes_str, *target_nodes);
+	if (ret)
+		goto out;
+
+	ret = -EINVAL;
+	if (!nodes_subset(*target_nodes, node_states[N_MEMORY]))
+		goto out;
+
+	if (nr_str && (kstrtol(nr_str, 10, &nr_to_scan) || nr_to_scan <= 0))
+		goto out;
+
+	ret = memcg_numa_migrate_pages(memcg, target_nodes, nr_to_scan);
+out:
+	if (nodes_str != buf)
+		kfree(nodes_str);
+	NODEMASK_FREE(target_nodes);
+	return ret;
+}
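+
+/*
+ * Illustrative usage (cgroup mount point assumed): migrate up to 1024
+ * pages of this group to nodes 0 and 1:
+ *
+ *	# echo "0-1 1024" > /sys/fs/cgroup/memory/grp/memory.numa_migrate
+ */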
+
 #endif /* CONFIG_NUMA */
 
 static inline void mem_cgroup_lru_names_not_uptodate(void)
@@ -5406,7 +5081,7 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
-		seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i],
+		seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
 			   mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
 	}
 
@@ -5431,13 +5106,13 @@ static int memcg_stat_show(struct cgroup *cont, struct cftype *cft,
 			   (u64)memsw * PAGE_SIZE);
 
 	for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
-		long long val = 0;
+		unsigned long long val = 0;
 
 		if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
 			continue;
 		for_each_mem_cgroup_tree(mi, memcg)
 			val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
-		seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val);
+		seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
 	}
 
 	for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
@@ -5584,16 +5259,18 @@ static int compare_thresholds(const void *a, const void *b)
 	return 0;
 }
 
+static DEFINE_SPINLOCK(memcg_oom_notify_lock);
+
 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
 {
 	struct mem_cgroup_eventfd_list *ev;
 
-	spin_lock(&memcg_oom_lock);
+	spin_lock(&memcg_oom_notify_lock);
 
 	list_for_each_entry(ev, &memcg->oom_notify, list)
 		eventfd_signal(ev->eventfd, 1);
 
-	spin_unlock(&memcg_oom_lock);
+	spin_unlock(&memcg_oom_notify_lock);
 	return 0;
 }
 
@@ -5780,7 +5457,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 	if (!event)
 		return -ENOMEM;
 
-	spin_lock(&memcg_oom_lock);
+	spin_lock(&memcg_oom_notify_lock);
 
 	event->eventfd = eventfd;
 	list_add(&event->list, &memcg->oom_notify);
@@ -5788,7 +5465,7 @@ static int mem_cgroup_oom_register_event(struct cgroup *cgrp,
 	/* already in OOM ? */
 	if (atomic_read(&memcg->under_oom))
 		eventfd_signal(eventfd, 1);
-	spin_unlock(&memcg_oom_lock);
+	spin_unlock(&memcg_oom_notify_lock);
 
 	return 0;
 }
@@ -5802,7 +5479,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 
 	BUG_ON(type != _OOM_TYPE);
 
-	spin_lock(&memcg_oom_lock);
+	spin_lock(&memcg_oom_notify_lock);
 
 	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
 		if (ev->eventfd == eventfd) {
@@ -5811,7 +5488,7 @@ static void mem_cgroup_oom_unregister_event(struct cgroup *cgrp,
 		}
 	}
 
-	spin_unlock(&memcg_oom_lock);
+	spin_unlock(&memcg_oom_notify_lock);
 }
 
 static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
@@ -5837,6 +5514,9 @@ static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
 	if (!cgrp->parent || !((val == 0) || (val == 1)))
 		return -EINVAL;
 
+	if (!ve_is_super(get_exec_env()) && val != 0)
+		return -EACCES;
+
 	memcg->oom_kill_disable = val;
 	if (!val)
 		memcg_oom_recover(memcg);
@@ -5849,7 +5529,6 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	int ret;
 
-	memcg->kmemcg_id = -1;
 	ret = memcg_propagate_kmem(memcg);
 	if (ret)
 		return ret;
@@ -5857,9 +5536,79 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return mem_cgroup_sockets_init(memcg, ss);
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
 {
+	if (test_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags)) {
+		list_del(&memcg->kmemcg_sharers);
+		memcg_destroy_kmem_caches(memcg);
+	}
 	mem_cgroup_sockets_destroy(memcg);
+}
+
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
+{
+	struct mem_cgroup *parent, *sharer;
+	int kmemcg_id;
+
+	if (!memcg_kmem_is_active(memcg))
+		return;
+
+	/*
+	 * Clear the 'active' flag before clearing memcg_caches arrays entries.
+	 * Since we take the slab_mutex in memcg_deactivate_kmem_caches(), it
+	 * guarantees no cache will be created for this cgroup after we are
+	 * done (see memcg_create_kmem_cache()).
+	 */
+	clear_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
+
+	memcg_deactivate_kmem_caches(memcg);
+
+	kmemcg_id = memcg->kmemcg_id;
+	BUG_ON(kmemcg_id < 0);
+
+	parent = parent_mem_cgroup(memcg);
+	if (!parent)
+		parent = root_mem_cgroup;
+
+	/*
+	 * Change kmemcg_id of this cgroup and all its descendants to the
+	 * parent's id, and then move all entries from this cgroup's list_lrus
+	 * to ones of the parent. After we have finished, all list_lrus
+	 * corresponding to this cgroup are guaranteed to remain empty. The
+	 * ordering is imposed by list_lru_node->lock taken by
+	 * memcg_drain_all_list_lrus().
+	 */
+	list_for_each_entry(sharer, &memcg->kmemcg_sharers, kmemcg_sharers) {
+		BUG_ON(sharer->kmemcg_id != kmemcg_id);
+		sharer->kmemcg_id = parent->kmemcg_id;
+	}
+	memcg->kmemcg_id = parent->kmemcg_id;
+	list_splice(&memcg->kmemcg_sharers, &parent->kmemcg_sharers);
+	list_add(&memcg->kmemcg_sharers, &parent->kmemcg_sharers);
+
+	memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);
+
+	memcg_free_cache_id(kmemcg_id);
+
+	/*
+	 * kmem charges can outlive the cgroup. In the case of slab
+	 * pages, for instance, a page may contain objects from various
+	 * processes. As we avoid taking a reference for every such
+	 * allocation, we have to be careful when doing uncharge
+	 * (see memcg_uncharge_kmem) and here during offlining.
+	 *
+	 * The idea is that only the _last_ uncharge which sees
+	 * the dead memcg will drop the last reference. An additional
+	 * reference is taken here before the group is marked dead
+	 * which is then paired with css_put during uncharge resp. here.
+	 *
+	 * Although this might sound strange, as this path is called from
+	 * css_offline() when the reference might have dropped down to 0
+	 * and shouldn't be incremented anymore (css_tryget would fail),
+	 * we do not have other options because of the kmem allocations
+	 * lifetime.
+	 */
+	css_get(&memcg->css);
 
 	memcg_kmem_mark_dead(memcg);
 
@@ -5873,7 +5622,7 @@ static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
 	 * page_counter read, so in that case, we don't need the put
 	 */
 	if (memcg_kmem_test_and_clear_dead(memcg))
-		mem_cgroup_put(memcg);
+		css_put(&memcg->css);
 }
 #else
 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5881,7 +5630,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	return 0;
 }
 
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+{
+}
+
+static void memcg_deactivate_kmem(struct mem_cgroup *memcg)
 {
 }
 #endif
@@ -5913,6 +5666,18 @@ static struct cftype mem_cgroup_files[] = {
 		.read = mem_cgroup_read,
 	},
 	{
+		.name = "low",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_low_write,
+		.read = mem_cgroup_low_read,
+	},
+	{
+		.name = "high",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_high_write,
+		.read = mem_cgroup_high_read,
+	},
+	{
 		.name = "failcnt",
 		.private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
 		.trigger = mem_cgroup_reset,
@@ -5928,7 +5693,7 @@ static struct cftype mem_cgroup_files[] = {
 	},
 	{
 		.name = "use_hierarchy",
-		.flags = CFTYPE_INSANE,
+		.flags = CFTYPE_INSANE | CFTYPE_VE_WRITABLE,
 		.write_u64 = mem_cgroup_hierarchy_write,
 		.read_u64 = mem_cgroup_hierarchy_read,
 	},
@@ -5951,6 +5716,12 @@ static struct cftype mem_cgroup_files[] = {
 		.private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
 	},
 	{
+		.name = "oom_guarantee",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = mem_cgroup_oom_guarantee_write,
+		.read = mem_cgroup_oom_guarantee_read,
+	},
+	{
 		.name = "pressure_level",
 		.register_event = vmpressure_register_event,
 		.unregister_event = vmpressure_unregister_event,
@@ -5960,6 +5731,19 @@ static struct cftype mem_cgroup_files[] = {
 		.name = "numa_stat",
 		.read_seq_string = memcg_numa_stat_show,
 	},
+	{
+		.name = "numa_migrate",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.write_string = memcg_numa_migrate_write,
+	},
+#endif
+#ifdef CONFIG_CLEANCACHE
+	{
+		.name = "disable_cleancache",
+		.flags = CFTYPE_NOT_ON_ROOT,
+		.read_u64 = mem_cgroup_disable_cleancache_read,
+		.write_u64 = mem_cgroup_disable_cleancache_write,
+	},
 #endif
 #ifdef CONFIG_MEMCG_KMEM
 	{
@@ -6063,14 +5847,12 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
 	struct mem_cgroup *memcg;
-	size_t size = memcg_size();
+	size_t size;
 
-	/* Can be very big if nr_node_ids is very big */
-	if (size < PAGE_SIZE)
-		memcg = kzalloc(size, GFP_KERNEL);
-	else
-		memcg = vzalloc(size);
+	size = sizeof(struct mem_cgroup);
+	size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
 
+	memcg = kzalloc(size, GFP_KERNEL);
 	if (!memcg)
 		return NULL;
 
@@ -6081,10 +5863,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
 	return memcg;
 
 out_free:
-	if (size < PAGE_SIZE)
-		kfree(memcg);
-	else
-		vfree(memcg);
+	kfree(memcg);
 	return NULL;
 }
 
@@ -6102,7 +5881,6 @@ out_free:
 static void __mem_cgroup_free(struct mem_cgroup *memcg)
 {
 	int node;
-	size_t size = memcg_size();
 
 	mem_cgroup_remove_from_trees(memcg);
 	free_css_id(&mem_cgroup_subsys, &memcg->css);
@@ -6124,53 +5902,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
 	 * the cgroup_lock.
 	 */
 	disarm_static_keys(memcg);
-	if (size < PAGE_SIZE)
-		kfree(memcg);
-	else
-		vfree(memcg);
-}
-
-
-/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context.  The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
- */
-static void free_work(struct work_struct *work)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(work, struct mem_cgroup, work_freeing);
-	__mem_cgroup_free(memcg);
-}
-
-static void free_rcu(struct rcu_head *rcu_head)
-{
-	struct mem_cgroup *memcg;
-
-	memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
-	INIT_WORK(&memcg->work_freeing, free_work);
-	schedule_work(&memcg->work_freeing);
-}
-
-static void mem_cgroup_get(struct mem_cgroup *memcg)
-{
-	atomic_inc(&memcg->refcnt);
-}
-
-static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
-{
-	if (atomic_sub_and_test(count, &memcg->refcnt)) {
-		struct mem_cgroup *parent = parent_mem_cgroup(memcg);
-		call_rcu(&memcg->rcu_freeing, free_rcu);
-		if (parent)
-			mem_cgroup_put(parent);
-	}
-}
-
-static void mem_cgroup_put(struct mem_cgroup *memcg)
-{
-	__mem_cgroup_put(memcg, 1);
+	kfree(memcg);
 }
 
 /*
@@ -6227,17 +5959,23 @@ mem_cgroup_css_alloc(struct cgroup *cont)
 		root_mem_cgroup = memcg;
 		page_counter_init(&memcg->memory, NULL);
 		memcg->soft_limit = PAGE_COUNTER_MAX;
+		memcg->high = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, NULL);
 		page_counter_init(&memcg->kmem, NULL);
+		page_counter_init(&memcg->dcache, NULL);
 	}
 
 	memcg->last_scanned_node = MAX_NUMNODES;
 	INIT_LIST_HEAD(&memcg->oom_notify);
-	atomic_set(&memcg->refcnt, 1);
 	memcg->move_charge_at_immigrate = 0;
 	mutex_init(&memcg->thresholds_lock);
 	spin_lock_init(&memcg->move_lock);
 	vmpressure_init(&memcg->vmpressure);
+	init_oom_context(&memcg->oom_ctx);
+#ifdef CONFIG_MEMCG_KMEM
+	memcg->kmemcg_id = -1;
+	INIT_LIST_HEAD(&memcg->kmemcg_sharers);
+#endif
 
 	return &memcg->css;
 
@@ -6250,7 +5988,6 @@ static int
 mem_cgroup_css_online(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg, *parent;
-	int error = 0;
 
 	if (!cont->parent)
 		return 0;
@@ -6262,25 +5999,29 @@ mem_cgroup_css_online(struct cgroup *cont)
 	memcg->use_hierarchy = parent->use_hierarchy;
 	memcg->oom_kill_disable = parent->oom_kill_disable;
 	memcg->swappiness = mem_cgroup_swappiness(parent);
+#ifdef CONFIG_CLEANCACHE
+	memcg->cleancache_disabled = parent->cleancache_disabled;
+#endif
 
 	if (parent->use_hierarchy) {
 		page_counter_init(&memcg->memory, &parent->memory);
 		memcg->soft_limit = PAGE_COUNTER_MAX;
+		memcg->high = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, &parent->memsw);
 		page_counter_init(&memcg->kmem, &parent->kmem);
+		page_counter_init(&memcg->dcache, &parent->dcache);
 
 		/*
-		 * We increment refcnt of the parent to ensure that we can
-		 * safely access it on page_counter_charge/uncharge.
-		 * This refcnt will be decremented when freeing this
-		 * mem_cgroup(see mem_cgroup_put).
+		 * No need to take a reference to the parent because cgroup
+		 * core guarantees its existence.
 		 */
-		mem_cgroup_get(parent);
 	} else {
 		page_counter_init(&memcg->memory, NULL);
 		memcg->soft_limit = PAGE_COUNTER_MAX;
+		memcg->high = PAGE_COUNTER_MAX;
 		page_counter_init(&memcg->memsw, NULL);
 		page_counter_init(&memcg->kmem, NULL);
+		page_counter_init(&memcg->dcache, NULL);
 		/*
 		 * Deeper hierarchy with use_hierarchy == false doesn't make
 		 * much sense so let cgroup subsystem know about this
@@ -6289,10 +6030,9 @@ mem_cgroup_css_online(struct cgroup *cont)
 		if (parent != root_mem_cgroup)
 			mem_cgroup_subsys.broken_hierarchy = true;
 	}
-
-	error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
 	mutex_unlock(&memcg_create_mutex);
-	return error;
+
+	return memcg_init_kmem(memcg, &mem_cgroup_subsys);
 }
 
 /*
@@ -6318,6 +6058,17 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 	struct cgroup *iter;
 
+	/*
+	 * Mark memory cgroup as offline before going to reparent charges.
+	 * This guarantees that __mem_cgroup_try_charge() either charges before
+	 * reparenting starts or doesn't charge at all, hence we won't have
+	 * pending user memory charges after reparenting is done.
+	 */
+	memcg->is_offline = true;
+	smp_mb();
+
+	memcg_deactivate_kmem(memcg);
+
 	mem_cgroup_invalidate_reclaim_iterators(memcg);
 
 	/*
@@ -6333,70 +6084,101 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
 	rcu_read_unlock();
 	mem_cgroup_reparent_charges(memcg);
 
-	mem_cgroup_destroy_all_caches(memcg);
+	/*
+	 * A cgroup can be destroyed while somebody is waiting for its
+	 * oom context, in which case the context will never be unlocked
+	 * from oom_unlock, because the latter only iterates over live
+	 * cgroups. So we need to release the context now, when one can
+	 * no longer iterate over it.
+	 */
+	release_oom_context(&memcg->oom_ctx);
 }
 
 static void mem_cgroup_css_free(struct cgroup *cont)
 {
 	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
 
-	kmem_cgroup_destroy(memcg);
+	/*
+	 * XXX: css_offline() would be where we should reparent all
+	 * memory to prepare the cgroup for destruction.  However,
+	 * memcg does not do css_tryget() and res_counter charging
+	 * under the same RCU lock region, which means that charging
+	 * could race with offlining.  Offlining only happens to
+	 * cgroups with no tasks in them but charges can show up
+	 * without any tasks from the swapin path when the target
+	 * memcg is looked up from the swapout record and not from the
+	 * current task as it usually is.  A race like this can leak
+	 * charges and put pages with stale cgroup pointers into
+	 * circulation:
+	 *
+	 * #0                        #1
+	 *                           lookup_swap_cgroup_id()
+	 *                           rcu_read_lock()
+	 *                           mem_cgroup_lookup()
+	 *                           css_tryget()
+	 *                           rcu_read_unlock()
+	 * disable css_tryget()
+	 * call_rcu()
+	 *   offline_css()
+	 *     reparent_charges()
+	 *                           page_counter_charge()
+	 *                           css_put()
+	 *                             css_free()
+	 *                           pc->mem_cgroup = dead memcg
+	 *                           add page to lru
+	 *
+	 * The bulk of the charges are still moved in offline_css() to
+	 * avoid pinning a lot of pages in case a long-term reference
+	 * like a swapout record is deferring the css_free() to long
+	 * after offlining.  But this makes sure we catch any charges
+	 * made after offlining:
+	 */
+	mem_cgroup_reparent_charges(memcg);
 
-	mem_cgroup_put(memcg);
+	memcg_destroy_kmem(memcg);
+	__mem_cgroup_free(memcg);
 }
 
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
-#define PRECHARGE_COUNT_AT_ONCE	256
 static int mem_cgroup_do_precharge(unsigned long count)
 {
 	int ret = 0;
-	int batch_count = PRECHARGE_COUNT_AT_ONCE;
-	struct mem_cgroup *memcg = mc.to;
 
-	if (mem_cgroup_is_root(memcg)) {
+	if (mem_cgroup_is_root(mc.to)) {
 		mc.precharge += count;
 		/* we don't need css_get for root */
 		return ret;
 	}
-	/* try to charge at once */
-	if (count > 1) {
-		struct page_counter *dummy;
-		/*
-		 * "memcg" cannot be under rmdir() because we've already checked
-		 * by cgroup_lock_live_cgroup() that it is not removed and we
-		 * are still under the same cgroup_mutex. So we can postpone
-		 * css_get().
-		 */
-		if (page_counter_try_charge(&memcg->memory, count, &dummy))
-			goto one_by_one;
-		if (do_swap_account &&
-		    page_counter_try_charge(&memcg->memsw, count, &dummy)) {
-			page_counter_uncharge(&memcg->memory, count);
-			goto one_by_one;
-		}
+
+	/* Try a single bulk charge without reclaim first */
+	ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+	if (!ret) {
 		mc.precharge += count;
 		return ret;
 	}
-one_by_one:
-	/* fall back to one by one charge */
+	if (ret == -EINTR) {
+		cancel_charge(root_mem_cgroup, count);
+		return ret;
+	}
+
+	/* Try charges one by one with reclaim */
 	while (count--) {
-		if (signal_pending(current)) {
-			ret = -EINTR;
-			break;
-		}
-		if (!batch_count--) {
-			batch_count = PRECHARGE_COUNT_AT_ONCE;
-			cond_resched();
-		}
-		ret = __mem_cgroup_try_charge(NULL,
-					GFP_KERNEL, 1, &memcg, false);
+		ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
+		/*
+		 * In case of failure, any residual charges against
+		 * mc.to will be dropped by mem_cgroup_clear_mc()
+		 * later on.  However, cancel any charges that are
+		 * bypassed to root right away or they'll be lost.
+		 */
+		if (ret == -EINTR)
+			cancel_charge(root_mem_cgroup, 1);
 		if (ret)
-			/* mem_cgroup_clear_mc() will do uncharge later */
 			return ret;
 		mc.precharge++;
+		cond_resched();
 	}
-	return ret;
+	return 0;
 }
 
 /**
@@ -6488,10 +6270,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
 		return NULL;
 
 	mapping = vma->vm_file->f_mapping;
-	if (pte_none(ptent))
-		pgoff = linear_page_index(vma, addr);
-	else /* pte_file(ptent) is true */
-		pgoff = pte_to_pgoff(ptent);
+	pgoff = linear_page_index(vma, addr);
 
 	/* page is moved even if it's not RSS of this task(page-faulted). */
 #ifdef CONFIG_SWAP
@@ -6524,7 +6303,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 		page = mc_handle_present_pte(vma, addr, ptent);
 	else if (is_swap_pte(ptent))
 		page = mc_handle_swap_pte(vma, addr, ptent, &ent);
-	else if (pte_none(ptent) || pte_file(ptent))
+	else if (pte_none(ptent))
 		page = mc_handle_file_pte(vma, addr, ptent, &ent);
 
 	if (!page && !ent.val)
@@ -6532,9 +6311,9 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
 	if (page) {
 		pc = lookup_page_cgroup(page);
 		/*
-		 * Do only loose check w/o page_cgroup lock.
-		 * mem_cgroup_move_account() checks the pc is valid or not under
-		 * the lock.
+		 * Do only loose check w/o serialization.
+		 * mem_cgroup_move_account() checks whether the pc is
+		 * valid under LRU exclusion.
 		 */
 		if (PageCgroupUsed(pc) && pc->mem_cgroup == mc.from) {
 			ret = MC_TARGET_PAGE;
@@ -6655,10 +6434,11 @@ static void __mem_cgroup_clear_mc(void)
 {
 	struct mem_cgroup *from = mc.from;
 	struct mem_cgroup *to = mc.to;
+	int i;
 
 	/* we must uncharge all the leftover precharges from mc.to */
 	if (mc.precharge) {
-		__mem_cgroup_cancel_charge(mc.to, mc.precharge);
+		cancel_charge(mc.to, mc.precharge);
 		mc.precharge = 0;
 	}
 	/*
@@ -6666,7 +6446,7 @@ static void __mem_cgroup_clear_mc(void)
 	 * we must uncharge here.
 	 */
 	if (mc.moved_charge) {
-		__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
+		cancel_charge(mc.from, mc.moved_charge);
 		mc.moved_charge = 0;
 	}
 	/* we must fixup refcnts and charges */
@@ -6675,6 +6455,9 @@ static void __mem_cgroup_clear_mc(void)
 		if (!mem_cgroup_is_root(mc.from))
 			page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
 
+		for (i = 0; i < mc.moved_swap; i++)
+			css_put(&mc.from->css);
+
 		if (!mem_cgroup_is_root(mc.to)) {
 			/*
 			 * we charged both to->memory and to->memsw, so we
@@ -6682,11 +6465,13 @@ static void __mem_cgroup_clear_mc(void)
 			 */
 			page_counter_uncharge(&mc.to->memory, mc.moved_swap);
 		}
-		__mem_cgroup_put(mc.from, mc.moved_swap);
-
-		/* we've already done mem_cgroup_get(mc.to) */
+		/* we've already done css_get(mc.to) */
 		mc.moved_swap = 0;
 	}
+	if (do_swap_account) {
+		mem_cgroup_update_swap_max(from);
+		mem_cgroup_update_swap_max(to);
+	}
 	memcg_oom_recover(from);
 	memcg_oom_recover(to);
 	wake_up_all(&mc.waitq);
@@ -6996,6 +6781,421 @@ static void __init enable_swap_cgroup(void)
 }
 #endif
 
+#ifdef CONFIG_MEMCG_SWAP
+/**
+ * mem_cgroup_swapout - transfer a memsw charge to swap
+ * @page: page whose memsw charge to transfer
+ * @entry: swap entry to move the charge to
+ *
+ * Transfer the memsw charge of @page to @entry.
+ */
+void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
+{
+	struct page_cgroup *pc;
+	unsigned short oldid;
+
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+	VM_BUG_ON_PAGE(page_count(page), page);
+
+	if (!do_swap_account)
+		return;
+
+	pc = lookup_page_cgroup(page);
+
+	/* Readahead page, never charged */
+	if (!PageCgroupUsed(pc))
+		return;
+
+	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEMSW), page);
+
+	oldid = swap_cgroup_record(entry, css_id(&pc->mem_cgroup->css));
+	VM_BUG_ON_PAGE(oldid, page);
+
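+	/*
+	 * Only the memsw half of the charge moves to the swap record;
+	 * the memory charge stays until the page itself is uncharged.
+	 * The css reference taken here pins the memcg until the entry
+	 * is finally released in mem_cgroup_uncharge_swap().
+	 */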
+	pc->flags &= ~PCG_MEMSW;
+	css_get(&pc->mem_cgroup->css);
+	mem_cgroup_swap_statistics(pc->mem_cgroup, true);
+}
+
+/**
+ * mem_cgroup_uncharge_swap - uncharge a swap entry
+ * @entry: swap entry to uncharge
+ *
+ * Drop the memsw charge associated with @entry.
+ */
+void mem_cgroup_uncharge_swap(swp_entry_t entry)
+{
+	struct mem_cgroup *memcg;
+	unsigned short id;
+
+	if (!do_swap_account)
+		return;
+
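+	/* Atomically clear the record, retrieving the previous owner's id. */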
+	id = swap_cgroup_record(entry, 0);
+	rcu_read_lock();
+	memcg = mem_cgroup_lookup(id);
+	if (memcg) {
+		if (!mem_cgroup_is_root(memcg))
+			page_counter_uncharge(&memcg->memsw, 1);
+		mem_cgroup_swap_statistics(memcg, false);
+		css_put(&memcg->css);
+	}
+	rcu_read_unlock();
+}
+#endif
+
+/**
+ * mem_cgroup_try_charge - try charging a page
+ * @page: page to charge
+ * @mm: mm context of the victim
+ * @gfp_mask: reclaim mode
+ * @memcgp: charged memcg return
+ *
+ * Try to charge @page to the memcg that @mm belongs to, reclaiming
+ * pages according to @gfp_mask if necessary.
+ *
+ * Returns 0 on success, with *@memcgp pointing to the charged memcg.
+ * Otherwise, an error code is returned.
+ *
+ * After page->mapping has been set up, the caller must finalize the
+ * charge with mem_cgroup_commit_charge().  Or abort the transaction
+ * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ */
+int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
+			  gfp_t gfp_mask, struct mem_cgroup **memcgp)
+{
+	struct mem_cgroup *memcg = NULL;
+	unsigned int nr_pages = 1;
+	int ret = 0;
+
+	if (mem_cgroup_disabled())
+		goto out;
+
+	if (PageSwapCache(page)) {
+		struct page_cgroup *pc = lookup_page_cgroup(page);
+		/*
+		 * Every swap fault against a single page tries to charge the
+		 * page, so bail out as early as possible.  shmem_unuse() encounters
+		 * already charged pages, too.  The USED bit is protected by
+		 * the page lock, which serializes swap cache removal, which
+		 * in turn serializes uncharging.
+		 */
+		if (PageCgroupUsed(pc))
+			goto out;
+	}
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	if (do_swap_account && PageSwapCache(page))
+		memcg = try_get_mem_cgroup_from_page(page);
+	if (!memcg)
+		memcg = get_mem_cgroup_from_mm(mm);
+
+	ret = try_charge(memcg, gfp_mask, nr_pages);
+
+	css_put(&memcg->css);
+
+	if (ret == -EINTR) {
+		memcg = root_mem_cgroup;
+		ret = 0;
+	}
+out:
+	*memcgp = memcg;
+	return ret;
+}
+
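+/*
+ * Sketch of the full transaction (modelled on do_anonymous_page()
+ * below; locking and error handling trimmed):
+ *
+ *	struct mem_cgroup *memcg;
+ *
+ *	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
+ *		goto oom;
+ *	...instantiate the page...
+ *	page_add_new_anon_rmap(page, vma, address);
+ *	mem_cgroup_commit_charge(page, memcg, false);
+ *	lru_cache_add_active_or_unevictable(page, vma);
+ *
+ * and, if instantiation fails after a successful try_charge:
+ *
+ *	mem_cgroup_cancel_charge(page, memcg);
+ */
+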
+/**
+ * mem_cgroup_commit_charge - commit a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ * @lrucare: page might be on LRU already
+ *
+ * Finalize a charge transaction started by mem_cgroup_try_charge(),
+ * after page->mapping has been set up.  This must happen atomically
+ * as part of the page instantiation, i.e. under the page table lock
+ * for anonymous pages, under the page lock for page and swap cache.
+ *
+ * In addition, the page must not be on the LRU during the commit, to
+ * prevent racing with task migration.  If it might be, use @lrucare.
+ *
+ * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
+ */
+void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
+			      bool lrucare)
+{
+	unsigned int nr_pages = 1;
+
+	VM_BUG_ON_PAGE(!page->mapping, page);
+	VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * Swap faults will attempt to charge the same page multiple
+	 * times.  But reuse_swap_page() might have removed the page
+	 * from swapcache already, so we can't check PageSwapCache().
+	 */
+	if (!memcg)
+		return;
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	commit_charge(page, memcg, nr_pages, lrucare);
+
+	if (do_swap_account && PageSwapCache(page)) {
+		swp_entry_t entry = { .val = page_private(page) };
+		/*
+		 * The swap entry might not get freed for a long time,
+		 * let's not wait for it.  The page already received a
+		 * memory+swap charge, drop the swap entry duplicate.
+		 */
+		mem_cgroup_uncharge_swap(entry);
+	}
+}
+
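+/*
+ * For example, do_swap_page() below commits with @lrucare set when it
+ * maps a page that is still in the swap cache (and may already be on
+ * the LRU), while a fresh ksm-created copy is committed with @lrucare
+ * false and only then added to the LRU.
+ */
+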
+/**
+ * mem_cgroup_cancel_charge - cancel a page charge
+ * @page: page to charge
+ * @memcg: memcg to charge the page to
+ *
+ * Cancel a charge transaction started by mem_cgroup_try_charge().
+ */
+void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
+{
+	unsigned int nr_pages = 1;
+
+	if (mem_cgroup_disabled())
+		return;
+	/*
+	 * Swap faults will attempt to charge the same page multiple
+	 * times.  But reuse_swap_page() might have removed the page
+	 * from swapcache already, so we can't check PageSwapCache().
+	 */
+	if (!memcg)
+		return;
+
+	if (PageTransHuge(page)) {
+		nr_pages <<= compound_order(page);
+		VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+	}
+
+	cancel_charge(memcg, nr_pages);
+}
+
+static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
+			   unsigned long nr_mem, unsigned long nr_memsw,
+			   unsigned long nr_anon, unsigned long nr_file,
+			   unsigned long nr_huge, unsigned long nr_kmem,
+			   unsigned long nr_shmem, struct page *dummy_page)
+{
+	unsigned long flags;
+
+	if (!mem_cgroup_is_root(memcg)) {
+		if (nr_mem + nr_kmem)
+			page_counter_uncharge(&memcg->memory, nr_mem + nr_kmem);
+		if (nr_memsw + nr_kmem)
+			page_counter_uncharge(&memcg->memsw, nr_memsw + nr_kmem);
+		if (nr_kmem)
+			page_counter_uncharge(&memcg->kmem, nr_kmem);
+
+		memcg_oom_recover(memcg);
+	}
+
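+	/*
+	 * The __this_cpu statistics updates and the event check must not
+	 * race with another updater on this cpu, so run the whole batch
+	 * with interrupts disabled.
+	 */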
+	local_irq_save(flags);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
+	__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_SHMEM], nr_shmem);
+	__this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
+	__this_cpu_add(memcg->stat->nr_page_events, nr_anon + nr_file);
+	memcg_check_events(memcg, dummy_page);
+	local_irq_restore(flags);
+}
+
+static void uncharge_list(struct list_head *page_list)
+{
+	struct mem_cgroup *memcg = NULL;
+	unsigned long nr_memsw = 0;
+	unsigned long nr_anon = 0;
+	unsigned long nr_file = 0;
+	unsigned long nr_huge = 0;
+	unsigned long nr_kmem = 0;
+	unsigned long pgpgout = 0;
+	unsigned long nr_mem = 0;
+	unsigned long nr_shmem = 0;
+	struct list_head *next;
+	struct page *page;
+
+	next = page_list->next;
+	do {
+		unsigned int nr_pages = 1;
+		struct page_cgroup *pc;
+
+		page = list_entry(next, struct page, lru);
+		next = page->lru.next;
+
+		VM_BUG_ON_PAGE(PageLRU(page), page);
+		VM_BUG_ON_PAGE(page_count(page), page);
+
+		pc = lookup_page_cgroup(page);
+		if (!PageCgroupUsed(pc))
+			continue;
+
+		/*
+		 * Nobody should be changing or seriously looking at
+		 * pc->mem_cgroup and pc->flags at this point; we have
+		 * fully exclusive access to the page.
+		 */
+
+		if (memcg != pc->mem_cgroup) {
+			if (memcg) {
+				uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
+					nr_anon, nr_file, nr_huge, nr_kmem,
+					nr_shmem, page);
+				pgpgout = nr_mem = nr_memsw = nr_kmem = 0;
+				nr_anon = nr_file = nr_huge = nr_shmem = 0;
+			}
+			memcg = pc->mem_cgroup;
+		}
+
+		if (!PageKmemcg(page)) {
+			if (PageTransHuge(page)) {
+				nr_pages <<= compound_order(page);
+				VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+				nr_huge += nr_pages;
+			}
+			if (PageAnon(page))
+				nr_anon += nr_pages;
+			else {
+				if (PageSwapBacked(page))
+					nr_shmem += nr_pages;
+				nr_file += nr_pages;
+			}
+			pgpgout++;
+		} else {
+			nr_kmem += 1 << compound_order(page);
+			__ClearPageKmemcg(page);
+		}
+
+		if (pc->flags & PCG_MEM)
+			nr_mem += nr_pages;
+		if (pc->flags & PCG_MEMSW)
+			nr_memsw += nr_pages;
+		pc->flags = 0;
+	} while (next != page_list);
+
+	if (memcg)
+		uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, nr_anon,
+				nr_file, nr_huge, nr_kmem, nr_shmem, page);
+}
+
+/**
+ * mem_cgroup_uncharge - uncharge a page
+ * @page: page to uncharge
+ *
+ * Uncharge a page previously charged with mem_cgroup_try_charge() and
+ * mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge(struct page *page)
+{
+	struct page_cgroup *pc;
+
+	if (mem_cgroup_disabled())
+		return;
+
+	/* Don't touch page->lru of any random page, pre-check: */
+	pc = lookup_page_cgroup(page);
+	if (!PageCgroupUsed(pc))
+		return;
+
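+	/* Reuse the list path: page->lru doubles as a one-entry list. */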
+	INIT_LIST_HEAD(&page->lru);
+	uncharge_list(&page->lru);
+}
+
+/**
+ * mem_cgroup_uncharge_list - uncharge a list of pages
+ * @page_list: list of pages to uncharge
+ *
+ * Uncharge a list of pages previously charged with
+ * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ */
+void mem_cgroup_uncharge_list(struct list_head *page_list)
+{
+	if (mem_cgroup_disabled())
+		return;
+
+	if (!list_empty(page_list))
+		uncharge_list(page_list);
+}
+
+/**
+ * mem_cgroup_migrate - migrate a charge to another page
+ * @oldpage: currently charged page
+ * @newpage: page to transfer the charge to
+ * @lrucare: both pages might be on the LRU already
+ *
+ * Migrate the charge from @oldpage to @newpage.
+ *
+ * Both pages must be locked, and @newpage->mapping must be set up.
+ */
+void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
+			bool lrucare)
+{
+	unsigned int nr_pages = 1;
+	struct page_cgroup *pc;
+	int isolated;
+
+	VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
+	VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
+	VM_BUG_ON_PAGE(!lrucare && PageLRU(oldpage), oldpage);
+	VM_BUG_ON_PAGE(!lrucare && PageLRU(newpage), newpage);
+	VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
+
+	if (mem_cgroup_disabled())
+		return;
+
+	/* Page cache replacement: new page already charged? */
+	pc = lookup_page_cgroup(newpage);
+	if (PageCgroupUsed(pc))
+		return;
+
+	/* Re-entrant migration: old page already uncharged? */
+	pc = lookup_page_cgroup(oldpage);
+	if (!PageCgroupUsed(pc))
+		return;
+
+	VM_BUG_ON_PAGE(!(pc->flags & PCG_MEM), oldpage);
+	VM_BUG_ON_PAGE(do_swap_account && !(pc->flags & PCG_MEMSW), oldpage);
+
+	if (PageTransHuge(oldpage)) {
+		nr_pages <<= compound_order(oldpage);
+		VM_BUG_ON_PAGE(!PageTransHuge(oldpage), oldpage);
+		VM_BUG_ON_PAGE(!PageTransHuge(newpage), newpage);
+	}
+
+	if (lrucare)
+		lock_page_lru(oldpage, &isolated);
+
+	pc->flags = 0;
+
+	if (lrucare)
+		unlock_page_lru(oldpage, isolated);
+
+	local_irq_disable();
+	mem_cgroup_charge_statistics(pc->mem_cgroup, oldpage, -nr_pages);
+	memcg_check_events(pc->mem_cgroup, oldpage);
+	local_irq_enable();
+
+	commit_charge(newpage, pc->mem_cgroup, nr_pages, lrucare);
+}
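+
+/*
+ * Example call site: move_to_new_page() in mm/migrate.c calls
+ * mem_cgroup_migrate(page, newpage, false) right after a successful
+ * ->migratepage(), with both pages still locked and newpage->mapping
+ * already set up.
+ */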
+
 /*
  * subsys_initcall() for memory controller.
  *
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -243,21 +243,11 @@ void shake_page(struct page *p, int access)
 	}
 
 	/*
-	 * Only call shrink_slab here (which would also shrink other caches) if
-	 * access is not potentially fatal.
+	 * Only call shrink_node_slabs here (which would also shrink
+	 * other caches) if access is not potentially fatal.
 	 */
-	if (access) {
-		int nr;
-		do {
-			struct shrink_control shrink = {
-				.gfp_mask = GFP_KERNEL,
-			};
-
-			nr = shrink_slab(&shrink, 1000, 1000);
-			if (page_count(p) == 1)
-				break;
-		} while (nr > 10);
-	}
+	if (access)
+		drop_slab_node(page_to_nid(p));
 }
 EXPORT_SYMBOL_GPL(shake_page);
 
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -42,6 +42,7 @@
 #include <linux/mm.h>
 #include <linux/hugetlb.h>
 #include <linux/mman.h>
+#include <linux/virtinfo.h>
 #include <linux/swap.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
@@ -61,6 +62,11 @@
 #include <linux/migrate.h>
 #include <linux/string.h>
 #include <linux/userfaultfd_k.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
+#include <bc/io_acct.h>
+#include <bc/vmpages.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -103,7 +109,7 @@ EXPORT_SYMBOL(high_memory);
  * ( When CONFIG_COMPAT_BRK=y we exclude brk from randomization,
  *   as ancient (libc5 based) binaries can segfault. )
  */
-int randomize_va_space __read_mostly =
+int _randomize_va_space __read_mostly =
 #ifdef CONFIG_COMPAT_BRK
 					1;
 #else
@@ -839,45 +845,44 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 
 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
-		if (!pte_file(pte)) {
-			swp_entry_t entry = pte_to_swp_entry(pte);
-
-			if (swap_duplicate(entry) < 0)
-				return entry.val;
-
-			/* make sure dst_mm is on swapoff's mmlist. */
-			if (unlikely(list_empty(&dst_mm->mmlist))) {
-				spin_lock(&mmlist_lock);
-				if (list_empty(&dst_mm->mmlist))
-					list_add(&dst_mm->mmlist,
-						 &src_mm->mmlist);
-				spin_unlock(&mmlist_lock);
-			}
-			if (likely(!non_swap_entry(entry)))
-				rss[MM_SWAPENTS]++;
-			else if (is_migration_entry(entry)) {
-				page = migration_entry_to_page(entry);
-
-				rss[mm_counter(page)]++;
-
-				if (is_write_migration_entry(entry) &&
-				    is_cow_mapping(vm_flags)) {
-					/*
-					 * COW mappings require pages in both
-					 * parent and child to be set to read.
-					 */
-					make_migration_entry_read(&entry);
-					pte = swp_entry_to_pte(entry);
-					if (pte_swp_soft_dirty(*src_pte))
-						pte = pte_swp_mksoft_dirty(pte);
-					set_pte_at(src_mm, addr, src_pte, pte);
-				}
-			} else
+		swp_entry_t entry = pte_to_swp_entry(pte);
+
+		if (swap_duplicate(entry) < 0)
+			return entry.val;
+
+		/* make sure dst_mm is on swapoff's mmlist. */
+		if (unlikely(list_empty(&dst_mm->mmlist))) {
+			spin_lock(&mmlist_lock);
+			if (list_empty(&dst_mm->mmlist))
+				list_add(&dst_mm->mmlist,
+					 &src_mm->mmlist);
+			spin_unlock(&mmlist_lock);
+		}
+		if (likely(!non_swap_entry(entry)))
+			rss[MM_SWAPENTS]++;
+		else if (is_migration_entry(entry)) {
+			page = migration_entry_to_page(entry);
+
+			rss[mm_counter(page)]++;
+
+			if (is_write_migration_entry(entry) &&
+			    is_cow_mapping(vm_flags)) {
 				/*
-				 * This can not happen because HMM migration holds
-				 * mmap_sem in read mode.
+				 * COW mappings require pages in both
+				 * parent and child to be set to read.
 				 */
-				VM_BUG_ON(is_hmm_entry(entry));
+				make_migration_entry_read(&entry);
+				pte = swp_entry_to_pte(entry);
+				if (pte_swp_soft_dirty(*src_pte))
+					pte = pte_swp_mksoft_dirty(pte);
+				set_pte_at(src_mm, addr, src_pte, pte);
+			}
+		} else {
+			/*
+			 * This cannot happen because HMM migration holds
+			 * mmap_sem in read mode.
+			 */
+			VM_BUG_ON(is_hmm_entry(entry));
 		}
 		goto out_set_pte;
 	}
@@ -1047,11 +1052,9 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB | VM_NONLINEAR |
-			       VM_PFNMAP | VM_MIXEDMAP))) {
-		if (!vma->anon_vma)
-			return 0;
-	}
+	if (!(vma->vm_flags & (VM_HUGETLB | VM_PFNMAP | VM_MIXEDMAP)) &&
+			!vma->anon_vma)
+		return 0;
 
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
@@ -1109,6 +1112,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 	spinlock_t *ptl;
 	pte_t *start_pte;
 	pte_t *pte;
+	swp_entry_t entry;
 
 again:
 	init_rss_vec(rss);
@@ -1134,31 +1138,15 @@ again:
 				if (details->check_mapping &&
 				    details->check_mapping != page->mapping)
 					continue;
-				/*
-				 * Each page->index must be checked when
-				 * invalidating or truncating nonlinear.
-				 */
-				if (details->nonlinear_vma &&
-				    (page->index < details->first_index ||
-				     page->index > details->last_index))
-					continue;
 			}
 			ptent = ptep_get_and_clear_full(mm, addr, pte,
 							tlb->fullmm);
 			tlb_remove_tlb_entry(tlb, pte, addr);
 			if (unlikely(!page))
 				continue;
-			if (unlikely(details) && details->nonlinear_vma
-			    && linear_page_index(details->nonlinear_vma,
-						addr) != page->index) {
-				pte_t ptfile = pgoff_to_pte(page->index);
-				if (pte_soft_dirty(ptent))
-					pte_file_mksoft_dirty(ptfile);
-				set_pte_at(mm, addr, pte, ptfile);
-			}
 			if (!PageAnon(page)) {
 				if (pte_dirty(ptent))
-					set_page_dirty(page);
+					set_page_dirty_mm(page, mm);
 				if (pte_young(ptent) &&
 				    likely(!VM_SequentialReadHint(vma)))
 					mark_page_accessed(page);
@@ -1172,30 +1160,21 @@ again:
 				break;
 			continue;
 		}
-		/*
-		 * If details->check_mapping, we leave swap entries;
-		 * if details->nonlinear_vma, we leave file entries.
-		 */
+		/* If details->check_mapping, we leave swap entries. */
 		if (unlikely(details))
 			continue;
-		if (pte_file(ptent)) {
-			if (unlikely(!(vma->vm_flags & VM_NONLINEAR)))
-				print_bad_pte(vma, addr, ptent, NULL);
-		} else {
-			swp_entry_t entry = pte_to_swp_entry(ptent);
-
-			if (!non_swap_entry(entry))
-				rss[MM_SWAPENTS]--;
-			else if (is_migration_entry(entry)) {
-				struct page *page;
 
-				page = migration_entry_to_page(entry);
+		entry = pte_to_swp_entry(ptent);
+		if (!non_swap_entry(entry))
+			rss[MM_SWAPENTS]--;
+		else if (is_migration_entry(entry)) {
+			struct page *page;
 
-				rss[mm_counter(page)]--;
-			}
-			if (unlikely(!free_swap_and_cache(entry)))
-				print_bad_pte(vma, addr, ptent, NULL);
+			page = migration_entry_to_page(entry);
+			rss[mm_counter(page)]--;
 		}
+		if (unlikely(!free_swap_and_cache(entry)))
+			print_bad_pte(vma, addr, ptent, NULL);
 		pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
 	} while (pte++, addr += PAGE_SIZE, addr != end);
 
@@ -1304,11 +1283,10 @@ static void unmap_page_range(struct mmu_gather *tlb,
 	pgd_t *pgd;
 	unsigned long next;
 
-	if (details && !details->check_mapping && !details->nonlinear_vma)
+	if (details && !details->check_mapping)
 		details = NULL;
 
 	BUG_ON(addr >= end);
-	mem_cgroup_uncharge_start();
 	tlb_start_vma(tlb, vma);
 	pgd = pgd_offset(vma->vm_mm, addr);
 	do {
@@ -1318,7 +1296,6 @@ static void unmap_page_range(struct mmu_gather *tlb,
 		next = zap_pud_range(tlb, vma, pgd, addr, next, details);
 	} while (pgd++, addr = next, addr != end);
 	tlb_end_vma(tlb, vma);
-	mem_cgroup_uncharge_end();
 }
 
 
@@ -1400,7 +1377,7 @@ void unmap_vmas(struct mmu_gather *tlb,
  * @vma: vm_area_struct holding the applicable pages
  * @start: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * Caller must protect the VMA list
  */
@@ -1426,7 +1403,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start,
  * @vma: vm_area_struct holding the applicable pages
  * @address: starting address of pages to zap
  * @size: number of bytes to zap
- * @details: details of nonlinear truncation or shared cache invalidation
+ * @details: details of shared cache invalidation
  *
  * The range must fit into one VMA.
  */
@@ -1514,7 +1491,7 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr,
 	/* Ok, finally just insert the thing.. */
 	get_page(page);
 	inc_mm_counter_fast(mm, mm_counter_file(page));
-	page_add_file_rmap(page);
+	page_add_file_rmap(page, mm);
 	set_pte_at(mm, addr, pte, mk_pte(page, prot));
 
 	retval = 0;
@@ -1959,12 +1936,11 @@ int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
 EXPORT_SYMBOL_GPL(apply_to_page_range);
 
 /*
- * handle_pte_fault chooses page fault handler according to an entry
- * which was read non-atomically.  Before making any commitment, on
- * those architectures or configurations (e.g. i386 with PAE) which
- * might give a mix of unmatched parts, do_swap_page and do_nonlinear_fault
- * must check under lock before unmapping the pte and proceeding
- * (but do_wp_page is only called after already making such a check;
+ * handle_pte_fault chooses page fault handler according to an entry which was
+ * read non-atomically.  Before making any commitment, on those architectures
+ * or configurations (e.g. i386 with PAE) which might give a mix of unmatched
+ * parts, do_swap_page must check under lock before unmapping the pte and
+ * proceeding (but do_wp_page is only called after already making such a check;
  * and do_anonymous_page can safely check later on).
  */
 static inline int pte_unmap_same(struct mm_struct *mm, pmd_t *pmd,
@@ -2127,6 +2103,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 	int page_copied = 0;
 	const unsigned long mmun_start = address & PAGE_MASK;	/* For mmu_notifiers */
 	const unsigned long mmun_end = mmun_start + PAGE_SIZE;	/* For mmu_notifiers */
+	struct mem_cgroup *memcg;
 
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
@@ -2143,7 +2120,7 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	__SetPageUptodate(new_page);
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg))
 		goto oom_free_new;
 
 	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
@@ -2173,6 +2150,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 		 */
 		ptep_clear_flush_notify(vma, address, page_table);
 		page_add_new_anon_rmap(new_page, vma, address);
+		mem_cgroup_commit_charge(new_page, memcg, false);
+		lru_cache_add_active_or_unevictable(new_page, vma);
 		/*
 		 * We call the notify macro here because, when using secondary
 		 * mmu page tables (such as kvm shadow page tables), we want the
@@ -2209,9 +2188,8 @@ static int wp_page_copy(struct mm_struct *mm, struct vm_area_struct *vma,
 		/* Free the old page.. */
 		new_page = old_page;
 		page_copied = 1;
-	} else {
-		mem_cgroup_uncharge_page(new_page);
-	}
+	} else
+		mem_cgroup_cancel_charge(new_page, memcg);
 
 	if (new_page)
 		page_cache_release(new_page);
@@ -2445,25 +2423,11 @@ static inline void unmap_mapping_range_tree(struct rb_root *root,
 	}
 }
 
-static inline void unmap_mapping_range_list(struct list_head *head,
-					    struct zap_details *details)
-{
-	struct vm_area_struct *vma;
-
-	/*
-	 * In nonlinear VMAs there is no correspondence between virtual address
-	 * offset and file offset.  So we must perform an exhaustive search
-	 * across *all* the pages in each nonlinear VMA, not just the pages
-	 * whose virtual address lies outside the file truncation point.
-	 */
-	list_for_each_entry(vma, head, shared.nonlinear) {
-		details->nonlinear_vma = vma;
-		unmap_mapping_range_vma(vma, vma->vm_start, vma->vm_end, details);
-	}
-}
-
 /**
- * unmap_mapping_range - unmap the portion of all mmaps in the specified address_space corresponding to the specified page range in the underlying file.
+ * unmap_mapping_range - unmap the portion of all mmaps in the specified
+ * address_space corresponding to the specified page range in the underlying
+ * file.
+ *
  * @mapping: the address space containing mmaps to be unmapped.
  * @holebegin: byte in first page to unmap, relative to the start of
  * the underlying file.  This will be rounded down to a PAGE_SIZE
@@ -2492,7 +2456,6 @@ void unmap_mapping_range(struct address_space *mapping,
 	}
 
 	details.check_mapping = even_cows? NULL: mapping;
-	details.nonlinear_vma = NULL;
 	details.first_index = hba;
 	details.last_index = hba + hlen - 1;
 	if (details.last_index < details.first_index)
@@ -2503,8 +2466,6 @@ void unmap_mapping_range(struct address_space *mapping,
 	mutex_lock(&mapping->i_mmap_mutex);
 	if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap)))
 		unmap_mapping_range_tree(&mapping->i_mmap, &details);
-	if (unlikely(!list_empty(&mapping->i_mmap_nonlinear)))
-		unmap_mapping_range_list(&mapping->i_mmap_nonlinear, &details);
 	mutex_unlock(&mapping->i_mmap_mutex);
 }
 EXPORT_SYMBOL(unmap_mapping_range);
@@ -2520,13 +2481,15 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 {
 	spinlock_t *ptl;
 	struct page *page, *swapcache;
+	struct mem_cgroup *memcg;
 	swp_entry_t entry;
 	pte_t pte;
 	int locked;
-	struct mem_cgroup *ptr;
 	int exclusive = 0;
 	int ret = 0;
+	cycles_t start;
 
+	start = get_cycles();
 	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
 		goto out;
 
@@ -2601,7 +2564,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto out_page;
 	}
 
-	if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
+	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg)) {
 		ret = VM_FAULT_OOM;
 		goto out_page;
 	}
@@ -2626,10 +2589,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * while the page is counted on swap but not yet in mapcount i.e.
 	 * before page_add_anon_rmap() and swap_free(); try_to_free_swap()
 	 * must be called after the swap_free(), or it will never succeed.
-	 * Because delete_from_swap_page() may be called by reuse_swap_page(),
-	 * mem_cgroup_commit_charge_swapin() may not be able to find swp_entry
-	 * in page->private. In this case, a record in swap_cgroup  is silently
-	 * discarded at swap_free().
 	 */
 
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
@@ -2645,12 +2604,14 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (pte_swp_soft_dirty(orig_pte))
 		pte = pte_mksoft_dirty(pte);
 	set_pte_at(mm, address, page_table, pte);
-	if (page == swapcache)
+	if (page == swapcache) {
 		do_page_add_anon_rmap(page, vma, address, exclusive);
-	else /* ksm created a completely new copy */
+		mem_cgroup_commit_charge(page, memcg, true);
+	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, address);
-	/* It's better to call commit-charge after rmap is established */
-	mem_cgroup_commit_charge_swapin(page, ptr);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
+	}
 
 	swap_free(entry);
 	if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
@@ -2681,9 +2642,13 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 out:
+	spin_lock_irq(&kstat_glb_lock);
+	KSTAT_LAT_ADD(&kstat_glob.swap_in, get_cycles() - start);
+	spin_unlock_irq(&kstat_glb_lock);
+
 	return ret;
 out_nomap:
-	mem_cgroup_cancel_charge_swapin(ptr);
+	mem_cgroup_cancel_charge(page, memcg);
 	pte_unmap_unlock(page_table, ptl);
 out_page:
 	unlock_page(page);
@@ -2739,6 +2704,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		unsigned int flags)
 {
+	struct mem_cgroup *memcg;
 	struct page *page;
 	spinlock_t *ptl;
 	pte_t entry;
@@ -2749,10 +2715,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (vma->vm_flags & VM_SHARED)
 		return VM_FAULT_SIGBUS;
 
-	/* Check if we need to add a guard page to the stack */
-	if (check_stack_guard_page(vma, address) < 0)
-		return VM_FAULT_SIGBUS;
-
 	/* Use the zero-page for reads */
 	if (!(flags & FAULT_FLAG_WRITE)) {
 		entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
@@ -2769,7 +2731,6 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto setpte;
 	}
 
-	/* Allocate our own private page. */
 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
 	page = alloc_zeroed_user_highpage_movable(vma, address);
@@ -2782,7 +2743,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	__SetPageUptodate(page);
 
-	if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
 		goto oom_free_page;
 
 	entry = mk_pte(page, vma->vm_page_prot);
@@ -2796,7 +2757,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	/* Deliver the page fault to userland, check inside PT lock */
 	if (userfaultfd_missing(vma)) {
 		pte_unmap_unlock(page_table, ptl);
-		mem_cgroup_uncharge_page(page);
+		mem_cgroup_cancel_charge(page, memcg);
 		page_cache_release(page);
 		return handle_userfault(vma, address, flags,
 					VM_UFFD_MISSING);
@@ -2804,6 +2765,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
 	inc_mm_counter_fast(mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, vma, address);
+	mem_cgroup_commit_charge(page, memcg, false);
+	lru_cache_add_active_or_unevictable(page, vma);
 setpte:
 	set_pte_at(mm, address, page_table, entry);
 
@@ -2813,7 +2776,7 @@ unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return 0;
 release:
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_cancel_charge(page, memcg);
 	page_cache_release(page);
 	goto unlock;
 oom_free_page:
@@ -2828,6 +2791,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 {
 	struct vm_fault vmf;
 	int ret;
+	cycles_t start;
 
 	vmf.virtual_address = (void __user *)(address & PAGE_MASK);
 	vmf.pgoff = pgoff;
@@ -2835,6 +2799,7 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	vmf.page = NULL;
 	vmf.cow_page = cow_page;
 
+	start = get_cycles();
 	ret = vma->vm_ops->fault(vma, &vmf);
 	if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
 		return ret;
@@ -2853,6 +2818,10 @@ static int __do_fault(struct vm_area_struct *vma, unsigned long address,
 	else
 		VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page);
 
+	local_irq_disable();
+	KSTAT_LAT_PCPU_ADD(&kstat_glob.page_in, smp_processor_id(),
+			get_cycles() - start);
+	local_irq_enable();
  out:
 	*page = vmf.page;
 	return ret;
@@ -2867,14 +2836,12 @@ static void do_set_pte(struct vm_area_struct *vma, unsigned long address,
 	entry = mk_pte(page, vma->vm_page_prot);
 	if (write)
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-	else if (pte_file(*pte) && pte_file_soft_dirty(*pte))
-		pte_mksoft_dirty(entry);
 	if (anon) {
 		inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
 		page_add_new_anon_rmap(page, vma, address);
 	} else {
 		inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
-		page_add_file_rmap(page);
+		page_add_file_rmap(page, vma->vm_mm);
 	}
 	set_pte_at(vma->vm_mm, address, pte, entry);
 
@@ -2913,6 +2880,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
 	struct page *fault_page, *new_page;
+	struct mem_cgroup *memcg;
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret;
@@ -2924,7 +2892,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (!new_page)
 		return VM_FAULT_OOM;
 
-	if (mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)) {
+	if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg)) {
 		page_cache_release(new_page);
 		return VM_FAULT_OOM;
 	}
@@ -2953,6 +2921,8 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		goto uncharge_out;
 	}
 	do_set_pte(vma, address, new_page, pte, true, true);
+	mem_cgroup_commit_charge(new_page, memcg, false);
+	lru_cache_add_active_or_unevictable(new_page, vma);
 	pte_unmap_unlock(pte, ptl);
 	if (fault_page) {
 		unlock_page(fault_page);
@@ -2966,7 +2936,7 @@ static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 	return ret;
 uncharge_out:
-	mem_cgroup_uncharge_page(new_page);
+	mem_cgroup_cancel_charge(new_page, memcg);
 	page_cache_release(new_page);
 	return ret;
 }
@@ -3029,7 +2999,7 @@ static int do_shared_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return ret;
 }
 
-static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		unsigned int flags, pte_t orig_pte)
 {
@@ -3049,44 +3019,6 @@ static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
 }
 
-/*
- * Fault of a previously existing named mapping. Repopulate the pte
- * from the encoded file_pte if possible. This enables swappable
- * nonlinear vmas.
- *
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
- * but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
- */
-static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
-		unsigned long address, pte_t *page_table, pmd_t *pmd,
-		unsigned int flags, pte_t orig_pte)
-{
-	pgoff_t pgoff;
-
-	flags |= FAULT_FLAG_NONLINEAR;
-
-	if (!pte_unmap_same(mm, pmd, page_table, orig_pte))
-		return 0;
-
-	if (unlikely(!(vma->vm_flags & VM_NONLINEAR))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 */
-		print_bad_pte(vma, address, orig_pte, NULL);
-		return VM_FAULT_SIGBUS;
-	}
-
-	pgoff = pte_to_pgoff(orig_pte);
-	if (!(flags & FAULT_FLAG_WRITE))
-		return do_read_fault(mm, vma, address, pmd, pgoff, flags,
-				orig_pte);
-	if (!(vma->vm_flags & VM_SHARED))
-		return do_cow_fault(mm, vma, address, pmd, pgoff, flags,
-				orig_pte);
-	return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
-}
-
 int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
 				unsigned long addr, int page_nid,
 				int *flags)
@@ -3222,14 +3154,11 @@ static int handle_pte_fault(struct mm_struct *mm,
 	if (!pte_present(entry)) {
 		if (pte_none(entry)) {
 			if (vma->vm_ops)
-				return do_linear_fault(mm, vma, address,
+				return do_fault(mm, vma, address,
 						pte, pmd, flags, entry);
 			return do_anonymous_page(mm, vma, address,
 						 pte, pmd, flags);
 		}
-		if (pte_file(entry))
-			return do_nonlinear_fault(mm, vma, address,
-					pte, pmd, flags, entry);
 		return do_swap_page(mm, vma, address,
 					pte, pmd, flags, entry);
 	}
@@ -3716,7 +3645,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
 /*
  * Print the name of a VMA.
  */
-void print_vma_addr(char *prefix, unsigned long ip)
+void ve_print_vma_addr(int dst, char *prefix, unsigned long ip)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -3739,7 +3668,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
 			p = d_path(&f->f_path, buf, PAGE_SIZE);
 			if (IS_ERR(p))
 				p = "?";
-			printk("%s%s[%lx+%lx]", prefix, kbasename(p),
+			ve_printk(dst, "%s%s[%lx+%lx]", prefix, kbasename(p),
 					vma->vm_start,
 					vma->vm_end - vma->vm_start);
 			free_page((unsigned long)buf);
@@ -3871,3 +3800,181 @@ void ptlock_free(struct page *page)
 	kfree(page->ptl);
 }
 #endif
+
+#include <linux/file.h>
+
+int open_mapping_peer(struct address_space *mapping,
+		struct path *path, const struct cred *cred)
+{
+	struct inode *inode = path->dentry->d_inode;
+	struct address_space *peer = inode->i_mapping;
+	struct file *file = NULL;
+	struct user_beancounter *ub;
+
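+	/*
+	 * Publish a read-only peer file in peer->i_peer_file exactly once
+	 * per inode: the i_lock section below rechecks writability, size
+	 * and the pointer itself, so racing openers either install the
+	 * file or restart and reuse the one already installed.
+	 */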
+restart:
+	if (!peer->i_peer_file) {
+		ub = set_exec_ub(&ub0);
+		file = dentry_open(path, O_RDONLY | O_LARGEFILE, cred);
+		set_exec_ub(ub);
+		if (IS_ERR(file))
+			return PTR_ERR(file);
+
+		spin_lock(&inode->i_lock);
+		if (atomic_read(&inode->i_writecount) > 0) {
+			spin_unlock(&inode->i_lock);
+			fput(file);
+			return -ETXTBSY;
+		}
+		if (inode->i_size != mapping->host->i_size) {
+			spin_unlock(&inode->i_lock);
+			fput(file);
+			return -EINVAL;
+		}
+		if (peer->i_peer_file) {
+			spin_unlock(&inode->i_lock);
+			fput(file);
+			file = NULL;
+			goto restart;
+		}
+		atomic_dec(&inode->i_writecount);
+		rcu_assign_pointer(peer->i_peer_file, get_file(file));
+		spin_unlock(&inode->i_lock);
+	}
+
+	mutex_lock_nested(&peer->i_mmap_mutex, SINGLE_DEPTH_NESTING);
+	if (!peer->i_peer_file) {
+		mutex_unlock(&peer->i_mmap_mutex);
+		goto restart;
+	}
+	mutex_lock(&mapping->i_mmap_mutex);
+	rcu_assign_pointer(mapping->i_peer_file, peer->i_peer_file);
+	list_add(&mapping->i_peer_list, &peer->i_peer_list);
+	mutex_unlock(&mapping->i_mmap_mutex);
+	mutex_unlock(&peer->i_mmap_mutex);
+
+	invalidate_mapping_pages(mapping, 0, -1);
+
+	if (file) {
+		file_accessed(file);
+		fput(file);
+	}
+	return 0;
+}
+EXPORT_SYMBOL(open_mapping_peer);
+
+static bool synchronize_mapping_faults_vma(struct address_space *mapping,
+		struct vm_area_struct *vma)
+{
+	struct mm_struct *mm = vma->vm_mm;
+
+	if (vma->vm_private_data2 == vma)
+		return false;
+	BUG_ON(vma->vm_private_data2);
+	vma->vm_private_data2 = vma;
+
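+	/*
+	 * Taking and dropping mmap_sem for write acts as a barrier: it
+	 * waits out every fault that was already running with mmap_sem
+	 * held for read when i_mmap_mutex was dropped.  Pin the mm first
+	 * so it cannot be freed while the mutex is released.
+	 */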
+	atomic_inc(&mm->mm_count);
+	mutex_unlock(&mapping->i_mmap_mutex);
+	down_write(&mm->mmap_sem);
+	up_write(&mm->mmap_sem);
+	mmdrop(mm);
+	mutex_lock(&mapping->i_mmap_mutex);
+
+	return true;
+}
+
+static void synchronize_mapping_faults(struct address_space *mapping)
+{
+	struct vm_area_struct *vma;
+
+restart:
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
+		if (synchronize_mapping_faults_vma(mapping, vma))
+			goto restart;
+	vma_interval_tree_foreach(vma, &mapping->i_mmap, 0, ULONG_MAX)
+		vma->vm_private_data2 = NULL;
+}
+
+void close_mapping_peer(struct address_space *mapping)
+{
+	struct file *file = mapping->i_peer_file;
+	struct address_space *peer;
+
+	if (!file)
+		return;
+
+	mutex_lock(&mapping->i_mmap_mutex);
+
+	rcu_assign_pointer(mapping->i_peer_file, NULL);
+
+	if (mapping_mapped(mapping)) {
+		struct zap_details details = {
+			.check_mapping = file->f_mapping,
+			.first_index = 0,
+			.last_index = -1,
+		};
+
+		synchronize_mapping_faults(mapping);
+		unmap_mapping_range_tree(&mapping->i_mmap, &details);
+	}
+
+	mutex_unlock(&mapping->i_mmap_mutex);
+
+	peer = file->f_mapping;
+
+	mutex_lock(&peer->i_mmap_mutex);
+	list_del_init(&mapping->i_peer_list);
+	if (list_empty(&peer->i_peer_list))
+		rcu_assign_pointer(peer->i_peer_file, NULL);
+	else
+		file = NULL;
+	mutex_unlock(&peer->i_mmap_mutex);
+
+	if (file) {
+		atomic_inc(&file->f_inode->i_writecount);
+		file_accessed(file);
+		fput(file);
+	}
+}
+EXPORT_SYMBOL(close_mapping_peer);
+
+struct page *pick_peer_page(struct address_space *mapping, pgoff_t index,
+		struct file_ra_state *ra, unsigned ra_size)
+{
+	struct address_space *peer;
+	struct page *page;
+	struct file *file;
+
+	rcu_read_lock();
+	file = rcu_dereference(mapping->i_peer_file);
+	if (!file || !atomic_long_inc_not_zero(&file->f_count)) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	rcu_read_unlock();
+
+	peer = file->f_mapping;
+
+	page = find_get_page(peer, index);
+	if (!page) {
+		page_cache_sync_readahead(peer, ra, file, index, ra_size);
+		page = find_get_page(peer, index);
+		if (!page)
+			goto out;
+	}
+	if (PageReadahead(page))
+		page_cache_async_readahead(peer, ra, file,
+				page, index, ra->ra_pages);
+	if (!PageUptodate(page)) {
+		if (!lock_page_killable(page)) {
+			unlock_page(page);
+			if (PageUptodate(page))
+				goto out;
+		}
+		put_page(page);
+		page = NULL;
+	}
+out:
+	fput(file);
+	return page;
+}
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -646,19 +646,18 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma,
  * @nodes and @flags,) it's isolated and queued to the pagelist which is
  * passed via @private.)
  */
-static struct vm_area_struct *
+static int
 queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 		const nodemask_t *nodes, unsigned long flags, void *private)
 {
-	int err;
-	struct vm_area_struct *first, *vma, *prev;
-
+	int err = 0;
+	struct vm_area_struct *vma, *prev;
 
-	first = find_vma(mm, start);
-	if (!first)
-		return ERR_PTR(-EFAULT);
+	vma = find_vma(mm, start);
+	if (!vma)
+		return -EFAULT;
 	prev = NULL;
-	for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
+	for (; vma && vma->vm_start < end; vma = vma->vm_next) {
 		unsigned long endvma = vma->vm_end;
 
 		if (endvma > end)
@@ -668,9 +667,9 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 		if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 			if (!vma->vm_next && vma->vm_end < end)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 			if (prev && prev->vm_end < vma->vm_start)
-				return ERR_PTR(-EFAULT);
+				return -EFAULT;
 		}
 
 		if (flags & MPOL_MF_LAZY) {
@@ -684,15 +683,13 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 
 			err = queue_pages_pgd_range(vma, start, endvma, nodes,
 						flags, private);
-			if (err) {
-				first = ERR_PTR(err);
+			if (err)
 				break;
-			}
 		}
 next:
 		prev = vma;
 	}
-	return first;
+	return err;
 }
 
 /*
@@ -1177,16 +1174,17 @@ out:
 
 /*
  * Allocate a new page for page migration based on vma policy.
- * Start assuming that page is mapped by vma pointed to by @private.
+ * Start by assuming the page is mapped by the vma that contains @start.
  * Search forward from there, if not.  N.B., this assumes that the
  * list of pages handed to migrate_pages()--which is how we get here--
  * is in virtual address order.
  */
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
-	struct vm_area_struct *vma = (struct vm_area_struct *)private;
+	struct vm_area_struct *vma;
 	unsigned long uninitialized_var(address);
 
+	vma = find_vma(current->mm, start);
 	while (vma) {
 		address = page_address_in_vma(page, vma);
 		if (address != -EFAULT)
@@ -1216,7 +1214,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
 	return -ENOSYS;
 }
 
-static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+static struct page *new_page(struct page *page, unsigned long start, int **x)
 {
 	return NULL;
 }
@@ -1226,7 +1224,6 @@ static long do_mbind(unsigned long start, unsigned long len,
 		     unsigned short mode, unsigned short mode_flags,
 		     nodemask_t *nmask, unsigned long flags)
 {
-	struct vm_area_struct *vma;
 	struct mm_struct *mm = current->mm;
 	struct mempolicy *new;
 	unsigned long end;
@@ -1292,11 +1289,9 @@ static long do_mbind(unsigned long start, unsigned long len,
 	if (err)
 		goto mpol_out;
 
-	vma = queue_pages_range(mm, start, end, nmask,
+	err = queue_pages_range(mm, start, end, nmask,
 			  flags | MPOL_MF_INVERT, &pagelist);
-
-	err = PTR_ERR(vma);	/* maybe ... */
-	if (!IS_ERR(vma))
+	if (!err)
 		err = mbind_range(mm, start, end, new);
 
 	if (!err) {
@@ -1304,9 +1299,8 @@ static long do_mbind(unsigned long start, unsigned long len,
 
 		if (!list_empty(&pagelist)) {
 			WARN_ON_ONCE(flags & MPOL_MF_LAZY);
-			nr_failed = migrate_pages(&pagelist, new_vma_page,
-					(unsigned long)vma,
-					MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
+			nr_failed = migrate_pages(&pagelist, new_page,
+				start, MIGRATE_SYNC, MR_MEMPOLICY_MBIND);
 			if (nr_failed)
 				putback_movable_pages(&pagelist);
 		}
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -6,25 +6,138 @@
  *  extreme VM load.
  *
  *  started by Ingo Molnar, Copyright (C) 2001
+ *  debugging by David Rientjes, Copyright (C) 2015
  */
 
 #include <linux/mm.h>
 #include <linux/slab.h>
+
+#include <linux/highmem.h>
+#include <linux/kasan.h>
+#include <linux/kmemleak.h>
 #include <linux/export.h>
 #include <linux/mempool.h>
 #include <linux/blkdev.h>
 #include <linux/writeback.h>
 
+#if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB_DEBUG_ON)
+static void poison_error(mempool_t *pool, void *element, size_t size,
+			 size_t byte)
+{
+	const int nr = pool->curr_nr;
+	const int start = max_t(int, byte - (BITS_PER_LONG / 8), 0);
+	const int end = min_t(int, byte + (BITS_PER_LONG / 8), size);
+	int i;
+
+	pr_err("BUG: mempool element poison mismatch\n");
+	pr_err("Mempool %p size %zu\n", pool, size);
+	pr_err(" nr=%d @ %p: %s0x", nr, element, start > 0 ? "... " : "");
+	for (i = start; i < end; i++)
+		pr_cont("%x ", *(u8 *)(element + i));
+	pr_cont("%s\n", end < size ? "..." : "");
+	dump_stack();
+}
+
+static void __check_element(mempool_t *pool, void *element, size_t size)
+{
+	u8 *obj = element;
+	size_t i;
+
+	for (i = 0; i < size; i++) {
+		u8 exp = (i < size - 1) ? POISON_FREE : POISON_END;
+
+		if (obj[i] != exp) {
+			poison_error(pool, element, size, i);
+			return;
+		}
+	}
+	memset(obj, POISON_INUSE, size);
+}
+
+static void check_element(mempool_t *pool, void *element)
+{
+	/* Mempools backed by slab allocator */
+	if (pool->free == mempool_free_slab || pool->free == mempool_kfree)
+		__check_element(pool, element, ksize(element));
+
+	/* Mempools backed by page allocator */
+	if (pool->free == mempool_free_pages) {
+		int order = (int)(long)pool->pool_data;
+		void *addr = kmap_atomic((struct page *)element);
+
+		__check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
+		kunmap_atomic(addr);
+	}
+}
+
+static void __poison_element(void *element, size_t size)
+{
+	u8 *obj = element;
+
+	memset(obj, POISON_FREE, size - 1);
+	obj[size - 1] = POISON_END;
+}
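+
+/*
+ * A free element thus reads 6b 6b ... 6b a5 in memory (POISON_FREE
+ * bytes terminated by POISON_END), which is exactly the pattern
+ * __check_element() verifies before an element is handed back out.
+ */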
+
+static void poison_element(mempool_t *pool, void *element)
+{
+	/* Mempools backed by slab allocator */
+	if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
+		__poison_element(element, ksize(element));
+
+	/* Mempools backed by page allocator */
+	if (pool->alloc == mempool_alloc_pages) {
+		int order = (int)(long)pool->pool_data;
+		void *addr = kmap_atomic((struct page *)element);
+
+		__poison_element(addr, 1UL << (PAGE_SHIFT + order));
+		kunmap_atomic(addr);
+	}
+}
+#else /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+static inline void check_element(mempool_t *pool, void *element)
+{
+}
+static inline void poison_element(mempool_t *pool, void *element)
+{
+}
+#endif /* CONFIG_DEBUG_SLAB || CONFIG_SLUB_DEBUG_ON */
+
+static void kasan_poison_element(mempool_t *pool, void *element)
+{
+	if (pool->alloc == mempool_alloc_slab)
+		kasan_slab_free(pool->pool_data, element);
+	if (pool->alloc == mempool_kmalloc)
+		kasan_kfree(element);
+	if (pool->alloc == mempool_alloc_pages)
+		kasan_free_pages(element, (unsigned long)pool->pool_data);
+}
+
+static void kasan_unpoison_element(mempool_t *pool, void *element)
+{
+	if (pool->alloc == mempool_alloc_slab)
+		kasan_slab_alloc(pool->pool_data, element);
+	if (pool->alloc == mempool_kmalloc)
+		kasan_krealloc(element, (size_t)pool->pool_data);
+	if (pool->alloc == mempool_alloc_pages)
+		kasan_alloc_pages(element, (unsigned long)pool->pool_data);
+}
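+
+/*
+ * Elements resting in the pool thus stay poisoned for KASAN, so a
+ * use-after-free on an object already returned to the pool is reported
+ * like one on a regular kfree()d object.
+ */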
+
 static void add_element(mempool_t *pool, void *element)
 {
 	BUG_ON(pool->curr_nr >= pool->min_nr);
+	poison_element(pool, element);
+	kasan_poison_element(pool, element);
 	pool->elements[pool->curr_nr++] = element;
 }
 
 static void *remove_element(mempool_t *pool)
 {
-	BUG_ON(pool->curr_nr <= 0);
-	return pool->elements[--pool->curr_nr];
+	void *element = pool->elements[--pool->curr_nr];
+
+	BUG_ON(pool->curr_nr < 0);
+	check_element(pool, element);
+	kasan_unpoison_element(pool, element);
+	return element;
 }
 
 /**
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -37,6 +37,7 @@
 #include <linux/gfp.h>
 #include <linux/balloon_compaction.h>
 #include <linux/mmu_notifier.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -181,7 +182,7 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma,
 	} else if (PageAnon(new))
 		page_add_anon_rmap(new, vma, addr);
 	else
-		page_add_file_rmap(new);
+		page_add_file_rmap(new, mm);
 
 	/* No need to invalidate - it was non-present before */
 	update_mmu_cache(vma, addr, ptep);
@@ -197,7 +198,12 @@ out:
  */
 static void remove_migration_ptes(struct page *old, struct page *new)
 {
-	rmap_walk(new, remove_migration_pte, old);
+	struct rmap_walk_control rwc = {
+		.rmap_one = remove_migration_pte,
+		.arg = old,
+	};
+
+	rmap_walk(new, &rwc);
 }
 
 /*
@@ -532,6 +538,11 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 			__set_page_dirty_nobuffers(newpage);
  	}
 
+	if (page_is_young(page))
+		set_page_young(newpage);
+	if (page_is_idle(page))
+		set_page_idle(newpage);
+
 	/*
 	 * Copy NUMA information to the new page, to prevent over-eager
 	 * future migrations of this same page.
@@ -767,6 +778,7 @@ static int move_to_new_page(struct page *newpage, struct page *page,
 	if (rc != MIGRATEPAGE_SUCCESS) {
 		newpage->mapping = NULL;
 	} else {
+		mem_cgroup_migrate(page, newpage, false);
 		if (page_was_mapped)
 			remove_migration_ptes(page, newpage);
 		page->mapping = NULL;
@@ -782,7 +794,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 {
 	int rc = -EAGAIN;
 	int page_was_mapped = 0;
-	struct mem_cgroup *mem;
 	struct anon_vma *anon_vma = NULL;
 
 	if (!trylock_page(page)) {
@@ -808,9 +819,6 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		lock_page(page);
 	}
 
-	/* charge against new page */
-	mem_cgroup_prepare_migration(page, newpage, &mem);
-
 	if (PageWriteback(page)) {
 		/*
 		 * Only in the case of a full synchronous migration is it
@@ -820,10 +828,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		 */
 		if (mode != MIGRATE_SYNC) {
 			rc = -EBUSY;
-			goto uncharge;
+			goto out_unlock;
 		}
 		if (!force)
-			goto uncharge;
+			goto out_unlock;
 		wait_on_page_writeback(page);
 	}
 	/*
@@ -858,7 +866,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 			 * completes
 			 */
 		} else {
-			goto uncharge;
+			goto out_unlock;
 		}
 	}
 
@@ -871,7 +879,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		 * the page migration right away (proteced by page lock).
 		 */
 		rc = balloon_page_migrate(newpage, page, mode);
-		goto uncharge;
+		goto out_unlock;
 	}
 
 	/*
@@ -890,7 +898,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
 		VM_BUG_ON_PAGE(PageAnon(page), page);
 		if (page_has_private(page)) {
 			try_to_free_buffers(page);
-			goto uncharge;
+			goto out_unlock;
 		}
 		goto skip_unmap;
 	}
@@ -913,9 +921,7 @@ skip_unmap:
 	if (anon_vma)
 		put_anon_vma(anon_vma);
 
-uncharge:
-	mem_cgroup_end_migration(mem, page, newpage,
-				 rc == MIGRATEPAGE_SUCCESS);
+out_unlock:
 	unlock_page(page);
 out:
 	return rc;
@@ -1539,7 +1545,7 @@ static bool migrate_balanced_pgdat(struct pglist_data *pgdat,
 		if (!populated_zone(zone))
 			continue;
 
-		if (zone->all_unreclaimable)
+		if (!zone_reclaimable(zone))
 			continue;
 
 		/* Avoid waking kswapd by allocating pages_to_migrate pages. */
@@ -1740,7 +1746,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated = 0;
 	struct page *new_page = NULL;
-	struct mem_cgroup *memcg = NULL;
 	int page_lru = page_is_file_cache(page);
 	unsigned long mmun_start = address & HPAGE_PMD_MASK;
 	unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
@@ -1806,17 +1811,6 @@ fail_putback:
 		goto out_unlock;
 	}
 
-	/*
-	 * Traditional migration needs to prepare the memcg charge
-	 * transaction early to prevent the old page from being
-	 * uncharged when installing migration entries.  Here we can
-	 * save the potential rollback and start the charge transfer
-	 * only when migration is already known to end successfully.
-	 */
-	mem_cgroup_prepare_migration(page, new_page, &memcg);
-
-	init_trans_huge_mmu_gather_count(new_page);
-
 	orig_entry = *pmd;
 	entry = mk_pmd(new_page, vma->vm_page_prot);
 	entry = pmd_mkhuge(entry);
@@ -1845,14 +1839,10 @@ fail_putback:
 		goto fail_putback;
 	}
 
+	mem_cgroup_migrate(page, new_page, false);
+
 	page_remove_rmap(page);
 
-	/*
-	 * Finish the charge transaction under the page table lock to
-	 * prevent split_huge_page() from dividing up the charge
-	 * before it's fully transferred to the new page.
-	 */
-	mem_cgroup_end_migration(memcg, page, new_page, true);
 	spin_unlock(ptl);
 	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
 
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -124,17 +124,13 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	do {
 		pte_t pte = *ptep;
-		pgoff_t pgoff;
 
 		next = addr + PAGE_SIZE;
 		if (pte_none(pte))
 			mincore_unmapped_range(vma, addr, next, vec);
 		else if (pte_present(pte))
 			*vec = 1;
-		else if (pte_file(pte)) {
-			pgoff = pte_to_pgoff(pte);
-			*vec = mincore_page(vma->vm_file->f_mapping, pgoff);
-		} else { /* pte is a swap entry */
+		else { /* pte is a swap entry */
 			swp_entry_t entry = pte_to_swp_entry(pte);
 
 			if (is_migration_entry(entry)) {
@@ -142,9 +138,8 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 				*vec = 1;
 			} else {
 #ifdef CONFIG_SWAP
-				pgoff = entry.val;
 				*vec = mincore_page(swap_address_space(entry),
-					pgoff);
+					entry.val);
 #else
 				WARN_ON(1);
 				*vec = 1;
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -19,11 +19,13 @@
 #include <linux/mmzone.h>
 #include <linux/hugetlb.h>
 
+#include <bc/vmpages.h>
+
 #include "internal.h"
 
 int can_do_mlock(void)
 {
-	if (capable(CAP_IPC_LOCK))
+	if (ve_capable(CAP_IPC_LOCK))
 		return 1;
 	if (rlimit(RLIMIT_MEMLOCK) != 0)
 		return 1;
@@ -229,11 +231,14 @@ static int __mlock_posix_error_return(long retval)
  * and re-mlocked by try_to_{munlock|unmap} before we unmap and
  * free them.  This will result in freeing mlocked pages.
  */
-void munlock_vma_pages_range(struct vm_area_struct *vma,
-			     unsigned long start, unsigned long end)
+void __munlock_vma_pages_range(struct vm_area_struct *vma,
+			       unsigned long start, unsigned long end, int acct)
 {
 	vma->vm_flags &= ~VM_LOCKED;
 
+	if (acct)
+		ub_locked_uncharge(vma->vm_mm, end - start);
+
 	while (start < end) {
 		struct page *page;
 		unsigned int page_mask, page_increm;
@@ -287,6 +292,12 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	    is_vm_hugetlb_page(vma) || vma == get_gate_vma(current->mm))
 		goto out;	/* don't set VM_LOCKED,  don't count */
 
+	if (newflags & VM_LOCKED) {
+		ret = ub_locked_charge(mm, end - start);
+		if (ret < 0)
+			goto out;
+	}
+
 	pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
 	*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
 			  vma->vm_file, pgoff, vma_policy(vma),
@@ -299,13 +310,13 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
 	if (start != vma->vm_start) {
 		ret = split_vma(mm, vma, start, 1);
 		if (ret)
-			goto out;
+			goto out_uncharge;
 	}
 
 	if (end != vma->vm_end) {
 		ret = split_vma(mm, vma, end, 0);
 		if (ret)
-			goto out;
+			goto out_uncharge;
 	}
 
 success:
@@ -331,6 +342,11 @@ success:
 out:
 	*prev = vma;
 	return ret;
+
+out_uncharge:
+	if (newflags & VM_LOCKED)
+		ub_locked_uncharge(mm, end - start);
+	goto out;
 }
 
 static int do_mlock(unsigned long start, size_t len, int on)
@@ -469,7 +485,7 @@ SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len)
 	lock_limit >>= PAGE_SHIFT;
 
 	/* check against resource limits */
-	if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
+	if ((locked <= lock_limit) || ve_capable(CAP_IPC_LOCK))
 		error = do_mlock(start, len, 1);
 	up_write(&current->mm->mmap_sem);
 	if (!error)
@@ -536,7 +552,7 @@ SYSCALL_DEFINE1(mlockall, int, flags)
 
 	ret = -ENOMEM;
 	if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
-	    capable(CAP_IPC_LOCK))
+	    ve_capable(CAP_IPC_LOCK))
 		ret = do_mlockall(flags);
 	up_write(&current->mm->mmap_sem);
 	if (!ret && (flags & MCL_CURRENT))
@@ -573,7 +589,7 @@ int user_shm_lock(size_t size, struct user_struct *user)
 	lock_limit >>= PAGE_SHIFT;
 	spin_lock(&shmlock_user_lock);
 	if (!allowed &&
-	    locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
+	    locked + user->locked_shm > lock_limit && !ve_capable(CAP_IPC_LOCK))
 		goto out;
 	get_uid(user);
 	user->locked_shm += locked;
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -148,5 +148,4 @@ static int __init mm_sysfs_init(void)
 
 	return 0;
 }
-
-__initcall(mm_sysfs_init);
+postcore_initcall(mm_sysfs_init);
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -29,6 +29,7 @@
 #include <linux/rmap.h>
 #include <linux/mmu_notifier.h>
 #include <linux/perf_event.h>
+#include <linux/virtinfo.h>
 #include <linux/audit.h>
 #include <linux/khugepaged.h>
 #include <linux/uprobes.h>
@@ -38,6 +39,9 @@
 #include <linux/memory.h>
 #include <linux/userfaultfd_k.h>
 
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
+
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlb.h>
@@ -150,6 +154,9 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
 	unsigned long free, allowed, reserve;
 
+	if (mm && ub_enough_memory(mm, pages) != 0)
+		return -ENOMEM;
+
 	vm_acct_memory(pages);
 
 	/*
@@ -235,10 +242,7 @@ static void __remove_shared_vm_struct(struct vm_area_struct *vma,
 		mapping_unmap_writable(mapping);
 
 	flush_dcache_mmap_lock(mapping);
-	if (unlikely(vma->vm_flags & VM_NONLINEAR))
-		list_del_init(&vma->shared.nonlinear);
-	else
-		vma_interval_tree_remove(vma, &mapping->i_mmap);
+	vma_interval_tree_remove(vma, &mapping->i_mmap);
 	flush_dcache_mmap_unlock(mapping);
 }
 
@@ -266,22 +270,26 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma)
 	struct vm_area_struct *next = vma->vm_next;
 
 	might_sleep();
+
+	ub_memory_uncharge(vma->vm_mm, vma->vm_end - vma->vm_start,
+			vma->vm_flags, vma->vm_file);
 	if (vma->vm_ops && vma->vm_ops->close)
 		vma->vm_ops->close(vma);
 	if (vma->vm_file)
 		fput(vma->vm_file);
 	mpol_put(vma_policy(vma));
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(vma->vm_mm, vma);
 	return next;
 }
 
-static unsigned long do_brk(unsigned long addr, unsigned long len);
+static unsigned long do_brk(unsigned long addr, unsigned long len, int soft);
 
 SYSCALL_DEFINE1(brk, unsigned long, brk)
 {
 	unsigned long rlim, retval;
 	unsigned long newbrk, oldbrk;
 	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *next;
 	unsigned long min_brk;
 	bool populate;
 
@@ -327,11 +335,12 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
 	}
 
 	/* Check against existing mmap mappings. */
-	if (find_vma_intersection(mm, oldbrk, newbrk+PAGE_SIZE))
+	next = find_vma(mm, oldbrk);
+	if (next && newbrk + PAGE_SIZE > vm_start_gap(next))
 		goto out;
 
 	/* Ok, looks good - let it rip. */
-	if (do_brk(oldbrk, newbrk-oldbrk) != oldbrk)
+	if (do_brk(oldbrk, newbrk-oldbrk, UB_HARD) != oldbrk)
 		goto out;
 
 set_brk:
@@ -350,10 +359,22 @@ out:
 
 static long vma_compute_subtree_gap(struct vm_area_struct *vma)
 {
-	unsigned long max, subtree_gap;
-	max = vma->vm_start;
-	if (vma->vm_prev)
-		max -= vma->vm_prev->vm_end;
+	unsigned long max, prev_end, subtree_gap;
+
+	/*
+	 * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we
+	 * allow two stack_guard_gaps between them here, and when choosing
+	 * an unmapped area; whereas when expanding we only require one.
+	 * That's a little inconsistent, but keeps the code here simpler.
+	 */
+	max = vm_start_gap(vma);
+	if (vma->vm_prev) {
+		prev_end = vm_end_gap(vma->vm_prev);
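+		/*
+		 * With guard gaps factored in, vm_end_gap() of the previous
+		 * vma may overlap our vm_start_gap(), so clamp the gap to
+		 * zero instead of letting the subtraction wrap.
+		 */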
+		if (max > prev_end)
+			max -= prev_end;
+		else
+			max = 0;
+	}
 	if (vma->vm_rb.rb_left) {
 		subtree_gap = rb_entry(vma->vm_rb.rb_left,
 				struct vm_area_struct, vm_rb)->rb_subtree_gap;
@@ -437,7 +458,7 @@ void validate_mm(struct mm_struct *mm)
 		list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 			anon_vma_interval_tree_verify(avc);
 		vma_unlock_anon_vma(vma);
-		highest_address = vma->vm_end;
+		highest_address = vm_end_gap(vma);
 		vma = vma->vm_next;
 		i++;
 	}
@@ -605,7 +626,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
 	if (vma->vm_next)
 		vma_gap_update(vma->vm_next);
 	else
-		mm->highest_vm_end = vma->vm_end;
+		mm->highest_vm_end = vm_end_gap(vma);
 
 	/*
 	 * vma->vm_prev wasn't known when we followed the rbtree to find the
@@ -636,10 +657,7 @@ static void __vma_link_file(struct vm_area_struct *vma)
 			atomic_inc(&mapping->i_mmap_writable);
 
 		flush_dcache_mmap_lock(mapping);
-		if (unlikely(vma->vm_flags & VM_NONLINEAR))
-			vma_nonlinear_insert(vma, &mapping->i_mmap_nonlinear);
-		else
-			vma_interval_tree_insert(vma, &mapping->i_mmap);
+		vma_interval_tree_insert(vma, &mapping->i_mmap);
 		flush_dcache_mmap_unlock(mapping);
 	}
 }
@@ -771,14 +789,11 @@ again:			remove_next = 1 + (end > next->vm_end);
 
 	if (file) {
 		mapping = file->f_mapping;
-		if (!(vma->vm_flags & VM_NONLINEAR)) {
-			root = &mapping->i_mmap;
-			uprobe_munmap(vma, vma->vm_start, vma->vm_end);
+		root = &mapping->i_mmap;
+		uprobe_munmap(vma, vma->vm_start, vma->vm_end);
 
-			if (adjust_next)
-				uprobe_munmap(next, next->vm_start,
-							next->vm_end);
-		}
+		if (adjust_next)
+			uprobe_munmap(next, next->vm_start, next->vm_end);
 
 		mutex_lock(&mapping->i_mmap_mutex);
 		if (insert) {
@@ -854,7 +869,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 			vma_gap_update(vma);
 		if (end_changed) {
 			if (!next)
-				mm->highest_vm_end = end;
+				mm->highest_vm_end = vm_end_gap(vma);
 			else if (!adjust_next)
 				vma_gap_update(next);
 		}
@@ -885,7 +900,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 			anon_vma_merge(vma, next);
 		mm->map_count--;
 		mpol_put(vma_policy(next));
-		kmem_cache_free(vm_area_cachep, next);
+		free_vma(mm, next);
 		/*
 		 * In mprotect's case 6 (see comments on vma_merge),
 		 * we must remove another next too. It would clutter
@@ -897,7 +912,7 @@ again:			remove_next = 1 + (end > next->vm_end);
 		else if (next)
 			vma_gap_update(next);
 		else
-			mm->highest_vm_end = end;
+			WARN_ON(mm->highest_vm_end != vm_end_gap(vma));
 	}
 	if (insert && file)
 		uprobe_mmap(insert);
@@ -1300,7 +1315,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
 		locked += mm->locked_vm;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		if (locked > lock_limit && !ve_capable(CAP_IPC_LOCK))
 			return -EAGAIN;
 	}
 
@@ -1522,6 +1537,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
 	int error;
 	struct rb_node **rb_link, *rb_parent;
 	unsigned long charged = 0;
+	unsigned long ub_charged = 0;
 
 	/* Check against address space limit. */
 	if (!may_expand_vm(mm, len >> PAGE_SHIFT)) {
@@ -1559,6 +1575,10 @@ munmap_back:
 		vm_flags |= VM_ACCOUNT;
 	}
 
+	if (ub_memory_charge(mm, len, vm_flags, file, UB_HARD))
+		goto charge_error;
+	ub_charged = 1;
+
 	/*
 	 * Can we just expand an old mapping?
 	 */
@@ -1572,7 +1592,7 @@ munmap_back:
 	 * specific mapper. the address has already been validated, but
 	 * not unmapped, but the maps are removed from the list.
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (!vma) {
 		error = -ENOMEM;
 		goto unacct_error;
@@ -1611,6 +1631,18 @@ munmap_back:
 		error = file->f_op->mmap(file, vma);
 		if (error)
 			goto unmap_and_free_vma;
+		if (vm_flags != vma->vm_flags) {
+			/*
+			 * ->vm_flags has been changed in the f_op->mmap
+			 * method.  We have to recharge ub memory.
+			 */
+			ub_memory_uncharge(mm, len, vm_flags, file);
+			if (ub_memory_charge(mm, len, vma->vm_flags, file, UB_HARD)) {
+				ub_charged = 0;
+				error = -ENOMEM;
+				goto unmap_and_free_vma;
+			}
+		}
 
 		/* Can addr have changed??
 		 *
@@ -1649,8 +1681,10 @@ out:
 		if (!((vm_flags & VM_SPECIAL) || is_vm_hugetlb_page(vma) ||
 					vma == get_gate_vma(current->mm)))
 			mm->locked_vm += (len >> PAGE_SHIFT);
-		else
+		else {
 			vma->vm_flags &= ~VM_LOCKED;
+			ub_locked_uncharge(mm, len);
+		}
 	}
 
 	if (file)
@@ -1682,8 +1716,11 @@ allow_write_and_free_vma:
 	if (vm_flags & VM_DENYWRITE)
 		allow_write_access(file);
 free_vma:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
 unacct_error:
+	if (ub_charged)
+		ub_memory_uncharge(mm, len, vm_flags, file);
+charge_error:
 	if (charged)
 		vm_unacct_memory(charged);
 	return error;
@@ -1726,7 +1763,7 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
 
 	while (true) {
 		/* Visit left subtree if it looks promising */
-		gap_end = vma->vm_start;
+		gap_end = vm_start_gap(vma);
 		if (gap_end >= low_limit && vma->vm_rb.rb_left) {
 			struct vm_area_struct *left =
 				rb_entry(vma->vm_rb.rb_left,
@@ -1737,12 +1774,13 @@ unsigned long unmapped_area(struct vm_unmapped_area_info *info)
 			}
 		}
 
-		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+		gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
 check_current:
 		/* Check if current node has a suitable gap */
 		if (gap_start > high_limit)
 			return -ENOMEM;
-		if (gap_end >= low_limit && gap_end - gap_start >= length)
+		if (gap_end >= low_limit &&
+		    gap_end > gap_start && gap_end - gap_start >= length)
 			goto found;
 
 		/* Visit right subtree if it looks promising */
@@ -1764,8 +1802,8 @@ check_current:
 			vma = rb_entry(rb_parent(prev),
 				       struct vm_area_struct, vm_rb);
 			if (prev == vma->vm_rb.rb_left) {
-				gap_start = vma->vm_prev->vm_end;
-				gap_end = vma->vm_start;
+				gap_start = vm_end_gap(vma->vm_prev);
+				gap_end = vm_start_gap(vma);
 				goto check_current;
 			}
 		}
@@ -1829,7 +1867,7 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
 
 	while (true) {
 		/* Visit right subtree if it looks promising */
-		gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0;
+		gap_start = vma->vm_prev ? vm_end_gap(vma->vm_prev) : 0;
 		if (gap_start <= high_limit && vma->vm_rb.rb_right) {
 			struct vm_area_struct *right =
 				rb_entry(vma->vm_rb.rb_right,
@@ -1842,10 +1880,11 @@ unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info)
 
 check_current:
 		/* Check if current node has a suitable gap */
-		gap_end = vma->vm_start;
+		gap_end = vm_start_gap(vma);
 		if (gap_end < low_limit)
 			return -ENOMEM;
-		if (gap_start <= high_limit && gap_end - gap_start >= length)
+		if (gap_start <= high_limit &&
+		    gap_end > gap_start && gap_end - gap_start >= length)
 			goto found;
 
 		/* Visit left subtree if it looks promising */
@@ -1868,7 +1907,7 @@ check_current:
 				       struct vm_area_struct, vm_rb);
 			if (prev == vma->vm_rb.rb_right) {
 				gap_start = vma->vm_prev ?
-					vma->vm_prev->vm_end : 0;
+					vm_end_gap(vma->vm_prev) : 0;
 				goto check_current;
 			}
 		}
@@ -1906,7 +1945,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 		unsigned long len, unsigned long pgoff, unsigned long flags)
 {
 	struct mm_struct *mm = current->mm;
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma, *prev;
 	struct vm_unmapped_area_info info;
 
 	if (len > TASK_SIZE - mmap_min_addr)
@@ -1917,9 +1956,10 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
 
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
-		vma = find_vma(mm, addr);
+		vma = find_vma_prev(mm, addr, &prev);
 		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
-		    (!vma || addr + len <= vma->vm_start))
+		    (!vma || addr + len <= vm_start_gap(vma)) &&
+		    (!prev || addr >= vm_end_gap(prev)))
 			return addr;
 	}
 
@@ -1951,7 +1991,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 			  const unsigned long len, const unsigned long pgoff,
 			  const unsigned long flags)
 {
-	struct vm_area_struct *vma;
+	struct vm_area_struct *vma, *prev;
 	struct mm_struct *mm = current->mm;
 	unsigned long addr = addr0;
 	struct vm_unmapped_area_info info;
@@ -1966,9 +2006,10 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
 	/* requesting a specific address */
 	if (addr) {
 		addr = PAGE_ALIGN(addr);
-		vma = find_vma(mm, addr);
+		vma = find_vma_prev(mm, addr, &prev);
 		if (TASK_SIZE - len >= addr && addr >= mmap_min_addr &&
-				(!vma || addr + len <= vma->vm_start))
+				(!vma || addr + len <= vm_start_gap(vma)) &&
+				(!prev || addr >= vm_end_gap(prev)))
 			return addr;
 	}
 
@@ -2108,7 +2149,8 @@ find_vma_prev(struct mm_struct *mm, unsigned long addr,
  * update accounting. This is shared with both the
  * grow-up and grow-down cases.
  */
-static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, unsigned long grow)
+static int acct_stack_growth(struct vm_area_struct *vma,
+			    unsigned long size, unsigned long grow)
 {
 	struct mm_struct *mm = vma->vm_mm;
 	struct rlimit *rlim = current->signal->rlim;
@@ -2129,7 +2171,7 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 		locked = mm->locked_vm + grow;
 		limit = ACCESS_ONCE(rlim[RLIMIT_MEMLOCK].rlim_cur);
 		limit >>= PAGE_SHIFT;
-		if (locked > limit && !capable(CAP_IPC_LOCK))
+		if (locked > limit && !ve_capable(CAP_IPC_LOCK))
 			return -ENOMEM;
 	}
 
@@ -2139,18 +2181,27 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
 	if (is_hugepage_only_range(vma->vm_mm, new_start, size))
 		return -EFAULT;
 
+	if (ub_memory_charge(mm, grow << PAGE_SHIFT, vma->vm_flags,
+				vma->vm_file, UB_SOFT))
+		goto fail_charge;
+
 	/*
 	 * Overcommit..  This must be the final test, as it will
 	 * update security statistics.
 	 */
 	if (security_vm_enough_memory_mm(mm, grow))
-		return -ENOMEM;
+		goto fail_sec;
 
 	/* Ok, everything looks good - let it rip */
 	if (vma->vm_flags & VM_LOCKED)
 		mm->locked_vm += grow;
 	vm_stat_account(mm, vma->vm_flags, vma->vm_file, grow);
 	return 0;
+
+fail_sec:
+	ub_memory_uncharge(mm, grow << PAGE_SHIFT, vma->vm_flags, vma->vm_file);
+fail_charge:
+	return -ENOMEM;
 }
 
 #if defined(CONFIG_STACK_GROWSUP) || defined(CONFIG_IA64)
@@ -2160,32 +2211,40 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
  */
 int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 {
-	int error;
+	struct vm_area_struct *next;
+	unsigned long gap_addr;
+	int error = 0;
 
 	if (!(vma->vm_flags & VM_GROWSUP))
 		return -EFAULT;
 
-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
+	/* Guard against wrapping around to address 0. */
+	address &= PAGE_MASK;
+	address += PAGE_SIZE;
+	if (!address)
+		return -ENOMEM;
+
+	/* Enforce stack_guard_gap */
+	gap_addr = address + stack_guard_gap;
+	if (gap_addr < address)
+		return -ENOMEM;
+	next = vma->vm_next;
+	if (next && next->vm_start < gap_addr) {
+		if (!(next->vm_flags & VM_GROWSUP))
+			return -ENOMEM;
+		/* Check that both stack segments have the same anon_vma? */
+	}
+
+	/* We must make sure the anon_vma is allocated. */
 	if (unlikely(anon_vma_prepare(vma)))
 		return -ENOMEM;
-	vma_lock_anon_vma(vma);
 
 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode.  We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
-	 * Also guard against wrapping around to address 0.
 	 */
-	if (address < PAGE_ALIGN(address+4))
-		address = PAGE_ALIGN(address+4);
-	else {
-		vma_unlock_anon_vma(vma);
-		return -ENOMEM;
-	}
-	error = 0;
+	vma_lock_anon_vma(vma);
 
 	/* Somebody else might have raced and expanded it already */
 	if (address > vma->vm_end) {
@@ -2216,7 +2275,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 				if (vma->vm_next)
 					vma_gap_update(vma->vm_next);
 				else
-					vma->vm_mm->highest_vm_end = address;
+					vma->vm_mm->highest_vm_end = vm_end_gap(vma);
 				spin_unlock(&vma->vm_mm->page_table_lock);
 
 				perf_event_mmap(vma);
@@ -2236,27 +2295,36 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
 int expand_downwards(struct vm_area_struct *vma,
 				   unsigned long address)
 {
+	struct vm_area_struct *prev;
+	unsigned long gap_addr;
 	int error;
 
-	/*
-	 * We must make sure the anon_vma is allocated
-	 * so that the anon_vma locking is not a noop.
-	 */
-	if (unlikely(anon_vma_prepare(vma)))
-		return -ENOMEM;
-
 	address &= PAGE_MASK;
 	error = security_mmap_addr(address);
 	if (error)
 		return error;
 
-	vma_lock_anon_vma(vma);
+	/* Enforce stack_guard_gap */
+	gap_addr = address - stack_guard_gap;
+	if (gap_addr > address)
+		return -ENOMEM;
+	prev = vma->vm_prev;
+	if (prev && prev->vm_end > gap_addr) {
+		if (!(prev->vm_flags & VM_GROWSDOWN))
+			return -ENOMEM;
+		/* Check that both stack segments have the same anon_vma? */
+	}
+
+	/* We must make sure the anon_vma is allocated. */
+	if (unlikely(anon_vma_prepare(vma)))
+		return -ENOMEM;
 
 	/*
 	 * vma->vm_start/vm_end cannot change under us because the caller
 	 * is required to hold the mmap_sem in read mode.  We need the
 	 * anon_vma lock to serialize against concurrent expand_stacks.
 	 */
+	vma_lock_anon_vma(vma);
 
 	/* Somebody else might have raced and expanded it already */
 	if (address < vma->vm_start) {
@@ -2298,28 +2366,25 @@ int expand_downwards(struct vm_area_struct *vma,
 	return error;
 }
 
-/*
- * Note how expand_stack() refuses to expand the stack all the way to
- * abut the next virtual mapping, *unless* that mapping itself is also
- * a stack mapping. We want to leave room for a guard page, after all
- * (the guard page itself is not added here, that is done by the
- * actual page faulting logic)
- *
- * This matches the behavior of the guard page logic (see mm/memory.c:
- * check_stack_guard_page()), which only allows the guard page to be
- * removed under these circumstances.
- */
+/* enforced gap between the expanding stack and other mappings. */
+unsigned long stack_guard_gap = 256UL<<PAGE_SHIFT;
+
+static int __init cmdline_parse_stack_guard_gap(char *p)
+{
+	unsigned long val;
+	char *endptr;
+
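+	/* The parameter is in pages: e.g. stack_guard_gap=256 keeps a 1MB
+	 * gap on 4KB-page systems, matching the default set above. */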
+	val = simple_strtoul(p, &endptr, 10);
+	if (!*endptr)
+		stack_guard_gap = val << PAGE_SHIFT;
+
+	return 0;
+}
+__setup("stack_guard_gap=", cmdline_parse_stack_guard_gap);
+
 #ifdef CONFIG_STACK_GROWSUP
 int expand_stack(struct vm_area_struct *vma, unsigned long address)
 {
-	struct vm_area_struct *next;
-
-	address &= PAGE_MASK;
-	next = vma->vm_next;
-	if (next && next->vm_start == address + PAGE_SIZE) {
-		if (!(next->vm_flags & VM_GROWSUP))
-			return -ENOMEM;
-	}
 	return expand_upwards(vma, address);
 }
 
@@ -2341,14 +2406,6 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr)
 #else
 int expand_stack(struct vm_area_struct *vma, unsigned long address)
 {
-	struct vm_area_struct *prev;
-
-	address &= PAGE_MASK;
-	prev = vma->vm_prev;
-	if (prev && prev->vm_end == address) {
-		if (!(prev->vm_flags & VM_GROWSDOWN))
-			return -ENOMEM;
-	}
 	return expand_downwards(vma, address);
 }
 
@@ -2447,7 +2504,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
 		vma->vm_prev = prev;
 		vma_gap_update(vma);
 	} else
-		mm->highest_vm_end = prev ? prev->vm_end : 0;
+		mm->highest_vm_end = prev ? vm_end_gap(prev) : 0;
 	tail_vma->vm_next = NULL;
 	if (mm->unmap_area == arch_unmap_area)
 		addr = prev ? prev->vm_end : mm->mmap_base;
@@ -2472,7 +2529,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
 					~(huge_page_mask(hstate_vma(vma)))))
 		return -EINVAL;
 
-	new = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+	new = allocate_vma(mm, GFP_KERNEL);
 	if (!new)
 		goto out_err;
 
@@ -2523,7 +2580,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
  out_free_mpol:
 	mpol_put(pol);
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new);
+	free_vma(mm, new);
  out_err:
 	return err;
 }
@@ -2648,6 +2705,99 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len)
 	return vm_munmap(addr, len);
 }
 
+/*
+ * Emulation of deprecated remap_file_pages() syscall.
+ */
+SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
+		unsigned long, prot, unsigned long, pgoff, unsigned long, flags)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	unsigned long populate = 0;
+	unsigned long ret = -EINVAL;
+	struct file *file;
+
+	pr_warn_once("%s (%d) uses deprecated remap_file_pages() syscall. "
+			"See Documentation/vm/remap_file_pages.txt.\n",
+			current->comm, current->pid);
+
+	if (prot)
+		return ret;
+	start = start & PAGE_MASK;
+	size = size & PAGE_MASK;
+
+	if (start + size <= start)
+		return ret;
+
+	/* Does pgoff wrap? */
+	if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+		return ret;
+
+	down_write(&mm->mmap_sem);
+	vma = find_vma(mm, start);
+
+	if (!vma || !(vma->vm_flags & VM_SHARED))
+		goto out;
+
+	if (start < vma->vm_start)
+		goto out;
+
+	if (start + size > vma->vm_end) {
+		struct vm_area_struct *next;
+
+		for (next = vma->vm_next; next; next = next->vm_next) {
+			/* hole between vmas ? */
+			if (next->vm_start != next->vm_prev->vm_end)
+				goto out;
+
+			if (next->vm_file != vma->vm_file)
+				goto out;
+
+			if (next->vm_flags != vma->vm_flags)
+				goto out;
+
+			if (start + size <= next->vm_end)
+				break;
+		}
+
+		if (!next)
+			goto out;
+	}
+
+	prot |= vma->vm_flags & VM_READ ? PROT_READ : 0;
+	prot |= vma->vm_flags & VM_WRITE ? PROT_WRITE : 0;
+	prot |= vma->vm_flags & VM_EXEC ? PROT_EXEC : 0;
+
+	flags &= MAP_NONBLOCK;
+	flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
+	if (vma->vm_flags & VM_LOCKED) {
+		struct vm_area_struct *tmp;
+		flags |= MAP_LOCKED;
+
+		/* drop PG_Mlocked flag for over-mapped range */
+		for (tmp = vma; tmp && tmp->vm_start < start + size;
+				tmp = tmp->vm_next) {
+			munlock_vma_pages_range(tmp,
+					max(tmp->vm_start, start),
+					min(tmp->vm_end, start + size));
+		}
+	}
+
+	file = get_file(vma->vm_file);
+	ret = do_mmap_pgoff(vma->vm_file, start, size,
+			prot, flags, pgoff, &populate);
+	fput(file);
+out:
+	up_write(&mm->mmap_sem);
+	if (populate)
+		mm_populate(ret, populate);
+	if (!IS_ERR_VALUE(ret))
+		ret = 0;
+	return ret;
+}
+
 static inline void verify_mm_writelocked(struct mm_struct *mm)
 {
 #ifdef CONFIG_DEBUG_VM
@@ -2663,7 +2813,7 @@ static inline void verify_mm_writelocked(struct mm_struct *mm)
  *  anonymous maps.  eventually we may be able to do some
  *  brk-specific accounting here.
  */
-static unsigned long do_brk(unsigned long addr, unsigned long len)
+static unsigned long do_brk(unsigned long addr, unsigned long len, int soft)
 {
 	struct mm_struct * mm = current->mm;
 	struct vm_area_struct * vma, * prev;
@@ -2691,7 +2841,7 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 		locked += mm->locked_vm;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		lock_limit >>= PAGE_SHIFT;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		if (locked > lock_limit && !ve_capable(CAP_IPC_LOCK))
 			return -EAGAIN;
 	}
 
@@ -2718,8 +2868,11 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 	if (mm->map_count > sysctl_max_map_count)
 		return -ENOMEM;
 
+	if (ub_memory_charge(mm, len, flags, NULL, soft))
+		goto fail_charge;
+
 	if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT))
-		return -ENOMEM;
+		goto fail_sec;
 
 	/* Can we just expand an old private anonymous mapping? */
 	vma = vma_merge(mm, prev, addr, addr + len, flags,
@@ -2730,11 +2883,9 @@ static unsigned long do_brk(unsigned long addr, unsigned long len)
 	/*
 	 * create a vma struct for an anonymous mapping
 	 */
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
-	if (!vma) {
-		vm_unacct_memory(len >> PAGE_SHIFT);
-		return -ENOMEM;
-	}
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
+	if (!vma)
+		goto fail_alloc;
 
 	INIT_LIST_HEAD(&vma->anon_vma_chain);
 	vma->vm_mm = mm;
@@ -2751,6 +2902,13 @@ out:
 		mm->locked_vm += (len >> PAGE_SHIFT);
 	vma->vm_flags |= VM_SOFTDIRTY;
 	return addr;
+
+fail_alloc:
+	vm_unacct_memory(len >> PAGE_SHIFT);
+fail_sec:
+	ub_memory_uncharge(mm, len, flags, NULL);
+fail_charge:
+	return -ENOMEM;
 }
 
 unsigned long vm_brk(unsigned long addr, unsigned long len)
@@ -2760,7 +2918,7 @@ unsigned long vm_brk(unsigned long addr, unsigned long len)
 	bool populate;
 
 	down_write(&mm->mmap_sem);
-	ret = do_brk(addr, len);
+	ret = do_brk(addr, len, UB_SOFT);
 	populate = ((mm->def_flags & VM_LOCKED) != 0);
 	up_write(&mm->mmap_sem);
 	if (populate)
@@ -2908,7 +3066,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
 		}
 		*need_rmap_locks = (new_vma->vm_pgoff <= vma->vm_pgoff);
 	} else {
-		new_vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL);
+		new_vma = allocate_vma(mm, GFP_KERNEL);
 		if (new_vma) {
 			*new_vma = *vma;
 			new_vma->vm_start = addr;
@@ -2934,7 +3092,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
  out_free_mempol:
 	mpol_put(pol);
  out_free_vma:
-	kmem_cache_free(vm_area_cachep, new_vma);
+	free_vma(mm, new_vma);
 	return NULL;
 }
 
@@ -3010,7 +3168,7 @@ int install_special_mapping(struct mm_struct *mm,
 	int ret;
 	struct vm_area_struct *vma;
 
-	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
+	vma = allocate_vma(mm, GFP_KERNEL | __GFP_ZERO);
 	if (unlikely(vma == NULL))
 		return -ENOMEM;
 
@@ -3036,7 +3194,7 @@ int install_special_mapping(struct mm_struct *mm,
 	return 0;
 
 out:
-	kmem_cache_free(vm_area_cachep, vma);
+	free_vma(mm, vma);
 	return ret;
 }
 
@@ -3094,8 +3252,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
  *
  * mmap_sem in write mode is required in order to block all operations
  * that could modify pagetables and free pages without need of
- * altering the vma layout (for example populate_range() with
- * nonlinear vmas). It's also needed in write mode to avoid new
+ * altering the vma layout. It's also needed in write mode to avoid new
  * anon_vmas to be associated with existing vmas.
  *
  * A single task can't take more than one mm_take_all_locks() in a row
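
[Note: vm_start_gap()/vm_end_gap(), used throughout the hunks above, are
added to include/linux/mm.h elsewhere in this series.  For reference, the
upstream stack-guard-gap fix defines them along these lines:

	static inline unsigned long vm_start_gap(struct vm_area_struct *vma)
	{
		unsigned long vm_start = vma->vm_start;

		if (vma->vm_flags & VM_GROWSDOWN) {
			vm_start -= stack_guard_gap;
			if (vm_start > vma->vm_start)	/* underflow */
				vm_start = 0;
		}
		return vm_start;
	}

	static inline unsigned long vm_end_gap(struct vm_area_struct *vma)
	{
		unsigned long vm_end = vma->vm_end;

		if (vma->vm_flags & VM_GROWSUP) {
			vm_end += stack_guard_gap;
			if (vm_end < vma->vm_end)	/* overflow */
				vm_end = -PAGE_SIZE;
		}
		return vm_end;
	}

A stack vma thus claims stack_guard_gap extra bytes on its growing side
whenever gaps and overlaps are computed.]
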
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -168,6 +168,22 @@ int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
 	return young;
 }
 
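+/*
+ * A lighter-weight variant of __mmu_notifier_clear_flush_young() above:
+ * the young bit is cleared on the secondary MMUs, but the caller is not
+ * expected to flush the primary TLB afterwards.
+ */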
+int __mmu_notifier_clear_young(struct mm_struct *mm,
+			       unsigned long address)
+{
+	struct mmu_notifier *mn;
+	int young = 0, id;
+
+	id = srcu_read_lock(&srcu);
+	hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) {
+		if (mn->ops->clear_young)
+			young |= mn->ops->clear_young(mn, mm, address);
+	}
+	srcu_read_unlock(&srcu, id);
+
+	return young;
+}
+
 int __mmu_notifier_test_young(struct mm_struct *mm,
 			      unsigned long address)
 {
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -8,11 +8,13 @@
 #include <linux/stddef.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/module.h>
 
 struct pglist_data *first_online_pgdat(void)
 {
 	return NODE_DATA(first_online_node);
 }
+EXPORT_SYMBOL(first_online_pgdat);
 
 struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 {
@@ -22,6 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat)
 		return NULL;
 	return NODE_DATA(nid);
 }
+EXPORT_SYMBOL(next_online_pgdat);
 
 /*
  * next_zone - helper magic for for_each_zone()
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -24,11 +24,14 @@
 #include <linux/migrate.h>
 #include <linux/perf_event.h>
 #include <linux/ksm.h>
+#include <linux/module.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <bc/vmpages.h>
+
 /*
  * For a prot_numa update we only hold mmap_sem for read so there is a
  * potential race with faulting where a pmd was temporarily none. This
@@ -105,7 +108,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 			}
 			if (updated)
 				pages++;
-		} else if (IS_ENABLED(CONFIG_MIGRATION) && !pte_file(oldpte)) {
+		} else if (IS_ENABLED(CONFIG_MIGRATION)) {
 			swp_entry_t entry = pte_to_swp_entry(oldpte);
 
 			if (is_write_migration_entry(entry)) {
@@ -271,6 +274,12 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 		return 0;
 	}
 
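+	/*
+	 * Charge UB_PRIVVMPAGES when the new flags make the mapping
+	 * private while the old ones did not; the mirror uncharge runs
+	 * on success for the opposite transition and on every failure
+	 * path below.
+	 */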
+	error = -ENOMEM;
+	if (!VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    VM_UB_PRIVATE(newflags, vma->vm_file) &&
+	    charge_beancounter_fast(mm_ub(mm), UB_PRIVVMPAGES, nrpages, UB_SOFT))
+		goto fail_ch;
+
 	/*
 	 * If we make a private mapping writable we increase our commit;
 	 * but (without finer accounting) cannot reduce our commit if we
@@ -282,7 +291,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
 						VM_SHARED|VM_NORESERVE))) {
 			charged = nrpages;
 			if (security_vm_enough_memory_mm(mm, charged))
-				return -ENOMEM;
+				goto fail_sec;
 			newflags |= VM_ACCOUNT;
 		}
 	}
@@ -327,11 +336,21 @@ success:
 
 	vm_stat_account(mm, oldflags, vma->vm_file, -nrpages);
 	vm_stat_account(mm, newflags, vma->vm_file, nrpages);
+
+	if (VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    !VM_UB_PRIVATE(newflags, vma->vm_file))
+		uncharge_beancounter_fast(mm_ub(mm), UB_PRIVVMPAGES, nrpages);
+
 	perf_event_mmap(vma);
 	return 0;
 
 fail:
 	vm_unacct_memory(charged);
+fail_sec:
+	if (!VM_UB_PRIVATE(oldflags, vma->vm_file) &&
+	    VM_UB_PRIVATE(newflags, vma->vm_file))
+		uncharge_beancounter_fast(mm_ub(mm), UB_PRIVVMPAGES, nrpages);
+fail_ch:
 	return error;
 }
 
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -24,6 +24,8 @@
 #include <linux/uaccess.h>
 #include <linux/mm-arch-hooks.h>
 
+#include <bc/vmpages.h>
+
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
@@ -83,8 +85,6 @@ static pte_t move_soft_dirty_pte(pte_t pte)
 		pte = pte_mksoft_dirty(pte);
 	else if (is_swap_pte(pte))
 		pte = pte_swp_mksoft_dirty(pte);
-	else if (pte_file(pte))
-		pte = pte_file_mksoft_dirty(pte);
 #endif
 	return pte;
 }
@@ -250,12 +250,16 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	int err;
 	bool need_rmap_locks;
 
+	if (ub_memory_charge(mm, new_len, vm_flags,
+			     vma->vm_file, UB_HARD))
+		goto err;
+
 	/*
 	 * We'd prefer to avoid failure later on in do_munmap:
 	 * which may split one vma into three before unmapping.
 	 */
 	if (mm->map_count >= sysctl_max_map_count - 3)
-		return -ENOMEM;
+		goto err_nomem;
 
 	/*
 	 * Advise KSM to break any KSM pages in the area to be moved:
@@ -267,13 +271,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 	err = ksm_madvise(vma, old_addr, old_addr + old_len,
 						MADV_UNMERGEABLE, &vm_flags);
 	if (err)
-		return err;
+		goto err_nomem;
 
 	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
 	new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
 			   &need_rmap_locks);
 	if (!new_vma)
-		return -ENOMEM;
+		goto err_nomem;
 
 	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
 				     need_rmap_locks);
@@ -344,7 +348,13 @@ static unsigned long move_vma(struct vm_area_struct *vma,
 		*locked = true;
 	}
 
-	return new_addr;
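+	/*
+	 * new_addr may carry -ENOMEM from a failed move above; in that
+	 * case fall through to err_nomem and drop the ub charge taken
+	 * at function entry.
+	 */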
+	if (new_addr != -ENOMEM)
+		return new_addr;
+
+err_nomem:
+	ub_memory_uncharge(mm, new_len, vm_flags, vma->vm_file);
+err:
+	return -ENOMEM;
 }
 
 static struct vm_area_struct *vma_to_resize(unsigned long addr,
@@ -380,7 +390,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
 		locked = mm->locked_vm << PAGE_SHIFT;
 		lock_limit = rlimit(RLIMIT_MEMLOCK);
 		locked += new_len - old_len;
-		if (locked > lock_limit && !capable(CAP_IPC_LOCK))
+		if (locked > lock_limit && !ve_capable(CAP_IPC_LOCK))
 			goto Eagain;
 	}
 
@@ -550,10 +560,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
 	if (old_len == vma->vm_end - addr) {
 		/* can we just expand the current mapping? */
 		if (vma_expandable(vma, new_len - old_len)) {
-			int pages = (new_len - old_len) >> PAGE_SHIFT;
+			unsigned long len = (new_len - old_len);
+			int pages = len >> PAGE_SHIFT;
+
+			ret = -ENOMEM;
+			if (ub_memory_charge(mm, len, vma->vm_flags,
+						vma->vm_file, UB_HARD))
+				goto out;
 
 			if (vma_adjust(vma, vma->vm_start, addr + new_len,
 				       vma->vm_pgoff, NULL)) {
+				ub_memory_uncharge(mm, len,
+						vma->vm_flags, vma->vm_file);
 				ret = -ENOMEM;
 				goto out;
 			}
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -48,6 +48,8 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 	if (end < start)
 		goto out;
 	error = 0;
+	if (ve_fsync_behavior() == FSYNC_NEVER)
+		goto out;
 	if (end == start)
 		goto out;
 	/*
@@ -86,10 +88,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
 				(vma->vm_flags & VM_SHARED)) {
 			get_file(file);
 			up_read(&mm->mmap_sem);
-			if (vma->vm_flags & VM_NONLINEAR)
-				error = vfs_fsync(file, 1);
-			else
-				error = vfs_fsync_range(file, fstart, fend, 1);
+			error = vfs_fsync_range(file, fstart, fend, 1);
 			fput(file);
 			if (error || start >= end)
 				goto out;
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -567,7 +567,7 @@ void __init mmap_init(void)
 
 	ret = percpu_counter_init(&vm_committed_as, 0, GFP_KERNEL);
 	VM_BUG_ON(ret);
-	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
+	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC|SLAB_ACCOUNT);
 }
 
 /*
@@ -2020,14 +2020,6 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 }
 EXPORT_SYMBOL(filemap_fault);
 
-int generic_file_remap_pages(struct vm_area_struct *vma, unsigned long addr,
-			     unsigned long size, pgoff_t pgoff)
-{
-	BUG();
-	return 0;
-}
-EXPORT_SYMBOL(generic_file_remap_pages);
-
 static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long addr, void *buf, int len, int write)
 {
--- /dev/null
+++ b/mm/oom_group.c
@@ -0,0 +1,226 @@
+/*
+ *  mm/oom_group.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <asm/uaccess.h>
+#include <linux/seq_file.h>
+#include <linux/ctype.h>
+#include <linux/oom.h>
+#include <linux/ve.h>
+
+#include <bc/beancounter.h>
+
+static LIST_HEAD(oom_group_list_head);
+static DEFINE_RWLOCK(oom_group_lock);
+
+struct oom_group_pattern {
+	char comm[TASK_COMM_LEN], pcomm[TASK_COMM_LEN];
+	int oom_uid;
+	int oom_score_adj;
+	struct list_head group_list;
+};
+
+static void oom_groups_append(struct list_head *list)
+{
+	write_lock_irq(&oom_group_lock);
+	list_splice_tail(list, &oom_group_list_head);
+	write_unlock_irq(&oom_group_lock);
+}
+
+static void oom_groups_reset(void)
+{
+	struct list_head list;
+	struct oom_group_pattern *gp, *tmp;
+
+	write_lock_irq(&oom_group_lock);
+	list_replace_init(&oom_group_list_head, &list);
+	write_unlock_irq(&oom_group_lock);
+
+	list_for_each_entry_safe(gp, tmp, &list, group_list)
+		kfree(gp);
+}
+
+/*
+ * If mask ends with asterisk it matches any comm suffix:
+ * "foo" matches only "foo", "foo*" matches "foo" and "foobar"
+ * "*" matches any string.
+ */
+static bool oom_match_comm(const char *comm, const char *mask)
+{
+	while (*comm && *mask != '*' && *comm == *mask) {
+		comm++;
+		mask++;
+	}
+	return (!*mask && !*comm) || (*mask == '*');
+}
+
+int get_task_oom_score_adj(struct task_struct *t)
+{
+	struct oom_group_pattern *gp;
+	unsigned long flags;
+	const struct cred *cred;
+	uid_t task_uid;
+	int adj = t->signal->oom_score_adj;
+
+	/* Do not impose grouping rules if the score is adjusted by the user */
+	if (adj != 0)
+		return adj;
+
+	rcu_read_lock();
+	cred = __task_cred(t);
+	task_uid = from_kuid_munged(cred->user_ns, cred->uid);
+	rcu_read_unlock();
+
+	read_lock_irqsave(&oom_group_lock, flags);
+	list_for_each_entry(gp, &oom_group_list_head, group_list) {
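+		/*
+		 * oom_uid >= 0 matches that uid exactly, -1 matches any
+		 * uid, and oom_uid < -1 matches every uid below -oom_uid.
+		 */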
+		if (gp->oom_uid >= 0 && task_uid != gp->oom_uid)
+			continue;
+		if (gp->oom_uid < -1 && task_uid >= -gp->oom_uid)
+			continue;
+		if (!oom_match_comm(t->comm, gp->comm))
+			continue;
+		if (!oom_match_comm(t->parent->comm, gp->pcomm))
+			continue;
+		adj = gp->oom_score_adj;
+		break;
+	}
+	read_unlock_irqrestore(&oom_group_lock, flags);
+	return adj;
+}
+
+static int oom_group_parse_line(struct list_head *list, char *line)
+{
+	struct oom_group_pattern *gp;
+	char dummy;
+	int ret;
+
+	gp = kmalloc(sizeof(struct oom_group_pattern), GFP_KERNEL);
+	if (gp == NULL)
+		return -ENOMEM;
+
+	BUILD_BUG_ON(TASK_COMM_LEN != 16);
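+	/* Exactly four fields are expected; the trailing %c makes sscanf()
+	 * return 5 on trailing garbage, which is rejected below. */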
+	ret = sscanf(line, "%15s %15s %d %d %c",
+			gp->comm, gp->pcomm, &gp->oom_uid,
+			&gp->oom_score_adj, &dummy);
+
+	if (ret != 4 || gp->oom_score_adj < OOM_SCORE_ADJ_MIN ||
+			gp->oom_score_adj > OOM_SCORE_ADJ_MAX) {
+		kfree(gp);
+		return -EINVAL;
+	}
+
+	list_add_tail(&gp->group_list, list);
+
+	return 0;
+}
+
+static ssize_t oom_group_write(struct file * file, const char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char *line, *next, *page;
+	int ret, len;
+	LIST_HEAD(groups);
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+	len = min(count, PAGE_SIZE - 1);
+	ret = copy_from_user(page, buf, len);
+	if (ret)
+		goto err;
+
+	page[len] = '\0';
+
+	next = page;
+	while (1) {
+		line = skip_spaces(next);
+		next = strchr(line, '\n');
+		if (next) {
+			*next++ = '\0';
+		} else if (len < count) {
+			ret = line != page ? line - page : -EINVAL;
+			break;
+		}
+		if (*line && *line != '#') {
+			ret = oom_group_parse_line(&groups, line);
+			if (ret)
+				break;
+		}
+		if (!next) {
+			ret = len;
+			break;
+		}
+	}
+
+	oom_groups_append(&groups);
+err:
+	free_page((unsigned long)page);
+	return ret;
+}
+
+static void *oom_group_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_irq(&oom_group_lock);
+	return seq_list_start(&oom_group_list_head, *pos);
+}
+
+static void oom_group_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_irq(&oom_group_lock);
+}
+
+static int oom_group_seq_show(struct seq_file *s, void *v)
+{
+	struct list_head *entry = v;
+	struct oom_group_pattern *p;
+
+	p = list_entry(entry, struct oom_group_pattern, group_list);
+	seq_printf(s, "%s %s %d %d\n", p->comm, p->pcomm,
+			p->oom_uid, p->oom_score_adj);
+	return 0;
+}
+
+static void *oom_group_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &oom_group_list_head, pos);
+}
+
+static const struct seq_operations oom_group_seq_ops = {
+	.start = oom_group_seq_start,
+	.next  = oom_group_seq_next,
+	.stop  = oom_group_seq_stop,
+	.show  = oom_group_seq_show,
+};
+
+static int oom_group_seq_open(struct inode *inode, struct file *file)
+{
+	if (file->f_flags & O_TRUNC)
+		oom_groups_reset();
+	return seq_open(file, &oom_group_seq_ops);
+}
+
+static const struct file_operations proc_oom_group_ops = {
+	.owner   = THIS_MODULE,
+	.open    = oom_group_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = oom_group_write,
+};
+
+static int __init oom_group_init(void)
+{
+	struct proc_dir_entry *proc;
+
+	proc = proc_create("oom_score_adj", 0660,
+			   proc_vz_dir, &proc_oom_group_ops);
+	if (!proc)
+		return -ENOMEM;
+	return 0;
+}
+
+module_init(oom_group_init);
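
[Note: a hypothetical usage sketch for the interface above, assuming
proc_vz_dir corresponds to /proc/vz.  oom_group_parse_line() reads each
line as "<comm> <parent-comm> <uid> <oom_score_adj>"; a uid of -1 matches
any user and comm masks may end in '*':

	#include <fcntl.h>
	#include <unistd.h>

	int main(void)
	{
		/* Shield "apache*" tasks, any parent, any uid, adj -500. */
		static const char rule[] = "apache* * -1 -500\n";
		int fd = open("/proc/vz/oom_score_adj", O_WRONLY);

		if (fd < 0)
			return 1;
		if (write(fd, rule, sizeof(rule) - 1) < 0) {
			close(fd);
			return 1;
		}
		return close(fd);
	}

Opening the file with O_TRUNC discards all previously loaded rules (see
oom_group_seq_open() above).]
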
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -41,8 +41,47 @@
 
 int sysctl_panic_on_oom;
 int sysctl_oom_kill_allocating_task;
-int sysctl_oom_dump_tasks = 1;
-static DEFINE_SPINLOCK(zone_scan_lock);
+int sysctl_oom_dump_tasks;
+int sysctl_oom_relaxation = HZ;
+
+static DEFINE_SPINLOCK(oom_context_lock);
+
+#define OOM_TIMEOUT	(5 * HZ)
+
+#define OOM_BASE_RAGE	-10
+#define OOM_MAX_RAGE	20
+
+struct oom_context global_oom_ctx = {
+	.rage		= OOM_BASE_RAGE,
+	.waitq		= __WAIT_QUEUE_HEAD_INITIALIZER(global_oom_ctx.waitq),
+};
+
+void init_oom_context(struct oom_context *ctx)
+{
+	ctx->owner = NULL;
+	ctx->victim = NULL;
+	ctx->marked = false;
+	ctx->oom_start = 0;
+	ctx->oom_end = 0;
+	ctx->rage = OOM_BASE_RAGE;
+	init_waitqueue_head(&ctx->waitq);
+}
+
+static void __release_oom_context(struct oom_context *ctx)
+{
+	ctx->owner = NULL;
+	ctx->victim = NULL;
+	ctx->marked = false;
+	ctx->oom_end = jiffies;
+	wake_up_all(&ctx->waitq);
+}
+
+void release_oom_context(struct oom_context *ctx)
+{
+	spin_lock(&oom_context_lock);
+	__release_oom_context(ctx);
+	spin_unlock(&oom_context_lock);
+}
 
 #ifdef CONFIG_NUMA
 /**
@@ -137,6 +176,21 @@ static bool oom_unkillable_task(struct task_struct *p,
 	return false;
 }
 
+static unsigned long mm_overdraft(struct mm_struct *mm)
+{
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+	unsigned long overdraft;
+
+	memcg = get_mem_cgroup_from_mm(mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	overdraft = ctx->overdraft;
+	if (memcg)
+		mem_cgroup_put(memcg);
+
+	return overdraft;
+}
+
 /**
  * oom_badness - heuristic function to determine which candidate task to kill
  * @p: task struct of which task we should calculate
@@ -147,11 +201,15 @@ static bool oom_unkillable_task(struct task_struct *p,
  * task consuming the most memory to avoid subsequent oom failures.
  */
 unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
-			  const nodemask_t *nodemask, unsigned long totalpages)
+			  const nodemask_t *nodemask, unsigned long totalpages,
+			  unsigned long *overdraft)
 {
 	long points;
 	long adj;
 
+	if (overdraft)
+		*overdraft = 0;
+
 	if (oom_unkillable_task(p, memcg, nodemask))
 		return 0;
 
@@ -159,7 +217,10 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	if (!p)
 		return 0;
 
-	adj = (long)p->signal->oom_score_adj;
+	if (overdraft)
+		*overdraft = mm_overdraft(p->mm);
+
+	adj = get_task_oom_score_adj(p);
 	if (adj == OOM_SCORE_ADJ_MIN) {
 		task_unlock(p);
 		return 0;
@@ -255,24 +316,21 @@ static enum oom_constraint constrained_alloc(struct zonelist *zonelist,
 #endif
 
 enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill)
+					const nodemask_t *nodemask)
 {
-	if (task->exit_state)
-		return OOM_SCAN_CONTINUE;
 	if (oom_unkillable_task(task, NULL, nodemask))
 		return OOM_SCAN_CONTINUE;
 
 	/*
 	 * This task already has access to memory reserves and is being killed.
-	 * Don't allow any other task to have access to the reserves.
+	 * Try to select another one.
+	 *
+	 * This can only happen if oom_trylock() timed out, which most
+	 * probably means that the victim has deadlocked.
 	 */
-	if (test_tsk_thread_flag(task, TIF_MEMDIE)) {
-		if (unlikely(frozen(task)))
-			__thaw_task(task);
-		if (!force_kill)
-			return OOM_SCAN_ABORT;
-	}
+	if (test_tsk_thread_flag(task, TIF_MEMDIE))
+		return OOM_SCAN_CONTINUE;
+
 	if (!task->mm)
 		return OOM_SCAN_CONTINUE;
 
@@ -283,14 +341,6 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
 	if (oom_task_origin(task))
 		return OOM_SCAN_SELECT;
 
-	if (task->flags & PF_EXITING && !force_kill) {
-		/*
-		 * If this task is not being ptraced on exit, then wait for it
-		 * to finish before killing some other task unnecessarily.
-		 */
-		if (!(task->group_leader->ptrace & PT_TRACE_EXIT))
-			return OOM_SCAN_ABORT;
-	}
 	return OOM_SCAN_OK;
 }
 
@@ -300,43 +350,43 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task,
  *
  * (not docbooked, we don't want this one cluttering up the manual)
  */
-static struct task_struct *select_bad_process(unsigned int *ppoints,
-		unsigned long totalpages, const nodemask_t *nodemask,
-		bool force_kill)
+static struct task_struct *select_bad_process(unsigned long *ppoints,
+		unsigned long *poverdraft,
+		unsigned long totalpages, const nodemask_t *nodemask)
 {
 	struct task_struct *g, *p;
 	struct task_struct *chosen = NULL;
 	unsigned long chosen_points = 0;
+	unsigned long max_overdraft = 0;
 
 	rcu_read_lock();
 	for_each_process_thread(g, p) {
 		unsigned int points;
+		unsigned long overdraft;
 
-		switch (oom_scan_process_thread(p, totalpages, nodemask,
-						force_kill)) {
+		switch (oom_scan_process_thread(p, nodemask)) {
 		case OOM_SCAN_SELECT:
 			chosen = p;
 			chosen_points = ULONG_MAX;
+			max_overdraft = ULONG_MAX;
 			/* fall through */
 		case OOM_SCAN_CONTINUE:
 			continue;
-		case OOM_SCAN_ABORT:
-			rcu_read_unlock();
-			return ERR_PTR(-1UL);
 		case OOM_SCAN_OK:
 			break;
 		};
-		points = oom_badness(p, NULL, nodemask, totalpages);
-		if (points > chosen_points) {
+		points = oom_badness(p, NULL, nodemask, totalpages,
+				     &overdraft);
+		if (oom_worse(points, overdraft, &chosen_points,
+			      &max_overdraft))
 			chosen = p;
-			chosen_points = points;
-		}
 	}
 	if (chosen)
 		get_task_struct(chosen);
 	rcu_read_unlock();
 
-	*ppoints = chosen_points * 1000 / totalpages;
+	*ppoints = chosen_points;
+	*poverdraft = max_overdraft;
 	return chosen;
 }
 
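[Note: oom_worse() is not defined in this file; it is presumably added
alongside the other oom changes in this series.  A sketch of the semantics
the caller above relies on -- prefer the task from the most over-drafted
context, with badness points as the tie-breaker:

	static bool oom_worse(unsigned long points, unsigned long overdraft,
			      unsigned long *chosen_points,
			      unsigned long *max_overdraft)
	{
		if (overdraft > *max_overdraft) {
			*max_overdraft = overdraft;
			*chosen_points = points;
			return true;
		}
		if (overdraft == *max_overdraft && points > *chosen_points) {
			*chosen_points = points;
			return true;
		}
		return false;
	}

A true return tells the caller to record the task as the new candidate.]
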
@@ -402,13 +452,341 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
 		dump_tasks(memcg, nodemask);
 }
 
+/**
+ * mark_oom_victim - mark the given task as OOM victim
+ * @tsk: task to mark
+ */
+void mark_oom_victim(struct task_struct *tsk)
+{
+	struct mem_cgroup *memcg;
+	struct oom_context *ctx;
+
+	set_tsk_thread_flag(tsk, TIF_MEMDIE);
+
+	/*
+	 * Make sure that the task is woken up from uninterruptible sleep
+	 * if it is frozen, because the OOM killer wouldn't be able to free
+	 * any memory and would livelock.  freezing_slow_path() will tell
+	 * the freezer that TIF_MEMDIE tasks should be ignored.
+	 */
+	__thaw_task(tsk);
+
+	/*
+	 * Record the pointer to the victim in the oom context of the
+	 * owner memcg so that others can wait for it to exit. It will
+	 * be cleared in exit_oom_victim.
+	 */
+	memcg = get_mem_cgroup_from_mm(tsk->mm);
+	ctx = mem_cgroup_oom_context(memcg);
+	spin_lock(&oom_context_lock);
+	if (!ctx->victim) {
+		ctx->victim = tsk;
+		ctx->marked = true;
+	}
+	spin_unlock(&oom_context_lock);
+	if (memcg)
+		mem_cgroup_put(memcg);
+}
+
+/**
+ * exit_oom_victim - note the exit of an OOM victim
+ */
+void exit_oom_victim(void)
+{
+	struct mem_cgroup *iter;
+	struct oom_context *ctx;
+
+	clear_thread_flag(TIF_MEMDIE);
+
+	/*
+	 * Wake up every process waiting for this oom victim to exit.
+	 */
+	spin_lock(&oom_context_lock);
+	iter = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->victim != current)
+			continue;
+		if (!ctx->owner)
+			__release_oom_context(ctx);
+		else
+			/* To be released by owner (see oom_unlock) */
+			ctx->victim = NULL;
+	} while ((iter = mem_cgroup_iter(NULL, iter, NULL)));
+	spin_unlock(&oom_context_lock);
+}
+
+static void __wait_oom_context(struct oom_context *ctx)
+{
+	unsigned long now = jiffies;
+	unsigned long timeout;
+	DEFINE_WAIT(wait);
+
+	if (ctx->victim == current ||
+	    time_after_eq(now, ctx->oom_start + OOM_TIMEOUT)) {
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+
+	prepare_to_wait(&ctx->waitq, &wait, TASK_KILLABLE);
+	timeout = ctx->oom_start + OOM_TIMEOUT - now;
+	spin_unlock(&oom_context_lock);
+	schedule_timeout(timeout);
+	finish_wait(&ctx->waitq, &wait);
+}
+
+bool oom_trylock(struct mem_cgroup *memcg)
+{
+	unsigned long now = jiffies;
+	struct mem_cgroup *iter, *parent;
+	struct oom_context *ctx;
+
+	spin_lock(&oom_context_lock);
+
+	/*
+	 * Check if oom context of memcg or any of its descendants is
+	 * active, i.e. if there is a process selecting a victim or a
+	 * victim dying. If there is, wait for it to finish, otherwise
+	 * proceed to oom.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if ((ctx->owner || ctx->victim) &&
+		    time_before(now, ctx->oom_start + OOM_TIMEOUT)) {
+			__wait_oom_context(ctx);
+			mem_cgroup_iter_break(memcg, iter);
+			return false;
+		} else if (ctx->owner || ctx->victim) {
+			/*
+			 * Timeout. Release the context and dump stack
+			 * trace of the stuck process.
+			 *
+			 * To avoid dumping stack trace of the same task
+			 * more than once, we mark the context that
+			 * contained the victim when it was killed (see
+			 * mark_oom_victim).
+			 */
+			struct task_struct *p = ctx->victim;
+
+			if (p && ctx->marked) {
+				pr_err("OOM kill timeout: %d (%s)\n",
+				       task_pid_nr(p), p->comm);
+				show_stack(p, NULL);
+			}
+
+			__release_oom_context(ctx);
+		}
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	/*
+	 * Acquire oom context of memcg and all its descendants.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner);
+		BUG_ON(ctx->victim);
+		ctx->owner = current;
+		ctx->oom_start = now;
+		/*
+		 * Update overdraft of each cgroup under us. This
+		 * information will be used in oom_badness.
+		 */
+		ctx->overdraft = mem_cgroup_overdraft(iter);
+		parent = iter ? parent_mem_cgroup(iter) : NULL;
+		if (parent && iter != memcg)
+			ctx->overdraft = max(ctx->overdraft,
+				mem_cgroup_oom_context(parent)->overdraft);
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	spin_unlock(&oom_context_lock);
+
+	return true;
+}
+
+void oom_unlock(struct mem_cgroup *memcg)
+{
+	struct task_struct *victim = NULL;
+	struct mem_cgroup *iter, *victim_memcg = NULL;
+	struct oom_context *ctx;
+
+	spin_lock(&oom_context_lock);
+
+	/*
+	 * Find oom victim if any.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->owner != current) {
+			/* Lost ownership on timeout */
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
+		if (ctx->victim) {
+			victim = ctx->victim;
+			/*
+			 * Remember the victim memcg so that we can wait
+			 * on it for the victim to exit below.
+			 */
+			victim_memcg = iter;
+			if (iter)
+				mem_cgroup_get(iter);
+
+			mem_cgroup_iter_break(memcg, iter);
+			break;
+		}
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	/*
+	 * Propagate victim up to the context that initiated oom.
+	 */
+	for (iter = victim_memcg; iter; iter = parent_mem_cgroup(iter)) {
+		ctx = mem_cgroup_oom_context(iter);
+		BUG_ON(ctx->owner != current);
+		if (!ctx->victim)
+			ctx->victim = victim;
+		if (iter == memcg)
+			break;
+	}
+
+	/*
+	 * Release oom context of memcg and all its descendants.
+	 */
+	iter = mem_cgroup_iter(memcg, NULL, NULL);
+	do {
+		ctx = mem_cgroup_oom_context(iter);
+		if (ctx->owner != current)
+			/* Lost ownership on timeout */
+			continue;
+		if (!ctx->victim)
+			/*
+			 * Victim already exited or nobody was killed in
+			 * this cgroup? It's our responsibility to wake
+			 * up blocked processes then.
+			 */
+			__release_oom_context(ctx);
+		else
+			/* To be released by victim (see exit_oom_victim) */
+			ctx->owner = NULL;
+	} while ((iter = mem_cgroup_iter(memcg, iter, NULL)));
+
+	if (!victim) {
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+
+	/*
+	 * Wait for the victim to exit.
+	 */
+	ctx = mem_cgroup_oom_context(victim_memcg);
+	__wait_oom_context(ctx);
+	if (victim_memcg)
+		mem_cgroup_put(victim_memcg);
+}
+
+/*
+ * Kill more processes if oom happens too often in this context.
+ */
+static void oom_berserker(unsigned long points, unsigned long overdraft,
+			  unsigned long totalpages, struct mem_cgroup *memcg,
+			  nodemask_t *nodemask)
+{
+	static DEFINE_RATELIMIT_STATE(berserker_rs,
+				      DEFAULT_RATELIMIT_INTERVAL,
+				      DEFAULT_RATELIMIT_BURST);
+	struct oom_context *ctx;
+	struct task_struct *p;
+	int rage;
+	int killed = 0;
+
+	spin_lock(&oom_context_lock);
+	ctx = mem_cgroup_oom_context(memcg);
+	if (ctx->owner != current) {
+		/* Lost ownership on timeout */
+		spin_unlock(&oom_context_lock);
+		return;
+	}
+	/*
+	 * Increase rage if oom happened recently in this context, reset
+	 * rage otherwise.
+	 *
+	 * previous oom                            this oom (unfinished)
+	 * ++++++++++++----------------------------++++++++
+	 *            ^                            ^
+	 *         oom_end  <<oom_relaxation>>  oom_start
+	 */
+	if (time_after(ctx->oom_start, ctx->oom_end + sysctl_oom_relaxation))
+		ctx->rage = OOM_BASE_RAGE;
+	else if (ctx->rage < OOM_MAX_RAGE)
+		ctx->rage++;
+	rage = ctx->rage;
+	spin_unlock(&oom_context_lock);
+
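+	/*
+	 * Rage starts at OOM_BASE_RAGE (-10), so roughly ten closely
+	 * spaced OOM events must pass before the first rage kill; after
+	 * that, each episode kills up to (1 << rage) extra tasks.
+	 */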
+	if (rage < 0)
+		return;
+
+	/*
+	 * So, we are in rage. Kill (1 << rage) youngest tasks that are
+	 * as bad as the victim.
+	 */
+	read_lock(&tasklist_lock);
+	list_for_each_entry_reverse(p, &init_task.tasks, tasks) {
+		unsigned long tsk_points;
+		unsigned long tsk_overdraft;
+
+		if (!p->mm || test_tsk_thread_flag(p, TIF_MEMDIE) ||
+		    fatal_signal_pending(p) || p->flags & PF_EXITING ||
+		    oom_unkillable_task(p, memcg, nodemask))
+			continue;
+
+		tsk_points = oom_badness(p, memcg, nodemask, totalpages,
+					 &tsk_overdraft);
+		if (tsk_overdraft < overdraft)
+			continue;
+
+		/*
+		 * oom_badness() never returns a negative value; even if
+		 * oom_score_adj would make the badness negative, it
+		 * returns 1.  So do not kill a task with badness 1 when
+		 * the victim's badness is > 1, to avoid killing
+		 * protected tasks.
+		 */
+		if (tsk_points <= 1 && points > 1)
+			continue;
+
+		/*
+		 * Consider tasks as equally bad if they occupy equal
+		 * percentage of available memory.
+		 */
+		if (tsk_points * 100 / totalpages <
+		    points * 100 / totalpages)
+			continue;
+
+		if (__ratelimit(&berserker_rs))
+			pr_err("Rage kill process %d (%s)\n",
+			       task_pid_nr(p), p->comm);
+
+		do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+		mem_cgroup_note_oom_kill(memcg, p);
+
+		if (++killed >= 1 << rage)
+			break;
+	}
+	read_unlock(&tasklist_lock);
+
+	pr_err("OOM killer in rage %d: %d tasks killed\n", rage, killed);
+}
+
 #define K(x) ((x) << (PAGE_SHIFT-10))
 /*
  * Must be called while holding a reference to p, which will be released upon
  * returning.
  */
 void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
-		      unsigned int points, unsigned long totalpages,
+		      unsigned long points, unsigned long overdraft,
+		      unsigned long totalpages,
 		      struct mem_cgroup *memcg, nodemask_t *nodemask,
 		      const char *message)
 {
@@ -424,19 +802,19 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 	 * If the task is already exiting, don't alarm the sysadmin or kill
 	 * its children or threads, just set TIF_MEMDIE so it can die quickly
 	 */
-	if (p->flags & PF_EXITING) {
-		set_tsk_thread_flag(p, TIF_MEMDIE);
-		put_task_struct(p);
-		return;
+	task_lock(p);
+	if (p->mm && p->flags & PF_EXITING) {
+		mark_oom_victim(p);
+		task_unlock(p);
+		goto out;
 	}
+	task_unlock(p);
 
 	if (__ratelimit(&oom_rs))
 		dump_header(p, gfp_mask, order, memcg, nodemask);
 
-	task_lock(p);
-	pr_err("%s: Kill process %d (%s) score %d or sacrifice child\n",
-		message, task_pid_nr(p), p->comm, points);
-	task_unlock(p);
+	pr_err("%s: Kill process %d (%s) score %lu or sacrifice child\n",
+		message, task_pid_nr(p), p->comm, points * 1000 / totalpages);
 
 	/*
 	 * If any of p's children has a different mm and is eligible for kill,
@@ -451,11 +829,14 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 			if (child->mm == p->mm)
 				continue;
+			if (!child->mm ||
+			    test_tsk_thread_flag(child, TIF_MEMDIE))
+				continue;
 			/*
 			 * oom_badness() returns 0 if the thread is unkillable
 			 */
 			child_points = oom_badness(child, memcg, nodemask,
-								totalpages);
+						   totalpages, NULL);
 			if (child_points > victim_points) {
 				put_task_struct(victim);
 				victim = child;
@@ -468,8 +849,7 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 	p = find_lock_task_mm(victim);
 	if (!p) {
-		put_task_struct(victim);
-		return;
+		goto out;
 	} else if (victim != p) {
 		get_task_struct(p);
 		put_task_struct(victim);
@@ -478,11 +858,15 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 
 	/* mm cannot safely be dereferenced after task_unlock(victim) */
 	mm = victim->mm;
-	pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
-		task_pid_nr(victim), victim->comm, K(victim->mm->total_vm),
+	mark_oom_victim(victim);
+	rcu_read_lock();
+	pr_err("Killed process %d (%s) in VE \"%s\" total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n",
+		task_pid_nr(victim), victim->comm, task_ve_name(victim),
+		K(victim->mm->total_vm),
 		K(get_mm_counter(victim->mm, MM_ANONPAGES)),
 		K(get_mm_counter(victim->mm, MM_FILEPAGES)),
 		K(get_mm_counter(victim->mm, MM_SHMEMPAGES)));
+	rcu_read_unlock();
 	task_unlock(victim);
 
 	/*
@@ -501,17 +885,18 @@ void oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
 			if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN)
 				continue;
 
-			task_lock(p);	/* Protect ->comm from prctl() */
-			pr_err("Kill process %d (%s) sharing same memory\n",
-				task_pid_nr(p), p->comm);
-			task_unlock(p);
+			pr_err("Kill process %d (%s) in VE \"%s\" sharing same memory\n",
+				task_pid_nr(p), p->comm, task_ve_name(p));
 			do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true);
+			mem_cgroup_note_oom_kill(memcg, p);
 		}
 	rcu_read_unlock();
 
-	set_tsk_thread_flag(victim, TIF_MEMDIE);
 	do_send_sig_info(SIGKILL, SEND_SIG_FORCED, victim, true);
+	mem_cgroup_note_oom_kill(memcg, victim);
+out:
 	put_task_struct(victim);
+	oom_berserker(points, overdraft, totalpages, memcg, nodemask);
 }
 #undef K
 
@@ -551,63 +936,12 @@ int unregister_oom_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL_GPL(unregister_oom_notifier);
 
-/*
- * Try to acquire the OOM killer lock for the zones in zonelist.  Returns zero
- * if a parallel OOM killing is already taking place that includes a zone in
- * the zonelist.  Otherwise, locks all zones in the zonelist and returns 1.
- */
-int try_set_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-	int ret = 1;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		if (zone_is_oom_locked(zone)) {
-			ret = 0;
-			goto out;
-		}
-	}
-
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		/*
-		 * Lock each zone in the zonelist under zone_scan_lock so a
-		 * parallel invocation of try_set_zonelist_oom() doesn't succeed
-		 * when it shouldn't.
-		 */
-		zone_set_flag(zone, ZONE_OOM_LOCKED);
-	}
-
-out:
-	spin_unlock(&zone_scan_lock);
-	return ret;
-}
-
-/*
- * Clears the ZONE_OOM_LOCKED flag for all zones in the zonelist so that failed
- * allocation attempts with zonelists containing them may now recall the OOM
- * killer, if necessary.
- */
-void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
-{
-	struct zoneref *z;
-	struct zone *zone;
-
-	spin_lock(&zone_scan_lock);
-	for_each_zone_zonelist(zone, z, zonelist, gfp_zone(gfp_mask)) {
-		zone_clear_flag(zone, ZONE_OOM_LOCKED);
-	}
-	spin_unlock(&zone_scan_lock);
-}
-
 /**
  * out_of_memory - kill the "best" process when we run out of memory
  * @zonelist: zonelist pointer
  * @gfp_mask: memory allocation flags
  * @order: amount of memory being requested as a power of 2
  * @nodemask: nodemask passed to page allocator
- * @force_kill: true if a task must be killed, even if others are exiting
  *
  * If we run out of memory, we have the choice between either
  * killing a random task (bad), letting the system crash (worse)
@@ -615,15 +949,15 @@ void clear_zonelist_oom(struct zonelist *zonelist, gfp_t gfp_mask)
  * don't have to be perfect here, we just have to be good.
  */
 void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
-		int order, nodemask_t *nodemask, bool force_kill)
+		   int order, nodemask_t *nodemask)
 {
 	const nodemask_t *mpol_mask;
 	struct task_struct *p;
 	unsigned long totalpages;
 	unsigned long freed = 0;
-	unsigned int uninitialized_var(points);
+	unsigned long uninitialized_var(points);
+	unsigned long uninitialized_var(overdraft);
 	enum oom_constraint constraint = CONSTRAINT_NONE;
-	int killed = 0;
 
 	blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
 	if (freed > 0)
@@ -634,9 +968,13 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	 * If current has a pending SIGKILL or is exiting, then automatically
 	 * select it.  The goal is to allow it to allocate so that it may
 	 * quickly exit and free its memory.
+	 *
+	 * But don't select if current has already released its mm and cleared
+	 * the TIF_MEMDIE flag at exit_mm(); otherwise an OOM livelock may occur.
 	 */
-	if (fatal_signal_pending(current) || current->flags & PF_EXITING) {
-		set_thread_flag(TIF_MEMDIE);
+	if (current->mm &&
+	    (fatal_signal_pending(current) || current->flags & PF_EXITING)) {
+		mark_oom_victim(current);
 		return;
 	}
 
@@ -653,30 +991,21 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
 	    !oom_unkillable_task(current, NULL, nodemask) &&
 	    current->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) {
 		get_task_struct(current);
-		oom_kill_process(current, gfp_mask, order, 0, totalpages, NULL,
-				 nodemask,
+		oom_kill_process(current, gfp_mask, order, 0, 0, totalpages,
+				 NULL, nodemask,
 				 "Out of memory (oom_kill_allocating_task)");
-		goto out;
+		return;
 	}
 
-	p = select_bad_process(&points, totalpages, mpol_mask, force_kill);
+	p = select_bad_process(&points, &overdraft, totalpages, mpol_mask);
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
 		panic("Out of memory and no killable processes...\n");
-	}
-	if (PTR_ERR(p) != -1UL) {
-		oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
+	} else
+		oom_kill_process(p, gfp_mask, order, points, overdraft,
+				 totalpages, NULL,
 				 nodemask, "Out of memory");
-		killed = 1;
-	}
-out:
-	/*
-	 * Give the killed threads a good chance of exiting before trying to
-	 * allocate memory again.
-	 */
-	if (killed)
-		schedule_timeout_killable(1);
 }
 
 /*
@@ -686,14 +1015,11 @@ out:
  */
 void pagefault_out_of_memory(void)
 {
-	struct zonelist *zonelist;
-
 	if (mem_cgroup_oom_synchronize(true))
 		return;
 
-	zonelist = node_zonelist(first_memory_node, GFP_KERNEL);
-	if (try_set_zonelist_oom(zonelist, GFP_KERNEL)) {
-		out_of_memory(NULL, 0, 0, NULL, false);
-		clear_zonelist_oom(zonelist, GFP_KERNEL);
+	if (oom_trylock(NULL)) {
+		out_of_memory(NULL, 0, 0, NULL);
+		oom_unlock(NULL);
 	}
 }
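
The removed try_set_zonelist_oom()/clear_zonelist_oom() pair serialized OOM kills per
zonelist; the patch funnels all callers through oom_trylock()/oom_unlock() instead,
keyed by a memcg (NULL meaning global OOM). Those helpers are defined elsewhere in the
series; a minimal model consistent with the call sites above might look like this
(sketch only, the real locking is likely richer):

    static DEFINE_MUTEX(global_oom_lock);

    /* Sketch: serialize global OOM handling behind one mutex. */
    bool oom_trylock(struct mem_cgroup *memcg)
    {
            if (!mutex_trylock(&global_oom_lock)) {
                    /* someone else is handling the OOM already; give
                     * their victim a chance to exit, as the old
                     * zonelist code did */
                    schedule_timeout_uninterruptible(1);
                    return false;
            }
            return true;
    }

    void oom_unlock(struct mem_cgroup *memcg)
    {
            mutex_unlock(&global_oom_lock);
    }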
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -36,6 +36,7 @@
 #include <linux/pagevec.h>
 #include <linux/timer.h>
 #include <linux/sched/rt.h>
+#include <linux/virtinfo.h>
 #include <trace/events/writeback.h>
 
 /*
@@ -525,6 +526,41 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio)
 }
 EXPORT_SYMBOL(bdi_set_max_ratio);
 
+int bdi_set_min_dirty(struct backing_dev_info *bdi, unsigned min_dirty)
+{
+	int ret = 0;
+
+	spin_lock_bh(&bdi_lock);
+	if (min_dirty > bdi->max_dirty_pages) {
+		ret = -EINVAL;
+	} else {
+		bdi->min_dirty_pages = min_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_min_dirty);
+
+int bdi_set_max_dirty(struct backing_dev_info *bdi, unsigned max_dirty)
+{
+	int ret = 0;
+
+	if (max_dirty > num_physpages)
+		return -EINVAL;
+
+	spin_lock_bh(&bdi_lock);
+	if (bdi->min_dirty_pages > max_dirty) {
+		ret = -EINVAL;
+	} else {
+		bdi->max_dirty_pages = max_dirty;
+	}
+	spin_unlock_bh(&bdi_lock);
+
+	return ret;
+}
+EXPORT_SYMBOL(bdi_set_max_dirty);
+
 static unsigned long dirty_freerun_ceiling(unsigned long thresh,
 					   unsigned long bg_thresh)
 {
@@ -576,6 +612,12 @@ unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
 	if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
 		bdi_dirty = dirty * bdi->max_ratio / 100;
 
+	if (bdi->min_dirty_pages && bdi_dirty < bdi->min_dirty_pages)
+		bdi_dirty = min((unsigned long)bdi->min_dirty_pages, dirty);
+
+	if (bdi->max_dirty_pages && bdi_dirty > bdi->max_dirty_pages)
+		bdi_dirty = bdi->max_dirty_pages;
+
 	return bdi_dirty;
 }
 
@@ -749,7 +791,8 @@ static unsigned long bdi_position_ratio(struct backing_dev_info *bdi,
 		if (bdi_dirty >= bdi_thresh)
 			return 0;
 
-		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
+		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh,
+					thresh + 1);
 		bdi_setpoint = dirty_freerun_ceiling(bdi_thresh,
 						     bdi_bg_thresh);
 
@@ -1087,12 +1130,15 @@ static void bdi_update_dirty_ratelimit(struct backing_dev_info *bdi,
 	 * of backing device (see the implementation of bdi_dirty_limit()).
 	 */
 	if (unlikely(bdi->capabilities & BDI_CAP_STRICTLIMIT)) {
+		unsigned long bdi_bg_thresh;
+
+		bdi_bg_thresh = div_u64((u64)bdi_thresh * bg_thresh, thresh);
+
 		dirty = bdi_dirty;
 		if (bdi_dirty < 8)
 			setpoint = bdi_dirty + 1;
 		else
-			setpoint = (bdi_thresh +
-				    bdi_dirty_limit(bdi, bg_thresh)) / 2;
+			setpoint = (bdi_thresh + bdi_bg_thresh) / 2;
 	}
 
 	if (dirty < setpoint) {
@@ -1324,9 +1370,9 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 	*bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
 
 	if (bdi_bg_thresh)
-		*bdi_bg_thresh = div_u64((u64)*bdi_thresh *
-					 background_thresh,
-					 dirty_thresh);
+		*bdi_bg_thresh = dirty_thresh ? div_u64((u64)*bdi_thresh *
+							background_thresh,
+							dirty_thresh) : 0;
 
 	/*
 	 * In order to avoid the stacked BDI deadlock we need
@@ -1349,6 +1395,102 @@ static inline void bdi_dirty_limits(struct backing_dev_info *bdi,
 	}
 }
 
+static void balance_dirty_pages_ub(struct address_space *mapping,
+				unsigned long write_chunk)
+{
+	long ub_dirty, ub_writeback;
+	long ub_thresh, ub_background_thresh;
+	unsigned long pages_written = 0;
+	unsigned long pause = 1;
+	struct user_beancounter *ub = get_io_ub();
+	struct backing_dev_info *bdi = mapping->backing_dev_info;
+
+	if (ub == get_ub0())
+		return;
+
+	for (;;) {
+		unsigned long nr_to_write = write_chunk - pages_written;
+
+		ub_dirty = ub_stat_get(ub, dirty_pages);
+		ub_writeback = ub_stat_get(ub, writeback_pages);
+
+		if (!ub_dirty_limits(&ub_background_thresh, &ub_thresh, ub))
+			break;
+
+		/*
+		 * Check thresholds and start background writeback
+		 * before throttling.
+		 */
+		if (ub_dirty + ub_writeback <= ub_thresh)
+			break;
+		if (!writeback_in_progress(bdi))
+			bdi_start_background_writeback(bdi);
+
+		/*
+		 * Throttle it only when the background writeback cannot
+		 * catch up. This avoids (excessively) small writeouts
+		 * when the bdi limits are ramping up.
+		 */
+		if (ub_dirty + ub_writeback <
+			(ub_background_thresh + ub_thresh) / 2)
+			break;
+
+		if (ub_dirty > ub_thresh) {
+			pages_written += writeback_inodes_wb(&bdi->wb,
+						nr_to_write,
+						WB_REASON_BACKGROUND, ub);
+			ub_dirty = ub_stat_get(ub, dirty_pages);
+			ub_writeback = ub_stat_get(ub, writeback_pages);
+		}
+
+		/* fix up per-cpu ub-stat drift to avoid false positives */
+		if (ub_dirty + ub_writeback > ub_thresh &&
+		    ub_dirty + ub_writeback - ub_thresh <
+				    UB_STAT_BATCH * num_possible_cpus()) {
+			ub_dirty = ub_stat_get_exact(ub, dirty_pages);
+			ub_writeback = ub_stat_get_exact(ub, writeback_pages);
+		}
+
+		if (ub_dirty + ub_writeback <= ub_thresh)
+			break;
+
+		if (pages_written >= write_chunk)
+			break;		/* We've done our duty */
+
+		__set_current_state(TASK_KILLABLE);
+		io_schedule_timeout(pause);
+
+		/*
+		 * Increase the delay for each loop, up to our previous
+		 * default of taking a 100ms nap.
+		 */
+		pause <<= 1;
+		if (pause > HZ / 10)
+			pause = HZ / 10;
+
+		if (fatal_signal_pending(current))
+			break;
+	}
+
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_BALANCE_DIRTY,
+			       (void *)write_chunk);
+
+	/*
+	 * Even if this writeback pass is filtered for another ub, it will
+	 * still write inodes of this ub, because each inode's ub limits
+	 * are checked (via __ub_over_bground_thresh(ub)) during writeback.
+	 */
+	if (writeback_in_progress(bdi))
+		return;
+
+	/*
+	 * We start background writeout at the lower ub_background_thresh,
+	 * to keep the amount of dirty memory low.
+	 */
+	if (ub_dirty > ub_background_thresh)
+		bdi_start_background_writeback(bdi);
+}
+
 /*
  * balance_dirty_pages() must be called by processes which are generating dirty
  * data.  It looks at the number of dirty pages in the machine and will force
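
balance_dirty_pages_ub() above applies the same two-threshold policy as the global
path: once dirty + writeback pages cross the beancounter's background threshold it only
kicks bdi_start_background_writeback(), and the task itself is throttled only past the
midpoint between the background and hard thresholds. For example, with
ub_background_thresh = 1000 and ub_thresh = 2000 pages, writers run freely up to 1500:

    /* Sketch: per-ub freerun ceiling, mirroring dirty_freerun_ceiling(). */
    static inline long ub_freerun_ceiling(long bg_thresh, long thresh)
    {
            return (bg_thresh + thresh) / 2;   /* (1000 + 2000) / 2 = 1500 */
    }

The sleep between retries doubles each iteration (1 jiffy, 2, 4, ...) and is capped at
HZ / 10, i.e. the historical 100ms nap.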
@@ -1544,6 +1686,9 @@ pause:
 	if (!dirty_exceeded && bdi->dirty_exceeded)
 		bdi->dirty_exceeded = 0;
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_BALANCE_DIRTY,
+			       (void *)pages_dirtied);
+
 	if (writeback_in_progress(bdi))
 		return;
 
@@ -1644,8 +1789,10 @@ void balance_dirty_pages_ratelimited(struct address_space *mapping)
 	}
 	preempt_enable();
 
-	if (unlikely(current->nr_dirtied >= ratelimit))
+	if (unlikely(current->nr_dirtied >= ratelimit)) {
+		balance_dirty_pages_ub(mapping, current->nr_dirtied);
 		balance_dirty_pages(mapping, current->nr_dirtied);
+	}
 }
 EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
 
@@ -1928,6 +2075,8 @@ retry:
 
 			done_index = page->index;
 
+			virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 			lock_page(page);
 
 			/*
@@ -2129,7 +2278,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
 		__inc_zone_page_state(page, NR_DIRTIED);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE);
 		__inc_bdi_stat(mapping->backing_dev_info, BDI_DIRTIED);
-		task_io_account_write(PAGE_CACHE_SIZE);
+		task_io_account_dirty(PAGE_CACHE_SIZE);
 		current->nr_dirtied++;
 		this_cpu_inc(bdp_ratelimits);
 	}
@@ -2180,6 +2329,11 @@ int __set_page_dirty_nobuffers(struct page *page)
 			account_page_dirtied(page, mapping);
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					!radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_dirty(mapping);
 		}
 		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 		if (mapping->host) {
@@ -2268,6 +2422,18 @@ int set_page_dirty(struct page *page)
 }
 EXPORT_SYMBOL(set_page_dirty);
 
+int set_page_dirty_mm(struct page *page, struct mm_struct *mm)
+{
+	struct user_beancounter *old_ub;
+	int ret;
+
+	old_ub = set_exec_ub(mm_ub(mm));
+	ret = set_page_dirty(page);
+	(void)set_exec_ub(old_ub);
+	return ret;
+}
+EXPORT_SYMBOL(set_page_dirty_mm);
+
 /*
  * set_page_dirty() is racy if the caller has no reference against
  * page->mapping->host, and if the page is unlocked.  This is because another
@@ -2375,6 +2541,9 @@ int test_clear_page_writeback(struct page *page)
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
 			if (bdi_cap_account_writeback(bdi)) {
+				if (radix_tree_prev_tag_get(&mapping->page_tree,
+							PAGECACHE_TAG_WRITEBACK))
+					ub_io_writeback_dec(mapping);
 				__dec_bdi_stat(bdi, BDI_WRITEBACK);
 				__bdi_writeout_inc(bdi);
 			}
@@ -2405,13 +2574,23 @@ int __test_set_page_writeback(struct page *page, bool keep_write)
 			radix_tree_tag_set(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_WRITEBACK);
-			if (bdi_cap_account_writeback(bdi))
+			if (bdi_cap_account_writeback(bdi)) {
+				if (!radix_tree_prev_tag_get(&mapping->page_tree,
+							PAGECACHE_TAG_WRITEBACK))
+					ub_io_writeback_inc(mapping);
 				__inc_bdi_stat(bdi, BDI_WRITEBACK);
+			}
 		}
-		if (!PageDirty(page))
+		if (!PageDirty(page)) {
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
 						PAGECACHE_TAG_DIRTY);
+			if (mapping_cap_account_dirty(mapping) &&
+					radix_tree_prev_tag_get(
+						&mapping->page_tree,
+						PAGECACHE_TAG_DIRTY))
+				ub_io_account_clean(mapping);
+		}
 		if (!keep_write)
 			radix_tree_tag_clear(&mapping->page_tree,
 						page_index(page),
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -25,6 +25,7 @@
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/kmemcheck.h>
+#include <linux/kasan.h>
 #include <linux/module.h>
 #include <linux/suspend.h>
 #include <linux/pagevec.h>
@@ -62,6 +63,7 @@
 #include <linux/hugetlb.h>
 #include <linux/sched/rt.h>
 #include <linux/kthread.h>
+#include <linux/ve.h>
 
 #include <asm/sections.h>
 #include <asm/tlbflush.h>
@@ -123,6 +125,24 @@ unsigned long dirty_balance_reserve __read_mostly;
 int percpu_pagelist_fraction;
 gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
 
+static int zero_data_pages_enabled;
+struct static_key __initdata zero_free_pages = STATIC_KEY_INIT_FALSE;
+
+static int __init enable_zero_free_pages(char *__unused)
+{
+	zero_data_pages_enabled = 1;
+	return 1;
+}
+__setup("zero-free-pages", enable_zero_free_pages);
+
+static int __init setup_zero_free_pages(void)
+{
+	if (zero_data_pages_enabled)
+		static_key_slow_inc(&zero_free_pages);
+	return 0;
+}
+early_initcall(setup_zero_free_pages);
+
 #ifdef CONFIG_PM_SLEEP
 /*
  * The following functions are used by the suspend/hibernate code to temporarily
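
The zero-free-pages plumbing above is the usual shape for an off-by-default hardening
knob: parse the boot option early, flip a static key once from an initcall, and let
static_key_false() keep the disabled case a patched nop in the page-free fast path.
Restated generically (a sketch of the pattern, not new code):

    static bool feature_requested __initdata;
    static struct static_key feature_key = STATIC_KEY_INIT_FALSE;

    static int __init feature_setup(char *unused)
    {
            feature_requested = true;
            return 1;                       /* __setup: option consumed */
    }
    __setup("my-feature", feature_setup);

    static int __init feature_init(void)
    {
            if (feature_requested)
                    static_key_slow_inc(&feature_key); /* patch call sites */
            return 0;
    }
    early_initcall(feature_init);

Booting with zero-free-pages on the kernel command line then makes
free_pages_prepare() call clear_highpage() on every page it frees.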
@@ -769,7 +789,6 @@ static void free_pcppages_bulk(struct zone *zone, int count,
 	int to_free = count;
 
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	while (to_free) {
@@ -818,7 +837,6 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
 				int migratetype)
 {
 	spin_lock(&zone->lock);
-	zone->all_unreclaimable = 0;
 	zone->pages_scanned = 0;
 
 	__free_one_page(page, zone, order, migratetype);
@@ -903,11 +921,16 @@ static bool free_pages_prepare(struct page *page, unsigned int order)
 
 	trace_mm_page_free(page, order);
 	kmemcheck_free_shadow(page, order);
+	kasan_free_pages(page, order);
 
 	if (PageAnon(page))
 		page->mapping = NULL;
-	for (i = 0; i < (1 << order); i++)
+	memcg_kmem_uncharge_pages(page, order);
+	for (i = 0; i < (1 << order); i++) {
 		bad += free_pages_check(page + i);
+		if (static_key_false(&zero_free_pages))
+			clear_highpage(page + i);
+	}
 	if (bad)
 		return false;
 
@@ -1296,6 +1319,7 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
 	set_page_private(page, 0);
 	arch_alloc_page(page, order);
 	kernel_map_pages(page, 1 << order, 1);
+	kasan_alloc_pages(page, order);
 
 	if (gfp_flags & __GFP_ZERO)
 		prep_zero_page(page, order, gfp_flags);
@@ -2579,10 +2603,8 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 	struct page *page;
 
 	/* Acquire the OOM killer lock for the zones in zonelist */
-	if (!try_set_zonelist_oom(zonelist, gfp_mask)) {
-		schedule_timeout_uninterruptible(1);
+	if (!oom_trylock(NULL))
 		return NULL;
-	}
 
 	/*
 	 * Go through the zonelist yet one more time, keep very high watermark
@@ -2614,10 +2636,10 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
 			goto out;
 	}
 	/* Exhausted what can be done so it's blamo time */
-	out_of_memory(zonelist, gfp_mask, order, nodemask, false);
+	out_of_memory(zonelist, gfp_mask, order, nodemask);
 
 out:
-	clear_zonelist_oom(zonelist, gfp_mask);
+	oom_unlock(NULL);
 	return page;
 }
 
@@ -2876,6 +2898,8 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 	return !!(gfp_to_alloc_flags(gfp_mask) & ALLOC_NO_WATERMARKS);
 }
 
+int alloc_fail_warn;
+
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
 	struct zonelist *zonelist, enum zone_type high_zoneidx,
@@ -3081,6 +3105,36 @@ got_pg:
 	return page;
 }
 
+static void __alloc_collect_stats(gfp_t gfp_mask, unsigned int order,
+		struct page *page, u64 time)
+{
+#ifdef CONFIG_VE
+	unsigned long flags;
+	int ind, cpu;
+
+	time = jiffies_to_usecs(jiffies - time) * 1000;
+	if (!(gfp_mask & __GFP_WAIT))
+		ind = KSTAT_ALLOCSTAT_ATOMIC;
+	else if (!(gfp_mask & __GFP_HIGHMEM))
+		if (order > 0)
+			ind = KSTAT_ALLOCSTAT_LOW_MP;
+		else
+			ind = KSTAT_ALLOCSTAT_LOW;
+	else
+		if (order > 0)
+			ind = KSTAT_ALLOCSTAT_HIGH_MP;
+		else
+			ind = KSTAT_ALLOCSTAT_HIGH;
+
+	local_irq_save(flags);
+	cpu = smp_processor_id();
+	KSTAT_LAT_PCPU_ADD(&kstat_glob.alloc_lat[ind], cpu, time);
+	if (!page)
+		kstat_glob.alloc_fails[cpu][ind]++;
+	local_irq_restore(flags);
+#endif
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
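
__alloc_collect_stats() buckets each allocation by gfp flags and order before recording
its latency (the jiffies delta is converted to nanoseconds via jiffies_to_usecs() *
1000). Reading the branches above:

    /* Examples of the bucketing (order > 0 selects the _MP variant):
     *   GFP_ATOMIC   (no __GFP_WAIT),  any order -> KSTAT_ALLOCSTAT_ATOMIC
     *   GFP_KERNEL,  order 0                     -> KSTAT_ALLOCSTAT_LOW
     *   GFP_KERNEL,  order 3                     -> KSTAT_ALLOCSTAT_LOW_MP
     *   GFP_HIGHUSER (__GFP_HIGHMEM), order 0    -> KSTAT_ALLOCSTAT_HIGH
     *   GFP_HIGHUSER, order 2                    -> KSTAT_ALLOCSTAT_HIGH_MP
     */

Failed allocations additionally bump the per-cpu alloc_fails counter for the same
bucket.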
@@ -3094,13 +3148,15 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	int migratetype = allocflags_to_migratetype(gfp_mask);
 	unsigned int cpuset_mems_cookie;
 	int alloc_flags = ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_FAIR;
-	struct mem_cgroup *memcg = NULL;
+	cycles_t start;
 
 	gfp_mask &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(gfp_mask);
 
 	might_sleep_if(gfp_mask & __GFP_WAIT);
+	WARN_ON_ONCE((gfp_mask & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
 
 	if (should_fail_alloc_page(gfp_mask, order))
 		return NULL;
@@ -3113,13 +3169,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order,
 	if (unlikely(!zonelist->_zonerefs->zone))
 		return NULL;
 
-	/*
-	 * Will only have any effect when __GFP_KMEMCG is set.  This is
-	 * verified in the (always inline) callee
-	 */
-	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-		return NULL;
-
 retry_cpuset:
 	cpuset_mems_cookie = read_mems_allowed_begin();
 
@@ -3135,6 +3184,7 @@ retry_cpuset:
 		alloc_flags |= ALLOC_CMA;
 #endif
 retry:
+	start = jiffies;
 	/* First allocation attempt */
 	page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, nodemask, order,
 			zonelist, high_zoneidx, alloc_flags,
@@ -3167,6 +3217,7 @@ retry:
 				preferred_zone, migratetype);
 	}
 
+	__alloc_collect_stats(gfp_mask, order, page, start);
 	trace_mm_page_alloc(page, order, gfp_mask, migratetype);
 
 out:
@@ -3179,7 +3230,10 @@ out:
 	if (unlikely(!page && read_mems_allowed_retry(cpuset_mems_cookie)))
 		goto retry_cpuset;
 
-	memcg_kmem_commit_charge(page, memcg, order);
+	if (page && !memcg_kmem_newpage_charge(page, gfp_mask, order)) {
+		__free_pages(page, order);
+		return NULL;
+	}
 
 	return page;
 }
@@ -3331,62 +3385,6 @@ void __free_page_frag(void *addr)
 }
 EXPORT_SYMBOL(__free_page_frag);
 
-/*
- * alloc_kmem_pages charges newly allocated pages to the kmem resource counter
- * of the current memory cgroup.
- *
- * It should be used when the caller would like to use kmalloc, but since the
- * allocation is large, it has to fall back to the page allocator.
- */
-struct page *alloc_kmem_pages(gfp_t gfp_mask, unsigned int order)
-{
-	struct page *page;
-	struct mem_cgroup *memcg = NULL;
-
-	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-		return NULL;
-	page = alloc_pages(gfp_mask, order);
-	memcg_kmem_commit_charge(page, memcg, order);
-	return page;
-}
-
-struct page *alloc_kmem_pages_node(int nid, gfp_t gfp_mask, unsigned int order)
-{
-	struct page *page;
-	struct mem_cgroup *memcg = NULL;
-
-	if (!memcg_kmem_newpage_charge(gfp_mask, &memcg, order))
-		return NULL;
-	page = alloc_pages_node(nid, gfp_mask, order);
-	memcg_kmem_commit_charge(page, memcg, order);
-	return page;
-}
-
-/*
- * __free_memcg_kmem_pages and free_memcg_kmem_pages will free
- * pages allocated with __GFP_KMEMCG.
- *
- * Those pages are accounted to a particular memcg, embedded in the
- * corresponding page_cgroup. To avoid adding a hit in the allocator to search
- * for that information only to find out that it is NULL for users who have no
- * interest in that whatsoever, we provide these functions.
- *
- * The caller knows better which flags it relies on.
- */
-void __free_memcg_kmem_pages(struct page *page, unsigned int order)
-{
-	memcg_kmem_uncharge_pages(page, order);
-	__free_pages(page, order);
-}
-
-void free_memcg_kmem_pages(unsigned long addr, unsigned int order)
-{
-	if (addr != 0) {
-		VM_BUG_ON(!virt_addr_valid((void *)addr));
-		__free_memcg_kmem_pages(virt_to_page((void *)addr), order);
-	}
-}
-
 static void *make_alloc_exact(unsigned long addr, unsigned order, size_t size)
 {
 	if (addr) {
@@ -3524,6 +3522,10 @@ static inline void show_node(struct zone *zone)
 		printk("Node %d ", zone_to_nid(zone));
 }
 
+#ifdef CONFIG_TCACHE
+extern unsigned long get_nr_tcache_pages(void);
+#endif
+
 long si_mem_available(void)
 {
 	long available;
@@ -3561,6 +3563,10 @@ long si_mem_available(void)
 	available += global_page_state(NR_SLAB_RECLAIMABLE) -
 		     min(global_page_state(NR_SLAB_RECLAIMABLE) / 2, wmark_low);
 
+#ifdef CONFIG_TCACHE
+	available += get_nr_tcache_pages();
+#endif
+
 	if (available < 0)
 		available = 0;
 	return available;
@@ -3673,7 +3679,7 @@ void show_free_areas(unsigned int filter)
 
 	printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
 		" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
-		" unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+		" unevictable:%lu dirty:%lu writeback:%lu wbtmp:%lu unstable:%lu\n"
 		" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
 		" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
 		" free:%lu free_pcp:%lu free_cma:%lu\n",
@@ -3686,6 +3692,7 @@ void show_free_areas(unsigned int filter)
 		global_page_state(NR_UNEVICTABLE),
 		global_page_state(NR_FILE_DIRTY),
 		global_page_state(NR_WRITEBACK),
+		global_page_state(NR_WRITEBACK_TEMP),
 		global_page_state(NR_UNSTABLE_NFS),
 		global_page_state(NR_SLAB_RECLAIMABLE),
 		global_page_state(NR_SLAB_UNRECLAIMABLE),
@@ -3771,7 +3778,7 @@ void show_free_areas(unsigned int filter)
 			K(zone_page_state(zone, NR_FREE_CMA_PAGES)),
 			K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
 			zone->pages_scanned,
-			(zone->all_unreclaimable ? "yes" : "no")
+			(!zone_reclaimable(zone) ? "yes" : "no")
 			);
 		printk("lowmem_reserve[]:");
 		for (i = 0; i < MAX_NR_ZONES; i++)
@@ -6822,9 +6829,9 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count,
 		if (!PageLRU(page))
 			found++;
 		/*
-		 * If there are RECLAIMABLE pages, we need to check it.
-		 * But now, memory offline itself doesn't call shrink_slab()
-		 * and it still to be fixed.
+		 * If there are RECLAIMABLE pages, we need to check
+		 * them.  But for now, memory offline itself doesn't
+		 * call shrink_node_slabs(); this still needs fixing.
 		 */
 		/*
 		 * If the page is not RAM, page_count()should be 0.
@@ -7213,6 +7220,10 @@ static const struct trace_print_flags pageflag_names[] = {
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	{1UL << PG_compound_lock,	"compound_lock"	},
 #endif
+#ifdef CONFIG_IDLE_PAGE_TRACKING
+	{1UL << PG_young,		"young"		},
+	{1UL << PG_idle,		"idle"		},
+#endif
 };
 
 static void dump_page_flags(unsigned long flags)
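
The memcg kmem hunks above invert the old charge ordering: instead of precharging a
memcg before allocation and committing afterwards, the page is allocated first and
charged afterwards, with the page freed again if the charge fails; the matching
uncharge moved into free_pages_prepare(). A condensed sketch (alloc_charged() is a
hypothetical wrapper, assuming the reworked memcg_kmem_newpage_charge() returns true on
success):

    static struct page *alloc_charged(gfp_t gfp, unsigned int order)
    {
            struct page *page = alloc_pages(gfp, order);

            /* charge after allocation; roll back if over the kmem limit */
            if (page && !memcg_kmem_newpage_charge(page, gfp, order)) {
                    __free_pages(page, order);
                    page = NULL;
            }
            return page;    /* uncharged automatically on the free path */
    }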
--- /dev/null
+++ b/mm/page_idle.c
@@ -0,0 +1,229 @@
+/*
+ *  mm/page_idle.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/init.h>
+#include <linux/bootmem.h>
+#include <linux/fs.h>
+#include <linux/sysfs.h>
+#include <linux/kobject.h>
+#include <linux/mm.h>
+#include <linux/mmzone.h>
+#include <linux/pagemap.h>
+#include <linux/rmap.h>
+#include <linux/mmu_notifier.h>
+#include <linux/ksm.h>
+#include <linux/page_idle.h>
+
+#define BITMAP_CHUNK_SIZE	sizeof(u64)
+#define BITMAP_CHUNK_BITS	(BITMAP_CHUNK_SIZE * BITS_PER_BYTE)
+
+/*
+ * Idle page tracking only considers user memory pages, for other types of
+ * pages the idle flag is always unset and an attempt to set it is silently
+ * ignored.
+ *
+ * We treat a page as a user memory page if it is on an LRU list, because it is
+ * always safe to pass such a page to rmap_walk(), which is essential for idle
+ * page tracking. With such an indicator of user pages we can skip isolated
+ * pages, but since there are not usually many of them, it will hardly affect
+ * the overall result.
+ *
+ * This function tries to get a user memory page by pfn as described above.
+ */
+static struct page *page_idle_get_page(unsigned long pfn)
+{
+	struct page *page;
+	struct zone *zone;
+
+	if (!pfn_valid(pfn))
+		return NULL;
+
+	page = pfn_to_page(pfn);
+	if (!page || !PageLRU(page) ||
+	    !get_page_unless_zero(page))
+		return NULL;
+
+	zone = page_zone(page);
+	spin_lock_irq(&zone->lru_lock);
+	if (unlikely(!PageLRU(page))) {
+		put_page(page);
+		page = NULL;
+	}
+	spin_unlock_irq(&zone->lru_lock);
+	return page;
+}
+
+static int page_idle_clear_pte_refs_one(struct page *page,
+					struct vm_area_struct *vma,
+					unsigned long addr, void *arg)
+{
+	struct mm_struct *mm = vma->vm_mm;
+	spinlock_t *ptl;
+	pmd_t *pmd;
+	pte_t *pte;
+	bool referenced = false;
+
+	if (unlikely(PageTransHuge(page))) {
+		pmd = page_check_address_pmd(page, mm, addr,
+					     PAGE_CHECK_ADDRESS_PMD_FLAG, &ptl);
+		if (pmd) {
+			referenced = pmdp_clear_young_notify(vma, addr, pmd);
+			spin_unlock(ptl);
+		}
+	} else {
+		pte = page_check_address(page, mm, addr, &ptl, 0);
+		if (pte) {
+			referenced = ptep_clear_young_notify(vma, addr, pte);
+			pte_unmap_unlock(pte, ptl);
+		}
+	}
+	if (referenced) {
+		clear_page_idle(page);
+		/*
+		 * We cleared the referenced bit in a mapping to this page. To
+		 * avoid interference with page reclaim, mark it young so that
+		 * page_referenced() will return > 0.
+		 */
+		set_page_young(page);
+	}
+	return SWAP_AGAIN;
+}
+
+static void page_idle_clear_pte_refs(struct page *page)
+{
+	/*
+	 * Since rwc.arg is unused, rwc is effectively immutable, so we
+	 * can make it static const to save some cycles and stack.
+	 */
+	static const struct rmap_walk_control rwc = {
+		.rmap_one = page_idle_clear_pte_refs_one,
+		.anon_lock = page_lock_anon_vma_read,
+	};
+	bool need_lock;
+
+	if (!page_mapped(page) ||
+	    !page_rmapping(page))
+		return;
+
+	need_lock = !PageAnon(page) || PageKsm(page);
+	if (need_lock && !trylock_page(page))
+		return;
+
+	rmap_walk(page, (struct rmap_walk_control *)&rwc);
+
+	if (need_lock)
+		unlock_page(page);
+}
+
+static ssize_t page_idle_bitmap_read(struct file *file, struct kobject *kobj,
+				     struct bin_attribute *attr, char *buf,
+				     loff_t pos, size_t count)
+{
+	u64 *out = (u64 *)buf;
+	struct page *page;
+	unsigned long pfn, end_pfn;
+	int bit;
+
+	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	pfn = pos * BITS_PER_BYTE;
+	if (pfn >= max_pfn)
+		return 0;
+
+	end_pfn = pfn + count * BITS_PER_BYTE;
+	if (end_pfn > max_pfn)
+		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+	for (; pfn < end_pfn; pfn++) {
+		bit = pfn % BITMAP_CHUNK_BITS;
+		if (!bit)
+			*out = 0ULL;
+		page = page_idle_get_page(pfn);
+		if (page) {
+			if (page_is_idle(page)) {
+				/*
+				 * The page might have been referenced via a
+				 * pte, in which case it is not idle. Clear
+				 * refs and recheck.
+				 */
+				page_idle_clear_pte_refs(page);
+				if (page_is_idle(page))
+					*out |= 1ULL << bit;
+			}
+			put_page(page);
+		}
+		if (bit == BITMAP_CHUNK_BITS - 1)
+			out++;
+		cond_resched();
+	}
+	return (char *)out - buf;
+}
+
+static ssize_t page_idle_bitmap_write(struct file *file, struct kobject *kobj,
+				      struct bin_attribute *attr, char *buf,
+				      loff_t pos, size_t count)
+{
+	const u64 *in = (u64 *)buf;
+	struct page *page;
+	unsigned long pfn, end_pfn;
+	int bit;
+
+	if (pos % BITMAP_CHUNK_SIZE || count % BITMAP_CHUNK_SIZE)
+		return -EINVAL;
+
+	pfn = pos * BITS_PER_BYTE;
+	if (pfn >= max_pfn)
+		return -ENXIO;
+
+	end_pfn = pfn + count * BITS_PER_BYTE;
+	if (end_pfn > max_pfn)
+		end_pfn = ALIGN(max_pfn, BITMAP_CHUNK_BITS);
+
+	for (; pfn < end_pfn; pfn++) {
+		bit = pfn % BITMAP_CHUNK_BITS;
+		if ((*in >> bit) & 1) {
+			page = page_idle_get_page(pfn);
+			if (page) {
+				page_idle_clear_pte_refs(page);
+				set_page_idle(page);
+				put_page(page);
+			}
+		}
+		if (bit == BITMAP_CHUNK_BITS - 1)
+			in++;
+		cond_resched();
+	}
+	return (char *)in - buf;
+}
+
+static struct bin_attribute page_idle_bitmap_attr =
+		__BIN_ATTR(bitmap, S_IRUSR | S_IWUSR,
+			   page_idle_bitmap_read, page_idle_bitmap_write, 0);
+
+static struct bin_attribute *page_idle_bin_attrs[] = {
+	&page_idle_bitmap_attr,
+	NULL,
+};
+
+static struct attribute_group page_idle_attr_group = {
+	.bin_attrs = page_idle_bin_attrs,
+	.name = "page_idle",
+};
+
+static int __init page_idle_init(void)
+{
+	int err;
+
+	err = sysfs_create_group(mm_kobj, &page_idle_attr_group);
+	if (err) {
+		pr_err("page_idle: register sysfs failed\n");
+		return err;
+	}
+	return 0;
+}
+subsys_initcall(page_idle_init);
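
Per the attribute group above, the bitmap appears as /sys/kernel/mm/page_idle/bitmap:
bit N of the file maps to pfn N, reads and writes must be multiples of 8 bytes, writing
a set bit marks the page idle, and a bit that still reads back as set was not
referenced since it was marked. A userspace sketch:

    /* Userspace sketch: mark a pfn idle, later test whether it stayed idle.
     * fd = open("/sys/kernel/mm/page_idle/bitmap", O_RDWR); */
    #include <stdint.h>
    #include <unistd.h>

    static int pfn_mark_idle(int fd, unsigned long pfn)
    {
            uint64_t chunk = 1ULL << (pfn % 64);

            return pwrite(fd, &chunk, sizeof(chunk),
                          (pfn / 64) * 8) == sizeof(chunk) ? 0 : -1;
    }

    static int pfn_test_idle(int fd, unsigned long pfn)
    {
            uint64_t chunk = 0;

            if (pread(fd, &chunk, sizeof(chunk),
                      (pfn / 64) * 8) != sizeof(chunk))
                    return -1;
            return (chunk >> (pfn % 64)) & 1;
    }

pfn_mark_idle() writes a whole u64, but since the kernel only acts on set bits, the
other 63 pages covered by the chunk are left untouched.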
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -24,12 +24,14 @@
 #include <linux/blkdev.h>
 #include <asm/pgtable.h>
 
+static struct bio_set *swap_bio_set;
+
 static struct bio *get_swap_bio(gfp_t gfp_flags,
 				struct page *page, bio_end_io_t end_io)
 {
 	struct bio *bio;
 
-	bio = bio_alloc(gfp_flags, 1);
+	bio = bio_alloc_bioset(gfp_flags, 1, swap_bio_set);
 	if (bio) {
 		bio->bi_sector = map_swap_page(page, &bio->bi_bdev);
 		bio->bi_sector <<= PAGE_SHIFT - 9;
@@ -336,3 +338,12 @@ int swap_set_page_dirty(struct page *page)
 		return __set_page_dirty_no_writeback(page);
 	}
 }
+
+static int __init swap_init(void)
+{
+	swap_bio_set = bioset_create(SWAP_CLUSTER_MAX, 0);
+	if (!swap_bio_set)
+		panic("can't allocate swap_bio_set\n");
+	return 0;
+}
+late_initcall(swap_init);
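
Swap-out runs precisely when memory is scarce, so allocating its bios from the shared
fs_bio_set (as plain bio_alloc() does) can stall behind every other allocator. The
private bio_set above reserves SWAP_CLUSTER_MAX entries in a mempool, so a sleeping
allocation can always fall back to the reserve and make forward progress:

    /* Sketch: a __GFP_WAIT allocation from the reserved pool does not fail
     * permanently; it waits for a pooled bio to be returned instead. */
    bio = bio_alloc_bioset(GFP_NOIO, 1, swap_bio_set);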
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -19,6 +19,7 @@
 #include <linux/pagemap.h>
 #include <linux/syscalls.h>
 #include <linux/file.h>
+#include <linux/virtinfo.h>
 
 /*
  * Initialise a struct file's readahead state.  Assumes that the caller has
@@ -117,6 +118,8 @@ static int read_pages(struct address_space *mapping, struct file *filp,
 
 	blk_start_plug(&plug);
 
+	virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_PREPARE, NULL);
+
 	if (mapping->a_ops->readpages) {
 		ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
 		/* Clean up the remaining pages */
@@ -513,6 +516,10 @@ void page_cache_sync_readahead(struct address_space *mapping,
 		return;
 	}
 
+	if (virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_READAHEAD,
+				NULL) & NOTIFY_FAIL)
+		return;
+
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, false, offset, req_size);
 }
@@ -557,6 +564,10 @@ page_cache_async_readahead(struct address_space *mapping,
 	if (bdi_read_congested(mapping->backing_dev_info))
 		return;
 
+	if (virtinfo_notifier_call(VITYPE_IO, VIRTINFO_IO_READAHEAD,
+				NULL) & NOTIFY_FAIL)
+		return;
+
 	/* do read-ahead */
 	ondemand_readahead(mapping, ra, filp, true, offset, req_size);
 }
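
Both readahead paths now ask VITYPE_IO listeners for permission, so a container over
its I/O budget degrades to strict demand paging instead of speculative reads. A
hypothetical listener (sketch only: the vnotifier callback signature and registration
call are assumed from the OpenVZ virtinfo API, and io_budget_exceeded() is an invented
predicate):

    static int io_ra_notify(struct vnotifier_block *self, unsigned long event,
                            void *arg, int old_ret)
    {
            if (event == VIRTINFO_IO_READAHEAD && io_budget_exceeded())
                    return NOTIFY_FAIL;     /* veto this readahead */
            return old_ret;
    }

    static struct vnotifier_block io_ra_nb = {
            .notifier_call = io_ra_notify,
    };
    /* virtinfo_notifier_register(VITYPE_IO, &io_ra_nb); */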
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -57,6 +57,7 @@
 #include <linux/migrate.h>
 #include <linux/hugetlb.h>
 #include <linux/backing-dev.h>
+#include <linux/page_idle.h>
 
 #include <asm/tlbflush.h>
 
@@ -424,8 +425,10 @@ static void anon_vma_ctor(void *data)
 void __init anon_vma_init(void)
 {
 	anon_vma_cachep = kmem_cache_create("anon_vma", sizeof(struct anon_vma),
-			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC, anon_vma_ctor);
-	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain, SLAB_PANIC);
+			0, SLAB_DESTROY_BY_RCU|SLAB_PANIC|SLAB_ACCOUNT,
+			anon_vma_ctor);
+	anon_vma_chain_cachep = KMEM_CACHE(anon_vma_chain,
+			SLAB_PANIC|SLAB_ACCOUNT);
 }
 
 /*
@@ -702,9 +705,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
 		if (!vma->anon_vma || !page__anon_vma ||
 		    vma->anon_vma->root != page__anon_vma->root)
 			return -EFAULT;
-	} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
-		if (!vma->vm_file ||
-		    vma->vm_file->f_mapping != page->mapping)
+	} else if (page->mapping) {
+		if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping)
 			return -EFAULT;
 	} else
 		return -EFAULT;
@@ -883,6 +885,11 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
 		pte_unmap_unlock(pte, ptl);
 	}
 
+	if (referenced)
+		clear_page_idle(page);
+	if (test_and_clear_page_young(page))
+		referenced++;
+
 	(*mapcount)--;
 
 	if (referenced)
@@ -945,7 +952,7 @@ static int page_referenced_file(struct page *page,
 				unsigned long *vm_flags)
 {
 	unsigned int mapcount;
-	struct address_space *mapping = page->mapping;
+	struct address_space *mapping = page->mapping, *peer;
 	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int referenced = 0;
@@ -965,7 +972,7 @@ static int page_referenced_file(struct page *page,
 	 */
 	BUG_ON(!PageLocked(page));
 
-	mutex_lock(&mapping->i_mmap_mutex);
+	mutex_lock_nested(&mapping->i_mmap_mutex, SINGLE_DEPTH_NESTING);
 
 	/*
 	 * i_mmap_mutex does not stabilize mapcount at all, but mapcount
@@ -985,9 +992,36 @@ static int page_referenced_file(struct page *page,
 		referenced += page_referenced_one(page, vma, address,
 						  &mapcount, vm_flags);
 		if (!mapcount)
-			break;
+			goto out;
 	}
 
+	/* Does page belong to pfcache mapping? */
+	if (!mapping->i_peer_file ||
+	    mapping->i_peer_file->f_mapping != mapping)
+		goto out;
+
+	list_for_each_entry(peer, &mapping->i_peer_list, i_peer_list) {
+		if (!mapping_mapped(peer))
+			continue;
+
+		mutex_lock(&peer->i_mmap_mutex);
+
+		vma_interval_tree_foreach(vma, &peer->i_mmap, pgoff, pgoff) {
+			unsigned long address = vma_address(page, vma);
+			if (memcg && !mm_match_cgroup(vma->vm_mm, memcg))
+				continue;
+			referenced += page_referenced_one(page, vma, address,
+							  &mapcount, vm_flags);
+			if (!mapcount)
+				break;
+		}
+
+		mutex_unlock(&peer->i_mmap_mutex);
+
+		if (!mapcount)
+			goto out;
+	}
+out:
 	mutex_unlock(&mapping->i_mmap_mutex);
 	return referenced;
 }
@@ -1247,11 +1281,6 @@ void page_add_new_anon_rmap(struct page *page,
 	else
 		__inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES);
 	__page_set_anon_rmap(page, vma, address, 1);
-	if (!mlocked_vma_newpage(vma, page)) {
-		SetPageActive(page);
-		lru_cache_add(page);
-	} else
-		add_page_to_unevictable_list(page);
 }
 
 /**
@@ -1260,7 +1289,7 @@ void page_add_new_anon_rmap(struct page *page,
  *
  * The caller needs to hold the pte lock.
  */
-void page_add_file_rmap(struct page *page)
+void page_add_file_rmap(struct page *page, struct mm_struct *mm)
 {
 	bool locked;
 	unsigned long flags;
@@ -1304,7 +1333,6 @@ void page_remove_rmap(struct page *page)
 	if (unlikely(PageHuge(page)))
 		goto out;
 	if (anon) {
-		mem_cgroup_uncharge_page(page);
 		if (!PageTransHuge(page))
 			__dec_zone_page_state(page, NR_ANON_PAGES);
 		else
@@ -1387,7 +1415,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 
 	/* Move the dirty bit to the physical page now the pte is gone. */
 	if (pte_dirty(pteval))
-		set_page_dirty(page);
+		set_page_dirty_mm(page, mm);
 
 	/* Update high watermark before we lower rss */
 	update_hiwater_rss(mm);
@@ -1432,7 +1460,6 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
 		if (pte_soft_dirty(pteval))
 			swp_pte = pte_swp_mksoft_dirty(swp_pte);
 		set_pte_at(mm, address, pte, swp_pte);
-		BUG_ON(pte_file(*pte));
 	} else if (IS_ENABLED(CONFIG_MIGRATION) &&
 		   (TTU_ACTION(flags) == TTU_MIGRATION)) {
 		/* Establish migration entry for a file page */
@@ -1474,133 +1501,6 @@ out_mlock:
 	return ret;
 }
 
-/*
- * objrmap doesn't work for nonlinear VMAs because the assumption that
- * offset-into-file correlates with offset-into-virtual-addresses does not hold.
- * Consequently, given a particular page and its ->index, we cannot locate the
- * ptes which are mapping that page without an exhaustive linear search.
- *
- * So what this code does is a mini "virtual scan" of each nonlinear VMA which
- * maps the file to which the target page belongs.  The ->vm_private_data field
- * holds the current cursor into that scan.  Successive searches will circulate
- * around the vma's virtual address space.
- *
- * So as more replacement pressure is applied to the pages in a nonlinear VMA,
- * more scanning pressure is placed against them as well.   Eventually pages
- * will become fully unmapped and are eligible for eviction.
- *
- * For very sparsely populated VMAs this is a little inefficient - chances are
- * there there won't be many ptes located within the scan cluster.  In this case
- * maybe we could scan further - to the end of the pte page, perhaps.
- *
- * Mlocked pages:  check VM_LOCKED under mmap_sem held for read, if we can
- * acquire it without blocking.  If vma locked, mlock the pages in the cluster,
- * rather than unmapping them.  If we encounter the "check_page" that vmscan is
- * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN.
- */
-#define CLUSTER_SIZE	min(32*PAGE_SIZE, PMD_SIZE)
-#define CLUSTER_MASK	(~(CLUSTER_SIZE - 1))
-
-static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount,
-		struct vm_area_struct *vma, struct page *check_page)
-{
-	struct mm_struct *mm = vma->vm_mm;
-	pmd_t *pmd;
-	pte_t *pte;
-	pte_t pteval;
-	spinlock_t *ptl;
-	struct page *page;
-	unsigned long address;
-	unsigned long mmun_start;	/* For mmu_notifiers */
-	unsigned long mmun_end;		/* For mmu_notifiers */
-	unsigned long end;
-	int ret = SWAP_AGAIN;
-	int locked_vma = 0;
-
-	address = (vma->vm_start + cursor) & CLUSTER_MASK;
-	end = address + CLUSTER_SIZE;
-	if (address < vma->vm_start)
-		address = vma->vm_start;
-	if (end > vma->vm_end)
-		end = vma->vm_end;
-
-	pmd = mm_find_pmd(mm, address);
-	if (!pmd)
-		return ret;
-
-	mmun_start = address;
-	mmun_end   = end;
-	mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end);
-
-	/*
-	 * If we can acquire the mmap_sem for read, and vma is VM_LOCKED,
-	 * keep the sem while scanning the cluster for mlocking pages.
-	 */
-	if (down_read_trylock(&vma->vm_mm->mmap_sem)) {
-		locked_vma = (vma->vm_flags & VM_LOCKED);
-		if (!locked_vma)
-			up_read(&vma->vm_mm->mmap_sem); /* don't need it */
-	}
-
-	pte = pte_offset_map_lock(mm, pmd, address, &ptl);
-
-	/* Update high watermark before we lower rss */
-	update_hiwater_rss(mm);
-
-	for (; address < end; pte++, address += PAGE_SIZE) {
-		if (!pte_present(*pte))
-			continue;
-		page = vm_normal_page(vma, address, *pte);
-		BUG_ON(!page || PageAnon(page));
-
-		if (locked_vma) {
-			if (page == check_page) {
-				/* we know we have check_page locked */
-				mlock_vma_page(page);
-				ret = SWAP_MLOCK;
-			} else if (trylock_page(page)) {
-				/*
-				 * If we can lock the page, perform mlock.
-				 * Otherwise leave the page alone, it will be
-				 * eventually encountered again later.
-				 */
-				mlock_vma_page(page);
-				unlock_page(page);
-			}
-			continue;	/* don't unmap */
-		}
-
-		if (ptep_clear_flush_young_notify(vma, address, pte))
-			continue;
-
-		/* Nuke the page table entry. */
-		flush_cache_page(vma, address, pte_pfn(*pte));
-		pteval = ptep_clear_flush_notify(vma, address, pte);
-
-		/* If nonlinear, store the file page offset in the pte. */
-		if (page->index != linear_page_index(vma, address)) {
-			pte_t ptfile = pgoff_to_pte(page->index);
-			if (pte_soft_dirty(pteval))
-				ptfile = pte_file_mksoft_dirty(ptfile);
-			set_pte_at(mm, address, pte, ptfile);
-		}
-
-		/* Move the dirty bit to the physical page now the pte is gone. */
-		if (pte_dirty(pteval))
-			set_page_dirty(page);
-
-		page_remove_rmap(page);
-		page_cache_release(page);
-		dec_mm_counter(mm, mm_counter_file(page));
-		(*mapcount)--;
-	}
-	pte_unmap_unlock(pte - 1, ptl);
-	mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end);
-	if (locked_vma)
-		up_read(&vma->vm_mm->mmap_sem);
-	return ret;
-}
-
 bool is_vma_temporary_stack(struct vm_area_struct *vma)
 {
 	int maybe_stack = vma->vm_flags & (VM_GROWSDOWN | VM_GROWSUP);
@@ -1684,18 +1584,13 @@ static int try_to_unmap_anon(struct page *page, enum ttu_flags flags)
  * vm_flags for that VMA.  That should be OK, because that vma shouldn't be
  * 'LOCKED.
  */
-static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+static int try_to_unmap_mapping(struct page *page,
+		struct address_space *mapping, enum ttu_flags flags)
 {
-	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff = page_to_pgoff(page);
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
-	unsigned long cursor;
-	unsigned long max_nl_cursor = 0;
-	unsigned long max_nl_size = 0;
-	unsigned int mapcount;
 
-	mutex_lock(&mapping->i_mmap_mutex);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
 		ret = try_to_unmap_one(page, vma, address, flags);
@@ -1703,75 +1598,39 @@ static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
 			goto out;
 	}
 
-	if (list_empty(&mapping->i_mmap_nonlinear))
-		goto out;
+out:
+	return ret;
+}
 
-	/*
-	 * We don't bother to try to find the munlocked page in nonlinears.
-	 * It's costly. Instead, later, page reclaim logic may call
-	 * try_to_unmap(TTU_MUNLOCK) and recover PG_mlocked lazily.
-	 */
-	if (TTU_ACTION(flags) == TTU_MUNLOCK)
-		goto out;
+static int try_to_unmap_file(struct page *page, enum ttu_flags flags)
+{
+	struct address_space *mapping = page->mapping, *peer;
+	int ret;
 
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-							shared.nonlinear) {
-		cursor = (unsigned long) vma->vm_private_data;
-		if (cursor > max_nl_cursor)
-			max_nl_cursor = cursor;
-		cursor = vma->vm_end - vma->vm_start;
-		if (cursor > max_nl_size)
-			max_nl_size = cursor;
-	}
+	mutex_lock_nested(&mapping->i_mmap_mutex, SINGLE_DEPTH_NESTING);
 
-	if (max_nl_size == 0) {	/* all nonlinears locked or reserved ? */
-		ret = SWAP_FAIL;
+	ret = try_to_unmap_mapping(page, mapping, flags);
+	if (ret != SWAP_AGAIN || !page_mapped(page))
 		goto out;
-	}
 
-	/*
-	 * We don't try to search for this page in the nonlinear vmas,
-	 * and page_referenced wouldn't have found it anyway.  Instead
-	 * just walk the nonlinear vmas trying to age and unmap some.
-	 * The mapcount of the page we came in with is irrelevant,
-	 * but even so use it as a guide to how hard we should try?
-	 */
-	mapcount = page_mapcount(page);
-	if (!mapcount)
+	/* Does page belong to pfcache mapping? */
+	if (!mapping->i_peer_file ||
+	    mapping->i_peer_file->f_mapping != mapping)
 		goto out;
-	cond_resched();
-
-	max_nl_size = (max_nl_size + CLUSTER_SIZE - 1) & CLUSTER_MASK;
-	if (max_nl_cursor == 0)
-		max_nl_cursor = CLUSTER_SIZE;
-
-	do {
-		list_for_each_entry(vma, &mapping->i_mmap_nonlinear,
-							shared.nonlinear) {
-			cursor = (unsigned long) vma->vm_private_data;
-			while ( cursor < max_nl_cursor &&
-				cursor < vma->vm_end - vma->vm_start) {
-				if (try_to_unmap_cluster(cursor, &mapcount,
-						vma, page) == SWAP_MLOCK)
-					ret = SWAP_MLOCK;
-				cursor += CLUSTER_SIZE;
-				vma->vm_private_data = (void *) cursor;
-				if ((int)mapcount <= 0)
-					goto out;
-			}
-			vma->vm_private_data = (void *) max_nl_cursor;
-		}
-		cond_resched();
-		max_nl_cursor += CLUSTER_SIZE;
-	} while (max_nl_cursor <= max_nl_size);
 
 	/*
-	 * Don't loop forever (perhaps all the remaining pages are
-	 * in locked vmas).  Reset cursor on all unreserved nonlinear
-	 * vmas, now forgetting on which ones it had fallen behind.
+	 * Handle TTU_MIGRATION like TTU_UNMAP, without migration ptes.
 	 */
-	list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.nonlinear)
-		vma->vm_private_data = NULL;
+	if (TTU_ACTION(flags) != TTU_MUNLOCK)
+		flags = TTU_UNMAP | (flags & ~TTU_ACTION_MASK);
+
+	list_for_each_entry(peer, &mapping->i_peer_list, i_peer_list) {
+		mutex_lock(&peer->i_mmap_mutex);
+		ret = try_to_unmap_mapping(page, peer, flags);
+		mutex_unlock(&peer->i_mmap_mutex);
+		if (ret != SWAP_AGAIN || !page_mapped(page))
+			break;
+	}
 out:
 	mutex_unlock(&mapping->i_mmap_mutex);
 	return ret;
@@ -1846,17 +1705,13 @@ void __put_anon_vma(struct anon_vma *anon_vma)
 }
 
 #ifdef CONFIG_MIGRATION
-/*
- * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
- * Called by migrate.c to remove migration ptes, but might be used more later.
- */
-static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+static struct anon_vma *rmap_walk_anon_lock(struct page *page,
+					struct rmap_walk_control *rwc)
 {
 	struct anon_vma *anon_vma;
-	pgoff_t pgoff;
-	struct anon_vma_chain *avc;
-	int ret = SWAP_AGAIN;
+
+	if (rwc->anon_lock)
+		return rwc->anon_lock(page);
 
 	/*
 	 * Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
@@ -1866,58 +1721,91 @@ static int rmap_walk_anon(struct page *page, int (*rmap_one)(struct page *,
 	 */
 	anon_vma = page_anon_vma(page);
 	if (!anon_vma)
-		return ret;
+		return NULL;
+
 	anon_vma_lock_read(anon_vma);
+	return anon_vma;
+}
+
+/*
+ * rmap_walk() and its helpers rmap_walk_anon() and rmap_walk_file():
+ * Called by migrate.c to remove migration ptes, but might be used more later.
+ */
+static int rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc)
+{
+	struct anon_vma *anon_vma;
+	pgoff_t pgoff;
+	struct anon_vma_chain *avc;
+	int ret = SWAP_AGAIN;
+
+	anon_vma = rmap_walk_anon_lock(page, rwc);
+	if (!anon_vma)
+		return ret;
+
 	pgoff = page_to_pgoff(page);
+
 	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root, pgoff, pgoff) {
 		struct vm_area_struct *vma = avc->vma;
 		unsigned long address = vma_address(page, vma);
-		ret = rmap_one(page, vma, address, arg);
+
+		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+			continue;
+
+		ret = rwc->rmap_one(page, vma, address, rwc->arg);
 		if (ret != SWAP_AGAIN)
 			break;
+		if (rwc->done && rwc->done(page))
+			break;
 	}
 	anon_vma_unlock_read(anon_vma);
 	return ret;
 }
 
-static int rmap_walk_file(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+static int rmap_walk_file(struct page *page, struct rmap_walk_control *rwc)
 {
 	struct address_space *mapping = page->mapping;
 	pgoff_t pgoff;
 	struct vm_area_struct *vma;
 	int ret = SWAP_AGAIN;
 
+	/*
+	 * The page lock not only makes sure that page->mapping cannot
+	 * suddenly be NULLified by truncation, it makes sure that the
+	 * structure at mapping cannot be freed and reused yet,
+	 * so we can safely take mapping->i_mmap_mutex.
+	 */
+	VM_BUG_ON(!PageLocked(page));
+
 	if (!mapping)
 		return ret;
 	pgoff = page_to_pgoff(page);
 	mutex_lock(&mapping->i_mmap_mutex);
 	vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) {
 		unsigned long address = vma_address(page, vma);
-		ret = rmap_one(page, vma, address, arg);
+
+		if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg))
+			continue;
+
+		ret = rwc->rmap_one(page, vma, address, rwc->arg);
 		if (ret != SWAP_AGAIN)
-			break;
+			goto done;
+		if (rwc->done && rwc->done(page))
+			goto done;
 	}
-	/*
-	 * No nonlinear handling: being always shared, nonlinear vmas
-	 * never contain migration ptes.  Decide what to do about this
-	 * limitation to linear when we need rmap_walk() on nonlinear.
-	 */
+
+done:
 	mutex_unlock(&mapping->i_mmap_mutex);
 	return ret;
 }
 
-int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
-		struct vm_area_struct *, unsigned long, void *), void *arg)
+int rmap_walk(struct page *page, struct rmap_walk_control *rwc)
 {
-	VM_BUG_ON(!PageLocked(page));
-
 	if (unlikely(PageKsm(page)))
-		return rmap_walk_ksm(page, rmap_one, arg);
+		return rmap_walk_ksm(page, rwc);
 	else if (PageAnon(page))
-		return rmap_walk_anon(page, rmap_one, arg);
+		return rmap_walk_anon(page, rwc);
 	else
-		return rmap_walk_file(page, rmap_one, arg);
+		return rmap_walk_file(page, rwc);
 }
 #endif /* CONFIG_MIGRATION */
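
With the new calling convention, an rmap_walk() user packages its per-mapping callback
plus optional hooks (done for early exit, invalid_vma to filter VMAs, anon_lock for
custom anon_vma locking) into a control structure; page_idle.c earlier in this patch is
a complete in-tree user. A minimal skeleton (my_rmap_one() is hypothetical):

    static int my_rmap_one(struct page *page, struct vm_area_struct *vma,
                           unsigned long addr, void *arg)
    {
            /* inspect or modify the mapping of page at addr in vma->vm_mm */
            return SWAP_AGAIN;              /* keep walking */
    }

    static void walk_all_mappings(struct page *page)
    {
            struct rmap_walk_control rwc = {
                    .rmap_one = my_rmap_one,
            };

            rmap_walk(page, &rwc);  /* file pages must be locked by caller */
    }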
 
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -68,7 +68,12 @@ static struct vfsmount *shm_mnt;
 #include <linux/highmem.h>
 #include <linux/seq_file.h>
 #include <linux/magic.h>
+#include <linux/syscalls.h>
 #include <linux/fcntl.h>
+#include <uapi/linux/memfd.h>
+
+#include <bc/beancounter.h>
+#include <bc/vmpages.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -105,14 +110,24 @@ enum sgp_type {
 };
 
 #ifdef CONFIG_TMPFS
+static unsigned long tmpfs_ram_pages(void)
+{
+	struct user_beancounter *ub = get_exec_ub();
+
+	if (ub == get_ub0())
+		return totalram_pages;
+
+	return min(totalram_pages, ub_total_pages(ub, false));
+}
+
 static unsigned long shmem_default_max_blocks(void)
 {
-	return totalram_pages / 2;
+	return tmpfs_ram_pages() / 2;
 }
 
 static unsigned long shmem_default_max_inodes(void)
 {
-	return min(totalram_pages - totalhigh_pages, totalram_pages / 2);
+	return min(totalram_pages - totalhigh_pages, tmpfs_ram_pages() / 2);
 }
 #endif
 
@@ -140,16 +155,67 @@ static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb)
  * (unless MAP_NORESERVE and sysctl_overcommit_memory <= 1),
  * consistent with the pre-accounting of private mappings ...
  */
-static inline int shmem_acct_size(unsigned long flags, loff_t size)
+static inline int shmem_acct_size(unsigned long flags, loff_t size,
+				  struct user_beancounter *ub)
 {
-	return (flags & VM_NORESERVE) ?
-		0 : security_vm_enough_memory_mm(current->mm, VM_ACCT(size));
+	long pages = VM_ACCT(size);
+	int ret;
+
+	if (flags & VM_NORESERVE)
+		return 0;
+
+	ret = charge_beancounter(ub, UB_SHMPAGES, pages, UB_HARD);
+	if (ret)
+		goto no_shm;
+
+	ret = charge_beancounter_fast(ub, UB_PRIVVMPAGES, pages, UB_HARD);
+	if (ret)
+		goto no_privvm;
+
+	ret = security_vm_enough_memory_mm(current->mm, pages);
+	if (ret)
+		goto no_vm;
+#ifdef CONFIG_VE
+	/*
+	 * In a container the maximal amount of shared pages available
+	 * is limited by @max_blocks, so make sure there is space
+	 * left; otherwise users would wonder why their applications
+	 * get VM_FAULT_SIGBUS once the pool is exceeded.
+	 */
+	if (!ve_is_super(get_exec_env())) {
+		struct shmem_sb_info *sbinfo = SHMEM_SB(shm_mnt->mnt_sb);
+
+		if (sbinfo->max_blocks) {
+			if (sbinfo->max_blocks < pages ||
+			    percpu_counter_compare(&sbinfo->used_blocks,
+						   sbinfo->max_blocks - pages) > 0) {
+				ret = -ENOSPC;
+				goto no_vm;
+			}
+		}
+	}
+#endif
+
+	return 0;
+
+no_vm:
+	uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, pages);
+no_privvm:
+	uncharge_beancounter(ub, UB_SHMPAGES, pages);
+no_shm:
+	return ret;
 }
 
-static inline void shmem_unacct_size(unsigned long flags, loff_t size)
+static inline void shmem_unacct_size(unsigned long flags, loff_t size,
+				     struct user_beancounter *ub)
 {
-	if (!(flags & VM_NORESERVE))
-		vm_unacct_memory(VM_ACCT(size));
+	long pages = VM_ACCT(size);
+
+	if (!(flags & VM_NORESERVE)) {
+		vm_unacct_memory(pages);
+		uncharge_beancounter_fast(ub, UB_PRIVVMPAGES, pages);
+		uncharge_beancounter(ub, UB_SHMPAGES, pages);
+	}
 }
 
 /*
@@ -158,16 +224,16 @@ static inline void shmem_unacct_size(unsigned long flags, loff_t size)
  * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM,
  * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM.
  */
-static inline int shmem_acct_block(unsigned long flags)
+static inline int shmem_acct_block(struct shmem_inode_info *info)
 {
-	return (flags & VM_NORESERVE) ?
-		security_vm_enough_memory_mm(current->mm, VM_ACCT(PAGE_CACHE_SIZE)) : 0;
+	return shmem_acct_size(info->flags ^ VM_NORESERVE,
+			       PAGE_CACHE_SIZE, info->shmi_ub);
 }
 
-static inline void shmem_unacct_blocks(unsigned long flags, long pages)
+static inline void shmem_unacct_blocks(struct shmem_inode_info *info, long pages)
 {
-	if (flags & VM_NORESERVE)
-		vm_unacct_memory(pages * VM_ACCT(PAGE_CACHE_SIZE));
+	shmem_unacct_size(info->flags ^ VM_NORESERVE,
+			  pages << PAGE_SHIFT, info->shmi_ub);
 }
 
 static const struct super_operations shmem_ops;
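
shmem_acct_size() above charges three resources in order (UB_SHMPAGES, UB_PRIVVMPAGES,
then the global VM commitment) and unwinds the earlier charges in reverse through the
descending goto labels when a later step fails. The idiom, condensed to two resources
(charge_two() is a hypothetical reduction of the function above):

    static int charge_two(struct user_beancounter *ub, long pages)
    {
            int ret;

            ret = charge_beancounter(ub, UB_SHMPAGES, pages, UB_HARD);
            if (ret)
                    goto no_shm;
            ret = charge_beancounter_fast(ub, UB_PRIVVMPAGES, pages, UB_HARD);
            if (ret)
                    goto no_privvm;
            return 0;

    no_privvm:
            uncharge_beancounter(ub, UB_SHMPAGES, pages);   /* undo step 1 */
    no_shm:
            return ret;
    }

Note that shmem_acct_block() reuses the same function with info->flags ^ VM_NORESERVE,
so per-block accounting applies exactly when the up-front reservation was skipped, and
vice versa.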
@@ -235,7 +301,7 @@ static void shmem_recalc_inode(struct inode *inode)
 			percpu_counter_add(&sbinfo->used_blocks, -freed);
 		info->alloced -= freed;
 		inode->i_blocks -= freed * BLOCKS_PER_PAGE;
-		shmem_unacct_blocks(info->flags, freed);
+		shmem_unacct_blocks(info, freed);
 	}
 }
 
@@ -490,7 +556,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			pvec.pages, indices);
 		if (!pvec.nr)
 			break;
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -518,7 +583,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
@@ -566,7 +630,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 			index = start;
 			continue;
 		}
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -602,7 +665,6 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		index++;
 	}
 
@@ -664,7 +726,7 @@ static void shmem_evict_inode(struct inode *inode)
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	if (inode->i_mapping->a_ops == &shmem_aops) {
-		shmem_unacct_size(info->flags, inode->i_size);
+		shmem_unacct_size(info->flags, inode->i_size, info->shmi_ub);
 		inode->i_size = 0;
 		shmem_truncate_range(inode, 0, (loff_t)-1);
 		if (!list_empty(&info->swaplist)) {
@@ -696,7 +758,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 	radswap = swp_to_radix_entry(swap);
 	index = radix_tree_locate_item(&mapping->page_tree, radswap);
 	if (index == -1)
-		return 0;
+		return -EAGAIN;	/* tell shmem_unuse we found nothing */
 
 	/*
 	 * Move _head_ to start search for next from here.
@@ -755,7 +817,6 @@ static int shmem_unuse_inode(struct shmem_inode_info *info,
 			spin_unlock(&info->lock);
 			swap_free(swap);
 		}
-		error = 1;	/* not an error, but entry was found */
 	}
 	return error;
 }
@@ -767,7 +828,7 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 {
 	struct list_head *this, *next;
 	struct shmem_inode_info *info;
-	int found = 0;
+	struct mem_cgroup *memcg;
 	int error = 0;
 
 	/*
@@ -782,26 +843,32 @@ int shmem_unuse(swp_entry_t swap, struct page *page)
 	 * the shmem_swaplist_mutex which might hold up shmem_writepage().
 	 * Charged back to the user (not to caller) when swap account is used.
 	 */
-	error = mem_cgroup_cache_charge(page, current->mm, GFP_KERNEL);
+	error = mem_cgroup_try_charge(page, current->mm, GFP_KERNEL, &memcg);
 	if (error)
 		goto out;
 	/* No radix_tree_preload: swap entry keeps a place for page in tree */
+	error = -EAGAIN;
 
 	mutex_lock(&shmem_swaplist_mutex);
 	list_for_each_safe(this, next, &shmem_swaplist) {
 		info = list_entry(this, struct shmem_inode_info, swaplist);
 		if (info->swapped)
-			found = shmem_unuse_inode(info, swap, &page);
+			error = shmem_unuse_inode(info, swap, &page);
 		else
 			list_del_init(&info->swaplist);
 		cond_resched();
-		if (found)
+		if (error != -EAGAIN)
 			break;
+		/* found nothing in this: move on to search the next */
 	}
 	mutex_unlock(&shmem_swaplist_mutex);
 
-	if (found < 0)
-		error = found;
+	if (error) {
+		if (error != -ENOMEM)
+			error = 0;
+		mem_cgroup_cancel_charge(page, memcg);
+	} else
+		mem_cgroup_commit_charge(page, memcg, true);
 out:
 	unlock_page(page);
 	page_cache_release(page);
@@ -905,7 +972,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	}
 
 	mutex_unlock(&shmem_swaplist_mutex);
-	swapcache_free(swap, NULL);
+	swapcache_free(swap);
 redirty:
 	set_page_dirty(page);
 	if (wbc->for_reclaim)
@@ -1078,7 +1145,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
 		 */
 		oldpage = newpage;
 	} else {
-		mem_cgroup_replace_page_cache(oldpage, newpage);
+		mem_cgroup_migrate(oldpage, newpage, false);
 		lru_cache_add_anon(newpage);
 		*pagep = newpage;
 	}
@@ -1105,6 +1172,7 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
 	struct address_space *mapping = inode->i_mapping;
 	struct shmem_inode_info *info;
 	struct shmem_sb_info *sbinfo;
+	struct mem_cgroup *memcg;
 	struct page *page;
 	swp_entry_t swap;
 	int error;
@@ -1180,8 +1248,7 @@ repeat:
 				goto failed;
 		}
 
-		error = mem_cgroup_cache_charge(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (!error) {
 			error = shmem_add_to_page_cache(page, mapping, index,
 						gfp, swp_to_radix_entry(swap));
@@ -1197,12 +1264,16 @@ repeat:
 			 * Reset swap.val? No, leave it so "failed" goes back to
 			 * "repeat": reading a hole and writing should succeed.
 			 */
-			if (error)
+			if (error) {
+				mem_cgroup_cancel_charge(page, memcg);
 				delete_from_swap_cache(page);
+			}
 		}
 		if (error)
 			goto failed;
 
+		mem_cgroup_commit_charge(page, memcg, true);
+
 		spin_lock(&info->lock);
 		info->swapped--;
 		shmem_recalc_inode(inode);
@@ -1213,7 +1284,7 @@ repeat:
 		swap_free(swap);
 
 	} else {
-		if (shmem_acct_block(info->flags)) {
+		if (shmem_acct_block(info)) {
 			error = -ENOSPC;
 			goto failed;
 		}
@@ -1234,8 +1305,8 @@ repeat:
 
 		SetPageSwapBacked(page);
 		__set_page_locked(page);
-		error = mem_cgroup_cache_charge(page, current->mm,
-						gfp & GFP_RECLAIM_MASK);
+
+		error = mem_cgroup_try_charge(page, current->mm, gfp, &memcg);
 		if (error)
 			goto decused;
 		error = radix_tree_maybe_preload(gfp & GFP_RECLAIM_MASK);
@@ -1245,9 +1316,10 @@ repeat:
 			radix_tree_preload_end();
 		}
 		if (error) {
-			mem_cgroup_uncharge_cache_page(page);
+			mem_cgroup_cancel_charge(page, memcg);
 			goto decused;
 		}
+		mem_cgroup_commit_charge(page, memcg, false);
 		lru_cache_add_anon(page);
 
 		spin_lock(&info->lock);
@@ -1305,7 +1377,7 @@ decused:
 	if (sbinfo->max_blocks)
 		percpu_counter_add(&sbinfo->used_blocks, -1);
 unacct:
-	shmem_unacct_blocks(info->flags, 1);
+	shmem_unacct_blocks(info, 1);
 failed:
 	if (swap.val && error != -EINVAL &&
 	    !shmem_confirm_swap(mapping, index, swap))
@@ -1327,6 +1399,18 @@ unlock:
 	return error;
 }
 
+/*
+ * This is like autoremove_wake_function, but it removes the wait queue
+ * entry unconditionally - even if something else had already woken the
+ * target.
+ */
+static int synchronous_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
+{
+	int ret = default_wake_function(wait, mode, sync, key);
+	list_del_init(&wait->task_list);
+	return ret;
+}
+
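
synchronous_wake_function() exists because the wait queue head here lives on the stack of the task doing fallocate(): once that task has issued wake_up_all() it may return, and the head is gone. By unlinking the entry at wakeup time, a woken faulting task's finish_wait() finds the entry already empty and never dereferences the vanished head (finish_wait() only takes the queue lock when the entry is still linked). The waiter side, roughly (cf. shmem_fault() below; error handling elided):

	DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);

	prepare_to_wait(shmem_falloc_waitq, &shmem_fault_wait,
			TASK_UNINTERRUPTIBLE);
	schedule();
	/* safe even if the waker's stack frame is already gone */
	finish_wait(shmem_falloc_waitq, &shmem_fault_wait);

The WARN_ON_ONCE() added to shmem_fallocate() further below checks exactly this invariant: after wake_up_all(), the queue must be empty.
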
 static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 {
 	struct inode *inode = file_inode(vma->vm_file);
@@ -1360,7 +1444,7 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
 		    vmf->pgoff >= shmem_falloc->start &&
 		    vmf->pgoff < shmem_falloc->next) {
 			wait_queue_head_t *shmem_falloc_waitq;
-			DEFINE_WAIT(shmem_fault_wait);
+			DEFINE_WAIT_FUNC(shmem_fault_wait, synchronous_wake_function);
 
 			ret = VM_FAULT_NOPAGE;
 			if ((vmf->flags & FAULT_FLAG_ALLOW_RETRY) &&
@@ -1428,19 +1512,25 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
 
 	spin_lock(&info->lock);
 	if (lock && !(info->flags & VM_LOCKED)) {
+		if (ub_lockedshm_charge(info, inode->i_size) < 0)
+			goto out_ch;
 		if (!user_shm_lock(inode->i_size, user))
 			goto out_nomem;
 		info->flags |= VM_LOCKED;
 		mapping_set_unevictable(file->f_mapping);
 	}
 	if (!lock && (info->flags & VM_LOCKED) && user) {
+		ub_lockedshm_uncharge(info, inode->i_size);
 		user_shm_unlock(inode->i_size, user);
 		info->flags &= ~VM_LOCKED;
 		mapping_clear_unevictable(file->f_mapping);
 	}
-	retval = 0;
+	spin_unlock(&info->lock);
+	return 0;
 
 out_nomem:
+	ub_lockedshm_uncharge(info, inode->i_size);
+out_ch:
 	spin_unlock(&info->lock);
 	return retval;
 }
@@ -1906,7 +1996,8 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
 		}
 	}
 
-	offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
+	if (offset >= 0)
+		offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE);
 	mutex_unlock(&inode->i_mutex);
 	return offset;
 }
@@ -2175,6 +2266,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		spin_lock(&inode->i_lock);
 		inode->i_private = NULL;
 		wake_up_all(&shmem_falloc_waitq);
+		WARN_ON_ONCE(!list_empty(&shmem_falloc_waitq.task_list));
 		spin_unlock(&inode->i_lock);
 		error = 0;
 		goto out;
@@ -2211,11 +2303,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset,
 		struct page *page;
 
 		/*
-		 * Good, the fallocate(2) manpage permits EINTR: we may have
-		 * been interrupted because we are using up too much memory.
+		 * Although the fallocate(2) manpage permits EINTR, the more
+		 * places that use ERESTARTSYS the better. If we have been
+		 * interrupted because we are using up too much memory, the
+		 * OOM killer sent us a fatal signal and we will die anyway.
 		 */
 		if (signal_pending(current))
-			error = -EINTR;
+			error = -ERESTARTSYS;
 		else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced)
 			error = -ENOMEM;
 		else
@@ -2832,6 +2926,8 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 		}
 		if (!*this_char)
 			continue;
+		if (!strcmp(this_char, "relatime"))
+			continue;
 		if ((value = strchr(this_char,'=')) != NULL) {
 			*value++ = 0;
 		} else {
@@ -2846,7 +2942,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
 			size = memparse(value,&rest);
 			if (*rest == '%') {
 				size <<= PAGE_SHIFT;
-				size *= totalram_pages;
+				size *= tmpfs_ram_pages();
 				do_div(size, 100);
 				rest++;
 			}
@@ -3134,12 +3230,14 @@ static struct inode *shmem_alloc_inode(struct super_block *sb)
 	info = kmem_cache_alloc(shmem_inode_cachep, GFP_KERNEL);
 	if (!info)
 		return NULL;
+	info->shmi_ub = get_beancounter(get_exec_ub());
 	return &info->vfs_inode;
 }
 
 static void shmem_destroy_callback(struct rcu_head *head)
 {
 	struct inode *inode = container_of(head, struct inode, i_rcu);
+	put_beancounter(SHMEM_I(inode)->shmi_ub);
 	kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode));
 }
 
@@ -3160,7 +3258,7 @@ static int shmem_init_inodecache(void)
 {
 	shmem_inode_cachep = kmem_cache_create("shmem_inode_cache",
 				sizeof(struct shmem_inode_info),
-				0, SLAB_PANIC, shmem_init_inode);
+				0, SLAB_PANIC|SLAB_ACCOUNT, shmem_init_inode);
 	return 0;
 }
 
@@ -3264,7 +3362,6 @@ static const struct vm_operations_struct shmem_vm_ops = {
 	.set_policy     = shmem_set_policy,
 	.get_policy     = shmem_get_policy,
 #endif
-	.remap_pages	= generic_file_remap_pages,
 };
 
 static struct dentry *shmem_mount(struct file_system_type *fs_type,
@@ -3278,7 +3375,7 @@ static struct file_system_type shmem_fs_type = {
 	.name		= "tmpfs",
 	.mount		= shmem_mount,
 	.kill_sb	= kill_litter_super,
-	.fs_flags	= FS_USERNS_MOUNT,
+	.fs_flags	= FS_USERNS_MOUNT | FS_VIRTUALIZED,
 };
 
 int __init shmem_init(void)
@@ -3374,7 +3471,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range);
 #define shmem_vm_ops				generic_file_vm_ops
 #define shmem_file_operations			ramfs_file_operations
 #define shmem_get_inode(sb, dir, mode, dev, flags)	ramfs_get_inode(sb, dir, mode, dev)
-#define shmem_acct_size(flags, size)		0
+#define shmem_acct_size(flags, size, ub)	0
-#define shmem_unacct_size(flags, size)		do {} while (0)
+#define shmem_unacct_size(flags, size, ub)	do {} while (0)
 
 #endif /* CONFIG_SHMEM */
@@ -3400,7 +3497,7 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 	if (size < 0 || size > MAX_LFS_FILESIZE)
 		return ERR_PTR(-EINVAL);
 
-	if (shmem_acct_size(flags, size))
+	if (shmem_acct_size(flags, size, get_exec_ub()))
 		return ERR_PTR(-ENOMEM);
 
 	res = ERR_PTR(-ENOMEM);
@@ -3408,16 +3505,16 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 	this.len = strlen(name);
 	this.hash = 0; /* will go */
 	sb = shm_mnt->mnt_sb;
+	path.mnt = mntget(shm_mnt);
 	path.dentry = d_alloc_pseudo(sb, &this);
 	if (!path.dentry)
 		goto put_memory;
 	d_set_d_op(path.dentry, &anon_ops);
-	path.mnt = mntget(shm_mnt);
 
 	res = ERR_PTR(-ENOSPC);
 	inode = shmem_get_inode(sb, NULL, S_IFREG | S_IRWXUGO, 0, flags);
 	if (!inode)
-		goto put_dentry;
+		goto put_memory;
 
 	inode->i_flags |= i_flags;
 	d_instantiate(path.dentry, inode);
@@ -3434,10 +3531,10 @@ static struct file *__shmem_file_setup(const char *name, loff_t size,
 
 	return res;
 
+put_memory:
+	shmem_unacct_size(flags, size, get_exec_ub());
 put_dentry:
 	path_put(&path);
-put_memory:
-	shmem_unacct_size(flags, size);
 	return res;
 }
 
@@ -3483,6 +3580,9 @@ int shmem_zero_setup(struct vm_area_struct *vma)
 
 	if (vma->vm_file)
 		fput(vma->vm_file);
+	else if (vma->vm_flags & VM_WRITE)
+		uncharge_beancounter_fast(mm_ub(vma->vm_mm), UB_PRIVVMPAGES,
+					  size >> PAGE_SHIFT);
 	vma->vm_file = file;
 	vma->vm_ops = &shmem_vm_ops;
 	return 0;
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -116,6 +116,7 @@
 #include	<linux/kmemcheck.h>
 #include	<linux/memory.h>
 #include	<linux/prefetch.h>
+#include	<linux/vzstat.h>
 
 #include	<net/sock.h>
 
@@ -1748,8 +1749,12 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	if (cachep->flags & SLAB_RECLAIM_ACCOUNT)
 		flags |= __GFP_RECLAIMABLE;
 
+	if (memcg_charge_slab(cachep, flags, cachep->gfporder))
+		return NULL;
+
 	page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder);
 	if (!page) {
+		memcg_uncharge_slab(cachep, cachep->gfporder);
 		if (!(flags & __GFP_NOWARN) && printk_ratelimit())
 			slab_out_of_memory(cachep, flags, nodeid);
 		return NULL;
@@ -1772,7 +1777,6 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 		if (page_is_pfmemalloc(page))
 			SetPageSlabPfmemalloc(page + i);
 	}
-	memcg_bind_pages(cachep, cachep->gfporder);
 
 	if (kmemcheck_enabled && !(cachep->flags & SLAB_NOTRACK)) {
 		kmemcheck_alloc_shadow(page, cachep->gfporder, flags, nodeid);
@@ -1810,10 +1814,10 @@ static void kmem_freepages(struct kmem_cache *cachep, void *addr)
 		page++;
 	}
 
-	memcg_release_pages(cachep, cachep->gfporder);
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += nr_freed;
-	free_memcg_kmem_pages((unsigned long)addr, cachep->gfporder);
+	free_pages((unsigned long)addr, cachep->gfporder);
+	memcg_uncharge_slab(cachep, cachep->gfporder);
 }
 
 static void kmem_rcu_free(struct rcu_head *head)
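
The two hunks above replace the old bind/release page accounting with an explicit memcg charge that brackets the page allocator calls. The pairing, in outline (a sketch; s and order stand for the cache and its s->gfporder):

	if (memcg_charge_slab(s, flags, order))
		return NULL;			/* memcg kmem limit hit */
	page = alloc_pages_exact_node(nodeid, flags, order);
	if (!page) {
		memcg_uncharge_slab(s, order);	/* roll the charge back */
		return NULL;
	}
	...
	/* free side: return the pages first, then drop the charge */
	free_pages((unsigned long)addr, order);
	memcg_uncharge_slab(s, order);
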
@@ -2094,7 +2098,7 @@ static size_t calculate_slab_order(struct kmem_cache *cachep,
 			offslab_limit = size - sizeof(struct slab);
 			offslab_limit /= sizeof(kmem_bufctl_t);
 
- 			if (num > offslab_limit)
+			if (num > offslab_limit)
 				break;
 		}
 
@@ -2387,7 +2391,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)
 
 	err = setup_cpu_cache(cachep, gfp);
 	if (err) {
-		__kmem_cache_shutdown(cachep);
+		__kmem_cache_release(cachep);
 		return err;
 	}
 
@@ -2518,8 +2522,7 @@ out:
 	return nr_freed;
 }
 
-/* Called with slab_mutex held to protect against cpu hotplug */
-static int __cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 {
 	int ret = 0, i = 0;
 	struct kmem_cache_node *n;
@@ -2540,35 +2543,15 @@ static int __cache_shrink(struct kmem_cache *cachep)
 	return (ret ? 1 : 0);
 }
 
-/**
- * kmem_cache_shrink - Shrink a cache.
- * @cachep: The cache to shrink.
- *
- * Releases as many slabs as possible for a cache.
- * To help debugging, a zero exit status indicates all slabs were released.
- */
-int kmem_cache_shrink(struct kmem_cache *cachep)
+int __kmem_cache_shutdown(struct kmem_cache *cachep)
 {
-	int ret;
-	BUG_ON(!cachep || in_interrupt());
-
-	get_online_cpus();
-	mutex_lock(&slab_mutex);
-	ret = __cache_shrink(cachep);
-	mutex_unlock(&slab_mutex);
-	put_online_cpus();
-	return ret;
+	return __kmem_cache_shrink(cachep, false);
 }
-EXPORT_SYMBOL(kmem_cache_shrink);
 
-int __kmem_cache_shutdown(struct kmem_cache *cachep)
+void __kmem_cache_release(struct kmem_cache *cachep)
 {
 	int i;
 	struct kmem_cache_node *n;
-	int rc = __cache_shrink(cachep);
-
-	if (rc)
-		return rc;
 
 	for_each_online_cpu(i)
 	    kfree(cachep->array[i]);
@@ -2582,7 +2565,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
 			kfree(n);
 		}
 	}
-	return 0;
 }
 
 /*
@@ -3361,6 +3343,8 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	flags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(flags);
+	WARN_ON_ONCE((flags & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
 
 	if (slab_should_failslab(cachep, flags))
 		return NULL;
@@ -3404,6 +3388,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
 	if (unlikely((flags & __GFP_ZERO) && ptr))
 		memset(ptr, 0, cachep->object_size);
 
+	memcg_kmem_put_cache(cachep);
 	return ptr;
 }
 
@@ -3449,6 +3434,8 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	flags &= gfp_allowed_mask;
 
 	lockdep_trace_alloc(flags);
+	WARN_ON_ONCE((flags & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
 
 	if (slab_should_failslab(cachep, flags))
 		return NULL;
@@ -3470,6 +3457,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
 	if (unlikely((flags & __GFP_ZERO) && objp))
 		memset(objp, 0, cachep->object_size);
 
+	memcg_kmem_put_cache(cachep);
 	return objp;
 }
 
@@ -3978,8 +3966,7 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 				int batchcount, int shared, gfp_t gfp)
 {
 	int ret;
-	struct kmem_cache *c = NULL;
-	int i = 0;
+	struct kmem_cache *c;
 
 	ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
 
@@ -3989,12 +3976,10 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
 	if ((ret < 0) || !is_root_cache(cachep))
 		return ret;
 
-	VM_BUG_ON(!mutex_is_locked(&slab_mutex));
-	for_each_memcg_cache_index(i) {
-		c = cache_from_memcg(cachep, i);
-		if (c)
-			/* return value determined by the parent cache only */
-			__do_tune_cpucache(c, limit, batchcount, shared, gfp);
+	lockdep_assert_held(&slab_mutex);
+	for_each_memcg_cache(c, cachep) {
+		/* return value determined by the root cache only */
+		__do_tune_cpucache(c, limit, batchcount, shared, gfp);
 	}
 
 	return ret;
@@ -4119,6 +4104,7 @@ static void cache_reap(struct work_struct *w)
 		/* Give up. Setup the next iteration. */
 		goto out;
 
+	KSTAT_PERF_ENTER(cache_reap);
 	list_for_each_entry(searchp, &slab_caches, list) {
 		check_irq_on();
 
@@ -4159,11 +4145,80 @@ next:
 	check_irq_on();
 	mutex_unlock(&slab_mutex);
 	next_reap_node();
+	KSTAT_PERF_LEAVE(cache_reap);
 out:
 	/* Set up the next iteration */
 	schedule_delayed_work(work, round_jiffies_relative(REAPTIMEOUT_CPUC));
 }
 
+#define SHOW_TOP_SLABS	10
+
+static unsigned long get_cache_size(struct kmem_cache *cachep)
+{
+	unsigned long flags;
+	unsigned long slabs;
+	struct kmem_cache_node *n;
+	struct list_head *lh;
+	int node;
+
+	slabs = 0;
+
+	for_each_online_node(node) {
+		n = cachep->node[node];
+		if (n == NULL)
+			continue;
+
+		spin_lock_irqsave(&n->list_lock, flags);
+		list_for_each(lh, &n->slabs_full)
+			slabs++;
+		list_for_each(lh, &n->slabs_partial)
+			slabs++;
+		list_for_each(lh, &n->slabs_free)
+			slabs++;
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	return slabs * (PAGE_SIZE << cachep->gfporder) +
+		(OFF_SLAB(cachep) ?
+		 cachep->slabp_cache->size * slabs : 0);
+}
+
+void show_slab_info(void)
+{
+	int i, j;
+	unsigned long size;
+	struct kmem_cache *ptr;
+	unsigned long sizes[SHOW_TOP_SLABS];
+	struct kmem_cache *top[SHOW_TOP_SLABS];
+
+	memset(top, 0, sizeof(top));
+	memset(sizes, 0, sizeof(sizes));
+
+	printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+	/* best effort only: slab_caches is walked without slab_mutex */
+	list_for_each_entry(ptr, &slab_caches, list) {
+		size = get_cache_size(ptr);
+
+		j = 0;
+		for (i = 1; i < SHOW_TOP_SLABS; i++)
+			if (sizes[i] < sizes[j])
+				j = i;
+
+		if (size > sizes[j]) {
+			sizes[j] = size;
+			top[j] = ptr;
+		}
+	}
+
+	for (i = 0; i < SHOW_TOP_SLABS; i++)
+		if (top[i])
+			printk("%-21s: size %10lu objsize %10u\n",
+					top[i]->name, sizes[i],
+					top[i]->size);
+}
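
The top-ten selection above is a simple O(n*N) scheme: for every cache, locate the smallest entry of the current top array and replace it if the new size is larger. The same idea in isolation (an illustrative sketch; keep_top() and N are not names from the patch):

	#define N 10

	static void keep_top(unsigned long *sizes, unsigned long size)
	{
		int i, j = 0;

		/* find the smallest of the current top N... */
		for (i = 1; i < N; i++)
			if (sizes[i] < sizes[j])
				j = i;

		/* ...and evict it if the new value is larger */
		if (size > sizes[j])
			sizes[j] = size;
	}
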
+
 #ifdef CONFIG_SLABINFO
 void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo)
 {
@@ -4319,12 +4374,6 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
 
 #ifdef CONFIG_DEBUG_SLAB_LEAK
 
-static void *leaks_start(struct seq_file *m, loff_t *pos)
-{
-	mutex_lock(&slab_mutex);
-	return seq_list_start(&slab_caches, *pos);
-}
-
 static inline int add_caller(unsigned long *n, unsigned long v)
 {
 	unsigned long *p;
@@ -4445,20 +4494,10 @@ static int leaks_show(struct seq_file *m, void *p)
 	return 0;
 }
 
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
-{
-	return seq_list_next(p, &slab_caches, pos);
-}
-
-static void s_stop(struct seq_file *m, void *p)
-{
-	mutex_unlock(&slab_mutex);
-}
-
 static const struct seq_operations slabstats_op = {
-	.start = leaks_start,
-	.next = s_next,
-	.stop = s_stop,
+	.start = slab_start,
+	.next = slab_next,
+	.stop = slab_stop,
 	.show = leaks_show,
 };
 
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -4,6 +4,41 @@
  * Internal slab definitions
  */
 
+#ifdef CONFIG_SLOB
+/*
+ * Common fields provided in kmem_cache by all slab allocators
+ * This struct is either used directly by the allocator (SLOB)
+ * or the allocator must include definitions for all fields
+ * provided in kmem_cache_common in their definition of kmem_cache.
+ *
+ * Once we can do anonymous structs (C11 standard) we could put a
+ * anonymous struct definition in these allocators so that the
+ * separate allocations in the kmem_cache structure of SLAB and
+ * SLUB is no longer needed.
+ */
+struct kmem_cache {
+	unsigned int object_size;/* The original size of the object */
+	unsigned int size;	/* The aligned/padded/added on size  */
+	unsigned int align;	/* Alignment as calculated */
+	unsigned long flags;	/* Active flags on the slab */
+	const char *name;	/* Slab name for sysfs */
+	int refcount;		/* Use counter */
+	void (*ctor)(void *);	/* Called on object slot creation */
+	struct list_head list;	/* List of all slab caches on the system */
+};
+
+#endif /* CONFIG_SLOB */
+
+#ifdef CONFIG_SLAB
+#include <linux/slab_def.h>
+#endif
+
+#ifdef CONFIG_SLUB
+#include <linux/slub_def.h>
+#endif
+
+#include <linux/memcontrol.h>
+
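
The comment above looks forward to anonymous struct members. The intended shape, sketched (illustrative only; note that embedding a previously tagged struct anonymously is a GCC -fms-extensions feature, while strict C11 only allows untagged anonymous structs, so this is not yet portable kernel C):

	/* shared definition, written once */
	struct kmem_cache_common {
		unsigned int object_size;
		unsigned int size;
		/* ...remaining common fields... */
	};

	/* each allocator would then embed it anonymously: */
	struct kmem_cache {
		struct kmem_cache_common;	/* members usable as s->size */
		/* SLAB/SLUB-specific fields follow */
	};
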
 /*
  * State of the slab allocator.
  *
@@ -52,15 +87,14 @@ extern struct kmem_cache *create_kmalloc_cache(const char *name, size_t size,
 extern void create_boot_cache(struct kmem_cache *, const char *name,
 			size_t size, unsigned long flags);
 
-struct mem_cgroup;
 #ifdef CONFIG_SLUB
 struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
-		   size_t align, unsigned long flags, void (*ctor)(void *));
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+		   unsigned long flags, void (*ctor)(void *));
 #else
 static inline struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
-		   size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+		   unsigned long flags, void (*ctor)(void *))
 { return NULL; }
 #endif
 
@@ -80,10 +114,11 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
 
 #if defined(CONFIG_SLAB)
 #define SLAB_CACHE_FLAGS (SLAB_MEM_SPREAD | SLAB_NOLEAKTRACE | \
-			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | SLAB_NOTRACK)
+			  SLAB_RECLAIM_ACCOUNT | SLAB_TEMPORARY | \
+			  SLAB_NOTRACK | SLAB_ACCOUNT)
 #elif defined(CONFIG_SLUB)
 #define SLAB_CACHE_FLAGS (SLAB_NOLEAKTRACE | SLAB_RECLAIM_ACCOUNT | \
-			  SLAB_TEMPORARY | SLAB_NOTRACK)
+			  SLAB_TEMPORARY | SLAB_NOTRACK | SLAB_ACCOUNT)
 #else
 #define SLAB_CACHE_FLAGS (0)
 #endif
@@ -91,6 +126,9 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)
 
 int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
+int __kmem_cache_shrink(struct kmem_cache *, bool);
+void slab_kmem_cache_release(struct kmem_cache *);
 
 struct seq_file;
 struct file;
@@ -123,38 +161,27 @@ void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
 int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
 
 #ifdef CONFIG_MEMCG_KMEM
-static inline bool is_root_cache(struct kmem_cache *s)
-{
-	return !s->memcg_params || s->memcg_params->is_root_cache;
-}
-
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
-				     struct mem_cgroup *memcg)
-{
-	return (is_root_cache(cachep) && !memcg) ||
-				(cachep->memcg_params->memcg == memcg);
-}
+/*
+ * Iterate over all memcg caches of the given root cache. The caller must hold
+ * slab_mutex.
+ */
+#define for_each_memcg_cache(iter, root) \
+	list_for_each_entry(iter, &(root)->memcg_params.list, \
+			    memcg_params.list)
 
-static inline void memcg_bind_pages(struct kmem_cache *s, int order)
-{
-	if (!is_root_cache(s))
-		atomic_add(1 << order, &s->memcg_params->nr_pages);
-}
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+	list_for_each_entry_safe(iter, tmp, &(root)->memcg_params.list, \
+				 memcg_params.list)
 
-static inline void memcg_release_pages(struct kmem_cache *s, int order)
+static inline bool is_root_cache(struct kmem_cache *s)
 {
-	if (is_root_cache(s))
-		return;
-
-	if (atomic_sub_and_test((1 << order), &s->memcg_params->nr_pages))
-		mem_cgroup_destroy_cache(s);
+	return s->memcg_params.is_root_cache;
 }
 
 static inline bool slab_equal_or_root(struct kmem_cache *s,
-					struct kmem_cache *p)
+				      struct kmem_cache *p)
 {
-	return (p == s) ||
-		(s->memcg_params && (p == s->memcg_params->root_cache));
+	return p == s || p == s->memcg_params.root_cache;
 }
 
 /*
@@ -165,41 +192,76 @@ static inline bool slab_equal_or_root(struct kmem_cache *s,
 static inline const char *cache_name(struct kmem_cache *s)
 {
 	if (!is_root_cache(s))
-		return s->memcg_params->root_cache->name;
+		s = s->memcg_params.root_cache;
 	return s->name;
 }
 
-static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+/*
+ * Note, we protect with RCU only the memcg_caches array, not per-memcg caches.
+ * That said, the caller must ensure the memcg's cache won't go away by either
+ * taking a css reference to the owner cgroup, or holding the slab_mutex.
+ */
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
-	if (!s->memcg_params)
-		return NULL;
-	return s->memcg_params->memcg_caches[idx];
+	struct kmem_cache *cachep;
+	struct memcg_cache_array *arr;
+
+	rcu_read_lock();
+	arr = rcu_dereference(s->memcg_params.memcg_caches);
+	cachep = arr->entries[idx];
+	rcu_read_unlock();
+
+	/*
+	 * Make sure we will access the up-to-date value. The code updating
+	 * memcg_caches issues a write barrier to match this (see
+	 * memcg_create_kmem_cache()).
+	 */
+	smp_read_barrier_depends();
+	return cachep;
 }
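
The smp_read_barrier_depends() above is one half of a publish/consume pairing; the writer half is the smp_wmb() in memcg_create_kmem_cache() (see the slab_common.c hunks below). The skeleton of the pattern, as a sketch:

	/* publisher: initialize fully, then make visible */
	s = do_kmem_cache_create(...);
	smp_wmb();			/* order init before the store */
	arr->entries[idx] = s;

	/* consumer: load the pointer, then order the dereference */
	cachep = arr->entries[idx];
	smp_read_barrier_depends();
	if (cachep)
		/* all fields of *cachep are guaranteed initialized here */
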
 
 static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	if (is_root_cache(s))
 		return s;
-	return s->memcg_params->root_cache;
-}
-#else
-static inline bool is_root_cache(struct kmem_cache *s)
-{
-	return true;
+	return s->memcg_params.root_cache;
 }
 
-static inline bool cache_match_memcg(struct kmem_cache *cachep,
-				     struct mem_cgroup *memcg)
+extern int __memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, unsigned int nr_pages);
+extern void __memcg_uncharge_slab(struct kmem_cache *s, unsigned int nr_pages);
+
+static __always_inline int memcg_charge_slab(struct kmem_cache *s,
+					     gfp_t gfp, int order)
 {
-	return true;
+	if (!memcg_kmem_enabled())
+		return 0;
+	if (is_root_cache(s))
+		return 0;
+	return __memcg_charge_slab(s, gfp, 1 << order);
 }
 
-static inline void memcg_bind_pages(struct kmem_cache *s, int order)
+static __always_inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
 {
+	if (!memcg_kmem_enabled())
+		return;
+	if (is_root_cache(s))
+		return;
+	__memcg_uncharge_slab(s, 1 << order);
 }
 
-static inline void memcg_release_pages(struct kmem_cache *s, int order)
+extern void slab_init_memcg_params(struct kmem_cache *);
+
+#else /* !CONFIG_MEMCG_KMEM */
+
+#define for_each_memcg_cache(iter, root) \
+	for ((void)(iter), (void)(root); 0; )
+#define for_each_memcg_cache_safe(iter, tmp, root) \
+	for ((void)(iter), (void)(tmp), (void)(root); 0; )
+
+static inline bool is_root_cache(struct kmem_cache *s)
 {
+	return true;
 }
 
 static inline bool slab_equal_or_root(struct kmem_cache *s,
@@ -213,7 +275,8 @@ static inline const char *cache_name(struct kmem_cache *s)
 	return s->name;
 }
 
-static inline struct kmem_cache *cache_from_memcg(struct kmem_cache *s, int idx)
+static inline struct kmem_cache *
+cache_from_memcg_idx(struct kmem_cache *s, int idx)
 {
 	return NULL;
 }
@@ -222,7 +285,20 @@ static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
 {
 	return s;
 }
-#endif
+
+static inline int memcg_charge_slab(struct kmem_cache *s, gfp_t gfp, int order)
+{
+	return 0;
+}
+
+static inline void memcg_uncharge_slab(struct kmem_cache *s, int order)
+{
+}
+
+static inline void slab_init_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
 
 static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
 {
@@ -282,3 +358,8 @@ struct kmem_cache_node {
 #endif
 
 };
+
+void *slab_start(struct seq_file *m, loff_t *pos);
+void *slab_next(struct seq_file *m, void *p, loff_t *pos);
+void slab_stop(struct seq_file *m, void *p);
+int memcg_slab_show(struct mem_cgroup *memcg, struct seq_file *m, void *p);
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -20,6 +20,9 @@
 #include <asm/page.h>
 #include <linux/memcontrol.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
 #include "slab.h"
 
 enum slab_state slab_state;
@@ -27,9 +30,17 @@ LIST_HEAD(slab_caches);
 DEFINE_MUTEX(slab_mutex);
 struct kmem_cache *kmem_cache;
 
+/*
+ * Determine the size of a slab object
+ */
+unsigned int kmem_cache_size(struct kmem_cache *s)
+{
+	return s->object_size;
+}
+EXPORT_SYMBOL(kmem_cache_size);
+
 #ifdef CONFIG_DEBUG_VM
-static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
-				   size_t size)
+static int kmem_cache_sanity_check(const char *name, size_t size)
 {
 	struct kmem_cache *s = NULL;
 
@@ -55,13 +66,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
 			continue;
 		}
 
-		/*
-		 * For simplicity, we won't check this in the list of memcg
-		 * caches. We have control over memcg naming, and if there
-		 * aren't duplicates in the global list, there won't be any
-		 * duplicates in the memcg lists as well.
-		 */
-		if (!memcg && !strcmp(s->name, name)) {
+		if (!strcmp(s->name, name)) {
 			pr_err("%s (%s): Cache name already exists.\n",
 			       __func__, name);
 			dump_stack();
@@ -74,8 +79,7 @@ static int kmem_cache_sanity_check(struct mem_cgroup *memcg, const char *name,
 	return 0;
 }
 #else
-static inline int kmem_cache_sanity_check(struct mem_cgroup *memcg,
-					  const char *name, size_t size)
+static inline int kmem_cache_sanity_check(const char *name, size_t size)
 {
 	return 0;
 }
@@ -105,32 +109,99 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 }
 
 #ifdef CONFIG_MEMCG_KMEM
+void slab_init_memcg_params(struct kmem_cache *s)
+{
+	s->memcg_params.is_root_cache = true;
+	INIT_LIST_HEAD(&s->memcg_params.list);
+	RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
+}
+
+static int init_memcg_params(struct kmem_cache *s,
+		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	struct memcg_cache_array *arr;
+
+	if (memcg) {
+		s->memcg_params.is_root_cache = false;
+		s->memcg_params.memcg = memcg;
+		s->memcg_params.root_cache = root_cache;
+		return 0;
+	}
+
+	slab_init_memcg_params(s);
+
+	if (!memcg_nr_cache_ids)
+		return 0;
+
+	arr = kzalloc(sizeof(struct memcg_cache_array) +
+		      memcg_nr_cache_ids * sizeof(void *),
+		      GFP_KERNEL);
+	if (!arr)
+		return -ENOMEM;
+
+	RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
+	return 0;
+}
+
+static void destroy_memcg_params(struct kmem_cache *s)
+{
+	if (is_root_cache(s))
+		kfree(rcu_access_pointer(s->memcg_params.memcg_caches));
+}
+
+static int update_memcg_params(struct kmem_cache *s, int new_array_size)
+{
+	struct memcg_cache_array *old, *new;
+
+	if (!is_root_cache(s))
+		return 0;
+
+	new = kzalloc(sizeof(struct memcg_cache_array) +
+		      new_array_size * sizeof(void *), GFP_KERNEL);
+	if (!new)
+		return -ENOMEM;
+
+	old = rcu_dereference_protected(s->memcg_params.memcg_caches,
+					lockdep_is_held(&slab_mutex));
+	if (old)
+		memcpy(new->entries, old->entries,
+		       memcg_nr_cache_ids * sizeof(void *));
+
+	rcu_assign_pointer(s->memcg_params.memcg_caches, new);
+	if (old)
+		kfree_rcu(old, rcu);
+	return 0;
+}
+
 int memcg_update_all_caches(int num_memcgs)
 {
 	struct kmem_cache *s;
 	int ret = 0;
-	mutex_lock(&slab_mutex);
 
+	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list) {
-		if (!is_root_cache(s))
-			continue;
-
-		ret = memcg_update_cache_size(s, num_memcgs);
+		ret = update_memcg_params(s, num_memcgs);
 		/*
-		 * See comment in memcontrol.c, memcg_update_cache_size:
 		 * Instead of freeing the memory, we'll just leave the caches
 		 * up to this point in an updated state.
 		 */
 		if (ret)
-			goto out;
+			break;
 	}
-
-	memcg_update_array_size(num_memcgs);
-out:
 	mutex_unlock(&slab_mutex);
 	return ret;
 }
-#endif
+#else
+static inline int init_memcg_params(struct kmem_cache *s,
+		struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	return 0;
+}
+
+static inline void destroy_memcg_params(struct kmem_cache *s)
+{
+}
+#endif /* CONFIG_MEMCG_KMEM */
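
update_memcg_params() above grows the per-memcg cache array with the standard RCU copy-and-switch: allocate the bigger array, copy the live entries while holding slab_mutex, publish with rcu_assign_pointer(), and let kfree_rcu() free the old copy only after every unlocked reader is done with it. The reader side, for contrast (a sketch of what cache_from_memcg_idx() in slab.h does):

	rcu_read_lock();
	arr = rcu_dereference(s->memcg_params.memcg_caches);
	cachep = arr->entries[idx];
	rcu_read_unlock();	/* old arrays outlive this window */
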
 
 /*
  * Figure out what the alignment of the objects will be given a set of
@@ -159,6 +230,45 @@ unsigned long calculate_alignment(unsigned long flags,
 	return ALIGN(align, sizeof(void *));
 }
 
+static struct kmem_cache *
+do_kmem_cache_create(char *name, size_t object_size, size_t size, size_t align,
+		     unsigned long flags, void (*ctor)(void *),
+		     struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+{
+	struct kmem_cache *s;
+	int err;
+
+	err = -ENOMEM;
+	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
+	if (!s)
+		goto out;
+
+	s->name = name;
+	s->object_size = object_size;
+	s->size = size;
+	s->align = align;
+	s->ctor = ctor;
+
+	err = init_memcg_params(s, memcg, root_cache);
+	if (err)
+		goto out_free_cache;
+
+	err = __kmem_cache_create(s, flags);
+	if (err)
+		goto out_free_cache;
+
+	s->refcount = 1;
+	list_add(&s->list, &slab_caches);
+out:
+	if (err)
+		return ERR_PTR(err);
+	return s;
+
+out_free_cache:
+	destroy_memcg_params(s);
+	kfree(s);
+	goto out;
+}
 
 /*
  * kmem_cache_create - Create a cache.
@@ -184,20 +294,21 @@ unsigned long calculate_alignment(unsigned long flags,
  * cacheline.  This can be beneficial if you're counting cycles as closely
  * as davem.
  */
-
 struct kmem_cache *
-kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
-			size_t align, unsigned long flags, void (*ctor)(void *),
-			struct kmem_cache *parent_cache)
+kmem_cache_create(const char *name, size_t size, size_t align,
+		  unsigned long flags, void (*ctor)(void *))
 {
-	struct kmem_cache *s = NULL;
-	int err = 0;
+	struct kmem_cache *s;
+	char *cache_name;
+	int err;
 
 	get_online_cpus();
+	memcg_get_cache_ids();
 	mutex_lock(&slab_mutex);
 
-	if (!kmem_cache_sanity_check(memcg, name, size) == 0)
-		goto out_locked;
+	err = kmem_cache_sanity_check(name, size);
+	if (err)
+		goto out_unlock;
 
 	/*
 	 * Some allocators will constraint the set of valid flags to a subset
@@ -207,47 +318,30 @@ kmem_cache_create_memcg(struct mem_cgroup *memcg, const char *name, size_t size,
 	 */
 	flags &= CACHE_CREATE_MASK;
 
-	s = __kmem_cache_alias(memcg, name, size, align, flags, ctor);
+	s = __kmem_cache_alias(name, size, align, flags, ctor);
 	if (s)
-		goto out_locked;
-
-	s = kmem_cache_zalloc(kmem_cache, GFP_KERNEL);
-	if (s) {
-		s->object_size = s->size = size;
-		s->align = calculate_alignment(flags, align, size);
-		s->ctor = ctor;
-
-		if (memcg_register_cache(memcg, s, parent_cache)) {
-			kmem_cache_free(kmem_cache, s);
-			err = -ENOMEM;
-			goto out_locked;
-		}
-
-		s->name = kstrdup(name, GFP_KERNEL);
-		if (!s->name) {
-			kmem_cache_free(kmem_cache, s);
-			err = -ENOMEM;
-			goto out_locked;
-		}
+		goto out_unlock;
 
-		err = __kmem_cache_create(s, flags);
-		if (!err) {
-			s->refcount = 1;
-			list_add(&s->list, &slab_caches);
-			memcg_cache_list_add(memcg, s);
-		} else {
-			kfree(s->name);
-			kmem_cache_free(kmem_cache, s);
-		}
-	} else
+	cache_name = kstrdup(name, GFP_KERNEL);
+	if (!cache_name) {
 		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	s = do_kmem_cache_create(cache_name, size, size,
+				 calculate_alignment(flags, align, size),
+				 flags, ctor, NULL, NULL);
+	if (IS_ERR(s)) {
+		err = PTR_ERR(s);
+		kfree(cache_name);
+	}
 
-out_locked:
+out_unlock:
 	mutex_unlock(&slab_mutex);
+	memcg_put_cache_ids();
 	put_online_cpus();
 
 	if (err) {
-
 		if (flags & SLAB_PANIC)
 			panic("kmem_cache_create: Failed to create slab '%s'. Error %d\n",
 				name, err);
@@ -256,57 +350,243 @@ out_locked:
 				name, err);
 			dump_stack();
 		}
-
 		return NULL;
 	}
-
 	return s;
 }
+EXPORT_SYMBOL(kmem_cache_create);
 
-struct kmem_cache *
-kmem_cache_create(const char *name, size_t size, size_t align,
-		  unsigned long flags, void (*ctor)(void *))
+static int do_kmem_cache_shutdown(struct kmem_cache *s,
+		struct list_head *release, bool *need_rcu_barrier)
 {
-	return kmem_cache_create_memcg(NULL, name, size, align, flags, ctor, NULL);
+	if (__kmem_cache_shutdown(s) != 0) {
+		printk(KERN_ERR "kmem_cache_destroy %s: "
+		       "Slab cache still has objects\n", s->name);
+		dump_stack();
+		return -EBUSY;
+	}
+
+	if (s->flags & SLAB_DESTROY_BY_RCU)
+		*need_rcu_barrier = true;
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (!is_root_cache(s))
+		list_del(&s->memcg_params.list);
+#endif
+	list_move(&s->list, release);
+	return 0;
+}
+
+static void do_kmem_cache_release(struct list_head *release,
+				  bool need_rcu_barrier)
+{
+	struct kmem_cache *s, *s2;
+
+	if (need_rcu_barrier)
+		rcu_barrier();
+
+	list_for_each_entry_safe(s, s2, release, list) {
+#ifdef SLAB_SUPPORTS_SYSFS
+		sysfs_slab_remove(s);
+#else
+		slab_kmem_cache_release(s);
+#endif
+	}
+}
+
+#ifdef CONFIG_MEMCG_KMEM
+/*
+ * memcg_create_kmem_cache - Create a cache for a memory cgroup.
+ * @memcg: The memory cgroup the new cache is for.
+ * @root_cache: The parent of the new cache.
+ *
+ * This function attempts to create a kmem cache that will serve allocation
+ * requests going from @memcg to @root_cache. The new cache inherits properties
+ * from its parent.
+ */
+void memcg_create_kmem_cache(struct mem_cgroup *memcg,
+			     struct kmem_cache *root_cache)
+{
+	static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
+	struct cgroup_subsys_state *css = mem_cgroup_css(memcg);
+	struct memcg_cache_array *arr;
+	struct kmem_cache *s = NULL;
+	char *cache_name;
+	int idx;
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+
+	/*
+	 * The memory cgroup could have been deactivated while the cache
+	 * creation work was pending.
+	 */
+	if (!memcg_kmem_is_active(memcg))
+		goto out_unlock;
+
+	idx = memcg_cache_id(memcg);
+	arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
+					lockdep_is_held(&slab_mutex));
+
+	/*
+	 * Since per-memcg caches are created asynchronously on first
+	 * allocation (see memcg_kmem_get_cache()), several threads can try to
+	 * create the same cache, but only one of them may succeed.
+	 */
+	if (arr->entries[idx])
+		goto out_unlock;
+
+	rcu_read_lock();
+	strlcpy(memcg_name_buf, cgroup_name(css->cgroup), NAME_MAX + 1);
+	rcu_read_unlock();
+	cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", root_cache->name,
+			       css_id(css), memcg_name_buf);
+	if (!cache_name)
+		goto out_unlock;
+
+	s = do_kmem_cache_create(cache_name, root_cache->object_size,
+				 root_cache->size, root_cache->align,
+				 root_cache->flags, root_cache->ctor,
+				 memcg, root_cache);
+	/*
+	 * If we could not create a memcg cache, do not complain, because
+	 * that's not critical at all as we can always proceed with the root
+	 * cache.
+	 */
+	if (IS_ERR(s)) {
+		kfree(cache_name);
+		goto out_unlock;
+	}
+
+	list_add(&s->memcg_params.list, &root_cache->memcg_params.list);
+
+	/*
+	 * Since readers won't lock (see cache_from_memcg_idx()), we need a
+	 * barrier here to ensure nobody will see the kmem_cache partially
+	 * initialized.
+	 */
+	smp_wmb();
+	arr->entries[idx] = s;
+
+out_unlock:
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+}
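
The kasprintf() format above derives a per-memcg cache name from the root cache's name, the css id and the cgroup name. With hypothetical values (none of these appear in the patch):

	/* root cache "dentry", css id 87, cgroup named "web" */
	cache_name = kasprintf(GFP_KERNEL, "%s(%d:%s)", "dentry", 87, "web");
	/* -> "dentry(87:web)" */
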
+
+void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg)
+{
+	int idx;
+	struct memcg_cache_array *arr;
+	struct kmem_cache *s, *c;
+
+	idx = memcg_cache_id(memcg);
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	list_for_each_entry(s, &slab_caches, list) {
+		if (!is_root_cache(s))
+			continue;
+
+		arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
+						lockdep_is_held(&slab_mutex));
+		c = arr->entries[idx];
+		if (!c)
+			continue;
+
+		__kmem_cache_shrink(c, true);
+		arr->entries[idx] = NULL;
+	}
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+}
+
+void memcg_destroy_kmem_caches(struct mem_cgroup *memcg)
+{
+	LIST_HEAD(release);
+	bool need_rcu_barrier = false;
+	struct kmem_cache *s, *s2;
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	list_for_each_entry_safe(s, s2, &slab_caches, list) {
+		if (is_root_cache(s) || s->memcg_params.memcg != memcg)
+			continue;
+		/*
+		 * The cgroup is about to be freed and therefore has no charges
+		 * left. Hence, all its caches must be empty by now.
+		 */
+		BUG_ON(do_kmem_cache_shutdown(s, &release, &need_rcu_barrier));
+	}
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+
+	do_kmem_cache_release(&release, need_rcu_barrier);
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
+void slab_kmem_cache_release(struct kmem_cache *s)
+{
+	__kmem_cache_release(s);
+	destroy_memcg_params(s);
+	kfree(s->name);
+	kmem_cache_free(kmem_cache, s);
 }
-EXPORT_SYMBOL(kmem_cache_create);
 
 void kmem_cache_destroy(struct kmem_cache *s)
 {
+	struct kmem_cache *c, *c2;
+	LIST_HEAD(release);
+	bool need_rcu_barrier = false;
+	bool busy = false;
+
 	if (unlikely(!s))
 		return;
 
-	/* Destroy all the children caches if we aren't a memcg cache */
-	kmem_cache_destroy_memcg_children(s);
+	BUG_ON(!is_root_cache(s));
 
 	get_online_cpus();
 	mutex_lock(&slab_mutex);
+
 	s->refcount--;
-	if (!s->refcount) {
-		list_del(&s->list);
-
-		if (!__kmem_cache_shutdown(s)) {
-			mutex_unlock(&slab_mutex);
-			if (s->flags & SLAB_DESTROY_BY_RCU)
-				rcu_barrier();
-
-			memcg_release_cache(s);
-			kfree(s->name);
-			kmem_cache_free(kmem_cache, s);
-		} else {
-			list_add(&s->list, &slab_caches);
-			mutex_unlock(&slab_mutex);
-			printk(KERN_ERR "kmem_cache_destroy %s: Slab cache still has objects\n",
-				s->name);
-			dump_stack();
-		}
-	} else {
-		mutex_unlock(&slab_mutex);
+	if (s->refcount)
+		goto out_unlock;
+
+	for_each_memcg_cache_safe(c, c2, s) {
+		if (do_kmem_cache_shutdown(c, &release, &need_rcu_barrier))
+			busy = true;
 	}
+
+	if (!busy)
+		do_kmem_cache_shutdown(s, &release, &need_rcu_barrier);
+
+out_unlock:
+	mutex_unlock(&slab_mutex);
 	put_online_cpus();
+
+	do_kmem_cache_release(&release, need_rcu_barrier);
 }
 EXPORT_SYMBOL(kmem_cache_destroy);
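
kmem_cache_destroy() now gathers every cache it kills onto a private release list while holding slab_mutex and performs the actual freeing only after the locks are dropped, so a single rcu_barrier() covers the whole batch of SLAB_DESTROY_BY_RCU caches. In outline (a sketch of the flow above):

	LIST_HEAD(release);
	bool need_rcu_barrier = false;

	mutex_lock(&slab_mutex);
	/* do_kmem_cache_shutdown() moves each dead cache to @release
	 * and sets need_rcu_barrier for SLAB_DESTROY_BY_RCU caches */
	mutex_unlock(&slab_mutex);

	if (need_rcu_barrier)
		rcu_barrier();	/* one grace-period wait for all of them */
	/* each cache on @release may now be released for real */
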
 
+/**
+ * kmem_cache_shrink - Shrink a cache.
+ * @cachep: The cache to shrink.
+ *
+ * Releases as many slabs as possible for a cache.
+ * To help debugging, a zero exit status indicates all slabs were released.
+ */
+int kmem_cache_shrink(struct kmem_cache *cachep)
+{
+	int ret;
+
+	get_online_cpus();
+	mutex_lock(&slab_mutex);
+	ret = __kmem_cache_shrink(cachep, false);
+	mutex_unlock(&slab_mutex);
+	put_online_cpus();
+	return ret;
+}
+EXPORT_SYMBOL(kmem_cache_shrink);
+
 int slab_is_available(void)
 {
 	return slab_state >= UP;
@@ -322,6 +602,9 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name, size_t siz
 	s->name = name;
 	s->size = s->object_size = size;
 	s->align = calculate_alignment(flags, ARCH_KMALLOC_MINALIGN, size);
+
+	slab_init_memcg_params(s);
+
 	err = __kmem_cache_create(s, flags);
 
 	if (err)
@@ -537,9 +820,32 @@ void __init create_kmalloc_caches(unsigned long flags)
 }
 #endif /* !CONFIG_SLOB */
 
+void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
+{
+	void *ret;
+	struct page *page;
+
+	flags |= __GFP_COMP;
+	page = alloc_pages(flags, order);
+	ret = page ? page_address(page) : NULL;
+	kmemleak_alloc(ret, size, 1, flags);
+	kasan_kmalloc_large(ret, size);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_order);
+
+#ifdef CONFIG_TRACING
+void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
+{
+	void *ret = kmalloc_order(size, flags, order);
+	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
+	return ret;
+}
+EXPORT_SYMBOL(kmalloc_order_trace);
+#endif
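
kmalloc_order(), moved here together with its tracing variant, is what kmalloc() requests too large for the fixed-size kmalloc caches fall back to: the allocation goes straight to the page allocator, and __GFP_COMP makes the result a compound page so kfree() can later recover the order. A hypothetical call (sketch):

	/* a 3 MB allocation with 4 KB pages: */
	unsigned int order = get_order(3 << 20);	/* order 10, i.e. 4 MB */
	void *p = kmalloc_order(3 << 20, GFP_KERNEL, order);
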
 
 #ifdef CONFIG_SLABINFO
-void print_slabinfo_header(struct seq_file *m)
+static void print_slabinfo_header(struct seq_file *m)
 {
 	/*
 	 * Output format version, so at least we can change it
@@ -562,23 +868,18 @@ void print_slabinfo_header(struct seq_file *m)
 	seq_putc(m, '\n');
 }
 
-static void *s_start(struct seq_file *m, loff_t *pos)
+void *slab_start(struct seq_file *m, loff_t *pos)
 {
-	loff_t n = *pos;
-
 	mutex_lock(&slab_mutex);
-	if (!n)
-		print_slabinfo_header(m);
-
 	return seq_list_start(&slab_caches, *pos);
 }
 
-static void *s_next(struct seq_file *m, void *p, loff_t *pos)
+void *slab_next(struct seq_file *m, void *p, loff_t *pos)
 {
 	return seq_list_next(p, &slab_caches, pos);
 }
 
-static void s_stop(struct seq_file *m, void *p)
+void slab_stop(struct seq_file *m, void *p)
 {
 	mutex_unlock(&slab_mutex);
 }
@@ -588,16 +889,11 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
 {
 	struct kmem_cache *c;
 	struct slabinfo sinfo;
-	int i;
 
 	if (!is_root_cache(s))
 		return;
 
-	for_each_memcg_cache_index(i) {
-		c = cache_from_memcg(s, i);
-		if (!c)
-			continue;
-
+	for_each_memcg_cache(c, s) {
 		memset(&sinfo, 0, sizeof(sinfo));
 		get_slabinfo(c, &sinfo);
 
@@ -609,7 +905,7 @@ memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
 	}
 }
 
-int cache_show(struct kmem_cache *s, struct seq_file *m)
+static void cache_show(struct kmem_cache *s, struct seq_file *m)
 {
 	struct slabinfo sinfo;
 
@@ -628,17 +924,31 @@ int cache_show(struct kmem_cache *s, struct seq_file *m)
 		   sinfo.active_slabs, sinfo.num_slabs, sinfo.shared_avail);
 	slabinfo_show_stats(m, s);
 	seq_putc(m, '\n');
+}
+
+static int slab_show(struct seq_file *m, void *p)
+{
+	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
+
+	if (p == slab_caches.next)
+		print_slabinfo_header(m);
+	if (is_root_cache(s))
+		cache_show(s, m);
 	return 0;
 }
 
-static int s_show(struct seq_file *m, void *p)
+#ifdef CONFIG_MEMCG_KMEM
+int memcg_slab_show(struct mem_cgroup *memcg, struct seq_file *m, void *p)
 {
 	struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
 
-	if (!is_root_cache(s))
-		return 0;
-	return cache_show(s, m);
+	if (p == slab_caches.next)
+		print_slabinfo_header(m);
+	if (!is_root_cache(s) && s->memcg_params.memcg == memcg)
+		cache_show(s, m);
+	return 0;
 }
+#endif
 
 /*
  * slabinfo_op - iterator that generates /proc/slabinfo
@@ -654,10 +964,10 @@ static int s_show(struct seq_file *m, void *p)
  * + further values on SMP and with statistics enabled
  */
 static const struct seq_operations slabinfo_op = {
-	.start = s_start,
-	.next = s_next,
-	.stop = s_stop,
-	.show = s_show,
+	.start = slab_start,
+	.next = slab_next,
+	.stop = slab_stop,
+	.show = slab_show,
 };
 
 static int slabinfo_open(struct inode *inode, struct file *file)
@@ -680,3 +990,104 @@ static int __init slab_proc_init(void)
 }
 module_init(slab_proc_init);
 #endif /* CONFIG_SLABINFO */
+
+static __always_inline void *__do_krealloc(const void *p, size_t new_size,
+					   gfp_t flags)
+{
+	void *ret;
+	size_t ks = 0;
+
+	if (p)
+		ks = ksize(p);
+
+	if (ks >= new_size) {
+		kasan_krealloc((void *)p, new_size);
+		return (void *)p;
+	}
+
+	ret = kmalloc_track_caller(new_size, flags);
+	if (ret && p)
+		memcpy(ret, p, ks);
+
+	return ret;
+}
+
+/**
+ * __krealloc - like krealloc() but don't free @p.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * This function is like krealloc() except it never frees the originally
+ * allocated buffer. Use this if you don't want to free the buffer immediately
+ * like, for example, with RCU.
+ */
+void *__krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	if (unlikely(!new_size))
+		return ZERO_SIZE_PTR;
+
+	return __do_krealloc(p, new_size, flags);
+}
+EXPORT_SYMBOL(__krealloc);
+
+/**
+ * krealloc - reallocate memory. The contents will remain unchanged.
+ * @p: object to reallocate memory for.
+ * @new_size: how many bytes of memory are required.
+ * @flags: the type of memory to allocate.
+ *
+ * The contents of the object pointed to are preserved up to the
+ * lesser of the new and old sizes.  If @p is %NULL, krealloc()
+ * behaves exactly like kmalloc().  If @new_size is 0 and @p is not a
+ * %NULL pointer, the object pointed to is freed.
+ */
+void *krealloc(const void *p, size_t new_size, gfp_t flags)
+{
+	void *ret;
+
+	if (unlikely(!new_size)) {
+		kfree(p);
+		return ZERO_SIZE_PTR;
+	}
+
+	ret = __do_krealloc(p, new_size, flags);
+	if (ret && p != ret)
+		kfree(p);
+
+	return ret;
+}
+EXPORT_SYMBOL(krealloc);
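
Typical use of the semantics documented above (a sketch; buf and the element counts are illustrative):

	int *buf = kmalloc(16 * sizeof(*buf), GFP_KERNEL);
	...
	/* grow: the first 16 entries are preserved; on failure the
	 * original buffer is not freed, so it must not be leaked */
	int *bigger = krealloc(buf, 64 * sizeof(*buf), GFP_KERNEL);
	if (!bigger)
		kfree(buf);
	else
		buf = bigger;
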
+
+/**
+ * kzfree - like kfree but zero memory
+ * @p: object to free memory of
+ *
+ * The memory of the object @p points to is zeroed before it is freed.
+ * If @p is %NULL, kzfree() does nothing.
+ *
+ * Note: this function zeroes the whole allocated buffer, which can be a good
+ * deal bigger than the requested buffer size passed to kmalloc(). So be
+ * careful when using this function in performance-sensitive code.
+ */
+void kzfree(const void *p)
+{
+	size_t ks;
+	void *mem = (void *)p;
+
+	if (unlikely(ZERO_OR_NULL_PTR(mem)))
+		return;
+	ks = ksize(mem);
+	memset(mem, 0, ks);
+	kfree(mem);
+}
+EXPORT_SYMBOL(kzfree);
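
kzfree() exists mainly for buffers that held secrets; note, per the caveat above, that the full ksize() of the allocation is cleared, not just the bytes originally requested. Typical use (a sketch; key_len is illustrative):

	u8 *key = kmalloc(key_len, GFP_KERNEL);
	...
	kzfree(key);	/* wipe the key material, then free it */
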
+
+/* Tracepoints definitions. */
+EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kfree);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -462,11 +462,11 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
 	return ret;
 }
 
-void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+void *__kmalloc(size_t size, gfp_t gfp)
 {
-	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+	return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, _RET_IP_);
 }
-EXPORT_SYMBOL(__kmalloc_node);
+EXPORT_SYMBOL(__kmalloc);
 
 #ifdef CONFIG_TRACING
 void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
@@ -534,7 +534,7 @@ int __kmem_cache_create(struct kmem_cache *c, unsigned long flags)
 	return 0;
 }
 
-void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
+void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 {
 	void *b;
 
@@ -560,7 +560,27 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 	kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags);
 	return b;
 }
+EXPORT_SYMBOL(slob_alloc_node);
+
+void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
+{
+	return slob_alloc_node(cachep, flags, NUMA_NO_NODE);
+}
+EXPORT_SYMBOL(kmem_cache_alloc);
+
+#ifdef CONFIG_NUMA
+void *__kmalloc_node(size_t size, gfp_t gfp, int node)
+{
+	return __do_kmalloc_node(size, gfp, node, _RET_IP_);
+}
+EXPORT_SYMBOL(__kmalloc_node);
+
+void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t gfp, int node)
+{
+	return slob_alloc_node(cachep, gfp, node);
+}
 EXPORT_SYMBOL(kmem_cache_alloc_node);
+#endif
 
 static void __kmem_cache_free(void *b, int size)
 {
@@ -613,11 +633,14 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
 	return 0;
 }
 
-int kmem_cache_shrink(struct kmem_cache *d)
+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
+int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
 {
 	return 0;
 }
-EXPORT_SYMBOL(kmem_cache_shrink);
 
 struct kmem_cache kmem_cache_boot = {
 	.name = "kmem_cache",
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -20,6 +20,7 @@
 #include <linux/proc_fs.h>
 #include <linux/notifier.h>
 #include <linux/seq_file.h>
+#include <linux/kasan.h>
 #include <linux/kmemcheck.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
@@ -168,7 +169,7 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
 		SLAB_FAILSLAB)
 
 #define SLUB_MERGE_SAME (SLAB_DEBUG_FREE | SLAB_RECLAIM_ACCOUNT | \
-		SLAB_CACHE_DMA | SLAB_NOTRACK)
+		SLAB_CACHE_DMA | SLAB_NOTRACK | SLAB_ACCOUNT)
 
 #define OO_SHIFT	16
 #define OO_MASK		((1 << OO_SHIFT) - 1)
@@ -201,14 +202,11 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
 #ifdef CONFIG_SYSFS
 static int sysfs_slab_add(struct kmem_cache *);
 static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void sysfs_slab_remove(struct kmem_cache *);
 static void memcg_propagate_slab_attrs(struct kmem_cache *s);
 #else
 static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
 static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
 							{ return 0; }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
-
 static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
 #endif
 
@@ -459,6 +457,8 @@ static void get_map(struct kmem_cache *s, struct page *page, unsigned long *map)
  */
 #ifdef CONFIG_SLUB_DEBUG_ON
 static int slub_debug = DEBUG_DEFAULT_FLAGS;
+#elif defined(CONFIG_KASAN)
+static int slub_debug = SLAB_STORE_USER;
 #else
 static int slub_debug;
 #endif
@@ -467,12 +467,30 @@ static char *slub_debug_slabs;
 static int disable_higher_order_debug;
 
 /*
+ * slub is about to manipulate internal object metadata.  This memory lies
+ * outside the range of the allocated object, so accessing it would normally
+ * be reported by kasan as a bounds error.  metadata_access_enable() is used
+ * to tell kasan that these accesses are OK.
+ */
+static inline void metadata_access_enable(void)
+{
+	kasan_disable_current();
+}
+
+static inline void metadata_access_disable(void)
+{
+	kasan_enable_current();
+}
+
+/*
  * Object debugging
  */
 static void print_section(char *text, u8 *addr, unsigned int length)
 {
+	metadata_access_enable();
 	print_hex_dump(KERN_ERR, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
 			length, 1);
+	metadata_access_disable();
 }
 
 static struct track *get_track(struct kmem_cache *s, void *object,
@@ -502,7 +520,9 @@ static void set_track(struct kmem_cache *s, void *object,
 		trace.max_entries = TRACK_ADDRS_COUNT;
 		trace.entries = p->addrs;
 		trace.skip = 3;
+		metadata_access_enable();
 		save_stack_trace(&trace);
+		metadata_access_disable();
 
 		/* See rant in lockdep.c */
 		if (trace.nr_entries != 0 &&
@@ -628,7 +648,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
 	dump_stack();
 }
 
-static void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
+void object_err(struct kmem_cache *s, struct page *page,
 			u8 *object, char *reason)
 {
 	slab_bug(s, "%s", reason);
@@ -675,7 +695,9 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
 	u8 *fault;
 	u8 *end;
 
+	metadata_access_enable();
 	fault = memchr_inv(start, value, bytes);
+	metadata_access_disable();
 	if (!fault)
 		return 1;
 
@@ -768,7 +790,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
 	if (!remainder)
 		return 1;
 
+	metadata_access_enable();
 	fault = memchr_inv(end - remainder, POISON_INUSE, remainder);
+	metadata_access_disable();
 	if (!fault)
 		return 1;
 	while (end > fault && end[-1] == POISON_INUSE)
@@ -931,60 +955,7 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
 }
 
 /*
- * Hooks for other subsystems that check memory allocations. In a typical
- * production configuration these hooks all should produce no code at all.
- */
-static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
-{
-	flags &= gfp_allowed_mask;
-	lockdep_trace_alloc(flags);
-	might_sleep_if(flags & __GFP_WAIT);
-
-	return should_failslab(s->object_size, flags, s->flags);
-}
-
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
-					size_t size, void **p)
-{
-	size_t i;
-
-	flags &= gfp_allowed_mask;
-	for (i = 0; i < size; i++) {
-		void *object = p[i];
-
-		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
-		kmemleak_alloc_recursive(object, s->object_size, 1,
-					 s->flags, flags);
-	}
-}
-
-static inline void slab_free_hook(struct kmem_cache *s, void *x)
-{
-	kmemleak_free_recursive(x, s->flags);
-
-	/*
-	 * Trouble is that we may no longer disable interupts in the fast path
-	 * So in order to make the debug calls that expect irqs to be
-	 * disabled we need to disable interrupts temporarily.
-	 */
-#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
-	{
-		unsigned long flags;
-
-		local_irq_save(flags);
-		kmemcheck_slab_free(s, x, s->object_size);
-		debug_check_no_locks_freed(x, s->object_size);
-		local_irq_restore(flags);
-	}
-#endif
-	if (!(s->flags & SLAB_DEBUG_OBJECTS))
-		debug_check_no_obj_freed(x, s->object_size);
-}
-
-/*
  * Tracking of fully allocated slabs for debugging purposes.
- *
- * list_lock must be held.
  */
 static void add_full(struct kmem_cache *s,
 	struct kmem_cache_node *n, struct page *page)
@@ -992,17 +963,16 @@ static void add_full(struct kmem_cache *s,
 	if (!(s->flags & SLAB_STORE_USER))
 		return;
 
+	lockdep_assert_held(&n->list_lock);
 	list_add(&page->lru, &n->full);
 }
 
-/*
- * list_lock must be held.
- */
-static void remove_full(struct kmem_cache *s, struct page *page)
+static void remove_full(struct kmem_cache *s, struct kmem_cache_node *n, struct page *page)
 {
 	if (!(s->flags & SLAB_STORE_USER))
 		return;
 
+	lockdep_assert_held(&n->list_lock);
 	list_del(&page->lru);
 }
 
@@ -1266,7 +1236,8 @@ static inline int check_object(struct kmem_cache *s, struct page *page,
 			void *object, u8 val) { return 1; }
 static inline void add_full(struct kmem_cache *s, struct kmem_cache_node *n,
 					struct page *page) {}
-static inline void remove_full(struct kmem_cache *s, struct page *page) {}
+static inline void remove_full(struct kmem_cache *s, struct kmem_cache_node *n,
+					struct page *page) {}
 static inline unsigned long kmem_cache_flags(unsigned long object_size,
 	unsigned long flags, const char *name,
 	void (*ctor)(void *))
@@ -1285,16 +1256,62 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
 							int objects) {}
 static inline void dec_slabs_node(struct kmem_cache *s, int node,
 							int objects) {}
-
+#endif /* CONFIG_SLUB_DEBUG */
+/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ */
 static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
-							{ return 0; }
+{
+	flags &= gfp_allowed_mask;
+	lockdep_trace_alloc(flags);
+	might_sleep_if(flags & __GFP_WAIT);
+	WARN_ON_ONCE((flags & __GFP_FS) && current->journal_info &&
+		!(current->flags & PF_MEMALLOC));
+
+	return should_failslab(s->object_size, flags, s->flags);
+}
 
 static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
-		void *object) {}
+		size_t size, void **p)
+{
+	size_t i;
 
-static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
+	flags &= gfp_allowed_mask;
+	for (i = 0; i < size; i++) {
+		void *object = p[i];
 
-#endif /* CONFIG_SLUB_DEBUG */
+		kmemcheck_slab_alloc(s, flags, object, slab_ksize(s));
+		kmemleak_alloc_recursive(object, s->object_size, 1,
+				s->flags, flags);
+		kasan_slab_alloc(s, object);
+	}
+}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+	kmemleak_free_recursive(x, s->flags);
+
+	/*
+	 * Trouble is that we may no longer disable interrupts in the fast path
+	 * So in order to make the debug calls that expect irqs to be
+	 * disabled we need to disable interrupts temporarily.
+	 */
+#if defined(CONFIG_KMEMCHECK) || defined(CONFIG_LOCKDEP)
+	{
+		unsigned long flags;
+
+		local_irq_save(flags);
+		kmemcheck_slab_free(s, x, s->object_size);
+		debug_check_no_locks_freed(x, s->object_size);
+		local_irq_restore(flags);
+	}
+#endif
+	if (!(s->flags & SLAB_DEBUG_OBJECTS))
+		debug_check_no_obj_freed(x, s->object_size);
+
+	kasan_slab_free(s, x);
+}
 
 static inline void slab_free_freelist_hook(struct kmem_cache *s,
 					   void *head, void *tail)
@@ -1322,17 +1339,26 @@ static inline void slab_free_freelist_hook(struct kmem_cache *s,
 /*
  * Slab allocation and freeing
  */
-static inline struct page *alloc_slab_page(gfp_t flags, int node,
-					struct kmem_cache_order_objects oo)
+static inline struct page *alloc_slab_page(struct kmem_cache *s,
+		gfp_t flags, int node, struct kmem_cache_order_objects oo)
 {
+	struct page *page;
 	int order = oo_order(oo);
 
 	flags |= __GFP_NOTRACK;
 
+	if (memcg_charge_slab(s, flags, order))
+		return NULL;
+
 	if (node == NUMA_NO_NODE)
-		return alloc_pages(flags, order);
+		page = alloc_pages(flags, order);
 	else
-		return alloc_pages_exact_node(node, flags, order);
+		page = alloc_pages_exact_node(node, flags, order);
+
+	if (!page)
+		memcg_uncharge_slab(s, order);
+
+	return page;
 }
 
 static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1354,14 +1380,14 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node)
 	 */
 	alloc_gfp = (flags | __GFP_NOWARN | __GFP_NORETRY) & ~__GFP_NOFAIL;
 
-	page = alloc_slab_page(alloc_gfp, node, oo);
+	page = alloc_slab_page(s, alloc_gfp, node, oo);
 	if (unlikely(!page)) {
 		oo = s->min;
 		/*
 		 * Allocation may have failed due to fragmentation.
 		 * Try a lower order alloc if possible
 		 */
-		page = alloc_slab_page(flags, node, oo);
+		page = alloc_slab_page(s, flags, node, oo);
 
 		if (page)
 			stat(s, ORDER_FALLBACK);
@@ -1401,8 +1427,11 @@ static void setup_object(struct kmem_cache *s, struct page *page,
 				void *object)
 {
 	setup_object_debug(s, page, object);
-	if (unlikely(s->ctor))
+	if (unlikely(s->ctor)) {
+		kasan_unpoison_object_data(s, object);
 		s->ctor(object);
+		kasan_poison_object_data(s, object);
+	}
 }
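
The bracketing around the constructor is needed because objects in a
freshly poisoned slab are off-limits to any write, including the ctor's
own initialization. The pattern, sketched with a hypothetical init_fn:

	kasan_unpoison_object_data(s, object);	/* permit the writes below */
	init_fn(object);			/* hypothetical constructor */
	kasan_poison_object_data(s, object);	/* re-arm KASAN until alloc */
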
 
 static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
@@ -1422,7 +1451,6 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 
 	order = compound_order(page);
 	inc_slabs_node(s, page_to_nid(page), page->objects);
-	memcg_bind_pages(s, order);
 	page->slab_cache = s;
 	__SetPageSlab(page);
 	if (page_is_pfmemalloc(page))
@@ -1433,6 +1461,8 @@ static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
 	if (unlikely(s->flags & SLAB_POISON))
 		memset(start, POISON_INUSE, PAGE_SIZE << order);
 
+	kasan_poison_slab(page);
+
 	last = start;
 	for_each_object(p, s, start, page->objects) {
 		setup_object(s, page, last);
@@ -1473,11 +1503,11 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
 	__ClearPageSlabPfmemalloc(page);
 	__ClearPageSlab(page);
 
-	memcg_release_pages(s, order);
 	page_mapcount_reset(page);
 	if (current->reclaim_state)
 		current->reclaim_state->reclaimed_slab += pages;
-	__free_memcg_kmem_pages(page, order);
+	__free_pages(page, order);
+	memcg_uncharge_slab(s, order);
 }
 
 #define need_reserve_slab_rcu						\
@@ -1526,11 +1556,9 @@ static void discard_slab(struct kmem_cache *s, struct page *page)
 
 /*
  * Management of partially allocated slabs.
- *
- * list_lock must be held.
  */
-static inline void add_partial(struct kmem_cache_node *n,
-				struct page *page, int tail)
+static inline void
+__add_partial(struct kmem_cache_node *n, struct page *page, int tail)
 {
 	n->nr_partial++;
 	if (tail == DEACTIVATE_TO_TAIL)
@@ -1539,12 +1567,17 @@ static inline void add_partial(struct kmem_cache_node *n,
 		list_add(&page->lru, &n->partial);
 }
 
-/*
- * list_lock must be held.
- */
+static inline void add_partial(struct kmem_cache_node *n,
+				struct page *page, int tail)
+{
+	lockdep_assert_held(&n->list_lock);
+	__add_partial(n, page, tail);
+}
+
 static inline void remove_partial(struct kmem_cache_node *n,
 					struct page *page)
 {
+	lockdep_assert_held(&n->list_lock);
 	list_del(&page->lru);
 	n->nr_partial--;
 }
@@ -1554,8 +1587,6 @@ static inline void remove_partial(struct kmem_cache_node *n,
  * return the pointer to the freelist.
  *
  * Returns a list of objects or NULL if it fails.
- *
- * Must hold list_lock since we modify the partial list.
  */
 static inline void *acquire_slab(struct kmem_cache *s,
 		struct kmem_cache_node *n, struct page *page,
@@ -1565,6 +1596,8 @@ static inline void *acquire_slab(struct kmem_cache *s,
 	unsigned long counters;
 	struct page new;
 
+	lockdep_assert_held(&n->list_lock);
+
 	/*
 	 * Zap the freelist and set the frozen bit.
 	 * The old freelist is the list of objects for the
@@ -1876,7 +1909,7 @@ redo:
 
 	new.frozen = 0;
 
-	if (!new.inuse && n->nr_partial > s->min_partial)
+	if (!new.inuse && n->nr_partial >= s->min_partial)
 		m = M_FREE;
 	else if (new.freelist) {
 		m = M_PARTIAL;
@@ -1910,7 +1943,7 @@ redo:
 
 		else if (l == M_FULL)
 
-			remove_full(s, page);
+			remove_full(s, n, page);
 
 		if (m == M_PARTIAL) {
 
@@ -1986,7 +2019,7 @@ static void unfreeze_partials(struct kmem_cache *s,
 				new.freelist, new.counters,
 				"unfreezing slab"));
 
-		if (unlikely(!new.inuse && n->nr_partial > s->min_partial)) {
+		if (unlikely(!new.inuse && n->nr_partial >= s->min_partial)) {
 			page->next = discard_page;
 			discard_page = page;
 		} else {
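
The comparison change from '>' to '>=' in this and the neighbouring
hunks is easy to misread; a worked instance, assuming s->min_partial
is 5:

	/*
	 * nr_partial == 5 and a slab just became empty:
	 *   old: 5 > 5  is false -> keep the empty slab on the partial list
	 *   new: 5 >= 5 is true  -> free the empty slab
	 * min_partial now bounds the partial slabs that are kept; the
	 * empty slab itself no longer counts toward that minimum.
	 */
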
@@ -2023,6 +2056,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 	int pages;
 	int pobjects;
 
+	preempt_disable();
 	do {
 		pages = 0;
 		pobjects = 0;
@@ -2055,6 +2089,15 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain)
 		page->next = oldpage;
 
 	} while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage);
+
+	if (unlikely(!s->cpu_partial)) {
+		unsigned long flags;
+
+		local_irq_save(flags);
+		unfreeze_partials(s, this_cpu_ptr(s->cpu_slab));
+		local_irq_restore(flags);
+	}
+	preempt_enable();
 }
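
put_cpu_partial() now wraps the whole cmpxchg loop in preempt_disable()/
preempt_enable() so that the optional drain at the end runs on the same
CPU the page was queued on. A condensed sketch of the idiom, using a
hypothetical per-cpu stack:

	static DEFINE_PER_CPU(struct item *, pushed);

	/* Sketch only: lockless push onto a per-cpu list. Pinning the task
	 * keeps "this CPU" stable across the read-modify-write loop and any
	 * follow-up work on the list we just pushed to. */
	static void percpu_push(struct item *it)
	{
		struct item *old;

		preempt_disable();
		do {
			old = this_cpu_read(pushed);
			it->next = old;
		} while (this_cpu_cmpxchg(pushed, old, it) != old);
		preempt_enable();
	}
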
 
 static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
@@ -2480,6 +2523,7 @@ redo:
 		memset(object, 0, s->object_size);
 
 	slab_post_alloc_hook(s, gfpflags, 1, &object);
+	memcg_kmem_put_cache(s);
 
 	return object;
 }
@@ -2505,17 +2549,10 @@ void *kmem_cache_alloc_trace(struct kmem_cache *s, gfp_t gfpflags, size_t size)
 {
 	void *ret = slab_alloc(s, gfpflags, _RET_IP_);
 	trace_kmalloc(_RET_IP_, ret, size, s->size, gfpflags);
+	kasan_kmalloc(s, ret, size);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_trace);
-
-void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
-{
-	void *ret = kmalloc_order(size, flags, order);
-	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
-	return ret;
-}
-EXPORT_SYMBOL(kmalloc_order_trace);
 #endif
 
 #ifdef CONFIG_NUMA
@@ -2539,6 +2576,8 @@ void *kmem_cache_alloc_node_trace(struct kmem_cache *s,
 
 	trace_kmalloc_node(_RET_IP_, ret,
 			   size, s->size, gfpflags, node);
+
+	kasan_kmalloc(s, ret, size);
 	return ret;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node_trace);
@@ -2585,7 +2624,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 		new.inuse -= cnt;
 		if ((!new.inuse || !prior) && !was_frozen) {
 
-			if (!kmem_cache_debug(s) && !prior)
+			if (!kmem_cache_debug(s) && !prior) {
 
 				/*
 				 * Slab was on no list before and will be partially empty
@@ -2593,7 +2632,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 				 */
 				new.frozen = 1;
 
-			else { /* Needs to be taken off a list */
+			} else { /* Needs to be taken off a list */
 
 	                        n = get_node(s, page_to_nid(page));
 				/*
@@ -2633,7 +2672,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
                 return;
         }
 
-	if (unlikely(!new.inuse && n->nr_partial > s->min_partial))
+	if (unlikely(!new.inuse && n->nr_partial >= s->min_partial))
 		goto slab_empty;
 
 	/*
@@ -2641,7 +2680,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
 	 * then add it.
 	 */
 	if (kmem_cache_debug(s) && unlikely(!prior)) {
-		remove_full(s, page);
+		remove_full(s, n, page);
 		add_partial(n, page, DEACTIVATE_TO_TAIL);
 		stat(s, FREE_ADD_PARTIAL);
 	}
@@ -2655,9 +2694,10 @@ slab_empty:
 		 */
 		remove_partial(n, page);
 		stat(s, FREE_REMOVE_PARTIAL);
-	} else
+	} else {
 		/* Slab must be on the full list */
-		remove_full(s, page);
+		remove_full(s, n, page);
+	}
 
 	spin_unlock_irqrestore(&n->list_lock, flags);
 	stat(s, FREE_SLAB);
@@ -3087,10 +3127,16 @@ static void early_kmem_cache_node_alloc(int node)
 	init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
 	init_tracking(kmem_cache_node, n);
 #endif
+	kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node));
 	init_kmem_cache_node(n);
 	inc_slabs_node(kmem_cache_node, node, page->objects);
 
-	add_partial(n, page, DEACTIVATE_TO_HEAD);
+	/*
+	 * No locks need to be taken here as it has just been
+	 * initialized and there is no concurrent access.
+	 */
+
+	__add_partial(n, page, DEACTIVATE_TO_HEAD);
 }
 
 static void free_kmem_cache_nodes(struct kmem_cache *s)
@@ -3107,6 +3153,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
 	}
 }
 
+void __kmem_cache_release(struct kmem_cache *s)
+{
+	free_percpu(s->cpu_slab);
+	free_kmem_cache_nodes(s);
+}
+
 static int init_kmem_cache_nodes(struct kmem_cache *s)
 {
 	int node;
@@ -3367,28 +3419,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
 
 /*
  * Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because sysfs files might still access the partial list after shutdown
+ * has begun.
  */
 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 {
 	struct page *page, *h;
 
+	BUG_ON(irqs_disabled());
+	spin_lock_irq(&n->list_lock);
 	list_for_each_entry_safe(page, h, &n->partial, lru) {
 		if (!page->inuse) {
 			remove_partial(n, page);
 			discard_slab(s, page);
 		} else {
 			list_slab_objects(s, page,
-			"Objects remaining in %s on kmem_cache_close()");
+			"Objects remaining in %s on __kmem_cache_shutdown()");
 		}
 	}
+	spin_unlock_irq(&n->list_lock);
 }
 
 /*
  * Release all resources used by a slab cache.
  */
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
 {
 	int node;
 
@@ -3401,33 +3456,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 		if (n->nr_partial || slabs_node(s, node))
 			return 1;
 	}
-	free_percpu(s->cpu_slab);
-	free_kmem_cache_nodes(s);
 	return 0;
 }
 
-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
-	int rc = kmem_cache_close(s);
-
-	if (!rc) {
-		/*
-		 * Since slab_attr_store may take the slab_mutex, we should
-		 * release the lock while removing the sysfs entry in order to
-		 * avoid a deadlock. Because this is pretty much the last
-		 * operation we do and the lock will be released shortly after
-		 * that in slab_common.c, we could just move sysfs_slab_remove
-		 * to a later point in common code. We should do that when we
-		 * have a common sysfs framework for all allocators.
-		 */
-		mutex_unlock(&slab_mutex);
-		sysfs_slab_remove(s);
-		mutex_lock(&slab_mutex);
-	}
-
-	return rc;
-}
-
 /********************************************************************
  *		Kmalloc subsystem
  *******************************************************************/
@@ -3485,6 +3516,8 @@ void *__kmalloc(size_t size, gfp_t flags)
 
 	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
+	kasan_kmalloc(s, ret, size);
+
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc);
@@ -3495,12 +3528,13 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
 	struct page *page;
 	void *ptr = NULL;
 
-	flags |= __GFP_COMP | __GFP_NOTRACK | __GFP_KMEMCG;
+	flags |= __GFP_COMP | __GFP_NOTRACK;
 	page = alloc_pages_node(node, flags, get_order(size));
 	if (page)
 		ptr = page_address(page);
 
 	kmemleak_alloc(ptr, size, 1, flags);
+	kasan_kmalloc_large(ptr, size);
 	return ptr;
 }
 
@@ -3528,12 +3562,14 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 
 	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
+	kasan_kmalloc(s, ret, size);
+
 	return ret;
 }
 EXPORT_SYMBOL(__kmalloc_node);
 #endif
 
-size_t ksize(const void *object)
+static size_t __ksize(const void *object)
 {
 	struct page *page;
 
@@ -3549,6 +3585,15 @@ size_t ksize(const void *object)
 
 	return slab_ksize(page->slab_cache);
 }
+
+size_t ksize(const void *object)
+{
+	size_t size = __ksize(object);
+	/* We assume that ksize callers could use the whole allocated area,
+	 * so we need to unpoison this area.
+	 */
+	kasan_krealloc(object, size);
+	return size;
+}
 EXPORT_SYMBOL(ksize);
 
 #ifdef CONFIG_SLUB_DEBUG
@@ -3601,37 +3646,53 @@ void kfree(const void *x)
 	if (unlikely(!PageSlab(page))) {
 		BUG_ON(!PageCompound(page));
 		kmemleak_free(x);
-		__free_memcg_kmem_pages(page, compound_order(page));
+		kasan_kfree_large(x);
+		__free_pages(page, compound_order(page));
 		return;
 	}
 	slab_free(page->slab_cache, page, object, NULL, 1, _RET_IP_);
 }
 EXPORT_SYMBOL(kfree);
 
+#define SHRINK_PROMOTE_MAX 32
+
 /*
- * kmem_cache_shrink removes empty slabs from the partial lists and sorts
- * the remaining slabs by the number of items in use. The slabs with the
- * most items in use come first. New allocations will then fill those up
- * and thus they can be removed from the partial lists.
+ * kmem_cache_shrink discards empty slabs and promotes the slabs filled
+ * up most to the head of the partial lists. New allocations will then
+ * fill those up and thus they can be removed from the partial lists.
  *
  * The slabs with the least items are placed last. This results in them
  * being allocated from last increasing the chance that the last objects
  * are freed in them.
  */
-int kmem_cache_shrink(struct kmem_cache *s)
+int __kmem_cache_shrink(struct kmem_cache *s, bool deactivate)
 {
 	int node;
 	int i;
 	struct kmem_cache_node *n;
 	struct page *page;
 	struct page *t;
-	int objects = oo_objects(s->max);
-	struct list_head *slabs_by_inuse =
-		kmalloc(sizeof(struct list_head) * objects, GFP_KERNEL);
+	LIST_HEAD(discard);
+	struct list_head promote[SHRINK_PROMOTE_MAX];
 	unsigned long flags;
 
-	if (!slabs_by_inuse)
-		return -ENOMEM;
+	if (deactivate) {
+		/*
+		 * Disable empty slabs caching. Used to avoid pinning offline
+		 * memory cgroups by kmem pages that can be freed.
+		 */
+		s->cpu_partial = 0;
+		s->min_partial = 0;
+
+		/*
+		 * s->cpu_partial is checked locklessly (see put_cpu_partial),
+		 * so we have to make sure the change is visible.
+		 */
+		kick_all_cpus_sync();
+	}
+
+	for (i = 0; i < SHRINK_PROMOTE_MAX; i++)
+		INIT_LIST_HEAD(promote + i);
 
 	flush_all(s);
 	for_each_node_state(node, N_NORMAL_MEMORY) {
@@ -3640,41 +3701,48 @@ int kmem_cache_shrink(struct kmem_cache *s)
 		if (!n->nr_partial)
 			continue;
 
-		for (i = 0; i < objects; i++)
-			INIT_LIST_HEAD(slabs_by_inuse + i);
-
 		spin_lock_irqsave(&n->list_lock, flags);
 
 		/*
-		 * Build lists indexed by the items in use in each slab.
+		 * Build lists of slabs to discard or promote.
 		 *
 		 * Note that concurrent frees may occur while we hold the
 		 * list_lock. page->inuse here is the upper limit.
 		 */
 		list_for_each_entry_safe(page, t, &n->partial, lru) {
-			list_move(&page->lru, slabs_by_inuse + page->inuse);
-			if (!page->inuse)
+			int free = page->objects - page->inuse;
+
+			/* Do not reread page->inuse */
+			barrier();
+
+			/* We do not keep full slabs on the list */
+			BUG_ON(free <= 0);
+
+			if (free == page->objects) {
+				list_move(&page->lru, &discard);
 				n->nr_partial--;
+			} else if (free <= SHRINK_PROMOTE_MAX)
+				list_move(&page->lru, promote + free - 1);
 		}
 
 		/*
-		 * Rebuild the partial list with the slabs filled up most
-		 * first and the least used slabs at the end.
+		 * Promote the slabs filled up most to the head of the
+		 * partial list.
 		 */
-		for (i = objects - 1; i > 0; i--)
-			list_splice(slabs_by_inuse + i, n->partial.prev);
+		for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--)
+			list_splice_init(promote + i, &n->partial);
 
 		spin_unlock_irqrestore(&n->list_lock, flags);
 
 		/* Release empty slabs */
-		list_for_each_entry_safe(page, t, slabs_by_inuse, lru)
+		list_for_each_entry_safe(page, t, &discard, lru)
 			discard_slab(s, page);
+
+		INIT_LIST_HEAD(&discard);
 	}
 
-	kfree(slabs_by_inuse);
 	return 0;
 }
-EXPORT_SYMBOL(kmem_cache_shrink);
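
To see why the promotion loop above splices from high index to low:
list_splice_init() adds at the head of n->partial, so later splices land
in front. A hypothetical three-bucket walk-through:

	/*
	 * promote[0] holds slabs with 1 free object, promote[1] with 2,
	 * promote[2] with 3; n->partial starts out empty.
	 *
	 *   splice promote[2]: partial = {3 free}
	 *   splice promote[1]: partial = {2 free, 3 free}
	 *   splice promote[0]: partial = {1 free, 2 free, 3 free}
	 *
	 * The fullest slabs end up first and are allocated from first,
	 * while the emptier ones at the tail drain out and are eventually
	 * discarded.
	 */
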
 
 static int slab_mem_going_offline_callback(void *arg)
 {
@@ -3682,7 +3750,7 @@ static int slab_mem_going_offline_callback(void *arg)
 
 	mutex_lock(&slab_mutex);
 	list_for_each_entry(s, &slab_caches, list)
-		kmem_cache_shrink(s);
+		__kmem_cache_shrink(s, false);
 	mutex_unlock(&slab_mutex);
 
 	return 0;
@@ -3832,6 +3900,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
 #endif
 		}
 	}
+	slab_init_memcg_params(s);
 	list_add(&s->list, &slab_caches);
 	return s;
 }
@@ -3896,6 +3965,9 @@ static int slab_unmergeable(struct kmem_cache *s)
 	if (slub_nomerge || (s->flags & SLUB_NEVER_MERGE))
 		return 1;
 
+	if (!is_root_cache(s))
+		return 1;
+
 	if (s->ctor)
 		return 1;
 
@@ -3908,9 +3980,8 @@ static int slab_unmergeable(struct kmem_cache *s)
 	return 0;
 }
 
-static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
-		size_t align, unsigned long flags, const char *name,
-		void (*ctor)(void *))
+static struct kmem_cache *find_mergeable(size_t size, size_t align,
+		unsigned long flags, const char *name, void (*ctor)(void *))
 {
 	struct kmem_cache *s;
 
@@ -3933,7 +4004,7 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
 			continue;
 
 		if ((flags & SLUB_MERGE_SAME) != (s->flags & SLUB_MERGE_SAME))
-				continue;
+			continue;
 		/*
 		 * Check if alignment is compatible.
 		 * Courtesy of Adrian Drzewiecki
@@ -3944,23 +4015,21 @@ static struct kmem_cache *find_mergeable(struct mem_cgroup *memcg, size_t size,
 		if (s->size - size >= sizeof(void *))
 			continue;
 
-		if (!cache_match_memcg(s, memcg))
-			continue;
-
 		return s;
 	}
 	return NULL;
 }
 
 struct kmem_cache *
-__kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
-		   size_t align, unsigned long flags, void (*ctor)(void *))
+__kmem_cache_alias(const char *name, size_t size, size_t align,
+		   unsigned long flags, void (*ctor)(void *))
 {
-	struct kmem_cache *s;
+	struct kmem_cache *s, *c;
 
-	s = find_mergeable(memcg, size, align, flags, name, ctor);
+	s = find_mergeable(size, align, flags, name, ctor);
 	if (s) {
 		s->refcount++;
+
 		/*
 		 * Adjust the object sizes so that we clear
 		 * the complete object on kzalloc.
@@ -3968,6 +4037,12 @@ __kmem_cache_alias(struct mem_cgroup *memcg, const char *name, size_t size,
 		s->object_size = max(s->object_size, (int)size);
 		s->inuse = max_t(int, s->inuse, ALIGN(size, sizeof(void *)));
 
+		for_each_memcg_cache(c, s) {
+			c->object_size = s->object_size;
+			c->inuse = max_t(int, c->inuse,
+					 ALIGN(size, sizeof(void *)));
+		}
+
 		if (sysfs_slab_alias(s, name)) {
 			s->refcount--;
 			s = NULL;
@@ -3992,7 +4067,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
 	memcg_propagate_slab_attrs(s);
 	err = sysfs_slab_add(s);
 	if (err)
-		kmem_cache_close(s);
+		__kmem_cache_release(s);
 
 	return err;
 }
@@ -4999,12 +5074,9 @@ static ssize_t shrink_show(struct kmem_cache *s, char *buf)
 static ssize_t shrink_store(struct kmem_cache *s,
 			const char *buf, size_t length)
 {
-	if (buf[0] == '1') {
-		int rc = kmem_cache_shrink(s);
-
-		if (rc)
-			return rc;
-	} else
+	if (buf[0] == '1')
+		kmem_cache_shrink(s);
+	else
 		return -EINVAL;
 	return length;
 }
@@ -5228,7 +5300,7 @@ static ssize_t slab_attr_store(struct kobject *kobj,
 	err = attribute->store(s, buf, len);
 #ifdef CONFIG_MEMCG_KMEM
 	if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
-		int i;
+		struct kmem_cache *c;
 
 		mutex_lock(&slab_mutex);
 		if (s->max_attr_size < len)
@@ -5251,11 +5323,8 @@ static ssize_t slab_attr_store(struct kobject *kobj,
 		 * directly either failed or succeeded, in which case we loop
 		 * through the descendants with best-effort propagation.
 		 */
-		for_each_memcg_cache_index(i) {
-			struct kmem_cache *c = cache_from_memcg(s, i);
-			if (c)
-				attribute->store(c, buf, len);
-		}
+		for_each_memcg_cache(c, s)
+			attribute->store(c, buf, len);
 		mutex_unlock(&slab_mutex);
 	}
 #endif
@@ -5267,15 +5336,18 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 #ifdef CONFIG_MEMCG_KMEM
 	int i;
 	char *buffer = NULL;
+	struct kmem_cache *root_cache;
 
-	if (!is_root_cache(s))
+	if (is_root_cache(s))
 		return;
 
+	root_cache = s->memcg_params.root_cache;
+
 	/*
 	 * This mean this cache had no attribute written. Therefore, no point
 	 * in copying default values around
 	 */
-	if (!s->max_attr_size)
+	if (!root_cache->max_attr_size)
 		return;
 
 	for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
@@ -5297,7 +5369,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 		 */
 		if (buffer)
 			buf = buffer;
-		else if (s->max_attr_size < ARRAY_SIZE(mbuf))
+		else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
 			buf = mbuf;
 		else {
 			buffer = (char *) get_zeroed_page(GFP_KERNEL);
@@ -5306,7 +5378,7 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 			buf = buffer;
 		}
 
-		attr->show(s->memcg_params->root_cache, buf);
+		attr->show(root_cache, buf);
 		attr->store(s, buf, strlen(buf));
 	}
 
@@ -5315,6 +5387,11 @@ static void memcg_propagate_slab_attrs(struct kmem_cache *s)
 #endif
 }
 
+static void kmem_cache_release(struct kobject *k)
+{
+	slab_kmem_cache_release(to_slab(k));
+}
+
 static const struct sysfs_ops slab_sysfs_ops = {
 	.show = slab_attr_show,
 	.store = slab_attr_store,
@@ -5322,6 +5399,7 @@ static const struct sysfs_ops slab_sysfs_ops = {
 
 static struct kobj_type slab_ktype = {
 	.sysfs_ops = &slab_sysfs_ops,
+	.release = kmem_cache_release,
 };
 
 static int uevent_filter(struct kset *kset, struct kobject *kobj)
@@ -5339,6 +5417,15 @@ static const struct kset_uevent_ops slab_uevent_ops = {
 
 static struct kset *slab_kset;
 
+static inline struct kset *cache_kset(struct kmem_cache *s)
+{
+#ifdef CONFIG_MEMCG_KMEM
+	if (!is_root_cache(s))
+		return s->memcg_params.root_cache->memcg_kset;
+#endif
+	return slab_kset;
+}
+
 #define ID_STR_LENGTH 64
 
 /* Create a unique string id for a slab cache:
@@ -5368,13 +5455,15 @@ static char *create_unique_id(struct kmem_cache *s)
 		*p++ = 'F';
 	if (!(s->flags & SLAB_NOTRACK))
 		*p++ = 't';
+	if (s->flags & SLAB_ACCOUNT)
+		*p++ = 'A';
 	if (p != name + 1)
 		*p++ = '-';
 	p += sprintf(p, "%07d", s->size);
 
 #ifdef CONFIG_MEMCG_KMEM
 	if (!is_root_cache(s))
-		p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params->memcg));
+		p += sprintf(p, "-%08d", memcg_cache_id(s->memcg_params.memcg));
 #endif
 
 	BUG_ON(p > name + ID_STR_LENGTH - 1);
@@ -5403,7 +5492,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
 		name = create_unique_id(s);
 	}
 
-	s->kobj.kset = slab_kset;
+	s->kobj.kset = cache_kset(s);
 	err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, name);
 	if (err) {
 		kobject_put(&s->kobj);
@@ -5416,6 +5505,18 @@ static int sysfs_slab_add(struct kmem_cache *s)
 		kobject_put(&s->kobj);
 		return err;
 	}
+
+#ifdef CONFIG_MEMCG_KMEM
+	if (is_root_cache(s)) {
+		s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
+		if (!s->memcg_kset) {
+			kobject_del(&s->kobj);
+			kobject_put(&s->kobj);
+			return -ENOMEM;
+		}
+	}
+#endif
+
 	kobject_uevent(&s->kobj, KOBJ_ADD);
 	if (!unmergeable) {
 		/* Setup first alias */
@@ -5425,7 +5526,7 @@ static int sysfs_slab_add(struct kmem_cache *s)
 	return 0;
 }
 
-static void sysfs_slab_remove(struct kmem_cache *s)
+void sysfs_slab_remove(struct kmem_cache *s)
 {
 	if (slab_state < FULL)
 		/*
@@ -5434,6 +5535,9 @@ static void sysfs_slab_remove(struct kmem_cache *s)
 		 */
 		return;
 
+#ifdef CONFIG_MEMCG_KMEM
+	kset_unregister(s->memcg_kset);
+#endif
 	kobject_uevent(&s->kobj, KOBJ_REMOVE);
 	kobject_del(&s->kobj);
 	kobject_put(&s->kobj);
@@ -5520,6 +5624,77 @@ __initcall(slab_sysfs_init);
  * The /proc/slabinfo ABI
  */
 #ifdef CONFIG_SLABINFO
+
+#define SHOW_TOP_SLABS	10
+
+static unsigned long get_cache_size(struct kmem_cache *cache)
+{
+	unsigned long flags;
+	unsigned long slabs;
+	struct kmem_cache_node *n;
+	struct list_head *lh;
+	int cpu, node;
+
+	slabs = 0;
+
+	/* roughly account one active cpu slab per online CPU */
+	for_each_online_cpu(cpu)
+		slabs++;
+
+	for_each_online_node(node) {
+		n = get_node(cache, node);
+		if (!n)
+			continue;
+		spin_lock_irqsave(&n->list_lock, flags);
+#ifdef CONFIG_SLUB_DEBUG
+		list_for_each(lh, &n->full)
+			slabs++;
+#endif
+		list_for_each(lh, &n->partial)
+			slabs++;
+		spin_unlock_irqrestore(&n->list_lock, flags);
+	}
+
+	return slabs * (PAGE_SIZE << oo_order(cache->oo));
+}
+
+void show_slab_info(void)
+{
+	int i, j;
+	unsigned long size;
+	struct kmem_cache *ptr;
+	unsigned long sizes[SHOW_TOP_SLABS];
+	struct kmem_cache *top[SHOW_TOP_SLABS];
+
+	memset(top, 0, sizeof(top));
+	memset(sizes, 0, sizeof(sizes));
+
+	printk("Top %d caches:\n", SHOW_TOP_SLABS);
+
+//	spin_lock(&cache_chain_lock);
+	list_for_each_entry(ptr, &slab_caches, list) {
+		size = get_cache_size(ptr);
+
+		j = 0;
+		for (i = 1; i < SHOW_TOP_SLABS; i++) {
+			if (sizes[i] < sizes[j])
+				j = i;
+		}
+		if (size > sizes[j]) {
+			sizes[j] = size;
+			top[j] = ptr;
+		}
+	}
+
+	for (i = 0; i < SHOW_TOP_SLABS; i++) {
+		if (top[i])
+			printk(KERN_INFO "%-21s: size %10lu objsize %10u\n",
+				top[i]->name, sizes[i],
+				top[i]->size);
+	}
+
+}
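
show_slab_info() picks the top caches with a fixed array and
replace-the-minimum selection rather than sorting. A self-contained
userspace sketch of the same selection (all names here are illustrative
only):

	#include <stdio.h>

	#define TOP 3

	/* Keep the TOP largest values seen so far: find the current
	 * minimum slot and overwrite it if the new value is bigger. */
	static void consider(unsigned long v, unsigned long tops[TOP])
	{
		int i, min = 0;

		for (i = 1; i < TOP; i++)
			if (tops[i] < tops[min])
				min = i;
		if (v > tops[min])
			tops[min] = v;
	}

	int main(void)
	{
		unsigned long vals[] = { 5, 17, 2, 42, 9, 23 };
		unsigned long tops[TOP] = { 0 };
		int i;

		for (i = 0; i < 6; i++)
			consider(vals[i], tops);
		for (i = 0; i < TOP; i++)
			printf("%lu\n", tops[i]);	/* 23, 17, 42 */
		return 0;
	}
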
+
 void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo)
 {
 	unsigned long nr_partials = 0;
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -32,6 +32,7 @@
 #include <linux/memcontrol.h>
 #include <linux/gfp.h>
 #include <linux/uio.h>
+#include <linux/page_idle.h>
 
 #include "internal.h"
 
@@ -63,6 +64,7 @@ static void __page_cache_release(struct page *page)
 		del_page_from_lru_list(page, lruvec, page_off_lru(page));
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 	}
+	mem_cgroup_uncharge(page);
 }
 
 static void __put_single_page(struct page *page)
@@ -615,6 +617,8 @@ void mark_page_accessed(struct page *page)
 	} else if (!PageReferenced(page)) {
 		SetPageReferenced(page);
 	}
+	if (page_is_idle(page))
+		clear_page_idle(page);
 }
 EXPORT_SYMBOL(mark_page_accessed);
 
@@ -670,6 +674,40 @@ void add_page_to_unevictable_list(struct page *page)
 	spin_unlock_irq(&zone->lru_lock);
 }
 
+/**
+ * lru_cache_add_active_or_unevictable
+ * @page:  the page to be added to LRU
+ * @vma:   vma in which page is mapped for determining reclaimability
+ *
+ * Place @page on the active or unevictable LRU list, depending on its
+ * evictability.  Note that if the page is not evictable, it goes
+ * directly back onto its zone's unevictable list; it does NOT use a
+ * per-cpu pagevec.
+ */
+void lru_cache_add_active_or_unevictable(struct page *page,
+					 struct vm_area_struct *vma)
+{
+	VM_BUG_ON_PAGE(PageLRU(page), page);
+
+	if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) {
+		SetPageActive(page);
+		lru_cache_add(page);
+		return;
+	}
+
+	if (!TestSetPageMlocked(page)) {
+		/*
+		 * We use the irq-unsafe __mod_zone_page_state because this
+		 * counter is not modified from interrupt context, and the pte
+		 * lock is held (a spinlock), which implies preemption is disabled.
+		 */
+		__mod_zone_page_state(page_zone(page), NR_MLOCK,
+				    hpage_nr_pages(page));
+		count_vm_event(UNEVICTABLE_PGMLOCKED);
+	}
+	add_page_to_unevictable_list(page);
+}
+
 /*
  * If the page can not be invalidated, it is moved to the
  * inactive list to speed up its reclaim.  It is moved to the
@@ -1000,11 +1038,15 @@ void release_pages(struct page **pages, int nr, bool cold)
 	if (zone)
 		spin_unlock_irqrestore(&zone->lru_lock, flags);
 
-	if (!list_empty(&pages_to_free))
+	if (!list_empty(&pages_to_free)) {
+		mem_cgroup_uncharge_list(&pages_to_free);
 		free_hot_cold_page_list(&pages_to_free, cold);
+	}
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-	if (!list_empty(&trans_huge_pages_to_free))
+	if (!list_empty(&trans_huge_pages_to_free)) {
+		mem_cgroup_uncharge_list(&trans_huge_pages_to_free);
 		free_trans_huge_page_list(&trans_huge_pages_to_free);
+	}
 #endif
 }
 EXPORT_SYMBOL(release_pages);
@@ -1184,10 +1226,8 @@ void __init swap_setup(void)
 	int i;
 
 	bdi_init(swapper_spaces[0].backing_dev_info);
-	for (i = 0; i < MAX_SWAPFILES; i++) {
+	for (i = 0; i < MAX_SWAPFILES; i++)
 		spin_lock_init(&swapper_spaces[i].tree_lock);
-		INIT_LIST_HEAD(&swapper_spaces[i].i_mmap_nonlinear);
-	}
 #endif
 
 	/* Use a smaller cluster for small-memory machines */
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -47,12 +47,13 @@ struct address_space swapper_spaces[MAX_SWAPFILES] = {
 
 #define INC_CACHE_INFO(x)	do { swap_cache_info.x++; } while (0)
 
-static struct {
+struct {
 	unsigned long add_total;
 	unsigned long del_total;
 	unsigned long find_success;
 	unsigned long find_total;
 } swap_cache_info;
+EXPORT_SYMBOL(swap_cache_info);
 
 unsigned long total_swapcache_pages(void)
 {
@@ -175,7 +176,7 @@ int add_to_swap(struct page *page, struct list_head *list)
 
 	if (unlikely(PageTransHuge(page)))
 		if (unlikely(split_huge_page_to_list(page, list))) {
-			swapcache_free(entry, NULL);
+			swapcache_free(entry);
 			return 0;
 		}
 
@@ -201,7 +202,7 @@ int add_to_swap(struct page *page, struct list_head *list)
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 		return 0;
 	}
 }
@@ -224,7 +225,7 @@ void delete_from_swap_cache(struct page *page)
 	__delete_from_swap_cache(page);
 	spin_unlock_irq(&address_space->tree_lock);
 
-	swapcache_free(entry, page);
+	swapcache_free(entry);
 	page_cache_release(page);
 }
 
@@ -394,7 +395,7 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 	} while (err != -ENOMEM);
 
 	if (new_page)
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -641,16 +641,13 @@ void swap_free(swp_entry_t entry)
 /*
  * Called after dropping swapcache to decrease refcnt to swap entries.
  */
-void swapcache_free(swp_entry_t entry, struct page *page)
+void swapcache_free(swp_entry_t entry)
 {
 	struct swap_info_struct *p;
-	unsigned char count;
 
 	p = swap_info_get(entry);
 	if (p) {
-		count = swap_entry_free(p, entry, SWAP_HAS_CACHE);
-		if (page)
-			mem_cgroup_uncharge_swapcache(page, entry, count != 0);
+		swap_entry_free(p, entry, SWAP_HAS_CACHE);
 		spin_unlock(&p->lock);
 	}
 }
@@ -692,7 +689,20 @@ int reuse_swap_page(struct page *page)
 	if (count <= 1 && PageSwapCache(page)) {
 		count += page_swapcount(page);
 		if (count == 1 && !PageWriteback(page)) {
-			delete_from_swap_cache(page);
+			swp_entry_t entry;
+			struct address_space *address_space;
+
+			entry.val = page_private(page);
+
+			address_space = swap_address_space(entry);
+			spin_lock_irq(&address_space->tree_lock);
+			__delete_from_swap_cache(page);
+			spin_unlock_irq(&address_space->tree_lock);
+
+			/* the page is still in use, do not uncharge */
+			swapcache_free(entry);
+			page_cache_release(page);
+
 			SetPageDirty(page);
 		}
 	}
@@ -898,35 +908,38 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
 	spinlock_t *ptl;
 	pte_t *pte;
 	int ret = 1;
+	struct mm_struct *mm = vma->vm_mm;
 
 	swapcache = page;
 	page = ksm_might_need_to_copy(page, vma, addr);
 	if (unlikely(!page))
 		return -ENOMEM;
 
-	if (mem_cgroup_try_charge_swapin(vma->vm_mm, page,
-					 GFP_KERNEL, &memcg)) {
+	if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg)) {
 		ret = -ENOMEM;
 		goto out_nolock;
 	}
 
 	pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 	if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) {
-		mem_cgroup_cancel_charge_swapin(memcg);
+		mem_cgroup_cancel_charge(page, memcg);
 		ret = 0;
 		goto out;
 	}
 
 	dec_mm_counter(vma->vm_mm, MM_SWAPENTS);
-	inc_mm_counter(vma->vm_mm, MM_ANONPAGES);
+	inc_mm_counter(mm, MM_ANONPAGES);
 	get_page(page);
-	set_pte_at(vma->vm_mm, addr, pte,
+	set_pte_at(mm, addr, pte,
 		   pte_mkold(mk_pte(page, vma->vm_page_prot)));
-	if (page == swapcache)
+	if (page == swapcache) {
 		page_add_anon_rmap(page, vma, addr);
-	else /* ksm created a completely new copy */
+		mem_cgroup_commit_charge(page, memcg, true);
+	} else { /* ksm created a completely new copy */
 		page_add_new_anon_rmap(page, vma, addr);
-	mem_cgroup_commit_charge_swapin(page, memcg);
+		mem_cgroup_commit_charge(page, memcg, false);
+		lru_cache_add_active_or_unevictable(page, vma);
+	}
 	swap_free(entry);
 	/*
 	 * Move the page to the active list so it is not
@@ -1266,6 +1279,7 @@ int try_to_unuse(unsigned int type, bool frontswap,
 			mmput(start_mm);
 			start_mm = new_start_mm;
 		}
+
 		if (retval) {
 			unlock_page(page);
 			page_cache_release(page);
@@ -1594,6 +1608,10 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	struct filename *pathname;
 	int err, found = 0;
 
+	/*
+	 * The VE admin check is just to be on the safe side: the admin may
+	 * affect swaps only if he has access to the special file, i.e. if
+	 * he has been granted access to the block device or if the swap
+	 * file lies in an area visible to him.
+	 */
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
@@ -1815,11 +1833,42 @@ static const struct seq_operations swaps_op = {
 	.show =		swap_show
 };
 
+#include <linux/virtinfo.h>
+
+static int swap_show_ve(struct seq_file *swap, void *v)
+{
+	struct user_beancounter *old_ub;
+	struct sysinfo si;
+	int ret;
+
+	si_swapinfo(&si);
+	old_ub = set_exec_ub(current->mm->mm_ub);
+	ret = virtinfo_notifier_call(VITYPE_GENERAL, VIRTINFO_SYSINFO, &si);
+	(void)set_exec_ub(old_ub);
+	if (ret & NOTIFY_FAIL)
+		goto out;
+
+	seq_printf(swap, "Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+	if (!si.totalswap)
+		goto out;
+	seq_printf(swap, "%-40s%s\t%lu\t%lu\t%d\n",
+			"/dev/null",
+			"partition",
+			si.totalswap  << (PAGE_SHIFT - 10),
+			(si.totalswap - si.freeswap) << (PAGE_SHIFT - 10),
+			-1);
+out:
+	return 0;
+}
+
 static int swaps_open(struct inode *inode, struct file *file)
 {
 	struct seq_file *seq;
 	int ret;
 
+	if (!ve_is_super(get_exec_env()))
+		return single_open(file, &swap_show_ve, NULL);
+
 	ret = seq_open(file, &swaps_op);
 	if (ret)
 		return ret;
@@ -1829,17 +1878,26 @@ static int swaps_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+static int swaps_release(struct inode *inode, struct file *file)
+{
+	struct seq_file *f = file->private_data;
+
+	if (f->op != &swaps_op)
+		return single_release(inode, file);
+	return seq_release(inode, file);
+}
+
 static const struct file_operations proc_swaps_operations = {
 	.open		= swaps_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
-	.release	= seq_release,
+	.release	= swaps_release,
 	.poll		= swaps_poll,
 };
 
 static int __init procswaps_init(void)
 {
-	proc_create("swaps", 0, NULL, &proc_swaps_operations);
+	proc_create("swaps", S_ISVTX, NULL, &proc_swaps_operations);
 	return 0;
 }
 __initcall(procswaps_init);
--- /dev/null
+++ b/mm/tcache.c
@@ -0,0 +1,1347 @@
+/*
+ *  mm/tcache.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/sched.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/spinlock.h>
+#include <linux/rcupdate.h>
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/radix-tree.h>
+#include <linux/idr.h>
+#include <linux/atomic.h>
+#include <linux/kref.h>
+#include <linux/jhash.h>
+#include <linux/completion.h>
+#include <linux/shrinker.h>
+#include <linux/vmstat.h>
+#include <linux/swap.h>
+#include <linux/cleancache.h>
+
+/* cleancache_put_page is called from atomic context */
+#define TCACHE_GFP_MASK			(__GFP_NORETRY | __GFP_NOWARN)
+
+struct tcache_node_tree {
+	struct rb_root			root;
+	spinlock_t			lock;
+};
+
+/*
+ * Per NUMA node data of a tcache_pool. Protected by tcache_nodeinfo->lock.
+ */
+struct tcache_pool_nodeinfo {
+	struct tcache_pool		*pool;
+
+	/* node in tcache_nodeinfo->reclaim_tree */
+	struct rb_node			reclaim_node;
+
+	/* LRU list of pages, linked through page->lru */
+	struct list_head		lru;
+
+	/* number of pages on the LRU list */
+	unsigned long			nr_pages;
+
+	/* recent number of successful gets and puts from the pool;
+	 * used in calculating reclaim prio */
+	unsigned long			recent_gets;
+	unsigned long			recent_puts;
+
+	/* reuse_ratio is basically recent_gets / recent_puts;
+	 * it shows the efficiency of the pool */
+	unsigned long			reuse_ratio;
+
+	/* timestamp of the eldest page on the LRU list */
+	unsigned long			timestamp;
+
+	/* increased on every LRU add/del, reset once it gets big enough;
+	 * used for rate limiting rebalancing of reclaim_tree */
+	unsigned long			events;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Tcache pools correspond to super blocks. A pool is created on FS mount
+ * (cleancache_init_fs) and destroyed on unmount (cleancache_invalidate_fs).
+ */
+struct tcache_pool {
+	/*
+	 * Reference counter. Pool destruction (triggered by unmount) will
+	 * actually start only after it reaches zero.
+	 *
+	 * Initialized to 1 on creation, decremented on destruction. May be
+	 * held temporarily by active users.
+	 */
+	struct kref			kref;
+
+	/*
+	 * Binary search trees of tcache_node structs that belong to this pool.
+	 * Linked by tcache_node->tree_node.
+	 */
+	struct tcache_node_tree		*node_tree;
+
+	/* track total number of nodes in each pool for debugging */
+	atomic_long_t			nr_nodes;
+
+	/* used to synchronize destruction */
+	struct completion		completion;
+	struct rcu_head			rcu;
+
+	/* Per NUMA node data. This must be the last element of the struct. */
+	struct tcache_pool_nodeinfo	nodeinfo[0];
+};
+
+static atomic_long_t nr_tcache_nodes;
+
+/*
+ * Tcache nodes correspond to inodes. A node is created automatically when a
+ * new page is added to the cache (cleancache_put_page) and destroyed either
+ * when the corresponding inode is invalidated (cleancache_invalidate_inode) or
+ * when the last page is removed from it (by the shrinker, cleancache_get_page,
+ * or cleancache_invalidate_page).
+ */
+struct tcache_node {
+	/*
+	 * Reference counter. Node is freed when it reaches zero.
+	 *
+	 * Incremented when the first page is attached to the node (node
+	 * becomes non-empty) and decremented when the last page is detached
+	 * (node becomes empty). May also be held temporarily by active users.
+	 *
+	 * Note that a node with a non-zero reference count is not guaranteed
+	 * to be present on the tcache_pool->node_tree - it could have been
+	 * removed by cleancache_invalidate_inode. However, if a node is found
+	 * on the tree with the tree_lock held, it must have a positive
+	 * reference count.
+	 */
+	struct kref			kref;
+
+	struct tcache_pool		*pool;
+	struct cleancache_filekey	key;
+	struct rb_node			tree_node;
+
+	/*
+	 * Radix tree of pages attached to this node. Protected by tree_lock.
+	 */
+	struct radix_tree_root		page_tree;
+	spinlock_t			tree_lock;
+
+	unsigned long			nr_pages;
+	bool				invalidated;
+};
+
+/*
+ * To reduce contention on tcache_node_tree->lock, we maintain several trees
+ * per each pool and distribute nodes among them in accordance with their hash.
+ */
+static int num_node_trees __read_mostly = 1;
+
+/*
+ * tcache_pool_idr provides id -> tcache_pool map. Lookups are lock free (RCU).
+ * Updated are protected by the tcache_pool_lock.
+ */
+static DEFINE_IDR(tcache_pool_idr);
+static DEFINE_SPINLOCK(tcache_pool_lock);
+
+struct tcache_nodeinfo {
+	spinlock_t lock;
+
+	/* tree of pools, sorted by reclaim prio */
+	struct rb_root reclaim_tree;
+
+	/* total number of pages on all LRU lists corresponding to this node */
+	unsigned long nr_pages;
+} ____cacheline_aligned_in_smp;
+
+/*
+ * Global per NUMA node data.
+ */
+static struct tcache_nodeinfo *tcache_nodeinfo;
+
+/*
+ * Locking rules:
+ *
+ *  tcache_node->tree_lock
+ *       tcache_node_tree->lock
+ *       tcache_nodeinfo->lock
+ */
+
+/* Enable/disable tcache backend (set at boot time) */
+static bool tcache_enabled __read_mostly = true;
+module_param_named(enabled, tcache_enabled, bool, 0444);
+
+/* Enable/disable populating the cache */
+static bool tcache_active __read_mostly = true;
+module_param_named(active, tcache_active, bool, 0644);
+
+/*
+ * How long a tcache page is considered active, i.e. likely to be reused.
+ * A pool that contains only active pages will be given a boost over other
+ * pools while selecting a reclaim target.
+ */
+static unsigned long tcache_active_interval __read_mostly = 60 * HZ;
+
+/* Total number of pages cached */
+static DEFINE_PER_CPU(long, nr_tcache_pages);
+
+static inline u32 key_hash(const struct cleancache_filekey *key)
+{
+	return jhash2(key->u.key, CLEANCACHE_KEY_MAX, 0);
+}
+
+static inline struct tcache_node_tree *
+node_tree_from_key(struct tcache_pool *pool,
+		   const struct cleancache_filekey *key)
+{
+	return &pool->node_tree[key_hash(key) & (num_node_trees - 1)];
+}
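
Note that the mask in node_tree_from_key() only spreads nodes evenly if
num_node_trees is a power of two; a worked instance:

	/*
	 * num_node_trees == 8: hash & 7 selects any of trees 0..7.
	 * With a non-power-of-two count such as 6, hash & 5 could only
	 * ever select trees {0, 1, 4, 5}; trees 2 and 3 would sit idle.
	 */
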
+
+static void __tcache_insert_reclaim_node(struct tcache_nodeinfo *ni,
+					 struct tcache_pool_nodeinfo *pni);
+
+static inline void __tcache_check_events(struct tcache_nodeinfo *ni,
+					 struct tcache_pool_nodeinfo *pni)
+{
+	/*
+	 * We don't want to rebalance reclaim_tree on each get/put, because it
+	 * would be way too costly. Instead we count get/put events per
+	 * pool and update a pool's reclaim prio only once the counter gets big
+	 * enough. This should yield satisfactory reclaim fairness while still
+	 * keeping the cost of get/put low.
+	 */
+	pni->events++;
+	if (likely(pni->events < 1024))
+		return;
+
+	pni->events = 0;
+
+	/*
+	 * If the pool is empty, there's no point in adding it to the
+	 * reclaim_tree. Neither do we need to remove it from the tree -
+	 * that will be done by the shrinker once it tries to scan it.
+	 */
+	if (unlikely(list_empty(&pni->lru)))
+		return;
+
+	/*
+	 * This can only happen if the node was removed from the tree on pool
+	 * destruction (see tcache_remove_from_reclaim_trees()). Nothing to do
+	 * then.
+	 */
+	if (unlikely(RB_EMPTY_NODE(&pni->reclaim_node)))
+		return;
+
+	rb_erase(&pni->reclaim_node, &ni->reclaim_tree);
+	__tcache_insert_reclaim_node(ni, pni);
+}
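
The counter above is a generic amortization idiom: pay for an expensive
rebalance once per ~1024 cheap events. Stripped to its bones, with
rebalance() as a hypothetical stand-in for the rb-tree update:

	static void note_event(struct state *st)
	{
		if (likely(++st->events < 1024))
			return;		/* fast path: just count */
		st->events = 0;
		rebalance(st);		/* slow path, ~0.1% of calls */
	}
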
+
+/*
+ * Add a page to the LRU list. This effectively makes the page visible to the
+ * shrinker, so it must only be called after the page was properly initialized
+ * and added to the corresponding page tree.
+ */
+static void tcache_lru_add(struct tcache_pool *pool, struct page *page)
+{
+	int nid = page_to_nid(page);
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni = &pool->nodeinfo[nid];
+
+	spin_lock(&ni->lock);
+
+	ni->nr_pages++;
+	pni->nr_pages++;
+	list_add_tail(&page->lru, &pni->lru);
+
+	pni->recent_puts++;
+	if (unlikely(pni->recent_puts > pni->nr_pages / 2)) {
+		pni->recent_gets /= 2;
+		pni->recent_puts /= 2;
+	}
+
+	__tcache_check_events(ni, pni);
+
+	if (unlikely(RB_EMPTY_NODE(&pni->reclaim_node)))
+		__tcache_insert_reclaim_node(ni, pni);
+
+	spin_unlock(&ni->lock);
+}
+
+static void __tcache_lru_del(struct tcache_nodeinfo *ni,
+			     struct tcache_pool_nodeinfo *pni,
+			     struct page *page)
+{
+	ni->nr_pages--;
+	pni->nr_pages--;
+	list_del_init(&page->lru);
+}
+
+/*
+ * Remove a page from the LRU list. This function is safe to call on the same
+ * page from concurrent threads - the page will be removed only once.
+ */
+static void tcache_lru_del(struct tcache_pool *pool, struct page *page,
+			   bool reused)
+{
+	int nid = page_to_nid(page);
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni = &pool->nodeinfo[nid];
+
+	spin_lock(&ni->lock);
+
+	/* Raced with reclaimer? */
+	if (unlikely(list_empty(&page->lru)))
+		goto out;
+
+	__tcache_lru_del(ni, pni, page);
+
+	if (reused)
+		pni->recent_gets++;
+
+	__tcache_check_events(ni, pni);
+out:
+	spin_unlock(&ni->lock);
+}
+
+static int tcache_create_pool(void)
+{
+	size_t size;
+	struct tcache_pool *pool;
+	struct tcache_pool_nodeinfo *pni;
+	int id;
+	int i;
+
+	size = sizeof(struct tcache_pool);
+	size += nr_node_ids * sizeof(struct tcache_pool_nodeinfo);
+
+	pool = kzalloc(size, GFP_KERNEL);
+	if (!pool)
+		goto fail;
+
+	pool->node_tree = kcalloc(num_node_trees, sizeof(*pool->node_tree),
+				  GFP_KERNEL);
+	if (!pool->node_tree)
+		goto free_pool;
+
+	kref_init(&pool->kref);
+	init_completion(&pool->completion);
+
+	for (i = 0; i < num_node_trees; i++) {
+		pool->node_tree[i].root = RB_ROOT;
+		spin_lock_init(&pool->node_tree[i].lock);
+	}
+
+	for (i = 0; i < nr_node_ids; i++) {
+		pni = &pool->nodeinfo[i];
+		pni->pool = pool;
+		RB_CLEAR_NODE(&pni->reclaim_node);
+		INIT_LIST_HEAD(&pni->lru);
+	}
+
+	idr_preload(GFP_KERNEL);
+	spin_lock(&tcache_pool_lock);
+
+	id = idr_alloc(&tcache_pool_idr, pool, 0, 0, GFP_NOWAIT);
+
+	spin_unlock(&tcache_pool_lock);
+	idr_preload_end();
+
+	if (id < 0)
+		goto free_trees;
+	return id;
+
+free_trees:
+	kfree(pool->node_tree);
+free_pool:
+	kfree(pool);
+fail:
+	return -1;
+}
+
+/*
+ * Take a reference to a pool unless it is being destroyed. Returns true on
+ * success, false on failure. The caller must guarantee that the pool can be
+ * safely dereferenced.
+ */
+static bool tcache_grab_pool(struct tcache_pool *pool)
+{
+	return kref_get_unless_zero(&pool->kref);
+}
+
+static void tcache_hold_pool(struct tcache_pool *pool)
+{
+	kref_get(&pool->kref);
+}
+
+/*
+ * Return the pool corresponding to an id (or NULL if there is no such). The
+ * reference counter of the returned pool is incremented.
+ */
+static struct tcache_pool *tcache_get_pool(int id)
+{
+	struct tcache_pool *pool;
+
+	if (id < 0)
+		return NULL;
+
+	rcu_read_lock();
+	pool = idr_find(&tcache_pool_idr, id);
+	if (pool && !tcache_grab_pool(pool))
+		pool = NULL;
+	rcu_read_unlock();
+
+	return pool;
+}
+
+static void tcache_pool_release_fn(struct kref *kref)
+{
+	struct tcache_pool *pool = container_of(kref, struct tcache_pool, kref);
+
+	/*
+	 * Notify tcache_destroy_pool that it is now safe to proceed to
+	 * destruction.
+	 */
+	complete(&pool->completion);
+}
+
+/*
+ * Release reference to a pool taken by tcache_grab_pool or tcache_get_pool.
+ */
+static inline void tcache_put_pool(struct tcache_pool *pool)
+{
+	kref_put(&pool->kref, tcache_pool_release_fn);
+}
+
+static void tcache_remove_from_reclaim_trees(struct tcache_pool *pool);
+static void tcache_invalidate_node_tree(struct tcache_node_tree *tree);
+
+static void tcache_destroy_pool(int id)
+{
+	int i;
+	struct tcache_pool *pool;
+
+	spin_lock(&tcache_pool_lock);
+	pool = idr_find(&tcache_pool_idr, id);
+	if (pool)
+		idr_remove(&tcache_pool_idr, id);
+	spin_unlock(&tcache_pool_lock);
+
+	if (!pool)
+		return;
+
+	tcache_put_pool(pool);
+
+	/*
+	 * Wait until all references to this pool are released.
+	 *
+	 * We removed the pool from id -> pool map, so now new references can
+	 * only be taken by the shrinker. The latter takes a reference to this
+	 * pool only in order to remove a page from it. Since no new pages can
+	 * be added to the pool, we are guaranteed to make progress.
+	 */
+	wait_for_completion(&pool->completion);
+
+	tcache_remove_from_reclaim_trees(pool);
+
+	for (i = 0; i < num_node_trees; i++)
+		tcache_invalidate_node_tree(&pool->node_tree[i]);
+
+	BUG_ON(atomic_long_read(&pool->nr_nodes) != 0);
+
+	kfree(pool->node_tree);
+	kfree_rcu(pool, rcu);
+}
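
tcache_destroy_pool() relies on the kref/completion handshake set up in
tcache_create_pool() and tcache_pool_release_fn(). Its shape, sketched
with a hypothetical object (obj_release_fn is assumed to call
complete(&o->done)):

	/* Creation: kref_init() leaves the creator holding one reference.
	 * Destruction: drop it, then sleep until the release callback,
	 * run by whoever drops the last temporary reference, fires. */
	static void obj_destroy(struct obj *o)
	{
		kref_put(&o->kref, obj_release_fn);
		wait_for_completion(&o->done);
		kfree(o);
	}
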
+
+static struct tcache_node *tcache_alloc_node(void)
+{
+	struct tcache_node *node;
+
+	node = kzalloc(sizeof(*node), TCACHE_GFP_MASK);
+	if (!node)
+		return NULL;
+
+	kref_init(&node->kref);
+	INIT_RADIX_TREE(&node->page_tree, TCACHE_GFP_MASK);
+	spin_lock_init(&node->tree_lock);
+
+	return node;
+}
+
+static struct tcache_node *__tcache_lookup_node(struct rb_root *rb_root,
+		const struct cleancache_filekey *key,
+		struct rb_node ***rb_link, struct rb_node **rb_parent)
+{
+	struct rb_node **__rb_link = &rb_root->rb_node;
+	struct rb_node *__rb_parent = NULL;
+	struct tcache_node *node;
+	int ret;
+
+	*rb_link = NULL;
+	*rb_parent = NULL;
+
+	while (*__rb_link) {
+		__rb_parent = *__rb_link;
+		node = rb_entry(__rb_parent, struct tcache_node, tree_node);
+
+		ret = memcmp(&node->key, key, sizeof(*key));
+		if (ret > 0)
+			__rb_link = &__rb_parent->rb_left;
+		else if (ret < 0)
+			__rb_link = &__rb_parent->rb_right;
+		else
+			return node;
+	}
+
+	*rb_parent = __rb_parent;
+	*rb_link = __rb_link;
+
+	return NULL;
+}
+
+static void __tcache_insert_node(struct rb_root *rb_root,
+		struct tcache_node *node,
+		struct rb_node **rb_link, struct rb_node *rb_parent)
+{
+	rb_link_node(&node->tree_node, rb_parent, rb_link);
+	rb_insert_color(&node->tree_node, rb_root);
+}
+
+static void __tcache_delete_node(struct rb_root *rb_root,
+				 struct tcache_node *node)
+{
+	/*
+	 * A node is deleted from the tree automatically by the node release
+	 * function as soon as the last reference to it has been dropped (all
+	 * pages and users have gone), but it can also be deleted explicitly by
+	 * tcache_invalidate_node, in which case the release function will
+	 * receive a node which is already not on the tree.
+	 */
+	if (!RB_EMPTY_NODE(&node->tree_node)) {
+		rb_erase(&node->tree_node, rb_root);
+		RB_CLEAR_NODE(&node->tree_node);
+	}
+}
+
+/*
+ * Take a reference to a node. The caller must guarantee that the node has a
+ * positive reference count. In particular, the function is safe to call if the
+ * node is known to be on the tree.
+ */
+static inline void tcache_hold_node(struct tcache_node *node)
+{
+	kref_get(&node->kref);
+}
+
+/*
+ * Find and get a reference to the node corresponding to a key in a pool. If
+ * the requested node does not exist and may_create is true, try to create a
+ * new one.
+ */
+static noinline_for_stack struct tcache_node *
+tcache_get_node(struct tcache_pool *pool, const struct cleancache_filekey *key,
+		bool may_create)
+{
+	struct tcache_node_tree *tree;
+	struct tcache_node *new_node = NULL, *node;
+	struct rb_node **rb_link, *rb_parent;
+	unsigned long flags;
+
+	tree = node_tree_from_key(pool, key);
+retry:
+	spin_lock_irqsave(&tree->lock, flags);
+	node = __tcache_lookup_node(&tree->root, key, &rb_link, &rb_parent);
+	if (node)
+		tcache_hold_node(node);
+	else if (new_node) {
+		node = new_node;
+		node->pool = pool;
+		node->key = *key;
+		atomic_long_inc(&pool->nr_nodes);
+		atomic_long_inc(&nr_tcache_nodes);
+		__tcache_insert_node(&tree->root, node, rb_link, rb_parent);
+	}
+	spin_unlock_irqrestore(&tree->lock, flags);
+
+	if (node) {
+		BUG_ON(node->pool != pool);
+		if (node != new_node)
+			kfree(new_node);
+		return node;
+	}
+
+	if (may_create) {
+		new_node = tcache_alloc_node();
+		if (new_node)
+			goto retry;
+	}
+	return NULL;
+}
+
+static void tcache_node_release_fn(struct kref *kref)
+{
+	struct tcache_node *node = container_of(kref, struct tcache_node, kref);
+	struct tcache_node_tree *tree;
+
+	tree = node_tree_from_key(node->pool, &node->key);
+
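+	/*
+	 * Called via kref_put_spinlock_irqsave() with tree->lock held;
+	 * the release callback is expected to drop the spinlock itself,
+	 * while the caller restores interrupts afterwards.
+	 */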
+	__tcache_delete_node(&tree->root, node);
+	spin_unlock(&tree->lock);
+
+	atomic_long_dec(&nr_tcache_nodes);
+	atomic_long_dec(&node->pool->nr_nodes);
+	kfree(node);
+}
+
+/*
+ * Release a reference to a node taken by tcache_hold_node or tcache_get_node.
+ */
+static inline void tcache_put_node(struct tcache_node *node)
+{
+	struct tcache_node_tree *tree;
+
+	tree = node_tree_from_key(node->pool, &node->key);
+	kref_put_spinlock_irqsave(&node->kref, tcache_node_release_fn,
+				  &tree->lock);
+}
+
+static struct tcache_node *tcache_get_node_and_pool(int pool_id,
+		const struct cleancache_filekey *key, bool may_create)
+{
+	struct tcache_pool *pool;
+	struct tcache_node *node;
+
+	pool = tcache_get_pool(pool_id);
+	if (!pool)
+		return NULL;
+	node = tcache_get_node(pool, key, may_create);
+	if (!node)
+		tcache_put_pool(pool);
+	return node;
+}
+
+static void tcache_put_node_and_pool(struct tcache_node *node)
+{
+	struct tcache_pool *pool = node->pool;
+
+	tcache_put_node(node);
+	tcache_put_pool(pool);
+}
+
+static void tcache_invalidate_node_pages(struct tcache_node *node);
+
+/*
+ * Remove a node from the tree and invalidate its pages.
+ */
+static void tcache_invalidate_node(struct tcache_pool *pool,
+				   const struct cleancache_filekey *key)
+{
+	struct tcache_node_tree *tree;
+	struct tcache_node *node;
+	struct rb_node **rb_link, *rb_parent;
+
+	tree = node_tree_from_key(pool, key);
+
+	spin_lock_irq(&tree->lock);
+	node = __tcache_lookup_node(&tree->root, key, &rb_link, &rb_parent);
+	if (node) {
+		tcache_hold_node(node);
+		__tcache_delete_node(&tree->root, node);
+	}
+	spin_unlock_irq(&tree->lock);
+
+	if (node) {
+		tcache_invalidate_node_pages(node);
+		tcache_put_node(node);
+	}
+}
+
+static noinline_for_stack void
+tcache_invalidate_node_tree(struct tcache_node_tree *tree)
+{
+	struct tcache_node *node;
+
+	/*
+	 * There is no need to take tree->lock, because this function is only
+	 * called when the pool is about to be destroyed.
+	 */
+	while (!RB_EMPTY_ROOT(&tree->root)) {
+		node = rb_entry(rb_first(&tree->root),
+				struct tcache_node, tree_node);
+
+		/* Remaining nodes must be held solely by their pages */
+		BUG_ON(atomic_read(&node->kref.refcount) != 1);
+		BUG_ON(node->nr_pages == 0);
+		BUG_ON(node->invalidated);
+
+		tcache_hold_node(node);
+		tcache_invalidate_node_pages(node);
+		tcache_put_node(node);
+	}
+}
+
+static inline struct tcache_node *tcache_page_node(struct page *page)
+{
+	return (struct tcache_node *)page->mapping;
+}
+
+static inline unsigned long tcache_page_timestamp(struct page *page)
+{
+	return page->private;
+}
+
+static inline void tcache_init_page(struct page *page,
+				    struct tcache_node *node, pgoff_t index)
+{
+	page->mapping = (struct address_space *)node;
+	page->private = jiffies;
+	page->index = index;
+}
+
+static inline void tcache_hold_page(struct page *page)
+{
+	get_page(page);
+}
+
+static inline void tcache_put_page(struct page *page)
+{
+	if (put_page_testzero(page)) {
+		page->mapping = NULL;	/* to make free_pages_check happy */
+		free_hot_cold_page(page, false);
+	}
+}
+
+static int tcache_page_tree_replace(struct tcache_node *node, pgoff_t index,
+				    struct page *page, struct page **old_page)
+{
+	void **pslot;
+	int err = 0;
+
+	*old_page = NULL;
+
+	/*
+	 * If the node was invalidated after we looked it up, abort in order to
+	 * avoid clashes with tcache_invalidate_node_pages.
+	 */
+	if (unlikely(node->invalidated)) {
+		err = -EAGAIN;
+		goto out;
+	}
+
+	pslot = radix_tree_lookup_slot(&node->page_tree, index);
+	if (pslot) {
+		*old_page = radix_tree_deref_slot_protected(pslot,
+							    &node->tree_lock);
+		radix_tree_replace_slot(pslot, page);
+		__dec_zone_page_state(*old_page, NR_FILE_PAGES);
+		__inc_zone_page_state(page, NR_FILE_PAGES);
+	} else {
+		err = radix_tree_insert(&node->page_tree, index, page);
+		BUG_ON(err == -EEXIST);
+		if (!err) {
+			if (!node->nr_pages++)
+				tcache_hold_node(node);
+			__this_cpu_inc(nr_tcache_pages);
+			__inc_zone_page_state(page, NR_FILE_PAGES);
+		}
+	}
+out:
+	return err;
+}
+
+static struct page *__tcache_page_tree_delete(struct tcache_node *node,
+					      pgoff_t index, struct page *page)
+{
+	page = radix_tree_delete_item(&node->page_tree, index, page);
+	if (page) {
+		if (!--node->nr_pages)
+			tcache_put_node(node);
+		__this_cpu_dec(nr_tcache_pages);
+		__dec_zone_page_state(page, NR_FILE_PAGES);
+	}
+	return page;
+}
+
+static struct page *tcache_page_tree_delete(struct tcache_node *node,
+					    pgoff_t index, struct page *page)
+{
+	spin_lock(&node->tree_lock);
+	page = __tcache_page_tree_delete(node, index, page);
+	spin_unlock(&node->tree_lock);
+	return page;
+}
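
tcache_page_tree_replace() and __tcache_page_tree_delete() together make
the node self-pinning: its pages hold one collective reference. In
short:

	/*
	 * nr_pages 0 -> 1: the first attached page takes a node reference
	 * nr_pages 1 -> 0: the last detached page drops it
	 * A node with pages therefore cannot be freed, while an empty node
	 * goes away as soon as its last transient user calls
	 * tcache_put_node().
	 */
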
+
+/*
+ * Attempt to attach a page to a node at a given offset. If there is already a
+ * page at the given offset, it will be replaced. Returns 0 on success. The
+ * caller must put the page whether the function succeeds or fails.
+ */
+static noinline_for_stack int
+tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
+{
+	struct page *old_page;
+	unsigned long flags;
+	int err = 0;
+
+	tcache_init_page(page, node, index);
+
+	spin_lock_irqsave(&node->tree_lock, flags);
+	err = tcache_page_tree_replace(node, index, page, &old_page);
+	if (err)
+		goto out;
+
+	if (old_page) {
+		tcache_lru_del(node->pool, old_page, false);
+		tcache_put_page(old_page);
+	}
+	tcache_hold_page(page);
+	tcache_lru_add(node->pool, page);
+out:
+	spin_unlock_irqrestore(&node->tree_lock, flags);
+	return err;
+}
+
+/*
+ * Detach and return the page at a given offset of a node. The caller must put
+ * the page when it is done with it.
+ */
+static struct page *tcache_detach_page(struct tcache_node *node, pgoff_t index,
+				       bool reused)
+{
+	unsigned long flags;
+	struct page *page;
+
+	local_irq_save(flags);
+	page = tcache_page_tree_delete(node, index, NULL);
+	if (page)
+		tcache_lru_del(node->pool, page, reused);
+	local_irq_restore(flags);
+
+	return page;
+}
+
+static noinline_for_stack void
+tcache_invalidate_node_pages(struct tcache_node *node)
+{
+	struct radix_tree_iter iter;
+	struct page *page;
+	void **slot;
+	pgoff_t index = 0;
+
+	spin_lock_irq(&node->tree_lock);
+
+	/*
+	 * First forbid new page insertions - see tcache_page_tree_replace.
+	 */
+	node->invalidated = true;
+
+	/*
+	 * Now truncate all pages. Be careful, because pages can still be
+	 * deleted from this node by the shrinker or by concurrent lookups.
+	 */
+restart:
+	radix_tree_for_each_slot(slot, &node->page_tree, &iter, index) {
+		page = radix_tree_deref_slot_protected(slot, &node->tree_lock);
+		BUG_ON(!__tcache_page_tree_delete(node, page->index, page));
+		tcache_lru_del(node->pool, page, false);
+		tcache_put_page(page);
+
+		if (need_resched()) {
+			spin_unlock_irq(&node->tree_lock);
+			cond_resched();
+			spin_lock_irq(&node->tree_lock);
+			/*
+			 * Restart iteration over the radix tree, because the
+			 * current node could have been freed when we dropped
+			 * the lock.
+			 */
+			index = iter.index + 1;
+			goto restart;
+		}
+	}
+
+	BUG_ON(node->nr_pages != 0);
+
+	spin_unlock_irq(&node->tree_lock);
+}
+
+static noinline_for_stack void
+tcache_remove_from_reclaim_trees(struct tcache_pool *pool)
+{
+	int i;
+	struct tcache_nodeinfo *ni;
+	struct tcache_pool_nodeinfo *pni;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		ni = &tcache_nodeinfo[i];
+		pni = &pool->nodeinfo[i];
+
+		spin_lock_irq(&ni->lock);
+		if (!RB_EMPTY_NODE(&pni->reclaim_node)) {
+			rb_erase(&pni->reclaim_node, &ni->reclaim_tree);
+			/*
+			 * Clear the node so that __tcache_check_events()
+			 * does not reinsert the pool into the tree.
+			 */
+			RB_CLEAR_NODE(&pni->reclaim_node);
+		}
+		spin_unlock_irq(&ni->lock);
+	}
+}
+
+static inline bool tcache_reclaim_node_before(struct tcache_pool_nodeinfo *a,
+					      struct tcache_pool_nodeinfo *b,
+					      unsigned long now)
+{
+	bool a_active = now - a->timestamp < tcache_active_interval;
+	bool b_active = now - b->timestamp < tcache_active_interval;
+
+	/*
+	 * Always favor active pools over inactive. If the two pools are both
+	 * active or both inactive, the order in the reclaim_tree is determined
+	 * by the reuse ratio.
+	 */
+	if (a_active && !b_active)
+		return false;
+	if (!a_active && b_active)
+		return true;
+	return a->reuse_ratio < b->reuse_ratio;
+}
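+
+/*
+ * Hence, among pools that are both active or both inactive, the one
+ * with the lower reuse ratio sorts leftmost in the reclaim tree and is
+ * picked first by tcache_lru_isolate(); inactive pools always sort
+ * before active ones.
+ */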
+
+static noinline_for_stack void
+__tcache_insert_reclaim_node(struct tcache_nodeinfo *ni,
+			     struct tcache_pool_nodeinfo *pni)
+{
+	struct rb_node **link = &ni->reclaim_tree.rb_node;
+	struct rb_node *parent = NULL;
+	struct tcache_pool_nodeinfo *pni2;
+	unsigned long now = jiffies;
+
+	BUG_ON(list_empty(&pni->lru));
+
+	pni->reuse_ratio = pni->recent_gets * 100 / (pni->recent_puts + 1);
+	pni->timestamp = tcache_page_timestamp(list_first_entry(&pni->lru,
+							struct page, lru));
+
+	while (*link) {
+		parent = *link;
+		pni2 = rb_entry(parent, struct tcache_pool_nodeinfo,
+				reclaim_node);
+		if (tcache_reclaim_node_before(pni, pni2, now))
+			link = &parent->rb_left;
+		else
+			link = &parent->rb_right;
+	}
+
+	rb_link_node(&pni->reclaim_node, parent, link);
+	rb_insert_color(&pni->reclaim_node, &ni->reclaim_tree);
+}
+
+static noinline_for_stack int
+__tcache_lru_isolate(struct tcache_nodeinfo *ni,
+		     struct tcache_pool_nodeinfo *pni,
+		     struct page **pages, int nr_to_isolate)
+{
+	struct tcache_node *node;
+	struct page *page;
+	int nr_isolated = 0;
+
+	while (nr_to_isolate > 0 && !list_empty(&pni->lru)) {
+		page = list_first_entry(&pni->lru, struct page, lru);
+		__tcache_lru_del(ni, pni, page);
+
+		tcache_hold_page(page);
+		/*
+		 * A node can be destroyed only if all its pages have been
+		 * removed both from the tree and the LRU list. Since we hold
+		 * the LRU lock here, preventing the page from being removed
+		 * from the LRU list, it is safe to access the node which the
+		 * page is attached to.
+		 */
+		node = tcache_page_node(page);
+		tcache_hold_node(node);
+		tcache_hold_pool(node->pool);
+
+		pages[nr_isolated++] = page;
+		nr_to_isolate--;
+	}
+	return nr_isolated;
+}
+
+static noinline_for_stack int
+tcache_lru_isolate(int nid, struct page **pages, int nr_to_isolate)
+{
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni;
+	int nr, nr_isolated = 0;
+	struct rb_node *rbn;
+
+	spin_lock_irq(&ni->lock);
+again:
+	rbn = rb_first(&ni->reclaim_tree);
+	if (!rbn)
+		goto out;
+
+	rb_erase(rbn, &ni->reclaim_tree);
+	RB_CLEAR_NODE(rbn);
+
+	pni = rb_entry(rbn, struct tcache_pool_nodeinfo, reclaim_node);
+	if (!tcache_grab_pool(pni->pool))
+		goto again;
+
+	nr = __tcache_lru_isolate(ni, pni, pages + nr_isolated, nr_to_isolate);
+	nr_isolated += nr;
+	nr_to_isolate -= nr;
+
+	if (!list_empty(&pni->lru))
+		__tcache_insert_reclaim_node(ni, pni);
+
+	tcache_put_pool(pni->pool);
+
+	if (nr_to_isolate > 0)
+		goto again;
+out:
+	spin_unlock_irq(&ni->lock);
+	return nr_isolated;
+}
+
+static bool __tcache_reclaim_page(struct page *page)
+{
+	struct tcache_node *node;
+	bool ret;
+
+	node = tcache_page_node(page);
+	if (tcache_page_tree_delete(node, page->index, page)) {
+		/*
+		 * We deleted the page from the tree - drop the
+		 * corresponding reference.
+		 */
+		tcache_put_page(page);
+		ret = true;
+	} else
+		/* The page was deleted by a concurrent thread - abort. */
+		ret = false;
+
+	/* Drop the reference taken in __tcache_lru_isolate. */
+	tcache_put_node_and_pool(node);
+	return ret;
+}
+
+static int tcache_reclaim_pages(struct page **pages, int nr)
+{
+	int i;
+	int nr_reclaimed = 0;
+
+	local_irq_disable();
+	for (i = 0; i < nr; i++) {
+		nr_reclaimed += !!__tcache_reclaim_page(pages[i]);
+		/* Drop the reference taken in __tcache_lru_isolate. */
+		tcache_put_page(pages[i]);
+		pages[i] = NULL;
+	}
+	local_irq_enable();
+	return nr_reclaimed;
+}
+
+static noinline_for_stack struct page *
+tcache_try_to_reclaim_page(struct tcache_pool *pool, int nid)
+{
+	struct tcache_nodeinfo *ni = &tcache_nodeinfo[nid];
+	struct tcache_pool_nodeinfo *pni = &pool->nodeinfo[nid];
+	struct page *page = NULL;
+	unsigned long flags;
+	int ret;
+
+	local_irq_save(flags);
+
+	spin_lock(&ni->lock);
+	ret = __tcache_lru_isolate(ni, pni, &page, 1);
+	spin_unlock(&ni->lock);
+
+	if (!ret)
+		goto out;
+
+	if (!__tcache_reclaim_page(page)) {
+		tcache_put_page(page);
+		page = NULL;
+	}
+out:
+	local_irq_restore(flags);
+	return page;
+}
+
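+/*
+ * Allocate a backing page for the cache.  If the allocator fails, fall
+ * back to recycling the coldest page of the caller's pool on this node
+ * so that the store can still succeed under memory pressure.
+ */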
+static struct page *tcache_alloc_page(struct tcache_pool *pool)
+{
+	struct page *page;
+
+	page = alloc_page(TCACHE_GFP_MASK | __GFP_HIGHMEM);
+	if (!page)
+		page = tcache_try_to_reclaim_page(pool, numa_node_id());
+
+	return page;
+}
+
+static unsigned long tcache_shrink_count(struct shrinker *shrink,
+					 struct shrink_control *sc)
+{
+	return tcache_nodeinfo[sc->nid].nr_pages;
+}
+
+#define TCACHE_SCAN_BATCH 128UL
+static DEFINE_PER_CPU(struct page *[TCACHE_SCAN_BATCH], tcache_page_vec);
+
+static unsigned long tcache_shrink_scan(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
+	struct page **pages = get_cpu_var(tcache_page_vec);
+	int nr_isolated, nr_reclaimed;
+
+	BUG_ON(sc->nr_to_scan > TCACHE_SCAN_BATCH);
+
+	nr_isolated = tcache_lru_isolate(sc->nid, pages, sc->nr_to_scan);
+	if (!nr_isolated) {
+		put_cpu_var(tcache_page_vec);
+		return SHRINK_STOP;
+	}
+
+	nr_reclaimed = tcache_reclaim_pages(pages, nr_isolated);
+	put_cpu_var(tcache_page_vec);
+
+	if (current->reclaim_state)
+		current->reclaim_state->reclaimed_slab += nr_reclaimed;
+
+	return nr_reclaimed;
+}
+
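+/*
+ * seeks is 1 (half of DEFAULT_SEEKS): clean page copies are cheap to
+ * recreate, so the shrinker applies comparatively strong pressure.  The
+ * batch size matches TCACHE_SCAN_BATCH so that tcache_shrink_scan() is
+ * never asked for more pages than tcache_page_vec can hold.
+ */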
+struct shrinker tcache_shrinker = {
+	.count_objects		= tcache_shrink_count,
+	.scan_objects		= tcache_shrink_scan,
+	.seeks			= 1,
+	.batch			= TCACHE_SCAN_BATCH,
+	.flags			= SHRINKER_NUMA_AWARE,
+};
+
+static int tcache_cleancache_init_fs(size_t pagesize)
+{
+	BUG_ON(pagesize != PAGE_SIZE);
+	return tcache_create_pool();
+}
+
+static int tcache_cleancache_init_shared_fs(char *uuid, size_t pagesize)
+{
+	return -1;
+}
+
+static void tcache_cleancache_put_page(int pool_id,
+				       struct cleancache_filekey key,
+				       pgoff_t index, struct page *page)
+{
+	struct tcache_node *node;
+	struct page *cache_page = NULL;
+
+	/* It makes no sense to populate tcache when we are short on memory */
+	if (!READ_ONCE(tcache_active) || !(current->flags & PF_MEMCG_RECLAIM))
+		return;
+
+	node = tcache_get_node_and_pool(pool_id, &key, true);
+	if (node) {
+		cache_page = tcache_alloc_page(node->pool);
+		if (cache_page) {
+			copy_highpage(cache_page, page);
+			/* cleancache does not care about failures */
+			(void)tcache_attach_page(node, index, cache_page);
+		}
+		tcache_put_node_and_pool(node);
+		if (cache_page)
+			tcache_put_page(cache_page);
+
+	}
+}
+
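+/*
+ * Look up a cached copy and, if found, copy it into @page.  Gets are
+ * exclusive: a successful lookup detaches the cached page, so a repeated
+ * get of the same index misses unless the page is stored again.
+ */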
+static int tcache_cleancache_get_page(int pool_id,
+				      struct cleancache_filekey key,
+				      pgoff_t index, struct page *page)
+{
+	struct tcache_node *node;
+	struct page *cache_page = NULL;
+
+	if (!atomic_long_read(&nr_tcache_nodes))
+		return -1;
+
+	node = tcache_get_node_and_pool(pool_id, &key, false);
+	if (node) {
+		cache_page = tcache_detach_page(node, index, true);
+		if (unlikely(cache_page && node->invalidated)) {
+			tcache_put_page(cache_page);
+			cache_page = NULL;
+		}
+		tcache_put_node_and_pool(node);
+	}
+
+	if (cache_page) {
+		copy_highpage(page, cache_page);
+		tcache_put_page(cache_page);
+		return 0;
+	}
+	return -1;
+}
+
+static void tcache_cleancache_invalidate_page(int pool_id,
+		struct cleancache_filekey key, pgoff_t index)
+{
+	struct tcache_node *node;
+	struct page *page;
+
+	if (!atomic_long_read(&nr_tcache_nodes))
+		return;
+
+	node = tcache_get_node_and_pool(pool_id, &key, false);
+	if (node) {
+		page = tcache_detach_page(node, index, false);
+		if (page)
+			tcache_put_page(page);
+		tcache_put_node_and_pool(node);
+	}
+}
+
+static void tcache_cleancache_invalidate_inode(int pool_id,
+					       struct cleancache_filekey key)
+{
+	struct tcache_pool *pool;
+
+	if (!atomic_long_read(&nr_tcache_nodes))
+		return;
+
+	pool = tcache_get_pool(pool_id);
+	if (pool) {
+		tcache_invalidate_node(pool, &key);
+		tcache_put_pool(pool);
+	}
+}
+
+static void tcache_cleancache_invalidate_fs(int pool_id)
+{
+	tcache_destroy_pool(pool_id);
+}
+
+static struct cleancache_ops tcache_cleancache_ops = {
+	.init_fs		= tcache_cleancache_init_fs,
+	.init_shared_fs		= tcache_cleancache_init_shared_fs,
+	.put_page		= tcache_cleancache_put_page,
+	.get_page		= tcache_cleancache_get_page,
+	.invalidate_page	= tcache_cleancache_invalidate_page,
+	.invalidate_inode	= tcache_cleancache_invalidate_inode,
+	.invalidate_fs		= tcache_cleancache_invalidate_fs,
+};
+
+unsigned long get_nr_tcache_pages(void)
+{
+	int cpu;
+	long val = 0;
+
+	for_each_possible_cpu(cpu)
+		val += per_cpu(nr_tcache_pages, cpu);
+	if (val < 0)
+		val = 0;
+	return val;
+}
+
+static int param_get_nr_pages(char *buffer, const struct kernel_param *kp)
+{
+	return sprintf(buffer, "%lu", get_nr_tcache_pages());
+}
+
+static struct kernel_param_ops param_ops_nr_pages = {
+	.get = param_get_nr_pages,
+};
+module_param_cb(nr_pages, &param_ops_nr_pages, NULL, 0444);
+
+static int param_set_active_interval(const char *val,
+				     const struct kernel_param *kp)
+{
+	int ret;
+	unsigned int msecs;
+
+	ret = kstrtouint(val, 10, &msecs);
+	if (ret)
+		return ret;
+
+	tcache_active_interval = msecs_to_jiffies(msecs);
+	return 0;
+}
+
+static int param_get_active_interval(char *buffer,
+				     const struct kernel_param *kp)
+{
+	unsigned int msecs;
+
+	msecs = jiffies_to_msecs(tcache_active_interval);
+	return sprintf(buffer, "%u", msecs);
+}
+
+static struct kernel_param_ops param_ops_active_interval = {
+	.set = param_set_active_interval,
+	.get = param_get_active_interval,
+};
+module_param_cb(active_interval_msecs, &param_ops_active_interval, NULL, 0644);
+
+static int __init tcache_nodeinfo_init(void)
+{
+	int i;
+	struct tcache_nodeinfo *ni;
+
+	tcache_nodeinfo = kcalloc(nr_node_ids, sizeof(*tcache_nodeinfo),
+				  GFP_KERNEL);
+	if (!tcache_nodeinfo)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++) {
+		ni = &tcache_nodeinfo[i];
+		spin_lock_init(&ni->lock);
+		ni->reclaim_tree = RB_ROOT;
+	}
+	return 0;
+}
+
+static int __init tcache_init(void)
+{
+	int err;
+
+	if (!tcache_enabled)
+		return 0;
+
+	err = tcache_nodeinfo_init();
+	if (err)
+		goto out_fail;
+
+	err = register_shrinker(&tcache_shrinker);
+	if (err)
+		goto out_free_lru;
+
+#ifdef CONFIG_SMP
+	num_node_trees = roundup_pow_of_two(2 * num_possible_cpus());
+#endif
+
+	err = cleancache_register_ops(&tcache_cleancache_ops);
+	if (err)
+		goto out_unregister_shrinker;
+
+	pr_info("tcache loaded\n");
+	return 0;
+
+out_unregister_shrinker:
+	unregister_shrinker(&tcache_shrinker);
+out_free_lru:
+	kfree(tcache_nodeinfo);
+out_fail:
+	return err;
+}
+module_init(tcache_init);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Transcendent file cache");
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -64,7 +64,8 @@ static void clear_exceptional_entry(struct address_space *mapping,
 		 */
 		if (!workingset_node_shadows(node) &&
 		    !list_empty(&node->private_list))
-			workingset_forget_node(node);
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
 		__radix_tree_delete_node(&mapping->page_tree, node);
 	}
 unlock:
@@ -282,16 +283,16 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	pgoff_t		indices[PAGEVEC_SIZE];
 	pgoff_t		index;
 	int		i;
+	int		bug_if_page_has_bh = 0;
 
-	cleancache_invalidate_inode(mapping);
 	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
-		return;
+		goto out;
 
 	/* Offsets within partial pages */
 	partial_start = lstart & (PAGE_CACHE_SIZE - 1);
 	partial_end = (lend + 1) & (PAGE_CACHE_SIZE - 1);
 	if (!inode_has_invalidate_range(mapping->host))
-		BUG_ON(partial_end);
+		bug_if_page_has_bh = 1;
 
 	/*
 	 * 'start' and 'end' always covers the range of pages to be fully
@@ -315,7 +316,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	while (index < end && __pagevec_lookup(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE),
 			indices)) {
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -341,7 +341,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
@@ -376,9 +375,11 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			wait_on_page_writeback(page);
 			zero_user_segment(page, 0, partial_end);
 			cleancache_invalidate_page(mapping, page);
-			if (page_has_private(page))
+			if (page_has_private(page)) {
+				BUG_ON(bug_if_page_has_bh);
 				do_invalidatepage_range(page, 0,
 							partial_end);
+			}
 			unlock_page(page);
 			page_cache_release(page);
 		}
@@ -388,7 +389,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
 	 * will be released, just zeroed, so we can bail out now.
 	 */
 	if (start >= end)
-		return;
+		goto out;
 
 	index = start;
 	for ( ; ; ) {
@@ -406,7 +407,6 @@ void truncate_inode_pages_range(struct address_space *mapping,
 			pagevec_release(&pvec);
 			break;
 		}
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -428,9 +428,10 @@ void truncate_inode_pages_range(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		index++;
 	}
+
+out:
 	cleancache_invalidate_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -527,7 +528,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 			indices)) {
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -556,7 +556,6 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
@@ -587,7 +586,6 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page)
 	BUG_ON(page_has_private(page));
 	__delete_from_page_cache(page, NULL);
 	spin_unlock_irq(&mapping->tree_lock);
-	mem_cgroup_uncharge_cache_page(page);
 
 	if (mapping->a_ops->freepage)
 		mapping->a_ops->freepage(page);
@@ -630,13 +628,14 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 	int ret2 = 0;
 	int did_range_unmap = 0;
 
-	cleancache_invalidate_inode(mapping);
+	if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+		goto out;
+
 	pagevec_init(&pvec, 0);
 	index = start;
 	while (index <= end && __pagevec_lookup(&pvec, mapping, index,
 			min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1,
 			indices)) {
-		mem_cgroup_uncharge_start();
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			struct page *page = pvec.pages[i];
 
@@ -689,10 +688,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping,
 		}
 		pagevec_remove_exceptionals(&pvec);
 		pagevec_release(&pvec);
-		mem_cgroup_uncharge_end();
 		cond_resched();
 		index++;
 	}
+
+out:
 	cleancache_invalidate_inode(mapping);
 	return ret;
 }
--- /dev/null
+++ b/mm/tswap.c
@@ -0,0 +1,400 @@
+/*
+ *  mm/tswap.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+#include <linux/atomic.h>
+#include <linux/spinlock.h>
+#include <linux/radix-tree.h>
+#include <linux/list.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/pagemap.h>
+#include <linux/shrinker.h>
+#include <linux/frontswap.h>
+
+#define TSWAP_GFP_MASK		(GFP_NOIO | __GFP_NORETRY | __GFP_NOWARN)
+
+static RADIX_TREE(tswap_page_tree, GFP_ATOMIC | __GFP_NOWARN);
+static DEFINE_SPINLOCK(tswap_lock);
+
+struct tswap_lru {
+	struct list_head list;
+	unsigned long nr_items;
+} ____cacheline_aligned_in_smp;
+
+static struct tswap_lru *tswap_lru_node;
+
+/* Enable/disable tswap backend (set at boot time) */
+static bool tswap_enabled __read_mostly = true;
+module_param_named(enabled, tswap_enabled, bool, 0444);
+
+/* Enable/disable populating the cache */
+static bool tswap_active __read_mostly = true;
+module_param_named(active, tswap_active, bool, 0644);
+
+/* Total number of pages cached */
+static unsigned long tswap_nr_pages;
+module_param_named(nr_pages, tswap_nr_pages, ulong, 0444);
+
+unsigned long get_nr_tswap_pages(void)
+{
+	return tswap_nr_pages;
+}
+
+static void tswap_lru_add(struct page *page)
+{
+	struct tswap_lru *lru = &tswap_lru_node[page_to_nid(page)];
+
+	list_add_tail(&page->lru, &lru->list);
+	lru->nr_items++;
+}
+
+static void tswap_lru_del(struct page *page)
+{
+	struct tswap_lru *lru = &tswap_lru_node[page_to_nid(page)];
+
+	list_del(&page->lru);
+	lru->nr_items--;
+}
+
+static struct page *tswap_lookup_page(swp_entry_t entry)
+{
+	struct page *page;
+
+	spin_lock(&tswap_lock);
+	page = radix_tree_lookup(&tswap_page_tree, entry.val);
+	spin_unlock(&tswap_lock);
+	BUG_ON(page && page_private(page) != entry.val);
+	return page;
+}
+
+static int tswap_insert_page(swp_entry_t entry, struct page *page)
+{
+	int err;
+
+	err = radix_tree_preload(TSWAP_GFP_MASK);
+	if (err)
+		return err;
+
+	set_page_private(page, entry.val);
+	spin_lock(&tswap_lock);
+	err = radix_tree_insert(&tswap_page_tree, entry.val, page);
+	if (!err) {
+		tswap_lru_add(page);
+		tswap_nr_pages++;
+	}
+	spin_unlock(&tswap_lock);
+
+	radix_tree_preload_end();
+	return err;
+}
+
+static struct page *tswap_delete_page(swp_entry_t entry, struct page *expected)
+{
+	struct page *page;
+
+	spin_lock(&tswap_lock);
+	page = radix_tree_delete_item(&tswap_page_tree, entry.val, expected);
+	if (page) {
+		tswap_lru_del(page);
+		tswap_nr_pages--;
+	}
+	spin_unlock(&tswap_lock);
+	if (page) {
+		BUG_ON(expected && page != expected);
+		BUG_ON(page_private(page) != entry.val);
+	}
+	return page;
+}
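+
+/*
+ * tswap_lock serialises the page tree, the per-node LRU lists and
+ * tswap_nr_pages: insertions and deletions update all three under the
+ * same spinlock.
+ */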
+
+static unsigned long tswap_shrink_count(struct shrinker *shrink,
+					struct shrink_control *sc)
+{
+	return tswap_lru_node[sc->nid].nr_items;
+}
+
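+/*
+ * Move a tswap page back into the swap cache so that regular reclaim
+ * can write it out to the swap device.  Called with the page locked.
+ * Returns 0 on success or a negative error code if the page is busy or
+ * has already been invalidated.
+ */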
+static int tswap_evict_page(struct page *page)
+{
+	struct address_space *swapper_space;
+	struct page *found_page;
+	swp_entry_t entry;
+	int err;
+
+	BUG_ON(!PageLocked(page));
+
+	entry.val = page_private(page);
+	swapper_space = swap_address_space(entry);
+retry:
+	err = -EEXIST;
+	found_page = find_get_page(swapper_space, entry.val);
+	if (found_page) {
+		/*
+		 * There is already a swap cache page at the given offset. If
+		 * the page is uptodate, we can safely free the frontswap page,
+		 * marking the swapcache page dirty. Otherwise, the frontswap
+		 * page is about to be loaded and cannot be released.
+		 */
+		err = -EBUSY;
+		if (!trylock_page(found_page)) {
+			put_page(found_page);
+			goto out;
+		}
+		/* recheck that the page is still in the swap cache */
+		if (!PageSwapCache(found_page) ||
+		    page_private(found_page) != entry.val) {
+			unlock_page(found_page);
+			put_page(found_page);
+			goto retry;
+		}
+		if (PageUptodate(found_page)) {
+			/*
+			 * Since we are holding the swap cache page lock, no
+			 * frontswap callbacks are allowed now. However, the
+			 * frontswap page could have been invalidated before we
+			 * took the lock, in which case we have nothing to do.
+			 */
+			err = -ENOENT;
+			if (tswap_delete_page(entry, page)) {
+				SetPageDirty(found_page);
+				put_page(page);
+				err = 0;
+			}
+		}
+		unlock_page(found_page);
+		put_page(found_page);
+		goto out;
+	}
+
+	err = swapcache_prepare(entry);
+	if (err == -EEXIST) {
+		cond_resched();
+		goto retry;
+	}
+	if (err)
+		/*
+		 * The swap entry has been freed, and therefore the page
+		 * must have been invalidated.
+		 */
+		goto out;
+
+	/*
+	 * From now on, no frontswap callbacks can be called on the swap entry,
+	 * because we hold its swap cache reference.
+	 */
+
+	err = -ENOENT;
+	if (tswap_lookup_page(entry) != page)
+		/*
+		 * The page could have been removed from tswap before we
+		 * prepared the swap cache.
+		 */
+		goto out_free_swapcache;
+
+	SetPageSwapBacked(page);
+	err = __add_to_swap_cache(page, entry);
+	if (err) {
+		ClearPageSwapBacked(page);
+		/* __add_to_swap_cache clears page->private on failure */
+		set_page_private(page, entry.val);
+		/*
+		 * __add_to_swap_cache does not return -EEXIST, so we can
+		 * safely clear the SWAP_HAS_CACHE flag.
+		 */
+		goto out_free_swapcache;
+	}
+
+	/* the page is now in the swap cache, remove it from tswap */
+	BUG_ON(!tswap_delete_page(entry, page));
+	put_page(page);
+
+	lru_cache_add_anon(page);
+	SetPageUptodate(page);
+	SetPageDirty(page);
+	return 0;
+
+out_free_swapcache:
+	swapcache_free(entry);
+out:
+	return err;
+}
+
+static unsigned long tswap_shrink_scan(struct shrinker *shrink,
+				       struct shrink_control *sc)
+{
+	struct tswap_lru *lru = &tswap_lru_node[sc->nid];
+	unsigned long nr_reclaimed = 0;
+
+	spin_lock(&tswap_lock);
+	while (sc->nr_to_scan-- > 0) {
+		struct page *page;
+
+		if (!lru->nr_items)
+			break;
+
+		page = list_first_entry(&lru->list, struct page, lru);
+		/*
+		 * Lock the page to avoid interference with other
+		 * reclaiming threads.
+		 */
+		if (!trylock_page(page)) {
+			list_move_tail(&page->lru, &lru->list);
+			cond_resched_lock(&tswap_lock);
+			continue;
+		}
+		get_page(page);
+		spin_unlock(&tswap_lock);
+
+		if (tswap_evict_page(page) == 0)
+			nr_reclaimed++;
+
+		unlock_page(page);
+		put_page(page);
+
+		cond_resched();
+		spin_lock(&tswap_lock);
+	}
+	spin_unlock(&tswap_lock);
+
+	return nr_reclaimed;
+}
+
+static struct shrinker tswap_shrinker = {
+	.count_objects = tswap_shrink_count,
+	.scan_objects = tswap_shrink_scan,
+	.seeks = 1,
+	.flags = SHRINKER_NUMA_AWARE,
+};
+
+static void tswap_frontswap_init(unsigned type)
+{
+	/*
+	 * We maintain a single page tree for all swap types, so there is
+	 * nothing to do here.
+	 */
+}
+
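+/*
+ * Store a copy of @page at the given swap slot.  Returning 0 tells
+ * frontswap the page is cached; returning -1 makes the caller fall back
+ * to writing the page to the swap device.
+ */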
+static int tswap_frontswap_store(unsigned type, pgoff_t offset,
+				 struct page *page)
+{
+	swp_entry_t entry = swp_entry(type, offset);
+	struct page *cache_page;
+	int err = 0;
+
+	if (!tswap_active)
+		return -1;
+
+	cache_page = tswap_lookup_page(entry);
+	if (cache_page)
+		goto copy;
+
+	if (!(current->flags & PF_MEMCG_RECLAIM))
+		return -1;
+
+	cache_page = alloc_page(TSWAP_GFP_MASK | __GFP_HIGHMEM);
+	if (!cache_page)
+		return -1;
+
+	err = tswap_insert_page(entry, cache_page);
+	if (err) {
+		/*
+		 * Frontswap stores proceed under the page lock, so this can
+		 * only fail with ENOMEM.
+		 */
+		BUG_ON(err == -EEXIST);
+		put_page(cache_page);
+		return -1;
+	}
+copy:
+	copy_highpage(cache_page, page);
+	return 0;
+}
+
+static int tswap_frontswap_load(unsigned type, pgoff_t offset,
+				struct page *page)
+{
+	struct page *cache_page;
+
+	cache_page = tswap_delete_page(swp_entry(type, offset), NULL);
+	if (!cache_page)
+		return -1;
+
+	copy_highpage(page, cache_page);
+	put_page(cache_page);
+	return 0;
+}
+
+static void tswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
+{
+	struct page *cache_page;
+
+	cache_page = tswap_delete_page(swp_entry(type, offset), NULL);
+	if (cache_page)
+		put_page(cache_page);
+}
+
+static void tswap_frontswap_invalidate_area(unsigned type)
+{
+	/*
+	 * This function is called on swapoff after all swap entries of the
+	 * given type have been freed and therefore all frontswap pages have
+	 * been invalidated, so there is nothing to do here.
+	 */
+}
+
+static struct frontswap_ops tswap_frontswap_ops = {
+	.init = tswap_frontswap_init,
+	.store = tswap_frontswap_store,
+	.load = tswap_frontswap_load,
+	.invalidate_page = tswap_frontswap_invalidate_page,
+	.invalidate_area = tswap_frontswap_invalidate_area,
+};
+
+static int __init tswap_lru_init(void)
+{
+	int i;
+
+	tswap_lru_node = kcalloc(nr_node_ids, sizeof(*tswap_lru_node),
+				 GFP_KERNEL);
+	if (!tswap_lru_node)
+		return -ENOMEM;
+
+	for (i = 0; i < nr_node_ids; i++)
+		INIT_LIST_HEAD(&tswap_lru_node[i].list);
+	return 0;
+}
+
+static int __init tswap_init(void)
+{
+	int err;
+	struct frontswap_ops *old_ops;
+
+	if (!tswap_enabled)
+		return 0;
+
+	err = tswap_lru_init();
+	if (err)
+		goto out_fail;
+
+	err = register_shrinker(&tswap_shrinker);
+	if (err)
+		goto out_free_lru;
+
+	frontswap_tmem_exclusive_gets(true);
+
+	old_ops = frontswap_register_ops(&tswap_frontswap_ops);
+	pr_info("tswap loaded\n");
+	if (old_ops)
+		pr_warn("tswap: frontswap_ops %p overridden\n", old_ops);
+
+	return 0;
+
+out_free_lru:
+	kfree(tswap_lru_node);
+out_fail:
+	return err;
+}
+module_init(tswap_init);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("Transcendent swap cache");
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -24,6 +24,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 			    unsigned long src_addr,
 			    struct page **pagep)
 {
+	struct mem_cgroup *memcg;
 	pte_t _dst_pte, *dst_pte;
 	spinlock_t *ptl;
 	void *page_kaddr;
@@ -62,7 +63,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 	__SetPageUptodate(page);
 
 	ret = -ENOMEM;
-	if (mem_cgroup_newpage_charge(page, dst_mm, GFP_KERNEL))
+	if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg))
 		goto out_release;
 
 	_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
@@ -76,6 +77,8 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
 
 	inc_mm_counter(dst_mm, MM_ANONPAGES);
 	page_add_new_anon_rmap(page, dst_vma, dst_addr);
+	mem_cgroup_commit_charge(page, memcg, false);
+	lru_cache_add_active_or_unevictable(page, dst_vma);
 
 	set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
 
@@ -88,7 +91,7 @@ out:
 	return ret;
 out_release_uncharge_unlock:
 	pte_unmap_unlock(dst_pte, ptl);
-	mem_cgroup_uncharge_page(page);
+	mem_cgroup_cancel_charge(page, memcg);
 out_release:
 	page_cache_release(page);
 	goto out;
--- a/mm/util.c
+++ b/mm/util.c
@@ -16,9 +16,6 @@
 
 #include "internal.h"
 
-#define CREATE_TRACE_POINTS
-#include <trace/events/kmem.h>
-
 static inline int is_kernel_rodata(unsigned long addr)
 {
 	return addr >= (unsigned long)__start_rodata &&
@@ -149,97 +146,6 @@ void *memdup_user(const void __user *src, size_t len)
 }
 EXPORT_SYMBOL(memdup_user);
 
-static __always_inline void *__do_krealloc(const void *p, size_t new_size,
-					   gfp_t flags)
-{
-	void *ret;
-	size_t ks = 0;
-
-	if (p)
-		ks = ksize(p);
-
-	if (ks >= new_size)
-		return (void *)p;
-
-	ret = kmalloc_track_caller(new_size, flags);
-	if (ret && p)
-		memcpy(ret, p, ks);
-
-	return ret;
-}
-
-/**
- * __krealloc - like krealloc() but don't free @p.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * This function is like krealloc() except it never frees the originally
- * allocated buffer. Use this if you don't want to free the buffer immediately
- * like, for example, with RCU.
- */
-void *__krealloc(const void *p, size_t new_size, gfp_t flags)
-{
-	if (unlikely(!new_size))
-		return ZERO_SIZE_PTR;
-
-	return __do_krealloc(p, new_size, flags);
-
-}
-EXPORT_SYMBOL(__krealloc);
-
-/**
- * krealloc - reallocate memory. The contents will remain unchanged.
- * @p: object to reallocate memory for.
- * @new_size: how many bytes of memory are required.
- * @flags: the type of memory to allocate.
- *
- * The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes.  If @p is %NULL, krealloc()
- * behaves exactly like kmalloc().  If @new_size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
- */
-void *krealloc(const void *p, size_t new_size, gfp_t flags)
-{
-	void *ret;
-
-	if (unlikely(!new_size)) {
-		kfree(p);
-		return ZERO_SIZE_PTR;
-	}
-
-	ret = __do_krealloc(p, new_size, flags);
-	if (ret && p != ret)
-		kfree(p);
-
-	return ret;
-}
-EXPORT_SYMBOL(krealloc);
-
-/**
- * kzfree - like kfree but zero memory
- * @p: object to free memory of
- *
- * The memory of the object @p points to is zeroed before freed.
- * If @p is %NULL, kzfree() does nothing.
- *
- * Note: this function zeroes the whole allocated buffer which can be a good
- * deal bigger than the requested buffer size passed to kmalloc(). So be
- * careful when using this function in performance sensitive code.
- */
-void kzfree(const void *p)
-{
-	size_t ks;
-	void *mem = (void *)p;
-
-	if (unlikely(ZERO_OR_NULL_PTR(mem)))
-		return;
-	ks = ksize(mem);
-	memset(mem, 0, ks);
-	kfree(mem);
-}
-EXPORT_SYMBOL(kzfree);
-
 /*
  * strndup_user - duplicate an existing string from user space
  * @s: The string to duplicate
@@ -425,6 +331,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
 
 	ret = security_mmap_file(file, prot, flag);
 	if (!ret) {
+		/* Ugly fix for PSBM-23133 vdavydov@ */
+		if (file && file->f_op && (flag & MAP_TYPE) == MAP_SHARED &&
+		    S_ISREG(file_inode(file)->i_mode) &&
+		    (file_inode(file)->i_sb->s_type->fs_flags & FS_HAS_MMAP_PREP))
+			file->f_op->mmap(file, NULL);
 		down_write(&mm->mmap_sem);
 		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
 				    &populate);
@@ -515,12 +426,3 @@ unsigned long vm_commit_limit(void)
 
 	return allowed;
 }
-
-
-/* Tracepoints definitions. */
-EXPORT_TRACEPOINT_SYMBOL(kmalloc);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
-EXPORT_TRACEPOINT_SYMBOL(kfree);
-EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -1290,7 +1290,7 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
 int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages)
 {
 	unsigned long addr = (unsigned long)area->addr;
-	unsigned long end = addr + area->size - PAGE_SIZE;
+	unsigned long end = addr + get_vm_area_size(area);
 	int err;
 
 	err = vmap_page_range(addr, end, prot, *pages);
@@ -1361,10 +1361,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
 	if (unlikely(!area))
 		return NULL;
 
-	/*
-	 * We always allocate a guard page.
-	 */
-	size += PAGE_SIZE;
+	if (!(flags & VM_NO_GUARD))
+		size += PAGE_SIZE;
 
 	va = alloc_vmap_area(size, align, start, end, node, gfp_mask);
 	if (IS_ERR(va)) {
@@ -1466,6 +1464,7 @@ struct vm_struct *remove_vm_area(const void *addr)
 		spin_unlock(&vmap_area_lock);
 
 		vmap_debug_free_range(va->va_start, va->va_end);
+		kasan_free_shadow(vm);
 		free_unmap_vmap_area(va);
 		vm->size -= PAGE_SIZE;
 
@@ -1610,7 +1609,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 	unsigned int nr_pages, array_size, i;
 	gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
 
-	nr_pages = (area->size - PAGE_SIZE) >> PAGE_SHIFT;
+	nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
 	array_size = (nr_pages * sizeof(struct page *));
 
 	area->nr_pages = nr_pages;
@@ -1635,7 +1634,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
 		gfp_t tmp_mask = gfp_mask | __GFP_NOWARN;
 
 		if (node < 0)
-			page = alloc_page(tmp_mask);
+			page = alloc_pages(tmp_mask, order);
 		else
 			page = alloc_pages_node(node, tmp_mask, order);
 
@@ -1667,6 +1666,7 @@ fail:
  *	@end:		vm area range end
  *	@gfp_mask:	flags for the page level allocator
  *	@prot:		protection mask for the allocated pages
+ *	@vm_flags:	additional vm area flags (e.g. %VM_NO_GUARD)
  *	@node:		node to use for allocation or NUMA_NO_NODE
  *	@caller:	caller's return address
  *
@@ -1676,7 +1676,8 @@ fail:
  */
 void *__vmalloc_node_range(unsigned long size, unsigned long align,
 			unsigned long start, unsigned long end, gfp_t gfp_mask,
-			pgprot_t prot, int node, const void *caller)
+			pgprot_t prot, unsigned long vm_flags, int node,
+			const void *caller)
 {
 	struct vm_struct *area;
 	void *addr;
@@ -1686,8 +1687,8 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
 	if (!size || (size >> PAGE_SHIFT) > totalram_pages)
 		goto fail;
 
-	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
-				  start, end, node, gfp_mask, caller);
+	area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST |
+				vm_flags, start, end, node, gfp_mask, caller);
 	if (!area)
 		goto fail;
 
@@ -1736,7 +1737,7 @@ static void *__vmalloc_node(unsigned long size, unsigned long align,
 			    int node, const void *caller)
 {
 	return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
-				gfp_mask, prot, node, caller);
+				gfp_mask, prot, 0, node, caller);
 }
 
 void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
@@ -1786,6 +1787,20 @@ void *vzalloc(unsigned long size)
 }
 EXPORT_SYMBOL(vzalloc);
 
+void *vmalloc_account(unsigned long size)
+{
+	return __vmalloc_node_flags(size, NUMA_NO_NODE,
+			GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM);
+}
+EXPORT_SYMBOL(vmalloc_account);
+
+void *vzalloc_account(unsigned long size)
+{
+	return __vmalloc_node_flags(size, NUMA_NO_NODE,
+			GFP_KERNEL_ACCOUNT | __GFP_HIGHMEM | __GFP_ZERO);
+}
+EXPORT_SYMBOL(vzalloc_account);
+
 /**
  * vmalloc_user - allocate zeroed virtually contiguous memory for userspace
  * @size: allocation size
@@ -2042,7 +2057,7 @@ long vread(char *buf, char *addr, unsigned long count)
 
 		vm = va->vm;
 		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + vm->size - PAGE_SIZE)
+		if (addr >= vaddr + get_vm_area_size(vm))
 			continue;
 		while (addr < vaddr) {
 			if (count == 0)
@@ -2052,7 +2067,7 @@ long vread(char *buf, char *addr, unsigned long count)
 			addr++;
 			count--;
 		}
-		n = vaddr + vm->size - PAGE_SIZE - addr;
+		n = vaddr + get_vm_area_size(vm) - addr;
 		if (n > count)
 			n = count;
 		if (!(vm->flags & VM_IOREMAP))
@@ -2124,7 +2139,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
 
 		vm = va->vm;
 		vaddr = (char *) vm->addr;
-		if (addr >= vaddr + vm->size - PAGE_SIZE)
+		if (addr >= vaddr + get_vm_area_size(vm))
 			continue;
 		while (addr < vaddr) {
 			if (count == 0)
@@ -2133,7 +2148,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
 			addr++;
 			count--;
 		}
-		n = vaddr + vm->size - PAGE_SIZE - addr;
+		n = vaddr + get_vm_area_size(vm) - addr;
 		if (n > count)
 			n = count;
 		if (!(vm->flags & VM_IOREMAP)) {
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -50,6 +50,7 @@
 
 #include <linux/swapops.h>
 #include <linux/balloon_compaction.h>
+#include <linux/vzstat.h>
 
 #include "internal.h"
 
@@ -79,6 +80,9 @@ struct scan_control {
 	/* Can pages be swapped as part of reclaim? */
 	int may_swap;
 
+	/* Can cgroups be reclaimed below their normal consumption range? */
+	int may_thrash;
+
 	int order;
 
 	/* Scan (total_size >> priority) pages at once */
@@ -162,7 +166,7 @@ static unsigned long zone_reclaimable_pages(struct zone *zone)
 	return nr;
 }
 
-static bool zone_reclaimable(struct zone *zone)
+bool zone_reclaimable(struct zone *zone)
 {
 	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
 }
@@ -176,14 +180,31 @@ static unsigned long get_lru_size(struct lruvec *lruvec, enum lru_list lru)
 }
 
 /*
- * Add a shrinker callback to be called from the vm
+ * Add a shrinker callback to be called from the vm.
  */
-void register_shrinker(struct shrinker *shrinker)
+int register_shrinker(struct shrinker *shrinker)
 {
-	atomic_long_set(&shrinker->nr_in_batch, 0);
+	size_t size = sizeof(*shrinker->nr_deferred);
+
+	/*
+	 * If we only have one possible node in the system anyway, save
+	 * ourselves the trouble and disable NUMA aware behavior. This way we
+	 * will save memory and some small loop time later.
+	 */
+	if (nr_node_ids == 1)
+		shrinker->flags &= ~SHRINKER_NUMA_AWARE;
+
+	if (shrinker->flags & SHRINKER_NUMA_AWARE)
+		size *= nr_node_ids;
+
+	shrinker->nr_deferred = kzalloc(size, GFP_KERNEL);
+	if (!shrinker->nr_deferred)
+		return -ENOMEM;
+
 	down_write(&shrinker_rwsem);
 	list_add_tail(&shrinker->list, &shrinker_list);
 	up_write(&shrinker_rwsem);
+	return 0;
 }
 EXPORT_SYMBOL(register_shrinker);
 
@@ -195,159 +216,212 @@ void unregister_shrinker(struct shrinker *shrinker)
 	down_write(&shrinker_rwsem);
 	list_del(&shrinker->list);
 	up_write(&shrinker_rwsem);
+	kfree(shrinker->nr_deferred);
 }
 EXPORT_SYMBOL(unregister_shrinker);
 
-static inline int do_shrinker_shrink(struct shrinker *shrinker,
-				     struct shrink_control *sc,
-				     unsigned long nr_to_scan)
-{
-	int objects;
-	sc->nr_to_scan = nr_to_scan;
-	objects = (*shrinker->shrink)(shrinker, sc);
+#define SHRINK_BATCH 128
+
+static unsigned long do_shrink_slab(struct shrink_control *shrinkctl,
+				    struct shrinker *shrinker,
+				    unsigned long nr_scanned,
+				    unsigned long nr_eligible)
+{
+	unsigned long freed = 0;
+	unsigned long long delta;
+	long total_scan;
+	long max_pass;
+	long nr;
+	long new_nr;
+	int nid = shrinkctl->nid;
+	long batch_size = shrinker->batch ? shrinker->batch
+					  : SHRINK_BATCH;
+
+	max_pass = shrinker->count_objects(shrinker, shrinkctl);
+	if (max_pass == 0)
+		return 0;
+
+	/*
+	 * copy the current shrinker scan count into a local variable
+	 * and zero it so that other concurrent shrinker invocations
+	 * don't also do this scanning work.
+	 */
+	nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0);
+
+	total_scan = nr;
+	delta = nr_scanned / shrinker->seeks;
+	delta *= max_pass;
+	do_div(delta, nr_eligible + 1);
+	total_scan += delta;
+	if (total_scan < 0) {
+		printk(KERN_ERR
+		"shrink_slab: %pF negative objects to delete nr=%ld\n",
+		       shrinker->scan_objects, total_scan);
+		total_scan = max_pass;
+	}
+
+	/*
+	 * We need to avoid excessive windup on filesystem shrinkers
+	 * due to large numbers of GFP_NOFS allocations causing the
+	 * shrinkers to return -1 all the time. This results in a large
+	 * nr being built up so when a shrink that can do some work
+	 * comes along it empties the entire cache due to nr >>>
+	 * max_pass.  This is bad for sustaining a working set in
+	 * memory.
+	 *
+	 * Hence only allow the shrinker to scan the entire cache when
+	 * a large delta change is calculated directly.
+	 */
+	if (delta < max_pass / 4)
+		total_scan = min(total_scan, max_pass / 2);
+
+	/*
+	 * Avoid risking looping forever due to too large a nr value:
+	 * never try to free more than twice the estimated number of
+	 * freeable entries.
+	 */
+	if (total_scan > max_pass * 2)
+		total_scan = max_pass * 2;
+
+	trace_mm_shrink_slab_start(shrinker, shrinkctl, nr,
+				   nr_scanned, nr_eligible,
+				   max_pass, delta, total_scan);
+
+	while (total_scan >= batch_size) {
+		unsigned long ret;
+
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+			break;
+
+		shrinkctl->nr_to_scan = batch_size;
+		ret = shrinker->scan_objects(shrinker, shrinkctl);
+		if (ret == SHRINK_STOP)
+			break;
+		freed += ret;
+
+		count_vm_events(SLABS_SCANNED, batch_size);
+		total_scan -= batch_size;
+
+		cond_resched();
+	}
+
 	/*
-	 * A shrinker can legitimately return -1 meaning that it cannot do
-	 * much work without a risk of deadlock.
-	 * However, in some extreme cases, specially when there is abusive
-	 * usage of vm.vfs_cache_pressure, a shrinker might return a negative
-	 * value indicating that its integer return value has overflown.
-	 * In such cases, we just go ahead and cap the return val to INT_MAX.
+	 * move the unused scan count back into the shrinker in a
+	 * manner that handles concurrent updates. If we exhausted the
+	 * scan, there is no need to do an update.
 	 */
-	if (objects < -1)
-		return INT_MAX;
+	if (total_scan > 0)
+		new_nr = atomic_long_add_return(total_scan,
+						&shrinker->nr_deferred[nid]);
+	else
+		new_nr = atomic_long_read(&shrinker->nr_deferred[nid]);
 
-	return objects;
+	trace_mm_shrink_slab_end(shrinker, freed, nr, new_nr);
+	return freed;
 }
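+
+/*
+ * Example: with seeks == DEFAULT_SEEKS == 2, scanning 100 of 1000
+ * eligible LRU pages gives delta = (100 / 2) * max_pass / 1001, i.e.
+ * about 5% of the cache is scanned for a 10% LRU scan, reflecting the
+ * recreation cost encoded in ->seeks.
+ */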
 
-#define SHRINK_BATCH 128
-/*
- * Call the shrink functions to age shrinkable caches
+/**
+ * shrink_slab - shrink slab caches
+ * @gfp_mask: allocation context
+ * @nid: node whose slab caches to target
+ * @memcg: memory cgroup whose slab caches to target
+ * @nr_scanned: pressure numerator
+ * @nr_eligible: pressure denominator
  *
- * Here we assume it costs one seek to replace a lru page and that it also
- * takes a seek to recreate a cache object.  With this in mind we age equal
- * percentages of the lru and ageable caches.  This should balance the seeks
- * generated by these structures.
+ * Call the shrink functions to age shrinkable caches.
  *
- * If the vm encountered mapped pages on the LRU it increase the pressure on
- * slab to avoid swapping.
+ * @nid is passed along to shrinkers with SHRINKER_NUMA_AWARE set,
+ * unaware shrinkers will receive a node id of 0 instead.
  *
- * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
+ * @memcg specifies the memory cgroup to target. If it is not NULL,
+ * only shrinkers with SHRINKER_MEMCG_AWARE set will be called to scan
+ * objects from the memory cgroup specified. Otherwise all shrinkers
+ * are called, and memcg aware shrinkers are supposed to scan the
+ * global list then.
  *
- * `lru_pages' represents the number of on-LRU pages in all the zones which
- * are eligible for the caller's allocation attempt.  It is used for balancing
- * slab reclaim versus page reclaim.
+ * @nr_scanned and @nr_eligible form a ratio that indicate how much of
+ * the available objects should be scanned.  Page reclaim for example
+ * passes the number of pages scanned and the number of pages on the
+ * LRU lists that it considered on @nid, plus a bias in @nr_scanned
+ * when it encountered mapped pages.  The ratio is further biased by
+ * the ->seeks setting of the shrink function, which indicates the
+ * cost to recreate an object relative to that of an LRU page.
  *
- * Returns the number of slab objects which we shrunk.
+ * Returns the number of reclaimed slab objects.
  */
-unsigned long shrink_slab(struct shrink_control *shrink,
-			  unsigned long nr_pages_scanned,
-			  unsigned long lru_pages)
+static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
+				 struct mem_cgroup *memcg,
+				 unsigned long nr_scanned,
+				 unsigned long nr_eligible,
+				 bool for_drop_caches)
 {
 	struct shrinker *shrinker;
-	unsigned long ret = 0;
+	unsigned long freed = 0;
+
+	if (memcg && !memcg_kmem_is_active(memcg))
+		return 0;
+
+	if (nr_scanned == 0)
+		nr_scanned = SWAP_CLUSTER_MAX;
 
-	if (nr_pages_scanned == 0)
-		nr_pages_scanned = SWAP_CLUSTER_MAX;
+	if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+		return 0;
 
 	if (!down_read_trylock(&shrinker_rwsem)) {
-		/* Assume we'll be able to shrink next time */
-		ret = 1;
+		/*
+		 * If we would return 0, our callers would understand that we
+		 * have nothing else to shrink and give up trying. By returning
+		 * 1 we keep it going and assume we'll be able to shrink next
+		 * time.
+		 */
+		freed = 1;
 		goto out;
 	}
 
 	list_for_each_entry(shrinker, &shrinker_list, list) {
-		unsigned long long delta;
-		long total_scan;
-		long max_pass;
-		int shrink_ret = 0;
-		long nr;
-		long new_nr;
-		long batch_size = shrinker->batch ? shrinker->batch
-						  : SHRINK_BATCH;
-
-		max_pass = do_shrinker_shrink(shrinker, shrink, 0);
-		if (max_pass <= 0)
-			continue;
+		struct shrink_control sc = {
+			.gfp_mask = gfp_mask,
+			.nid = nid,
+			.memcg = memcg,
+			.for_drop_caches = for_drop_caches,
+		};
 
-		/*
-		 * copy the current shrinker scan count into a local variable
-		 * and zero it so that other concurrent shrinker invocations
-		 * don't also do this scanning work.
-		 */
-		nr = atomic_long_xchg(&shrinker->nr_in_batch, 0);
-
-		total_scan = nr;
-		delta = (4 * nr_pages_scanned) / shrinker->seeks;
-		delta *= max_pass;
-		do_div(delta, lru_pages + 1);
-		total_scan += delta;
-		if (total_scan < 0) {
-			printk(KERN_ERR "shrink_slab: %pF negative objects to "
-			       "delete nr=%ld\n",
-			       shrinker->shrink, total_scan);
-			total_scan = max_pass;
-		}
+		if (memcg && !(shrinker->flags & SHRINKER_MEMCG_AWARE))
+			continue;
 
-		/*
-		 * We need to avoid excessive windup on filesystem shrinkers
-		 * due to large numbers of GFP_NOFS allocations causing the
-		 * shrinkers to return -1 all the time. This results in a large
-		 * nr being built up so when a shrink that can do some work
-		 * comes along it empties the entire cache due to nr >>>
-		 * max_pass.  This is bad for sustaining a working set in
-		 * memory.
-		 *
-		 * Hence only allow the shrinker to scan the entire cache when
-		 * a large delta change is calculated directly.
-		 */
-		if (delta < max_pass / 4)
-			total_scan = min(total_scan, max_pass / 2);
+		if (!(shrinker->flags & SHRINKER_NUMA_AWARE))
+			sc.nid = 0;
 
-		/*
-		 * Avoid risking looping forever due to too large nr value:
-		 * never try to free more than twice the estimate number of
-		 * freeable entries.
-		 */
-		if (total_scan > max_pass * 2)
-			total_scan = max_pass * 2;
+		freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible);
+	}
 
-		trace_mm_shrink_slab_start(shrinker, shrink, nr,
-					nr_pages_scanned, lru_pages,
-					max_pass, delta, total_scan);
+	up_read(&shrinker_rwsem);
+out:
+	cond_resched();
+	return freed;
+}
 
-		while (total_scan >= batch_size) {
-			int nr_before;
+void drop_slab_node(int nid)
+{
+	unsigned long freed;
 
-			nr_before = do_shrinker_shrink(shrinker, shrink, 0);
-			shrink_ret = do_shrinker_shrink(shrinker, shrink,
-							batch_size);
-			if (shrink_ret == -1)
-				break;
-			if (shrink_ret < nr_before)
-				ret += nr_before - shrink_ret;
-			count_vm_events(SLABS_SCANNED, batch_size);
-			total_scan -= batch_size;
+	do {
+		struct mem_cgroup *memcg = NULL;
 
-			cond_resched();
-		}
+		freed = 0;
+		do {
+			freed += shrink_slab(GFP_KERNEL, nid, memcg,
+					     1000, 1000, true);
+		} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+	} while (freed > 10);
+}
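+
+/*
+ * The 1000/1000 scanned/eligible ratio asks each shrinker to scan on
+ * the order of its whole object count per pass; passes repeat until a
+ * pass frees no more than 10 objects.
+ */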
 
-		/*
-		 * move the unused scan count back into the shrinker in a
-		 * manner that handles concurrent updates. If we exhausted the
-		 * scan, there is no need to do an update.
-		 */
-		if (total_scan > 0)
-			new_nr = atomic_long_add_return(total_scan,
-					&shrinker->nr_in_batch);
-		else
-			new_nr = atomic_long_read(&shrinker->nr_in_batch);
+void drop_slab(void)
+{
+	int nid;
 
-		trace_mm_shrink_slab_end(shrinker, shrink_ret, nr, new_nr);
-	}
-	up_read(&shrinker_rwsem);
-out:
-	cond_resched();
-	return ret;
+	for_each_online_node(nid)
+		drop_slab_node(nid);
 }
 
 static inline int is_page_cache_freeable(struct page *page)
@@ -526,9 +600,10 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
+		mem_cgroup_swapout(page, swap);
 		__delete_from_swap_cache(page);
 		spin_unlock_irq(&mapping->tree_lock);
-		swapcache_free(swap, page);
+		swapcache_free(swap);
 	} else {
 		void (*freepage)(struct page *);
 		void *shadow = NULL;
@@ -555,7 +630,6 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 			shadow = workingset_eviction(mapping, page);
 		__delete_from_page_cache(page, shadow);
 		spin_unlock_irq(&mapping->tree_lock);
-		mem_cgroup_uncharge_cache_page(page);
 
 		if (freepage != NULL)
 			freepage(page);
@@ -777,7 +851,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 	cond_resched();
 
-	mem_cgroup_uncharge_start();
 	while (!list_empty(page_list)) {
 		struct address_space *mapping;
 		struct page *page;
@@ -850,27 +923,23 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		 *    caller can stall after page list has been processed.
 		 *
 		 * 2) Global reclaim encounters a page, memcg encounters a
-		 *    page that is not marked for immediate reclaim or
-		 *    the caller does not have __GFP_IO. In this case mark
-		 *    the page for immediate reclaim and continue scanning.
+		 *    page that is not marked for immediate reclaim, or the
+		 *    caller does not have __GFP_FS (or __GFP_IO if it's simply
+		 *    going to swap, not to fs). In this case mark the page for
+		 *    immediate reclaim and continue scanning.
 		 *
-		 *    __GFP_IO is checked  because a loop driver thread might
+		 *    Require may_enter_fs because we would wait on fs, which
+		 *    may not have submitted IO yet. And the loop driver might
 		 *    enter reclaim, and deadlock if it waits on a page for
 		 *    which it is needed to do the write (loop masks off
 		 *    __GFP_IO|__GFP_FS for this reason); but more thought
 		 *    would probably show more reasons.
 		 *
-		 *    Don't require __GFP_FS, since we're not going into the
-		 *    FS, just waiting on its writeback completion. Worryingly,
-		 *    ext4 gfs2 and xfs allocate pages with
-		 *    grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing
-		 *    may_enter_fs here is liable to OOM on them.
-		 *
-		 * 3) memcg encounters a page that is not already marked
+		 * 3) memcg encounters a page that is already marked
 		 *    PageReclaim. memcg does not have any dirty pages
 		 *    throttling so we could easily OOM just because too many
 		 *    pages are in writeback and there is nothing else to
-		 *    reclaim. Wait for the writeback to complete.
+		 *    reclaim. Stall memcg reclaim then.
 		 */
 		if (PageWriteback(page)) {
 			/* Case 1 above */
@@ -882,7 +951,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 			/* Case 2 above */
 			} else if (global_reclaim(sc) ||
-			    !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) {
+			    !PageReclaim(page) || !may_enter_fs) {
 				/*
 				 * This is slightly racy - end_page_writeback()
 				 * might have just cleared PageReclaim, then
@@ -891,7 +960,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 				 * enough to care.  What we do want is for this
 				 * page to have PageReclaim set next time memcg
 				 * reclaim reaches the tests above, so it will
-				 * then wait_on_page_writeback() to avoid OOM;
+				 * then stall to avoid OOM;
 				 * and it's also appropriate in global reclaim.
 				 */
 				SetPageReclaim(page);
@@ -901,7 +970,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 
 			/* Case 3 above */
 			} else {
-				wait_on_page_writeback(page);
+				nr_immediate++;
+				goto keep_locked;
 			}
 		}
 
@@ -1094,12 +1164,13 @@ keep:
 		VM_BUG_ON_PAGE(PageLRU(page) || PageUnevictable(page), page);
 	}
 
+	mem_cgroup_uncharge_list(&free_pages);
 	try_to_unmap_flush();
 	free_hot_cold_page_list(&free_pages, true);
 
 	list_splice(&ret_pages, page_list);
 	count_vm_events(PGACTIVATE, pgactivate);
-	mem_cgroup_uncharge_end();
+
 	*ret_nr_dirty += nr_dirty;
 	*ret_nr_congested += nr_congested;
 	*ret_nr_unqueued_dirty += nr_unqueued_dirty;
@@ -1239,7 +1310,8 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
 	unsigned long nr_taken = 0;
 	unsigned long scan;
 
-	for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+	for (scan = 0; scan < nr_to_scan && nr_taken < nr_to_scan &&
+					!list_empty(src); scan++) {
 		struct page *page;
 		int nr_pages;
 
@@ -1321,6 +1393,32 @@ int isolate_lru_page(struct page *page)
 	return ret;
 }
 
+static int __too_many_isolated(struct zone *zone, int file,
+			       struct scan_control *sc, int safe)
+{
+	unsigned long inactive, isolated;
+
+	if (safe) {
+		inactive = zone_page_state_snapshot(zone,
+				NR_INACTIVE_ANON + 2 * file);
+		isolated = zone_page_state_snapshot(zone,
+				NR_ISOLATED_ANON + file);
+	} else {
+		inactive = zone_page_state(zone, NR_INACTIVE_ANON + 2 * file);
+		isolated = zone_page_state(zone, NR_ISOLATED_ANON + file);
+	}
+
+	/*
+	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
+	 * won't get blocked by normal direct-reclaimers, forming a circular
+	 * deadlock.
+	 */
+	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
+		inactive >>= 3;
+
+	return isolated > inactive;
+}
+
 /*
  * A direct reclaimer may isolate SWAP_CLUSTER_MAX pages from the LRU list and
  * then get resheduled. When there are massive number of tasks doing page
@@ -1329,33 +1427,24 @@ int isolate_lru_page(struct page *page)
  * unnecessary swapping, thrashing and OOM.
  */
 static int too_many_isolated(struct zone *zone, int file,
-		struct scan_control *sc)
+			     struct scan_control *sc)
 {
-	unsigned long inactive, isolated;
-
 	if (current_is_kswapd())
 		return 0;
 
 	if (!global_reclaim(sc))
 		return 0;
 
-	if (file) {
-		inactive = zone_page_state(zone, NR_INACTIVE_FILE);
-		isolated = zone_page_state(zone, NR_ISOLATED_FILE);
-	} else {
-		inactive = zone_page_state(zone, NR_INACTIVE_ANON);
-		isolated = zone_page_state(zone, NR_ISOLATED_ANON);
-	}
-
 	/*
-	 * GFP_NOIO/GFP_NOFS callers are allowed to isolate more pages, so they
-	 * won't get blocked by normal direct-reclaimers, forming a circular
-	 * deadlock.
+	 * __too_many_isolated(safe=0) is fast but inaccurate, because it
+	 * doesn't account for the vm_stat_diff[] counters.  So if it looks
+	 * like too_many_isolated() is about to return true, fall back to the
+	 * slower, more accurate zone_page_state_snapshot().
 	 */
-	if ((sc->gfp_mask & GFP_IOFS) == GFP_IOFS)
-		inactive >>= 3;
+	if (unlikely(__too_many_isolated(zone, file, sc, 0)))
+		return __too_many_isolated(zone, file, sc, 1);
 
-	return isolated > inactive;
+	return 0;
 }
 
 static noinline_for_stack void
@@ -1399,6 +1488,7 @@ putback_inactive_pages(struct lruvec *lruvec, struct list_head *page_list)
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&zone->lru_lock);
+				mem_cgroup_uncharge(page);
 				(*get_compound_page_dtor(page))(page);
 				spin_lock_irq(&zone->lru_lock);
 			} else
@@ -1493,6 +1583,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 
 	spin_unlock_irq(&zone->lru_lock);
 
+	mem_cgroup_uncharge_list(&page_list);
 	free_hot_cold_page_list(&page_list, true);
 
 	/*
@@ -1512,10 +1603,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
 	if (nr_writeback && nr_writeback == nr_taken)
 		zone_set_flag(zone, ZONE_WRITEBACK);
 
-	/*
-	 * memcg will stall in page writeback so only consider forcibly
-	 * stalling for global reclaim
-	 */
+	if (!global_reclaim(sc) && nr_immediate)
+		congestion_wait(BLK_RW_ASYNC, HZ/10);
+
 	if (global_reclaim(sc)) {
 		/*
 		 * Tag a zone as congested if all the dirty pages scanned were
@@ -1607,6 +1697,7 @@ static void move_active_pages_to_lru(struct lruvec *lruvec,
 
 			if (unlikely(PageCompound(page))) {
 				spin_unlock_irq(&zone->lru_lock);
+				mem_cgroup_uncharge(page);
 				(*get_compound_page_dtor(page))(page);
 				spin_lock_irq(&zone->lru_lock);
 			} else
@@ -1636,6 +1727,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	int file = is_file_lru(lru);
 	struct zone *zone = lruvec_zone(lruvec);
 
+	KSTAT_PERF_ENTER(refill_inact);
+
 	lru_add_drain();
 
 	if (!sc->may_unmap)
@@ -1714,7 +1807,10 @@ static void shrink_active_list(unsigned long nr_to_scan,
 	__mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken);
 	spin_unlock_irq(&zone->lru_lock);
 
+	mem_cgroup_uncharge_list(&l_hold);
 	free_hot_cold_page_list(&l_hold, true);
+
+	KSTAT_PERF_LEAVE(refill_inact);
 }
 
 #ifdef CONFIG_SWAP
@@ -1811,6 +1907,51 @@ static int vmscan_swappiness(struct scan_control *sc)
 	return mem_cgroup_swappiness(sc->target_mem_cgroup);
 }
 
+#ifdef CONFIG_MEMCG
+int sysctl_force_scan_thresh = 50;
+
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return zone->force_scan;
+}
+
+static void zone_update_force_scan(struct zone *zone)
+{
+	struct mem_cgroup *memcg;
+	int tiny, total;
+
+	tiny = total = 0;
+
+	memcg = mem_cgroup_iter(NULL, NULL, NULL);
+	do {
+		struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+		unsigned long size;
+
+		size = max(get_lru_size(lruvec, LRU_ACTIVE_FILE),
+			   get_lru_size(lruvec, LRU_INACTIVE_FILE));
+		if (get_nr_swap_pages() > 0)
+			size = max3(size,
+				    get_lru_size(lruvec, LRU_ACTIVE_ANON),
+				    get_lru_size(lruvec, LRU_INACTIVE_ANON));
+
+		if (size && size >> DEF_PRIORITY == 0)
+			tiny++;
+		total++;
+	} while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
+
+	zone->force_scan = tiny * 100 > total * sysctl_force_scan_thresh;
+}
+#else
+static inline bool zone_force_scan(struct zone *zone)
+{
+	return false;
+}
+
+static inline void zone_update_force_scan(struct zone *zone)
+{
+}
+#endif
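
A memcg counts as "tiny" here when its largest evictable LRU is nonempty
but vanishes when shifted right by DEF_PRIORITY (12), i.e. it holds fewer
than 1 << 12 = 4096 pages (16 MB with 4 KB pages) and so would contribute
no scan target at the default priority.  The cross-multiplied comparison
avoids a division: with the default sysctl_force_scan_thresh of 50, 26
tiny memcgs out of 50 set force_scan (26 * 100 = 2600 > 50 * 50 = 2500),
while 25 out of 50 (2500 > 2500) do not.
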
+
 enum scan_balance {
 	SCAN_EQUAL,
 	SCAN_FRACT,
@@ -1828,7 +1969,7 @@ enum scan_balance {
  * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan
  */
 static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
-			   unsigned long *nr)
+			   unsigned long *nr, unsigned long *lru_pages)
 {
 	struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
 	u64 fraction[2];
@@ -1851,10 +1992,12 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * latencies, so it's better to scan a minimum amount there as
 	 * well.
 	 */
-	if (current_is_kswapd() && zone->all_unreclaimable)
+	if (current_is_kswapd() && !zone_reclaimable(zone))
 		force_scan = true;
 	if (!global_reclaim(sc))
 		force_scan = true;
+	if (zone_force_scan(zone))
+		force_scan = true;
 
 	/* If we have no swap space, do not bother scanning anon pages. */
 	if (!sc->may_swap || (get_nr_swap_pages() <= 0)) {
@@ -1914,7 +2057,8 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	 * There is enough inactive page cache, do not reclaim
 	 * anything from the anonymous working set right now.
 	 */
-	if (!inactive_file_is_low(lruvec)) {
+	if (!inactive_file_is_low(lruvec) &&
+	    get_lru_size(lruvec, LRU_INACTIVE_FILE) >> sc->priority > 0) {
 		scan_balance = SCAN_FILE;
 		goto out;
 	}
@@ -1966,6 +2110,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
 	fraction[1] = fp;
 	denominator = ap + fp + 1;
 out:
+	*lru_pages = 0;
 	for_each_evictable_lru(lru) {
 		int file = is_file_lru(lru);
 		unsigned long size;
@@ -1991,13 +2136,17 @@ out:
 		case SCAN_FILE:
 		case SCAN_ANON:
 			/* Scan one type exclusively */
-			if ((scan_balance == SCAN_FILE) != file)
+			if ((scan_balance == SCAN_FILE) != file) {
+				size = 0;
 				scan = 0;
+			}
 			break;
 		default:
 			/* Look ma, no brain */
 			BUG();
 		}
+
+		*lru_pages += size;
 		nr[lru] = scan;
 	}
 }
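
The new *lru_pages out-parameter sums only the LRUs that remain eligible
after the scan_balance decision (size is zeroed together with scan for
the excluded type), so the slab-pressure ratio later computed in
shrink_zone() sees the same page population that LRU reclaim will
actually scan.  For the SCAN_FRACT case the per-LRU goal has roughly the
following shape (assumed; the exact statement sits in the elided part of
the loop above):

	scan = size >> sc->priority;
	scan = div64_u64(scan * fraction[file], denominator);
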
@@ -2022,7 +2171,8 @@ static inline void init_tlb_ubc(void)
 /*
  * This is a basic per-zone page freer.  Used by both kswapd and direct reclaim.
  */
-static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
+static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc,
+			  unsigned long *lru_pages)
 {
 	unsigned long nr[NR_LRU_LISTS];
 	unsigned long targets[NR_LRU_LISTS];
@@ -2033,7 +2183,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
 	struct blk_plug plug;
 	bool scan_adjusted;
 
-	get_scan_count(lruvec, sc, nr);
+	get_scan_count(lruvec, sc, nr, lru_pages);
 
 	/* Record the original scan target for proportional adjustments later */
 	memcpy(targets, nr, sizeof(nr));
@@ -2213,9 +2363,17 @@ static inline bool should_continue_reclaim(struct zone *zone,
 	}
 }
 
-static void shrink_zone(struct zone *zone, struct scan_control *sc)
+static void shrink_zone(struct zone *zone, struct scan_control *sc,
+			bool is_classzone)
 {
+	struct reclaim_state *reclaim_state = current->reclaim_state;
 	unsigned long nr_reclaimed, nr_scanned;
+	gfp_t slab_gfp = sc->gfp_mask;
+
+	/* Disable fs-related IO for direct reclaim */
+	if (!sc->target_mem_cgroup &&
+	    (current->flags & (PF_MEMALLOC|PF_KSWAPD)) == PF_MEMALLOC)
+		slab_gfp &= ~__GFP_FS;
 
 	do {
 		struct mem_cgroup *root = sc->target_mem_cgroup;
@@ -2223,6 +2381,7 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 			.zone = zone,
 			.priority = sc->priority,
 		};
+		unsigned long zone_lru_pages = 0;
 		struct mem_cgroup *memcg;
 
 		nr_reclaimed = sc->nr_reclaimed;
@@ -2230,11 +2389,22 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 
 		memcg = mem_cgroup_iter(root, NULL, &reclaim);
 		do {
+			unsigned long lru_pages, scanned;
 			struct lruvec *lruvec;
 
+			if (!sc->may_thrash && mem_cgroup_low(root, memcg))
+				continue;
+
 			lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+			scanned = sc->nr_scanned;
+
+			shrink_lruvec(lruvec, sc, &lru_pages);
+			zone_lru_pages += lru_pages;
 
-			shrink_lruvec(lruvec, sc);
+			if (memcg && is_classzone)
+				shrink_slab(slab_gfp, zone_to_nid(zone),
+					    memcg, sc->nr_scanned - scanned,
+					    lru_pages, false);
 
 			/*
 			 * Direct reclaim and kswapd have to scan all memory
@@ -2251,8 +2421,40 @@ static void shrink_zone(struct zone *zone, struct scan_control *sc)
 				mem_cgroup_iter_break(root, memcg);
 				break;
 			}
-			memcg = mem_cgroup_iter(root, memcg, &reclaim);
-		} while (memcg);
+		} while ((memcg = mem_cgroup_iter(root, memcg, &reclaim)));
+
+		/*
+		 * Shrink the slab caches in the same proportion that
+		 * the eligible LRU pages were scanned.
+		 */
+		if (global_reclaim(sc) && is_classzone) {
+			unsigned long scanned, eligible;
+
+			scanned = sc->nr_scanned - nr_scanned;
+			eligible = zone_lru_pages;
+
+			/*
+			 * If most processes reside in memory cgroups protected
+			 * with memory.low there won't be a lot of user pages
+			 * in the root lruvec so that the lru scanned/eligible
+			 * ratio can get high even on the default scan
+			 * priority. In order not to subject memcg unaware slab
+			 * caches to disproportionately high pressure, we forge
+			 * the ratio in this case.
+			 */
+			if (eligible >> sc->priority == 0) {
+				scanned = 1000;
+				eligible = 1000 << sc->priority;
+			}
+
+			shrink_slab(slab_gfp, zone_to_nid(zone), NULL,
+				    scanned, eligible, false);
+		}
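
The slab pressure applied here is proportional to scanned/eligible: if a
pass scanned, say, 512 of 65536 eligible LRU pages, shrinkers see about
1/128 of the nominal pressure.  The forged fallback keeps that ratio
bounded; once eligible >> sc->priority == 0, substituting scanned = 1000
and eligible = 1000 << sc->priority pins the ratio at exactly
1/2^priority instead of letting a tiny denominator inflate it.
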
+
+		if (reclaim_state) {
+			sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+			reclaim_state->reclaimed_slab = 0;
+		}
 
 		vmpressure(sc->gfp_mask, sc->target_mem_cgroup,
 			   sc->nr_scanned - nr_scanned,
@@ -2325,6 +2527,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 	struct zone *zone;
 	unsigned long nr_soft_reclaimed;
 	unsigned long nr_soft_scanned;
+	enum zone_type requested_highidx = gfp_zone(sc->gfp_mask);
 	bool aborted_reclaim = false;
 
 	/*
@@ -2336,9 +2539,17 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		sc->gfp_mask |= __GFP_HIGHMEM;
 
 	for_each_zone_zonelist_nodemask(zone, z, zonelist,
-					gfp_zone(sc->gfp_mask), sc->nodemask) {
+					requested_highidx, sc->nodemask) {
+		enum zone_type classzone_idx;
+
 		if (!populated_zone(zone))
 			continue;
+
+		classzone_idx = requested_highidx;
+		while (!populated_zone(zone->zone_pgdat->node_zones +
+							classzone_idx))
+			classzone_idx--;
+
 		/*
 		 * Take care that memory controller reclaim has only a small
 		 * influence on the global LRU.
@@ -2346,8 +2557,8 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 		if (global_reclaim(sc)) {
 			if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 				continue;
-			if (zone->all_unreclaimable &&
-					sc->priority != DEF_PRIORITY)
+			if (sc->priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;	/* Let kswapd poll it */
 			if (IS_ENABLED(CONFIG_COMPACTION)) {
 				/*
@@ -2379,7 +2590,10 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc)
 			/* need some checks to avoid more shrink_zone() calls */
 		}
 
-		shrink_zone(zone, sc);
+		shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
+
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+			break;
 	}
 
 	return aborted_reclaim;
@@ -2398,7 +2612,7 @@ static bool all_unreclaimable(struct zonelist *zonelist,
 			continue;
 		if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
 			continue;
-		if (!zone->all_unreclaimable)
+		if (zone_reclaimable(zone))
 			return false;
 	}
 
@@ -2422,16 +2636,15 @@ static bool all_unreclaimable(struct zonelist *zonelist,
  * 		else, the number of pages reclaimed
  */
 static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
-					struct scan_control *sc,
-					struct shrink_control *shrink)
+					  struct scan_control *sc)
 {
+	int initial_priority = sc->priority;
 	unsigned long total_scanned = 0;
-	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct zoneref *z;
-	struct zone *zone;
 	unsigned long writeback_threshold;
 	bool aborted_reclaim;
 
+retry:
+	{
+	KSTAT_PERF_ENTER(ttfp);
 	delayacct_freepages_start();
 
 	if (global_reclaim(sc))
@@ -2443,28 +2656,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 		sc->nr_scanned = 0;
 		aborted_reclaim = shrink_zones(zonelist, sc);
 
-		/*
-		 * Don't shrink slabs when reclaiming memory from over limit
-		 * cgroups but do shrink slab at least once when aborting
-		 * reclaim for compaction to avoid unevenly scanning file/anon
-		 * LRU pages over slab pages.
-		 */
-		if (global_reclaim(sc)) {
-			unsigned long lru_pages = 0;
-			for_each_zone_zonelist(zone, z, zonelist,
-					gfp_zone(sc->gfp_mask)) {
-				if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-					continue;
-
-				lru_pages += zone_reclaimable_pages(zone);
-			}
-
-			shrink_slab(shrink, sc->nr_scanned, lru_pages);
-			if (reclaim_state) {
-				sc->nr_reclaimed += reclaim_state->reclaimed_slab;
-				reclaim_state->reclaimed_slab = 0;
-			}
-		}
 		total_scanned += sc->nr_scanned;
 		if (sc->nr_reclaimed >= sc->nr_to_reclaim)
 			goto out;
@@ -2489,10 +2680,14 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
 						WB_REASON_TRY_TO_FREE_PAGES);
 			sc->may_writepage = 1;
 		}
+
+		if (unlikely(test_tsk_thread_flag(current, TIF_MEMDIE)))
+			aborted_reclaim = 1;
 	} while (--sc->priority >= 0 && !aborted_reclaim);
 
 out:
 	delayacct_freepages_end();
+	KSTAT_PERF_LEAVE(ttfp);
+	}
 
 	if (sc->nr_reclaimed)
 		return sc->nr_reclaimed;
@@ -2509,6 +2704,13 @@ out:
 	if (aborted_reclaim)
 		return 1;
 
+	/* Untapped cgroup reserves?  Don't OOM, retry. */
+	if (!sc->may_thrash) {
+		sc->priority = initial_priority;
+		sc->may_thrash = 1;
+		goto retry;
+	}
+
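
Together with the mem_cgroup_low() check added to the memcg loop in
shrink_zone(), this implements two-pass reclaim: the first pass
(sc->may_thrash == 0) skips every memcg still under its memory.low
protection, and only if that pass reclaims nothing is the scan restarted
at the initial priority with sc->may_thrash set, dipping into protected
groups as a last resort before declaring OOM.
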
 	/* top priority shrink_zones still had more to do? don't OOM, then */
 	if (global_reclaim(sc) && !all_unreclaimable(zonelist, sc))
 		return 1;
@@ -2657,9 +2859,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 		.target_mem_cgroup = NULL,
 		.nodemask = nodemask,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
 
 	/*
 	 * Do not enter reclaim if fatal signal was delivered while throttled.
@@ -2673,7 +2872,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
 				sc.may_writepage,
 				gfp_mask);
 
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	trace_mm_vmscan_direct_reclaim_end(nr_reclaimed);
 
@@ -2698,6 +2897,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 		.target_mem_cgroup = memcg,
 	};
 	struct lruvec *lruvec = mem_cgroup_zone_lruvec(zone, memcg);
+	unsigned long lru_pages;
 
 	sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 			(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
@@ -2713,7 +2913,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 	 * will pick up pages from other mem cgroups as well. We hack
 	 * the priority and make it zero.
 	 */
-	shrink_lruvec(lruvec, &sc);
+	shrink_lruvec(lruvec, &sc, &lru_pages);
 
 	trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed);
 
@@ -2722,6 +2922,7 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *memcg,
 }
 
 unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
+					   unsigned long nr_pages,
 					   gfp_t gfp_mask,
 					   bool noswap)
 {
@@ -2732,7 +2933,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.may_writepage = !laptop_mode,
 		.may_unmap = 1,
 		.may_swap = !noswap,
-		.nr_to_reclaim = SWAP_CLUSTER_MAX,
+		.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
 		.order = 0,
 		.priority = DEF_PRIORITY,
 		.target_mem_cgroup = memcg,
@@ -2740,9 +2941,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 		.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
 				(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK),
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
 
 	/*
 	 * Unlike direct reclaim via alloc_pages(), memcg's reclaim doesn't
@@ -2757,9 +2955,9 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
 					    sc.may_writepage,
 					    sc.gfp_mask);
 
-	current->flags |= PF_MEMALLOC;
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
-	current->flags &= ~PF_MEMALLOC;
+	current->flags |= PF_MEMALLOC | PF_MEMCG_RECLAIM;
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
+	current->flags &= ~(PF_MEMALLOC | PF_MEMCG_RECLAIM);
 
 	trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
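
With the new nr_pages argument callers can size the reclaim target
instead of always getting SWAP_CLUSTER_MAX; a hedged usage sketch (the
caller and its huge-page context are hypothetical):

	/* Free enough of memcg's pages to satisfy a pending huge-page
	 * charge; nr_to_reclaim is clamped from below by SWAP_CLUSTER_MAX
	 * inside try_to_free_mem_cgroup_pages(). */
	if (!try_to_free_mem_cgroup_pages(memcg, HPAGE_PMD_NR,
					  GFP_KERNEL, false))
		return -ENOMEM;
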
 
@@ -2842,7 +3040,7 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
 		 * DEF_PRIORITY. Effectively, it considers them balanced so
 		 * they must be considered balanced here as well!
 		 */
-		if (zone->all_unreclaimable) {
+		if (!zone_reclaimable(zone)) {
 			balanced_pages += zone->managed_pages;
 			continue;
 		}
@@ -2900,16 +3098,10 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining,
 static bool kswapd_shrink_zone(struct zone *zone,
 			       int classzone_idx,
 			       struct scan_control *sc,
-			       unsigned long lru_pages,
 			       unsigned long *nr_attempted)
 {
-	unsigned long nr_slab;
 	int testorder = sc->order;
 	unsigned long balance_gap;
-	struct reclaim_state *reclaim_state = current->reclaim_state;
-	struct shrink_control shrink = {
-		.gfp_mask = sc->gfp_mask,
-	};
 	bool lowmem_pressure;
 
 	/* Reclaim above the high watermark. */
@@ -2945,18 +3137,11 @@ static bool kswapd_shrink_zone(struct zone *zone,
 						balance_gap, classzone_idx))
 		return true;
 
-	shrink_zone(zone, sc);
-
-	reclaim_state->reclaimed_slab = 0;
-	nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages);
-	sc->nr_reclaimed += reclaim_state->reclaimed_slab;
+	shrink_zone(zone, sc, zone_idx(zone) == classzone_idx);
 
 	/* Account for the number of pages attempted to reclaim */
 	*nr_attempted += sc->nr_to_reclaim;
 
-	if (nr_slab == 0 && !zone_reclaimable(zone))
-		zone->all_unreclaimable = 1;
-
 	zone_clear_flag(zone, ZONE_WRITEBACK);
 
 	/*
@@ -2965,7 +3150,7 @@ static bool kswapd_shrink_zone(struct zone *zone,
 	 * BDIs but as pressure is relieved, speculatively avoid congestion
 	 * waits.
 	 */
-	if (!zone->all_unreclaimable &&
+	if (zone_reclaimable(zone) &&
 	    zone_balanced(zone, testorder, 0, classzone_idx)) {
 		zone_clear_flag(zone, ZONE_CONGESTED);
 		zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY);
@@ -3014,7 +3199,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 	count_vm_event(PAGEOUTRUN);
 
 	do {
-		unsigned long lru_pages = 0;
 		unsigned long nr_attempted = 0;
 		bool raise_priority = true;
 		bool pgdat_needs_compaction = (order > 0);
@@ -3031,8 +3215,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			zone_update_force_scan(zone);
+
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			/*
@@ -3074,8 +3260,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			lru_pages += zone_reclaimable_pages(zone);
-
 			/*
 			 * If any zone is currently balanced then kswapd will
 			 * not call compaction as it is expected that the
@@ -3110,8 +3294,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			if (!populated_zone(zone))
 				continue;
 
-			if (zone->all_unreclaimable &&
-			    sc.priority != DEF_PRIORITY)
+			if (sc.priority != DEF_PRIORITY &&
+			    !zone_reclaimable(zone))
 				continue;
 
 			sc.nr_scanned = 0;
@@ -3131,8 +3315,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order,
 			 * that the high watermark would be met at 100%
 			 * efficiency.
 			 */
-			if (kswapd_shrink_zone(zone, end_zone, &sc,
-					lru_pages, &nr_attempted))
+			if (kswapd_shrink_zone(zone, end_zone,
+					       &sc, &nr_attempted))
 				raise_priority = false;
 		}
 
@@ -3399,9 +3583,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 		.order = 0,
 		.priority = DEF_PRIORITY,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
 	struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
 	struct task_struct *p = current;
 	unsigned long nr_reclaimed;
@@ -3411,7 +3592,7 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
 	reclaim_state.reclaimed_slab = 0;
 	p->reclaim_state = &reclaim_state;
 
-	nr_reclaimed = do_try_to_free_pages(zonelist, &sc, &shrink);
+	nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
 
 	p->reclaim_state = NULL;
 	lockdep_clear_current_reclaim_state();
@@ -3588,10 +3769,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		.order = order,
 		.priority = ZONE_RECLAIM_PRIORITY,
 	};
-	struct shrink_control shrink = {
-		.gfp_mask = sc.gfp_mask,
-	};
-	unsigned long nr_slab_pages0, nr_slab_pages1;
 
 	cond_resched();
 	/*
@@ -3610,45 +3787,10 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 		 * priorities until we have enough memory freed.
 		 */
 		do {
-			shrink_zone(zone, &sc);
+			shrink_zone(zone, &sc, true);
 		} while (sc.nr_reclaimed < nr_pages && --sc.priority >= 0);
 	}
 
-	nr_slab_pages0 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-	if (nr_slab_pages0 > zone->min_slab_pages) {
-		/*
-		 * shrink_slab() does not currently allow us to determine how
-		 * many pages were freed in this zone. So we take the current
-		 * number of slab pages and shake the slab until it is reduced
-		 * by the same nr_pages that we used for reclaiming unmapped
-		 * pages.
-		 *
-		 * Note that shrink_slab will free memory on all zones and may
-		 * take a long time.
-		 */
-		for (;;) {
-			unsigned long lru_pages = zone_reclaimable_pages(zone);
-
-			/* No reclaimable slab or very low memory pressure */
-			if (!shrink_slab(&shrink, sc.nr_scanned, lru_pages))
-				break;
-
-			/* Freed enough memory */
-			nr_slab_pages1 = zone_page_state(zone,
-							NR_SLAB_RECLAIMABLE);
-			if (nr_slab_pages1 + nr_pages <= nr_slab_pages0)
-				break;
-		}
-
-		/*
-		 * Update nr_reclaimed by the number of slab pages we
-		 * reclaimed from this zone.
-		 */
-		nr_slab_pages1 = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
-		if (nr_slab_pages1 < nr_slab_pages0)
-			sc.nr_reclaimed += nr_slab_pages0 - nr_slab_pages1;
-	}
-
 	p->reclaim_state = NULL;
 	current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
 	lockdep_clear_current_reclaim_state();
@@ -3674,7 +3816,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 	    zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
 		return ZONE_RECLAIM_FULL;
 
-	if (zone->all_unreclaimable)
+	if (!zone_reclaimable(zone))
 		return ZONE_RECLAIM_FULL;
 
 	/*
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -18,10 +18,13 @@
 #include <linux/cpumask.h>
 #include <linux/vmstat.h>
 #include <linux/sched.h>
+#include <linux/virtinfo.h>
 #include <linux/math64.h>
 #include <linux/writeback.h>
 #include <linux/compaction.h>
 
+#include "internal.h"
+
 #ifdef CONFIG_VM_EVENT_COUNTERS
 DEFINE_PER_CPU(struct vm_event_state, vm_event_states) = {{0}};
 EXPORT_PER_CPU_SYMBOL(vm_event_states);
@@ -1115,10 +1118,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
 	seq_printf(m,
 		   "\n  all_unreclaimable: %u"
 		   "\n  start_pfn:         %lu"
-		   "\n  inactive_ratio:    %u",
-		   zone->all_unreclaimable,
+		   "\n  inactive_ratio:    %u"
+		   "\n  force_scan:        %d",
+		   !zone_reclaimable(zone),
 		   zone->zone_start_pfn,
-		   zone->inactive_ratio);
+		   zone->inactive_ratio,
+		   zone->force_scan);
 	seq_putc(m, '\n');
 }
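
With this change the tail of a /proc/zoneinfo stanza would read roughly
as follows (values illustrative), with all_unreclaimable now derived
from !zone_reclaimable(zone) rather than the removed zone field:

	  all_unreclaimable: 0
	  start_pfn:         4096
	  inactive_ratio:    3
	  force_scan:        0
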
 
@@ -1176,19 +1181,32 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
 	m->private = v;
 	if (!v)
 		return ERR_PTR(-ENOMEM);
-	for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-		v[i] = global_page_state(i);
-	v += NR_VM_ZONE_STAT_ITEMS;
 
-	global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
-			    v + NR_DIRTY_THRESHOLD);
-	v += NR_VM_WRITEBACK_STAT_ITEMS;
+	if (ve_is_super(get_exec_env())) {
+		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
+			v[i] = global_page_state(i);
+
+		v += NR_VM_ZONE_STAT_ITEMS;
+
+		global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
+				    v + NR_DIRTY_THRESHOLD);
+		v += NR_VM_WRITEBACK_STAT_ITEMS;
 
 #ifdef CONFIG_VM_EVENT_COUNTERS
-	all_vm_events(v);
-	v[PGPGIN] /= 2;		/* sectors -> kbytes */
-	v[PGPGOUT] /= 2;
+		all_vm_events(v);
+		v[PGPGIN] /= 2;		/* sectors -> kbytes */
+		v[PGPGOUT] /= 2;
 #endif
+	} else {
+		memset(v, 0, stat_items_size);
+	}
+
+	if (virtinfo_notifier_call(VITYPE_GENERAL,
+				VIRTINFO_VMSTAT, v) & NOTIFY_FAIL) {
+		kfree(v);
+		m->private = NULL;
+		return ERR_PTR(-ENOMSG);
+	}
+
 	return (unsigned long *)m->private + *pos;
 }
 
@@ -1427,7 +1445,7 @@ static int __init setup_vmstat(void)
 #ifdef CONFIG_PROC_FS
 	proc_create("buddyinfo", S_IRUGO, NULL, &fragmentation_file_operations);
 	proc_create("pagetypeinfo", S_IRUGO, NULL, &pagetypeinfo_file_ops);
-	proc_create("vmstat", S_IRUGO, NULL, &proc_vmstat_file_operations);
+	proc_create("vmstat", S_IRUGO|S_ISVTX, NULL, &proc_vmstat_file_operations);
 	proc_create("zoneinfo", S_IRUGO, NULL, &proc_zoneinfo_file_operations);
 #endif
 	return 0;
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -264,34 +264,21 @@ void workingset_activation(struct page *page)
  * point where they would still be useful.
  */
 
-static unsigned long nr_shadow_nodes;
-static LIST_HEAD(shadow_nodes);
-static DEFINE_SPINLOCK(shadow_node_lock);
+struct list_lru workingset_shadow_nodes;
 
-void workingset_remember_node(struct radix_tree_node *node)
-{
-	spin_lock(&shadow_node_lock);
-	list_add(&node->private_list, &shadow_nodes);
-	nr_shadow_nodes++;
-	spin_unlock(&shadow_node_lock);
-}
-
-void workingset_forget_node(struct radix_tree_node *node)
-{
-	spin_lock(&shadow_node_lock);
-	list_del_init(&node->private_list);
-	nr_shadow_nodes--;
-	spin_unlock(&shadow_node_lock);
-}
-
-static unsigned long nr_excessive_shadows(void)
+static unsigned long count_shadow_nodes(struct shrinker *shrinker,
+					struct shrink_control *sc)
 {
 	unsigned long shadow_nodes;
 	unsigned long max_nodes;
 	unsigned long pages;
 
-	shadow_nodes = nr_shadow_nodes;
-	pages = totalram_pages;
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	shadow_nodes = list_lru_shrink_count(&workingset_shadow_nodes, sc);
+	local_irq_enable();
+
+	pages = node_present_pages(sc->nid);
 	/*
 	 * Active cache pages are limited to 50% of memory, and shadow
 	 * entries that represent a refault distance bigger than that
@@ -314,12 +301,15 @@ static unsigned long nr_excessive_shadows(void)
 	return shadow_nodes - max_nodes;
 }
 
-static void shadow_lru_isolate(struct list_head *item,
-			       spinlock_t *lru_lock)
+static enum lru_status shadow_lru_isolate(struct list_head *item,
+					  struct list_lru_one *lru,
+					  spinlock_t *lru_lock,
+					  void *arg)
 {
 	struct address_space *mapping;
 	struct radix_tree_node *node;
 	unsigned int i;
+	int ret;
 
 	/*
 	 * Page cache insertions and deletions synchronously maintain
@@ -339,12 +329,11 @@ static void shadow_lru_isolate(struct list_head *item,
 	/* Coming from the list, invert the lock order */
 	if (!spin_trylock(&mapping->tree_lock)) {
 		spin_unlock(lru_lock);
+		ret = LRU_RETRY;
 		goto out;
 	}
 
-	list_del_init(item);
-	nr_shadow_nodes--;
-
+	list_lru_isolate(lru, item);
 	spin_unlock(lru_lock);
 
 	/*
@@ -372,37 +361,55 @@ static void shadow_lru_isolate(struct list_head *item,
 		BUG();
 
 	spin_unlock(&mapping->tree_lock);
+	ret = LRU_REMOVED_RETRY;
 out:
 	local_irq_enable();
 	cond_resched();
 	local_irq_disable();
 	spin_lock(lru_lock);
+	return ret;
 }
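
shadow_lru_isolate() is entered with lru_lock held but needs
mapping->tree_lock, whose normal nesting is the other way around, hence
the trylock and the LRU_RETRY/LRU_REMOVED_RETRY return codes.  A minimal
sketch of the idiom for a hypothetical object type:

	static enum lru_status demo_isolate(struct list_head *item,
					    struct list_lru_one *lru,
					    spinlock_t *lru_lock, void *arg)
	{
		struct demo_obj *obj =
			container_of(item, struct demo_obj, lru_entry);

		/* obj->lock normally nests outside lru_lock: trylock only */
		if (!spin_trylock(&obj->lock))
			return LRU_RETRY;	/* revisit on a later pass */

		list_lru_isolate(lru, item);	/* unlink under both locks */
		spin_unlock(&obj->lock);
		demo_free(obj);			/* hypothetical teardown */
		return LRU_REMOVED;
	}
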
 
-static int shrink_shadow_nodes(struct shrinker *shrink,
-			       struct shrink_control *sc)
+static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
+				       struct shrink_control *sc)
 {
-	unsigned long nr_to_scan = sc->nr_to_scan;
-
-	if (!nr_to_scan)
-		return nr_excessive_shadows();
+	unsigned long ret;
 
-	spin_lock_irq(&shadow_node_lock);
-	while (--nr_to_scan && !list_empty(&shadow_nodes))
-		shadow_lru_isolate(shadow_nodes.prev, &shadow_node_lock);
-	spin_unlock_irq(&shadow_node_lock);
-
-	return nr_excessive_shadows();
+	/* list_lru lock nests inside IRQ-safe mapping->tree_lock */
+	local_irq_disable();
+	ret = list_lru_shrink_walk(&workingset_shadow_nodes, sc,
+				   shadow_lru_isolate, NULL);
+	local_irq_enable();
+	return ret;
 }
 
 static struct shrinker workingset_shadow_shrinker = {
-	.shrink = shrink_shadow_nodes,
+	.count_objects = count_shadow_nodes,
+	.scan_objects = scan_shadow_nodes,
 	.seeks = DEFAULT_SEEKS,
+	.flags = SHRINKER_NUMA_AWARE,
 };
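
The old single ->shrink() callback (called with sc->nr_to_scan == 0 for
a count, nonzero to reclaim) is replaced by the split count/scan API.  A
minimal sketch of the contract, with hypothetical demo_* names:

	static unsigned long demo_count(struct shrinker *s,
					struct shrink_control *sc)
	{
		/* report how many objects could be freed right now */
		return atomic_long_read(&demo_nr_objects);
	}

	static unsigned long demo_scan(struct shrinker *s,
				       struct shrink_control *sc)
	{
		unsigned long freed = demo_free_some(sc->nr_to_scan);

		/* SHRINK_STOP: stop calling us for this round */
		return freed ? freed : SHRINK_STOP;
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
	};
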
 
+/*
+ * Our list_lru->lock is IRQ-safe as it nests inside the IRQ-safe
+ * mapping->tree_lock.
+ */
+static struct lock_class_key shadow_nodes_key;
+
 static int __init workingset_init(void)
 {
-	register_shrinker(&workingset_shadow_shrinker);
+	int ret;
+
+	ret = list_lru_init_key(&workingset_shadow_nodes, &shadow_nodes_key);
+	if (ret)
+		goto err;
+	ret = register_shrinker(&workingset_shadow_shrinker);
+	if (ret)
+		goto err_list_lru;
 	return 0;
+err_list_lru:
+	list_lru_destroy(&workingset_shadow_nodes);
+err:
+	return ret;
 }
 module_init(workingset_init);
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -507,7 +507,7 @@ static int zswap_get_swap_cache_page(swp_entry_t entry,
 		 * add_to_swap_cache() doesn't return -EEXIST, so we can safely
 		 * clear SWAP_HAS_CACHE flag.
 		 */
-		swapcache_free(entry, NULL);
+		swapcache_free(entry);
 	} while (err != -ENOMEM);
 
 	if (new_page)
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -67,7 +67,7 @@ static int vlan_group_prealloc_vid(struct vlan_group *vg,
 		return 0;
 
 	size = sizeof(struct net_device *) * VLAN_GROUP_ARRAY_PART_LEN;
-	array = kzalloc(size, GFP_KERNEL);
+	array = kzalloc(size, GFP_KERNEL_ACCOUNT);
 	if (array == NULL)
 		return -ENOBUFS;
 
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -27,6 +27,7 @@
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/sched.h>
 #include <linux/ethtool.h>
 #include <net/arp.h>
 
@@ -155,6 +156,7 @@ static netdev_tx_t vlan_dev_hard_start_xmit(struct sk_buff *skb,
 
 	skb->dev = vlan->real_dev;
 	len = skb->len;
+
 	if (unlikely(netpoll_tx_running(dev)))
 		return vlan_netpoll_send_skb(vlan, skb);
 
@@ -574,7 +576,7 @@ static int vlan_dev_init(struct net_device *dev)
 			   NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
 			   NETIF_F_ALL_FCOE;
 
-	dev->features |= real_dev->vlan_features | NETIF_F_LLTX;
+	dev->features |= dev->hw_features | NETIF_F_LLTX;
 	dev->gso_max_size = real_dev->gso_max_size;
 
 	dev->vlan_features = real_dev->vlan_features & ~NETIF_F_ALL_FCOE;
@@ -803,4 +805,5 @@ void vlan_setup(struct net_device *dev)
 	dev->ethtool_ops	= &vlan_ethtool_ops;
 
 	memset(dev->broadcast, 0, ETH_ALEN);
+	dev->features |= NETIF_F_VIRTUAL;
 }
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -150,8 +150,8 @@ int __net_init vlan_proc_init(struct net *net)
 	if (!vn->proc_vlan_dir)
 		goto err;
 
-	vn->proc_vlan_conf = proc_create(name_conf, S_IFREG|S_IRUSR|S_IWUSR,
-				     vn->proc_vlan_dir, &vlan_fops);
+	vn->proc_vlan_conf = proc_net_create_data(name_conf, S_IFREG|S_IRUSR|S_IWUSR,
+				     vn->proc_vlan_dir, &vlan_fops, NULL);
 	if (!vn->proc_vlan_conf)
 		goto err;
 	return 0;
@@ -172,7 +172,7 @@ int vlan_proc_add_dev(struct net_device *vlandev)
 	struct vlan_net *vn = net_generic(dev_net(vlandev), vlan_net_id);
 
 	vlan->dent =
-		proc_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR,
+		proc_net_create_data(vlandev->name, S_IFREG|S_IRUSR|S_IWUSR,
 				 vn->proc_vlan_dir, &vlandev_fops, vlandev);
 	if (!vlan->dent)
 		return -ENOBUFS;
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -90,8 +90,12 @@ out:
 static int br_dev_init(struct net_device *dev)
 {
 	struct net_bridge *br = netdev_priv(dev);
+	struct net *net = dev_net(dev);
 	int err;
 
+	if (!(net->owner_ve->features & VE_FEATURE_BRIDGE))
+		return -EACCES;
+
 	br->stats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!br->stats)
 		return -ENOMEM;
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -42,7 +42,8 @@ static inline unsigned int packet_length(const struct sk_buff *skb)
 
 int br_dev_queue_push_xmit(struct sock *sk, struct sk_buff *skb)
 {
-	if (!is_skb_forwardable(skb->dev, skb))
+	if (!(skb->dev->features & NETIF_F_VENET) &&
+	    !is_skb_forwardable(skb->dev, skb))
 		goto drop;
 
 	skb_push(skb, ETH_HLEN);
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -24,6 +24,12 @@
 br_should_route_hook_t __rcu *br_should_route_hook __read_mostly;
 EXPORT_SYMBOL(br_should_route_hook);
 
+static int br_netif_receive_skb(struct sock *sk, struct sk_buff *skb)
+{
+	br_drop_fake_rtable(skb);
+	return netif_receive_skb(skb);
+}
+
 static int br_pass_frame_up(struct sk_buff *skb)
 {
 	struct net_device *indev, *brdev = BR_INPUT_SKB_CB(skb)->brdev;
@@ -55,7 +61,7 @@ static int br_pass_frame_up(struct sk_buff *skb)
 
 	return NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, NULL, skb,
 		       indev, NULL,
-		       netif_receive_skb_sk);
+		       br_netif_receive_skb);
 }
 
 /* note: already called with rcu_read_lock */
--- /dev/null
+++ b/net/bridge/br_netfilter.c
@@ -0,0 +1,1144 @@
+/*
+ *	Handle firewalling
+ *	Linux ethernet bridge
+ *
+ *	Authors:
+ *	Lennert Buytenhek		<buytenh@gnu.org>
+ *	Bart De Schuymer		<bdschuym@pandora.be>
+ *
+ *	This program is free software; you can redistribute it and/or
+ *	modify it under the terms of the GNU General Public License
+ *	as published by the Free Software Foundation; either version
+ *	2 of the License, or (at your option) any later version.
+ *
+ *	Lennert dedicates this file to Kerstin Wurdinger.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/ip.h>
+#include <linux/netdevice.h>
+#include <linux/skbuff.h>
+#include <linux/if_arp.h>
+#include <linux/if_ether.h>
+#include <linux/if_vlan.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <linux/netfilter_bridge.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/netfilter_arp.h>
+#include <linux/in_route.h>
+#include <linux/inetdevice.h>
+
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <net/addrconf.h>
+#include <net/route.h>
+
+#include <asm/uaccess.h>
+#include "br_private.h"
+#ifdef CONFIG_SYSCTL
+#include <linux/sysctl.h>
+#endif
+
+#define skb_origaddr(skb)	 (((struct bridge_skb_cb *) \
+				 (skb->nf_bridge->data))->daddr.ipv4)
+#define store_orig_dstaddr(skb)	 (skb_origaddr(skb) = ip_hdr(skb)->daddr)
+#define dnat_took_place(skb)	 (skb_origaddr(skb) != ip_hdr(skb)->daddr)
+
+#ifdef CONFIG_SYSCTL
+static struct ctl_table_header *brnf_sysctl_header;
+static int brnf_call_iptables __read_mostly = 1;
+static int brnf_call_ip6tables __read_mostly = 1;
+static int brnf_call_arptables __read_mostly = 1;
+static int brnf_filter_vlan_tagged __read_mostly = 0;
+static int brnf_filter_pppoe_tagged __read_mostly = 0;
+static int brnf_pass_vlan_indev __read_mostly = 0;
+#else
+#define brnf_call_iptables 1
+#define brnf_call_ip6tables 1
+#define brnf_call_arptables 1
+#define brnf_filter_vlan_tagged 0
+#define brnf_filter_pppoe_tagged 0
+#define brnf_pass_vlan_indev 0
+#endif
+
+#define IS_IP(skb) \
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IP))
+
+#define IS_IPV6(skb) \
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6))
+
+#define IS_ARP(skb) \
+	(!skb_vlan_tag_present(skb) && skb->protocol == htons(ETH_P_ARP))
+
+static inline __be16 vlan_proto(const struct sk_buff *skb)
+{
+	if (skb_vlan_tag_present(skb))
+		return skb->protocol;
+	else if (skb->protocol == htons(ETH_P_8021Q))
+		return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
+	else
+		return 0;
+}
+
+#define IS_VLAN_IP(skb) \
+	(vlan_proto(skb) == htons(ETH_P_IP) && \
+	 brnf_filter_vlan_tagged)
+
+#define IS_VLAN_IPV6(skb) \
+	(vlan_proto(skb) == htons(ETH_P_IPV6) && \
+	 brnf_filter_vlan_tagged)
+
+#define IS_VLAN_ARP(skb) \
+	(vlan_proto(skb) == htons(ETH_P_ARP) &&	\
+	 brnf_filter_vlan_tagged)
+
+static inline __be16 pppoe_proto(const struct sk_buff *skb)
+{
+	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
+			    sizeof(struct pppoe_hdr)));
+}
+
+#define IS_PPPOE_IP(skb) \
+	(skb->protocol == htons(ETH_P_PPP_SES) && \
+	 pppoe_proto(skb) == htons(PPP_IP) && \
+	 brnf_filter_pppoe_tagged)
+
+#define IS_PPPOE_IPV6(skb) \
+	(skb->protocol == htons(ETH_P_PPP_SES) && \
+	 pppoe_proto(skb) == htons(PPP_IPV6) && \
+	 brnf_filter_pppoe_tagged)
+
+static void fake_update_pmtu(struct dst_entry *dst, struct sock *sk,
+			     struct sk_buff *skb, u32 mtu)
+{
+}
+
+static void fake_redirect(struct dst_entry *dst, struct sock *sk,
+			  struct sk_buff *skb)
+{
+}
+
+static u32 *fake_cow_metrics(struct dst_entry *dst, unsigned long old)
+{
+	return NULL;
+}
+
+static struct neighbour *fake_neigh_lookup(const struct dst_entry *dst,
+					   struct sk_buff *skb,
+					   const void *daddr)
+{
+	return NULL;
+}
+
+static unsigned int fake_mtu(const struct dst_entry *dst)
+{
+	return dst->dev->mtu;
+}
+
+static struct dst_ops fake_dst_ops = {
+	.family =		AF_INET,
+	.protocol =		cpu_to_be16(ETH_P_IP),
+	.update_pmtu =		fake_update_pmtu,
+	.redirect =		fake_redirect,
+	.cow_metrics =		fake_cow_metrics,
+	.neigh_lookup =		fake_neigh_lookup,
+	.mtu =			fake_mtu,
+};
+
+/*
+ * Initialize bogus route table used to keep netfilter happy.
+ * Currently, we fill in the PMTU entry because netfilter
+ * refragmentation needs it, and the rt_flags entry because
+ * ipt_REJECT needs it.  Future netfilter modules might
+ * require us to fill additional fields.
+ */
+static const u32 br_dst_default_metrics[RTAX_MAX] = {
+	[RTAX_MTU - 1] = 1500,
+};
+
+void br_netfilter_rtable_init(struct net_bridge *br)
+{
+	struct rtable *rt = &br->fake_rtable;
+
+	atomic_set(&rt->dst.__refcnt, 1);
+	rt->dst.dev = br->dev;
+	rt->dst.path = &rt->dst;
+	dst_init_metrics(&rt->dst, br_dst_default_metrics, true);
+	rt->dst.flags	= DST_NOXFRM | DST_NOPEER | DST_FAKE_RTABLE;
+	rt->dst.ops = &fake_dst_ops;
+}
+
+static inline struct rtable *bridge_parent_rtable(const struct net_device *dev)
+{
+	struct net_bridge_port *port;
+
+	port = br_port_get_rcu(dev);
+	return port ? &port->br->fake_rtable : NULL;
+}
+
+static inline struct net_device *bridge_parent(const struct net_device *dev)
+{
+	struct net_bridge_port *port;
+
+	port = br_port_get_rcu(dev);
+	return port ? port->br->dev : NULL;
+}
+
+static inline struct nf_bridge_info *nf_bridge_alloc(struct sk_buff *skb)
+{
+	skb->nf_bridge = kzalloc(sizeof(struct nf_bridge_info), GFP_ATOMIC);
+	if (likely(skb->nf_bridge))
+		atomic_set(&(skb->nf_bridge->use), 1);
+
+	return skb->nf_bridge;
+}
+
+static inline struct nf_bridge_info *nf_bridge_unshare(struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+	if (atomic_read(&nf_bridge->use) > 1) {
+		struct nf_bridge_info *tmp = nf_bridge_alloc(skb);
+
+		if (tmp) {
+			memcpy(tmp, nf_bridge, sizeof(struct nf_bridge_info));
+			atomic_set(&tmp->use, 1);
+		}
+		nf_bridge_put(nf_bridge);
+		nf_bridge = tmp;
+	}
+	return nf_bridge;
+}
+
+static inline void nf_bridge_push_encap_header(struct sk_buff *skb)
+{
+	unsigned int len = nf_bridge_encap_header_len(skb);
+
+	skb_push(skb, len);
+	skb->network_header -= len;
+}
+
+static inline void nf_bridge_pull_encap_header(struct sk_buff *skb)
+{
+	unsigned int len = nf_bridge_encap_header_len(skb);
+
+	skb_pull(skb, len);
+	skb->network_header += len;
+}
+
+static inline void nf_bridge_pull_encap_header_rcsum(struct sk_buff *skb)
+{
+	unsigned int len = nf_bridge_encap_header_len(skb);
+
+	skb_pull_rcsum(skb, len);
+	skb->network_header += len;
+}
+
+static inline void nf_bridge_save_header(struct sk_buff *skb)
+{
+	int header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
+
+	skb_copy_from_linear_data_offset(skb, -header_size,
+					 skb->nf_bridge->data, header_size);
+}
+
+static inline void nf_bridge_update_protocol(struct sk_buff *skb)
+{
+	if (skb->nf_bridge->mask & BRNF_8021Q)
+		skb->protocol = htons(ETH_P_8021Q);
+	else if (skb->nf_bridge->mask & BRNF_PPPoE)
+		skb->protocol = htons(ETH_P_PPP_SES);
+}
+
+/* When handing a packet over to the IP layer
+ * check whether we have a skb that is in the
+ * expected format
+ */
+
+static int br_parse_ip_options(struct sk_buff *skb)
+{
+	struct ip_options *opt;
+	const struct iphdr *iph;
+	struct net_device *dev = skb->dev;
+	u32 len;
+
+	if (!pskb_may_pull(skb, sizeof(struct iphdr)))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+	opt = &(IPCB(skb)->opt);
+
+	/* Basic sanity checks */
+	if (iph->ihl < 5 || iph->version != 4)
+		goto inhdr_error;
+
+	if (!pskb_may_pull(skb, iph->ihl*4))
+		goto inhdr_error;
+
+	iph = ip_hdr(skb);
+	if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl)))
+		goto inhdr_error;
+
+	len = ntohs(iph->tot_len);
+	if (skb->len < len) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS);
+		goto drop;
+	} else if (len < (iph->ihl*4))
+		goto inhdr_error;
+
+	if (pskb_trim_rcsum(skb, len)) {
+		IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS);
+		goto drop;
+	}
+
+	memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
+	if (iph->ihl == 5)
+		return 0;
+
+	opt->optlen = iph->ihl*4 - sizeof(struct iphdr);
+	if (ip_options_compile(dev_net(dev), opt, skb))
+		goto inhdr_error;
+
+	/* Check correct handling of SRR option */
+	if (unlikely(opt->srr)) {
+		struct in_device *in_dev = __in_dev_get_rcu(dev);
+		if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev))
+			goto drop;
+
+		if (ip_options_rcv_srr(skb))
+			goto drop;
+	}
+
+	return 0;
+
+inhdr_error:
+	IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
+/* We only check the length. A bridge shouldn't do any hop-by-hop stuff
+ * anyway
+ */
+static int check_hbh_len(struct sk_buff *skb)
+{
+	unsigned char *raw = (u8 *)(ipv6_hdr(skb) + 1);
+	u32 pkt_len;
+	const unsigned char *nh = skb_network_header(skb);
+	int off = raw - nh;
+	int len = (raw[1] + 1) << 3;
+
+	if ((raw + len) - skb->data > skb_headlen(skb))
+		goto bad;
+
+	off += 2;
+	len -= 2;
+
+	while (len > 0) {
+		int optlen = nh[off + 1] + 2;
+
+		switch (nh[off]) {
+		case IPV6_TLV_PAD1:
+			optlen = 1;
+			break;
+
+		case IPV6_TLV_PADN:
+			break;
+
+		case IPV6_TLV_JUMBO:
+			if (nh[off + 1] != 4 || (off & 3) != 2)
+				goto bad;
+			pkt_len = ntohl(*(__be32 *)(nh + off + 2));
+			if (pkt_len <= IPV6_MAXPLEN ||
+			    ipv6_hdr(skb)->payload_len)
+				goto bad;
+			if (pkt_len > skb->len - sizeof(struct ipv6hdr))
+				goto bad;
+			if (pskb_trim_rcsum(skb,
+					    pkt_len + sizeof(struct ipv6hdr)))
+				goto bad;
+			nh = skb_network_header(skb);
+			break;
+		default:
+			if (optlen > len)
+				goto bad;
+			break;
+		}
+		off += optlen;
+		len -= optlen;
+	}
+	if (len == 0)
+		return 0;
+bad:
+	return -1;
+}
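
For orientation, the smallest hop-by-hop block this walker accepts with
a Jumbo Payload option looks as follows (layout per RFC 2460/2675; the
bytes are illustrative):

	/*
	 *   nexthdr  hdrlen  Jumbo option (type 0xc2, length 4)
	 *   [ 0x3a ] [0x00]  [0xc2 0x04 <32-bit pkt_len>]
	 *
	 * hdrlen counts 8-octet units beyond the first, so 0x00 means the
	 * extension header is 8 bytes total; the first option starts at
	 * offset 42 from the network header, and 42 & 3 == 2 satisfies
	 * the 4n+2 alignment enforced by the (off & 3) != 2 test.  The
	 * option's pkt_len must exceed IPV6_MAXPLEN while the fixed
	 * payload_len field is zero.
	 */
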
+
+/* Equivalent to br_validate_ipv4 for IPv6 */
+static int br_validate_ipv6(struct sk_buff *skb)
+{
+	const struct ipv6hdr *hdr;
+	struct net_device *dev = skb->dev;
+	struct inet6_dev *idev = __in6_dev_get(skb->dev);
+	u32 pkt_len;
+	u8 ip6h_len = sizeof(struct ipv6hdr);
+
+	if (!pskb_may_pull(skb, ip6h_len))
+		goto inhdr_error;
+
+	if (skb->len < ip6h_len)
+		goto drop;
+
+	hdr = ipv6_hdr(skb);
+
+	if (hdr->version != 6)
+		goto inhdr_error;
+
+	pkt_len = ntohs(hdr->payload_len);
+
+	if (pkt_len || hdr->nexthdr != NEXTHDR_HOP) {
+		if (pkt_len + ip6h_len > skb->len) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INTRUNCATEDPKTS);
+			goto drop;
+		}
+		if (pskb_trim_rcsum(skb, pkt_len + ip6h_len)) {
+			IP6_INC_STATS_BH(dev_net(dev), idev,
+					 IPSTATS_MIB_INDISCARDS);
+			goto drop;
+		}
+	}
+	if (hdr->nexthdr == NEXTHDR_HOP && check_hbh_len(skb))
+		goto drop;
+
+	memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
+	/* There are no IP options in the IPv6 header itself; however, some
+	 * next headers may need special treatment and should be checked.
+	 */
+	return 0;
+
+inhdr_error:
+	IP6_INC_STATS_BH(dev_net(dev), idev, IPSTATS_MIB_INHDRERRORS);
+drop:
+	return -1;
+}
+
+/* Fill in the header for fragmented IP packets handled by
+ * the IPv4 connection tracking code.
+ */
+int nf_bridge_copy_header(struct sk_buff *skb)
+{
+	int err;
+	unsigned int header_size;
+
+	nf_bridge_update_protocol(skb);
+	header_size = ETH_HLEN + nf_bridge_encap_header_len(skb);
+	err = skb_cow_head(skb, header_size);
+	if (err)
+		return err;
+
+	skb_copy_to_linear_data_offset(skb, -header_size,
+				       skb->nf_bridge->data, header_size);
+	__skb_push(skb, nf_bridge_encap_header_len(skb));
+	return 0;
+}
+
+/* PF_BRIDGE/PRE_ROUTING *********************************************/
+/* Undo the changes made for ip6tables PREROUTING and continue the
+ * bridge PRE_ROUTING hook. */
+static int br_nf_pre_routing_finish_ipv6(struct sock *sk, struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct rtable *rt;
+
+	nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
+	if (nf_bridge->mask & BRNF_PKT_TYPE) {
+		skb->pkt_type = PACKET_OTHERHOST;
+		nf_bridge->mask ^= BRNF_PKT_TYPE;
+	}
+	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+
+	rt = bridge_parent_rtable(nf_bridge->physindev);
+	if (!rt) {
+		kfree_skb(skb);
+		return 0;
+	}
+	skb_dst_set_noref(skb, &rt->dst);
+
+	skb->dev = nf_bridge->physindev;
+	nf_bridge_update_protocol(skb);
+	nf_bridge_push_encap_header(skb);
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
+		       skb->dev, NULL,
+		       br_handle_frame_finish, 1);
+
+	return 0;
+}
+
+/* Obtain the correct destination MAC address, while preserving the original
+ * source MAC address. If we already know this address, we just copy it. If we
+ * don't, we use the neighbour framework to find out. In both cases, we make
+ * sure that br_handle_frame_finish() is called afterwards.
+ */
+static int br_nf_pre_routing_finish_bridge(struct sock *sk, struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct neighbour *neigh;
+	struct dst_entry *dst;
+
+	skb->dev = bridge_parent(skb->dev);
+	if (!skb->dev)
+		goto free_skb;
+	dst = skb_dst(skb);
+	neigh = dst_neigh_lookup_skb(dst, skb);
+	if (neigh) {
+		int ret;
+
+		if (neigh->hh.hh_len) {
+			neigh_hh_bridge(&neigh->hh, skb);
+			skb->dev = nf_bridge->physindev;
+			ret = br_handle_frame_finish(sk, skb);
+		} else {
+			/* the neighbour function below overwrites the complete
+			 * MAC header, so we save the Ethernet source address and
+			 * protocol number.
+			 */
+			skb_copy_from_linear_data_offset(skb,
+							 -(ETH_HLEN-ETH_ALEN),
+							 skb->nf_bridge->data,
+							 ETH_HLEN-ETH_ALEN);
+			/* tell br_dev_xmit to continue with forwarding */
+			nf_bridge->mask |= BRNF_BRIDGED_DNAT;
+			/* FIXME Need to refragment */
+			ret = neigh->output(neigh, skb);
+		}
+		neigh_release(neigh);
+		return ret;
+	}
+free_skb:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* This requires some explaining. If DNAT has taken place,
+ * we will need to fix up the destination Ethernet address.
+ *
+ * There are two cases to consider:
+ * 1. The packet was DNAT'ed to a device in the same bridge
+ *    port group as it was received on. We can still bridge
+ *    the packet.
+ * 2. The packet was DNAT'ed to a different device, either
+ *    a non-bridged device or another bridge port group.
+ *    The packet will need to be routed.
+ *
+ * The correct way of distinguishing between these two cases is to
+ * call ip_route_input() and to look at skb->dst->dev, which is
+ * changed to the destination device if ip_route_input() succeeds.
+ *
+ * Let's first consider the case that ip_route_input() succeeds:
+ *
+ * If the output device equals the logical bridge device the packet
+ * came in on, we can consider this bridging. The corresponding MAC
+ * address will be obtained in br_nf_pre_routing_finish_bridge.
+ * Otherwise, the packet is considered to be routed and we just
+ * change the destination MAC address so that the packet will
+ * later be passed up to the IP stack to be routed. For a redirected
+ * packet, ip_route_input() will give back the localhost as output device,
+ * which differs from the bridge device.
+ *
+ * Let's now consider the case that ip_route_input() fails:
+ *
+ * This can be because the destination address is martian, in which case
+ * the packet will be dropped.
+ * If IP forwarding is disabled, ip_route_input() will fail, while
+ * ip_route_output_key() can return success. The source
+ * address for ip_route_output_key() is set to zero, so ip_route_output_key()
+ * thinks we're handling a locally generated packet and won't care
+ * if IP forwarding is enabled. If the output device equals the logical bridge
+ * device, we proceed as if ip_route_input() succeeded. If it differs from the
+ * logical bridge port or if ip_route_output_key() fails we drop the packet.
+ */
+static int br_nf_pre_routing_finish(struct sock *sk, struct sk_buff *skb)
+{
+	struct net_device *dev = skb->dev;
+	struct iphdr *iph = ip_hdr(skb);
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct rtable *rt;
+	int err;
+
+	nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
+
+	if (nf_bridge->mask & BRNF_PKT_TYPE) {
+		skb->pkt_type = PACKET_OTHERHOST;
+		nf_bridge->mask ^= BRNF_PKT_TYPE;
+	}
+	nf_bridge->mask ^= BRNF_NF_BRIDGE_PREROUTING;
+	if (dnat_took_place(skb)) {
+		if ((err = ip_route_input(skb, iph->daddr, iph->saddr, iph->tos, dev))) {
+			struct in_device *in_dev = __in_dev_get_rcu(dev);
+
+			/* If err equals -EHOSTUNREACH the error is due to a
+			 * martian destination or due to the fact that
+			 * forwarding is disabled. For most martian packets,
+			 * ip_route_output_key() will fail. It won't fail for 2 types of
+			 * martian destinations: loopback destinations and destination
+			 * 0.0.0.0. In both cases the packet will be dropped because the
+			 * destination is the loopback device and not the bridge. */
+			if (err != -EHOSTUNREACH || !in_dev || IN_DEV_FORWARD(in_dev))
+				goto free_skb;
+
+			rt = ip_route_output(dev_net(dev), iph->daddr, 0,
+					     RT_TOS(iph->tos), 0);
+			if (!IS_ERR(rt)) {
+				/* - Bridged-and-DNAT'ed traffic doesn't
+				 *   require ip_forwarding. */
+				if (rt->dst.dev == dev) {
+					skb_dst_set(skb, &rt->dst);
+					goto bridged_dnat;
+				}
+				ip_rt_put(rt);
+			}
+free_skb:
+			kfree_skb(skb);
+			return 0;
+		} else {
+			if (skb_dst(skb)->dev == dev) {
+bridged_dnat:
+				skb->dev = nf_bridge->physindev;
+				nf_bridge_update_protocol(skb);
+				nf_bridge_push_encap_header(skb);
+				NF_HOOK_THRESH(NFPROTO_BRIDGE,
+					       NF_BR_PRE_ROUTING,
+					       sk, skb, skb->dev, NULL,
+					       br_nf_pre_routing_finish_bridge,
+					       1);
+				return 0;
+			}
+			memcpy(eth_hdr(skb)->h_dest, dev->dev_addr, ETH_ALEN);
+			skb->pkt_type = PACKET_HOST;
+		}
+	} else {
+		rt = bridge_parent_rtable(nf_bridge->physindev);
+		if (!rt) {
+			kfree_skb(skb);
+			return 0;
+		}
+		skb_dst_set_noref(skb, &rt->dst);
+	}
+
+	skb->dev = nf_bridge->physindev;
+	nf_bridge_update_protocol(skb);
+	nf_bridge_push_encap_header(skb);
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_PRE_ROUTING, sk, skb,
+		       skb->dev, NULL,
+		       br_handle_frame_finish, 1);
+
+	return 0;
+}
+
+static struct net_device *brnf_get_logical_dev(struct sk_buff *skb, const struct net_device *dev)
+{
+	struct net_device *vlan, *br;
+
+	br = bridge_parent(dev);
+	if (brnf_pass_vlan_indev == 0 || !skb_vlan_tag_present(skb))
+		return br;
+
+	vlan = __vlan_find_dev_deep(br, skb->vlan_proto,
+				    skb_vlan_tag_get(skb) & VLAN_VID_MASK);
+
+	return vlan ? vlan : br;
+}
+
+/* Some common code for IPv4/IPv6 */
+static struct net_device *setup_pre_routing(struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		skb->pkt_type = PACKET_HOST;
+		nf_bridge->mask |= BRNF_PKT_TYPE;
+	}
+
+	nf_bridge->mask |= BRNF_NF_BRIDGE_PREROUTING;
+	nf_bridge->physindev = skb->dev;
+	skb->dev = brnf_get_logical_dev(skb, skb->dev);
+	if (skb->protocol == htons(ETH_P_8021Q))
+		nf_bridge->mask |= BRNF_8021Q;
+	else if (skb->protocol == htons(ETH_P_PPP_SES))
+		nf_bridge->mask |= BRNF_PPPoE;
+
+	return skb->dev;
+}
+
+/* Replicate the checks that IPv6 does on packet reception and pass the packet
+ * to ip6tables.
+ */
+static unsigned int br_nf_pre_routing_ipv6(const struct nf_hook_ops *ops,
+					   struct sk_buff *skb,
+					   const struct net_device *in,
+					   const struct net_device *out,
+					   const struct nf_hook_state *state)
+{
+	if (br_validate_ipv6(skb))
+		return NF_DROP;
+
+	nf_bridge_put(skb->nf_bridge);
+	if (!nf_bridge_alloc(skb))
+		return NF_DROP;
+	if (!setup_pre_routing(skb))
+		return NF_DROP;
+
+	skb->protocol = htons(ETH_P_IPV6);
+	NF_HOOK(NFPROTO_IPV6, NF_INET_PRE_ROUTING, state->sk, skb,
+		skb->dev, NULL,
+		br_nf_pre_routing_finish_ipv6);
+
+	return NF_STOLEN;
+}
+
+/* Direct IPv6 traffic to br_nf_pre_routing_ipv6.
+ * Replicate the checks that IPv4 does on packet reception.
+ * Set skb->dev to the bridge device (i.e. parent of the
+ * receiving device) to make netfilter happy, the REDIRECT
+ * target in particular.  Save the original destination IP
+ * address to be able to detect DNAT afterwards. */
+static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      const struct nf_hook_state *state)
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	__u32 len = nf_bridge_encap_header_len(skb);
+
+	if (unlikely(!pskb_may_pull(skb, len)))
+		return NF_DROP;
+
+	p = br_port_get_rcu(state->in);
+	if (p == NULL)
+		return NF_DROP;
+	br = p->br;
+
+	if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) {
+		if (!brnf_call_ip6tables && !br->nf_call_ip6tables)
+			return NF_ACCEPT;
+
+		nf_bridge_pull_encap_header_rcsum(skb);
+		return br_nf_pre_routing_ipv6(ops, skb, state->in, state->out, state);
+	}
+
+	if (!brnf_call_iptables && !br->nf_call_iptables)
+		return NF_ACCEPT;
+
+	if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb))
+		return NF_ACCEPT;
+
+	nf_bridge_pull_encap_header_rcsum(skb);
+
+	if (br_parse_ip_options(skb))
+		return NF_DROP;
+
+	nf_bridge_put(skb->nf_bridge);
+	if (!nf_bridge_alloc(skb))
+		return NF_DROP;
+	if (!setup_pre_routing(skb))
+		return NF_DROP;
+	store_orig_dstaddr(skb);
+	skb->protocol = htons(ETH_P_IP);
+
+	NF_HOOK(NFPROTO_IPV4, NF_INET_PRE_ROUTING, state->sk, skb,
+		skb->dev, NULL,
+		br_nf_pre_routing_finish);
+
+	return NF_STOLEN;
+}
+
+
+/* PF_BRIDGE/FORWARD *************************************************/
+static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct net_device *in;
+
+	if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) {
+
+		if (skb->protocol == htons(ETH_P_IP))
+			nf_bridge->frag_max_size = IPCB(skb)->frag_max_size;
+
+		if (skb->protocol == htons(ETH_P_IPV6))
+			nf_bridge->frag_max_size = IP6CB(skb)->frag_max_size;
+
+		in = nf_bridge->physindev;
+		if (nf_bridge->mask & BRNF_PKT_TYPE) {
+			skb->pkt_type = PACKET_OTHERHOST;
+			nf_bridge->mask ^= BRNF_PKT_TYPE;
+		}
+		nf_bridge_update_protocol(skb);
+	} else {
+		in = *((struct net_device **)(skb->cb));
+	}
+	nf_bridge_push_encap_header(skb);
+
+	NF_HOOK_THRESH(NFPROTO_BRIDGE, NF_BR_FORWARD, sk, skb,
+		       in, skb->dev, br_forward_finish, 1);
+	return 0;
+}
+
+
+/* This is the 'purely bridged' case.  For IP, we pass the packet to
+ * netfilter with indev and outdev set to the bridge device,
+ * but we are still able to filter on the 'real' indev/outdev
+ * because of the physdev module. For ARP, indev and outdev are the
+ * bridge ports. */
+static unsigned int br_nf_forward_ip(const struct nf_hook_ops *ops,
+				     struct sk_buff *skb,
+				     const struct net_device *in,
+				     const struct net_device *out,
+				     const struct nf_hook_state *state)
+{
+	struct nf_bridge_info *nf_bridge;
+	struct net_device *parent;
+	u_int8_t pf;
+
+	if (!skb->nf_bridge)
+		return NF_ACCEPT;
+
+	/* Need exclusive nf_bridge_info since we might have multiple
+	 * different physoutdevs. */
+	if (!nf_bridge_unshare(skb))
+		return NF_DROP;
+
+	parent = bridge_parent(state->out);
+	if (!parent)
+		return NF_DROP;
+
+	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+		pf = NFPROTO_IPV4;
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+		pf = NFPROTO_IPV6;
+	else
+		return NF_ACCEPT;
+
+	nf_bridge_pull_encap_header(skb);
+
+	nf_bridge = skb->nf_bridge;
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		skb->pkt_type = PACKET_HOST;
+		nf_bridge->mask |= BRNF_PKT_TYPE;
+	}
+
+	if (pf == NFPROTO_IPV4) {
+		if (br_parse_ip_options(skb))
+			return NF_DROP;
+		IPCB(skb)->frag_max_size = nf_bridge->frag_max_size;
+	}
+
+	if (pf == NFPROTO_IPV6) {
+		if (br_validate_ipv6(skb))
+			return NF_DROP;
+		IP6CB(skb)->frag_max_size = nf_bridge->frag_max_size;
+	}
+
+	/* The physdev module checks on this */
+	nf_bridge->mask |= BRNF_BRIDGED;
+	nf_bridge->physoutdev = skb->dev;
+	if (pf == NFPROTO_IPV4)
+		skb->protocol = htons(ETH_P_IP);
+	else
+		skb->protocol = htons(ETH_P_IPV6);
+
+	NF_HOOK(pf, NF_INET_FORWARD, NULL, skb,
+		brnf_get_logical_dev(skb, state->in),
+		parent,	br_nf_forward_finish);
+
+	return NF_STOLEN;
+}
+
+static unsigned int br_nf_forward_arp(const struct nf_hook_ops *ops,
+				      struct sk_buff *skb,
+				      const struct net_device *in,
+				      const struct net_device *out,
+				      const struct nf_hook_state *state)
+{
+	struct net_bridge_port *p;
+	struct net_bridge *br;
+	struct net_device **d = (struct net_device **)(skb->cb);
+
+	p = br_port_get_rcu(state->out);
+	if (p == NULL)
+		return NF_ACCEPT;
+	br = p->br;
+
+	if (!brnf_call_arptables && !br->nf_call_arptables)
+		return NF_ACCEPT;
+
+	if (!IS_ARP(skb)) {
+		if (!IS_VLAN_ARP(skb))
+			return NF_ACCEPT;
+		nf_bridge_pull_encap_header(skb);
+	}
+
+	if (arp_hdr(skb)->ar_pln != 4) {
+		if (IS_VLAN_ARP(skb))
+			nf_bridge_push_encap_header(skb);
+		return NF_ACCEPT;
+	}
+	*d = state->in;
+	NF_HOOK(NFPROTO_ARP, NF_ARP_FORWARD, state->sk, skb,
+		state->in, state->out, br_nf_forward_finish);
+
+	return NF_STOLEN;
+}
+
+static int br_nf_dev_queue_xmit(struct sock *sk, struct sk_buff *skb)
+{
+	if (skb_is_gso(skb) || skb->len + nf_bridge_mtu_reduction(skb) <=
+	    skb->dev->mtu)
+		return br_dev_queue_push_xmit(sk, skb);
+
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV4)
+	/* This is wrong! We should preserve the original fragment
+	 * boundaries by preserving frag_list rather than refragmenting.
+	 */
+	if (skb->protocol == htons(ETH_P_IP) &&
+	    !(skb->dev->features & NETIF_F_VENET)) {
+		if (br_parse_ip_options(skb))
+			/* Drop invalid packet */
+			goto drop;
+
+		IPCB(skb)->frag_max_size = skb->nf_bridge->frag_max_size;
+		return ip_fragment(sk, skb, br_dev_queue_push_xmit);
+	}
+#endif
+#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
+	if (skb->protocol == htons(ETH_P_IPV6)) {
+		const struct nf_ipv6_ops *v6ops = nf_get_ipv6_ops();
+
+		if (br_validate_ipv6(skb))
+			goto drop;
+
+		IP6CB(skb)->frag_max_size = skb->nf_bridge->frag_max_size;
+		if (v6ops)
+			return v6ops->fragment(sk, skb, br_dev_queue_push_xmit);
+
+		kfree_skb(skb);
+		return -EMSGSIZE;
+	}
+#endif
+	return br_dev_queue_push_xmit(sk, skb);
+ drop:
+	kfree_skb(skb);
+	return 0;
+}
+
+/* PF_BRIDGE/POST_ROUTING ********************************************/
+static unsigned int br_nf_post_routing(const struct nf_hook_ops *ops,
+				       struct sk_buff *skb,
+				       const struct net_device *in,
+				       const struct net_device *out,
+				       const struct nf_hook_state *state)
+{
+	struct nf_bridge_info *nf_bridge = skb->nf_bridge;
+	struct net_device *realoutdev = bridge_parent(skb->dev);
+	u_int8_t pf;
+
+	if (!nf_bridge || !(nf_bridge->mask & BRNF_BRIDGED))
+		return NF_ACCEPT;
+
+	if (!realoutdev)
+		return NF_DROP;
+
+	if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb))
+		pf = NFPROTO_IPV4;
+	else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb))
+		pf = NFPROTO_IPV6;
+	else
+		return NF_ACCEPT;
+
+	/* We assume any code from br_dev_queue_push_xmit onwards doesn't care
+	 * about the value of skb->pkt_type. */
+	if (skb->pkt_type == PACKET_OTHERHOST) {
+		skb->pkt_type = PACKET_HOST;
+		nf_bridge->mask |= BRNF_PKT_TYPE;
+	}
+
+	nf_bridge_pull_encap_header(skb);
+	nf_bridge_save_header(skb);
+	if (pf == NFPROTO_IPV4)
+		skb->protocol = htons(ETH_P_IP);
+	else
+		skb->protocol = htons(ETH_P_IPV6);
+
+	NF_HOOK(pf, NF_INET_POST_ROUTING, state->sk, skb,
+		NULL, realoutdev,
+		br_nf_dev_queue_xmit);
+
+	return NF_STOLEN;
+}
+
+/* IP/SABOTAGE *****************************************************/
+/* Don't hand locally destined packets to PF_INET(6)/PRE_ROUTING
+ * for the second time. */
+static unsigned int ip_sabotage_in(const struct nf_hook_ops *ops,
+				   struct sk_buff *skb,
+				   const struct net_device *in,
+				   const struct net_device *out,
+				   const struct nf_hook_state *state)
+{
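+	/* BRNF_NF_BRIDGE_PREROUTING is set only while br_netfilter itself
+	 * runs the skb through PRE_ROUTING; if it is clear here the packet
+	 * has already passed that hook once, so stop the traversal. */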
+	if (skb->nf_bridge &&
+	    !(skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING)) {
+		return NF_STOP;
+	}
+
+	return NF_ACCEPT;
+}
+
+/* For br_nf_post_routing, we need (prio = NF_BR_PRI_LAST), because
+ * br_dev_queue_push_xmit is called afterwards */
+static struct nf_hook_ops br_nf_ops[] __read_mostly = {
+	{
+		.hook = br_nf_pre_routing,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_PRE_ROUTING,
+		.priority = NF_BR_PRI_BRNF,
+	},
+	{
+		.hook = br_nf_forward_ip,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_FORWARD,
+		.priority = NF_BR_PRI_BRNF - 1,
+	},
+	{
+		.hook = br_nf_forward_arp,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_FORWARD,
+		.priority = NF_BR_PRI_BRNF,
+	},
+	{
+		.hook = br_nf_post_routing,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_BRIDGE,
+		.hooknum = NF_BR_POST_ROUTING,
+		.priority = NF_BR_PRI_LAST,
+	},
+	{
+		.hook = ip_sabotage_in,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_IPV4,
+		.hooknum = NF_INET_PRE_ROUTING,
+		.priority = NF_IP_PRI_FIRST,
+	},
+	{
+		.hook = ip_sabotage_in,
+		.owner = THIS_MODULE,
+		.pf = NFPROTO_IPV6,
+		.hooknum = NF_INET_PRE_ROUTING,
+		.priority = NF_IP6_PRI_FIRST,
+	},
+};
+
+#ifdef CONFIG_SYSCTL
+static
+int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
+			    void __user * buffer, size_t * lenp, loff_t * ppos)
+{
+	int ret;
+
+	ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+
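+	/* Normalize any nonzero value written by the user to 1. */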
+	if (write && *(int *)(ctl->data))
+		*(int *)(ctl->data) = 1;
+	return ret;
+}
+
+static struct ctl_table brnf_table[] = {
+	{
+		.procname	= "bridge-nf-call-arptables",
+		.data		= &brnf_call_arptables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-call-iptables",
+		.data		= &brnf_call_iptables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-call-ip6tables",
+		.data		= &brnf_call_ip6tables,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-filter-vlan-tagged",
+		.data		= &brnf_filter_vlan_tagged,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-filter-pppoe-tagged",
+		.data		= &brnf_filter_pppoe_tagged,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{
+		.procname	= "bridge-nf-pass-vlan-input-dev",
+		.data		= &brnf_pass_vlan_indev,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= brnf_sysctl_call_tables,
+	},
+	{ }
+};
+#endif
+
+int __init br_netfilter_init(void)
+{
+	int ret;
+
+	ret = dst_entries_init(&fake_dst_ops);
+	if (ret < 0)
+		return ret;
+
+	ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+	if (ret < 0) {
+		dst_entries_destroy(&fake_dst_ops);
+		return ret;
+	}
+#ifdef CONFIG_SYSCTL
+	brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
+	if (brnf_sysctl_header == NULL) {
+		printk(KERN_WARNING
+		       "br_netfilter: can't register to sysctl.\n");
+		nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+		dst_entries_destroy(&fake_dst_ops);
+		return -ENOMEM;
+	}
+#endif
+	printk(KERN_NOTICE "Bridge firewalling registered\n");
+	return 0;
+}
+
+void br_netfilter_fini(void)
+{
+	nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+#ifdef CONFIG_SYSCTL
+	unregister_net_sysctl_table(brnf_sysctl_header);
+#endif
+	dst_entries_destroy(&fake_dst_ops);
+}
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -520,23 +520,6 @@ static unsigned int br_nf_pre_routing(const struct nf_hook_ops *ops,
 }
 
 
-/* PF_BRIDGE/LOCAL_IN ************************************************/
-/* The packet is locally destined, which requires a real
- * dst_entry, so detach the fake one.  On the way up, the
- * packet would pass through PRE_ROUTING again (which already
- * took place when the packet entered the bridge), but we
- * register an IPv4 PRE_ROUTING 'sabotage' hook that will
- * prevent this from happening. */
-static unsigned int br_nf_local_in(const struct nf_hook_ops *ops,
-				   struct sk_buff *skb,
-				   const struct net_device *in,
-				   const struct net_device *out,
-				   const struct nf_hook_state *state)
-{
-	br_drop_fake_rtable(skb);
-	return NF_ACCEPT;
-}
-
 /* PF_BRIDGE/FORWARD *************************************************/
 static int br_nf_forward_finish(struct sock *sk, struct sk_buff *skb)
 {
@@ -922,13 +905,6 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
 		.priority = NF_BR_PRI_BRNF,
 	},
 	{
-		.hook = br_nf_local_in,
-		.owner = THIS_MODULE,
-		.pf = NFPROTO_BRIDGE,
-		.hooknum = NF_BR_LOCAL_IN,
-		.priority = NF_BR_PRI_BRNF,
-	},
-	{
 		.hook = br_nf_forward_ip,
 		.owner = THIS_MODULE,
 		.pf = NFPROTO_BRIDGE,
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -21,6 +21,10 @@
 #include <net/ip6_fib.h>
 #include <linux/if_vlan.h>
 
+#include <linux/ve.h>
+#include <linux/ve_proto.h>
+#include <uapi/linux/vzcalluser.h>
+
 #define BR_HASH_BITS 8
 #define BR_HASH_SIZE (1 << BR_HASH_BITS)
 
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -51,7 +51,7 @@ struct arppayload
 };
 
 static void
-print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
+print_ports(const struct sk_buff *skb, uint8_t protocol, int offset, struct ve_struct *ve)
 {
 	if (protocol == IPPROTO_TCP ||
 	    protocol == IPPROTO_UDP ||
@@ -64,10 +64,10 @@ print_ports(const struct sk_buff *skb, uint8_t protocol, int offset)
 		pptr = skb_header_pointer(skb, offset,
 					  sizeof(_ports), &_ports);
 		if (pptr == NULL) {
-			printk(" INCOMPLETE TCP/UDP header");
+			ve_log_printk(ve, " INCOMPLETE TCP/UDP header");
 			return;
 		}
-		printk(" SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
+		ve_log_printk(ve, " SPT=%u DPT=%u", ntohs(pptr->src), ntohs(pptr->dst));
 	}
 }
 
@@ -78,13 +78,10 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 	       const char *prefix)
 {
 	unsigned int bitmask;
-
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
+	struct ve_struct *ve = net->owner_ve;
 
 	spin_lock_bh(&ebt_log_lock);
-	printk(KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
+	ve_log_printk(ve, KERN_SOH "%c%s IN=%s OUT=%s MAC source = %pM MAC dest = %pM proto = 0x%04x",
 	       '0' + loginfo->u.log.level, prefix,
 	       in ? in->name : "", out ? out->name : "",
 	       eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
@@ -102,12 +99,12 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 		ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
 		if (ih == NULL) {
-			printk(" INCOMPLETE IP header");
+			ve_log_printk(ve, " INCOMPLETE IP header");
 			goto out;
 		}
-		printk(" IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
+		ve_log_printk(ve, " IP SRC=%pI4 IP DST=%pI4, IP tos=0x%02X, IP proto=%d",
 		       &ih->saddr, &ih->daddr, ih->tos, ih->protocol);
-		print_ports(skb, ih->protocol, ih->ihl*4);
+		print_ports(skb, ih->protocol, ih->ihl*4, ve);
 		goto out;
 	}
 
@@ -122,16 +119,16 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 		ih = skb_header_pointer(skb, 0, sizeof(_iph), &_iph);
 		if (ih == NULL) {
-			printk(" INCOMPLETE IPv6 header");
+			ve_log_printk(ve, " INCOMPLETE IPv6 header");
 			goto out;
 		}
-		printk(" IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
+		ve_log_printk(ve, " IPv6 SRC=%pI6 IPv6 DST=%pI6, IPv6 priority=0x%01X, Next Header=%d",
 		       &ih->saddr, &ih->daddr, ih->priority, ih->nexthdr);
 		nexthdr = ih->nexthdr;
 		offset_ph = ipv6_skip_exthdr(skb, sizeof(_iph), &nexthdr, &frag_off);
 		if (offset_ph == -1)
 			goto out;
-		print_ports(skb, nexthdr, offset_ph);
+		print_ports(skb, nexthdr, offset_ph, ve);
 		goto out;
 	}
 #endif
@@ -144,10 +141,10 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 
 		ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
 		if (ah == NULL) {
-			printk(" INCOMPLETE ARP header");
+			ve_log_printk(ve, " INCOMPLETE ARP header");
 			goto out;
 		}
-		printk(" ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
+		ve_log_printk(ve, " ARP HTYPE=%d, PTYPE=0x%04x, OPCODE=%d",
 		       ntohs(ah->ar_hrd), ntohs(ah->ar_pro),
 		       ntohs(ah->ar_op));
 
@@ -162,15 +159,15 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
 			ap = skb_header_pointer(skb, sizeof(_arph),
 						sizeof(_arpp), &_arpp);
 			if (ap == NULL) {
-				printk(" INCOMPLETE ARP payload");
+				ve_log_printk(ve, " INCOMPLETE ARP payload");
 				goto out;
 			}
-			printk(" ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4",
+			ve_log_printk(ve, " ARP MAC SRC=%pM ARP IP SRC=%pI4 ARP MAC DST=%pM ARP IP DST=%pI4",
 					ap->mac_src, ap->ip_src, ap->mac_dst, ap->ip_dst);
 		}
 	}
 out:
-	printk("\n");
+	ve_log_printk(ve, "\n");
 	spin_unlock_bh(&ebt_log_lock);
 
 }
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -31,8 +31,9 @@
 /* needed for logical [in,out]-dev filtering */
 #include "../br_private.h"
 
-#define BUGPRINT(format, args...) printk("kernel msg: ebtables bug: please "\
-					 "report to author: "format, ## args)
+#define BUGPRINT(format, args...)					\
+	ve_printk(VE_LOG, "kernel msg: ebtables bug: please "		\
+			"report to author: "format, ## args)
 /* #define BUGPRINT(format, args...) */
 
 /*
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -135,9 +135,12 @@
 #include <linux/hashtable.h>
 #include <linux/vmalloc.h>
 #include <linux/hrtimer.h>
+#include <linux/fence-watchdog.h>
 
 #include "net-sysfs.h"
 
+#include <linux/ve.h>
+
 /* Instead of increasing this, you should create a hash table. */
 #define MAX_GRO_SKBS 8
 
@@ -187,18 +190,6 @@ static inline void dev_base_seq_inc(struct net *net)
 	while (++net->dev_base_seq == 0);
 }
 
-static inline struct hlist_head *dev_name_hash(struct net *net, const char *name)
-{
-	unsigned int hash = full_name_hash(name, strnlen(name, IFNAMSIZ));
-
-	return &net->dev_name_head[hash_32(hash, NETDEV_HASHBITS)];
-}
-
-static inline struct hlist_head *dev_index_hash(struct net *net, int ifindex)
-{
-	return &net->dev_index_head[ifindex & (NETDEV_HASHENTRIES - 1)];
-}
-
 static inline void rps_lock(struct softnet_data *sd)
 {
 #ifdef CONFIG_RPS
@@ -1168,11 +1159,14 @@ int dev_change_name(struct net_device *dev, const char *newname)
 	}
 
 rollback:
-	ret = device_rename(&dev->dev, dev->name);
-	if (ret) {
-		memcpy(dev->name, oldname, IFNAMSIZ);
-		write_seqcount_end(&devnet_rename_seq);
-		return ret;
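+	/* Rename the sysfs entry only from the VE's primary netns (or
+	 * before one exists); nested namespaces skip device_rename(). */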
+	if (!dev_net(dev)->owner_ve->ve_netns ||
+	    dev_net(dev)->owner_ve->ve_netns == dev->nd_net) {
+		ret = device_rename(&dev->dev, dev->name);
+		if (ret) {
+			memcpy(dev->name, oldname, IFNAMSIZ);
+			write_seqcount_end(&devnet_rename_seq);
+			return ret;
+		}
 	}
 
 	write_seqcount_end(&devnet_rename_seq);
@@ -2705,6 +2699,14 @@ struct sk_buff *dev_hard_start_xmit(struct sk_buff *first, struct net_device *de
 	struct sk_buff *skb = first;
 	int rc = NETDEV_TX_OK;
 
+#ifdef CONFIG_FENCE_WATCHDOG
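+	/* Once the fence watchdog has expired, drop all tx packets
+	 * while reporting success to the caller. */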
+	if (unlikely(fence_wdog_check_timer())) {
+		kfree_skb(skb);
+		*ret = rc;
+		return NULL;
+	}
+#endif
+
 	while (skb) {
 		struct sk_buff *next = skb->next;
 
@@ -3924,12 +3926,6 @@ int netif_receive_skb(struct sk_buff *skb)
 }
 EXPORT_SYMBOL(netif_receive_skb);
 
-int netif_receive_skb_sk(struct sock *sk, struct sk_buff *skb)
-{
-	return netif_receive_skb(skb);
-}
-EXPORT_SYMBOL(netif_receive_skb_sk);
-
 /* Network device is going away, flush any packets still pending
  * Called with irqs disabled.
  */
@@ -4836,6 +4833,10 @@ static void net_rx_action(struct softirq_action *h)
 	list_splice_init(&sd->poll_list, &list);
 	local_irq_enable();
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	fence_wdog_check_timer();
+#endif
+
 	for (;;) {
 		struct napi_struct *n;
 
@@ -5173,9 +5174,14 @@ void netdev_adjacent_sysfs_del(struct net_device *dev,
 	sysfs_remove_link(&(dev->dev.kobj), linkname);
 }
 
-#define netdev_adjacent_is_neigh_list(dev, dev_list) \
-		(dev_list == &dev->adj_list.upper || \
-		 dev_list == &dev->adj_list.lower)
+static inline bool netdev_adjacent_is_neigh_list(struct net_device *dev,
+						 struct net_device *adj_dev,
+						 struct list_head *dev_list)
+{
+	return (dev_list == &dev->adj_list.upper ||
+		dev_list == &dev->adj_list.lower) &&
+		net_eq(dev_net(dev), dev_net(adj_dev));
+}
 
 static int __netdev_adjacent_dev_insert(struct net_device *dev,
 					struct net_device *adj_dev,
@@ -5205,7 +5211,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
 	pr_debug("dev_hold for %s, because of link added from %s to %s\n",
 		 adj_dev->name, dev->name, adj_dev->name);
 
-	if (netdev_adjacent_is_neigh_list(dev, dev_list)) {
+	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list)) {
 		ret = netdev_adjacent_sysfs_add(dev, adj_dev, dev_list);
 		if (ret)
 			goto free_adj;
@@ -5226,7 +5232,7 @@ static int __netdev_adjacent_dev_insert(struct net_device *dev,
 	return 0;
 
 remove_symlinks:
-	if (netdev_adjacent_is_neigh_list(dev, dev_list))
+	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 free_adj:
 	kfree(adj);
@@ -5258,7 +5264,7 @@ void __netdev_adjacent_dev_remove(struct net_device *dev,
 	if (adj->master)
 		sysfs_remove_link(&(dev->dev.kobj), "master");
 
-	if (netdev_adjacent_is_neigh_list(dev, dev_list))
+	if (netdev_adjacent_is_neigh_list(dev, adj_dev, dev_list))
 		netdev_adjacent_sysfs_del(dev, adj_dev->name, dev_list);
 
 	list_del_rcu(&adj->list);
@@ -5564,11 +5570,65 @@ void netdev_bonding_info_change(struct net_device *dev,
 }
 EXPORT_SYMBOL(netdev_bonding_info_change);
 
+void netdev_adjacent_add_links(struct net_device *dev)
+{
+	struct netdev_adjacent *iter;
+
+	struct net *net = dev_net(dev);
+
+	list_for_each_entry(iter, &dev->adj_list.upper, list) {
+		if (!net_eq(net, dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_add(iter->dev, dev,
+					  &iter->dev->adj_list.lower);
+		netdev_adjacent_sysfs_add(dev, iter->dev,
+					  &dev->adj_list.upper);
+	}
+
+	list_for_each_entry(iter, &dev->adj_list.lower, list) {
+		if (!net_eq(net, dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_add(iter->dev, dev,
+					  &iter->dev->adj_list.upper);
+		netdev_adjacent_sysfs_add(dev, iter->dev,
+					  &dev->adj_list.lower);
+	}
+}
+
+void netdev_adjacent_del_links(struct net_device *dev)
+{
+	struct netdev_adjacent *iter;
+
+	struct net *net = dev_net(dev);
+
+	list_for_each_entry(iter, &dev->adj_list.upper, list) {
+		if (!net_eq(net, dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_del(iter->dev, dev->name,
+					  &iter->dev->adj_list.lower);
+		netdev_adjacent_sysfs_del(dev, iter->dev->name,
+					  &dev->adj_list.upper);
+	}
+
+	list_for_each_entry(iter, &dev->adj_list.lower, list) {
+		if (!net_eq(net, dev_net(iter->dev)))
+			continue;
+		netdev_adjacent_sysfs_del(iter->dev, dev->name,
+					  &iter->dev->adj_list.upper);
+		netdev_adjacent_sysfs_del(dev, iter->dev->name,
+					  &dev->adj_list.lower);
+	}
+}
+
 void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 {
 	struct netdev_adjacent *iter;
 
+	struct net *net = dev_net(dev);
+
 	list_for_each_entry(iter, &dev->adj_list.upper, list) {
+		if (!net_eq(net, dev_net(iter->dev)))
+			continue;
 		netdev_adjacent_sysfs_del(iter->dev, oldname,
 					  &iter->dev->adj_list.lower);
 		netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -5576,6 +5636,8 @@ void netdev_adjacent_rename_links(struct net_device *dev, char *oldname)
 	}
 
 	list_for_each_entry(iter, &dev->adj_list.lower, list) {
+		if (!net_eq(net, dev_net(iter->dev)))
+			continue;
 		netdev_adjacent_sysfs_del(iter->dev, oldname,
 					  &iter->dev->adj_list.upper);
 		netdev_adjacent_sysfs_add(iter->dev, dev,
@@ -5669,8 +5731,13 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc, bool notify)
 			return -EOVERFLOW;
 		}
 	}
-	if (dev->flags != old_flags) {
-		pr_info("device %s %s promiscuous mode\n",
+	/*
+	 * Promiscous mode on LOOPBACK/POINTTOPOINT devices does
+	 * not mean anything
+	 */
+	if ((dev->flags != old_flags) &&
+			!(dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT))) {
+		ve_printk(VE_LOG, KERN_INFO "device %s %s promiscuous mode\n",
 			dev->name,
 			dev->flags & IFF_PROMISC ? "entered" : "left");
 		if (audit_enabled) {
@@ -6483,9 +6550,9 @@ static int netif_alloc_netdev_queues(struct net_device *dev)
 	if (count < 1 || count > 0xffff)
 		return -EINVAL;
 
-	tx = kzalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
+	tx = kzalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_REPEAT);
 	if (!tx) {
-		tx = vzalloc(sz);
+		tx = vzalloc_account(sz);
 		if (!tx)
 			return -ENOMEM;
 	}
@@ -6528,6 +6595,14 @@ int register_netdevice(struct net_device *dev)
 	BUG_ON(dev->reg_state != NETREG_UNINITIALIZED);
 	BUG_ON(!net);
 
+	ret = -EPERM;
+	if (!ve_is_super(net->owner_ve) && ve_is_dev_movable(dev))
+		goto out;
+
+	ret = -ENOMEM;
+	if (atomic_dec_if_positive(&net->owner_ve->netif_avail_nr) < 0)
+		goto out;
+
 	spin_lock_init(&dev->addr_list_lock);
 	netdev_set_addr_lockdep_class(dev);
 
@@ -6535,7 +6610,7 @@ int register_netdevice(struct net_device *dev)
 
 	ret = dev_get_valid_name(net, dev, dev->name);
 	if (ret < 0)
-		goto out;
+		goto err_avail;
 
 	/* Init, if this function is available */
 	if (dev->netdev_ops->ndo_init) {
@@ -6543,7 +6618,7 @@ int register_netdevice(struct net_device *dev)
 		if (ret) {
 			if (ret > 0)
 				ret = -EIO;
-			goto out;
+			goto err_avail;
 		}
 	}
 
@@ -6657,10 +6732,65 @@ out:
 err_uninit:
 	if (dev->netdev_ops->ndo_uninit)
 		dev->netdev_ops->ndo_uninit(dev);
+err_avail:
+	atomic_inc(&net->owner_ve->netif_avail_nr);
 	goto out;
 }
 EXPORT_SYMBOL(register_netdevice);
 
+/*
+ * We do horrible things -- we leave a netdevice
+ * in a "leaked" state, which means we release as many
+ * resources as possible but the device remains
+ * present in its namespace because someone still holds
+ * a reference.
+ *
+ * The idea is to be able to force-stop a VE.
+ */
+static void ve_netdev_leak(struct net_device *dev)
+{
+	struct napi_struct *p, *n;
+
+	dev->is_leaked = 1;
+	barrier();
+
+	/*
+	 * Make sure we're unable to tx/rx
+	 * network packets to outside.
+	 */
+	WARN_ON_ONCE(dev->flags & IFF_UP);
+	WARN_ON_ONCE(dev->qdisc != &noop_qdisc);
+
+	rtnl_lock();
+
+	/*
+	 * No address and napi after that.
+	 */
+	dev_addr_flush(dev);
+	list_for_each_entry_safe(p, n, &dev->napi_list, dev_list)
+		netif_napi_del(p);
+
+	/*
+	 * No release_net() here since the device remains
+	 * present in the namespace.
+	 */
+
+	__rtnl_unlock();
+
+	/*
+	 * Since we have already screwed the device and releasing
+	 * it the normal way is no longer possible, take an extra
+	 * reference so the device stays around forever.
+	 */
+	dev_hold(dev);
+
+	synchronize_net();
+
+	pr_emerg("Device (%s:%d:%s:%p) marked as leaked\n",
+			dev->name, netdev_refcnt_read(dev) - 1,
+			ve_name(dev_net(dev)->owner_ve), dev);
+	dst_cache_dump();
+}
+
 /**
  *	init_dummy_netdev	- init a dummy network device for NAPI
  *	@dev: device to init
@@ -6748,10 +6878,11 @@ EXPORT_SYMBOL(netdev_refcnt_read);
  * We can get stuck here if buggy protocols don't correctly
  * call dev_put.
  */
-static void netdev_wait_allrefs(struct net_device *dev)
+static int netdev_wait_allrefs(struct net_device *dev)
 {
 	unsigned long rebroadcast_time, warning_time;
 	int refcnt;
+	int i = 0;
 
 	linkwatch_forget_dev(dev);
 
@@ -6791,11 +6922,25 @@ static void netdev_wait_allrefs(struct net_device *dev)
 		refcnt = netdev_refcnt_read(dev);
 
 		if (time_after(jiffies, warning_time + 10 * HZ)) {
-			pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n",
-				 dev->name, refcnt);
+			pr_emerg("unregister_netdevice: waiting for %s=%p to "
+				"become free. Usage count = %d\n ve=%s",
+				 dev->name, dev, refcnt,
+				 ve_name(dev_net(dev)->owner_ve));
 			warning_time = jiffies;
 		}
+
+		/*
+		 * If a reference to the device has been leaked we
+		 * might be stuck in this loop forever, leaving the
+		 * VE no chance to stop.
+		 */
+		if (++i > 200) { /* give 50 seconds to try */
+			ve_netdev_leak(dev);
+			return -EBUSY;
+		}
 	}
+
+	return 0;
 }
 
 /* The sequence is:
@@ -6831,7 +6976,6 @@ void netdev_run_todo(void)
 
 	__rtnl_unlock();
 
-
 	/* Wait for rcu callbacks to finish before next phase */
 	if (!list_empty(&list))
 		rcu_barrier();
@@ -6854,7 +6998,12 @@ void netdev_run_todo(void)
 
 		dev->reg_state = NETREG_UNREGISTERED;
 
-		netdev_wait_allrefs(dev);
+		/*
+		 * Even if this device gets stuck here we must
+		 * still process the rest of the list.
+		 */
+		if (netdev_wait_allrefs(dev))
+			continue;
 
 		/* paranoia */
 		BUG_ON(netdev_refcnt_read(dev));
@@ -6862,6 +7011,11 @@ void netdev_run_todo(void)
 		WARN_ON(rcu_access_pointer(dev->ip6_ptr));
 		WARN_ON(dev->dn_ptr);
 
+		atomic_inc(&dev_net(dev)->owner_ve->netif_avail_nr);
+
+		/* This must be the very last action; after this
+		 * point 'dev' may point to freed memory.
+		 */
 		if (dev->destructor)
 			dev->destructor(dev);
 
@@ -7015,11 +7169,13 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
 	/* ensure 32-byte alignment of whole construct */
 	alloc_size += NETDEV_ALIGN - 1;
 
-	p = kzalloc(alloc_size, GFP_KERNEL | __GFP_NOWARN | __GFP_REPEAT);
-	if (!p)
-		p = vzalloc(alloc_size);
+	p = kzalloc(alloc_size, GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_REPEAT);
 	if (!p)
+		p = vzalloc_account(alloc_size);
+	if (!p) {
+		pr_err("alloc_netdev: Unable to allocate device\n");
 		return NULL;
+	}
 
 	dev = PTR_ALIGN(p, NETDEV_ALIGN);
 	dev->padded = (char *)dev - (char *)p;
@@ -7099,6 +7255,13 @@ void free_netdev(struct net_device *dev)
 {
 	struct napi_struct *p, *n;
 
+	if (dev->is_leaked) {
+		pr_emerg("%s: device %s=%p is leaked\n",
+				__func__, dev->name, dev);
+		dump_stack();
+		return;
+	}
+
 	might_sleep();
 	netif_free_tx_queues(dev);
 #ifdef CONFIG_RPS
@@ -7254,6 +7417,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 			goto out;
 	}
 
+	err = -ENOMEM;
+	if (atomic_dec_if_positive(&net->owner_ve->netif_avail_nr) < 0)
+		goto out;
+	atomic_inc(&dev_net(dev)->owner_ve->netif_avail_nr);
+
 	/*
 	 * And now a mini version of register_netdevice unregister_netdevice.
 	 */
@@ -7290,6 +7458,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Send a netdev-removed uevent to the old namespace */
 	kobject_uevent(&dev->dev.kobj, KOBJ_REMOVE);
+	netdev_adjacent_del_links(dev);
 
 	/* Actually switch the network namespace */
 	dev_net_set(dev, net);
@@ -7304,6 +7473,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
 
 	/* Send a netdev-add uevent to the new namespace */
 	kobject_uevent(&dev->dev.kobj, KOBJ_ADD);
+	netdev_adjacent_add_links(dev);
 
 	/* Fixup kobjects */
 	err = device_rename(&dev->dev, dev->name);
@@ -7400,7 +7570,7 @@ netdev_features_t netdev_increment_features(netdev_features_t all,
 		mask |= NETIF_F_CSUM_MASK;
 	mask |= NETIF_F_VLAN_CHALLENGED;
 
-	all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
+	all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_CSUM_MASK|NETIF_F_VIRTUAL) & mask;
 	all &= one | ~NETIF_F_ALL_FOR_ALL;
 
 	/* If one device supports hw checksumming, set for all. */
@@ -7416,7 +7586,7 @@ static struct hlist_head *netdev_create_hash(void)
 	int i;
 	struct hlist_head *hash;
 
-	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL);
+	hash = kmalloc(sizeof(*hash) * NETDEV_HASHENTRIES, GFP_KERNEL_ACCOUNT);
 	if (hash != NULL)
 		for (i = 0; i < NETDEV_HASHENTRIES; i++)
 			INIT_HLIST_HEAD(&hash[i]);
--- a/net/core/dev_ioctl.c
+++ b/net/core/dev_ioctl.c
@@ -294,6 +294,8 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
 		return dev_mc_del_global(dev, ifr->ifr_hwaddr.sa_data);
 
 	case SIOCSIFTXQLEN:
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
 		if (ifr->ifr_qlen < 0)
 			return -EINVAL;
 		dev->tx_queue_len = ifr->ifr_qlen;
@@ -491,25 +493,23 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg)
 
 	/*
 	 *	These ioctl calls:
-	 *	- require superuser power.
-	 *	- require strict serialization.
-	 *	- do not return a value
-	 */
-	case SIOCSIFMAP:
-	case SIOCSIFTXQLEN:
-		if (!capable(CAP_NET_ADMIN))
-			return -EPERM;
-		/* fall through */
-	/*
-	 *	These ioctl calls:
 	 *	- require local superuser power.
 	 *	- require strict serialization.
 	 *	- do not return a value
 	 */
-	case SIOCSIFFLAGS:
-	case SIOCSIFMETRIC:
+	case SIOCSIFMAP:
 	case SIOCSIFMTU:
 	case SIOCSIFHWADDR:
+	case SIOCSIFFLAGS:
+		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+			return -EPERM;
+		dev_load(net, ifr.ifr_name);
+		rtnl_lock();
+		ret = dev_ifsioc(net, &ifr, cmd);
+		rtnl_unlock();
+		return ret;
+
+	case SIOCSIFMETRIC:
 	case SIOCSIFSLAVE:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -266,6 +266,7 @@ again:
 
 	lwtstate_put(dst->lwtstate);
 
+	dst->flags |= DST_FREE;
 	if (dst->flags & DST_METADATA)
 		metadata_dst_free((struct metadata_dst *)dst);
 	else
@@ -299,6 +300,21 @@ static void dst_destroy_rcu(struct rcu_head *head)
 		__dst_free(dst);
 }
 
+void dst_dump_one(struct dst_entry *d)
+{
+	printk("\tdev %p err %d obs %d flags %x i/o %p/%p ref %d use %d\n",
+			d->dev, (int)d->error, (int)d->obsolete, d->flags,
+			d->input, d->output, atomic_read(&d->__refcnt), d->__use);
+}
+EXPORT_SYMBOL(dst_dump_one);
+
+void dst_cache_dump(void)
+{
+	ip_rt_dump_dsts();
+	if (ip6_rt_dump_dsts)
+		ip6_rt_dump_dsts();
+}
+
 void dst_release(struct dst_entry *dst)
 {
 	if (dst) {
@@ -436,6 +452,7 @@ static int dst_dev_event(struct notifier_block *this, unsigned long event,
 	switch (event) {
 	case NETDEV_UNREGISTER_FINAL:
 	case NETDEV_DOWN:
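+		/* Run a synchronous gc pass first so entries that are
+		 * already released do not linger on the busy list. */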
+		dst_gc_task(NULL);
 		mutex_lock(&dst_gc_mutex);
 		for (dst = dst_busy_list; dst; dst = dst->next) {
 			last = dst;
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2397,8 +2397,11 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	case ETHTOOL_GEEE:
 	case ETHTOOL_GTUNABLE:
 		break;
+	case ETHTOOL_SEEPROM:
+		if (!capable(CAP_NET_ADMIN))
+			return -EPERM;
 	default:
-		if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
+		if (!ve_capable(CAP_NET_ADMIN))
 			return -EPERM;
 	}
 
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -23,7 +23,7 @@ int fib_default_rule_add(struct fib_rules_ops *ops,
 {
 	struct fib_rule *r;
 
-	r = kzalloc(ops->rule_size, GFP_KERNEL);
+	r = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
 	if (r == NULL)
 		return -ENOMEM;
 
@@ -285,7 +285,7 @@ static int fib_nl_newrule(struct sk_buff *skb, struct nlmsghdr* nlh)
 	if (err < 0)
 		goto errout;
 
-	rule = kzalloc(ops->rule_size, GFP_KERNEL);
+	rule = kzalloc(ops->rule_size, GFP_KERNEL_ACCOUNT);
 	if (rule == NULL) {
 		err = -ENOMEM;
 		goto errout;
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -67,9 +67,10 @@ static inline void *load_pointer(const struct sk_buff *skb, int k,
 }
 
 /**
- *	sk_filter - run a packet through a socket filter
+ *	sk_filter_trim_cap - run a packet through a socket filter
  *	@sk: sock associated with &sk_buff
  *	@skb: buffer to filter
+ *	@cap: limit on how short the eBPF program may trim the packet
  *
  * Run the filter code and then cut skb->data to correct size returned by
  * sk_run_filter. If pkt_len is 0 we toss packet. If skb->len is smaller
@@ -78,7 +79,7 @@ static inline void *load_pointer(const struct sk_buff *skb, int k,
  * be accepted or -EPERM if the packet should be tossed.
  *
  */
-int sk_filter(struct sock *sk, struct sk_buff *skb)
+int sk_filter_trim_cap(struct sock *sk, struct sk_buff *skb, unsigned int cap)
 {
 	int err;
 	struct sk_filter *filter;
@@ -99,14 +100,13 @@ int sk_filter(struct sock *sk, struct sk_buff *skb)
 	filter = rcu_dereference(sk->sk_filter);
 	if (filter) {
 		unsigned int pkt_len = SK_RUN_FILTER(filter, skb);
-
-		err = pkt_len ? pskb_trim(skb, pkt_len) : -EPERM;
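+		/* Never trim the skb below 'cap', so callers that still
+		 * need their protocol headers get at least that much. */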
+		err = pkt_len ? pskb_trim(skb, max(cap, pkt_len)) : -EPERM;
 	}
 	rcu_read_unlock();
 
 	return err;
 }
-EXPORT_SYMBOL(sk_filter);
+EXPORT_SYMBOL(sk_filter_trim_cap);
 
 /**
  *	sk_run_filter - run a filter on a socket
@@ -734,7 +734,7 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
 	if (fprog->filter == NULL)
 		return -EINVAL;
 
-	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL);
+	fp = sock_kmalloc(sk, fsize+sizeof(*fp), GFP_KERNEL_ACCOUNT);
 	if (!fp)
 		return -ENOMEM;
 	if (copy_from_user(fp->insns, fprog->filter, fsize)) {
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -128,6 +128,10 @@ int memcpy_toiovecend_partial(const struct iovec *iov, unsigned char *kdata,
 int memcpy_fromiovecend(unsigned char *kdata, const struct iovec *iov,
 			int offset, int len)
 {
+	/* No data? Done! */
+	if (len == 0)
+		return 0;
+
 	/* Skip over the finished iovecs */
 	while (offset >= iov->iov_len) {
 		offset -= iov->iov_len;
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -24,6 +24,7 @@
 #include <linux/socket.h>
 #include <linux/netdevice.h>
 #include <linux/proc_fs.h>
+#include <linux/ve.h>
 #ifdef CONFIG_SYSCTL
 #include <linux/sysctl.h>
 #endif
@@ -40,6 +41,7 @@
 #include <linux/log2.h>
 #include <linux/inetdevice.h>
 #include <net/addrconf.h>
+#include <bc/beancounter.h>
 
 #define DEBUG
 #define NEIGH_DEBUG 1
@@ -276,6 +278,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 	int entries;
 
 	entries = atomic_inc_return(&tbl->entries) - 1;
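+	/* Let the caller distinguish table overflow (-ENOBUFS) from an
+	 * allocation failure (-ENOMEM). */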
+	n = ERR_PTR(-ENOBUFS);
 	if (entries >= tbl->gc_thresh3 ||
 	    (entries >= tbl->gc_thresh2 &&
 	     time_after(now, tbl->last_flush + 5 * HZ))) {
@@ -286,7 +289,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 
 	n = kzalloc(tbl->entry_size + dev->neigh_priv_len, GFP_ATOMIC);
 	if (!n)
-		goto out_entries;
+		goto out_nomem;
 
 	__skb_queue_head_init(&n->arp_queue);
 	rwlock_init(&n->lock);
@@ -305,6 +308,8 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device
 out:
 	return n;
 
+out_nomem:
+	n = ERR_PTR(-ENOMEM);
 out_entries:
 	atomic_dec(&tbl->entries);
 	goto out;
@@ -466,13 +471,12 @@ struct neighbour *__neigh_create(struct neigh_table *tbl, const void *pkey,
 	u32 hash_val;
 	int key_len = tbl->key_len;
 	int error;
-	struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
+	struct neighbour *n1, *rc, *n;
 	struct neigh_hash_table *nht;
 
-	if (!n) {
-		rc = ERR_PTR(-ENOBUFS);
+	rc = n = neigh_alloc(tbl, dev);
+	if (IS_ERR(n))
 		goto out;
-	}
 
 	memcpy(n->primary_key, pkey, key_len);
 	n->dev = dev;
@@ -698,6 +702,13 @@ void neigh_destroy(struct neighbour *neigh)
 
 	NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
 
+	if (neigh->dev->is_leaked) {
+		printk(KERN_WARNING
+		       "Destroying neighbour %p on leaked device\n", neigh);
+		dump_stack();
+		return;
+	}
+
 	if (!neigh->dead) {
 		pr_warn("Destroying alive neighbour %p\n", neigh);
 		dump_stack();
@@ -802,7 +813,9 @@ static void neigh_periodic_work(struct work_struct *work)
 				*np = n->next;
 				n->dead = 1;
 				write_unlock(&n->lock);
+
 				neigh_cleanup_and_release(n);
+
 				continue;
 			}
 			write_unlock(&n->lock);
@@ -1443,7 +1456,7 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
 
 	for (p = &tbl->parms; p; p = p->next) {
 		if ((p->dev && p->dev->ifindex == ifindex && net_eq(neigh_parms_net(p), net)) ||
-		    (!p->dev && !ifindex))
+		    (!p->dev && !ifindex && net_eq(net, &init_net)))
 			return p;
 	}
 
@@ -1453,15 +1466,11 @@ static inline struct neigh_parms *lookup_neigh_parms(struct neigh_table *tbl,
 struct neigh_parms *neigh_parms_alloc(struct net_device *dev,
 				      struct neigh_table *tbl)
 {
-	struct neigh_parms *p, *ref;
+	struct neigh_parms *p;
 	struct net *net = dev_net(dev);
 	const struct net_device_ops *ops = dev->netdev_ops;
 
-	ref = lookup_neigh_parms(tbl, net, 0);
-	if (!ref)
-		return NULL;
-
-	p = kmemdup(ref, sizeof(*p), GFP_KERNEL);
+	p = kmemdup(&tbl->parms, sizeof(*p), GFP_KERNEL);
 	if (p) {
 		p->tbl		  = tbl;
 		atomic_set(&p->refcnt, 1);
@@ -2099,6 +2108,12 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh)
 		}
 	}
 
+	err = -ENOENT;
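+	/* The gc thresholds and interval are host-wide state; refuse
+	 * changes from inside a container. */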
+	if ((tb[NDTA_THRESH1] || tb[NDTA_THRESH2] ||
+	     tb[NDTA_THRESH3] || tb[NDTA_GC_INTERVAL]) &&
+	    !net_eq(net, &init_net))
+		goto errout_tbl_lock;
+
 	if (tb[NDTA_THRESH1])
 		tbl->gc_thresh1 = nla_get_u32(tb[NDTA_THRESH1]);
 
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -3,10 +3,10 @@
 #include <linux/workqueue.h>
 #include <linux/rtnetlink.h>
 #include <linux/cache.h>
+#include <linux/proc_fs.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/delay.h>
-#include <linux/sched.h>
 #include <linux/idr.h>
 #include <linux/rculist.h>
 #include <linux/nsproxy.h>
@@ -16,12 +16,14 @@
 #include <linux/export.h>
 #include <linux/user_namespace.h>
 #include <linux/net_namespace.h>
+#include <linux/netdevice.h>
 #ifndef __GENKSYMS__
 #include <net/sock.h>
 #endif
 #include <net/netlink.h>
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
+#include <linux/ve.h>
 
 /*
  *	Our network namespace constructor/destructor lists
@@ -36,6 +38,12 @@ EXPORT_SYMBOL_GPL(net_namespace_list);
 
 struct net init_net = {
 	.dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head),
+#ifdef CONFIG_VE
+	.owner_ve = &ve0,
+#ifdef CONFIG_VE_IPTABLES
+	._iptables_modules = VE_IP_NONE,
+#endif
+#endif
 };
 EXPORT_SYMBOL(init_net);
 
@@ -55,7 +63,7 @@ static struct net_generic *net_alloc_generic(void)
 	return ng;
 }
 
-static int net_assign_generic(struct net *net, int id, void *data)
+int net_assign_generic(struct net *net, int id, void *data)
 {
 	struct net_generic *ng, *old_ng;
 
@@ -91,6 +99,7 @@ assign:
 	ng->ptr[id - 1] = data;
 	return 0;
 }
+EXPORT_SYMBOL_GPL(net_assign_generic);
 
 static int ops_init(const struct pernet_operations *ops, struct net *net)
 {
@@ -278,6 +287,10 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns)
 	int error = 0;
 	LIST_HEAD(net_exit_list);
 
+#ifdef CONFIG_VE
+	net->owner_ve = get_ve(get_exec_env());
+#endif
+
 	atomic_set(&net->count, 1);
 	atomic_set(&net->passive, 1);
 	net->dev_base_seq = 1;
@@ -307,6 +320,9 @@ out_undo:
 		ops_free_list(ops, &net_exit_list);
 
 	rcu_barrier();
+#ifdef CONFIG_VE
+	put_ve(net->owner_ve);
+#endif
 	goto out;
 }
 
@@ -353,12 +369,16 @@ void net_drop_ns(void *p)
 struct net *copy_net_ns(unsigned long flags,
 			struct user_namespace *user_ns, struct net *old_net)
 {
+	struct ve_struct *ve = get_exec_env();
 	struct net *net;
 	int rv;
 
 	if (!(flags & CLONE_NEWNET))
 		return get_net(old_net);
 
+	if (atomic_dec_if_positive(&ve->netns_avail_nr) < 0)
+		return ERR_PTR(-ENOMEM);
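+	/* The slot is given back in cleanup_net() or on failure below. */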
+
 	net = net_alloc();
 	if (!net)
 		return ERR_PTR(-ENOMEM);
@@ -376,6 +396,7 @@ struct net *copy_net_ns(unsigned long flags,
 	if (rv < 0) {
 		put_user_ns(user_ns);
 		net_drop_ns(net);
+		atomic_inc(&ve->netns_avail_nr);
 		return ERR_PTR(rv);
 	}
 	return net;
@@ -390,12 +411,26 @@ static void cleanup_net(struct work_struct *work)
 	struct net *net, *tmp;
 	struct list_head net_kill_list;
 	LIST_HEAD(net_exit_list);
+	bool reload = false;
+	int i = 0;
 
 	/* Atomically snapshot the list of namespaces to cleanup */
 	spin_lock_irq(&cleanup_list_lock);
-	list_replace_init(&cleanup_list, &net_kill_list);
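+	/* Take at most 16 namespaces per pass and requeue the work for
+	 * the remainder, so one burst of dying namespaces is processed
+	 * in manageable chunks. */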
+	list_for_each_entry_safe(net, tmp, &cleanup_list, cleanup_list)
+		if (++i == 16)
+			break;
+
+	if (i == 16) {
+		list_cut_position(&net_kill_list, &cleanup_list,
+						&net->cleanup_list);
+		reload = true;
+	} else
+		list_replace_init(&cleanup_list, &net_kill_list);
 	spin_unlock_irq(&cleanup_list_lock);
 
+	if (reload)
+		queue_work(netns_wq, work);
+
 	mutex_lock(&net_mutex);
 
 	/* Don't let anyone else find us. */
@@ -436,6 +471,15 @@ static void cleanup_net(struct work_struct *work)
 	list_for_each_entry_reverse(ops, &pernet_list, list)
 		ops_free_list(ops, &net_exit_list);
 
+	list_for_each_entry(net, &net_kill_list, cleanup_list) {
+		struct ve_struct *ve = net->owner_ve;
+
+		atomic_inc(&ve->netns_avail_nr);
+		if (ve->ve_netns == net)
+			ve->ve_netns = NULL;
+		put_ve(ve);
+	}
+
 	mutex_unlock(&net_mutex);
 
 	/* Ensure there are no outstanding rcu callbacks using this
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -39,6 +39,7 @@
 #include <linux/if_vlan.h>
 #include <linux/pci.h>
 #include <linux/etherdevice.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 
@@ -2416,6 +2417,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 {
 	int idx;
 	int s_idx = cb->family;
+	struct net *net = sock_net(skb->sk);
 
 	if (s_idx == 0)
 		s_idx = 1;
@@ -2426,6 +2428,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb)
 		if (rtnl_msg_handlers[idx] == NULL ||
 		    rtnl_msg_handlers[idx][type].dumpit == NULL)
 			continue;
+		if (vz_security_family_check(net, idx, cb->nlh->nlmsg_type))
+			continue;
 		if (idx > s_idx) {
 			memset(&cb->args[0], 0, sizeof(cb->args));
 			cb->prev_seq = 0;
@@ -3229,6 +3233,9 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 		return 0;
 
 	family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
+	if (vz_security_family_check(net, family, nlh->nlmsg_type))
+		return -EAFNOSUPPORT;
+
 	sz_idx = type>>2;
 	kind = type&3;
 
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -38,7 +38,6 @@
 #include <net/scm.h>
 #include <net/cls_cgroup.h>
 
-
 /*
  *	Only allow a user to send credentials, that they could set with
  *	setu(g)id.
@@ -54,6 +53,7 @@ static __inline__ int scm_check_creds(struct ucred *creds)
 		return -EINVAL;
 
 	if ((creds->pid == task_tgid_vnr(current) ||
+	     creds->pid == current->tgid ||
 	     ns_capable(task_active_pid_ns(current)->user_ns, CAP_SYS_ADMIN)) &&
 	    ((uid_eq(uid, cred->uid)   || uid_eq(uid, cred->euid) ||
 	      uid_eq(uid, cred->suid)) || ns_capable(cred->user_ns, CAP_SETUID)) &&
@@ -81,7 +81,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
 
 	if (!fpl)
 	{
-		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL);
+		fpl = kmalloc(sizeof(struct scm_fp_list), GFP_KERNEL_ACCOUNT);
 		if (!fpl)
 			return -ENOMEM;
 		*fplp = fpl;
@@ -338,7 +338,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
 		return NULL;
 
 	new_fpl = kmemdup(fpl, offsetof(struct scm_fp_list, fp[fpl->count]),
-			  GFP_KERNEL);
+			  GFP_KERNEL_ACCOUNT);
 	if (new_fpl) {
 		for (i = 0; i < fpl->count; i++)
 			get_file(fpl->fp[i]);
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3875,7 +3875,8 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
 		return;
 
 	skb_orphan(skb);
-	skb->mark = 0;
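+	/* VENET devices keep their skb->mark across the scrub. */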
+	if (!(skb->dev->features & NETIF_F_VENET))
+		skb->mark = 0;
 }
 EXPORT_SYMBOL_GPL(skb_scrub_packet);
 
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -131,6 +131,8 @@
 #include <net/cls_cgroup.h>
 #include <net/netprio_cgroup.h>
 
+#include <bc/beancounter.h>
+
 #include <linux/filter.h>
 
 #include <trace/events/sock.h>
@@ -366,8 +368,8 @@ static void sock_warn_obsolete_bsdism(const char *name)
 	static char warncomm[TASK_COMM_LEN];
 	if (strcmp(warncomm, current->comm) && warned < 5) {
 		strcpy(warncomm,  current->comm);
-		pr_warn("process `%s' is using obsolete %s SO_BSDCOMPAT\n",
-			warncomm, name);
+		ve_printk(VE_LOG, KERN_WARNING "process `%s' is using obsolete "
+			"%s SO_BSDCOMPAT\n", warncomm, name);
 		warned++;
 	}
 }
@@ -1567,6 +1569,8 @@ EXPORT_SYMBOL_GPL(sk_clone_lock);
 
 void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 {
+	extern int sysctl_tcp_use_sg;
+
 	sk_dst_set(sk, dst);
 	sk->sk_route_caps = dst->dev->features;
 	if (sk->sk_route_caps & NETIF_F_GSO)
@@ -1581,6 +1585,8 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
 			sk->sk_gso_max_segs = dst->dev->gso_max_segs;
 		}
 	}
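+	/* The tcp_use_sg sysctl allows scatter-gather tx to be
+	 * disabled globally. */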
+	if (!sysctl_tcp_use_sg)
+		sk->sk_route_caps &= ~NETIF_F_SG;
 }
 EXPORT_SYMBOL_GPL(sk_setup_caps);
 
@@ -2716,7 +2722,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 {
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
-					SLAB_HWCACHE_ALIGN | prot->slab_flags,
+					SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT | prot->slab_flags,
 					NULL);
 
 		if (prot->slab == NULL) {
@@ -2732,7 +2738,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 
 			prot->rsk_prot->slab = kmem_cache_create(prot->rsk_prot->slab_name,
 								 prot->rsk_prot->obj_size, 0,
-								 SLAB_HWCACHE_ALIGN, NULL);
+								 SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 
 			if (prot->rsk_prot->slab == NULL) {
 				pr_crit("%s: Can't create request sock SLAB cache!\n",
@@ -2751,7 +2757,7 @@ int proto_register(struct proto *prot, int alloc_slab)
 				kmem_cache_create(prot->twsk_prot->twsk_slab_name,
 						  prot->twsk_prot->twsk_obj_size,
 						  0,
-						  SLAB_HWCACHE_ALIGN |
+						  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT |
 							prot->slab_flags,
 						  NULL);
 			if (prot->twsk_prot->twsk_slab == NULL)
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -486,6 +486,9 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 		newsk->sk_backlog_rcv = dccp_v4_do_rcv;
 		newnp->pktoptions  = NULL;
 		newnp->opt	   = NULL;
+		newnp->ipv6_mc_list = NULL;
+		newnp->ipv6_ac_list = NULL;
+		newnp->ipv6_fl_list = NULL;
 		newnp->mcast_oif   = inet6_iif(skb);
 		newnp->mcast_hops  = ipv6_hdr(skb)->hop_limit;
 
@@ -540,6 +543,8 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 	ip6_dst_store(newsk, dst, NULL, NULL);
 	newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
 						      NETIF_F_TSO);
+	if (!sysctl_tcp_use_sg)
+		newsk->sk_route_caps &= ~NETIF_F_SG;
 	newdp6 = (struct dccp6_sock *)newsk;
 	newinet = inet_sk(newsk);
 	newinet->pinet6 = &newdp6->inet6;
@@ -561,6 +566,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk,
 	/* Clone RX bits */
 	newnp->rxopt.all = np->rxopt.all;
 
+	newnp->ipv6_mc_list = NULL;
+	newnp->ipv6_ac_list = NULL;
+	newnp->ipv6_fl_list = NULL;
+
 	/* Clone pktoptions received with SYN */
 	newnp->pktoptions = NULL;
 	if (ireq->pktopts != NULL) {
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -1056,7 +1056,7 @@ adjudge_to_death:
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != DCCP_CLOSED && sk->sk_state == DCCP_CLOSED)
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -442,6 +442,15 @@ config INET_UDP_DIAG
 	  Support for UDP socket monitoring interface used by the ss tool.
 	  If unsure, say Y.
 
+config INET_RAW_DIAG
+	tristate "RAW: socket monitoring interface"
+	depends on INET_DIAG && (IPV6 || IPV6=n)
+	default n
+	---help---
+	  Support for RAW socket monitoring interface used by the ss tool.
+	  If unsure, say Y.
+
 menuconfig TCP_CONG_ADVANCED
 	bool "TCP: advanced congestion control"
 	---help---
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -40,6 +40,7 @@ obj-$(CONFIG_NETFILTER)	+= netfilter.o netfilter/
 obj-$(CONFIG_INET_DIAG) += inet_diag.o 
 obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o
 obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
+obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
@@ -55,6 +56,7 @@ obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
 obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
 obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
 obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
+obj-$(CONFIG_MEMCG_KMEM) += udp_memcontrol.o
 obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
 
 obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -74,7 +74,6 @@
 #include <linux/in.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/sched.h>
 #include <linux/timer.h>
 #include <linux/string.h>
 #include <linux/sockios.h>
@@ -89,6 +88,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/random.h>
 #include <linux/slab.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 
@@ -310,6 +310,10 @@ lookup_protocol:
 			goto out_rcu_unlock;
 	}
 
+	err = vz_security_protocol_check(net, answer->protocol);
+	if (err < 0)
+		goto out_rcu_unlock;
+
 	err = -EPERM;
 	if (sock->type == SOCK_RAW && !kern &&
 	    !ns_capable(net->user_ns, CAP_NET_RAW))
@@ -1426,27 +1430,29 @@ int inet_ctl_sock_create(struct sock **sk, unsigned short family,
 }
 EXPORT_SYMBOL_GPL(inet_ctl_sock_create);
 
-unsigned long snmp_fold_field(void __percpu *mib[], int offt)
+unsigned long __snmp_fold_field(void __percpu *mib[], int offt,
+				const struct cpumask *mask)
 {
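+	/* Fold the per-cpu counters, but only over the cpus in @mask. */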
 	unsigned long res = 0;
 	int i, j;
 
-	for_each_possible_cpu(i) {
+	for_each_cpu(i, mask) {
 		for (j = 0; j < SNMP_ARRAY_SZ; j++)
 			res += *(((unsigned long *) per_cpu_ptr(mib[j], i)) + offt);
 	}
 	return res;
 }
-EXPORT_SYMBOL_GPL(snmp_fold_field);
+EXPORT_SYMBOL_GPL(__snmp_fold_field);
 
 #if BITS_PER_LONG==32
 
-u64 snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset)
+u64 __snmp_fold_field64(void __percpu *mib[], int offt, size_t syncp_offset,
+			const struct cpumask *mask)
 {
 	u64 res = 0;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
+	for_each_cpu(cpu, mask) {
 		void *bhptr;
 		struct u64_stats_sync *syncp;
 		u64 v;
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -195,7 +195,7 @@ static void devinet_sysctl_unregister(struct in_device *idev)
 
 static struct in_ifaddr *inet_alloc_ifa(void)
 {
-	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL);
+	return kzalloc(sizeof(struct in_ifaddr), GFP_KERNEL_ACCOUNT);
 }
 
 static void inet_rcu_free_ifa(struct rcu_head *head)
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -264,7 +264,8 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst,
 	if (fib_lookup(net, &fl4, &res))
 		goto last_resort;
 	if (res.type != RTN_UNICAST) {
-		if (res.type != RTN_LOCAL || !accept_local)
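+		/* A host-local source address is acceptable when the
+		 * packet arrived over a VENET device. */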
+		if (!(dev->features & NETIF_F_VENET) ||
+		    res.type != RTN_LOCAL || !accept_local)
 			goto e_inval;
 	}
 	fib_combine_itag(itag, &res);
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1718,11 +1718,11 @@ void __init fib_trie_init(void)
 {
 	fn_alias_kmem = kmem_cache_create("ip_fib_alias",
 					  sizeof(struct fib_alias),
-					  0, SLAB_PANIC, NULL);
+					  0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 
 	trie_leaf_kmem = kmem_cache_create("ip_fib_trie",
 					   LEAF_SIZE,
-					   0, SLAB_PANIC, NULL);
+					   0, SLAB_PANIC | SLAB_ACCOUNT, NULL);
 }
 
 struct fib_table *fib_trie_table(u32 id)
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -2139,11 +2139,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 	struct ip_sf_socklist *psl;
 	struct net *net = sock_net(sk);
 
+	ASSERT_RTNL();
+
 	if (!ipv4_is_multicast(addr))
 		return -EINVAL;
 
-	rtnl_lock();
-
 	imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
 	imr.imr_address.s_addr = msf->imsf_interface;
 	imr.imr_ifindex = 0;
@@ -2164,7 +2164,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 		goto done;
 	msf->imsf_fmode = pmc->sfmode;
 	psl = rtnl_dereference(pmc->sflist);
-	rtnl_unlock();
 	if (!psl) {
 		len = 0;
 		count = 0;
@@ -2183,7 +2182,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
 		return -EFAULT;
 	return 0;
 done:
-	rtnl_unlock();
 	return err;
 }
 
@@ -2197,6 +2195,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	struct inet_sock *inet = inet_sk(sk);
 	struct ip_sf_socklist *psl;
 
+	ASSERT_RTNL();
+
 	psin = (struct sockaddr_in *)&gsf->gf_group;
 	if (psin->sin_family != AF_INET)
 		return -EINVAL;
@@ -2204,8 +2204,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	if (!ipv4_is_multicast(addr))
 		return -EINVAL;
 
-	rtnl_lock();
-
 	err = -EADDRNOTAVAIL;
 
 	for_each_pmc_rtnl(inet, pmc) {
@@ -2217,7 +2215,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 		goto done;
 	gsf->gf_fmode = pmc->sfmode;
 	psl = rtnl_dereference(pmc->sflist);
-	rtnl_unlock();
 	count = psl ? psl->sl_count : 0;
 	copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
 	gsf->gf_numsrc = count;
@@ -2237,7 +2234,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
 	}
 	return 0;
 done:
-	rtnl_unlock();
 	return err;
 }
 
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -23,6 +23,7 @@
 #include <net/route.h>
 #include <net/tcp_states.h>
 #include <net/xfrm.h>
+#include <net/tcp.h>
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -694,6 +695,8 @@ struct sock *inet_csk_clone_lock(const struct sock *sk,
 		inet_sk(newsk)->inet_sport = htons(inet_rsk(req)->ir_num);
 		newsk->sk_write_space = sk_stream_write_space;
 
+		inet_sk(newsk)->mc_list = NULL;
+
 		newicsk->icsk_retransmits = 0;
 		newicsk->icsk_backoff	  = 0;
 		newicsk->icsk_probes_out  = 0;
@@ -732,7 +735,7 @@ void inet_csk_destroy_sock(struct sock *sk)
 
 	sk_refcnt_debug_release(sk);
 
-	percpu_counter_dec(sk->sk_prot->orphan_count);
+	orphan_count_dec(sk);
 	sock_put(sk);
 }
 EXPORT_SYMBOL(inet_csk_destroy_sock);
@@ -749,7 +752,7 @@ void inet_csk_prepare_forced_close(struct sock *sk)
 
 	/* The below has to be done to allow calling inet_csk_destroy_sock */
 	sock_set_flag(sk, SOCK_DEAD);
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 	inet_sk(sk)->inet_num = 0;
 }
 EXPORT_SYMBOL(inet_csk_prepare_forced_close);
@@ -828,7 +831,7 @@ void inet_csk_listen_stop(struct sock *sk)
 
 		sock_orphan(child);
 
-		percpu_counter_inc(sk->sk_prot->orphan_count);
+		orphan_count_inc(sk);
 
 		if (sk->sk_protocol == IPPROTO_TCP && tcp_rsk(req)->listener) {
 			BUG_ON(tcp_sk(child)->fastopen_rsk != req);
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -193,6 +193,15 @@ int inet_sk_diag_fill(struct sock *sk, struct inet_connection_sock *icsk,
 		if (sock_diag_put_meminfo(sk, skb, INET_DIAG_SKMEMINFO))
 			goto errout;
 
+	/*
+	 * RAW sockets might have user-defined protocols assigned,
+	 * so report the one supplied on socket creation.
+	 */
+	if (sk->sk_type == SOCK_RAW) {
+		if (nla_put_u8(skb, INET_DIAG_PROTOCOL, sk->sk_protocol))
+			goto errout;
+	}
+
 	if (!icsk) {
 		handler->idiag_get_info(sk, r, NULL);
 		goto out;
@@ -859,7 +868,6 @@ void inet_diag_dump_icsk(struct inet_hashinfo *hashinfo, struct sk_buff *skb,
 
 				if (!net_eq(sock_net(sk), net))
 					continue;
-
 				if (num < s_num) {
 					num++;
 					continue;
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -20,6 +20,7 @@
 #include <linux/skbuff.h>
 #include <linux/rtnetlink.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include <net/sock.h>
 #include <net/inet_frag.h>
@@ -295,6 +296,7 @@ static struct inet_frag_queue *inet_frag_alloc(struct netns_frags *nf,
 		return NULL;
 
 	q->net = nf;
+
 	f->constructor(q, arg);
 	add_frag_mem_limit(q, f->qsize);
 
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -115,6 +115,24 @@ int ip_forward(struct sk_buff *skb)
 		goto drop;
 	}
 
+	/*
+	 * We try to optimize forwarding of VE packets:
+	 * do not decrement the TTL (and so avoid skb_cow)
+	 * when forwarding outgoing packets from a VE.
+	 * For incoming packets we still decrement the TTL:
+	 * such an skb is not cloned, so the cow does not
+	 * actually copy anything. Thus there is at least one
+	 * place on the packet's path with a mandatory TTL
+	 * decrement, which is sufficient to prevent routing
+	 * loops.
+	 */
+	iph = ip_hdr(skb);
+	if (
+#ifdef CONFIG_IP_ROUTE_NAT
+	    (rt->rt_flags & RTCF_NAT) == 0 &&	  /* no NAT mangling expected */
+#endif						  /* and */
+	    (skb->dev->features & NETIF_F_VENET)) /* src is VENET device */
+		goto no_ttl_decr;
+
 	/* We are about to mangle packet. Copy it! */
 	if (skb_cow(skb, LL_RESERVED_SPACE(rt->dst.dev)+rt->dst.header_len))
 		goto drop;
@@ -123,6 +141,8 @@ int ip_forward(struct sk_buff *skb)
 	/* Decrease ttl after skb cow done */
 	ip_decrease_ttl(iph);
 
+no_ttl_decr:
+
 	/*
 	 *	We now generate an ICMP HOST REDIRECT giving the route
 	 *	we calculated.
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -195,10 +195,11 @@ static void ip_evictor(struct net *net)
  */
 static void ip_expire(unsigned long arg)
 {
+	struct inet_frag_queue *q = (struct inet_frag_queue *)arg;
 	struct ipq *qp;
 	struct net *net;
 
-	qp = container_of((struct inet_frag_queue *) arg, struct ipq, q);
+	qp = container_of(q, struct ipq, q);
 	net = container_of(qp->q.net, struct net, ipv4.frags);
 
 	spin_lock(&qp->q.lock);
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -833,6 +833,7 @@ static void ipgre_tunnel_setup(struct net_device *dev)
 	dev->netdev_ops		= &ipgre_netdev_ops;
 	dev->type		= ARPHRD_IPGRE;
 	ip_tunnel_setup(dev, ipgre_net_id);
+	dev->features |= NETIF_F_VIRTUAL;
 }
 
 static void __gre_tunnel_init(struct net_device *dev)
@@ -1072,6 +1073,7 @@ static void ipgre_tap_setup(struct net_device *dev)
 	ether_setup(dev);
 	dev->netdev_ops		= &gre_tap_netdev_ops;
 	ip_tunnel_setup(dev, gre_tap_net_id);
+	dev->features |= NETIF_F_VIRTUAL;
 }
 
 static int ipgre_newlink(struct net *src_net, struct net_device *dev,
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -192,6 +192,8 @@ static int ip_local_deliver_finish(struct sock *sk, struct sk_buff *skb)
 {
 	struct net *net = dev_net(skb->dev);
 
+	if (skb->destructor)
+		skb_orphan(skb);
 	__skb_pull(skb, skb_network_header_len(skb));
 
 	rcu_read_lock();
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -286,6 +286,10 @@ int ip_options_compile(struct net *net,
 			optptr++;
 			continue;
 		}
+		if (unlikely(l < 2)) {
+			pp_ptr = optptr;
+			goto error;
+		}
 		optlen = optptr[1];
 		if (optlen<2 || optlen>l) {
 			pp_ptr = optptr;
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -373,6 +373,7 @@ static void ip_copy_addrs(struct iphdr *iph, const struct flowi4 *fl4)
 	memcpy(&iph->saddr, &fl4->saddr,
 	       sizeof(fl4->saddr) + sizeof(fl4->daddr));
 }
+EXPORT_SYMBOL(ip_output);
 
 int ip_queue_xmit(struct sk_buff *skb, struct flowi *fl)
 {
@@ -1550,6 +1551,7 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
 	if (__ip_options_echo(&replyopts.opt.opt, skb, sopt))
 		return;
 
+	saddr = ip_hdr(skb)->daddr;
 	ipc.addr = daddr;
 	ipc.opt = NULL;
 	ipc.tx_flags = 0;
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -1093,7 +1093,14 @@ void ipv4_pktinfo_prepare(const struct sock *sk, struct sk_buff *skb)
 		pktinfo->ipi_ifindex = 0;
 		pktinfo->ipi_spec_dst.s_addr = 0;
 	}
-	skb_dst_drop(skb);
+	/* We need to keep the dst for __ip_options_echo()
+	 * We could restrict the test to opt.ts_needtime || opt.srr,
+	 * but the following is good enough as IP options are not often used.
+	 */
+	if (unlikely(IPCB(skb)->opt.optlen))
+		skb_dst_force(skb);
+	else
+		skb_dst_drop(skb);
 }
 
 int ip_setsockopt(struct sock *sk, int level,
@@ -1156,11 +1163,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
  *	the _received_ ones. The set sets the _sent_ ones.
  */
 
+static bool getsockopt_needs_rtnl(int optname)
+{
+	switch (optname) {
+	case IP_MSFILTER:
+	case MCAST_MSFILTER:
+		return true;
+	}
+	return false;
+}
+
 static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			    char __user *optval, int __user *optlen, unsigned int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
-	int val;
+	bool needs_rtnl = getsockopt_needs_rtnl(optname);
+	int val, err = 0;
 	int len;
 
 	if (level != SOL_IP)
@@ -1174,6 +1192,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	if (len < 0)
 		return -EINVAL;
 
+	if (needs_rtnl)
+		rtnl_lock();
 	lock_sock(sk);
 
 	switch (optname) {
@@ -1285,39 +1305,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 	case IP_MSFILTER:
 	{
 		struct ip_msfilter msf;
-		int err;
 
 		if (len < IP_MSFILTER_SIZE(0)) {
-			release_sock(sk);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 		if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
-			release_sock(sk);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 		err = ip_mc_msfget(sk, &msf,
 				   (struct ip_msfilter __user *)optval, optlen);
-		release_sock(sk);
-		return err;
+		goto out;
 	}
 	case MCAST_MSFILTER:
 	{
 		struct group_filter gsf;
-		int err;
 
 		if (len < GROUP_FILTER_SIZE(0)) {
-			release_sock(sk);
-			return -EINVAL;
+			err = -EINVAL;
+			goto out;
 		}
 		if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
-			release_sock(sk);
-			return -EFAULT;
+			err = -EFAULT;
+			goto out;
 		}
 		err = ip_mc_gsfget(sk, &gsf,
 				   (struct group_filter __user *)optval,
 				   optlen);
-		release_sock(sk);
-		return err;
+		goto out;
 	}
 	case IP_MULTICAST_ALL:
 		val = inet->mc_all;
@@ -1384,6 +1400,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
 			return -EFAULT;
 	}
 	return 0;
+
+out:
+	release_sock(sk);
+	if (needs_rtnl)
+		rtnl_unlock();
+	return err;
 }
 
 int ip_getsockopt(struct sock *sk, int level,
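
The IP_MSFILTER/MCAST_MSFILTER getters call into multicast code that
needs the RTNL, and taking it after the socket lock would invert the
order used by the setsockopt path and risk deadlock.  Hence the RTNL is
taken first, released last, and every error path funnels through one
unlock site.  A condensed sketch of the pattern (names as in the hunk
above, handling elided):

	bool needs_rtnl = getsockopt_needs_rtnl(optname);
	int err = 0;

	if (needs_rtnl)
		rtnl_lock();		/* RTNL first, matching setsockopt */
	lock_sock(sk);

	/* option handling: failures set err and jump to out */

out:
	release_sock(sk);
	if (needs_rtnl)
		rtnl_unlock();		/* strict reverse order */
	return err;
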
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -58,6 +58,9 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 spi,
 	struct net *net = dev_net(skb->dev);
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
+	if (itn == NULL)
+		return -EINVAL;
+
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->saddr, iph->daddr, 0);
 	if (tunnel != NULL) {
@@ -256,6 +259,9 @@ static int vti4_err(struct sk_buff *skb, u32 info)
 	int protocol = iph->protocol;
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
 
+	if (itn == NULL)
+		return -1;
+
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 				  iph->daddr, iph->saddr, 0);
 	if (!tunnel)
@@ -413,6 +419,9 @@ static int __net_init vti_init_net(struct net *net)
 	int err;
 	struct ip_tunnel_net *itn;
 
+	if (!ve_is_super(net->owner_ve))
+		return net_assign_generic(net, vti_net_id, NULL);
+
 	err = ip_tunnel_init_net(net, vti_net_id, &vti_link_ops, "ip_vti0");
 	if (err)
 		return err;
@@ -424,6 +433,9 @@ static int __net_init vti_init_net(struct net *net)
 static void __net_exit vti_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, vti_net_id);
+
+	if (itn == NULL)
+		return;
 	ip_tunnel_delete_net(itn, &vti_link_ops);
 }
 
@@ -473,6 +485,9 @@ static int vti_newlink(struct net *src_net, struct net_device *dev,
 {
 	struct ip_tunnel_parm parms;
 
+	if (net_generic(dev_net(dev), vti_net_id) == NULL)
+		return -EACCES;
+
 	vti_netlink_parms(data, &parms);
 	return ip_tunnel_newlink(dev, tb, &parms);
 }
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -107,6 +107,8 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
 
 #include <net/sock.h>
 #include <net/ip.h>
@@ -142,6 +144,9 @@ static int ipip_err(struct sk_buff *skb, u32 info)
 	const int code = icmp_hdr(skb)->code;
 
 	err = -ENOENT;
+	if (itn == NULL)
+		goto out;
+
 	t = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			     iph->daddr, iph->saddr, 0);
 	if (t == NULL)
@@ -191,6 +196,10 @@ static int ipip_rcv(struct sk_buff *skb)
 	const struct iphdr *iph;
 
 	iph = ip_hdr(skb);
+
+	if (itn == NULL)
+		return -1;
+
 	tunnel = ip_tunnel_lookup(itn, skb->dev->ifindex, TUNNEL_NO_KEY,
 			iph->saddr, iph->daddr, 0);
 	if (tunnel) {
@@ -292,6 +301,7 @@ static void ipip_tunnel_setup(struct net_device *dev)
 	netif_keep_dst(dev);
 
 	dev->features		|= IPIP_FEATURES;
+	dev->features		|= NETIF_F_VIRTUAL;
 	dev->hw_features	|= IPIP_FEATURES;
 	ip_tunnel_setup(dev, ipip_net_id);
 }
@@ -347,6 +357,9 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev,
 {
 	struct ip_tunnel_parm p;
 
+	if (net_generic(dev_net(dev), ipip_net_id) == NULL)
+		return -EACCES;
+
 	ipip_netlink_parms(data, &p);
 	return ip_tunnel_newlink(dev, tb, &p);
 }
@@ -433,13 +446,21 @@ static struct xfrm_tunnel ipip_handler __read_mostly = {
 
 static int __net_init ipip_init_net(struct net *net)
 {
+	if (!(net->owner_ve->features & VE_FEATURE_IPIP))
+		return net_assign_generic(net, ipip_net_id, NULL);
+
 	return ip_tunnel_init_net(net, ipip_net_id, &ipip_link_ops, "tunl0");
 }
 
 static void __net_exit ipip_exit_net(struct net *net)
 {
 	struct ip_tunnel_net *itn = net_generic(net, ipip_net_id);
+
+	if (itn == NULL) /* no VE_FEATURE_IPIP */
+		return;
+
 	ip_tunnel_delete_net(itn, &ipip_link_ops);
+	net_assign_generic(net, ipip_net_id, NULL);
 }
 
 static struct pernet_operations ipip_net_ops = {
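
Both vti above and ipip here follow one per-VE gating scheme: init
either creates the tunnel net or parks a NULL in the per-net generic
slot, and every later entry point bails out on NULL.  A minimal sketch,
with a hypothetical tunnel_permitted() standing in for the
ve_is_super()/VE_FEATURE_IPIP tests used in the hunks:

	static int __net_init tunl_init_net(struct net *net)
	{
		if (!tunnel_permitted(net))	/* hypothetical VE check */
			return net_assign_generic(net, tunl_net_id, NULL);

		return ip_tunnel_init_net(net, tunl_net_id,
					  &tunl_link_ops, "tunl0");
	}

	static int tunl_rcv(struct sk_buff *skb)
	{
		struct ip_tunnel_net *itn;

		itn = net_generic(dev_net(skb->dev), tunl_net_id);
		if (itn == NULL)	/* disabled in this VE */
			return -1;
		/* the normal receive path continues here */
		return 0;
	}
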
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -265,7 +265,7 @@ static int __net_init ipmr_rules_init(struct net *net)
 	return 0;
 
 err2:
-	kfree(mrt);
+	ipmr_free_table(mrt);
 err1:
 	fib_rules_unregister(ops);
 	return err;
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -428,5 +428,11 @@ config IP_NF_ARP_MANGLE
 
 endif # IP_NF_ARPTABLES
 
+config VE_IP_NF_VZPRIVNET
+	tristate "VE private networking filtering"
+	default m
+	depends on IP_NF_IPTABLES && m
+	help
+	  This option provides the ip_vzprivnet module, which filters
+	  traffic between VE private subnets.
 endmenu
 
--- a/net/ipv4/netfilter/Makefile
+++ b/net/ipv4/netfilter/Makefile
@@ -45,6 +45,7 @@ obj-$(CONFIG_NF_TABLES_ARP) += nf_tables_arp.o
 
 # generic IP tables 
 obj-$(CONFIG_IP_NF_IPTABLES) += ip_tables.o
+obj-$(CONFIG_VE_IP_NF_VZPRIVNET) += ip_vzprivnet.o
 
 # the three instances of ip_tables
 obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -25,6 +25,7 @@
 #include <net/compat.h>
 #include <net/sock.h>
 #include <asm/uaccess.h>
+#include <linux/fence-watchdog.h>
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp/arp_tables.h>
@@ -112,6 +113,14 @@ static inline int arp_packet_match(const struct arphdr *arphdr,
 
 #define FWINV(bool, invflg) ((bool) ^ !!(arpinfo->invflags & (invflg)))
 
+#ifdef CONFIG_FENCE_WATCHDOG
+	if (FWINV((arpinfo->flags & ARPT_WDOGTMO) && !fence_wdog_tmo_match(),
+		  ARPT_INV_WDOGTMO)) {
+		dprintf("Watchdog timeout mismatch.\n");
+		return 0;
+	}
+#endif
+
 	if (FWINV((arphdr->ar_op & arpinfo->arpop_mask) != arpinfo->arpop,
 		  ARPT_INV_ARPOP)) {
 		dprintf("ARP operation field mismatch.\n");
@@ -591,6 +600,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
 	if (err)
 		return err;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	/* Check hooks & underflows */
 	for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
 		if (!(valid_hooks & (1 << h)))
@@ -1235,6 +1248,10 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
 	if (ret)
 		return ret;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	off = sizeof(struct arpt_entry) - sizeof(struct compat_arpt_entry);
 	entry_offset = (void *)e - (void *)base;
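
The target_offset bound added in both hunks matches the upstream
x_tables validation hardening: matches and the target live in elems[],
so an offset smaller than the start of elems[] would let a crafted
ruleset blob alias the fixed entry header.  Schematically:

	/*
	 * [ fixed arpt_entry header | elems[]: matches ... target ]
	 *                           ^
	 *        target_offset must be >= offsetof(entry, elems),
	 *        i.e. >= (void *)e->elems - (void *)e
	 */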
 
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -301,6 +301,9 @@ ipt_do_table(struct sk_buff *skb,
 	struct xt_action_param acpar;
 	unsigned int addend;
 
+	if (ve_xt_table_forbidden(table))
+		return NF_ACCEPT;
+
 	/* Initialization */
 	ip = ip_hdr(skb);
 	indev = state->in ? state->in->name : nulldevname;
@@ -465,8 +468,9 @@ mark_source_chains(const struct xt_table_info *newinfo,
 			int visited = e->comefrom & (1 << hook);
 
 			if (e->comefrom & (1 << NF_INET_NUMHOOKS)) {
-				pr_err("iptables: loop hook %u pos %u %08X.\n",
-				       hook, pos, e->comefrom);
+				ve_printk(VE_LOG, "iptables: loop hook %u pos "
+						  "%u %08X.\n",
+					  hook, pos, e->comefrom);
 				return 0;
 			}
 			e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
@@ -744,6 +748,10 @@ check_entry_size_and_hooks(struct ipt_entry *e,
 	if (err)
 		return err;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	/* Check hooks & underflows */
 	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
 		if (!(valid_hooks & (1 << h)))
@@ -925,7 +933,7 @@ static struct xt_counters *alloc_counters(const struct xt_table *table)
 	   (other than comefrom, which userspace doesn't care
 	   about). */
 	countersize = sizeof(struct xt_counters) * private->number;
-	counters = vzalloc(countersize);
+	counters = vzalloc_account(countersize);
 
 	if (counters == NULL)
 		return ERR_PTR(-ENOMEM);
@@ -1193,7 +1201,7 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks,
 	struct ipt_entry *iter;
 
 	ret = 0;
-	counters = vzalloc(num_counters * sizeof(struct xt_counters));
+	counters = vzalloc_account(num_counters * sizeof(struct xt_counters));
 	if (!counters) {
 		ret = -ENOMEM;
 		goto out;
@@ -1472,6 +1480,10 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
 	if (ret)
 		return ret;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	off = sizeof(struct ipt_entry) - sizeof(struct compat_ipt_entry);
 	entry_offset = (void *)e - (void *)base;
 	j = 0;
@@ -1701,12 +1713,16 @@ compat_do_replace(struct net *net, void __user *user, unsigned int len)
 }
 
 static int
+do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len);
+
+static int
 compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 		      unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1719,8 +1735,7 @@ compat_do_ipt_set_ctl(struct sock *sk,	int cmd, void __user *user,
 		break;
 
 	default:
-		duprintf("do_ipt_set_ctl:  unknown request %i\n", cmd);
-		ret = -EINVAL;
+		ret = do_ipt_set_ctl(sk, cmd, user, len);
 	}
 
 	return ret;
@@ -1814,9 +1829,10 @@ static int do_ipt_get_ctl(struct sock *, int, void __user *, int *);
 static int
 compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1836,9 +1852,10 @@ compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 static int
 do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1861,9 +1878,10 @@ do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 static int
 do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2061,12 +2079,24 @@ static struct xt_match ipt_builtin_mt[] __read_mostly = {
 
 static int __net_init ip_tables_net_init(struct net *net)
 {
-	return xt_proto_init(net, NFPROTO_IPV4);
+	int res;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES))
+		return 0;
+	res = xt_proto_init(net, NFPROTO_IPV4);
+	if (!res)
+		net_ipt_module_set(net, VE_IP_IPTABLES);
+	return res;
 }
 
 static void __net_exit ip_tables_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLES))
+		return;
+
 	xt_proto_fini(net, NFPROTO_IPV4);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLES);
 }
 
 static struct pernet_operations ip_tables_net_ops = {
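
iptable_filter, iptable_mangle and iptable_nat below apply the same gate
as this hunk: skip registration in a netns whose VE lacks the feature
bit, record a successful registration, and tear down only what was
actually set up.  A minimal sketch with a hypothetical VE_IP_FOO bit and
foo_register()/foo_unregister() helpers (the net_ipt_* calls are the
ones these hunks rely on):

	static int __net_init foo_net_init(struct net *net)
	{
		int err;

		if (!net_ipt_permitted(net, VE_IP_FOO))
			return 0;	/* feature masked for this VE */

		err = foo_register(net);
		if (!err)
			net_ipt_module_set(net, VE_IP_FOO);
		return err;
	}

	static void __net_exit foo_net_exit(struct net *net)
	{
		if (!net_is_ipt_module_set(net, VE_IP_FOO))
			return;		/* init was skipped or failed */

		foo_unregister(net);
		net_ipt_module_clear(net, VE_IP_FOO);
	}
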
--- /dev/null
+++ b/net/ipv4/netfilter/ip_vzprivnet.c
@@ -0,0 +1,1151 @@
+/*
+ *  net/ipv4/netfilter/ip_vzprivnet.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * This is an implementation of private network filtering.
+ * How it works:
+ *   _______      _______       _______
+ *  |  VE1  |    |  VE2  |     | VE-N  |
+ *  |_______|    |_______|     |_______|
+ *      | venet      | venet       | venet
+ *      |            |             |
+ *      |_______ip_forward__ ... __| VE0
+ *          vzprivnet_fwd_hook
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <net/ip.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv4.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock.h>
+#include <linux/proc_fs.h>
+#include <linux/log2.h>
+#include <linux/ctype.h>
+#include <linux/sysctl.h>
+#include <linux/inet.h>
+#include <asm/page.h>
+
+#include <linux/vzprivnet.h>
+#define VZPRIV_PROCNAME "ip_vzprivnet"
+
+static DEFINE_PER_CPU(unsigned long, lookup_stat[2]);
+
+struct vzprivnet {
+	u32 nmask;
+	int weak;
+};
+
+struct vzprivnet_sparse {
+	struct vzprivnet pn;
+
+	unsigned int netid;
+	struct list_head list;
+	struct list_head entries;
+};
+
+struct vzprivnet_range {
+	struct vzprivnet *pn;
+
+	/* In big-endian */
+	u32 netip;
+	u32 rmask;
+	struct rb_node node;
+};
+
+struct vzprivnet_entry {
+	struct vzprivnet_range range;
+	struct list_head list;
+};
+
+static DEFINE_RWLOCK(vzprivlock);
+static LIST_HEAD(vzpriv_sparse);
+static struct rb_root entries_root = RB_ROOT;
+
+/*
+ * Tree helpers
+ */
+
+static struct rb_root rbroot = RB_ROOT;
+/* ip: big-endian IP address */
+static struct vzprivnet_range *tree_search(struct rb_root *root, u32 ip)
+{
+	struct rb_node *node = root->rb_node;
+
+	ip = ntohl(ip);
+	while (node) {
+		struct vzprivnet_range *p = rb_entry(node, struct vzprivnet_range, node);
+		u32 start, end;
+
+		start = ntohl(p->netip);
+		end = start | ~ntohl(p->rmask);
+
+		if (ip <= end) {
+			if (start <= ip)
+				return p;
+
+			node = node->rb_left;
+		} else
+			node = node->rb_right;
+	}
+	return NULL;
+}
+
+static struct vzprivnet_range *legacy_search(u32 ip)
+{
+	return tree_search(&rbroot, ip);
+}
+
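+/*
+ * Insert a range into an interval rb-tree.  Ranges must stay disjoint:
+ * any overlap with an existing node is rejected with -EEXIST.
+ */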
+static int tree_insert(struct rb_root *root, struct vzprivnet_range *data)
+{
+	struct rb_node **link = &(root->rb_node), *parent = NULL;
+	u32 ip;
+	u32 end_ip;
+
+	ip = ntohl(data->netip);
+	end_ip = ip | ~ntohl(data->rmask);
+
+	while (*link) {
+		struct vzprivnet_range *p = rb_entry(*link, struct vzprivnet_range, node);
+		u32 start, end;
+
+		start = ntohl(p->netip);
+		end = start | ~ntohl(p->rmask);
+
+		if (!(ip > end || start > end_ip))
+			return -EEXIST;
+
+		parent = *link;
+		if (ip < end)
+			link = &((*link)->rb_left);
+		else
+			link = &((*link)->rb_right);
+	}
+
+	/* Add link node and rebalance tree. */
+	rb_link_node(&data->node, parent, link);
+	rb_insert_color(&data->node, root);
+
+	return 0;
+}
+
+static int legacy_insert(struct vzprivnet_range *data)
+{
+	return tree_insert(&rbroot, data);
+}
+
+static void legacy_delete(struct vzprivnet_range *p)
+{
+	rb_erase(&p->node, &rbroot);
+}
+
+static struct vzprivnet_range *legacy_first(void)
+{
+	struct rb_node *node;
+
+	node = rb_first(&rbroot);
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct vzprivnet_range, node);
+}
+
+static struct vzprivnet_range *legacy_next(struct vzprivnet_range *p)
+{
+	struct rb_node *node;
+
+	node = rb_next(&p->node);
+	if (!node)
+		return NULL;
+
+	return rb_entry(node, struct vzprivnet_range, node);
+}
+
+/*
+ * Generic code
+ */
+
+static struct vzprivnet vzpriv_internet = {
+	.nmask = 0,
+	.weak = VZPRIVNET_INET
+};
+
+static struct vzprivnet *vzpriv_search(u32 ip)
+{
+	struct vzprivnet_range *pnr;
+
+	pnr = tree_search(&entries_root, ip);
+	if (pnr == NULL)
+		pnr = legacy_search(ip);
+
+	if (pnr != NULL)
+		return pnr->pn;
+	else
+		return &vzpriv_internet;
+}
+
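+/*
+ * Classify a packet by its two endpoints: within one private network,
+ * only traffic inside a single nmask subnet is accepted; between
+ * different networks (the internet pseudo-net included), traffic is
+ * accepted only when the two "weak" values sum to at least 3.
+ */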
+static noinline unsigned int vzprivnet_classify(struct sk_buff *skb, int type)
+{
+	int res;
+	u32 saddr, daddr;
+	struct vzprivnet *p1, *p2;
+
+	per_cpu(lookup_stat[type], smp_processor_id())++;
+
+	saddr = ip_hdr(skb)->saddr;
+	daddr = ip_hdr(skb)->daddr;
+
+	read_lock(&vzprivlock);
+	p1 = vzpriv_search(saddr);
+	p2 = vzpriv_search(daddr);
+
+	if (p1 == p2) {
+		if ((saddr & p1->nmask) == (daddr & p1->nmask))
+			res = NF_ACCEPT;
+		else
+			res = NF_DROP;
+	} else {
+		if (p1->weak + p2->weak >= 3)
+			res = NF_ACCEPT;
+		else
+			res = NF_DROP;
+	}
+
+	read_unlock(&vzprivlock);
+	return res;
+}
+
+int vzpn_handle_bridged = 0;
+EXPORT_SYMBOL(vzpn_handle_bridged);
+
+int vzpn_filter_host = 0;
+EXPORT_SYMBOL(vzpn_filter_host);
+
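+/*
+ * @can_be_bridge: on hooks that may see bridged traffic, an skb whose
+ * dst ->output is not ip_output() is treated as bridged and is
+ * classified only when vzpn_handle_bridged is set.
+ */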
+static unsigned int vzprivnet_hook(struct sk_buff *skb, int can_be_bridge)
+{
+	struct dst_entry *dst;
+	struct net *src_net;
+
+	if (WARN_ON_ONCE(!skb->dev && !skb->sk))
+		return NF_ACCEPT;
+
+	src_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	if (!ve_is_super(src_net->owner_ve))
+		return NF_ACCEPT;
+
+	dst = skb_dst(skb);
+	if (dst != NULL && can_be_bridge && dst->output != ip_output) { /* bridge */
+		if (vzpn_handle_bridged)
+			return vzprivnet_classify(skb, 1);
+		else
+			return NF_ACCEPT;
+	}
+
+	return vzprivnet_classify(skb, 0);
+}
+
+static unsigned int vzprivnet_fwd_hook(const struct nf_hook_ops *ops,
+		struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, const struct nf_hook_state *state)
+{
+	return vzprivnet_hook(skb, 1);
+}
+
+static unsigned int vzprivnet_host_hook(struct sk_buff *skb,
+		const struct net_device *dev, int can_be_bridge)
+{
+	if (!vzpn_filter_host)
+		return NF_ACCEPT;
+
+	/*
+	 * Only packets coming from venet or going to one matter
+	 */
+	if (!(dev->features & NETIF_F_VENET))
+		return NF_ACCEPT;
+
+	return vzprivnet_hook(skb, can_be_bridge);
+}
+
+static unsigned int vzprivnet_in_hook(const struct nf_hook_ops *ops,
+		struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, const struct nf_hook_state *state)
+{
+	return vzprivnet_host_hook(skb, in, 0); /* bridge doesn't call it */
+}
+
+static unsigned int vzprivnet_out_hook(const struct nf_hook_ops *ops,
+		struct sk_buff *skb, const struct net_device *in,
+		const struct net_device *out, const struct nf_hook_state *state)
+{
+	return vzprivnet_host_hook(skb, out, 1);
+}
+
+static struct nf_hook_ops vzprivnet_ops[] = {
+	{
+		.hook = vzprivnet_fwd_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET,
+		.hooknum = NF_INET_FORWARD,
+		.priority = NF_IP_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet_in_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET,
+		.hooknum = NF_INET_LOCAL_IN,
+		.priority = NF_IP_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet_out_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET,
+		.hooknum = NF_INET_LOCAL_OUT,
+		.priority = NF_IP_PRI_FIRST
+	},
+};
+
+static inline u32 to_netmask(int prefix)
+{
+	return htonl((~0 << (32 - prefix)));
+}
+
+static inline unsigned int to_prefix(u32 netmask)
+{
+	netmask = ntohl(netmask);
+	return 32 - ilog2(~netmask + 1);
+}
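+
+/*
+ * Example: to_netmask(24) == htonl(0xffffff00), i.e. 255.255.255.0,
+ * and to_prefix() inverts it: to_prefix(htonl(0xffffff00)) == 24.
+ * The parsers below reject prefix 0, so the undefined 32-bit shift
+ * in to_netmask() is never reached.
+ */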
+
+static char *nextline(char *s)
+{
+	while (*s && *s != '\n')
+		s++;
+	while (*s && *s == '\n')
+		s++;
+	return s;
+}
+
+static int vzprivnet_add(u32 net, u32 m1, u32 m2, int weak)
+{
+	struct vzprivnet_range *p;
+	struct vzprivnet *pn;
+	int err;
+
+	p = kmalloc(sizeof(struct vzprivnet_range), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	pn = kmalloc(sizeof(struct vzprivnet), GFP_KERNEL);
+	if (!pn) {
+		kfree(p);
+		return -ENOMEM;
+	}
+
+	p->pn = pn;
+	p->netip = net;
+	p->rmask = m1;
+	pn->nmask = m2;
+	pn->weak = weak;
+
+	write_lock_bh(&vzprivlock);
+	err = legacy_insert(p);
+	write_unlock_bh(&vzprivlock);
+	if (err) {
+		kfree(pn);
+		kfree(p);
+	}
+
+	return err;
+}
+
+static int vzprivnet_del(u32 net)
+{
+	struct vzprivnet_range *p;
+
+	write_lock_bh(&vzprivlock);
+	p = legacy_search(net);
+	if (p == NULL) {
+		write_unlock_bh(&vzprivlock);
+		return -ENOENT;
+	}
+
+	legacy_delete(p);
+	write_unlock_bh(&vzprivlock);
+	kfree(p->pn);
+	kfree(p);
+	return 0;
+}
+
+static void sparse_free_one(struct vzprivnet_sparse *pns);
+static void vzprivnet_cleanup(void)
+{
+	struct vzprivnet_range *p;
+	struct vzprivnet_sparse *pns;
+
+	write_lock_bh(&vzprivlock);
+	while (1) {
+		p = legacy_first();
+		if (!p)
+			break;
+		legacy_delete(p);
+		kfree(p->pn);
+		kfree(p);
+	}
+
+	while (!list_empty(&vzpriv_sparse)) {
+		pns = list_first_entry(&vzpriv_sparse,
+				struct vzprivnet_sparse, list);
+		sparse_free_one(pns);
+	}
+	write_unlock_bh(&vzprivlock);
+}
+
+/*     +a.b.c.d/M1/M2
+ * or
+ *     -a.b.c.d/M1/M2
+ *
+ * add: 0 - delete, 1 - add
+ * if delete, netmasks don't matter
+ */
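+/*
+ * Example: "+192.168.0.0/16/24" declares 192.168.0.0/16 private,
+ * carved into /24 subnets; "+10.0.0.0/8/32*" additionally marks the
+ * range weak; "-192.168.0.0" removes the range covering that address.
+ */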
+static int parse_param(const char *param, int *add, u32 *net,
+			u32 *netmask1, u32 *netmask2, int *weak)
+{
+	int err;
+	unsigned char ch, e;
+	unsigned int a,b,c,d;
+	unsigned int m1, m2;
+
+	if (!*param)
+		return -EINVAL;
+
+	ch = *param;
+	if (ch != '+' && ch != '-')
+		return -EINVAL;
+
+	param++;
+	err = sscanf(param, "%u.%u.%u.%u/%u/%u%c\n",
+				&a, &b, &c, &d, &m1, &m2, &e);
+	if (err < 4 || (a == 0 || a > 255 || b > 255 || c > 255 || d > 255))
+		return -EINVAL;
+
+	*weak = VZPRIVNET_STRONG;
+	if (err == 7) {
+		if (e == '*')
+			*weak = VZPRIVNET_WEAK;
+		else if (e != '\n' && !isspace(e))
+			return -EINVAL;
+	}
+
+	*net = htonl((a << 24) + (b << 16) + (c << 8) + d);
+	if (ch == '+') {
+		if (err < 6 || m1 == 0 || m1 > 32 || m2 == 0 || m2 > 32)
+			return -EINVAL;
+
+		*netmask1 = to_netmask(m1);
+		*netmask2 = to_netmask(m2);
+		*net &= *netmask1;
+	} else
+		*netmask1 = *netmask2 = 0;
+
+	*add = (ch == '+') ? 1 : 0;
+	return 0;
+}
+
+static ssize_t vzpriv_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	err = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto err;
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		u32 net, m1, m2;
+		int add, weak;
+
+		err = parse_param(s, &add, &net, &m1, &m2, &weak);
+		if (err)
+			goto out;
+
+		if (add)
+			err = vzprivnet_add(net, m1, m2, weak);
+		else
+			err = vzprivnet_del(net);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+}
+
+static void *vzprivnet_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	unsigned int n = *pos;
+
+	read_lock_bh(&vzprivlock);
+	if (n > 0) {
+		struct vzprivnet_range *p;
+
+		p = legacy_first();
+		while (n-- && p)
+			p = legacy_next(p);
+
+		return p;
+	}
+
+	return legacy_first();
+}
+
+static void *vzprivnet_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	(*pos)++;
+
+	return legacy_next(v);
+}
+
+static void vzprivnet_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&vzprivlock);
+}
+
+static int vzprivnet_seq_show(struct seq_file *s, void *v)
+{
+	struct vzprivnet_range *p = v;
+
+	seq_printf(s, "%pI4/%u/%u", &p->netip,
+		   to_prefix(p->rmask), to_prefix(p->pn->nmask));
+	if (p->pn->weak == VZPRIVNET_WEAK)
+		seq_printf(s, "*\n");
+	else
+		seq_printf(s, "\n");
+	return 0;
+}
+
+static struct seq_operations vzprivnet_seq_ops = {
+	.start = vzprivnet_seq_start,
+	.next  = vzprivnet_seq_next,
+	.stop  = vzprivnet_seq_stop,
+	.show  = vzprivnet_seq_show,
+};
+
+static int vzprivnet_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &vzprivnet_seq_ops);
+}
+
+static struct file_operations proc_vzprivnet_ops = {
+	.owner   = THIS_MODULE,
+	.open    = vzprivnet_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = vzpriv_write,
+};
+
+static int sparse_add(unsigned int netid, u32 ip, u32 mask, int weak)
+{
+	int err;
+	struct vzprivnet_sparse *pns, *epns = NULL;
+	struct vzprivnet_entry *pne = NULL;
+
+	err = -ENOMEM;
+
+	pns = kmalloc(sizeof(struct vzprivnet_sparse), GFP_KERNEL);
+	if (pns == NULL)
+		goto out;
+
+	pne = kmalloc(sizeof(struct vzprivnet_entry), GFP_KERNEL);
+	if (pne == NULL)
+		goto out;
+
+	write_lock_bh(&vzprivlock);
+	list_for_each_entry(epns, &vzpriv_sparse, list)
+		if (epns->netid == netid) {
+			kfree(pns);
+			pns = epns;
+			goto found_net;
+		}
+
+	pns->netid = netid;
+	pns->pn.nmask = 0;
+	pns->pn.weak = VZPRIVNET_STRONG;
+	INIT_LIST_HEAD(&pns->entries);
+
+found_net:
+	if (ip != 0) {
+		pne->range.netip = ip & mask;
+		pne->range.rmask = mask;
+		pne->range.pn = &pns->pn;
+		err = tree_insert(&entries_root, &pne->range);
+		if (err)
+			goto out_unlock;
+
+		list_add_tail(&pne->list, &pns->entries);
+		pne = NULL;
+	} else if (weak == VZPRIVNET_WEAK) {
+		pns->pn.weak = VZPRIVNET_WEAK;
+	} else if (pns == epns) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	if (pns != epns) {
+		list_add_tail(&pns->list, &vzpriv_sparse);
+		pns = NULL;
+	}
+
+	err = 0;
+
+out_unlock:
+	write_unlock_bh(&vzprivlock);
+out:
+	if (pns != epns)
+		kfree(pns);
+	kfree(pne);
+
+	return err;
+}
+
+static void sparse_free_entry(struct vzprivnet_entry *pne)
+{
+	list_del(&pne->list);
+	rb_erase(&pne->range.node, &entries_root);
+	kfree(pne);
+}
+
+static void sparse_free_one(struct vzprivnet_sparse *pns)
+{
+	struct vzprivnet_entry *pne;
+
+	list_del(&pns->list);
+
+	while (!list_empty(&pns->entries)) {
+		pne = list_first_entry(&pns->entries,
+				struct vzprivnet_entry, list);
+		sparse_free_entry(pne);
+	}
+
+	kfree(pns);
+}
+
+static int sparse_del_net(unsigned int netid, int weak)
+{
+	struct vzprivnet_sparse *pns;
+
+	list_for_each_entry(pns, &vzpriv_sparse, list)
+		if (pns->netid == netid) {
+			if (weak == VZPRIVNET_WEAK)
+				pns->pn.weak = VZPRIVNET_STRONG;
+			else
+				sparse_free_one(pns);
+			return 0;
+		}
+
+	return -ENOENT;
+}
+
+static int sparse_del_ip(u32 ip)
+{
+	struct vzprivnet_range *rng;
+	struct vzprivnet_entry *pne;
+
+	rng = tree_search(&entries_root, ip);
+	if (rng == NULL)
+		return -ENOENT;
+
+	pne = container_of(rng, struct vzprivnet_entry, range);
+	sparse_free_entry(pne);
+
+	return 0;
+}
+
+static int sparse_del(unsigned int netid, u32 ip, int weak)
+{
+	int err;
+
+	write_lock_bh(&vzprivlock);
+	if (ip != 0)
+		err = sparse_del_ip(ip);
+	else
+		err = sparse_del_net(netid, weak);
+	write_unlock_bh(&vzprivlock);
+
+	return err;
+}
+
+/*
+ * +ID			to add a network
+ * +ID:a.b.c.d		to add an IP to network
+ * +ID:a.b.c.d/m	to add a subnet to network
+ * +ID:*		to make a network weak
+ * -ID			to remove the whole network
+ * -a.b.c.d		to remove an IP or bounding subnet (from its network)
+ * -ID:*		to make a network "strong" ;)
+ *
+ *  No weak networks here!
+ */
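+/*
+ * Example (as written to /proc/vz/privnet/sparse):
+ *	+1		create (or reference) sparse network 1
+ *	+1:10.0.0.1	add a single IP to it
+ *	+1:10.1.0.0/16	add a whole subnet to it
+ *	+1:*		mark the network weak
+ *	-10.0.0.1	remove that IP again
+ */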
+
+static int parse_sparse_add(const char *str, unsigned int *netid, u32 *ip, u32 *mask, int *weak)
+{
+	unsigned int m;
+	char *end;
+
+	*netid = simple_strtol(str, &end, 10);
+	if (is_eol(*end)) {
+		*ip = 0;
+		return 0;
+	}
+
+	if (*end != ':')
+		return -EINVAL;
+
+	str = end + 1;
+	if (*str == '*') {
+		if (!is_eol(*(str + 1)))
+			return -EINVAL;
+
+		*weak = VZPRIVNET_WEAK;
+		return 0;
+	}
+
+	if (!in4_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+		return -EINVAL;
+
+	if (is_eol(*end)) {
+		*mask = -1; /* match only one IP */
+		return 0;
+	}
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	m = simple_strtol(str, &end, 10);
+	if (!is_eol(*end) || m == 0 || m > 32)
+		return -EINVAL;
+
+	*mask = to_netmask(m);
+	return 0;
+}
+
+static int parse_sparse_remove(const char *str, unsigned int *netid, u32 *ip, int *weak)
+{
+	char *end;
+
+	if (strchr(str, '.')) {
+		if (!in4_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+			return -EINVAL;
+	} else {
+		*netid = simple_strtol(str, &end, 10);
+		if (end[0] == ':' && end[1] == '*') {
+			end += 2;
+			*weak = VZPRIVNET_WEAK;
+		}
+	}
+
+	return (is_eol(*end) ? 0 : -EINVAL);
+}
+
+static int parse_sparse(const char *param, int *add,
+		unsigned int *netid, u32 *ip, u32 *mask, int *weak)
+{
+	if (param[0] == '+') {
+		*add = 1;
+		return parse_sparse_add(param + 1, netid, ip, mask, weak);
+	}
+
+	if (param[0] == '-') {
+		*add = 0;
+		return parse_sparse_remove(param + 1, netid, ip, weak);
+	}
+
+	return -EINVAL;
+}
+
+static ssize_t sparse_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	err = -EFAULT;
+	if (copy_from_user(page, buf, count))
+		goto err;
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		int add, weak = VZPRIVNET_STRONG;
+		unsigned int netid = 0;
+		u32 ip = 0, mask = 0;
+
+		err = parse_sparse(s, &add, &netid, &ip, &mask, &weak);
+		if (err)
+			goto out;
+
+		if (add)
+			err = sparse_add(netid, ip, mask, weak);
+		else
+			err = sparse_del(netid, ip, weak);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+}
+
+static void *sparse_seq_start(struct seq_file *seq, loff_t *pos)
+{
+	read_lock_bh(&vzprivlock);
+	return seq_list_start(&vzpriv_sparse, *pos);
+}
+
+static void *sparse_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &vzpriv_sparse, pos);
+}
+
+static void sparse_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock_bh(&vzprivlock);
+}
+
+static int sparse_seq_show(struct seq_file *s, void *v)
+{
+	struct list_head *lh = v;
+	struct vzprivnet_sparse *pns;
+	struct vzprivnet_entry *pne;
+
+	pns = list_entry(lh, struct vzprivnet_sparse, list);
+	seq_printf(s, "%u: ", pns->netid);
+
+	if (pns->pn.weak == VZPRIVNET_WEAK)
+		seq_puts(s, "* ");
+
+	list_for_each_entry(pne, &pns->entries, list) {
+		seq_printf(s, "%pI4", &pne->range.netip);
+		if (~pne->range.rmask != 0) /* subnet */
+			seq_printf(s, "/%u", to_prefix(pne->range.rmask));
+		seq_putc(s, ' ');
+	}
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static struct seq_operations sparse_seq_ops = {
+	.start = sparse_seq_start,
+	.next  = sparse_seq_next,
+	.stop  = sparse_seq_stop,
+	.show  = sparse_seq_show,
+};
+
+static int sparse_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sparse_seq_ops);
+}
+
+static struct file_operations proc_sparse_ops = {
+	.owner   = THIS_MODULE,
+	.open    = sparse_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = sparse_write,
+};
+
+static void (*show_more)(struct seq_file *s);
+static DEFINE_MUTEX(show_lock);
+
+static void vzprivnet_reg_swap(vzprivnet_show_fn old, vzprivnet_show_fn new)
+{
+	mutex_lock(&show_lock);
+	if (show_more == old)
+		show_more = new;
+	mutex_unlock(&show_lock);
+}
+
+static void vzprivnet_show_more(struct seq_file *f)
+{
+	mutex_lock(&show_lock);
+	if (show_more != NULL)
+		show_more(f);
+	mutex_unlock(&show_lock);
+}
+
+void vzprivnet_reg_show(vzprivnet_show_fn fn)
+{
+	vzprivnet_reg_swap(NULL, fn);
+}
+EXPORT_SYMBOL(vzprivnet_reg_show);
+
+void vzprivnet_unreg_show(vzprivnet_show_fn fn)
+{
+	vzprivnet_reg_swap(fn, NULL);
+}
+EXPORT_SYMBOL(vzprivnet_unreg_show);
+
+static int stat_seq_show(struct seq_file *s, void *v)
+{
+	unsigned long sum[2];
+	int cpu;
+
+	sum[0] = sum[1] = 0;
+	for_each_possible_cpu(cpu) {
+		sum[0] += per_cpu(lookup_stat[0], cpu);
+		sum[1] += per_cpu(lookup_stat[1], cpu);
+	}
+
+	seq_printf(s, "Lookups: %lu\n", sum[0]);
+	seq_printf(s, "Br-lookups: %lu\n", sum[1]);
+	vzprivnet_show_more(s);
+
+	return 0;
+}
+
+static int stat_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, &stat_seq_show, NULL);
+}
+
+static struct file_operations proc_stat_ops = {
+	.owner   = THIS_MODULE,
+	.open    = stat_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+};
+
+static char sample_ip[16];
+
+static ssize_t classify_write(struct file * file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int len;
+	char *tmp;
+
+	len = count;
+	if (len >= sizeof(sample_ip))
+		len = sizeof(sample_ip) - 1;
+
+	if (copy_from_user(sample_ip, buf, len))
+		return -EFAULT;
+
+	sample_ip[len] = '\0';
+	tmp = strchr(sample_ip, '\n');
+	if (tmp)
+		*tmp = '\0';
+
+	return count;
+}
+
+static int classify_seq_show(struct seq_file *s, void *v)
+{
+	u32 ip;
+	struct vzprivnet_range *pnr;
+
+	seq_printf(s, "%s: ", sample_ip);
+
+	if (!in4_pton(sample_ip, sizeof(sample_ip), (u8 *)&ip, -1, NULL)) {
+		seq_puts(s, "invalid IP\n");
+		return 0;
+	}
+
+	read_lock(&vzprivlock);
+	pnr = tree_search(&entries_root, ip);
+	if (pnr != NULL) {
+		struct vzprivnet_sparse *pns;
+
+		pns = container_of(pnr->pn, struct vzprivnet_sparse, pn);
+		seq_printf(s, "net %u, ", pns->netid);
+		seq_printf(s, "rule %pI4", &pnr->netip);
+		if (~pnr->rmask != 0)
+			seq_printf(s, "/%u", to_prefix(pnr->rmask));
+		seq_putc(s, '\n');
+
+		goto out;
+	}
+
+	pnr = legacy_search(ip);
+	if (pnr != NULL) {
+		seq_printf(s, "legacy %pI4/%u/%u\n",
+				&pnr->netip,
+				to_prefix(pnr->rmask),
+				to_prefix(pnr->pn->nmask));
+
+		goto out;
+	}
+
+	seq_printf(s, "internet\n");
+out:
+	read_unlock(&vzprivlock);
+	return 0;
+}
+
+static int classify_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, &classify_seq_show, NULL);
+}
+
+static struct file_operations proc_classify_ops = {
+	.owner   = THIS_MODULE,
+	.open    = classify_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write	 = classify_write,
+};
+
+struct proc_dir_entry *vzpriv_proc_dir;
+EXPORT_SYMBOL(vzpriv_proc_dir);
+
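+/*
+ * Entry 0 is the "net" directory; its ->child skips the terminator
+ * at index 1 and points at the two sysctl knobs that follow it.
+ */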
+static struct ctl_table vzprivnet_table[] = {
+	{
+		.procname = "net",
+		.child = vzprivnet_table + 2,
+	},
+	{ },
+	{
+		.procname = "vzpriv_handle_bridge",
+		.data = &vzpn_handle_bridged,
+		.maxlen = sizeof(vzpn_handle_bridged),
+		.mode = 0600,
+		.proc_handler = proc_dointvec,
+	},
+	{
+		.procname = "vzpriv_filter_host",
+		.data = &vzpn_filter_host,
+		.maxlen = sizeof(vzpn_filter_host),
+		.mode = 0600,
+		.proc_handler = proc_dointvec,
+	},
+	{ },
+};
+
+static struct ctl_table_header *ctl;
+
+static int __init iptable_vzprivnet_init(void)
+{
+	int err = -ENOMEM;
+	struct proc_dir_entry *proc;
+
+	vzpriv_proc_dir = proc_mkdir("privnet", proc_vz_dir);
+	if (vzpriv_proc_dir == NULL)
+		goto err_mkdir;
+
+	proc = proc_create("legacy", 0644,
+			vzpriv_proc_dir, &proc_vzprivnet_ops);
+	if (proc == NULL)
+		goto err_legacy;
+
+	proc = proc_create("sparse", 0644,
+			vzpriv_proc_dir, &proc_sparse_ops);
+	if (proc == NULL)
+		goto err_net;
+
+	proc = proc_create("stat", 0644,
+			vzpriv_proc_dir, &proc_stat_ops);
+	if (proc == NULL)
+		goto err_stat;
+
+	proc = proc_create("classify", 0644,
+			vzpriv_proc_dir, &proc_classify_ops);
+	if (proc == NULL)
+		goto err_classify;
+
+	proc = proc_symlink(VZPRIV_PROCNAME, init_net.proc_net, "/proc/vz/privnet/legacy");
+	if (proc == NULL)
+		goto err_link;
+
+	err = -ENOMEM;
+	ctl = register_sysctl_table(vzprivnet_table);
+	if (ctl == NULL)
+		goto err_ctl;
+
+	err = nf_register_hooks(vzprivnet_ops, 3);
+	if (err)
+		goto err_reg;
+
+	return 0;
+
+err_reg:
+	unregister_sysctl_table(ctl);
+err_ctl:
+	remove_proc_entry(VZPRIV_PROCNAME, init_net.proc_net);
+err_link:
+	remove_proc_entry("classify", vzpriv_proc_dir);
+err_classify:
+	remove_proc_entry("stat", vzpriv_proc_dir);
+err_stat:
+	remove_proc_entry("sparse", vzpriv_proc_dir);
+err_net:
+	remove_proc_entry("legacy", vzpriv_proc_dir);
+err_legacy:
+	remove_proc_entry("privnet", proc_vz_dir);
+err_mkdir:
+	return err;
+}
+
+static void __exit iptable_vzprivnet_exit(void)
+{
+	nf_unregister_hooks(vzprivnet_ops, 3);
+	unregister_sysctl_table(ctl);
+	remove_proc_entry(VZPRIV_PROCNAME, init_net.proc_net);
+	remove_proc_entry("classify", vzpriv_proc_dir);
+	remove_proc_entry("stat", vzpriv_proc_dir);
+	remove_proc_entry("sparse", vzpriv_proc_dir);
+	remove_proc_entry("legacy", vzpriv_proc_dir);
+	remove_proc_entry("privnet", proc_vz_dir);
+	vzprivnet_cleanup();
+}
+
+module_init(iptable_vzprivnet_init)
+module_exit(iptable_vzprivnet_exit)
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- a/net/ipv4/netfilter/ipt_CLUSTERIP.c
+++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c
@@ -28,6 +28,7 @@
 #include <linux/netfilter_ipv4/ipt_CLUSTERIP.h>
 #include <net/netfilter/nf_conntrack.h>
 #include <net/net_namespace.h>
+#include <net/netns/generic.h>
 #include <net/checksum.h>
 #include <net/ip.h>
 
@@ -57,15 +58,21 @@ struct clusterip_config {
 	struct rcu_head rcu;
 };
 
-static LIST_HEAD(clusterip_configs);
+#ifdef CONFIG_PROC_FS
+static const struct file_operations clusterip_proc_fops;
+#endif
 
-/* clusterip_lock protects the clusterip_configs list */
-static DEFINE_SPINLOCK(clusterip_lock);
+static int clusterip_net_id __read_mostly;
+
+struct clusterip_net {
+	struct list_head configs;
+	/* lock protects the configs list */
+	spinlock_t lock;
 
 #ifdef CONFIG_PROC_FS
-static const struct file_operations clusterip_proc_fops;
-static struct proc_dir_entry *clusterip_procdir;
+	struct proc_dir_entry *procdir;
 #endif
+};
 
 static inline void
 clusterip_config_get(struct clusterip_config *c)
@@ -92,10 +99,13 @@ clusterip_config_put(struct clusterip_config *c)
 static inline void
 clusterip_config_entry_put(struct clusterip_config *c)
 {
+	struct net *net = dev_net(c->dev);
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
 	local_bh_disable();
-	if (atomic_dec_and_lock(&c->entries, &clusterip_lock)) {
+	if (atomic_dec_and_lock(&c->entries, &cn->lock)) {
 		list_del_rcu(&c->list);
-		spin_unlock(&clusterip_lock);
+		spin_unlock(&cn->lock);
 		local_bh_enable();
 
 		dev_mc_del(c->dev, c->clustermac);
@@ -113,11 +123,12 @@ clusterip_config_entry_put(struct clusterip_config *c)
 }
 
 static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
+__clusterip_config_find(struct net *net, __be32 clusterip)
 {
 	struct clusterip_config *c;
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
 
-	list_for_each_entry_rcu(c, &clusterip_configs, list) {
+	list_for_each_entry_rcu(c, &cn->configs, list) {
 		if (c->clusterip == clusterip)
 			return c;
 	}
@@ -126,12 +137,12 @@ __clusterip_config_find(__be32 clusterip)
 }
 
 static inline struct clusterip_config *
-clusterip_config_find_get(__be32 clusterip, int entry)
+clusterip_config_find_get(struct net *net, __be32 clusterip, int entry)
 {
 	struct clusterip_config *c;
 
 	rcu_read_lock_bh();
-	c = __clusterip_config_find(clusterip);
+	c = __clusterip_config_find(net, clusterip);
 	if (c) {
 		if (unlikely(!atomic_inc_not_zero(&c->refcount)))
 			c = NULL;
@@ -158,6 +169,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 			struct net_device *dev)
 {
 	struct clusterip_config *c;
+	struct clusterip_net *cn = net_generic(dev_net(dev), clusterip_net_id);
 
 	c = kzalloc(sizeof(*c), GFP_ATOMIC);
 	if (!c)
@@ -180,7 +192,7 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 		/* create proc dir entry */
 		sprintf(buffer, "%pI4", &ip);
 		c->pde = proc_create_data(buffer, S_IWUSR|S_IRUSR,
-					  clusterip_procdir,
+					  cn->procdir,
 					  &clusterip_proc_fops, c);
 		if (!c->pde) {
 			kfree(c);
@@ -189,9 +201,9 @@ clusterip_config_init(const struct ipt_clusterip_tgt_info *i, __be32 ip,
 	}
 #endif
 
-	spin_lock_bh(&clusterip_lock);
-	list_add_rcu(&c->list, &clusterip_configs);
-	spin_unlock_bh(&clusterip_lock);
+	spin_lock_bh(&cn->lock);
+	list_add_rcu(&c->list, &cn->configs);
+	spin_unlock_bh(&cn->lock);
 
 	return c;
 }
@@ -375,7 +387,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 
 	/* FIXME: further sanity checks */
 
-	config = clusterip_config_find_get(e->ip.dst.s_addr, 1);
+	config = clusterip_config_find_get(par->net, e->ip.dst.s_addr, 1);
 	if (!config) {
 		if (!(cipinfo->flags & CLUSTERIP_FLAG_NEW)) {
 			pr_info("no config found for %pI4, need 'new'\n",
@@ -389,7 +401,7 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 				return -EINVAL;
 			}
 
-			dev = dev_get_by_name(&init_net, e->ip.iniface);
+			dev = dev_get_by_name(par->net, e->ip.iniface);
 			if (!dev) {
 				pr_info("no such interface %s\n",
 					e->ip.iniface);
@@ -411,6 +423,8 @@ static int clusterip_tg_check(const struct xt_tgchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
@@ -497,6 +511,7 @@ arp_mangle(const struct nf_hook_ops *ops,
 	struct arphdr *arp = arp_hdr(skb);
 	struct arp_payload *payload;
 	struct clusterip_config *c;
+	struct net *net = dev_net(in ? in : out);
 
 	/* we don't care about non-ethernet and non-ipv4 ARP */
 	if (arp->ar_hrd != htons(ARPHRD_ETHER) ||
@@ -513,7 +528,7 @@ arp_mangle(const struct nf_hook_ops *ops,
 
 	/* if there is no clusterip configuration for the arp reply's
 	 * source ip, we don't want to mangle it */
-	c = clusterip_config_find_get(payload->src_ip, 0);
+	c = clusterip_config_find_get(net, payload->src_ip, 0);
 	if (!c)
 		return NF_ACCEPT;
 
@@ -703,48 +718,75 @@ static const struct file_operations clusterip_proc_fops = {
 
 #endif /* CONFIG_PROC_FS */
 
+static int clusterip_net_init(struct net *net)
+{
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+
+	INIT_LIST_HEAD(&cn->configs);
+
+	spin_lock_init(&cn->lock);
+
+#ifdef CONFIG_PROC_FS
+	cn->procdir = proc_mkdir("ipt_CLUSTERIP", net->proc_net);
+	if (!cn->procdir) {
+		pr_err("Unable to proc dir entry\n");
+		return -ENOMEM;
+	}
+#endif /* CONFIG_PROC_FS */
+
+	return 0;
+}
+
+static void clusterip_net_exit(struct net *net)
+{
+#ifdef CONFIG_PROC_FS
+	struct clusterip_net *cn = net_generic(net, clusterip_net_id);
+	proc_remove(cn->procdir);
+#endif
+}
+
+static struct pernet_operations clusterip_net_ops = {
+	.init = clusterip_net_init,
+	.exit = clusterip_net_exit,
+	.id   = &clusterip_net_id,
+	.size = sizeof(struct clusterip_net),
+};
+
 static int __init clusterip_tg_init(void)
 {
 	int ret;
 
-	ret = xt_register_target(&clusterip_tg_reg);
+	ret = register_pernet_subsys(&clusterip_net_ops);
 	if (ret < 0)
 		return ret;
 
+	ret = xt_register_target(&clusterip_tg_reg);
+	if (ret < 0)
+		goto cleanup_subsys;
+
 	ret = nf_register_hook(&cip_arp_ops);
 	if (ret < 0)
 		goto cleanup_target;
 
-#ifdef CONFIG_PROC_FS
-	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
-	if (!clusterip_procdir) {
-		pr_err("Unable to proc dir entry\n");
-		ret = -ENOMEM;
-		goto cleanup_hook;
-	}
-#endif /* CONFIG_PROC_FS */
-
 	pr_info("ClusterIP Version %s loaded successfully\n",
 		CLUSTERIP_VERSION);
+
 	return 0;
 
-#ifdef CONFIG_PROC_FS
-cleanup_hook:
-	nf_unregister_hook(&cip_arp_ops);
-#endif /* CONFIG_PROC_FS */
 cleanup_target:
 	xt_unregister_target(&clusterip_tg_reg);
+cleanup_subsys:
+	unregister_pernet_subsys(&clusterip_net_ops);
 	return ret;
 }
 
 static void __exit clusterip_tg_exit(void)
 {
 	pr_info("ClusterIP Version %s unloading\n", CLUSTERIP_VERSION);
-#ifdef CONFIG_PROC_FS
-	proc_remove(clusterip_procdir);
-#endif
+
 	nf_unregister_hook(&cip_arp_ops);
 	xt_unregister_target(&clusterip_tg_reg);
+	unregister_pernet_subsys(&clusterip_net_ops);
 
 	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
 	rcu_barrier_bh();
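
Note the init ordering above: register_pernet_subsys() now runs first,
so every netns owns its struct clusterip_net before the target or the
ARP-mangle hook can possibly fire, and module exit unregisters in exact
reverse order.  Any code holding a struct net then reaches its private
state with one call; a sketch, not a hunk from this patch:

	static struct clusterip_net *clusterip_pernet(struct net *net)
	{
		return net_generic(net, clusterip_net_id);
	}
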
--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -41,6 +41,7 @@ static int masquerade_tg_check(const struct xt_tgchk_param *par)
 		pr_debug("bad rangesize %u\n", mr->rangesize);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/ipv4/netfilter/ipt_REJECT.c
+++ b/net/ipv4/netfilter/ipt_REJECT.c
@@ -74,13 +74,13 @@ static int reject_tg_check(const struct xt_tgchk_param *par)
 	const struct ipt_entry *e = par->entryinfo;
 
 	if (rejinfo->with == IPT_ICMP_ECHOREPLY) {
-		pr_info("ECHOREPLY no longer supported.\n");
+		ve_printk(VE_LOG, "ECHOREPLY no longer supported.\n");
 		return -EINVAL;
 	} else if (rejinfo->with == IPT_TCP_RESET) {
 		/* Must specify that it's a TCP packet */
 		if (e->ip.proto != IPPROTO_TCP ||
 		    (e->ip.invflags & XT_INV_PROTO)) {
-			pr_info("TCP_RESET invalid for non-tcp\n");
+			ve_printk(VE_LOG, "TCP_RESET invalid for non-tcp\n");
 			return -EINVAL;
 		}
 	}
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -408,12 +408,16 @@ static unsigned int ipv4_synproxy_hook(const struct nf_hook_ops *ops,
 static int synproxy_tg4_check(const struct xt_tgchk_param *par)
 {
 	const struct ipt_entry *e = par->entryinfo;
+	int ret;
 
 	if (e->ip.proto != IPPROTO_TCP ||
 	    e->ip.invflags & XT_INV_PROTO)
 		return -EINVAL;
 
-	return nf_ct_l3proto_try_module_get(par->family);
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret == 0)
+		allow_conntrack_allocation(par->net);
+	return ret;
 }
 
 static void synproxy_tg4_destroy(const struct xt_tgdtor_param *par)
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -59,6 +59,9 @@ static int __net_init iptable_filter_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_FILTER))
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -69,12 +72,22 @@ static int __net_init iptable_filter_net_init(struct net *net)
 	net->ipv4.iptable_filter =
 		ipt_register_table(net, &packet_filter, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv4.iptable_filter))
+		net_ipt_module_set(net, VE_IP_FILTER);
+
 	return PTR_RET(net->ipv4.iptable_filter);
 }
 
 static void __net_exit iptable_filter_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_FILTER))
+		return;
+
 	ipt_unregister_table(net, net->ipv4.iptable_filter);
+	net->ipv4.iptable_filter = NULL;
+
+	net_ipt_module_clear(net, VE_IP_FILTER);
 }
 
 static struct pernet_operations iptable_filter_net_ops = {
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -102,18 +102,31 @@ static int __net_init iptable_mangle_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_MANGLE))
+		return 0;
+
 	repl = ipt_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
 	net->ipv4.iptable_mangle =
 		ipt_register_table(net, &packet_mangler, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv4.iptable_mangle))
+		net_ipt_module_set(net, VE_IP_MANGLE);
+
 	return PTR_RET(net->ipv4.iptable_mangle);
 }
 
 static void __net_exit iptable_mangle_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_MANGLE))
+		return;
+
 	ipt_unregister_table(net, net->ipv4.iptable_mangle);
+	net->ipv4.iptable_mangle = NULL;
+
+	net_ipt_module_clear(net, VE_IP_MANGLE);
 }
 
 static struct pernet_operations iptable_mangle_net_ops = {
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -113,17 +113,30 @@ static int __net_init iptable_nat_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_IPTABLE_NAT))
+		return 0;
+
 	repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
 	if (repl == NULL)
 		return -ENOMEM;
 	net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv4.nat_table))
+		net_ipt_module_set(net, VE_IP_IPTABLE_NAT);
+
 	return PTR_RET(net->ipv4.nat_table);
 }
 
 static void __net_exit iptable_nat_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLE_NAT))
+		return;
+
 	ipt_unregister_table(net, net->ipv4.nat_table);
+	net->ipv4.nat_table = NULL;
+
+	net_ipt_module_clear(net, VE_IP_IPTABLE_NAT);
 }
 
 static struct pernet_operations iptable_nat_net_ops = {
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -41,6 +41,12 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init iptable_raw_net_init(struct net *net)
 {
 	struct ipt_replace *repl;
+	int ret;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES))
+		return 0;
+
+	BUG_ON(net->ipv4.iptable_raw);
 
 	repl = ipt_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
@@ -48,12 +54,22 @@ static int __net_init iptable_raw_net_init(struct net *net)
 	net->ipv4.iptable_raw =
 		ipt_register_table(net, &packet_raw, repl);
 	kfree(repl);
-	return PTR_RET(net->ipv4.iptable_raw);
+
+	ret = PTR_RET(net->ipv4.iptable_raw);
+	if (ret)
+		net->ipv4.iptable_raw = NULL;
+
+	return ret;
 }
 
 static void __net_exit iptable_raw_net_exit(struct net *net)
 {
+	if (!net->ipv4.iptable_raw)
+		return;
+
 	ipt_unregister_table(net, net->ipv4.iptable_raw);
+
+	net->ipv4.iptable_raw = NULL;
 }
 
 static struct pernet_operations iptable_raw_net_ops = {
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -372,7 +372,7 @@ static int ipv4_init_net(struct net *net)
 	if (!in->ctl_table)
 		return -ENOMEM;
 
-	in->ctl_table[0].data = &nf_conntrack_max;
+	in->ctl_table[0].data = &net->ct.max;
 	in->ctl_table[1].data = &net->ct.count;
 	in->ctl_table[2].data = &net->ct.htable_size;
 	in->ctl_table[3].data = &net->ct.sysctl_checksum;
@@ -412,6 +412,9 @@ static int ipv4_net_init(struct net *net)
 {
 	int ret = 0;
 
+	if (!net_ipt_permitted(net, VE_IP_CONNTRACK))
+		return 0;
+
 	ret = nf_ct_l4proto_pernet_register(net, &nf_conntrack_l4proto_tcp4);
 	if (ret < 0) {
 		pr_err("nf_conntrack_tcp4: pernet registration failed\n");
@@ -432,6 +435,9 @@ static int ipv4_net_init(struct net *net)
 		pr_err("nf_conntrack_ipv4: pernet registration failed\n");
 		goto out_ipv4;
 	}
+
+	net_ipt_module_set(net, VE_IP_CONNTRACK);
+
 	return 0;
 out_ipv4:
 	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
@@ -449,6 +455,9 @@ static void ipv4_net_exit(struct net *net)
 	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_icmp);
 	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_udp4);
 	nf_ct_l4proto_pernet_unregister(net, &nf_conntrack_l4proto_tcp4);
+
+	if (net_is_ipt_module_set(net, VE_IP_CONNTRACK))
+		net_ipt_module_clear(net, VE_IP_CONNTRACK);
 }
 
 static struct pernet_operations ipv4_net_ops = {
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4_compat.c
@@ -427,8 +427,8 @@ static int __net_init ip_conntrack_net_init(struct net *net)
 	if (!proc_exp)
 		goto err2;
 
-	proc_stat = proc_create("ip_conntrack", S_IRUGO,
-				net->proc_net_stat, &ct_cpu_seq_fops);
+	proc_stat = proc_net_create_data("ip_conntrack", S_IRUGO,
+				net->proc_net_stat, &ct_cpu_seq_fops, NULL);
 	if (!proc_stat)
 		goto err3;
 	return 0;
--- a/net/ipv4/netfilter/nf_log_ipv4.c
+++ b/net/ipv4/netfilter/nf_log_ipv4.c
@@ -316,10 +316,6 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
 {
 	struct nf_log_buf *m;
 
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
-
 	m = nf_log_buf_open();
 
 	if (!loginfo)
@@ -333,7 +329,7 @@ static void nf_log_ip_packet(struct net *net, u_int8_t pf,
 
 	dump_ipv4_packet(m, loginfo, skb, 0);
 
-	nf_log_buf_close(m);
+	nf_log_buf_close(m, net->owner_ve);
 }
 
 static struct nf_logger nf_ip_logger __read_mostly = {
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -740,7 +740,7 @@ struct proto ping_prot = {
 	.init =		ping_init_sock,
 	.close =	ping_close,
 	.connect =	ip4_datagram_connect,
-	.disconnect =	udp_disconnect,
+	.disconnect =	__udp_disconnect,
 	.setsockopt =	ip_setsockopt,
 	.getsockopt =	ip_getsockopt,
 	.sendmsg =	ping_sendmsg,
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -79,9 +79,10 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/compat.h>
 
-static struct raw_hashinfo raw_v4_hashinfo = {
+struct raw_hashinfo raw_v4_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
 };
+EXPORT_SYMBOL_GPL(raw_v4_hashinfo);
 
 void raw_hash_sk(struct sock *sk)
 {
@@ -108,7 +109,7 @@ void raw_unhash_sk(struct sock *sk)
 }
 EXPORT_SYMBOL_GPL(raw_unhash_sk);
 
-static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
 		unsigned short num, __be32 raddr, __be32 laddr, int dif)
 {
 	sk_for_each_from(sk) {
@@ -124,6 +125,7 @@ static struct sock *__raw_v4_lookup(struct net *net, struct sock *sk,
 found:
 	return sk;
 }
+EXPORT_SYMBOL_GPL(__raw_v4_lookup);
 
 /*
  *	0 - deliver
@@ -884,7 +886,7 @@ struct proto raw_prot = {
 	.close		   = raw_close,
 	.destroy	   = raw_destroy,
 	.connect	   = ip4_datagram_connect,
-	.disconnect	   = udp_disconnect,
+	.disconnect	   = __udp_disconnect,
 	.ioctl		   = raw_ioctl,
 	.init		   = raw_init,
 	.setsockopt	   = raw_setsockopt,
@@ -914,7 +916,7 @@ static struct sock *raw_get_first(struct seq_file *seq)
 	for (state->bucket = 0; state->bucket < RAW_HTABLE_SIZE;
 			++state->bucket) {
 		sk_for_each(sk, &state->h->ht[state->bucket])
-			if (sock_net(sk) == seq_file_net(seq))
+			if (net_access_allowed(sock_net(sk), seq_file_net(seq)))
 				goto found;
 	}
 	sk = NULL;
@@ -930,7 +932,7 @@ static struct sock *raw_get_next(struct seq_file *seq, struct sock *sk)
 		sk = sk_next(sk);
 try_again:
 		;
-	} while (sk && sock_net(sk) != seq_file_net(seq));
+	} while (sk && !net_access_allowed(sock_net(sk), seq_file_net(seq)));
 
 	if (!sk && ++state->bucket < RAW_HTABLE_SIZE) {
 		sk = sk_head(&state->h->ht[state->bucket]);
--- /dev/null
+++ b/net/ipv4/raw_diag.c
@@ -0,0 +1,243 @@
+#include <linux/module.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+#include <net/inet_sock.h>
+#include <net/raw.h>
+#include <net/rawv6.h>
+
+#ifdef pr_fmt
+# undef pr_fmt
+#endif
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+static struct raw_hashinfo *
+raw_get_hashinfo(const struct inet_diag_req_v2 *r)
+{
+	if (r->sdiag_family == AF_INET) {
+		return &raw_v4_hashinfo;
+#if IS_ENABLED(CONFIG_IPV6)
+	} else if (r->sdiag_family == AF_INET6) {
+		return &raw_v6_hashinfo;
+#endif
+	} else {
+		pr_warn_once("Unexpected inet family %d\n",
+			     r->sdiag_family);
+		WARN_ON_ONCE(1);
+		return ERR_PTR(-EINVAL);
+	}
+}
+
+/*
+ * Since we must not break the user API, we cannot simply rename the
+ * @pad field in struct inet_diag_req_v2; instead, the helper structure
+ * inet_diag_req_raw overlays it to carry the raw protocol number.
+ */
+
+static struct sock *raw_lookup(struct net *net, struct sock *from,
+			       const struct inet_diag_req_v2 *req)
+{
+	struct inet_diag_req_raw *r = (void *)req;
+	struct sock *sk = NULL;
+
+	if (r->sdiag_family == AF_INET)
+		sk = __raw_v4_lookup(net, from, r->sdiag_raw_protocol,
+				     r->id.idiag_dst[0],
+				     r->id.idiag_src[0],
+				     r->id.idiag_if);
+#if IS_ENABLED(CONFIG_IPV6)
+	else
+		sk = __raw_v6_lookup(net, from, r->sdiag_raw_protocol,
+				     (const struct in6_addr *)r->id.idiag_src,
+				     (const struct in6_addr *)r->id.idiag_dst,
+				     r->id.idiag_if);
+#endif
+	return sk;
+}
+
+static struct sock *raw_sock_get(struct net *net, const struct inet_diag_req_v2 *r)
+{
+	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+	struct sock *sk = NULL, *s;
+	int slot;
+
+	if (IS_ERR(hashinfo))
+		return ERR_CAST(hashinfo);
+
+	read_lock(&hashinfo->lock);
+	for (slot = 0; slot < RAW_HTABLE_SIZE; slot++) {
+		sk_for_each(s, &hashinfo->ht[slot]) {
+			sk = raw_lookup(net, s, r);
+			if (sk) {
+				/*
+				 * Grab a reference and keep it until
+				 * the diag message to be reported has
+				 * been filled in, so the caller must
+				 * call sock_put() afterwards.  We may
+				 * do so because we are holding
+				 * hashinfo->lock here.
+				 */
+				sock_hold(sk);
+				goto out_unlock;
+			}
+		}
+	}
+out_unlock:
+	read_unlock(&hashinfo->lock);
+
+	return sk ? sk : ERR_PTR(-ENOENT);
+}
+
+static int raw_diag_dump_one(struct sk_buff *in_skb,
+			     const struct nlmsghdr *nlh,
+			     const struct inet_diag_req_v2 *r)
+{
+	struct net *net = sock_net(in_skb->sk);
+	struct sk_buff *rep;
+	struct sock *sk;
+	int err;
+
+	sk = raw_sock_get(net, r);
+	if (IS_ERR(sk))
+		return PTR_ERR(sk);
+
+	rep = nlmsg_new(sizeof(struct inet_diag_msg) +
+			sizeof(struct inet_diag_meminfo) + 64,
+			GFP_KERNEL);
+	if (!rep) {
+		sock_put(sk);
+		return -ENOMEM;
+	}
+
+	err = inet_sk_diag_fill(sk, NULL, rep, r,
+				sk_user_ns(NETLINK_CB(in_skb).sk),
+				NETLINK_CB(in_skb).portid,
+				nlh->nlmsg_seq, 0, nlh);
+	sock_put(sk);
+
+	if (err < 0) {
+		kfree_skb(rep);
+		return err;
+	}
+
+	err = netlink_unicast(net->diag_nlsk, rep,
+			      NETLINK_CB(in_skb).portid,
+			      MSG_DONTWAIT);
+	if (err > 0)
+		err = 0;
+	return err;
+}
+
+static int sk_diag_dump(struct sock *sk, struct sk_buff *skb,
+			struct netlink_callback *cb,
+			const struct inet_diag_req_v2 *r,
+			struct nlattr *bc)
+{
+	if (!inet_diag_bc_sk(bc, sk))
+		return 0;
+
+	return inet_sk_diag_fill(sk, NULL, skb, r,
+				 sk_user_ns(NETLINK_CB(cb->skb).sk),
+				 NETLINK_CB(cb->skb).portid,
+				 cb->nlh->nlmsg_seq, NLM_F_MULTI,
+				 cb->nlh);
+}
+
+static void raw_diag_dump(struct sk_buff *skb, struct netlink_callback *cb,
+			  const struct inet_diag_req_v2 *r, struct nlattr *bc)
+{
+	struct raw_hashinfo *hashinfo = raw_get_hashinfo(r);
+	struct net *net = sock_net(skb->sk);
+	int num, s_num, slot, s_slot;
+	struct sock *sk = NULL;
+
+	if (IS_ERR(hashinfo))
+		return;
+
+	s_slot = cb->args[0];
+	num = s_num = cb->args[1];
+
+	read_lock(&hashinfo->lock);
+	for (slot = s_slot; slot < RAW_HTABLE_SIZE; s_num = 0, slot++) {
+		num = 0;
+
+		sk_for_each(sk, &hashinfo->ht[slot]) {
+			struct inet_sock *inet = inet_sk(sk);
+
+			if (!net_eq(sock_net(sk), net))
+				continue;
+			if (num < s_num)
+				goto next;
+			if (sk->sk_family != r->sdiag_family)
+				goto next;
+			if (r->id.idiag_sport != inet->inet_sport &&
+			    r->id.idiag_sport)
+				goto next;
+			if (r->id.idiag_dport != inet->inet_dport &&
+			    r->id.idiag_dport)
+				goto next;
+			if (sk_diag_dump(sk, skb, cb, r, bc) < 0)
+				goto out_unlock;
+next:
+			num++;
+		}
+	}
+
+out_unlock:
+	read_unlock(&hashinfo->lock);
+
+	cb->args[0] = slot;
+	cb->args[1] = num;
+}
+
+static void raw_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
+			      void *info)
+{
+	r->idiag_rqueue = sk_rmem_alloc_get(sk);
+	r->idiag_wqueue = sk_wmem_alloc_get(sk);
+}
+
+static const struct inet_diag_handler raw_diag_handler = {
+	.dump			= raw_diag_dump,
+	.dump_one		= raw_diag_dump_one,
+	.idiag_get_info		= raw_diag_get_info,
+	.idiag_type		= IPPROTO_RAW,
+};
+
+static void __always_unused __check_inet_diag_req_raw(void)
+{
+	/*
+	 * Make sure the two structures are identical,
+	 * except for the @pad field.
+	 */
+#define __offset_mismatch(m1, m2)			\
+	(offsetof(struct inet_diag_req_v2, m1) !=	\
+	 offsetof(struct inet_diag_req_raw, m2))
+
+	BUILD_BUG_ON(sizeof(struct inet_diag_req_v2) !=
+		     sizeof(struct inet_diag_req_raw));
+	BUILD_BUG_ON(__offset_mismatch(sdiag_family, sdiag_family));
+	BUILD_BUG_ON(__offset_mismatch(sdiag_protocol, sdiag_protocol));
+	BUILD_BUG_ON(__offset_mismatch(idiag_ext, idiag_ext));
+	BUILD_BUG_ON(__offset_mismatch(pad, sdiag_raw_protocol));
+	BUILD_BUG_ON(__offset_mismatch(idiag_states, idiag_states));
+	BUILD_BUG_ON(__offset_mismatch(id, id));
+#undef __offset_mismatch
+}
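+
+/*
+ * Illustrative sketch, not part of the original patch: a userspace
+ * request for raw sockets carries the protocol number in the former
+ * @pad byte, e.g.
+ *
+ *	struct inet_diag_req_raw req = {
+ *		.sdiag_family		= AF_INET,
+ *		.sdiag_protocol		= IPPROTO_RAW,
+ *		.sdiag_raw_protocol	= IPPROTO_ICMP,
+ *		.idiag_states		= -1,
+ *	};
+ *
+ * sent as a SOCK_DIAG_BY_FAMILY message over a NETLINK_SOCK_DIAG
+ * socket.
+ */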
+
+static int __init raw_diag_init(void)
+{
+	return inet_diag_register(&raw_diag_handler);
+}
+
+static void __exit raw_diag_exit(void)
+{
+	inet_diag_unregister(&raw_diag_handler);
+}
+
+module_init(raw_diag_init);
+module_exit(raw_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 2-255 /* AF_INET - IPPROTO_RAW */);
+MODULE_ALIAS_NET_PF_PROTO_TYPE(PF_NETLINK, NETLINK_SOCK_DIAG, 10-255 /* AF_INET6 - IPPROTO_RAW */);
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -70,6 +70,7 @@
 #include <linux/types.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
+#include <linux/nsproxy.h>
 #include <linux/string.h>
 #include <linux/socket.h>
 #include <linux/sockios.h>
@@ -118,6 +119,7 @@
 
 #define RT_GC_TIMEOUT (300*HZ)
 
+int ip_rt_src_check		= 1;
 static int ip_rt_max_size;
 static int ip_rt_redirect_number __read_mostly	= 9;
 static int ip_rt_redirect_load __read_mostly	= HZ / 50;
@@ -388,13 +390,14 @@ static int __net_init ip_rt_do_proc_init(struct net *net)
 	if (!pde)
 		goto err1;
 
-	pde = proc_create("rt_cache", S_IRUGO,
-			  net->proc_net_stat, &rt_cpu_seq_fops);
+	pde = proc_net_create_data("rt_cache", S_IRUGO,
+			  net->proc_net_stat, &rt_cpu_seq_fops, NULL);
 	if (!pde)
 		goto err2;
 
 #ifdef CONFIG_IP_ROUTE_CLASSID
-	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
+	pde = proc_net_create_data("rt_acct", 0,
+			net->proc_net, &rt_acct_proc_fops, NULL);
 	if (!pde)
 		goto err3;
 #endif
@@ -965,6 +968,7 @@ static int ip_error(struct sk_buff *skb)
 out:	kfree_skb(skb);
 	return 0;
 }
+EXPORT_SYMBOL(rt_cache_flush);
 
 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
 {
@@ -2179,7 +2183,7 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 			goto make_route;
 		}
 
-		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
+		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC) && ip_rt_src_check) {
 			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
 			if (!__ip_dev_find(net, fl4->saddr, false))
 				goto out;
@@ -2742,6 +2746,15 @@ static struct ctl_table ipv4_route_table[] = {
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+#ifdef CONFIG_VE
+	{
+		.procname	= "src_check",
+		.data		= &ip_rt_src_check,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+#endif
 	{ }
 };
 
@@ -2901,3 +2914,28 @@ void __init ip_static_sysctl_init(void)
 	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
 }
 #endif
+
+#if 0
+static void ip_rt_dump_dst(void *o)
+{
+	struct rtable *rt = (struct rtable *)o;
+
+	if (rt->dst.flags & DST_FREE)
+		return;
+
+	printk("=== %p\n", o);
+	dst_dump_one(&rt->dst);
+	printk("\tgen %x flags %x type %d\n",
+			rt->rt_genid, rt->rt_flags, (int)rt->rt_type);
+}
+#endif
+
+void ip_rt_dump_dsts(void)
+{
+	printk("IPv4 dst cache:\n");
+	//FIXME
+	//slab_obj_walk(ipv4_dst_ops.kmem_cachep, ip_rt_dump_dst);
+}
+
+void (*ip6_rt_dump_dsts)(void);
+EXPORT_SYMBOL_GPL(ip6_rt_dump_dsts);
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -44,6 +44,9 @@ static int ip_ping_group_range_max[] = { GID_T_MAX, GID_T_MAX };
 
 static int rhel_unused_sysctl __read_mostly;
 
+int sysctl_tcp_use_sg = 1;
+EXPORT_SYMBOL(sysctl_tcp_use_sg);
+
 /* Update system visible IP port range */
 static void set_local_port_range(struct net *net, int range[2])
 {
@@ -428,8 +431,8 @@ static struct ctl_table ipv4_table[] = {
 		.procname	= "tcp_syncookies",
 		.data		= &sysctl_tcp_syncookies,
 		.maxlen		= sizeof(int),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec
+		.mode		= 0644 | S_ISVTX,
+		.proc_handler	= proc_dointvec_immutable,
 	},
 #endif
 	{
@@ -809,6 +812,13 @@ static struct ctl_table ipv4_table[] = {
 		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &one
 	},
+	{
+		.procname	= "tcp_use_sg",
+		.data		= &sysctl_tcp_use_sg,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -268,6 +268,7 @@
 #include <linux/crypto.h>
 #include <linux/time.h>
 #include <linux/slab.h>
+#include <linux/ve.h>
 
 #include <net/icmp.h>
 #include <net/inet_common.h>
@@ -289,6 +290,7 @@ int sysctl_tcp_autocorking __read_mostly = 1;
 struct percpu_counter tcp_orphan_count;
 EXPORT_SYMBOL_GPL(tcp_orphan_count);
 
+int sysctl_tcp_mem[3] __read_mostly;
 int sysctl_tcp_wmem[3] __read_mostly;
 int sysctl_tcp_rmem[3] __read_mostly;
 
@@ -401,6 +403,8 @@ void tcp_init_sock(struct sock *sk)
 	tp->mss_cache = TCP_MSS_DEFAULT;
 	u64_stats_init(&tp->syncp);
 
+	tp->advmss = 65535; /* max value */
+
 	tp->reordering = sysctl_tcp_reordering;
 	tcp_enable_early_retrans(tp);
 	tcp_assign_congestion_control(sk);
@@ -764,6 +768,12 @@ ssize_t tcp_splice_read(struct socket *sock, loff_t *ppos,
 				ret = -EAGAIN;
 				break;
 			}
+			/* if __tcp_splice_read() got nothing while we have
+			 * an skb in receive queue, we do not want to loop.
+			 * This might happen with URG data.
+			 */
+			if (!skb_queue_empty(&sk->sk_receive_queue))
+				break;
 			sk_wait_data(sk, &timeo, NULL);
 			if (signal_pending(current)) {
 				ret = sock_intr_errno(timeo);
@@ -1260,7 +1270,7 @@ new_segment:
 wait_for_sndbuf:
 			set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 wait_for_memory:
-			if (copied)
+			if (copied && likely(!tp->repair))
 				tcp_push(sk, flags & ~MSG_MORE, mss_now,
 					 TCP_NAGLE_PUSH, size_goal);
 
@@ -1379,8 +1389,10 @@ static void tcp_cleanup_rbuf(struct sock *sk, int copied)
 	struct sk_buff *skb = skb_peek(&sk->sk_receive_queue);
 
 	WARN(skb && !before(tp->copied_seq, TCP_SKB_CB(skb)->end_seq),
-	     "cleanup rbuf bug: copied %X seq %X rcvnxt %X\n",
-	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq, tp->rcv_nxt);
+	     KERN_INFO "cleanup rbuf bug (%s/%s): copied %X seq %X/%X rcvnxt %X\n",
+	     ve_name(sock_net(sk)->owner_ve), current->comm,
+	     tp->copied_seq, TCP_SKB_CB(skb)->end_seq,
+	     TCP_SKB_CB(skb)->seq, tp->rcv_nxt);
 
 	if (inet_csk_ack_scheduled(sk)) {
 		const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -1649,7 +1661,9 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 			if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
 				goto found_fin_ok;
 			WARN(!(flags & MSG_PEEK),
-			     "recvmsg bug 2: copied %X seq %X rcvnxt %X fl %X\n",
+			     "recvmsg bug 2 (%s/%s): "
+			     "copied %X seq %X rcvnxt %X fl %X\n",
+			     ve_name(sock_net(sk)->owner_ve), current->comm,
 			     *seq, TCP_SKB_CB(skb)->seq, tp->rcv_nxt, flags);
 		}
 
@@ -1711,8 +1725,18 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
 
 			tp->ucopy.len = len;
 
-			WARN_ON(tp->copied_seq != tp->rcv_nxt &&
-				!(flags & (MSG_PEEK | MSG_TRUNC)));
+			if (WARN_ON(tp->copied_seq != tp->rcv_nxt &&
+				!(flags & (MSG_PEEK | MSG_TRUNC)))) {
+				printk("KERNEL: assertion: tp->copied_seq == "
+						"tp->rcv_nxt || ...\n");
+				printk("VE%s pid %d comm %.16s\n",
+						ve_name(sock_net(sk)->owner_ve),
+						current->pid, current->comm);
+				printk("flags=0x%x, len=%d, copied_seq=%d, "
+						"rcv_nxt=%d\n", flags,
+						(int)len, tp->copied_seq,
+						tp->rcv_nxt);
+			}
 
 			/* Ugly... If prequeue is not empty, we have to
 			 * process it before releasing socket, otherwise
@@ -2105,7 +2129,7 @@ adjudge_to_death:
 	bh_lock_sock(sk);
 	WARN_ON(sock_owned_by_user(sk));
 
-	percpu_counter_inc(sk->sk_prot->orphan_count);
+	orphan_count_inc(sk);
 
 	/* Have we already been destroyed by a softirq or backlog? */
 	if (state != TCP_CLOSE && sk->sk_state == TCP_CLOSE)
@@ -2231,6 +2255,7 @@ int tcp_disconnect(struct sock *sk, int flags)
 	tp->snd_ssthresh = TCP_INFINITE_SSTHRESH;
 	tp->snd_cwnd_cnt = 0;
 	tp->window_clamp = 0;
+	tp->advmss = 65535;
 	tcp_set_ca_state(sk, TCP_CA_Open);
 	tcp_clear_retrans(tp);
 	inet_csk_delack_init(sk);
@@ -2255,7 +2280,39 @@ void tcp_sock_destruct(struct sock *sk)
 static inline bool tcp_can_repair_sock(const struct sock *sk)
 {
 	return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) &&
-		((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_ESTABLISHED));
+		(sk->sk_state != TCP_LISTEN);
+}
+
+static int tcp_repair_set_window(struct tcp_sock *tp, char __user *optbuf, int len)
+{
+	struct tcp_repair_window opt;
+
+	if (!tp->repair)
+		return -EPERM;
+
+	if (len != sizeof(opt))
+		return -EINVAL;
+
+	if (copy_from_user(&opt, optbuf, sizeof(opt)))
+		return -EFAULT;
+
+	if (opt.max_window < opt.snd_wnd)
+		return -EINVAL;
+
+	if (after(opt.snd_wl1, tp->rcv_nxt + opt.rcv_wnd))
+		return -EINVAL;
+
+	if (after(opt.rcv_wup, tp->rcv_nxt))
+		return -EINVAL;
+
+	tp->snd_wl1	= opt.snd_wl1;
+	tp->snd_wnd	= opt.snd_wnd;
+	tp->max_window	= opt.max_window;
+
+	tp->rcv_wnd	= opt.rcv_wnd;
+	tp->rcv_wup	= opt.rcv_wup;
+
+	return 0;
 }
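+
+/*
+ * Illustrative use, not part of the original patch: a checkpoint/
+ * restore tool would restore window state on a socket that is
+ * already in repair mode, roughly:
+ *
+ *	struct tcp_repair_window wnd = { ... saved values ... };
+ *	int one = 1;
+ *
+ *	setsockopt(fd, SOL_TCP, TCP_REPAIR, &one, sizeof(one));
+ *	setsockopt(fd, SOL_TCP, TCP_REPAIR_WINDOW, &wnd, sizeof(wnd));
+ */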
 
 static int tcp_repair_options_est(struct tcp_sock *tp,
@@ -2577,6 +2634,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 		else
 			tp->tsoffset = val - tcp_time_stamp;
 		break;
+	case TCP_REPAIR_WINDOW:
+		err = tcp_repair_set_window(tp, optval, optlen);
+		break;
 	case TCP_NOTSENT_LOWAT:
 		tp->notsent_lowat = val;
 		sk->sk_write_space(sk);
@@ -2798,6 +2858,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 			return -EINVAL;
 		break;
 
+	case TCP_REPAIR_WINDOW: {
+		struct tcp_repair_window opt;
+
+		if (get_user(len, optlen))
+			return -EFAULT;
+
+		if (len != sizeof(opt))
+			return -EINVAL;
+
+		if (!tp->repair)
+			return -EPERM;
+
+		opt.snd_wl1	= tp->snd_wl1;
+		opt.snd_wnd	= tp->snd_wnd;
+		opt.max_window	= tp->max_window;
+		opt.rcv_wnd	= tp->rcv_wnd;
+		opt.rcv_wup	= tp->rcv_wup;
+
+		if (copy_to_user(optval, &opt, len))
+			return -EFAULT;
+		return 0;
+	}
 	case TCP_QUEUE_SEQ:
 		if (tp->repair_queue == TCP_SEND_QUEUE)
 			val = tp->write_seq;
@@ -3041,7 +3123,7 @@ void __init tcp_init(void)
 	tcp_hashinfo.bind_bucket_cachep =
 		kmem_cache_create("tcp_bind_bucket",
 				  sizeof(struct inet_bind_bucket), 0,
-				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+				  SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL);
 
 	/* Size and allocate the main established and bind bucket
 	 * hash tables.
@@ -3086,6 +3168,11 @@ void __init tcp_init(void)
 	sysctl_tcp_max_orphans = cnt / 2;
 	sysctl_max_syn_backlog = max(128, cnt / 256);
 
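+	/* Keep the pressure and low thresholds within 4096 pages of
+	 * the hard limit (presumably so that TCP memory pressure
+	 * engages only close to the limit).
+	 */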
+	if (sysctl_tcp_mem[2] - sysctl_tcp_mem[1] > 4096)
+		sysctl_tcp_mem[1] = sysctl_tcp_mem[2] - 4096;
+	if (sysctl_tcp_mem[1] - sysctl_tcp_mem[0] > 4096)
+		sysctl_tcp_mem[0] = sysctl_tcp_mem[1] - 4096;
+
 	tcp_init_mem(&init_net);
 	/* Set per-socket limits to no more than 1/128 the pressure threshold */
 	limit = nr_free_buffer_pages() << (PAGE_SHIFT - 7);
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -4497,7 +4497,7 @@ queue_and_out:
 				if (skb_queue_len(&sk->sk_receive_queue) == 0)
 					sk_forced_mem_schedule(sk, skb->truesize);
 				else if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
-					goto drop;
+					goto drop_part;
 			}
 			eaten = tcp_queue_rcv(sk, skb, 0, &fragstolen);
 		}
@@ -4540,6 +4540,12 @@ out_of_window:
 drop:
 		__kfree_skb(skb);
 		return;
+
+drop_part:
+		if (after(tp->copied_seq, tp->rcv_nxt))
+			tp->rcv_nxt = tp->copied_seq;
+		__kfree_skb(skb);
+		return;
 	}
 
 	/* Out of window. F.e. zero window probe. */
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -785,7 +785,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
 	struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
 
 	tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
-			tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
+			tcptw->tw_rcv_wnd >>
+				(tw->tw_rcv_wscale & TW_WSCALE_MASK),
 			tcp_time_stamp + tcptw->tw_ts_offset,
 			tcptw->tw_ts_recent,
 			tw->tw_bound_dev_if,
@@ -1470,7 +1471,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 			}
 		}
 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
-		return 0;
+		goto restore_context;
 	}
 
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1487,7 +1488,7 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 				rsk = nsk;
 				goto reset;
 			}
-			return 0;
+			goto restore_context;
 		}
 	} else
 		sock_rps_save_rxhash(sk, skb);
@@ -1496,6 +1497,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
 		rsk = sk;
 		goto reset;
 	}
+
+restore_context:
 	return 0;
 
 reset:
@@ -1507,7 +1510,7 @@ discard:
 	 * might be destroyed here. This current version compiles correctly,
 	 * but you have been warned.
 	 */
-	return 0;
+	goto restore_context;
 
 csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
@@ -1608,6 +1611,21 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
 }
 EXPORT_SYMBOL(tcp_prequeue);
 
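+/*
+ * Run the socket filter, allowing it to trim the skb down to the TCP
+ * header but no further, and shrink end_seq by however many payload
+ * bytes the filter actually removed so that sequence accounting
+ * stays consistent.
+ */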
+int tcp_filter(struct sock *sk, struct sk_buff *skb)
+{
+	struct tcphdr *th = (struct tcphdr *)skb->data;
+	unsigned int eaten = skb->len;
+	int err;
+
+	err = sk_filter_trim_cap(sk, skb, th->doff * 4);
+	if (!err) {
+		eaten -= skb->len;
+		TCP_SKB_CB(skb)->end_seq -= eaten;
+	}
+	return err;
+}
+EXPORT_SYMBOL(tcp_filter);
+
 /*
  *	From tcp_input.c
  */
@@ -1684,8 +1702,10 @@ process:
 
 	nf_reset(skb);
 
-	if (sk_filter(sk, skb))
+	if (tcp_filter(sk, skb))
 		goto discard_and_relse;
+	th = (const struct tcphdr *)skb->data;
+	iph = ip_hdr(skb);
 
 	sk_mark_napi_id(sk, skb);
 	skb->dev = NULL;
@@ -2035,7 +2055,8 @@ static void *established_get_next(struct seq_file *seq, void *cur)
 	sk = sk_nulls_next(sk);
 
 	sk_nulls_for_each_from(sk, node) {
-		if (sk->sk_family == st->family && net_eq(sock_net(sk), net))
+		if (sk->sk_family == st->family &&
+		    net_eq(sock_net(sk), net))
 			return sk;
 	}
 
@@ -2210,7 +2231,7 @@ int tcp_proc_register(struct net *net, struct tcp_seq_afinfo *afinfo)
 	afinfo->seq_ops.next		= tcp_seq_next;
 	afinfo->seq_ops.stop		= tcp_seq_stop;
 
-	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+	p = proc_net_create_data(afinfo->name, S_IRUGO, net->proc_net,
 			     afinfo->seq_fops, afinfo);
 	if (!p)
 		rc = -ENOMEM;
--- a/net/ipv4/tcp_memcontrol.c
+++ b/net/ipv4/tcp_memcontrol.c
@@ -6,6 +6,8 @@
 #include <linux/memcontrol.h>
 #include <linux/module.h>
 
+#define RES_ORPHANS	1024
+
 static inline struct tcp_memcontrol *tcp_from_cgproto(struct cg_proto *cg_proto)
 {
 	return container_of(cg_proto, struct tcp_memcontrol, cg_proto);
@@ -18,6 +20,67 @@ static void memcg_tcp_enter_memory_pressure(struct sock *sk)
 }
 EXPORT_SYMBOL(memcg_tcp_enter_memory_pressure);
 
+void cg_orphan_count_inc(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_inc(&tcp->tcp_orphan_count);
+	}
+}
+EXPORT_SYMBOL(cg_orphan_count_inc);
+
+void cg_orphan_count_dec(struct sock *sk)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+
+		tcp = tcp_from_cgproto(cg);
+		percpu_counter_dec(&tcp->tcp_orphan_count);
+	}
+}
+
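+/*
+ * Check the orphan limit at each level of the memcg hierarchy.  The
+ * cheap but approximate percpu_counter_read_positive() is tried
+ * first; only when it suggests the limit was crossed do we pay for
+ * the exact percpu_counter_sum_positive().
+ */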
+bool cg_too_many_orphans(struct sock *sk, int shift)
+{
+	struct cg_proto *cg;
+
+	for (cg = sk->sk_cgrp; cg; cg = parent_cg_proto(sk->sk_prot, cg)) {
+		struct tcp_memcontrol *tcp;
+		struct percpu_counter *ocp;
+		int orphans;
+
+		tcp = tcp_from_cgproto(cg);
+		ocp = &tcp->tcp_orphan_count;
+		orphans = percpu_counter_read_positive(ocp);
+
+		if (orphans << shift > tcp->tcp_max_orphans) {
+			orphans = percpu_counter_sum_positive(ocp);
+			if (orphans << shift > tcp->tcp_max_orphans)
+				return true;
+		}
+	}
+
+	return false;
+}
+
+static u64 tcp_read_orphans(struct mem_cgroup *mem)
+{
+	struct tcp_memcontrol *tcp;
+	struct cg_proto *cg_proto;
+
+	cg_proto = tcp_prot.proto_cgroup(mem);
+	if (!cg_proto)
+		return 0;
+
+	tcp = tcp_from_cgproto(cg_proto);
+	return percpu_counter_sum_positive(&tcp->tcp_orphan_count);
+}
+
 int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 {
 	/*
@@ -40,6 +103,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 	tcp->tcp_prot_mem[0] = net->ipv4.sysctl_tcp_mem[0];
 	tcp->tcp_prot_mem[1] = net->ipv4.sysctl_tcp_mem[1];
 	tcp->tcp_prot_mem[2] = net->ipv4.sysctl_tcp_mem[2];
+	tcp->tcp_max_orphans = sysctl_tcp_max_orphans >> 2;
 	tcp->tcp_memory_pressure = 0;
 
 	parent_cg = tcp_prot.proto_cgroup(parent);
@@ -48,6 +112,7 @@ int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
 
 	page_counter_init(&tcp->tcp_memory_allocated, counter_parent);
 	percpu_counter_init(&tcp->tcp_sockets_allocated, 0, GFP_KERNEL);
+	percpu_counter_init(&tcp->tcp_orphan_count, 0, GFP_KERNEL);
 
 	cg_proto->enter_memory_pressure = memcg_tcp_enter_memory_pressure;
 	cg_proto->memory_pressure = &tcp->tcp_memory_pressure;
@@ -71,6 +136,7 @@ void tcp_destroy_cgroup(struct mem_cgroup *memcg)
 
 	tcp = tcp_from_cgproto(cg_proto);
 	percpu_counter_destroy(&tcp->tcp_sockets_allocated);
+	percpu_counter_destroy(&tcp->tcp_orphan_count);
 }
 EXPORT_SYMBOL(tcp_destroy_cgroup);
 
@@ -191,6 +257,9 @@ static u64 tcp_cgroup_read(struct cgroup *cont, struct cftype *cft)
 		val = cg_proto->memory_allocated->watermark;
 		val *= PAGE_SIZE;
 		break;
+	case RES_ORPHANS:
+		val = tcp_read_orphans(memcg);
+		break;
 	default:
 		BUG();
 	}
@@ -259,6 +328,11 @@ static struct cftype tcp_files[] = {
 		.trigger = tcp_cgroup_reset,
 		.read_u64 = tcp_cgroup_read,
 	},
+	{
+		.name = "kmem.tcp.orphans",
+		.private = RES_ORPHANS,
+		.read_u64 = tcp_cgroup_read, /* XXX add configuration knob */
+	},
 	{ }	/* terminate */
 };
 
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -304,6 +304,8 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
 		tcptw->tw_ts_recent_stamp = tp->rx_opt.ts_recent_stamp;
 		tcptw->tw_ts_offset	= tp->tsoffset;
 		tcptw->tw_last_oow_ack_time = 0;
+		if (sk->sk_user_data != NULL)
+			tw->tw_rcv_wscale |= TW_WSCALE_SPEC;
 
 #if IS_ENABLED(CONFIG_IPV6)
 		if (tw->tw_family == PF_INET6) {
@@ -459,7 +461,6 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 		struct inet_connection_sock *newicsk = inet_csk(newsk);
 		struct tcp_sock *newtp = tcp_sk(newsk);
 
-		/* Now setup tcp_sock */
 		newtp->pred_flags = 0;
 
 		newtp->rcv_wup = newtp->copied_seq =
@@ -550,6 +551,7 @@ struct sock *tcp_create_openreq_child(struct sock *sk, struct request_sock *req,
 			newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len;
 		newtp->rx_opt.mss_clamp = req->mss;
 		TCP_ECN_openreq_child(newtp, req);
+		newtp->fastopen_req = NULL;
 		newtp->fastopen_rsk = NULL;
 		newtp->syn_data_acked = 0;
 
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -412,11 +412,6 @@ static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
 	TCP_SKB_CB(skb)->end_seq = seq;
 }
 
-static inline bool tcp_urg_mode(const struct tcp_sock *tp)
-{
-	return tp->snd_una != tp->snd_up;
-}
-
 #define OPTION_SACK_ADVERTISE	(1 << 0)
 #define OPTION_TS		(1 << 1)
 #define OPTION_MD5		(1 << 2)
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -114,6 +114,7 @@
 #include <trace/events/skb.h>
 #include <net/busy_poll.h>
 #include "udp_impl.h"
+#include <net/udp_memcontrol.h>
 
 struct udp_table udp_table __read_mostly;
 EXPORT_SYMBOL(udp_table);
@@ -150,6 +151,7 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
 		    sk2 != sk &&
 		    (bitmap || udp_sk(sk2)->udp_port_hash == num) &&
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    sk->sk_reuse != SK_FORCE_REUSE &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
@@ -185,6 +187,7 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
 		    sk2 != sk &&
 		    (udp_sk(sk2)->udp_port_hash == num) &&
 		    (!sk2->sk_reuse || !sk->sk_reuse) &&
+		    sk->sk_reuse != SK_FORCE_REUSE &&
 		    (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
 		     sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
 		    (!sk2->sk_reuseport || !sk->sk_reuseport ||
@@ -1351,7 +1354,7 @@ csum_copy_err:
 }
 
 
-int udp_disconnect(struct sock *sk, int flags)
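+/* Caller must hold the socket lock; udp_disconnect() below is the
+ * locking wrapper for paths that enter without it.
+ */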
+int __udp_disconnect(struct sock *sk, int flags)
 {
 	struct inet_sock *inet = inet_sk(sk);
 	/*
@@ -1373,6 +1376,15 @@ int udp_disconnect(struct sock *sk, int flags)
 	sk_dst_reset(sk);
 	return 0;
 }
+EXPORT_SYMBOL(__udp_disconnect);
+
+int udp_disconnect(struct sock *sk, int flags)
+{
+	lock_sock(sk);
+	__udp_disconnect(sk, flags);
+	release_sock(sk);
+	return 0;
+}
 EXPORT_SYMBOL(udp_disconnect);
 
 void udp_lib_unhash(struct sock *sk)
@@ -2023,6 +2035,7 @@ void udp_destroy_sock(struct sock *sk)
 		if (encap_destroy)
 			encap_destroy(sk);
 	}
+	sock_release_memcg(sk);
 }
 
 /*
@@ -2241,6 +2254,16 @@ unsigned int udp_poll(struct file *file, struct socket *sock, poll_table *wait)
 }
 EXPORT_SYMBOL(udp_poll);
 
+int udp_init_sock(struct sock *sk)
+{
+	local_bh_disable();
+	sock_update_memcg(sk);
+	local_bh_enable();
+
+	return 0;
+}
+EXPORT_SYMBOL(udp_init_sock);
+
 struct proto udp_prot = {
 	.name		   = "UDP",
 	.owner		   = THIS_MODULE,
@@ -2248,6 +2271,7 @@ struct proto udp_prot = {
 	.connect	   = ip4_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = udp_ioctl,
+	.init		   = udp_init_sock,
 	.destroy	   = udp_destroy_sock,
 	.setsockopt	   = udp_setsockopt,
 	.getsockopt	   = udp_getsockopt,
@@ -2272,6 +2296,11 @@ struct proto udp_prot = {
 	.compat_getsockopt = compat_udp_getsockopt,
 #endif
 	.clear_sk	   = sk_prot_clear_portaddr_nulls,
+#ifdef CONFIG_MEMCG_KMEM
+	.init_cgroup		= udp_init_cgroup,
+	.destroy_cgroup		= udp_destroy_cgroup,
+	.proto_cgroup		= udp_proto_cgroup,
+#endif
 };
 EXPORT_SYMBOL(udp_prot);
 
@@ -2294,7 +2323,7 @@ static struct sock *udp_get_first(struct seq_file *seq, int start)
 
 		spin_lock_bh(&hslot->lock);
 		sk_nulls_for_each(sk, node, &hslot->head) {
-			if (!net_eq(sock_net(sk), net))
+			if (!net_access_allowed(sock_net(sk), net))
 				continue;
 			if (sk->sk_family == state->family)
 				goto found;
@@ -2313,7 +2342,7 @@ static struct sock *udp_get_next(struct seq_file *seq, struct sock *sk)
 
 	do {
 		sk = sk_nulls_next(sk);
-	} while (sk && (!net_eq(sock_net(sk), net) || sk->sk_family != state->family));
+	} while (sk && (!net_access_allowed(sock_net(sk), net) || sk->sk_family != state->family));
 
 	if (!sk) {
 		if (state->bucket <= state->udp_table->mask)
@@ -2390,7 +2419,7 @@ int udp_proc_register(struct net *net, struct udp_seq_afinfo *afinfo)
 	afinfo->seq_ops.next		= udp_seq_next;
 	afinfo->seq_ops.stop		= udp_seq_stop;
 
-	p = proc_create_data(afinfo->name, S_IRUGO, net->proc_net,
+	p = proc_net_create_data(afinfo->name, S_IRUGO, net->proc_net,
 			     afinfo->seq_fops, afinfo);
 	if (!p)
 		rc = -ENOMEM;
--- /dev/null
+++ b/net/ipv4/udp_memcontrol.c
@@ -0,0 +1,227 @@
+/*
+ *  net/ipv4/udp_memcontrol.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <net/udp.h>
+#include <net/udp_memcontrol.h>
+#include <net/sock.h>
+#include <net/ip.h>
+#include <linux/nsproxy.h>
+#include <linux/memcontrol.h>
+#include <linux/module.h>
+
+/*
+ * The code below is copied from tcp_memcontrol.c with s/tcp/udp/g,
+ * except that UDP needs neither the memory-pressure state nor the
+ * sockets_allocated counter.
+ */
+
+static inline struct udp_memcontrol *udp_from_cgproto(struct cg_proto *cg_proto)
+{
+	return container_of(cg_proto, struct udp_memcontrol, cg_proto);
+}
+
+int udp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
+{
+	/*
+	 * The root cgroup does not use page_counters, but rather
+	 * relies on the data already collected by the network
+	 * subsystem.
+	 */
+	struct page_counter *counter_parent = NULL;
+	struct cg_proto *cg_proto, *parent_cg;
+	struct udp_memcontrol *udp;
+	struct mem_cgroup *parent = parent_mem_cgroup(memcg);
+
+	cg_proto = udp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return 0;
+
+	udp = udp_from_cgproto(cg_proto);
+
+	udp->udp_prot_mem[0] = sysctl_udp_mem[0];
+	udp->udp_prot_mem[1] = sysctl_udp_mem[1];
+	udp->udp_prot_mem[2] = sysctl_udp_mem[2];
+
+	parent_cg = udp_prot.proto_cgroup(parent);
+	if (parent_cg)
+		counter_parent = parent_cg->memory_allocated;
+
+	page_counter_init(&udp->udp_memory_allocated, counter_parent);
+
+	cg_proto->sysctl_mem = udp->udp_prot_mem;
+	cg_proto->memory_allocated = &udp->udp_memory_allocated;
+	cg_proto->memcg = memcg;
+
+	return 0;
+}
+
+void udp_destroy_cgroup(struct mem_cgroup *memcg)
+{
+}
+
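+/*
+ * Mirrors tcp_update_limit(): clamp the per-memcg UDP thresholds to
+ * the new limit and, on the first transition away from "unlimited",
+ * enable the memcg socket-limit static key.
+ */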
+static int udp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
+{
+	struct udp_memcontrol *udp;
+	struct cg_proto *cg_proto;
+	u64 old_lim;
+	int i;
+	int ret;
+
+	cg_proto = udp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return -EINVAL;
+
+	udp = udp_from_cgproto(cg_proto);
+
+	old_lim = udp->udp_memory_allocated.limit;
+	ret = page_counter_limit(&udp->udp_memory_allocated, nr_pages);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < 3; i++)
+		udp->udp_prot_mem[i] = min_t(long, nr_pages, sysctl_udp_mem[i]);
+
+	if (nr_pages == PAGE_COUNTER_MAX)
+		clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+	else {
+		if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
+			static_key_slow_inc(&memcg_socket_limit_enabled);
+		set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
+	}
+
+	return 0;
+}
+
+enum {
+	RES_USAGE,
+	RES_LIMIT,
+	RES_MAX_USAGE,
+	RES_FAILCNT,
+};
+
+static DEFINE_MUTEX(udp_limit_mutex);
+
+static int udp_cgroup_write(struct cgroup *cont, struct cftype *cft,
+			    const char *buffer)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	unsigned long nr_pages;
+	int ret = 0;
+
+	switch (cft->private) {
+	case RES_LIMIT:
+		/* see memcontrol.c */
+		ret = page_counter_memparse(buffer, &nr_pages);
+		if (ret)
+			break;
+
+		mutex_lock(&udp_limit_mutex);
+		ret = udp_update_limit(memcg, nr_pages);
+		mutex_unlock(&udp_limit_mutex);
+		break;
+	default:
+		ret = -EINVAL;
+		break;
+	}
+	return ret;
+}
+
+static u64 udp_cgroup_read(struct cgroup *cont, struct cftype *cft)
+{
+	struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+	struct cg_proto *cg_proto = udp_prot.proto_cgroup(memcg);
+
+	u64 val;
+
+	switch (cft->private) {
+	case RES_LIMIT:
+		if (!cg_proto)
+			return PAGE_COUNTER_MAX;
+		val = cg_proto->memory_allocated->limit;
+		val *= PAGE_SIZE;
+		break;
+	case RES_USAGE:
+		if (!cg_proto)
+			val = atomic_long_read(&udp_memory_allocated);
+		else
+			val = page_counter_read(cg_proto->memory_allocated);
+		val *= PAGE_SIZE;
+		break;
+	case RES_FAILCNT:
+		if (!cg_proto)
+			return 0;
+		val = cg_proto->memory_allocated->failcnt;
+		break;
+	case RES_MAX_USAGE:
+		if (!cg_proto)
+			return 0;
+		val = cg_proto->memory_allocated->watermark;
+		val *= PAGE_SIZE;
+		break;
+	default:
+		BUG();
+	}
+	return val;
+}
+
+static int udp_cgroup_reset(struct cgroup *cont, unsigned int event)
+{
+	struct mem_cgroup *memcg;
+	struct udp_memcontrol *udp;
+	struct cg_proto *cg_proto;
+
+	memcg = mem_cgroup_from_cont(cont);
+	cg_proto = udp_prot.proto_cgroup(memcg);
+	if (!cg_proto)
+		return 0;
+	udp = udp_from_cgproto(cg_proto);
+
+	switch (event) {
+	case RES_MAX_USAGE:
+		page_counter_reset_watermark(&udp->udp_memory_allocated);
+		break;
+	case RES_FAILCNT:
+		cg_proto->memory_allocated->failcnt = 0;
+		break;
+	}
+
+	return 0;
+}
+
+static struct cftype udp_files[] = {
+	{
+		.name = "kmem.udp.limit_in_bytes",
+		.write_string = udp_cgroup_write,
+		.read_u64 = udp_cgroup_read,
+		.private = RES_LIMIT,
+	},
+	{
+		.name = "kmem.udp.usage_in_bytes",
+		.read_u64 = udp_cgroup_read,
+		.private = RES_USAGE,
+	},
+	{
+		.name = "kmem.udp.failcnt",
+		.private = RES_FAILCNT,
+		.trigger = udp_cgroup_reset,
+		.read_u64 = udp_cgroup_read,
+	},
+	{
+		.name = "kmem.udp.max_usage_in_bytes",
+		.private = RES_MAX_USAGE,
+		.trigger = udp_cgroup_reset,
+		.read_u64 = udp_cgroup_read,
+	},
+	{ }	/* terminate */
+};
+
+static int __init udp_memcontrol_init(void)
+{
+	WARN_ON(cgroup_add_cftypes(&mem_cgroup_subsys, udp_files));
+	return 0;
+}
+__initcall(udp_memcontrol_init);
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -90,6 +90,7 @@
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/export.h>
+#include <linux/ve.h>
 
 /* Set to 3 to get tracing... */
 #define ACONF_DEBUG 2
@@ -818,7 +819,7 @@ ipv6_add_addr(struct inet6_dev *idev, const struct in6_addr *addr,
 		goto out;
 	}
 
-	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC);
+	ifa = kzalloc(sizeof(struct inet6_ifaddr), GFP_ATOMIC | __GFP_ACCOUNT);
 
 	if (ifa == NULL) {
 		ADBG("ipv6_add_addr: malloc failed\n");
@@ -3033,7 +3034,6 @@ static int addrconf_ifdown(struct net_device *dev, int how)
 	ASSERT_RTNL();
 
 	rt6_ifdown(net, dev);
-	neigh_ifdown(&nd_tbl, dev);
 
 	idev = __in6_dev_get(dev);
 	if (idev == NULL)
@@ -3220,6 +3220,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
 	if (dev->flags&(IFF_NOARP|IFF_LOOPBACK) ||
 	    idev->cnf.accept_dad < 1 ||
 	    !(ifp->flags&IFA_F_TENTATIVE) ||
+	    dev_net(dev)->owner_ve->disable_net ||
 	    ifp->flags & IFA_F_NODAD) {
 		ifp->flags &= ~(IFA_F_TENTATIVE|IFA_F_OPTIMISTIC|IFA_F_DADFAILED);
 		spin_unlock(&ifp->lock);
@@ -4411,7 +4412,7 @@ static inline void __snmp6_fill_stats64(u64 *stats, void __percpu **mib,
 	/* Use put_unaligned() because stats may not be aligned for u64. */
 	put_unaligned(items, &stats[0]);
 	for (i = 1; i < items; i++)
-		put_unaligned(snmp_fold_field64(mib, i, syncpoff), &stats[i]);
+		put_unaligned(__snmp_fold_field64(mib, i, syncpoff, cpu_online_mask), &stats[i]);
 
 	memset(&stats[items], 0, pad);
 }
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -38,11 +38,13 @@
 #include <linux/stat.h>
 #include <linux/init.h>
 #include <linux/slab.h>
+#include <linux/ve.h>
 
 #include <linux/inet.h>
 #include <linux/netdevice.h>
 #include <linux/icmpv6.h>
 #include <linux/netfilter_ipv6.h>
+#include <linux/cpu.h>
 
 #include <net/ip.h>
 #include <net/ipv6.h>
@@ -59,6 +61,9 @@
 #ifdef CONFIG_IPV6_TUNNEL
 #include <net/ip6_tunnel.h>
 #endif
+#ifdef CONFIG_IPV6_MIP6
+#include <net/mip6.h>
+#endif
 
 #include <asm/uaccess.h>
 #include <linux/mroute6.h>
@@ -158,6 +163,10 @@ lookup_protocol:
 			goto out_rcu_unlock;
 	}
 
+	err = vz_security_protocol_check(net, answer->protocol);
+	if (err < 0)
+		goto out_rcu_unlock;
+
 	err = -EPERM;
 	if (sock->type == SOCK_RAW && !kern &&
 	    !ns_capable(net->user_ns, CAP_NET_RAW))
@@ -703,6 +712,54 @@ bool ipv6_opt_accepted(const struct sock *sk, const struct sk_buff *skb)
 }
 EXPORT_SYMBOL_GPL(ipv6_opt_accepted);
 
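+/*
+ * Fold the IPv6 per-cpu MIB counters of a dead CPU into the current
+ * CPU's counters so the totals survive CPU hot-unplug; interrupts
+ * are disabled so local softirq updates cannot race the fold.  Slot
+ * 0 (IPSTATS_MIB_NUM) is not a real counter, hence the loop starts
+ * at 1 (see __snmp6_fill_stats64()).
+ */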
+static void move_ipv6_percpu_stats(int cpu, void **mib, int items)
+{
+	int this_cpu, i;
+
+	local_irq_disable();
+	this_cpu = smp_processor_id();
+
+	for (i = 1; i < items; i++) {
+		*(((u64 *) per_cpu_ptr(mib[0], this_cpu)) + i) +=
+		*(((u64 *) per_cpu_ptr(mib[0], cpu)) + i);
+
+		*(((u64 *) per_cpu_ptr(mib[0], cpu)) + i) = 0;
+	}
+	local_irq_enable();
+}
+
+static int ipv6_cpu_notify(struct notifier_block *self,
+			   unsigned long action, void *hcpu)
+{
+	int cpu = (unsigned long)hcpu;
+	struct net_device *dev;
+	struct inet6_dev *idev;
+	struct net *net;
+
+	switch (action) {
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		rtnl_lock();
+		for_each_net(net) {
+			for_each_netdev(net, dev) {
+				idev = __in6_dev_get(dev);
+				if (!idev)
+					continue;
+				move_ipv6_percpu_stats(cpu,
+						(void **)idev->stats.ipv6,
+						IPSTATS_MIB_MAX);
+			}
+		}
+		rtnl_unlock();
+	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block ipv6_cpu_notifier = {
+	.notifier_call  = ipv6_cpu_notify,
+};
+
 static struct packet_type ipv6_packet_type __read_mostly = {
 	.type = cpu_to_be16(ETH_P_IPV6),
 	.func = ipv6_rcv,
@@ -958,6 +1015,7 @@ static int __init inet6_init(void)
 	if (err)
 		goto sysctl_fail;
 #endif
+	register_cpu_notifier(&ipv6_cpu_notifier);
 out:
 	return err;
 
@@ -1029,6 +1087,8 @@ static void __exit inet6_exit(void)
 	if (disable_ipv6_mod)
 		return;
 
+	unregister_cpu_notifier(&ipv6_cpu_notifier);
+
 	/* First of all disallow new sockets creation. */
 	sock_unregister(PF_INET6);
 	/* Disallow any further netlink messages */
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -198,11 +198,9 @@ static void fib6_link_table(struct net *net, struct fib6_table *tb)
 
 	h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
 
-	/*
-	 * No protection necessary, this is the only list mutatation
-	 * operation, tables never disappear once they exist.
-	 */
+	write_lock_bh(&tb->tb6_lock);
 	hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
+	write_unlock_bh(&tb->tb6_lock);
 }
 
 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
@@ -1627,6 +1625,7 @@ void fib6_clean_all(struct net *net, int (*func)(struct rt6_info *, void *arg),
 	for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
 		head = &net->ipv6.fib_table_hash[h];
 		hlist_for_each_entry_rcu(table, head, tb6_hlist) {
+
 			write_lock_bh(&table->tb6_lock);
 			fib6_clean_tree(net, &table->tb6_root,
 					func, false, arg);
@@ -1791,6 +1790,7 @@ static int __net_init fib6_net_init(struct net *net)
 					   GFP_KERNEL);
 	if (!net->ipv6.fib6_local_tbl)
 		goto out_fib6_main_tbl;
+
 	net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
 	net->ipv6.fib6_local_tbl->tb6_root.leaf = net->ipv6.ip6_null_entry;
 	net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
@@ -1839,7 +1839,7 @@ int __init fib6_init(void)
 
 	fib6_node_kmem = kmem_cache_create("fib6_nodes",
 					   sizeof(struct fib6_node),
-					   0, SLAB_HWCACHE_ALIGN,
+					   0, SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT,
 					   NULL);
 	if (!fib6_node_kmem)
 		goto out;
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -61,7 +61,6 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 	const struct net_offload *ops;
 	int proto;
 	struct frag_hdr *fptr;
-	unsigned int unfrag_ip6hlen;
 	u8 *prevhdr;
 	int offset = 0;
 	bool encap, udpfrag;
@@ -104,8 +103,10 @@ static struct sk_buff *ipv6_gso_segment(struct sk_buff *skb,
 		skb->network_header = (u8 *)ipv6h - skb->head;
 
 		if (udpfrag) {
-			unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
-			fptr = (struct frag_hdr *)((u8 *)ipv6h + unfrag_ip6hlen);
+			int err = ip6_find_1stfragopt(skb, &prevhdr);
+			if (err < 0)
+				return ERR_PTR(err);
+			fptr = (struct frag_hdr *)((u8 *)ipv6h + err);
 			fptr->frag_off = htons(offset);
 			if (skb->next != NULL)
 				fptr->frag_off |= htons(IP6_MF);
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -148,6 +148,7 @@ int ip6_output(struct sock *sk, struct sk_buff *skb)
 			    ip6_finish_output,
 			    !(IP6CB(skb)->flags & IP6SKB_REROUTED));
 }
+EXPORT_SYMBOL(ip6_output);
 
 /*
  *	xmit an sk_buff (used by TCP, SCTP and DCCP)
@@ -476,6 +477,20 @@ int ip6_forward(struct sk_buff *skb)
 		return -EMSGSIZE;
 	}
 
+	/*
+	 * Optimize forwarding of VE packets: do not decrement the hop
+	 * limit (and so save the skb_cow()) when forwarding packets
+	 * going out of a VE.  Incoming packets still get their hop
+	 * limit decremented, since such skbs are not cloned and need
+	 * no actual copy-on-write.  Thus every packet path keeps at
+	 * least one mandatory hop-limit decrement, which is enough to
+	 * prevent routing loops.
+	 */
+	hdr = ipv6_hdr(skb);
+	if (skb->dev->features & NETIF_F_VENET) /* src is VENET device */
+		goto no_ttl_decr;
+
 	if (skb_cow(skb, dst->dev->hard_header_len)) {
 		IP6_INC_STATS(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTDISCARDS);
 		goto drop;
@@ -487,6 +502,7 @@ int ip6_forward(struct sk_buff *skb)
 
 	hdr->hop_limit--;
 
+no_ttl_decr:
 	IP6_INC_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTFORWDATAGRAMS);
 	IP6_ADD_STATS_BH(net, ip6_dst_idev(dst), IPSTATS_MIB_OUTOCTETS, skb->len);
 	return NF_HOOK(NFPROTO_IPV6, NF_INET_FORWARD, NULL, skb,
@@ -536,7 +552,10 @@ int ip6_fragment(struct sock *sk, struct sk_buff *skb,
 	u8 *prevhdr, nexthdr = 0;
 	struct net *net = dev_net(skb_dst(skb)->dev);
 
-	hlen = ip6_find_1stfragopt(skb, &prevhdr);
+	err = ip6_find_1stfragopt(skb, &prevhdr);
+	if (err < 0)
+		goto fail;
+	hlen = err;
 	nexthdr = *prevhdr;
 
 	mtu = ip6_skb_dst_mtu(skb);
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -249,7 +249,7 @@ static int __net_init ip6mr_rules_init(struct net *net)
 	return 0;
 
 err2:
-	kfree(mrt);
+	ip6mr_free_table(mrt);
 err1:
 	fib_rules_unregister(ops);
 	return err;
@@ -778,7 +778,8 @@ failure:
  *	Delete a VIF entry
  */
 
-static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
+static int mif6_delete(struct mr6_table *mrt, int vifi, int notify,
+		       struct list_head *head)
 {
 	struct mif_device *v;
 	struct net_device *dev;
@@ -824,7 +825,7 @@ static int mif6_delete(struct mr6_table *mrt, int vifi, struct list_head *head)
 					     dev->ifindex, &in6_dev->cnf);
 	}
 
-	if (v->flags & MIFF_REGISTER)
+	if ((v->flags & MIFF_REGISTER) && !notify)
 		unregister_netdevice_queue(dev, head);
 
 	dev_put(dev);
@@ -1333,7 +1334,6 @@ static int ip6mr_device_event(struct notifier_block *this,
 	struct mr6_table *mrt;
 	struct mif_device *v;
 	int ct;
-	LIST_HEAD(list);
 
 	if (event != NETDEV_UNREGISTER)
 		return NOTIFY_DONE;
@@ -1342,10 +1342,9 @@ static int ip6mr_device_event(struct notifier_block *this,
 		v = &mrt->vif6_table[0];
 		for (ct = 0; ct < mrt->maxvif; ct++, v++) {
 			if (v->dev == dev)
-				mif6_delete(mrt, ct, &list);
+				mif6_delete(mrt, ct, 1, NULL);
 		}
 	}
-	unregister_netdevice_many(&list);
 
 	return NOTIFY_DONE;
 }
@@ -1549,7 +1548,7 @@ static void mroute_clean_tables(struct mr6_table *mrt)
 	 */
 	for (i = 0; i < mrt->maxvif; i++) {
 		if (!(mrt->vif6_table[i].flags & VIFF_STATIC))
-			mif6_delete(mrt, i, &list);
+			mif6_delete(mrt, i, 0, &list);
 	}
 	unregister_netdevice_many(&list);
 
@@ -1662,6 +1661,10 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
 
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+		return -EOPNOTSUPP;
+
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (mrt == NULL)
 		return -ENOENT;
@@ -1673,9 +1676,6 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 
 	switch (optname) {
 	case MRT6_INIT:
-		if (sk->sk_type != SOCK_RAW ||
-		    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
-			return -EOPNOTSUPP;
 		if (optlen < sizeof(int))
 			return -EINVAL;
 
@@ -1702,7 +1702,7 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, uns
 		if (copy_from_user(&mifi, optval, sizeof(mifi_t)))
 			return -EFAULT;
 		rtnl_lock();
-		ret = mif6_delete(mrt, mifi, NULL);
+		ret = mif6_delete(mrt, mifi, 0, NULL);
 		rtnl_unlock();
 		return ret;
 
@@ -1811,6 +1811,10 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval,
 	struct net *net = sock_net(sk);
 	struct mr6_table *mrt;
 
+	if (sk->sk_type != SOCK_RAW ||
+	    inet_sk(sk)->inet_num != IPPROTO_ICMPV6)
+		return -EOPNOTSUPP;
+
 	mrt = ip6mr_get_table(net, raw6_sk(sk)->ip6mr_table ? : RT6_TABLE_DFLT);
 	if (mrt == NULL)
 		return -ENOENT;
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -121,6 +121,7 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
 static bool setsockopt_needs_rtnl(int optname)
 {
 	switch (optname) {
+	case IPV6_ADDRFORM:
 	case IPV6_ADD_MEMBERSHIP:
 	case IPV6_DROP_MEMBERSHIP:
 	case IPV6_JOIN_ANYCAST:
@@ -199,7 +200,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
 			}
 
 			fl6_free_socklist(sk);
-			ipv6_sock_mc_close(sk);
+			__ipv6_sock_mc_close(sk);
 
 			/*
 			 * Sock is moving from IPv6 to IPv4 (sk_prot), so
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -208,7 +208,6 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, const struct in6_addr *addr)
 
 	return 0;
 }
-EXPORT_SYMBOL(ipv6_sock_mc_join);
 
 /*
  *	socket leave on multicast group
@@ -285,16 +284,14 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net,
 	return idev;
 }
 
-void ipv6_sock_mc_close(struct sock *sk)
+void __ipv6_sock_mc_close(struct sock *sk)
 {
 	struct ipv6_pinfo *np = inet6_sk(sk);
 	struct ipv6_mc_socklist *mc_lst;
 	struct net *net = sock_net(sk);
 
-	if (!rcu_access_pointer(np->ipv6_mc_list))
-		return;
+	ASSERT_RTNL();
 
-	rtnl_lock();
 	while ((mc_lst = rtnl_dereference(np->ipv6_mc_list)) != NULL) {
 		struct net_device *dev;
 
@@ -312,8 +309,17 @@ void ipv6_sock_mc_close(struct sock *sk)
 
 		atomic_sub(sizeof(*mc_lst), &sk->sk_omem_alloc);
 		kfree_rcu(mc_lst, rcu);
-
 	}
+}
+
+void ipv6_sock_mc_close(struct sock *sk)
+{
+	struct ipv6_pinfo *np = inet6_sk(sk);
+
+	if (!rcu_access_pointer(np->ipv6_mc_list))
+		return;
+	rtnl_lock();
+	__ipv6_sock_mc_close(sk);
 	rtnl_unlock();
 }
 
--- a/net/ipv6/netfilter/Makefile
+++ b/net/ipv6/netfilter/Makefile
@@ -16,6 +16,8 @@ nf_conntrack_ipv6-y  :=  nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o
 # l3 independent conntrack
 obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o
 
+obj-$(CONFIG_VE_IP_NF_VZPRIVNET) += ip6_vzprivnet.o
+
 nf_nat_ipv6-y		:= nf_nat_l3proto_ipv6.o nf_nat_proto_icmpv6.o
 obj-$(CONFIG_NF_NAT_IPV6) += nf_nat_ipv6.o
 obj-$(CONFIG_NF_NAT_MASQUERADE_IPV6) += nf_nat_masquerade_ipv6.o
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -326,6 +326,9 @@ ip6t_do_table(struct sk_buff *skb,
 	struct xt_action_param acpar;
 	unsigned int addend;
 
+	if (ve_xt_table_forbidden(table))
+		return NF_ACCEPT;
+
 	/* Initialization */
 	indev = state->in ? state->in->name : nulldevname;
 	outdev = state->out ? state->out->name : nulldevname;
@@ -754,6 +757,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
 	if (err)
 		return err;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	/* Check hooks & underflows */
 	for (h = 0; h < NF_INET_NUMHOOKS; h++) {
 		if (!(valid_hooks & (1 << h)))
@@ -1481,6 +1488,10 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
 	if (ret)
 		return ret;
 
+	/* target start is within the ip/ip6/arpt_entry struct */
+	if (e->target_offset < ((const void *)e->elems - (const void *)e))
+		return -EINVAL;
+
 	off = sizeof(struct ip6t_entry) - sizeof(struct compat_ip6t_entry);
 	entry_offset = (void *)e - (void *)base;
 	j = 0;
@@ -1704,9 +1715,10 @@ static int
 compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user,
 		       unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1814,9 +1826,10 @@ static int do_ip6t_get_ctl(struct sock *, int, void __user *, int *);
 static int
 compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1836,9 +1849,10 @@ compat_do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 static int
 do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -1861,9 +1875,10 @@ do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len)
 static int
 do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
 {
+	struct user_namespace *user_ns = sock_net(sk)->user_ns;
 	int ret;
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 
 	switch (cmd) {
@@ -2061,12 +2076,25 @@ static struct xt_match ip6t_builtin_mt[] __read_mostly = {
 
 static int __net_init ip6_tables_net_init(struct net *net)
 {
-	return xt_proto_init(net, NFPROTO_IPV6);
+	int res;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES6))
+		return 0;
+
+	res = xt_proto_init(net, NFPROTO_IPV6);
+	if (!res)
+		net_ipt_module_set(net, VE_IP_IPTABLES6);
+	return res;
 }
 
 static void __net_exit ip6_tables_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_IPTABLES6))
+		return;
+
 	xt_proto_fini(net, NFPROTO_IPV6);
+
+	net_ipt_module_clear(net, VE_IP_IPTABLES6);
 }
 
 static struct pernet_operations ip6_tables_net_ops = {
--- /dev/null
+++ b/net/ipv6/netfilter/ip6_vzprivnet.c
@@ -0,0 +1,1134 @@
+/*
+ *  net/ipv6/netfilter/ip6_vzprivnet.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/netfilter.h>
+#include <linux/netfilter_ipv6.h>
+#include <linux/vzprivnet.h>
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/inet.h>
+#include <net/ipv6.h>
+
+static DEFINE_RWLOCK(vzpriv6lock);
+
+struct vzprivnet {
+	unsigned int netid;
+	int weak;
+	unsigned int subnet_preflen;
+	struct list_head list;
+	struct list_head entries;
+};
+
+static LIST_HEAD(sparse6_vzprivnets);
+
+struct vzprivnet_entry {
+	__u32 ip[4];
+	unsigned preflen;
+	struct vzprivnet *pn;
+	struct vzprivnet6_node *n;
+	struct list_head list;
+};
+
+struct vzprivnet6_node
+{
+	struct vzprivnet6_node	*parent;
+	struct vzprivnet6_node	*left;
+	struct vzprivnet6_node	*right;
+
+	struct vzprivnet_entry	*entry;
+
+	__u16			fn_bit;		/* bit key */
+	__u16			fn_flags;
+};
+
+struct vzprivnet internet = {
+	.weak = VZPRIVNET_INET,
+};
+
+#define RTN_RTINFO		1
+
+static struct vzprivnet_entry sparse6_null_entry = {
+	.preflen = 128,
+	.pn = &internet,
+};
+
+static struct vzprivnet6_node sparse6_root_node = {
+	.entry		= &sparse6_null_entry,
+	.fn_flags	= RTN_RTINFO,
+};
+
+static struct vzprivnet_entry legacy6_null_entry = {
+	.preflen = 128,
+	.pn = &internet,
+};
+
+static struct vzprivnet6_node legacy6_root_node = {
+	.entry		= &legacy6_null_entry,
+	.fn_flags	= RTN_RTINFO,
+};
+
+static LIST_HEAD(legacy6_vzprivnets);
+
+static inline int ip6_match(u32 *net, unsigned plen, u32 *ip)
+{
+	return ipv6_prefix_equal((const struct in6_addr *)net, (const struct in6_addr *)ip, plen);
+}
+
+static inline int ip6_intersect(u32 *ip1, unsigned len1, u32 *ip2, unsigned len2)
+{
+	return ip6_match(ip1, len1, ip2) || ip6_match(ip2, len2, ip1);
+}
+
+static __inline__ int addr_bit_set(void *ip, int fn_bit)
+{
+	__u32 *addr = ip;
+
+	return htonl(1 << ((~fn_bit)&0x1F)) & addr[fn_bit>>5];
+}
+
+static __inline__ void vzprivnet6_node_free(struct vzprivnet6_node * fn)
+{
+	kfree(fn);
+}
+
+static __inline__ struct vzprivnet6_node * vzprivnet6_node_alloc(void)
+{
+	return kzalloc(sizeof(struct vzprivnet6_node), GFP_ATOMIC);
+}
+
+static struct vzprivnet6_node * radix_tree_search(struct vzprivnet6_node *root,
+					struct in6_addr *addr)
+{
+	struct vzprivnet6_node *fn;
+	int dir;
+
+	fn = root;
+
+	for (;;) {
+		struct vzprivnet6_node *next;
+
+		dir = addr_bit_set(addr, fn->fn_bit);
+
+		next = dir ? fn->right : fn->left;
+		if (next) {
+			fn = next;
+			continue;
+		}
+
+		break;
+	}
+
+	if (ip6_match(fn->entry->ip, fn->entry->preflen, (u32 *)addr))
+		return fn;
+
+	return NULL;
+}
+
+static struct vzprivnet_entry *vzprivnet6_lookup(struct vzprivnet6_node *root,
+						u32 *ip)
+{
+	struct vzprivnet6_node *n;
+
+	n = radix_tree_search(root, (struct in6_addr *)ip);
+	return (n) ? n->entry : NULL;
+}
+
+static inline struct vzprivnet *vzprivnet6_lookup_net(u32 *ip)
+{
+	struct vzprivnet_entry *pne;
+
+	pne = vzprivnet6_lookup(&sparse6_root_node, ip);
+	if (pne == NULL)
+		pne = vzprivnet6_lookup(&legacy6_root_node, ip);
+
+	if (pne != NULL)
+		return pne->pn;
+	else
+		return &internet;
+}
+
+static inline int noip(u32 *ip)
+{
+	return (ip[0] | ip[1] | ip[2] | ip[3]) == 0;
+}
+
+static struct vzprivnet6_node * radix_tree_add(void *addr, unsigned plen,
+						struct vzprivnet6_node *root)
+{
+	struct vzprivnet6_node *fn, *in, *ln;
+	struct vzprivnet6_node *pn = NULL;
+	struct vzprivnet_entry *pne = NULL;
+	int	bit;
+	int	dir = 0;
+
+	/* insert node in tree */
+
+	fn = root;
+
+	do {
+		pne = fn->entry;
+		if (ip6_intersect(pne->ip, pne->preflen, (u32 *)addr, plen))
+			return ERR_PTR(-EEXIST);
+
+		/*
+		 *	Prefix match
+		 */
+		if (plen < fn->fn_bit ||
+		    !ipv6_prefix_equal((struct in6_addr *)pne->ip, addr, fn->fn_bit))
+			goto insert_intermediate_node;
+
+		dir = addr_bit_set(addr, fn->fn_bit);
+		pn = fn;
+		fn = dir ? fn->right : fn->left;
+	} while (fn);
+
+	/*
+	 *	We walked to the bottom of tree.
+	 *	Create new leaf node without children.
+	 */
+
+	ln = vzprivnet6_node_alloc();
+	if (ln == NULL)
+		return ERR_PTR(-ENOMEM);
+
+	ln->fn_bit = plen;
+	ln->parent = pn;
+
+	if (dir)
+		pn->right = ln;
+	else
+		pn->left  = ln;
+
+	return ln;
+
+insert_intermediate_node:
+
+	pn = fn->parent;
+
+	bit = ipv6_addr_diff(addr, (struct in6_addr *)pne->ip);
+
+	BUG_ON(plen <= bit);
+
+	/*
+	 *		(intermediate)[in]
+	 *	          /	   \
+	 *	(new leaf node)[ln] (old node)[fn]
+	 */
+	in = vzprivnet6_node_alloc();
+	ln = vzprivnet6_node_alloc();
+
+	if (in == NULL || ln == NULL) {
+		if (in)
+			vzprivnet6_node_free(in);
+		if (ln)
+			vzprivnet6_node_free(ln);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/*
+	 * new intermediate node.
+	 * RTN_RTINFO will be off
+	 */
+
+	in->fn_bit = bit;
+
+	in->parent = pn;
+	in->entry = fn->entry;
+
+	/* update parent pointer */
+	if (dir)
+		pn->right = in;
+	else
+		pn->left  = in;
+
+	ln->fn_bit = plen;
+
+	ln->parent = in;
+	fn->parent = in;
+
+	if (addr_bit_set(addr, bit)) {
+		in->right = ln;
+		in->left  = fn;
+	} else {
+		in->left  = ln;
+		in->right = fn;
+	}
+
+	return ln;
+}
+
+static struct vzprivnet6_node * sparse6_add_subnet(void *addr, unsigned plen)
+{
+	return radix_tree_add(addr, plen, &sparse6_root_node);
+}
+
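+/*
+ * Add a (netid, ip/preflen) entry.  The vzprivnet object for @netid
+ * is created on first use and reused afterwards; an all-zero @ip
+ * adds or updates the network itself, optionally marking it weak.
+ */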
+static int sparse6_add(unsigned netid, u32 *ip, unsigned preflen, int weak)
+{
+	int err;
+	struct vzprivnet *pn = NULL, *epn = NULL;
+	struct vzprivnet_entry *pne = NULL;
+
+	err = -ENOMEM;
+	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
+	if (pn == NULL)
+		goto out;
+
+	pne = kzalloc(sizeof(*pne), GFP_KERNEL);
+	if (pne == NULL)
+		goto out;
+
+	write_lock_bh(&vzpriv6lock);
+	list_for_each_entry(epn, &sparse6_vzprivnets, list)
+		if (epn->netid == netid) {
+			kfree(pn);
+			pn = epn;
+			goto found_net;
+		}
+
+	pn->netid = netid;
+	pn->weak = weak;
+	INIT_LIST_HEAD(&pn->entries);
+
+found_net:
+	if (!noip(ip)) {
+		struct vzprivnet6_node *n;
+
+		n = sparse6_add_subnet(ip, preflen);
+		if (IS_ERR(n)) {
+			err = PTR_ERR(n);
+			goto out_unlock;
+		}
+
+		n->entry = pne;
+		n->fn_flags |= RTN_RTINFO;
+
+		memcpy(pne->ip, ip, sizeof(pne->ip));
+		pne->preflen = preflen;
+		pne->pn = pn;
+		list_add_tail(&pne->list, &pn->entries);
+		pne->n = n;
+		pne = NULL;
+	} else if (weak == VZPRIVNET_WEAK) {
+		pn->weak = VZPRIVNET_WEAK;
+	} else if (pn == epn) {
+		err = -EEXIST;
+		goto out_unlock;
+	}
+
+	if (pn != epn) {
+		list_add_tail(&pn->list, &sparse6_vzprivnets);
+		pn = NULL;
+	}
+
+	err = 0;
+
+out_unlock:
+	write_unlock_bh(&vzpriv6lock);
+out:
+	if (pn != epn)
+		kfree(pn);
+	kfree(pne);
+
+	return err;
+}
+
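+/*
+ * Unlink @fn by splicing its single child (if any) into its parent,
+ * then walk upwards freeing intermediate nodes that carry no entry
+ * of their own (no RTN_RTINFO).
+ */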
+static void radix_tree_del(struct vzprivnet6_node *fn)
+{
+	int children;
+	struct vzprivnet6_node *child, *pn;
+
+	BUG_ON(fn->parent == NULL);
+
+	for (;;) {
+		children = 0;
+		child = NULL;
+
+		if (fn->right) {
+			child = fn->right;
+			children |= 1;
+		}
+		if (fn->left) {
+			child = fn->left;
+			children |= 2;
+		}
+
+		if (children == 3)
+			return;
+
+		pn = fn->parent;
+		if (pn->right == fn)
+			pn->right = child;
+		else if (pn->left == fn)
+			pn->left = child;
+
+		if (child)
+			child->parent = pn;
+
+		vzprivnet6_node_free(fn);
+		if (pn->fn_flags & RTN_RTINFO)
+			return;
+
+		fn = pn;
+	}
+}
+
+
+static void vzprivnet6_del_entry(struct vzprivnet_entry *pne)
+{
+	radix_tree_del(pne->n);
+}
+
+
+static void sparse6_free_entry(struct vzprivnet_entry *pne)
+{
+	list_del(&pne->list);
+	vzprivnet6_del_entry(pne);
+	kfree(pne);
+}
+
+static void vzprivnet6_del_one(struct vzprivnet *pn)
+{
+	struct vzprivnet_entry *pne;
+
+	list_del(&pn->list);
+
+	while (!list_empty(&pn->entries)) {
+		pne = list_first_entry(&pn->entries,
+				struct vzprivnet_entry, list);
+		sparse6_free_entry(pne);
+	}
+
+	kfree(pn);
+}
+
+static void vzprivnet6_cleanup(void)
+{
+	struct vzprivnet *pn;
+
+	write_lock_bh(&vzpriv6lock);
+	while (!list_empty(&sparse6_vzprivnets)) {
+		pn = list_first_entry(&sparse6_vzprivnets,
+				struct vzprivnet, list);
+		vzprivnet6_del_one(pn);
+	}
+	while (!list_empty(&legacy6_vzprivnets)) {
+		pn = list_first_entry(&legacy6_vzprivnets,
+				struct vzprivnet, list);
+		vzprivnet6_del_one(pn);
+	}
+	write_unlock_bh(&vzpriv6lock);
+}
+
+static int sparse6_del_net(unsigned netid, int weak)
+{
+	struct vzprivnet *pn;
+
+	list_for_each_entry(pn, &sparse6_vzprivnets, list) {
+		if (pn->netid != netid)
+			continue;
+
+		if (weak == VZPRIVNET_WEAK)
+			pn->weak = VZPRIVNET_STRONG;
+		else
+			vzprivnet6_del_one(pn);
+
+		return 0;
+	}
+
+	return -ENOENT;
+}
+
+static int sparse6_del_ip(u32 *ip)
+{
+	struct vzprivnet_entry *pne;
+
+	pne = vzprivnet6_lookup(&sparse6_root_node, ip);
+	if (pne == NULL)
+		return -ENOENT;
+
+	sparse6_free_entry(pne);
+	return 0;
+}
+
+static int sparse6_del(unsigned netid, u32 *ip, int weak)
+{
+	int err;
+
+	write_lock_bh(&vzpriv6lock);
+	if (!noip(ip))
+		err = sparse6_del_ip(ip);
+	else
+		err = sparse6_del_net(netid, weak);
+	write_unlock_bh(&vzpriv6lock);
+
+	return err;
+}
+
+static inline int is_ipv6_neighbour_solicit(const struct in6_addr *addr)
+{
+	/* see addrconf_addr_solict_mult */
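+	/* i.e. the solicited-node multicast prefix ff02::1:ffXX:XXXX */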
+	return (addr->s6_addr32[0] == __constant_htonl(0xFF020000) &&
+		addr->s6_addr32[1] == 0 &&
+		addr->s6_addr32[2] == __constant_htonl(1) &&
+		(addr->s6_addr32[3] & __constant_htonl(0xFF000000)) ==
+				__constant_htonl(0xFF000000));
+}
+
+static unsigned int vzprivnet6_hook(struct sk_buff *skb, int can_be_bridge)
+{
+	int verdict = NF_DROP;
+	struct vzprivnet *dst, *src;
+	struct ipv6hdr *hdr;
+	struct net *src_net;
+
+	if (WARN_ON_ONCE(!skb->dev && !skb->sk))
+		return NF_ACCEPT;
+
+	src_net = skb->dev ? dev_net(skb->dev) : sock_net(skb->sk);
+	if (!ve_is_super(src_net->owner_ve))
+		return NF_ACCEPT;
+
+	hdr = ipv6_hdr(skb);
+
+	if (can_be_bridge) {
+		if (!vzpn_handle_bridged &&
+				skb_dst(skb) != NULL &&
+				skb_dst(skb)->output != ip6_output)
+			return NF_ACCEPT;
+		if (is_ipv6_neighbour_solicit(&hdr->daddr))
+			return NF_ACCEPT;
+	}
+
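+	/*
+	 * Traffic inside one private network is allowed only within its
+	 * bounding subnet; traffic between two different networks is
+	 * allowed only if at least one of them is weak (assuming the
+	 * VZPRIVNET_WEAK/VZPRIVNET_STRONG values make the sum below
+	 * reach 3 only in that case).
+	 */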
+	read_lock(&vzpriv6lock);
+
+	src = vzprivnet6_lookup_net(hdr->saddr.s6_addr32);
+	dst = vzprivnet6_lookup_net(hdr->daddr.s6_addr32);
+
+	if (src == dst) {
+		if (ipv6_prefix_equal(&hdr->saddr, &hdr->daddr,
+				      src->subnet_preflen))
+			verdict = NF_ACCEPT;
+	} else if (src->weak + dst->weak >= 3) {
+		verdict = NF_ACCEPT;
+	}
+
+	read_unlock(&vzpriv6lock);
+
+	return verdict;
+}
+
+static unsigned int vzprivnet6_fwd_hook(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  const struct nf_hook_state *state)
+{
+	return vzprivnet6_hook(skb, 1);
+}
+
+static unsigned int vzprivnet6_host_hook(struct sk_buff *skb,
+		const struct net_device *dev, int can_be_bridge)
+{
+	if (!vzpn_filter_host)
+		return NF_ACCEPT;
+	if (!(dev->features & NETIF_F_VENET))
+		return NF_ACCEPT;
+
+	return vzprivnet6_hook(skb, can_be_bridge);
+}
+
+static unsigned int vzprivnet6_in_hook(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  const struct nf_hook_state *state)
+{
+	return vzprivnet6_host_hook(skb, in, 0);
+}
+
+static unsigned int vzprivnet6_out_hook(const struct nf_hook_ops *ops,
+				  struct sk_buff *skb,
+				  const struct net_device *in,
+				  const struct net_device *out,
+				  const struct nf_hook_state *state)
+{
+	return vzprivnet6_host_hook(skb, out, 1);
+}
+
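+/*
+ * Hook FORWARD (which also sees bridged traffic), LOCAL_IN and
+ * LOCAL_OUT at the highest priority, so isolation is enforced before
+ * any other IPv6 netfilter processing runs.
+ */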
+static struct nf_hook_ops vzprivnet6_ops[] = {
+	{
+		.hook = vzprivnet6_fwd_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET6,
+		.hooknum = NF_INET_FORWARD,
+		.priority = NF_IP6_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet6_in_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET6,
+		.hooknum = NF_INET_LOCAL_IN,
+		.priority = NF_IP6_PRI_FIRST
+	},
+	{
+		.hook = vzprivnet6_out_hook,
+		.owner = THIS_MODULE,
+		.pf = PF_INET6,
+		.hooknum = NF_INET_LOCAL_OUT,
+		.priority = NF_IP6_PRI_FIRST
+	},
+};
+
+static char *nextline(char *s)
+{
+	while (*s && *s != '\n')
+		s++;
+	while (*s == '\n')
+		s++;
+	return s;
+}
+
+static int parse_sparse6_add(const char *str, unsigned int *netid, u32 *ip,
+			     unsigned *preflen, int *weak)
+{
+	char *end;
+
+	*netid = simple_strtol(str, &end, 10);
+	if (is_eol(*end))
+		return 0;
+
+	if (*end != ':')
+		return -EINVAL;
+
+	str = end + 1;
+	if (*str == '*') {
+		if (!is_eol(*(str + 1)))
+			return -EINVAL;
+
+		*weak = VZPRIVNET_WEAK;
+		return 0;
+	}
+
+	if (!in6_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+		return -EINVAL;
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	*preflen = simple_strtol(str, &end, 10);
+	if (!is_eol(*end))
+		return -EINVAL;
+
+	return 0;
+}
+
+static int parse_sparse6_remove(const char *str, unsigned int *netid, u32 *ip,
+				int *weak)
+{
+	char *end;
+
+	if (strchr(str, ':') && !strchr(str, '*')) {
+		if (!in6_pton(str, -1, (u8 *)ip, -1, (const char **)&end)) {
+			printk(KERN_WARNING "Bad ip in %s\n", str);
+			return -EINVAL;
+		}
+
+		if (!is_eol(*end))
+			printk(KERN_WARNING "No EOL in %s\n", str);
+	} else {
+		*netid = simple_strtol(str, &end, 10);
+		if (end[0] == ':' && end[1] == '*') {
+			end += 2;
+			*weak = VZPRIVNET_WEAK;
+		}
+	}
+
+	return (is_eol(*end) ? 0 : -EINVAL);
+}
+
+static int parse_sparse6(const char *param, int *add,
+		unsigned int *netid, u32 *ip, unsigned *preflen, int *weak)
+{
+	if (param[0] == '+') {
+		*add = 1;
+		return parse_sparse6_add(param + 1, netid, ip, preflen, weak);
+	}
+
+	if (param[0] == '-') {
+		*add = 0;
+		return parse_sparse6_remove(param + 1, netid, ip, weak);
+	}
+
+	return -EINVAL;
+}
+
+/*
+ * +ID			to add a network
+ * +ID:<addr>/m		to add a subnet to network
+ * +ID:*		to make a network weak
+ * -ID			to remove the whole network
+ * -<addr>		to remove an IP or bounding subnet (from its network)
+ * -ID:*		to make a network "strong" again
+ */
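+/*
+ * A hypothetical usage sketch (the actual path depends on where
+ * vzpriv_proc_dir is created, commonly /proc/vz):
+ *
+ *	echo "+1:2001:db8::/64" > /proc/vz/sparse6	# net 1, add subnet
+ *	echo "+1:*" > /proc/vz/sparse6			# make net 1 weak
+ *	echo "-2001:db8::1" > /proc/vz/sparse6		# drop an IP/subnet
+ *	echo "-1" > /proc/vz/sparse6			# remove net 1
+ */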
+
+static ssize_t sparse6_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	if (copy_from_user(page, buf, count)) {
+		err = -EFAULT;
+		goto err;
+	}
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		int add, weak = VZPRIVNET_STRONG;
+		unsigned int netid = 0, preflen = 0;
+		u32 ip[4] = { 0, 0, 0, 0 };
+
+		err = parse_sparse6(s, &add, &netid, ip, &preflen, &weak);
+		if (err)
+			goto out;
+
+		if (add)
+			err = sparse6_add(netid, ip, preflen, weak);
+		else
+			err = sparse6_del(netid, ip, weak);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
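+	/*
+	 * On a mid-buffer failure report the bytes consumed so far as a
+	 * short write, so userspace can tell which line was rejected;
+	 * the error code itself is returned only when the very first
+	 * line fails.
+	 */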
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+
+}
+
+static void *sparse6_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	struct list_head *lh;
+	loff_t pos = *ppos;
+
+	read_lock(&vzpriv6lock);
+	list_for_each(lh, &sparse6_vzprivnets)
+		if (pos-- == 0)
+			return lh;
+
+	return NULL;
+}
+
+static void *sparse6_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+	struct list_head *lh;
+
+	lh = ((struct list_head *)v)->next;
+	++*ppos;
+	return lh == &sparse6_vzprivnets ? NULL : lh;
+}
+
+static void sparse6_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock(&vzpriv6lock);
+}
+
+static int sparse6_seq_show(struct seq_file *s, void *v)
+{
+	struct vzprivnet *pn;
+	struct vzprivnet_entry *pne;
+
+	pn = list_entry(v, struct vzprivnet, list);
+	seq_printf(s, "%u: ", pn->netid);
+	if (pn->weak == VZPRIVNET_WEAK)
+		seq_puts(s, "* ");
+
+	list_for_each_entry(pne, &pn->entries, list)
+		seq_printf(s, "%pI6/%u ", pne->ip, pne->preflen);
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations sparse6_seq_ops = {
+	.start = sparse6_seq_start,
+	.next  = sparse6_seq_next,
+	.stop  = sparse6_seq_stop,
+	.show  = sparse6_seq_show,
+};
+
+static int sparse6_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &sparse6_seq_ops);
+}
+
+static const struct file_operations proc_sparse6_ops = {
+	.owner   = THIS_MODULE,
+	.open    = sparse6_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = sparse6_write,
+};
+
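+/*
+ * classify6: write a textual IPv6 address into the file, then read it
+ * back to see how that address would be classified (sparse rule,
+ * legacy rule, or plain "internet").  A hypothetical session, assuming
+ * the file sits under /proc/vz:
+ *
+ *	echo 2001:db8::1 > /proc/vz/classify6
+ *	cat /proc/vz/classify6
+ */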
+static char sample_ipv6[42];
+
+static ssize_t classify6_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	int len;
+	char *tmp;
+
+	len = count;
+	if (len >= sizeof(sample_ipv6))
+		len = sizeof(sample_ipv6) - 1;
+
+	if (copy_from_user(sample_ipv6, buf, len))
+		return -EFAULT;
+
+	sample_ipv6[len] = '\0';
+	tmp = strchr(sample_ipv6, '\n');
+	if (tmp)
+		*tmp = '\0';
+
+	return count;
+}
+
+static int classify6_seq_show(struct seq_file *s, void *v)
+{
+	u32 ip[4];
+	struct vzprivnet_entry *pne;
+
+	seq_printf(s, "%s: ", sample_ipv6);
+
+	if (!in6_pton(sample_ipv6, sizeof(sample_ipv6), (u8 *)ip, -1, NULL)) {
+		seq_puts(s, "invalid IP\n");
+		return 0;
+	}
+
+	read_lock(&vzpriv6lock);
+	pne = vzprivnet6_lookup(&sparse6_root_node, ip);
+	if (pne != NULL) {
+		seq_printf(s, "net %u, ", pne->pn->netid);
+		seq_printf(s, "rule %pI6/%u\n", pne->ip, pne->preflen);
+		goto out;
+	}
+
+	pne = vzprivnet6_lookup(&legacy6_root_node, ip);
+	if (pne != NULL) {
+		seq_printf(s, "legacy %pI6/%u/%u\n",
+				pne->ip, pne->preflen, pne->pn->subnet_preflen);
+	} else {
+		seq_puts(s, "internet\n");
+	}
+out:
+	read_unlock(&vzpriv6lock);
+	return 0;
+}
+
+static int classify6_seq_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, &classify6_seq_show, NULL);
+}
+
+static const struct file_operations proc_classify6_ops = {
+	.owner   = THIS_MODULE,
+	.open    = classify6_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write	 = classify6_write,
+};
+
+static int legacy6_del(u32 *ip)
+{
+	struct vzprivnet_entry *pne;
+
+	write_lock_bh(&vzpriv6lock);
+	pne = vzprivnet6_lookup(&legacy6_root_node, ip);
+	if (pne == NULL) {
+		write_unlock_bh(&vzpriv6lock);
+		return -ENOENT;
+	}
+	vzprivnet6_del_one(pne->pn);
+	write_unlock_bh(&vzpriv6lock);
+
+	return 0;
+}
+
+static struct vzprivnet6_node *legacy6_add_subnet(void *addr, unsigned plen)
+{
+	return radix_tree_add(addr, plen, &legacy6_root_node);
+}
+
+static int legacy6_add(u32 *ip, u32 preflen, u32 subnet_preflen)
+{
+	int err;
+	struct vzprivnet *pn = NULL;
+	struct vzprivnet_entry *pne = NULL;
+	struct vzprivnet6_node *n;
+
+	err = -ENOMEM;
+	pn = kzalloc(sizeof(*pn), GFP_KERNEL);
+	if (pn == NULL)
+		goto out;
+
+	pn->subnet_preflen = subnet_preflen;
+	INIT_LIST_HEAD(&pn->entries);
+
+	pne = kzalloc(sizeof(*pne), GFP_KERNEL);
+	if (pne == NULL)
+		goto out;
+
+	write_lock_bh(&vzpriv6lock);
+	n = legacy6_add_subnet(ip, preflen);
+	if (IS_ERR(n)) {
+		err = PTR_ERR(n);
+		write_unlock_bh(&vzpriv6lock);
+		goto out;
+	}
+
+	n->entry = pne;
+	n->fn_flags |= RTN_RTINFO;
+
+	memcpy(pne->ip, ip, sizeof(struct in6_addr));
+	pne->preflen = preflen;
+	pne->pn = pn;
+	list_add_tail(&pne->list, &pn->entries);
+	pne->n = n;
+
+	list_add_tail(&pn->list, &legacy6_vzprivnets);
+	write_unlock_bh(&vzpriv6lock);
+
+	return 0;
+out:
+	kfree(pn);
+	kfree(pne);
+
+	return err;
+}
+
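+/*
+ * Legacy rules are written one per line as
+ *
+ *	+<addr>/<preflen>/<subnet_preflen>	to add
+ *	-<addr>					to remove
+ *
+ * with 1 <= preflen <= subnet_preflen <= 128.
+ */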
+static int parse_legacy6(char *param, int *add, u32 *ip,
+				unsigned *preflen, unsigned *subnet_preflen)
+{
+	char *str, *end;
+
+	if (param[0] == '+')
+		*add = 1;
+	else if (param[0] == '-')
+		*add = 0;
+	else
+		return -EINVAL;
+
+	str = param + 1;
+
+	if (!in6_pton(str, -1, (u8 *)ip, -1, (const char **)&end))
+		return -EINVAL;
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	*preflen = simple_strtol(str, &end, 10);
+
+	if (*end != '/')
+		return -EINVAL;
+
+	str = end + 1;
+	*subnet_preflen = simple_strtol(str, &end, 10);
+	if (!is_eol(*end))
+		return -EINVAL;
+
+	if ((*preflen == 0) || (*preflen > 128) ||
+		(*subnet_preflen == 0) || (*subnet_preflen > 128))
+		return -EINVAL;
+
+	if (*subnet_preflen < *preflen)
+		return -EINVAL;
+
+	return 0;
+}
+
+static ssize_t legacy6_write(struct file *file, const char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	char *s, *page;
+	int err;
+	int offset;
+
+	page = (char *)__get_free_page(GFP_KERNEL);
+	if (!page)
+		return -ENOMEM;
+
+	if (count > (PAGE_SIZE - 1))
+		count = (PAGE_SIZE - 1);
+
+	if (copy_from_user(page, buf, count)) {
+		err = -EFAULT;
+		goto err;
+	}
+
+	s = page;
+	s[count] = 0;
+
+	err = -EINVAL;
+	while (*s) {
+		int add;
+		unsigned int preflen = 0, subnet_preflen = 0;
+		u32 ip[4] = { 0, 0, 0, 0 };
+
+		err = parse_legacy6(s, &add, ip, &preflen, &subnet_preflen);
+		if (err)
+			goto out;
+
+		if (add)
+			err = legacy6_add(ip, preflen, subnet_preflen);
+		else
+			err = legacy6_del(ip);
+
+		if (err)
+			goto out;
+
+		s = nextline(s);
+	}
+out:
+	offset = s - page;
+	if (offset > 0)
+		err = offset;
+err:
+	free_page((unsigned long)page);
+	return err;
+}
+
+static void *legacy6_seq_start(struct seq_file *seq, loff_t *ppos)
+{
+	struct list_head *lh;
+	loff_t pos = *ppos;
+
+	read_lock(&vzpriv6lock);
+	list_for_each(lh, &legacy6_vzprivnets)
+		if (pos-- == 0)
+			return lh;
+
+	return NULL;
+}
+
+static void *legacy6_seq_next(struct seq_file *seq, void *v, loff_t *ppos)
+{
+	struct list_head *lh;
+
+	lh = ((struct list_head *)v)->next;
+	++*ppos;
+	return lh == &legacy6_vzprivnets ? NULL : lh;
+}
+
+static void legacy6_seq_stop(struct seq_file *s, void *v)
+{
+	read_unlock(&vzpriv6lock);
+}
+
+static int legacy6_seq_show(struct seq_file *s, void *v)
+{
+	struct vzprivnet *pn;
+	struct vzprivnet_entry *pne;
+
+	pn = list_entry(v, struct vzprivnet, list);
+	list_for_each_entry(pne, &pn->entries, list)
+		seq_printf(s, "%pI6/%u/%u ", pne->ip, pne->preflen,
+							pne->pn->subnet_preflen);
+
+	seq_putc(s, '\n');
+
+	return 0;
+}
+
+static const struct seq_operations legacy6_seq_ops = {
+	.start = legacy6_seq_start,
+	.next  = legacy6_seq_next,
+	.stop  = legacy6_seq_stop,
+	.show  = legacy6_seq_show,
+};
+
+static int legacy6_seq_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &legacy6_seq_ops);
+}
+
+static const struct file_operations proc_legacy6_ops = {
+	.owner   = THIS_MODULE,
+	.open    = legacy6_seq_open,
+	.read    = seq_read,
+	.llseek  = seq_lseek,
+	.release = seq_release,
+	.write   = legacy6_write,
+};
+
+static int __init ip6_vzprivnet_init(void)
+{
+	int err = -ENOMEM;
+	struct proc_dir_entry *proc;
+
+	proc = proc_create("sparse6", 0644,
+			vzpriv_proc_dir, &proc_sparse6_ops);
+	if (proc == NULL)
+		goto err_sparse6;
+
+	proc = proc_create("classify6", 0644,
+			vzpriv_proc_dir, &proc_classify6_ops);
+	if (proc == NULL)
+		goto err_classify6;
+
+	proc = proc_create("legacy6", 0644,
+			vzpriv_proc_dir, &proc_legacy6_ops);
+	if (proc == NULL)
+		goto err_legacy6;
+
+	err = nf_register_hooks(vzprivnet6_ops, ARRAY_SIZE(vzprivnet6_ops));
+	if (err)
+		goto err_reg;
+
+	return 0;
+
+err_reg:
+	remove_proc_entry("legacy6", vzpriv_proc_dir);
+err_legacy6:
+	remove_proc_entry("classify6", vzpriv_proc_dir);
+err_classify6:
+	remove_proc_entry("sparse6", vzpriv_proc_dir);
+err_sparse6:
+	return err;
+}
+
+static void __exit ip6_vzprivnet_exit(void)
+{
+	nf_unregister_hooks(vzprivnet6_ops, ARRAY_SIZE(vzprivnet6_ops));
+	remove_proc_entry("legacy6", vzpriv_proc_dir);
+	remove_proc_entry("classify6", vzpriv_proc_dir);
+	remove_proc_entry("sparse6", vzpriv_proc_dir);
+	vzprivnet6_cleanup();
+}
+
+module_init(ip6_vzprivnet_init);
+module_exit(ip6_vzprivnet_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Virtuozzo <devel@openvz.org>");
--- a/net/ipv6/netfilter/ip6t_MASQUERADE.c
+++ b/net/ipv6/netfilter/ip6t_MASQUERADE.c
@@ -33,6 +33,7 @@ static int masquerade_tg6_checkentry(const struct xt_tgchk_param *par)
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/ipv6/netfilter/ip6t_SYNPROXY.c
+++ b/net/ipv6/netfilter/ip6t_SYNPROXY.c
@@ -432,13 +432,17 @@ static unsigned int ipv6_synproxy_hook(const struct nf_hook_ops *ops,
 static int synproxy_tg6_check(const struct xt_tgchk_param *par)
 {
 	const struct ip6t_entry *e = par->entryinfo;
+	int ret;
 
 	if (!(e->ipv6.flags & IP6T_F_PROTO) ||
 	    e->ipv6.proto != IPPROTO_TCP ||
 	    e->ipv6.invflags & XT_INV_PROTO)
 		return -EINVAL;
 
-	return nf_ct_l3proto_try_module_get(par->family);
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret == 0)
+		allow_conntrack_allocation(par->net);
+	return ret;
 }
 
 static void synproxy_tg6_destroy(const struct xt_tgdtor_param *par)
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -51,6 +51,9 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_FILTER6))
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_filter);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -61,12 +64,22 @@ static int __net_init ip6table_filter_net_init(struct net *net)
 	net->ipv6.ip6table_filter =
 		ip6t_register_table(net, &packet_filter, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv6.ip6table_filter))
+		net_ipt_module_set(net, VE_IP_FILTER6);
+
 	return PTR_RET(net->ipv6.ip6table_filter);
 }
 
 static void __net_exit ip6table_filter_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_FILTER6))
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+	net->ipv6.ip6table_filter = NULL;
+
+	net_ipt_module_clear(net, VE_IP_FILTER6);
 }
 
 static struct pernet_operations ip6table_filter_net_ops = {
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -95,18 +95,31 @@ static int __net_init ip6table_mangle_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_MANGLE6))
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&packet_mangler);
 	if (repl == NULL)
 		return -ENOMEM;
 	net->ipv6.ip6table_mangle =
 		ip6t_register_table(net, &packet_mangler, repl);
 	kfree(repl);
+
+	if (!IS_ERR(net->ipv6.ip6table_mangle))
+		net_ipt_module_set(net, VE_IP_MANGLE6);
+
 	return PTR_RET(net->ipv6.ip6table_mangle);
 }
 
 static void __net_exit ip6table_mangle_net_exit(struct net *net)
 {
+	if (!net_is_ipt_module_set(net, VE_IP_MANGLE6))
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+	net->ipv6.ip6table_mangle = NULL;
+
+	net_ipt_module_clear(net, VE_IP_MANGLE6);
 }
 
 static struct pernet_operations ip6table_mangle_net_ops = {
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -115,6 +115,10 @@ static int __net_init ip6table_nat_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
 
+	if (!net_ipt_permitted(net, VE_IP_IPTABLE_NAT) ||
+	    !net_ipt_permitted(net, VE_IP_IPTABLES6))
+		return 0;
+
 	repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
 	if (repl == NULL)
 		return -ENOMEM;
@@ -125,7 +129,11 @@ static int __net_init ip6table_nat_net_init(struct net *net)
 
 static void __net_exit ip6table_nat_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_nat)
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+	net->ipv6.ip6table_nat = NULL;
 }
 
 static struct pernet_operations ip6table_nat_net_ops = {
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -33,19 +33,34 @@ static struct nf_hook_ops *rawtable_ops __read_mostly;
 static int __net_init ip6table_raw_net_init(struct net *net)
 {
 	struct ip6t_replace *repl;
+	struct xt_table *ip6table_raw;
+
+	if (WARN_ON(net->ipv6.ip6table_raw))
+		net->ipv6.ip6table_raw = NULL;
+
+	if (!net_ipt_permitted(net, VE_IP_IPTABLES6))
+		return 0;
 
 	repl = ip6t_alloc_initial_table(&packet_raw);
 	if (repl == NULL)
 		return -ENOMEM;
-	net->ipv6.ip6table_raw =
-		ip6t_register_table(net, &packet_raw, repl);
+	ip6table_raw = ip6t_register_table(net, &packet_raw, repl);
 	kfree(repl);
-	return PTR_RET(net->ipv6.ip6table_raw);
+
+	if (!IS_ERR(ip6table_raw))
+		net->ipv6.ip6table_raw = ip6table_raw;
+
+	return PTR_RET(ip6table_raw);
 }
 
 static void __net_exit ip6table_raw_net_exit(struct net *net)
 {
+	if (!net->ipv6.ip6table_raw)
+		return;
+
 	ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+
+	net->ipv6.ip6table_raw = NULL;
 }
 
 static struct pernet_operations ip6table_raw_net_ops = {
--- a/net/ipv6/netfilter/nf_log_ipv6.c
+++ b/net/ipv6/netfilter/nf_log_ipv6.c
@@ -348,10 +348,6 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 {
 	struct nf_log_buf *m;
 
-	/* FIXME: Disabled from containers until syslog ns is supported */
-	if (!net_eq(net, &init_net))
-		return;
-
 	m = nf_log_buf_open();
 
 	if (!loginfo)
@@ -365,7 +361,7 @@ static void nf_log_ip6_packet(struct net *net, u_int8_t pf,
 
 	dump_ipv6_packet(m, loginfo, skb, skb_network_offset(skb), 1);
 
-	nf_log_buf_close(m);
+	nf_log_buf_close(m, net->owner_ve);
 }
 
 static struct nf_logger nf_ip6_logger __read_mostly = {
--- a/net/ipv6/output_core.c
+++ b/net/ipv6/output_core.c
@@ -77,14 +77,13 @@ EXPORT_SYMBOL(ipv6_select_ident);
 int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 {
 	u16 offset = sizeof(struct ipv6hdr);
-	struct ipv6_opt_hdr *exthdr =
-				(struct ipv6_opt_hdr *)(ipv6_hdr(skb) + 1);
 	unsigned int packet_len = skb_tail_pointer(skb) -
 		skb_network_header(skb);
 	int found_rhdr = 0;
 	*nexthdr = &ipv6_hdr(skb)->nexthdr;
 
-	while (offset + 1 <= packet_len) {
+	while (offset <= packet_len) {
+		struct ipv6_opt_hdr *exthdr;
 
 		switch (**nexthdr) {
 
@@ -105,13 +104,16 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr)
 			return offset;
 		}
 
-		offset += ipv6_optlen(exthdr);
-		*nexthdr = &exthdr->nexthdr;
+		if (offset + sizeof(struct ipv6_opt_hdr) > packet_len)
+			return -EINVAL;
+
 		exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) +
 						 offset);
+		offset += ipv6_optlen(exthdr);
+		*nexthdr = &exthdr->nexthdr;
 	}
 
-	return offset;
+	return -EINVAL;
 }
 EXPORT_SYMBOL(ip6_find_1stfragopt);
 
--- a/net/ipv6/proc.c
+++ b/net/ipv6/proc.c
@@ -278,7 +278,7 @@ int snmp6_register_dev(struct inet6_dev *idev)
 	if (!net->mib.proc_net_devsnmp6)
 		return -ENOENT;
 
-	p = proc_create_data(idev->dev->name, S_IRUGO,
+	p = proc_net_create_data(idev->dev->name, S_IRUGO,
 			     net->mib.proc_net_devsnmp6,
 			     &snmp6_dev_seq_fops, idev);
 	if (!p)
@@ -309,7 +309,7 @@ static int __net_init ipv6_proc_init_net(struct net *net)
 	if (!proc_create("snmp6", S_IRUGO, net->proc_net, &snmp6_seq_fops))
 		goto proc_snmp6_fail;
 
-	net->mib.proc_net_devsnmp6 = proc_mkdir("dev_snmp6", net->proc_net);
+	net->mib.proc_net_devsnmp6 = proc_net_mkdir(net, "dev_snmp6", net->proc_net);
 	if (!net->mib.proc_net_devsnmp6)
 		goto proc_dev_snmp6_fail;
 	return 0;
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -63,11 +63,12 @@
 #include <linux/seq_file.h>
 #include <linux/export.h>
 
-static struct raw_hashinfo raw_v6_hashinfo = {
+struct raw_hashinfo raw_v6_hashinfo = {
 	.lock = __RW_LOCK_UNLOCKED(raw_v6_hashinfo.lock),
 };
+EXPORT_SYMBOL_GPL(raw_v6_hashinfo);
 
-static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
+struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 		unsigned short num, const struct in6_addr *loc_addr,
 		const struct in6_addr *rmt_addr, int dif)
 {
@@ -100,6 +101,7 @@ static struct sock *__raw_v6_lookup(struct net *net, struct sock *sk,
 found:
 	return sk;
 }
+EXPORT_SYMBOL_GPL(__raw_v6_lookup);
 
 /*
  *	0 - deliver
@@ -968,6 +970,11 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
 		return -EFAULT;
 
 	switch (optname) {
+	case IPV6_HDRINCL:
+		if (sk->sk_type != SOCK_RAW)
+			return -EINVAL;
+		inet_sk(sk)->hdrincl = !!val;
+		return 0;
 	case IPV6_CHECKSUM:
 		if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
 		    level == IPPROTO_IPV6) {
@@ -1012,7 +1019,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return ipv6_setsockopt(sk, level, optname, optval, optlen);
@@ -1033,7 +1041,8 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return compat_ipv6_setsockopt(sk, level, optname,
@@ -1053,6 +1062,9 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
 		return -EFAULT;
 
 	switch (optname) {
+	case IPV6_HDRINCL:
+		val = inet_sk(sk)->hdrincl;
+		break;
 	case IPV6_CHECKSUM:
 		/*
 		 * We allow getsockopt() for IPPROTO_IPV6-level
@@ -1090,7 +1102,8 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return ipv6_getsockopt(sk, level, optname, optval, optlen);
@@ -1111,7 +1124,8 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
 			return -EOPNOTSUPP;
 		return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
 	case SOL_IPV6:
-		if (optname == IPV6_CHECKSUM)
+		if (optname == IPV6_CHECKSUM ||
+		    optname == IPV6_HDRINCL)
 			break;
 	default:
 		return compat_ipv6_getsockopt(sk, level, optname,
@@ -1210,7 +1224,7 @@ struct proto rawv6_prot = {
 	.close		   = rawv6_close,
 	.destroy	   = raw6_destroy,
 	.connect	   = ip6_datagram_connect,
-	.disconnect	   = udp_disconnect,
+	.disconnect	   = __udp_disconnect,
 	.ioctl		   = rawv6_ioctl,
 	.init		   = rawv6_init_sk,
 	.setsockopt	   = rawv6_setsockopt,
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1168,7 +1168,7 @@ static struct dst_entry *ip6_route_input_lookup(struct net *net,
 	return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
 }
 
-void ip6_route_input(struct sk_buff *skb)
+void __ip6_route_input(struct sk_buff *skb, struct in6_addr *daddr)
 {
 	const struct ipv6hdr *iph = ipv6_hdr(skb);
 	struct net *net = dev_net(skb->dev);
@@ -1176,7 +1176,7 @@ void ip6_route_input(struct sk_buff *skb)
 	struct ip_tunnel_info *tun_info;
 	struct flowi6 fl6 = {
 		.flowi6_iif = skb->dev->ifindex,
-		.daddr = iph->daddr,
+		.daddr = *daddr,
 		.saddr = iph->saddr,
 		.flowlabel = ip6_flowinfo(iph),
 		.flowi6_mark = skb->mark,
@@ -1189,6 +1189,12 @@ void ip6_route_input(struct sk_buff *skb)
 	skb_dst_drop(skb);
 	skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
 }
+EXPORT_SYMBOL(__ip6_route_input);
+
+void ip6_route_input(struct sk_buff *skb)
+{
+	__ip6_route_input(skb, &ipv6_hdr(skb)->daddr);
+}
 
 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
 					     struct flowi6 *fl6, int flags)
@@ -3528,6 +3534,29 @@ static struct notifier_block ip6_route_dev_notifier = {
 	.priority = 0,
 };
 
+#if 0
+static void ip6_rt_dump_dst(void *o)
+{
+	struct rt6_info *r = (struct rt6_info *)o;
+
+	if (r->dst.flags & DST_FREE)
+		return;
+
+	printk("=== %p\n", o);
+	dst_dump_one(&r->dst);
+	printk("\tflags %x ref %d prot %d\n",
+			r->rt6i_flags, atomic_read(&r->rt6i_ref),
+			(int)r->rt6i_protocol);
+}
+#endif
+
+static void _ip6_rt_dump_dsts(void)
+{
+	printk("IPv6 dst cache:\n");
+	/*
+	 * FIXME: walk the dst slab once a slab walker is available:
+	 * slab_obj_walk(ip6_dst_ops_template.kmem_cachep, ip6_rt_dump_dst);
+	 */
+}
+
 int __init ip6_route_init(void)
 {
 	int ret;
@@ -3536,7 +3565,7 @@ int __init ip6_route_init(void)
 	ret = -ENOMEM;
 	ip6_dst_ops_template.kmem_cachep =
 		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
-				  SLAB_HWCACHE_ALIGN, NULL);
+				  SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 	if (!ip6_dst_ops_template.kmem_cachep)
 		goto out;
 
@@ -3598,6 +3627,7 @@ int __init ip6_route_init(void)
 		spin_lock_init(&ul->lock);
 	}
 
+	ip6_rt_dump_dsts = _ip6_rt_dump_dsts;
 out:
 	return ret;
 
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -35,6 +35,8 @@
 #include <linux/init.h>
 #include <linux/netfilter_ipv4.h>
 #include <linux/if_ether.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
 
 #include <net/sock.h>
 #include <net/snmp.h>
@@ -98,6 +100,9 @@ static struct ip_tunnel *ipip6_tunnel_lookup(struct net *net,
 	struct ip_tunnel *t;
 	struct sit_net *sitn = net_generic(net, sit_net_id);
 
+	if (sitn == NULL)
+		return NULL;
+
 	for_each_ip_tunnel_rcu(t, sitn->tunnels_r_l[h0 ^ h1]) {
 		if (local == t->parms.iph.saddr &&
 		    remote == t->parms.iph.daddr &&
@@ -303,8 +308,8 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
 	/* For simple GET or for root users,
 	 * we try harder to allocate.
 	 */
-	kp = (cmax <= 1 || capable(CAP_NET_ADMIN)) ?
-		kcalloc(cmax, sizeof(*kp), GFP_KERNEL) :
+	kp = (cmax <= 1 || ve_capable(CAP_NET_ADMIN)) ?
+		kcalloc(cmax, sizeof(*kp), GFP_KERNEL_ACCOUNT | __GFP_NOWARN) :
 		NULL;
 
 	rcu_read_lock();
@@ -317,7 +322,8 @@ static int ipip6_tunnel_get_prl(struct ip_tunnel *t,
 		 * For root users, retry allocating enough memory for
 		 * the answer.
 		 */
-		kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC);
+		kp = kcalloc(ca, sizeof(*kp), GFP_ATOMIC | __GFP_ACCOUNT |
+				__GFP_NOWARN);
 		if (!kp) {
 			ret = -ENOMEM;
 			goto out;
@@ -1469,6 +1475,9 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev,
 #endif
 	int err;
 
+	if (net_generic(net, sit_net_id) == NULL)
+		return -EACCES;
+
 	nt = netdev_priv(dev);
 
 	if (ipip6_netlink_encap_parms(data, &ipencap)) {
@@ -1717,6 +1726,9 @@ static int __net_init sit_init_net(struct net *net)
 	struct ip_tunnel *t;
 	int err;
 
+	if (!(net->owner_ve->features & VE_FEATURE_SIT))
+		return net_assign_generic(net, sit_net_id, NULL);
+
 	sitn->tunnels[0] = sitn->tunnels_wc;
 	sitn->tunnels[1] = sitn->tunnels_l;
 	sitn->tunnels[2] = sitn->tunnels_r;
@@ -1754,12 +1766,17 @@ err_alloc_dev:
 
 static void __net_exit sit_exit_net(struct net *net)
 {
+	struct sit_net *sitn = net_generic(net, sit_net_id);
 	LIST_HEAD(list);
 
+	if (sitn == NULL) /* no VE_FEATURE_SIT */
+		return;
+
 	rtnl_lock();
 	sit_destroy_tunnels(net, &list);
 	unregister_netdevice_many(&list);
 	rtnl_unlock();
+	net_assign_generic(net, sit_net_id, NULL);
 }
 
 static struct pernet_operations sit_net_ops = {
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1087,6 +1087,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 		newtp->af_specific = &tcp_sock_ipv6_mapped_specific;
 #endif
 
+		newnp->ipv6_mc_list = NULL;
 		newnp->ipv6_ac_list = NULL;
 		newnp->ipv6_fl_list = NULL;
 		newnp->pktoptions  = NULL;
@@ -1156,6 +1157,7 @@ static struct sock *tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
 	   First: no IPv4 options.
 	 */
 	newinet->inet_opt = NULL;
+	newnp->ipv6_mc_list = NULL;
 	newnp->ipv6_ac_list = NULL;
 	newnp->ipv6_fl_list = NULL;
 
@@ -1264,7 +1266,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 	if (skb->protocol == htons(ETH_P_IP))
 		return tcp_v4_do_rcv(sk, skb);
 
-	if (sk_filter(sk, skb))
+	if (tcp_filter(sk, skb))
 		goto discard;
 
 	/*
@@ -1303,7 +1305,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		tcp_rcv_established(sk, skb, tcp_hdr(skb), skb->len);
 		if (opt_skb)
 			goto ipv6_pktoptions;
-		return 0;
+		goto restore_context;
 	}
 
 	if (skb->len < tcp_hdrlen(skb) || tcp_checksum_complete(skb))
@@ -1325,7 +1327,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 				goto reset;
 			if (opt_skb)
 				__kfree_skb(opt_skb);
-			return 0;
+			goto restore_context;
 		}
 	} else
 		sock_rps_save_rxhash(sk, skb);
@@ -1334,6 +1336,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
 		goto reset;
 	if (opt_skb)
 		goto ipv6_pktoptions;
+
+restore_context:
 	return 0;
 
 reset:
@@ -1342,7 +1346,7 @@ discard:
 	if (opt_skb)
 		__kfree_skb(opt_skb);
 	kfree_skb(skb);
-	return 0;
+	goto restore_context;
 csum_err:
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_CSUMERRORS);
 	TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_INERRS);
@@ -1376,7 +1380,7 @@ ipv6_pktoptions:
 	}
 
 	kfree_skb(opt_skb);
-	return 0;
+	goto restore_context;
 }
 
 static int tcp_v6_rcv(struct sk_buff *skb)
@@ -1440,8 +1444,10 @@ process:
 		goto discard_and_relse;
 #endif
 
-	if (sk_filter(sk, skb))
+	if (tcp_filter(sk, skb))
 		goto discard_and_relse;
+	th = (const struct tcphdr *)skb->data;
+	hdr = ipv6_hdr(skb);
 
 	sk_mark_napi_id(sk, skb);
 	skb->dev = NULL;
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -52,6 +52,7 @@
 #include <linux/seq_file.h>
 #include <trace/events/skb.h>
 #include "udp_impl.h"
+#include <net/udp_memcontrol.h>
 
 static unsigned int udp6_ehashfn(struct net *net,
 				  const struct in6_addr *laddr,
@@ -1417,6 +1418,7 @@ void udpv6_destroy_sock(struct sock *sk)
 	}
 
 	inet6_destroy_sock(sk);
+	sock_release_memcg(sk);
 }
 
 /*
@@ -1530,6 +1532,7 @@ struct proto udpv6_prot = {
 	.connect	   = ip6_datagram_connect,
 	.disconnect	   = udp_disconnect,
 	.ioctl		   = udp_ioctl,
+	.init		   = udp_init_sock,
 	.destroy	   = udpv6_destroy_sock,
 	.setsockopt	   = udpv6_setsockopt,
 	.getsockopt	   = udpv6_getsockopt,
@@ -1553,6 +1556,9 @@ struct proto udpv6_prot = {
 	.compat_getsockopt = compat_udpv6_getsockopt,
 #endif
 	.clear_sk	   = udp_v6_clear_sk,
+#ifdef CONFIG_MEMCG_KMEM
+	.proto_cgroup		= udp_proto_cgroup,
+#endif
 };
 
 static struct inet_protosw udpv6_protosw = {
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -29,6 +29,7 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 	u8 frag_hdr_sz = sizeof(struct frag_hdr);
 	__wsum csum;
 	int tnl_hlen;
+	int err;
 
 	mss = skb_shinfo(skb)->gso_size;
 	if (unlikely(skb->len <= mss))
@@ -84,7 +85,10 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
 		/* Find the unfragmentable header and shift it left by frag_hdr_sz
 		 * bytes to insert fragment header.
 		 */
-		unfrag_ip6hlen = ip6_find_1stfragopt(skb, &prevhdr);
+		err = ip6_find_1stfragopt(skb, &prevhdr);
+		if (err < 0)
+			return ERR_PTR(err);
+		unfrag_ip6hlen = err;
 		nexthdr = *prevhdr;
 		*prevhdr = NEXTHDR_FRAGMENT;
 		unfrag_len = (skb_network_header(skb) - skb_mac_header(skb)) +
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -336,7 +336,7 @@ static int l2tp_ip_disconnect(struct sock *sk, int flags)
 	if (sock_flag(sk, SOCK_ZAPPED))
 		return 0;
 
-	return udp_disconnect(sk, flags);
+	return __udp_disconnect(sk, flags);
 }
 
 static int l2tp_ip_getname(struct socket *sock, struct sockaddr *uaddr,
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -402,7 +402,7 @@ static int l2tp_ip6_disconnect(struct sock *sk, int flags)
 	if (sock_flag(sk, SOCK_ZAPPED))
 		return 0;
 
-	return udp_disconnect(sk, flags);
+	return __udp_disconnect(sk, flags);
 }
 
 static int l2tp_ip6_getname(struct socket *sock, struct sockaddr *uaddr,
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -67,7 +67,6 @@
 #include <linux/kernel.h>
 #include <linux/spinlock.h>
 #include <linux/kthread.h>
-#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/jiffies.h>
@@ -98,6 +97,8 @@
 #include <net/udp.h>
 #include <net/xfrm.h>
 #include <net/inet_common.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/ve.h>
 
 #include <asm/byteorder.h>
 #include <linux/atomic.h>
@@ -550,6 +551,9 @@ static int pppol2tp_create(struct net *net, struct socket *sock)
 	int error = -ENOMEM;
 	struct sock *sk;
 
+	if (!(net->owner_ve->features & VE_FEATURE_PPP))
+		return -EACCES;
+
 	sk = sk_alloc(net, PF_PPPOX, GFP_KERNEL, &pppol2tp_sk_proto);
 	if (!sk)
 		goto out;
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -1419,6 +1419,12 @@ config NETFILTER_XT_MATCH_U32
 
 	  Details and examples are in the kernel module source.
 
+config NETFILTER_XT_MATCH_WDOG_TMO
+	tristate '"wdog_tmo" watchdog timer match'
+	depends on NETFILTER_ADVANCED && NETFILTER_NETLINK && FENCE_WATCHDOG
+	help
+	  This option selects the watchdog timer match module.
+
 endif # NETFILTER_XTABLES
 
 endmenu
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -167,6 +167,7 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_STRING) += xt_string.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TCPMSS) += xt_tcpmss.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_TIME) += xt_time.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_U32) += xt_u32.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_WDOG_TMO) += xt_wdog_tmo.o
 
 # ipset
 obj-$(CONFIG_IP_SET) += ipset/
--- a/net/netfilter/core.c
+++ b/net/netfilter/core.c
@@ -74,6 +74,8 @@ int nf_register_hook(struct nf_hook_ops *reg)
 	struct nf_hook_ops *elem;
 	int err;
 
+	BUG_ON(!ve_is_super(get_exec_env()));
+
 	err = mutex_lock_interruptible(&nf_hook_mutex);
 	if (err < 0)
 		return err;
@@ -92,6 +94,8 @@ EXPORT_SYMBOL(nf_register_hook);
 
 void nf_unregister_hook(struct nf_hook_ops *reg)
 {
+	BUG_ON(!ve_is_super(get_exec_env()));
+
 	mutex_lock(&nf_hook_mutex);
 	list_del_rcu(&reg->list);
 	mutex_unlock(&nf_hook_mutex);
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -261,7 +261,8 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
 }
 
 static int
-bitmap_ip_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		 u32 flags)
 {
 	struct bitmap_ip *map;
 	u32 first_ip, last_ip, hosts, cadt_flags = 0;
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -352,7 +352,7 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
 }
 
 static int
-bitmap_ipmac_create(struct ip_set *set, struct nlattr *tb[],
+bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
 		    u32 flags)
 {
 	u32 first_ip, last_ip, cadt_flags = 0;
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -248,7 +248,8 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
 }
 
 static int
-bitmap_port_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		   u32 flags)
 {
 	struct bitmap_port *map;
 	u16 first_port, last_port;
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -17,6 +17,8 @@
 #include <linux/spinlock.h>
 #include <linux/rculist.h>
 #include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
 
 #include <linux/netfilter.h>
 #include <linux/netfilter/x_tables.h>
@@ -27,8 +29,17 @@ static LIST_HEAD(ip_set_type_list);		/* all registered set types */
 static DEFINE_MUTEX(ip_set_type_mutex);		/* protects ip_set_type_list */
 static DEFINE_RWLOCK(ip_set_ref_lock);		/* protects the set refs */
 
-static struct ip_set * __rcu *ip_set_list;	/* all individual sets */
-static ip_set_id_t ip_set_max = CONFIG_IP_SET_MAX; /* max number of sets */
+struct ip_set_net {
+	struct ip_set * __rcu *ip_set_list;	/* all individual sets */
+	ip_set_id_t	ip_set_max;	/* max number of sets */
+	int		is_deleted;	/* deleted by ip_set_net_exit */
+};
+static int ip_set_net_id __read_mostly;
+
+static inline struct ip_set_net *ip_set_pernet(struct net *net)
+{
+	return net_generic(net, ip_set_net_id);
+}
 
 #define IP_SET_INC	64
 #define STREQ(a, b)	(strncmp(a, b, IPSET_MAXNAMELEN) == 0)
@@ -45,8 +56,8 @@ MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_IPSET);
 /* When the nfnl mutex is held: */
 #define ip_set_dereference(p)		\
 	rcu_dereference_protected(p, 1)
-#define ip_set(id)		\
-	ip_set_dereference(ip_set_list)[id]
+#define ip_set(inst, id)			\
+	ip_set_dereference((inst)->ip_set_list)[id]
 
 /*
  * The set types are implemented in modules and registered set types
@@ -374,13 +385,14 @@ __ip_set_put(struct ip_set *set)
  */
 
 static inline struct ip_set *
-ip_set_rcu_get(ip_set_id_t index)
+ip_set_rcu_get(struct net *net, ip_set_id_t index)
 {
 	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
 
 	rcu_read_lock();
 	/* ip_set_list itself needs to be protected */
-	set = rcu_dereference(ip_set_list)[index];
+	set = rcu_dereference(inst->ip_set_list)[index];
 	rcu_read_unlock();
 
 	return set;
@@ -390,7 +402,8 @@ int
 ip_set_test(ip_set_id_t index, const struct sk_buff *skb,
 	    const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(index);
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
 	int ret = 0;
 
 	BUG_ON(set == NULL);
@@ -428,7 +441,8 @@ int
 ip_set_add(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(index);
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
 	int ret;
 
 	BUG_ON(set == NULL);
@@ -450,7 +464,8 @@ int
 ip_set_del(ip_set_id_t index, const struct sk_buff *skb,
 	   const struct xt_action_param *par, struct ip_set_adt_opt *opt)
 {
-	struct ip_set *set = ip_set_rcu_get(index);
+	struct ip_set *set = ip_set_rcu_get(
+			dev_net(par->in ? par->in : par->out), index);
 	int ret = 0;
 
 	BUG_ON(set == NULL);
@@ -474,14 +489,15 @@ EXPORT_SYMBOL_GPL(ip_set_del);
  *
  */
 ip_set_id_t
-ip_set_get_byname(const char *name, struct ip_set **set)
+ip_set_get_byname(struct net *net, const char *name, struct ip_set **set)
 {
 	ip_set_id_t i, index = IPSET_INVALID_ID;
 	struct ip_set *s;
+	struct ip_set_net *inst = ip_set_pernet(net);
 
 	rcu_read_lock();
-	for (i = 0; i < ip_set_max; i++) {
-		s = rcu_dereference(ip_set_list)[i];
+	for (i = 0; i < inst->ip_set_max; i++) {
+		s = rcu_dereference(inst->ip_set_list)[i];
 		if (s != NULL && STREQ(s->name, name)) {
 			__ip_set_get(s);
 			index = i;
@@ -501,17 +517,26 @@ EXPORT_SYMBOL_GPL(ip_set_get_byname);
  * to be valid, after calling this function.
  *
  */
-void
-ip_set_put_byindex(ip_set_id_t index)
+
+static inline void
+__ip_set_put_byindex(struct ip_set_net *inst, ip_set_id_t index)
 {
 	struct ip_set *set;
 
 	rcu_read_lock();
-	set = rcu_dereference(ip_set_list)[index];
+	set = rcu_dereference(inst->ip_set_list)[index];
 	if (set != NULL)
 		__ip_set_put(set);
 	rcu_read_unlock();
 }
+
+void
+ip_set_put_byindex(struct net *net, ip_set_id_t index)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+
+	__ip_set_put_byindex(inst, index);
+}
 EXPORT_SYMBOL_GPL(ip_set_put_byindex);
 
 /*
@@ -522,9 +547,9 @@ EXPORT_SYMBOL_GPL(ip_set_put_byindex);
  *
  */
 const char *
-ip_set_name_byindex(ip_set_id_t index)
+ip_set_name_byindex(struct net *net, ip_set_id_t index)
 {
-	const struct ip_set *set = ip_set_rcu_get(index);
+	const struct ip_set *set = ip_set_rcu_get(net, index);
 
 	BUG_ON(set == NULL);
 	BUG_ON(set->ref == 0);
@@ -546,15 +571,16 @@ EXPORT_SYMBOL_GPL(ip_set_name_byindex);
  * The nfnl mutex is used in the function.
  */
 ip_set_id_t
-ip_set_nfnl_get_byindex(ip_set_id_t index)
+ip_set_nfnl_get_byindex(struct net *net, ip_set_id_t index)
 {
 	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
 
-	if (index > ip_set_max)
+	if (index > inst->ip_set_max)
 		return IPSET_INVALID_ID;
 
 	nfnl_lock(NFNL_SUBSYS_IPSET);
-	set = ip_set(index);
+	set = ip_set(inst, index);
 	if (set)
 		__ip_set_get(set);
 	else
@@ -573,13 +599,17 @@ EXPORT_SYMBOL_GPL(ip_set_nfnl_get_byindex);
  * The nfnl mutex is used in the function.
  */
 void
-ip_set_nfnl_put(ip_set_id_t index)
+ip_set_nfnl_put(struct net *net, ip_set_id_t index)
 {
 	struct ip_set *set;
+	struct ip_set_net *inst = ip_set_pernet(net);
+
 	nfnl_lock(NFNL_SUBSYS_IPSET);
-	set = ip_set(index);
-	if (set != NULL)
-		__ip_set_put(set);
+	if (!inst->is_deleted) { /* already deleted from ip_set_net_exit() */
+		set = ip_set(inst, index);
+		if (set != NULL)
+			__ip_set_put(set);
+	}
 	nfnl_unlock(NFNL_SUBSYS_IPSET);
 }
 EXPORT_SYMBOL_GPL(ip_set_nfnl_put);
@@ -637,14 +667,14 @@ static const struct nla_policy ip_set_create_policy[IPSET_ATTR_CMD_MAX + 1] = {
 };
 
 static struct ip_set *
-find_set_and_id(const char *name, ip_set_id_t *id)
+find_set_and_id(struct ip_set_net *inst, const char *name, ip_set_id_t *id)
 {
 	struct ip_set *set = NULL;
 	ip_set_id_t i;
 
 	*id = IPSET_INVALID_ID;
-	for (i = 0; i < ip_set_max; i++) {
-		set = ip_set(i);
+	for (i = 0; i < inst->ip_set_max; i++) {
+		set = ip_set(inst, i);
 		if (set != NULL && STREQ(set->name, name)) {
 			*id = i;
 			break;
@@ -654,22 +684,23 @@ find_set_and_id(const char *name, ip_set_id_t *id)
 }
 
 static inline struct ip_set *
-find_set(const char *name)
+find_set(struct ip_set_net *inst, const char *name)
 {
 	ip_set_id_t id;
 
-	return find_set_and_id(name, &id);
+	return find_set_and_id(inst, name, &id);
 }
 
 static int
-find_free_id(const char *name, ip_set_id_t *index, struct ip_set **set)
+find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
+	     struct ip_set **set)
 {
 	struct ip_set *s;
 	ip_set_id_t i;
 
 	*index = IPSET_INVALID_ID;
-	for (i = 0;  i < ip_set_max; i++) {
-		s = ip_set(i);
+	for (i = 0;  i < inst->ip_set_max; i++) {
+		s = ip_set(inst, i);
 		if (s == NULL) {
 			if (*index == IPSET_INVALID_ID)
 				*index = i;
@@ -698,6 +729,8 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
 {
+	struct net *net = sock_net(ctnl);
+	struct ip_set_net *inst = ip_set_pernet(net);
 	struct ip_set *set, *clash = NULL;
 	ip_set_id_t index = IPSET_INVALID_ID;
 	struct nlattr *tb[IPSET_ATTR_CREATE_MAX+1] = {};
@@ -756,7 +789,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		goto put_out;
 	}
 
-	ret = set->type->create(set, tb, flags);
+	ret = set->type->create(net, set, tb, flags);
 	if (ret != 0)
 		goto put_out;
 
@@ -767,7 +800,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	 * by the nfnl mutex. Find the first free index in ip_set_list
 	 * and check clashing.
 	 */
-	ret = find_free_id(set->name, &index, &clash);
+	ret = find_free_id(inst, set->name, &index, &clash);
 	if (ret == -EEXIST) {
 		/* If this is the same set and requested, ignore error */
 		if ((flags & IPSET_FLAG_EXIST) &&
@@ -780,9 +813,9 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		goto cleanup;
 	} else if (ret == -IPSET_ERR_MAX_SETS) {
 		struct ip_set **list, **tmp;
-		ip_set_id_t i = ip_set_max + IP_SET_INC;
+		ip_set_id_t i = inst->ip_set_max + IP_SET_INC;
 
-		if (i < ip_set_max || i == IPSET_INVALID_ID)
+		if (i < inst->ip_set_max || i == IPSET_INVALID_ID)
 			/* Wraparound */
 			goto cleanup;
 
@@ -790,14 +823,14 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 		if (!list)
 			goto cleanup;
 		/* nfnl mutex is held, both lists are valid */
-		tmp = ip_set_dereference(ip_set_list);
-		memcpy(list, tmp, sizeof(struct ip_set *) * ip_set_max);
-		rcu_assign_pointer(ip_set_list, list);
+		tmp = ip_set_dereference(inst->ip_set_list);
+		memcpy(list, tmp, sizeof(struct ip_set *) * inst->ip_set_max);
+		rcu_assign_pointer(inst->ip_set_list, list);
 		/* Make sure all current packets have passed through */
 		synchronize_net();
 		/* Use new list */
-		index = ip_set_max;
-		ip_set_max = i;
+		index = inst->ip_set_max;
+		inst->ip_set_max = i;
 		kfree(tmp);
 		ret = 0;
 	} else if (ret)
@@ -807,7 +840,7 @@ ip_set_create(struct sock *ctnl, struct sk_buff *skb,
 	 * Finally! Add our shiny new set to the list, and be done.
 	 */
 	pr_debug("create: '%s' created with index %u!\n", set->name, index);
-	ip_set(index) = set;
+	ip_set(inst, index) = set;
 
 	return ret;
 
@@ -830,12 +863,12 @@ ip_set_setname_policy[IPSET_ATTR_CMD_MAX + 1] = {
 };
 
 static void
-ip_set_destroy_set(ip_set_id_t index)
+ip_set_destroy_set(struct ip_set_net *inst, ip_set_id_t index)
 {
-	struct ip_set *set = ip_set(index);
+	struct ip_set *set = ip_set(inst, index);
 
 	pr_debug("set: %s\n",  set->name);
-	ip_set(index) = NULL;
+	ip_set(inst, index) = NULL;
 
 	/* Must call it without holding any lock */
 	set->variant->destroy(set);
@@ -848,6 +881,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	       const struct nlmsghdr *nlh,
 	       const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *s;
 	ip_set_id_t i;
 	int ret = 0;
@@ -867,21 +901,22 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 	 */
 	read_lock_bh(&ip_set_ref_lock);
 	if (!attr[IPSET_ATTR_SETNAME]) {
-		for (i = 0; i < ip_set_max; i++) {
-			s = ip_set(i);
+		for (i = 0; i < inst->ip_set_max; i++) {
+			s = ip_set(inst, i);
 			if (s != NULL && s->ref) {
 				ret = -IPSET_ERR_BUSY;
 				goto out;
 			}
 		}
 		read_unlock_bh(&ip_set_ref_lock);
-		for (i = 0; i < ip_set_max; i++) {
-			s = ip_set(i);
+		for (i = 0; i < inst->ip_set_max; i++) {
+			s = ip_set(inst, i);
 			if (s != NULL)
-				ip_set_destroy_set(i);
+				ip_set_destroy_set(inst, i);
 		}
 	} else {
-		s = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &i);
+		s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+				    &i);
 		if (s == NULL) {
 			ret = -ENOENT;
 			goto out;
@@ -891,7 +926,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
 		}
 		read_unlock_bh(&ip_set_ref_lock);
 
-		ip_set_destroy_set(i);
+		ip_set_destroy_set(inst, i);
 	}
 	return 0;
 out:
@@ -916,6 +951,7 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh,
 	     const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *s;
 	ip_set_id_t i;
 
@@ -923,13 +959,13 @@ ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
 		return -IPSET_ERR_PROTOCOL;
 
 	if (!attr[IPSET_ATTR_SETNAME]) {
-		for (i = 0; i < ip_set_max; i++) {
-			s = ip_set(i);
+		for (i = 0; i < inst->ip_set_max; i++) {
+			s = ip_set(inst, i);
 			if (s != NULL)
 				ip_set_flush_set(s);
 		}
 	} else {
-		s = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+		s = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 		if (s == NULL)
 			return -ENOENT;
 
@@ -955,6 +991,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set, *s;
 	const char *name2;
 	ip_set_id_t i;
@@ -965,7 +1002,7 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 		     attr[IPSET_ATTR_SETNAME2] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -976,8 +1013,8 @@ ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
 	}
 
 	name2 = nla_data(attr[IPSET_ATTR_SETNAME2]);
-	for (i = 0; i < ip_set_max; i++) {
-		s = ip_set(i);
+	for (i = 0; i < inst->ip_set_max; i++) {
+		s = ip_set(inst, i);
 		if (s != NULL && STREQ(s->name, name2)) {
 			ret = -IPSET_ERR_EXIST_SETNAME2;
 			goto out;
@@ -1004,6 +1041,7 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *from, *to;
 	ip_set_id_t from_id, to_id;
 	char from_name[IPSET_MAXNAMELEN];
@@ -1013,11 +1051,13 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 		     attr[IPSET_ATTR_SETNAME2] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	from = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME]), &from_id);
+	from = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
+			       &from_id);
 	if (from == NULL)
 		return -ENOENT;
 
-	to = find_set_and_id(nla_data(attr[IPSET_ATTR_SETNAME2]), &to_id);
+	to = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME2]),
+			     &to_id);
 	if (to == NULL)
 		return -IPSET_ERR_EXIST_SETNAME2;
 
@@ -1034,8 +1074,8 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 
 	write_lock_bh(&ip_set_ref_lock);
 	swap(from->ref, to->ref);
-	ip_set(from_id) = to;
-	ip_set(to_id) = from;
+	ip_set(inst, from_id) = to;
+	ip_set(inst, to_id) = from;
 	write_unlock_bh(&ip_set_ref_lock);
 
 	return 0;
@@ -1054,9 +1094,10 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
 static int
 ip_set_dump_done(struct netlink_callback *cb)
 {
+	struct ip_set_net *inst = (struct ip_set_net *)cb->data;
 	if (cb->args[2]) {
-		pr_debug("release set %s\n", ip_set(cb->args[1])->name);
-		ip_set_put_byindex((ip_set_id_t) cb->args[1]);
+		pr_debug("release set %s\n", ip_set(inst, cb->args[1])->name);
+		__ip_set_put_byindex(inst, (ip_set_id_t) cb->args[1]);
 	}
 	return 0;
 }
@@ -1082,6 +1123,7 @@ dump_init(struct netlink_callback *cb)
 	struct nlattr *attr = (void *)nlh + min_len;
 	u32 dump_type;
 	ip_set_id_t index;
+	struct ip_set_net *inst = (struct ip_set_net *)cb->data;
 
 	/* Second pass, so parser can't fail */
 	nla_parse(cda, IPSET_ATTR_CMD_MAX,
@@ -1095,7 +1137,7 @@ dump_init(struct netlink_callback *cb)
 	if (cda[IPSET_ATTR_SETNAME]) {
 		struct ip_set *set;
 
-		set = find_set_and_id(nla_data(cda[IPSET_ATTR_SETNAME]),
+		set = find_set_and_id(inst, nla_data(cda[IPSET_ATTR_SETNAME]),
 				      &index);
 		if (set == NULL)
 			return -ENOENT;
@@ -1123,6 +1165,7 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 	unsigned int flags = NETLINK_CB(cb->skb).portid ? NLM_F_MULTI : 0;
 	u32 dump_type, dump_flags;
 	int ret = 0;
+	struct ip_set_net *inst = (struct ip_set_net *)cb->data;
 
 	if (!cb->args[0]) {
 		ret = dump_init(cb);
@@ -1136,18 +1179,18 @@ ip_set_dump_start(struct sk_buff *skb, struct netlink_callback *cb)
 		}
 	}
 
-	if (cb->args[1] >= ip_set_max)
+	if (cb->args[1] >= inst->ip_set_max)
 		goto out;
 
 	dump_type = DUMP_TYPE(cb->args[0]);
 	dump_flags = DUMP_FLAGS(cb->args[0]);
-	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : ip_set_max;
+	max = dump_type == DUMP_ONE ? cb->args[1] + 1 : inst->ip_set_max;
 dump_last:
 	pr_debug("args[0]: %u %u args[1]: %ld\n",
 		 dump_type, dump_flags, cb->args[1]);
 	for (; cb->args[1] < max; cb->args[1]++) {
 		index = (ip_set_id_t) cb->args[1];
-		set = ip_set(index);
+		set = ip_set(inst, index);
 		if (set == NULL) {
 			if (dump_type == DUMP_ONE) {
 				ret = -ENOENT;
@@ -1225,8 +1268,8 @@ next_set:
 release_refcount:
 	/* If there was an error or set is done, release set */
 	if (ret || !cb->args[2]) {
-		pr_debug("release set %s\n", ip_set(index)->name);
-		ip_set_put_byindex(index);
+		pr_debug("release set %s\n", ip_set(inst, index)->name);
+		__ip_set_put_byindex(inst, index);
 		cb->args[2] = 0;
 	}
 out:
@@ -1244,6 +1287,8 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+
 	if (unlikely(protocol_failed(attr)))
 		return -IPSET_ERR_PROTOCOL;
 
@@ -1251,6 +1296,7 @@ ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
 		struct netlink_dump_control c = {
 			.dump = ip_set_dump_start,
 			.done = ip_set_dump_done,
+			.data = (void *)inst
 		};
 		return netlink_dump_start(ctnl, skb, nlh, &c);
 	}
@@ -1329,6 +1375,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
 	const struct nlattr *nla;
@@ -1347,7 +1394,7 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
 		       attr[IPSET_ATTR_LINENO] == NULL))))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1383,6 +1430,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
 	    const struct nlmsghdr *nlh,
 	    const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
 	const struct nlattr *nla;
@@ -1401,7 +1449,7 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
 		       attr[IPSET_ATTR_LINENO] == NULL))))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1437,6 +1485,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 	     const struct nlmsghdr *nlh,
 	     const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	struct ip_set *set;
 	struct nlattr *tb[IPSET_ATTR_ADT_MAX+1] = {};
 	int ret = 0;
@@ -1447,7 +1496,7 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
 		     !flag_nested(attr[IPSET_ATTR_DATA])))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1473,6 +1522,7 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
 	      const struct nlmsghdr *nlh,
 	      const struct nlattr * const attr[])
 {
+	struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
 	const struct ip_set *set;
 	struct sk_buff *skb2;
 	struct nlmsghdr *nlh2;
@@ -1482,7 +1532,7 @@ ip_set_header(struct sock *ctnl, struct sk_buff *skb,
 		     attr[IPSET_ATTR_SETNAME] == NULL))
 		return -IPSET_ERR_PROTOCOL;
 
-	set = find_set(nla_data(attr[IPSET_ATTR_SETNAME]));
+	set = find_set(inst, nla_data(attr[IPSET_ATTR_SETNAME]));
 	if (set == NULL)
 		return -ENOENT;
 
@@ -1707,8 +1757,10 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 	unsigned int *op;
 	void *data;
 	int copylen = *len, ret = 0;
+	struct net *net = sock_net(sk);
+	struct ip_set_net *inst = ip_set_pernet(net);
 
-	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
 		return -EPERM;
 	if (optval != SO_IP_SET)
 		return -EBADF;
@@ -1757,7 +1809,7 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		}
 		req_get->set.name[IPSET_MAXNAMELEN - 1] = '\0';
 		nfnl_lock(NFNL_SUBSYS_IPSET);
-		find_set_and_id(req_get->set.name, &id);
+		find_set_and_id(inst, req_get->set.name, &id);
 		req_get->set.index = id;
 		nfnl_unlock(NFNL_SUBSYS_IPSET);
 		goto copy;
@@ -1767,12 +1819,12 @@ ip_set_sockfn_get(struct sock *sk, int optval, void __user *user, int *len)
 		struct ip_set *set;
 
 		if (*len != sizeof(struct ip_set_req_get_set) ||
-		    req_get->set.index >= ip_set_max) {
+		    req_get->set.index >= inst->ip_set_max) {
 			ret = -EINVAL;
 			goto done;
 		}
 		nfnl_lock(NFNL_SUBSYS_IPSET);
-		set = ip_set(req_get->set.index);
+		set = ip_set(inst, req_get->set.index);
 		strncpy(req_get->set.name, set ? set->name : "",
 			IPSET_MAXNAMELEN);
 		nfnl_unlock(NFNL_SUBSYS_IPSET);
@@ -1801,49 +1853,82 @@ static struct nf_sockopt_ops so_set __read_mostly = {
 	.owner		= THIS_MODULE,
 };
 
-static int __init
-ip_set_init(void)
+static int __net_init
+ip_set_net_init(struct net *net)
 {
+	struct ip_set_net *inst = ip_set_pernet(net);
 	struct ip_set **list;
-	int ret;
 
-	if (max_sets)
-		ip_set_max = max_sets;
-	if (ip_set_max >= IPSET_INVALID_ID)
-		ip_set_max = IPSET_INVALID_ID - 1;
+	inst->ip_set_max = max_sets ? max_sets : CONFIG_IP_SET_MAX;
+	if (inst->ip_set_max >= IPSET_INVALID_ID)
+		inst->ip_set_max = IPSET_INVALID_ID - 1;
 
-	list = kzalloc(sizeof(struct ip_set *) * ip_set_max, GFP_KERNEL);
+	list = kzalloc(sizeof(struct ip_set *) * inst->ip_set_max, GFP_KERNEL);
 	if (!list)
 		return -ENOMEM;
+	inst->is_deleted = 0;
+	rcu_assign_pointer(inst->ip_set_list, list);
+	return 0;
+}
+
+static void __net_exit
+ip_set_net_exit(struct net *net)
+{
+	struct ip_set_net *inst = ip_set_pernet(net);
+	struct ip_set *set = NULL;
+	ip_set_id_t i;
 
-	rcu_assign_pointer(ip_set_list, list);
-	ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
+	inst->is_deleted = 1; /* flag for ip_set_nfnl_put */
+
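+	/* Tear down every set left in this namespace before freeing the array */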
+	for (i = 0; i < inst->ip_set_max; i++) {
+		set = ip_set(inst, i);
+		if (set != NULL)
+			ip_set_destroy_set(inst, i);
+	}
+	kfree(rcu_dereference_protected(inst->ip_set_list, 1));
+}
+
+static struct pernet_operations ip_set_net_ops = {
+	.init	= ip_set_net_init,
+	.exit	= ip_set_net_exit,
+	.id	= &ip_set_net_id,
+	.size	= sizeof(struct ip_set_net)
+};
+
+static int __init
+ip_set_init(void)
+{
+	int ret = nfnetlink_subsys_register(&ip_set_netlink_subsys);
 	if (ret != 0) {
 		pr_err("ip_set: cannot register with nfnetlink.\n");
-		kfree(list);
 		return ret;
 	}
 	ret = nf_register_sockopt(&so_set);
 	if (ret != 0) {
 		pr_err("SO_SET registry failed: %d\n", ret);
 		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-		kfree(list);
 		return ret;
 	}
-
-	pr_notice("ip_set: protocol %u\n", IPSET_PROTOCOL);
+	ret = register_pernet_subsys(&ip_set_net_ops);
+	if (ret) {
+		pr_err("ip_set: cannot register pernet_subsys.\n");
+		nf_unregister_sockopt(&so_set);
+		nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
+		return ret;
+	}
+	pr_info("ip_set: protocol %u\n", IPSET_PROTOCOL);
 	return 0;
 }
 
 static void __exit
 ip_set_fini(void)
 {
-	struct ip_set **list = rcu_dereference_protected(ip_set_list, 1);
-
-	/* There can't be any existing set */
+	unregister_pernet_subsys(&ip_set_net_ops);
 	nf_unregister_sockopt(&so_set);
 	nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
-	kfree(list);
 	pr_debug("these are the famous last words\n");
 }
 
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -946,7 +946,8 @@ static const struct ip_set_type_variant mtype_variant = {
 
 #ifdef IP_SET_EMIT_CREATE
 static int
-TOKEN(HTYPE, _create)(struct ip_set *set, struct nlattr *tb[], u32 flags)
+TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
+		      struct nlattr *tb[], u32 flags)
 {
 	u32 hashsize = IPSET_DEFAULT_HASHSIZE, maxelem = IPSET_DEFAULT_MAXELEM;
 	u32 cadt_flags = 0;
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -63,6 +63,7 @@ struct list_set {
 	u32 size;		/* size of set list array */
 	u32 timeout;		/* timeout value */
 	struct timer_list gc;	/* garbage collection */
+	struct net *net;	/* namespace */
 	struct set_elem members[0]; /* the set members */
 };
 
@@ -204,13 +205,13 @@ list_set_add(struct ip_set *set, u32 i, struct set_adt_elem *d,
 	if (e->id != IPSET_INVALID_ID) {
 		if (i == map->size - 1)
 			/* Last element replaced: e.g. add new,before,last */
-			ip_set_put_byindex(e->id);
+			ip_set_put_byindex(map->net, e->id);
 		else {
 			struct set_elem *x = list_set_elem(map, map->size - 1);
 
 			/* Last element pushed off */
 			if (x->id != IPSET_INVALID_ID)
-				ip_set_put_byindex(x->id);
+				ip_set_put_byindex(map->net, x->id);
 			memmove(list_set_elem(map, i + 1), e,
 				map->dsize * (map->size - (i + 1)));
 		}
@@ -230,7 +231,7 @@ list_set_del(struct ip_set *set, u32 i)
 	struct list_set *map = set->data;
 	struct set_elem *e = list_set_elem(map, i);
 
-	ip_set_put_byindex(e->id);
+	ip_set_put_byindex(map->net, e->id);
 
 	if (i < map->size - 1)
 		memmove(e, list_set_elem(map, i + 1),
@@ -324,7 +325,7 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
 		if (SET_WITH_COUNTER(set))
 			ip_set_init_counter(ext_counter(e, map), ext);
 		/* Set is already added to the list */
-		ip_set_put_byindex(d->id);
+		ip_set_put_byindex(map->net, d->id);
 		return 0;
 	}
 insert:
@@ -403,7 +404,7 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 	ret = ip_set_get_extensions(set, tb, &ext);
 	if (ret)
 		return ret;
-	e.id = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAME]), &s);
+	e.id = ip_set_get_byname(map->net, nla_data(tb[IPSET_ATTR_NAME]), &s);
 	if (e.id == IPSET_INVALID_ID)
 		return -IPSET_ERR_NAME;
 	/* "Loop detection" */
@@ -423,7 +424,8 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 	}
 
 	if (tb[IPSET_ATTR_NAMEREF]) {
-		e.refid = ip_set_get_byname(nla_data(tb[IPSET_ATTR_NAMEREF]),
+		e.refid = ip_set_get_byname(map->net,
+					    nla_data(tb[IPSET_ATTR_NAMEREF]),
 					    &s);
 		if (e.refid == IPSET_INVALID_ID) {
 			ret = -IPSET_ERR_NAMEREF;
@@ -439,9 +441,9 @@ list_set_uadt(struct ip_set *set, struct nlattr *tb[],
 
 finish:
 	if (e.refid != IPSET_INVALID_ID)
-		ip_set_put_byindex(e.refid);
+		ip_set_put_byindex(map->net, e.refid);
 	if (adt != IPSET_ADD || ret)
-		ip_set_put_byindex(e.id);
+		ip_set_put_byindex(map->net, e.id);
 
 	return ip_set_eexist(ret, flags) ? 0 : ret;
 }
@@ -456,7 +458,7 @@ list_set_flush(struct ip_set *set)
 	for (i = 0; i < map->size; i++) {
 		e = list_set_elem(map, i);
 		if (e->id != IPSET_INVALID_ID) {
-			ip_set_put_byindex(e->id);
+			ip_set_put_byindex(map->net, e->id);
 			e->id = IPSET_INVALID_ID;
 		}
 	}
@@ -530,7 +532,7 @@ list_set_list(const struct ip_set *set,
 				goto nla_put_failure;
 		}
 		if (nla_put_string(skb, IPSET_ATTR_NAME,
-				   ip_set_name_byindex(e->id)))
+				   ip_set_name_byindex(map->net, e->id)))
 			goto nla_put_failure;
 		if (SET_WITH_TIMEOUT(set) &&
 		    nla_put_net32(skb, IPSET_ATTR_TIMEOUT,
@@ -613,7 +615,7 @@ list_set_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
 /* Create list:set type of sets */
 
 static struct list_set *
-init_list_set(struct ip_set *set, u32 size, size_t dsize,
+init_list_set(struct net *net, struct ip_set *set, u32 size, size_t dsize,
 	      unsigned long timeout)
 {
 	struct list_set *map;
@@ -625,6 +627,7 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize,
 		return NULL;
 
 	map->size = size;
+	map->net = net;
 	map->dsize = dsize;
 	map->timeout = timeout;
 	set->data = map;
@@ -638,7 +641,8 @@ init_list_set(struct ip_set *set, u32 size, size_t dsize,
 }
 
 static int
-list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
+list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
+		u32 flags)
 {
 	struct list_set *map;
 	u32 size = IP_SET_LIST_DEFAULT_SIZE, cadt_flags = 0;
@@ -662,7 +666,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 	if (cadt_flags & IPSET_FLAG_WITH_COUNTERS) {
 		set->extensions |= IPSET_EXT_COUNTER;
 		if (tb[IPSET_ATTR_TIMEOUT]) {
-			map = init_list_set(set, size,
+			map = init_list_set(net, set, size,
 					sizeof(struct setct_elem), timeout);
 			if (!map)
 				return -ENOMEM;
@@ -673,7 +677,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 				offsetof(struct setct_elem, counter);
 			list_set_gc_init(set, list_set_gc);
 		} else {
-			map = init_list_set(set, size,
+			map = init_list_set(net, set, size,
 					    sizeof(struct setc_elem), 0);
 			if (!map)
 				return -ENOMEM;
@@ -681,7 +685,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 				offsetof(struct setc_elem, counter);
 		}
 	} else if (tb[IPSET_ATTR_TIMEOUT]) {
-		map = init_list_set(set, size,
+		map = init_list_set(net, set, size,
 				    sizeof(struct sett_elem), timeout);
 		if (!map)
 			return -ENOMEM;
@@ -690,7 +694,7 @@ list_set_create(struct ip_set *set, struct nlattr *tb[], u32 flags)
 			offsetof(struct sett_elem, timeout);
 		list_set_gc_init(set, list_set_gc);
 	} else {
-		map = init_list_set(set, size, sizeof(struct set_elem), 0);
+		map = init_list_set(net, set, size, sizeof(struct set_elem), 0);
 		if (!map)
 			return -ENOMEM;
 	}
--- a/net/netfilter/ipvs/ip_vs_conn.c
+++ b/net/netfilter/ipvs/ip_vs_conn.c
@@ -1336,7 +1336,7 @@ int __init ip_vs_conn_init(void)
 	/* Allocate ip_vs_conn slab cache */
 	ip_vs_conn_cachep = kmem_cache_create("ip_vs_conn",
 					      sizeof(struct ip_vs_conn), 0,
-					      SLAB_HWCACHE_ALIGN, NULL);
+					      SLAB_HWCACHE_ALIGN | SLAB_ACCOUNT, NULL);
 	if (!ip_vs_conn_cachep) {
 		vfree(ip_vs_conn_tab);
 		return -ENOMEM;
--- a/net/netfilter/nf_conntrack_acct.c
+++ b/net/netfilter/nf_conntrack_acct.c
@@ -17,6 +17,7 @@
 #include <net/netfilter/nf_conntrack.h>
 #include <net/netfilter/nf_conntrack_extend.h>
 #include <net/netfilter/nf_conntrack_acct.h>
+#include <net/netfilter/nf_conntrack_core.h>
 
 static bool nf_ct_acct __read_mostly;
 
@@ -70,7 +71,7 @@ static int nf_conntrack_acct_init_sysctl(struct net *net)
 	table[0].data = &net->ct.sysctl_acct;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
 	net->ct.acct_sysctl_header = register_net_sysctl(net, "net/netfilter",
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -53,6 +53,8 @@
 #include <net/netfilter/nf_nat_core.h>
 #include <net/netfilter/nf_nat_helper.h>
 
+#include <net/sock.h>
+
 #define NF_CONNTRACK_VERSION	"0.5.0"
 
 int (*nfnetlink_parse_nat_setup_hook)(struct nf_conn *ct,
@@ -123,9 +125,6 @@ static void nf_conntrack_all_unlock(void)
 unsigned int nf_conntrack_htable_size __read_mostly;
 EXPORT_SYMBOL_GPL(nf_conntrack_htable_size);
 
-unsigned int nf_conntrack_max __read_mostly;
-EXPORT_SYMBOL_GPL(nf_conntrack_max);
-
 DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked);
 EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked);
 
@@ -853,6 +852,7 @@ __nf_conntrack_alloc(struct net *net,
 		     const struct nf_conntrack_tuple *repl,
 		     gfp_t gfp, u32 hash)
 {
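+	/* Per-netns conntrack limit; fall back to the host (init_net)
+	 * value when the namespace has not set one. */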
+	unsigned int ct_max = net->ct.max ? net->ct.max : init_net.ct.max;
 	struct nf_conn *ct;
 
 	if (unlikely(!nf_conntrack_hash_rnd)) {
@@ -864,11 +864,13 @@ __nf_conntrack_alloc(struct net *net,
 	/* We don't want any race condition at early drop stage */
 	atomic_inc(&net->ct.count);
 
-	if (nf_conntrack_max &&
-	    unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) {
+	if (ct_max &&
+	    unlikely(atomic_read(&net->ct.count) > ct_max)) {
 		if (!early_drop(net, hash)) {
 			atomic_dec(&net->ct.count);
-			net_warn_ratelimited("nf_conntrack: table full, dropping packet\n");
+			net_veboth_ratelimited(KERN_WARNING "VE%s: "
+						"nf_conntrack table full, dropping packet\n",
+						net->owner_ve->ve_name);
 			return ERR_PTR(-ENOMEM);
 		}
 	}
@@ -1072,6 +1074,15 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl,
 	struct nf_conn *ct;
 	u32 hash;
 
+	if (!net_ipt_permitted(net, VE_NF_CONNTRACK))
+		return NULL;
+
+	if (!net->ct.can_alloc) {
+		/* No rules loaded */
+		return NULL;
+	}
+	smp_rmb(); /* Pairs with wmb in allow_conntrack_allocation() */
+
 	if (!nf_ct_get_tuple(skb, skb_network_offset(skb),
 			     dataoff, l3num, protonum, &tuple, l3proto,
 			     l4proto)) {
@@ -1606,11 +1617,11 @@ void *nf_ct_alloc_hashtable(unsigned int *sizep, int nulls)
 	BUILD_BUG_ON(sizeof(struct hlist_nulls_head) != sizeof(struct hlist_head));
 	nr_slots = *sizep = roundup(*sizep, PAGE_SIZE / sizeof(struct hlist_nulls_head));
 	sz = nr_slots * sizeof(struct hlist_nulls_head);
-	hash = (void *)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | __GFP_ZERO,
+	hash = (void *)__get_free_pages(GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_ZERO,
 					get_order(sz));
 	if (!hash) {
 		printk(KERN_WARNING "nf_conntrack: falling back to vmalloc.\n");
-		hash = vzalloc(sz);
+		hash = vzalloc_account(sz);
 	}
 
 	if (hash && nulls)
@@ -1723,11 +1734,11 @@ int nf_conntrack_init_start(void)
 		 * entries. */
 		max_factor = 4;
 	}
-	nf_conntrack_max = max_factor * nf_conntrack_htable_size;
+	init_net.ct.max = max_factor * nf_conntrack_htable_size;
 
 	printk(KERN_INFO "nf_conntrack version %s (%u buckets, %d max)\n",
 	       NF_CONNTRACK_VERSION, nf_conntrack_htable_size,
-	       nf_conntrack_max);
+	       init_net.ct.max);
 
 	ret = nf_conntrack_expect_init();
 	if (ret < 0)
@@ -1824,6 +1835,7 @@ int nf_conntrack_init_net(struct net *net)
 	int cpu;
 
 	atomic_set(&net->ct.count, 0);
+	net->ct.max = init_net.ct.max;
 	seqcount_init(&net->ct.generation);
 
 	net->ct.pcpu_lists = alloc_percpu(struct ct_pcpu);
--- a/net/netfilter/nf_conntrack_ecache.c
+++ b/net/netfilter/nf_conntrack_ecache.c
@@ -199,7 +199,7 @@ static int nf_conntrack_event_init_sysctl(struct net *net)
 	table[1].data = &net->ct.sysctl_events_retry_timeout;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
 	net->ct.event_sysctl_header =
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -35,8 +35,6 @@
 unsigned int nf_ct_expect_hsize __read_mostly;
 EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
 
-unsigned int nf_ct_expect_max __read_mostly;
-
 static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
 
 /* nf_conntrack_expect helper functions */
@@ -430,8 +428,10 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
 		}
 	}
 
-	if (net->ct.expect_count >= nf_ct_expect_max) {
-		net_warn_ratelimited("nf_conntrack: expectation table full\n");
+	if (net->ct.expect_count >= init_net.ct.expect_max) {
+		net_veboth_ratelimited(KERN_WARNING "VE%s: "
+					"nf_conntrack: expectation table full\n",
+					net->owner_ve->ve_name);
 		ret = -EMFILE;
 	}
 out:
@@ -617,6 +617,7 @@ int nf_conntrack_expect_pernet_init(struct net *net)
 	int err = -ENOMEM;
 
 	net->ct.expect_count = 0;
+	net->ct.expect_max = init_net.ct.expect_max;
 	net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
 	if (net->ct.expect_hash == NULL)
 		goto err1;
@@ -645,7 +646,7 @@ int nf_conntrack_expect_init(void)
 		if (!nf_ct_expect_hsize)
 			nf_ct_expect_hsize = 1;
 	}
-	nf_ct_expect_max = nf_ct_expect_hsize * 4;
+	init_net.ct.expect_max = nf_ct_expect_hsize * 4;
 	nf_ct_expect_cachep = kmem_cache_create("nf_conntrack_expect",
 				sizeof(struct nf_conntrack_expect),
 				0, 0, NULL);
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -54,6 +54,9 @@
 #include <linux/netfilter/nfnetlink.h>
 #include <linux/netfilter/nfnetlink_conntrack.h>
 
+#include <net/sock.h>
+#include <bc/beancounter.h>
+
 MODULE_LICENSE("GPL");
 
 static char __initdata version[] = "0.93";
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -32,6 +32,12 @@
 #include <net/netfilter/nf_conntrack_zones.h>
 #include <net/netfilter/nf_conntrack_timestamp.h>
 #include <linux/rculist_nulls.h>
+#include <linux/ve.h>
+#include <linux/vziptable_defs.h>
+
+int ip_conntrack_disable_ve0;
+module_param(ip_conntrack_disable_ve0, int, 0440);
+EXPORT_SYMBOL(ip_conntrack_disable_ve0);
 
 MODULE_LICENSE("GPL");
 
@@ -401,8 +407,8 @@ static int nf_conntrack_standalone_init_proc(struct net *net)
 	if (!pde)
 		goto out_nf_conntrack;
 
-	pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
-			  &ct_cpu_seq_fops);
+	pde = proc_net_create_data("nf_conntrack", S_IRUGO, net->proc_net_stat,
+				   &ct_cpu_seq_fops, NULL);
 	if (!pde)
 		goto out_stat_nf_conntrack;
 	return 0;
@@ -436,12 +442,10 @@ static void nf_conntrack_standalone_fini_proc(struct net *net)
 static int log_invalid_proto_min = 0;
 static int log_invalid_proto_max = 255;
 
-static struct ctl_table_header *nf_ct_netfilter_header;
-
 static struct ctl_table nf_ct_sysctl_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -478,7 +482,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 	},
 	{
 		.procname	= "nf_conntrack_expect_max",
-		.data		= &nf_ct_expect_max,
+		.data		= &init_net.ct.expect_max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -491,7 +495,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
 static struct ctl_table nf_ct_netfilter_table[] = {
 	{
 		.procname	= "nf_conntrack_max",
-		.data		= &nf_conntrack_max,
+		.data		= &init_net.ct.max,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
@@ -499,6 +503,44 @@ static struct ctl_table nf_ct_netfilter_table[] = {
 	{ }
 };
 
+static int zero;
+
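+/* Register a per-netns "net.nf_conntrack_max" sysctl backed by net->ct.max */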
+static int nf_conntrack_netfilter_init_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = kmemdup(nf_ct_netfilter_table, sizeof(nf_ct_netfilter_table),
+			GFP_KERNEL);
+	if (!table)
+		goto out_kmemdup;
+
+	table[0].data = &net->ct.max;
+
+	/* Don't export sysctls to unprivileged users */
+	if (ve_net_hide_sysctl(net))
+		table[0].procname = NULL;
+
+	net->ct.netfilter_header = register_net_sysctl(net, "net", table);
+	if (!net->ct.netfilter_header)
+		goto out_unregister_netfilter;
+
+	return 0;
+
+out_unregister_netfilter:
+	kfree(table);
+out_kmemdup:
+	return -ENOMEM;
+}
+
+static void nf_conntrack_netfilter_fini_sysctl(struct net *net)
+{
+	struct ctl_table *table;
+
+	table = net->ct.netfilter_header->ctl_table_arg;
+	unregister_net_sysctl_table(net->ct.netfilter_header);
+	kfree(table);
+}
+
 static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	struct ctl_table *table;
@@ -508,15 +550,23 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
 	if (!table)
 		goto out_kmemdup;
 
+	table[0].data = &net->ct.max;
 	table[1].data = &net->ct.count;
 	table[2].data = &net->ct.htable_size;
 	table[3].data = &net->ct.sysctl_checksum;
 	table[4].data = &net->ct.sysctl_log_invalid;
+	table[5].data = &net->ct.expect_max;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
+	if (!net_eq(net, &init_net)) {
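+		/* A container may lower nf_conntrack_max, but never raise
+		 * it above the host (init_net) limit. */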
+		table[0].proc_handler = proc_dointvec_minmax;
+		table[0].extra1 = &zero;
+		table[0].extra2 = &init_net.ct.max;
+	}
+
 	net->ct.sysctl_header = register_net_sysctl(net, "net/netfilter", table);
 	if (!net->ct.sysctl_header)
 		goto out_unregister_netfilter;
@@ -538,6 +588,15 @@ static void nf_conntrack_standalone_fini_sysctl(struct net *net)
 	kfree(table);
 }
 #else
+static int nf_conntrack_netfilter_init_sysctl(struct net *net)
+{
+	return 0;
+}
+
+static void nf_conntrack_netfilter_fini_sysctl(struct net *net)
+{
+}
+
 static int nf_conntrack_standalone_init_sysctl(struct net *net)
 {
 	return 0;
@@ -566,8 +625,14 @@ static int nf_conntrack_pernet_init(struct net *net)
 	if (ret < 0)
 		goto out_sysctl;
 
+	ret = nf_conntrack_netfilter_init_sysctl(net);
+	if (ret < 0)
+		goto out_netfilter_sysctl;
+
 	return 0;
 
+out_netfilter_sysctl:
+	nf_conntrack_standalone_fini_sysctl(net);
 out_sysctl:
 	nf_conntrack_standalone_fini_proc(net);
 out_proc:
@@ -581,6 +646,7 @@ static void nf_conntrack_pernet_exit(struct list_head *net_exit_list)
 	struct net *net;
 
 	list_for_each_entry(net, net_exit_list, exit_list) {
+		nf_conntrack_netfilter_fini_sysctl(net);
 		nf_conntrack_standalone_fini_sysctl(net);
 		nf_conntrack_standalone_fini_proc(net);
 	}
@@ -594,20 +660,22 @@ static struct pernet_operations nf_conntrack_net_ops = {
 
 static int __init nf_conntrack_standalone_init(void)
 {
-	int ret = nf_conntrack_init_start();
-	if (ret < 0)
-		goto out_start;
+	int ret;
 
-#ifdef CONFIG_SYSCTL
-	nf_ct_netfilter_header =
-		register_net_sysctl(&init_net, "net", nf_ct_netfilter_table);
-	if (!nf_ct_netfilter_header) {
-		pr_err("nf_conntrack: can't register to sysctl.\n");
-		ret = -ENOMEM;
-		goto out_sysctl;
+#ifdef CONFIG_VE_IPTABLES
+	if (ip_conntrack_disable_ve0) {
+		printk(KERN_INFO "Disabling conntracks and NAT for ve0\n");
+		get_ve0()->ipt_mask &= ~(VE_NF_CONNTRACK_MOD | VE_IP_IPTABLE_NAT_MOD);
+	} else {
+		printk(KERN_INFO "Enabling conntracks and NAT for ve0\n");
+		get_ve0()->ipt_mask |= VE_NF_CONNTRACK_MOD | VE_IP_IPTABLE_NAT_MOD;
 	}
 #endif
 
+	ret = nf_conntrack_init_start();
+	if (ret < 0)
+		goto out_start;
+
 	ret = register_pernet_subsys(&nf_conntrack_net_ops);
 	if (ret < 0)
 		goto out_pernet;
@@ -616,10 +684,6 @@ static int __init nf_conntrack_standalone_init(void)
 	return 0;
 
 out_pernet:
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(nf_ct_netfilter_header);
-out_sysctl:
-#endif
 	nf_conntrack_cleanup_end();
 out_start:
 	return ret;
@@ -629,9 +693,6 @@ static void __exit nf_conntrack_standalone_fini(void)
 {
 	nf_conntrack_cleanup_start();
 	unregister_pernet_subsys(&nf_conntrack_net_ops);
-#ifdef CONFIG_SYSCTL
-	unregister_net_sysctl_table(nf_ct_netfilter_header);
-#endif
 	nf_conntrack_cleanup_end();
 }
 
--- a/net/netfilter/nf_log.c
+++ b/net/netfilter/nf_log.c
@@ -285,10 +285,10 @@ struct nf_log_buf *nf_log_buf_open(void)
 }
 EXPORT_SYMBOL_GPL(nf_log_buf_open);
 
-void nf_log_buf_close(struct nf_log_buf *m)
+void nf_log_buf_close(struct nf_log_buf *m, struct ve_struct *ve)
 {
 	m->buf[m->count] = 0;
-	printk("%s\n", m->buf);
+	ve_log_printk(ve, "%s\n", m->buf);
 
 	if (likely(m != &emergency))
 		kfree(m);
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -821,6 +821,9 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
 
 static int __net_init nf_nat_net_init(struct net *net)
 {
+	if (net_ipt_permitted(net, VE_IP_NAT))
+		net_ipt_module_set(net, VE_IP_NAT);
+
 	/* Leave them the same for the moment. */
 	net->ct.nat_htable_size = net->ct.htable_size;
 	net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
@@ -836,6 +839,8 @@ static void __net_exit nf_nat_net_exit(struct net *net)
 	nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
 	synchronize_rcu();
 	nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
+
+	net_ipt_module_clear(net, VE_IP_NAT);
 }
 
 static struct pernet_operations nf_nat_net_ops = {
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -57,6 +57,17 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
 		indev = __in_dev_get_rcu(skb->dev);
 		if (indev && indev->ifa_list) {
 			ifa = indev->ifa_list;
+#ifdef CONFIG_VE
+			/*
+			 * Because of venet device specifics, we should use
+			 * the first non-loopback ifa in the list.
+			 */
+			if (skb->dev->features & NETIF_F_VENET) {
+				while (IN_LOOPBACK(ntohl(ifa->ifa_local)) &&
+				       ifa->ifa_next)
+					ifa = ifa->ifa_next;
+			}
+#endif
 			newdst = ifa->ifa_local;
 		}
 		rcu_read_unlock();
--- a/net/netfilter/nf_sockopt.c
+++ b/net/netfilter/nf_sockopt.c
@@ -6,6 +6,11 @@
 #include <linux/mutex.h>
 #include <net/sock.h>
 
+#ifdef CONFIG_VE_IPTABLES
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif /* CONFIG_VE_IPTABLES */
+
 #include "nf_internals.h"
 
 /* Sockopts only registered and called from user context, so
@@ -91,6 +96,73 @@ out:
 	mutex_unlock(&nf_sockopt_mutex);
 	return ops;
 }
+#ifdef CONFIG_VE_IPTABLES
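+/* Does sockopt (pf, val, get) fall into the given module's optname range? */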
+static int sockopt_module_fits(u_int8_t pf, int val, int get,
+			       u_int8_t mod_pf,
+			       int set_optmin, int set_optmax,
+			       int get_optmin, int get_optmax)
+{
+	if (pf != mod_pf)
+		return 0;
+	if (get)
+		return val >= get_optmin && val < get_optmax;
+	else
+		return val >= set_optmin && val < set_optmax;
+}
+
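+/*
+ * Load ip_tables/ip6_tables on behalf of a container that issued a
+ * matching get/setsockopt call: the container itself cannot request
+ * modules from userspace.
+ */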
+static int ve0_load_sockopt_module(struct net *net, u8 pf, int val, int get)
+{
+	const char *name;
+	int ret = -EPERM;
+
+	if (!ve_capable(CAP_NET_ADMIN))
+		goto out;
+
+	if (sockopt_module_fits(pf, val, get, PF_INET,
+				     IPT_BASE_CTL, IPT_SO_SET_MAX + 1,
+				     IPT_BASE_CTL, IPT_SO_GET_MAX + 1)) {
+		name = "ip_tables";
+	} else if (sockopt_module_fits(pf, val, get, PF_INET6,
+				     IP6T_BASE_CTL, IP6T_SO_SET_MAX + 1,
+				     IP6T_BASE_CTL, IP6T_SO_GET_MAX + 1)) {
+		name = "ip6_tables";
+	} else {
+		ret = -EINVAL;
+		goto out;
+	}
+	/*
+	 * The modules loaded here take no locks during their
+	 * initialization.  If you add another module, research it
+	 * first: you may have to switch to a nowait module request
+	 * below.
+	 */
+	ret = request_module(name);
+out:
+	return ret;
+}
+
+static struct nf_sockopt_ops *nf_sockopt_find_ve(struct sock *sk, u_int8_t pf,
+		int val, int get)
+{
+	struct nf_sockopt_ops *ops = nf_sockopt_find(sk, pf, val, get);
+
+	if (!IS_ERR(ops) || ve_is_super(get_exec_env()))
+		return ops;
+
+	/*
+	 * Containers cannot load the appropriate modules from
+	 * userspace, so we quietly do it for them here.  To the
+	 * container it looks as if the module were already loaded
+	 * or built into the kernel.
+	 */
+	if (ve0_load_sockopt_module(sock_net(sk), pf, val, get) != 0)
+		return ops;
+
+	return nf_sockopt_find(sk, pf, val, get);
+}
+#else /* !CONFIG_VE_IPTABLES */
+#define nf_sockopt_find_ve(sk, pf, val, get)	nf_sockopt_find(sk, pf, val, get)
+#endif /* !CONFIG_VE_IPTABLES */
 
 /* Call get/setsockopt() */
 static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
@@ -99,7 +171,7 @@ static int nf_sockopt(struct sock *sk, u_int8_t pf, int val,
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, get);
+	ops = nf_sockopt_find_ve(sk, pf, val, get);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 
@@ -133,7 +205,7 @@ static int compat_nf_sockopt(struct sock *sk, u_int8_t pf, int val,
 	struct nf_sockopt_ops *ops;
 	int ret;
 
-	ops = nf_sockopt_find(sk, pf, val, get);
+	ops = nf_sockopt_find_ve(sk, pf, val, get);
 	if (IS_ERR(ops))
 		return PTR_ERR(ops);
 
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -47,6 +47,8 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
 	[NFNLGRP_CONNTRACK_EXP_NEW]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_UPDATE]	= NFNL_SUBSYS_CTNETLINK_EXP,
 	[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
+	[NFNLGRP_NFTABLES]		= NFNL_SUBSYS_NFTABLES,
+	[NFNLGRP_ACCT_QUOTA]		= NFNL_SUBSYS_ACCT,
 	[NFNLGRP_NFTRACE]		= NFNL_SUBSYS_NFTABLES,
 };
 
@@ -327,7 +329,9 @@ replay:
 		nlh = nlmsg_hdr(skb);
 		err = 0;
 
-		if (nlh->nlmsg_len < NLMSG_HDRLEN) {
+		if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+		    skb->len < nlh->nlmsg_len ||
+		    nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
 			err = -EINVAL;
 			goto ack;
 		}
@@ -482,7 +486,12 @@ static void nfnetlink_rcv(struct sk_buff *skb)
 static int nfnetlink_bind(int group)
 {
 	const struct nfnetlink_subsystem *ss;
-	int type = nfnl_group2type[group];
+	int type;
+
+	if (group <= NFNLGRP_NONE || group > NFNLGRP_MAX)
+		return -EINVAL;
+
+	type = nfnl_group2type[group];
 
 	rcu_read_lock();
 	ss = nfnetlink_get_subsys(type);
@@ -532,6 +541,9 @@ static int __init nfnetlink_init(void)
 {
 	int i;
 
+	for (i = NFNLGRP_NONE + 1; i <= NFNLGRP_MAX; i++)
+		BUG_ON(nfnl_group2type[i] == NFNL_SUBSYS_NONE);
+
 	for (i=0; i<NFNL_SUBSYS_COUNT; i++)
 		mutex_init(&table[i].mutex);
 
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -347,6 +347,8 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
 	if (err < 0)
 		return err;
 
+	allow_conntrack_allocation(ctx->net);
+
 	return 0;
 }
 
--- a/net/netfilter/nft_nat.c
+++ b/net/netfilter/nft_nat.c
@@ -209,6 +209,8 @@ static int nft_nat_init(const struct nft_ctx *ctx, const struct nft_expr *expr,
 			return -EINVAL;
 	}
 
+	allow_conntrack_allocation(ctx->net);
+
 	return 0;
 }
 
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/audit.h>
 #include <net/net_namespace.h>
+#include <bc/beancounter.h>
 
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter_arp.h>
@@ -67,6 +68,43 @@ static const char *const xt_prefix[NFPROTO_NUMPROTO] = {
 	[NFPROTO_IPV6]   = "ip6",
 };
 
+#ifdef CONFIG_BEANCOUNTERS
+static void uncharge_xtables(struct xt_table_info *info, unsigned long size)
+{
+	uncharge_beancounter(info->ub, UB_NUMXTENT, size);
+}
+
+static int recharge_xtables(struct xt_table_info *new, struct xt_table_info *old)
+{
+	struct user_beancounter *ub, *old_ub;
+	long change;
+
+	ub = new->ub;
+	old_ub = old->number ? old->ub : ub;
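+
+	/* Normally both tables belong to the same beancounter, so only
+	 * the delta in entry count needs (un)charging. */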
+	change = (long)new->number - (long)old->number;
+	if (old_ub != ub) {
+		printk(KERN_WARNING "iptables resources are charged"
+				" to a different UB (%s -> %s)\n",
+				old_ub->ub_name, ub->ub_name);
+		change = new->number;
+	}
+
+	if (change > 0) {
+		if (charge_beancounter(ub, UB_NUMXTENT, change, UB_SOFT))
+			return -ENOMEM;
+	} else if (change < 0)
+		uncharge_beancounter(ub, UB_NUMXTENT, -change);
+
+	if (old_ub != ub)
+		uncharge_beancounter(old_ub, UB_NUMXTENT, old->number);
+
+	return 0;
+}
+#else
+#define recharge_xtables(new, old)	(0)
+#define uncharge_xtables(info, s)	do { } while (0)
+#endif	/* CONFIG_BEANCOUNTERS */
+
 /* Allow this many total (re)entries. */
 static const unsigned int xt_jumpstack_multiplier = 2;
 
@@ -181,6 +219,29 @@ xt_unregister_matches(struct xt_match *match, unsigned int n)
 }
 EXPORT_SYMBOL(xt_unregister_matches);
 
+/*
+ * Convert xt_name to a module name and check whether loading it is allowed.
+ *
+ * xt_name is the module name without the per-family prefix.
+ */
+static bool xt_name_allowed(u8 af, const char *xt_name)
+{
+	char module_name[MODULE_NAME_LEN] = {'\0'};
+	const char *prefix = xt_prefix[af];
+	int len = strlen(prefix) + strlen("t_");
+
+	if (len + strnlen(xt_name, MODULE_NAME_LEN) >= MODULE_NAME_LEN)
+		return false;
+
+	/* Fallback targets (ipt_standard_target etc) */
+	if (strcmp(xt_name, XT_STANDARD_TARGET) == 0 ||
+	    strcmp(xt_name, XT_ERROR_TARGET) == 0)
+		return true;
+
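+	/* The length check above guarantees this fits in module_name */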
+	sprintf(module_name, "%st_%s", prefix, xt_name);
+
+	return module_payload_allowed(module_name);
+}
 
 /*
  * These are weird, but module loading must not be done with mutex
@@ -194,6 +255,9 @@ struct xt_match *xt_find_match(u8 af, const char *name, u8 revision)
 	struct xt_match *m;
 	int err = -ENOENT;
 
+	if (!xt_name_allowed(af, name))
+		return ERR_PTR(err);
+
 	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
 		return ERR_PTR(-EINTR);
 
@@ -239,6 +303,9 @@ struct xt_target *xt_find_target(u8 af, const char *name, u8 revision)
 	struct xt_target *t;
 	int err = -ENOENT;
 
+	if (!xt_name_allowed(af, name))
+		return ERR_PTR(err);
+
 	if (mutex_lock_interruptible(&xt[af].mutex) != 0)
 		return ERR_PTR(-EINTR);
 
@@ -391,7 +458,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 		 * ebt_among is exempt from centralized matchsize checking
 		 * because it uses a dynamic-size data set.
 		 */
-		pr_err("%s_tables: %s.%u match: invalid size "
+		ve_printk(VE_LOG, "%s_tables: %s.%u match: invalid size "
 		       "%u (kernel) != (user) %u\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->revision,
@@ -400,7 +467,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 	}
 	if (par->match->table != NULL &&
 	    strcmp(par->match->table, par->table) != 0) {
-		pr_err("%s_tables: %s match: only valid in %s table, not %s\n",
+		ve_printk(VE_LOG, "%s_tables: %s match: only valid in %s table, not %s\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->table, par->table);
 		return -EINVAL;
@@ -408,7 +475,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 	if (par->match->hooks && (par->hook_mask & ~par->match->hooks) != 0) {
 		char used[64], allow[64];
 
-		pr_err("%s_tables: %s match: used from hooks %s, but only "
+		ve_printk(VE_LOG, "%s_tables: %s match: used from hooks %s, but only "
 		       "valid from %s\n",
 		       xt_prefix[par->family], par->match->name,
 		       textify_hooks(used, sizeof(used), par->hook_mask,
@@ -418,7 +485,7 @@ int xt_check_match(struct xt_mtchk_param *par,
 		return -EINVAL;
 	}
 	if (par->match->proto && (par->match->proto != proto || inv_proto)) {
-		pr_err("%s_tables: %s match: only valid for protocol %u\n",
+		ve_printk(VE_LOG, "%s_tables: %s match: only valid for protocol %u\n",
 		       xt_prefix[par->family], par->match->name,
 		       par->match->proto);
 		return -EINVAL;
@@ -732,13 +799,14 @@ unsigned int *xt_alloc_entry_offsets(unsigned int size)
 {
 	unsigned int *off;
 
-	off = kcalloc(size, sizeof(unsigned int), GFP_KERNEL | __GFP_NOWARN);
+	off = kcalloc(size, sizeof(unsigned int),
+		GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
 
 	if (off)
 		return off;
 
 	if (size < (SIZE_MAX / sizeof(unsigned int)))
-		off = vmalloc(size * sizeof(unsigned int));
+		off = vmalloc_account(size * sizeof(unsigned int));
 
 	return off;
 }
@@ -777,7 +845,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 	int ret;
 
 	if (XT_ALIGN(par->target->targetsize) != size) {
-		pr_err("%s_tables: %s.%u target: invalid size "
+		ve_printk(VE_LOG, "%s_tables: %s.%u target: invalid size "
 		       "%u (kernel) != (user) %u\n",
 		       xt_prefix[par->family], par->target->name,
 		       par->target->revision,
@@ -786,7 +854,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 	}
 	if (par->target->table != NULL &&
 	    strcmp(par->target->table, par->table) != 0) {
-		pr_err("%s_tables: %s target: only valid in %s table, not %s\n",
+		ve_printk(VE_LOG, "%s_tables: %s target: only valid in %s table, not %s\n",
 		       xt_prefix[par->family], par->target->name,
 		       par->target->table, par->table);
 		return -EINVAL;
@@ -794,7 +862,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 	if (par->target->hooks && (par->hook_mask & ~par->target->hooks) != 0) {
 		char used[64], allow[64];
 
-		pr_err("%s_tables: %s target: used from hooks %s, but only "
+		ve_printk(VE_LOG, "%s_tables: %s target: used from hooks %s, but only "
 		       "usable from %s\n",
 		       xt_prefix[par->family], par->target->name,
 		       textify_hooks(used, sizeof(used), par->hook_mask,
@@ -804,7 +872,7 @@ int xt_check_target(struct xt_tgchk_param *par,
 		return -EINVAL;
 	}
 	if (par->target->proto && (par->target->proto != proto || inv_proto)) {
-		pr_err("%s_tables: %s target: only valid for protocol %u\n",
+		ve_printk(VE_LOG, "%s_tables: %s target: only valid for protocol %u\n",
 		       xt_prefix[par->family], par->target->name,
 		       par->target->proto);
 		return -EINVAL;
@@ -883,7 +951,7 @@ void *xt_copy_counters_from_user(const void __user *user, unsigned int len,
 	if (size != (u64)len)
 		return ERR_PTR(-EINVAL);
 
-	mem = vmalloc(len);
+	mem = vmalloc_account(len);
 	if (!mem)
 		return ERR_PTR(-ENOMEM);
 
@@ -975,14 +1043,18 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
 		return NULL;
 
 	if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER))
-		info = kmalloc(sz, GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY);
+		info = kmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_NOWARN | __GFP_NORETRY);
 	if (!info) {
-		info = vmalloc(sz);
+		info = __vmalloc(sz, GFP_KERNEL_ACCOUNT | __GFP_NOWARN |
+				     __GFP_NORETRY | __GFP_HIGHMEM,
+				 PAGE_KERNEL);
 		if (!info)
 			return NULL;
 	}
 	memset(info, 0, sizeof(*info));
 	info->size = size;
+	info->ub = get_beancounter(get_exec_ub());
+
 	return info;
 }
 EXPORT_SYMBOL(xt_alloc_table_info);
@@ -999,6 +1071,8 @@ void xt_free_table_info(struct xt_table_info *info)
 
 	free_percpu(info->stackptr);
 
+	put_beancounter(info->ub);
+
 	kvfree(info);
 }
 EXPORT_SYMBOL(xt_free_table_info);
@@ -1109,6 +1183,12 @@ xt_replace_table(struct xt_table *table,
 		return NULL;
 	}
 
+	if (recharge_xtables(newinfo, private)) {
+		local_bh_enable();
+		*error = -ENOMEM;
+		return NULL;
+	}
+
 	newinfo->initial_entries = private->initial_entries;
 	/*
 	 * Ensure contents of newinfo are visible before assigning to
@@ -1206,6 +1286,7 @@ void *xt_unregister_table(struct xt_table *table)
 	list_del(&table->list);
 	mutex_unlock(&xt[table->af].mutex);
 	kfree(table);
+	uncharge_xtables(private, private->number);
 
 	return private;
 }
@@ -1567,21 +1648,21 @@ int xt_proto_init(struct net *net, u_int8_t af)
 #ifdef CONFIG_PROC_FS
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_TABLES, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops,
+	proc = proc_net_create_data(buf, 0440, net->proc_net, &xt_table_ops,
 				(void *)(unsigned long)af);
 	if (!proc)
 		goto out;
 
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_MATCHES, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_match_ops,
+	proc = proc_net_create_data(buf, 0440, net->proc_net, &xt_match_ops,
 				(void *)(unsigned long)af);
 	if (!proc)
 		goto out_remove_tables;
 
 	strlcpy(buf, xt_prefix[af], sizeof(buf));
 	strlcat(buf, FORMAT_TARGETS, sizeof(buf));
-	proc = proc_create_data(buf, 0440, net->proc_net, &xt_target_ops,
+	proc = proc_net_create_data(buf, 0440, net->proc_net, &xt_target_ops,
 				(void *)(unsigned long)af);
 	if (!proc)
 		goto out_remove_matches;
--- a/net/netfilter/xt_CONNSECMARK.c
+++ b/net/netfilter/xt_CONNSECMARK.c
@@ -110,6 +110,8 @@ static int connsecmark_tg_check(const struct xt_tgchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -248,6 +248,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
 	}
 	__set_bit(IPS_CONFIRMED_BIT, &ct->status);
 	nf_conntrack_get(&ct->ct_general);
+	allow_conntrack_allocation(par->net);
 out:
 	info->ct = ct;
 	return 0;
--- a/net/netfilter/xt_DSCP.c
+++ b/net/netfilter/xt_DSCP.c
@@ -74,6 +74,42 @@ static int dscp_tg_check(const struct xt_tgchk_param *par)
 }
 
 static unsigned int
+tos_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct ipt_tos_target_info *info = par->targinfo;
+	struct iphdr *iph = ip_hdr(skb);
+	u_int8_t oldtos;
+
+	if ((iph->tos & IPTOS_TOS_MASK) != info->tos) {
+		if (!skb_make_writable(skb, sizeof(struct iphdr)))
+			return NF_DROP;
+
+		iph      = ip_hdr(skb);
+		oldtos   = iph->tos;
+		iph->tos = (iph->tos & IPTOS_PREC_MASK) | info->tos;
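+		/* Fix up the IP header checksum for the changed TOS byte */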
+		csum_replace2(&iph->check, htons(oldtos), htons(iph->tos));
+	}
+
+	return XT_CONTINUE;
+}
+
+static int
+tos_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	const struct ipt_tos_target_info *info = par->targinfo;
+	const uint8_t tos = info->tos;
+
+	if (tos != IPTOS_LOWDELAY && tos != IPTOS_THROUGHPUT &&
+	    tos != IPTOS_RELIABILITY && tos != IPTOS_MINCOST &&
+	    tos != IPTOS_NORMALSVC) {
+		printk(KERN_WARNING "TOS: bad tos value %#x\n", tos);
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static unsigned int
 tos_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_tos_target_info *info = par->targinfo;
@@ -134,6 +170,16 @@ static struct xt_target dscp_tg_reg[] __read_mostly = {
 	},
 	{
 		.name		= "TOS",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.table		= "mangle",
+		.target		= tos_tg_v0,
+		.targetsize	= sizeof(struct ipt_tos_target_info),
+		.checkentry	= tos_tg_check_v0,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "TOS",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
 		.table		= "mangle",
--- a/net/netfilter/xt_HMARK.c
+++ b/net/netfilter/xt_HMARK.c
@@ -334,6 +334,7 @@ static int hmark_tg_check(const struct xt_tgchk_param *par)
 		pr_info("xt_HMARK: spi-set and port-set can't be combined\n");
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_NETMAP.c
+++ b/net/netfilter/xt_NETMAP.c
@@ -60,6 +60,7 @@ static int netmap_tg6_checkentry(const struct xt_tgchk_param *par)
 
 	if (!(range->flags & NF_NAT_RANGE_MAP_IPS))
 		return -EINVAL;
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -111,6 +112,7 @@ static int netmap_tg4_check(const struct xt_tgchk_param *par)
 		pr_debug("bad rangesize %u.\n", mr->rangesize);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_REDIRECT.c
+++ b/net/netfilter/xt_REDIRECT.c
@@ -40,6 +40,7 @@ static int redirect_tg6_checkentry(const struct xt_tgchk_param *par)
 
 	if (range->flags & NF_NAT_RANGE_MAP_IPS)
 		return -EINVAL;
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -56,6 +57,7 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
 		pr_debug("bad rangesize %u.\n", mr->rangesize);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -112,13 +112,13 @@ tcpmss_mangle_packet(struct sk_buff *skb,
 		unsigned int in_mtu = tcpmss_reverse_mtu(net, skb, family);
 
 		if (dst_mtu(skb_dst(skb)) <= minlen) {
-			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
-					    dst_mtu(skb_dst(skb)));
+			net_velog_ratelimited("unknown or invalid path-MTU (%u)\n",
+					      dst_mtu(skb_dst(skb)));
 			return -1;
 		}
 		if (in_mtu <= minlen) {
-			net_err_ratelimited("unknown or invalid path-MTU (%u)\n",
-					    in_mtu);
+			net_velog_ratelimited("unknown or invalid path-MTU (%u)\n",
+					      in_mtu);
 			return -1;
 		}
 		newmss = min(dst_mtu(skb_dst(skb)), in_mtu) - minlen;
@@ -273,8 +273,8 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
 	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
-		pr_info("path-MTU clamping only supported in "
-			"FORWARD, OUTPUT and POSTROUTING hooks\n");
+		ve_printk(VE_LOG, "path-MTU clamping only supported in "
+				  "FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return -EINVAL;
 	}
 	if (par->nft_compat)
@@ -283,7 +283,7 @@ static int tcpmss_tg4_check(const struct xt_tgchk_param *par)
 	xt_ematch_foreach(ematch, e)
 		if (find_syn_match(ematch))
 			return 0;
-	pr_info("Only works on TCP SYN packets\n");
+	ve_printk(VE_LOG, "Only works on TCP SYN packets\n");
 	return -EINVAL;
 }
 
@@ -298,8 +298,8 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
 	    (par->hook_mask & ~((1 << NF_INET_FORWARD) |
 			   (1 << NF_INET_LOCAL_OUT) |
 			   (1 << NF_INET_POST_ROUTING))) != 0) {
-		pr_info("path-MTU clamping only supported in "
-			"FORWARD, OUTPUT and POSTROUTING hooks\n");
+		ve_printk(VE_LOG, "path-MTU clamping only supported in "
+				  "FORWARD, OUTPUT and POSTROUTING hooks\n");
 		return -EINVAL;
 	}
 	if (par->nft_compat)
@@ -308,7 +308,7 @@ static int tcpmss_tg6_check(const struct xt_tgchk_param *par)
 	xt_ematch_foreach(ematch, e)
 		if (find_syn_match(ematch))
 			return 0;
-	pr_info("Only works on TCP SYN packets\n");
+	ve_printk(VE_LOG, "Only works on TCP SYN packets\n");
 	return -EINVAL;
 }
 #endif
--- a/net/netfilter/xt_cluster.c
+++ b/net/netfilter/xt_cluster.c
@@ -148,6 +148,7 @@ static int xt_cluster_mt_checkentry(const struct xt_mtchk_param *par)
 			"higher than the total number of nodes\n");
 		return -EDOM;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_connbytes.c
+++ b/net/netfilter/xt_connbytes.c
@@ -112,6 +112,8 @@ static int connbytes_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 
 	/*
 	 * This filter cannot function correctly unless connection tracking
--- a/net/netfilter/xt_connlabel.c
+++ b/net/netfilter/xt_connlabel.c
@@ -53,7 +53,8 @@ static int connlabel_mt_check(const struct xt_mtchk_param *par)
 		pr_info("cannot load conntrack support for proto=%u\n",
 							par->family);
 		return ret;
-	}
+	} else {
+		allow_conntrack_allocation(par->net);
+	}
 
 	ret = nf_connlabels_get(par->net, info->bit);
 	if (ret < 0)
--- a/net/netfilter/xt_connlimit.c
+++ b/net/netfilter/xt_connlimit.c
@@ -393,6 +393,8 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par)
 	for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i)
 		info->data->climit_root6[i] = RB_ROOT;
 
+	allow_conntrack_allocation(par->net);
+
 	return 0;
 }
 
@@ -430,15 +432,27 @@ static void connlimit_mt_destroy(const struct xt_mtdtor_param *par)
 	kfree(info->data);
 }
 
-static struct xt_match connlimit_mt_reg __read_mostly = {
-	.name       = "connlimit",
-	.revision   = 1,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = connlimit_mt_check,
-	.match      = connlimit_mt,
-	.matchsize  = sizeof(struct xt_connlimit_info),
-	.destroy    = connlimit_mt_destroy,
-	.me         = THIS_MODULE,
+static struct xt_match connlimit_mt_reg[] __read_mostly = {
+	{
+		.name		= "connlimit",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connlimit_mt_check,
+		.match		= connlimit_mt,
+		.matchsize	= sizeof(struct xt_connlimit_info),
+		.destroy	= connlimit_mt_destroy,
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "connlimit",
+		.revision	= 1,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connlimit_mt_check,
+		.match		= connlimit_mt,
+		.matchsize	= sizeof(struct xt_connlimit_info),
+		.destroy	= connlimit_mt_destroy,
+		.me		= THIS_MODULE,
+	},
 };
 
 static int __init connlimit_mt_init(void)
@@ -464,7 +478,8 @@ static int __init connlimit_mt_init(void)
 		kmem_cache_destroy(connlimit_conn_cachep);
 		return -ENOMEM;
 	}
-	ret = xt_register_match(&connlimit_mt_reg);
+	ret = xt_register_matches(connlimit_mt_reg,
+				  ARRAY_SIZE(connlimit_mt_reg));
 	if (ret != 0) {
 		kmem_cache_destroy(connlimit_conn_cachep);
 		kmem_cache_destroy(connlimit_rb_cachep);
@@ -474,7 +489,7 @@ static int __init connlimit_mt_init(void)
 
 static void __exit connlimit_mt_exit(void)
 {
-	xt_unregister_match(&connlimit_mt_reg);
+	xt_unregister_matches(connlimit_mt_reg, ARRAY_SIZE(connlimit_mt_reg));
 	kmem_cache_destroy(connlimit_conn_cachep);
 	kmem_cache_destroy(connlimit_rb_cachep);
 }
--- a/net/netfilter/xt_connmark.c
+++ b/net/netfilter/xt_connmark.c
@@ -37,6 +37,45 @@ MODULE_ALIAS("ipt_connmark");
 MODULE_ALIAS("ip6t_connmark");
 
 static unsigned int
+connmark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_connmark_target_info *markinfo = par->targinfo;
+	struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	u_int32_t diff;
+	u_int32_t mark;
+	u_int32_t newmark;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (ct) {
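+		/* SET rewrites the conntrack mark, SAVE copies the skb mark
+		 * into the conntrack, RESTORE copies it back to the skb. */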
+		switch (markinfo->mode) {
+		case XT_CONNMARK_SET:
+			newmark = (ct->mark & ~markinfo->mask) | markinfo->mark;
+			if (newmark != ct->mark) {
+				ct->mark = newmark;
+				nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+			break;
+		case XT_CONNMARK_SAVE:
+			newmark = (ct->mark & ~markinfo->mask) |
+				  (skb->mark & markinfo->mask);
+			if (ct->mark != newmark) {
+				ct->mark = newmark;
+				nf_conntrack_event_cache(IPCT_MARK, ct);
+			}
+			break;
+		case XT_CONNMARK_RESTORE:
+			mark = skb->mark;
+			diff = (ct->mark ^ mark) & markinfo->mask;
+			skb->mark = mark ^ diff;
+			break;
+		}
+	}
+
+	return XT_CONTINUE;
+}
+
+static unsigned int
 connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
 	const struct xt_connmark_tginfo1 *info = par->targinfo;
@@ -74,6 +113,32 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
+static int connmark_tg_check_v0(const struct xt_tgchk_param *par)
+{
+	const struct xt_connmark_target_info *matchinfo = par->targinfo;
+	int ret = -EINVAL;
+
+	if (matchinfo->mode == XT_CONNMARK_RESTORE) {
+		if (strcmp(par->table, "mangle") != 0) {
+			printk(KERN_WARNING "CONNMARK: restore can only be "
+			       "called from \"mangle\" table, not \"%s\"\n",
+			       par->table);
+			return ret;
+		}
+	}
+	if (matchinfo->mark > 0xffffffff || matchinfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "CONNMARK: Only supports 32bit mark\n");
+		return ret;
+	}
+	ret = nf_ct_l3proto_try_module_get(par->family);
+	if (ret < 0) {
+		printk(KERN_WARNING "can't load conntrack support for "
+				    "proto=%u\n", par->family);
+		return ret;
+	}
+	return 0;
+}
+
 static int connmark_tg_check(const struct xt_tgchk_param *par)
 {
 	int ret;
@@ -91,6 +156,37 @@ static void connmark_tg_destroy(const struct xt_tgdtor_param *par)
 }
 
 static bool
+connmark_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_connmark_info *info = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+
+	ct = nf_ct_get(skb, &ctinfo);
+	if (!ct)
+		return false;
+
+	return ((ct->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static int connmark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct xt_connmark_info *cm = par->matchinfo;
+
+	if (cm->mark > 0xffffffff || cm->mask > 0xffffffff) {
+		printk(KERN_WARNING "connmark: only support 32bit mark\n");
+		return -EINVAL;
+	}
+	if (nf_ct_l3proto_try_module_get(par->family) < 0) {
+		printk(KERN_WARNING "can't load conntrack support for "
+				    "proto=%u\n", par->family);
+		return -EINVAL;
+	}
+	allow_conntrack_allocation(par->net);
+	return 0;
+}
+
+static bool
 connmark_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_connmark_mtinfo1 *info = par->matchinfo;
@@ -112,6 +208,8 @@ static int connmark_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
@@ -120,38 +218,139 @@ static void connmark_mt_destroy(const struct xt_mtdtor_param *par)
 	nf_ct_l3proto_module_put(par->family);
 }
 
-static struct xt_target connmark_tg_reg __read_mostly = {
-	.name           = "CONNMARK",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.checkentry     = connmark_tg_check,
-	.target         = connmark_tg,
-	.targetsize     = sizeof(struct xt_connmark_tginfo1),
-	.destroy        = connmark_tg_destroy,
-	.me             = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_target_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	mode;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
 };
 
-static struct xt_match connmark_mt_reg __read_mostly = {
-	.name           = "connmark",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.checkentry     = connmark_mt_check,
-	.match          = connmark_mt,
-	.matchsize      = sizeof(struct xt_connmark_mtinfo1),
-	.destroy        = connmark_mt_destroy,
-	.me             = THIS_MODULE,
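+/* Translate the userspace compat layout (compat_ulong_t members) into
+ * the native xt_connmark_target_info and back. */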
+static void connmark_tg_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_connmark_target_info *cm = src;
+	struct xt_connmark_target_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.mode	= cm->mode,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_tg_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_connmark_target_info *m = src;
+	struct compat_xt_connmark_target_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark	= m->mark;
+	cm.mask	= m->mask;
+	cm.mode	= m->mode;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target connmark_tg_reg[] __read_mostly = {
+	{
+		.name		= "CONNMARK",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connmark_tg_check_v0,
+		.destroy	= connmark_tg_destroy,
+		.target		= connmark_tg_v0,
+		.targetsize	= sizeof(struct xt_connmark_target_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_connmark_target_info),
+		.compat_from_user = connmark_tg_compat_from_user_v0,
+		.compat_to_user	= connmark_tg_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "CONNMARK",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_tg_check,
+		.target         = connmark_tg,
+		.targetsize     = sizeof(struct xt_connmark_tginfo1),
+		.destroy        = connmark_tg_destroy,
+		.me             = THIS_MODULE,
+	},
+};
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_connmark_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	invert;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void connmark_mt_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_connmark_info *cm = src;
+	struct xt_connmark_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.invert	= cm->invert,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int connmark_mt_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_connmark_info *m = src;
+	struct compat_xt_connmark_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark = m->mark;
+	cm.mask = m->mask;
+	cm.invert = m->invert;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match connmark_mt_reg[] __read_mostly = {
+	{
+		.name		= "connmark",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= connmark_mt_check_v0,
+		.match		= connmark_mt_v0,
+		.destroy	= connmark_mt_destroy,
+		.matchsize	= sizeof(struct xt_connmark_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_connmark_info),
+		.compat_from_user = connmark_mt_compat_from_user_v0,
+		.compat_to_user	= connmark_mt_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "connmark",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.checkentry     = connmark_mt_check,
+		.match          = connmark_mt,
+		.matchsize      = sizeof(struct xt_connmark_mtinfo1),
+		.destroy        = connmark_mt_destroy,
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init connmark_mt_init(void)
 {
 	int ret;
 
-	ret = xt_register_target(&connmark_tg_reg);
+	ret = xt_register_targets(connmark_tg_reg,
+				  ARRAY_SIZE(connmark_tg_reg));
 	if (ret < 0)
 		return ret;
-	ret = xt_register_match(&connmark_mt_reg);
+	ret = xt_register_matches(connmark_mt_reg,
+				  ARRAY_SIZE(connmark_mt_reg));
 	if (ret < 0) {
-		xt_unregister_target(&connmark_tg_reg);
+		xt_unregister_targets(connmark_tg_reg,
+				      ARRAY_SIZE(connmark_tg_reg));
 		return ret;
 	}
 	return 0;
@@ -159,8 +358,8 @@ static int __init connmark_mt_init(void)
 
 static void __exit connmark_mt_exit(void)
 {
-	xt_unregister_match(&connmark_mt_reg);
-	xt_unregister_target(&connmark_tg_reg);
+	xt_unregister_matches(connmark_mt_reg, ARRAY_SIZE(connmark_mt_reg));
+	xt_unregister_targets(connmark_tg_reg, ARRAY_SIZE(connmark_tg_reg));
 }
 
 module_init(connmark_mt_init);
--- a/net/netfilter/xt_conntrack.c
+++ b/net/netfilter/xt_conntrack.c
@@ -119,6 +119,95 @@ port_match(u16 min, u16 max, u16 port, bool invert)
 	return (port >= min && port <= max) ^ invert;
 }
 
+static bool
+conntrack_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_conntrack_info *sinfo = par->matchinfo;
+	const struct nf_conn *ct;
+	enum ip_conntrack_info ctinfo;
+	unsigned int statebit;
+
+	ct = nf_ct_get(skb, &ctinfo);
+
+#define FWINV(bool, invflg) ((bool) ^ !!(sinfo->invflags & (invflg)))
+
+	if (ct == &nf_conntrack_untracked)
+		statebit = XT_CONNTRACK_STATE_UNTRACKED;
+	else if (ct)
+		statebit = XT_CONNTRACK_STATE_BIT(ctinfo);
+	else
+		statebit = XT_CONNTRACK_STATE_INVALID;
+
+	if (sinfo->flags & XT_CONNTRACK_STATE) {
+		if (ct) {
+			if (test_bit(IPS_SRC_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_SNAT;
+			if (test_bit(IPS_DST_NAT_BIT, &ct->status))
+				statebit |= XT_CONNTRACK_STATE_DNAT;
+		}
+		if (FWINV((statebit & sinfo->statemask) == 0,
+			  XT_CONNTRACK_STATE))
+			return false;
+	}
+
+	if (ct == NULL) {
+		if (sinfo->flags & ~XT_CONNTRACK_STATE)
+			return false;
+		return true;
+	}
+
+	if (sinfo->flags & XT_CONNTRACK_PROTO &&
+	    FWINV(nf_ct_protonum(ct) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.protonum,
+		  XT_CONNTRACK_PROTO))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_ORIGSRC &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.src.u3.ip &
+		   sinfo->sipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].src.ip,
+		  XT_CONNTRACK_ORIGSRC))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_ORIGDST &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple.dst.u3.ip &
+		   sinfo->dipmsk[IP_CT_DIR_ORIGINAL].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_ORIGINAL].dst.ip,
+		  XT_CONNTRACK_ORIGDST))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_REPLSRC &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip &
+		   sinfo->sipmsk[IP_CT_DIR_REPLY].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_REPLY].src.ip,
+		  XT_CONNTRACK_REPLSRC))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_REPLDST &&
+	    FWINV((ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip &
+		   sinfo->dipmsk[IP_CT_DIR_REPLY].s_addr) !=
+		  sinfo->tuple[IP_CT_DIR_REPLY].dst.ip,
+		  XT_CONNTRACK_REPLDST))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_STATUS &&
+	    FWINV((ct->status & sinfo->statusmask) == 0,
+		  XT_CONNTRACK_STATUS))
+		return false;
+
+	if (sinfo->flags & XT_CONNTRACK_EXPIRES) {
+		unsigned long expires = timer_pending(&ct->timeout) ?
+					(ct->timeout.expires - jiffies)/HZ : 0;
+
+		if (FWINV(!(expires >= sinfo->expires_min &&
+			    expires <= sinfo->expires_max),
+			  XT_CONNTRACK_EXPIRES))
+			return false;
+	}
+	return true;
+#undef FWINV
+}
+
 static inline bool
 ct_proto_port_check_v3(const struct xt_conntrack_mtinfo3 *info,
 		       const struct nf_conn *ct)
@@ -245,6 +334,56 @@ conntrack_mt(const struct sk_buff *skb, struct xt_action_param *par,
 	return true;
 }
 
+#ifdef CONFIG_COMPAT
+struct compat_xt_conntrack_info
+{
+	compat_uint_t			statemask;
+	compat_uint_t			statusmask;
+	struct ip_conntrack_old_tuple	tuple[IP_CT_DIR_MAX];
+	struct in_addr			sipmsk[IP_CT_DIR_MAX];
+	struct in_addr			dipmsk[IP_CT_DIR_MAX];
+	compat_ulong_t			expires_min;
+	compat_ulong_t			expires_max;
+	u_int8_t			flags;
+	u_int8_t			invflags;
+};
+
+static void conntrack_mt_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_conntrack_info *cm = src;
+	struct xt_conntrack_info m = {
+		.statemask	= cm->statemask,
+		.statusmask	= cm->statusmask,
+		.expires_min	= cm->expires_min,
+		.expires_max	= cm->expires_max,
+		.flags		= cm->flags,
+		.invflags	= cm->invflags,
+	};
+	memcpy(m.tuple, cm->tuple, sizeof(m.tuple));
+	memcpy(m.sipmsk, cm->sipmsk, sizeof(m.sipmsk));
+	memcpy(m.dipmsk, cm->dipmsk, sizeof(m.dipmsk));
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int conntrack_mt_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_conntrack_info *m = src;
+	struct compat_xt_conntrack_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.statemask	= m->statemask;
+	cm.statusmask	= m->statusmask;
+	cm.expires_min	= m->expires_min;
+	cm.expires_max	= m->expires_max;
+	cm.flags	= m->flags;
+	cm.invflags	= m->invflags;
+	memcpy(cm.tuple, m->tuple, sizeof(cm.tuple));
+	memcpy(cm.sipmsk, m->sipmsk, sizeof(cm.sipmsk));
+	memcpy(cm.dipmsk, m->dipmsk, sizeof(cm.dipmsk));
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif
+
 static bool
 conntrack_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
 {
@@ -277,6 +416,8 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
@@ -288,6 +429,21 @@ static void conntrack_mt_destroy(const struct xt_mtdtor_param *par)
 static struct xt_match conntrack_mt_reg[] __read_mostly = {
 	{
 		.name       = "conntrack",
+		.revision   = 0,
+		.family     = NFPROTO_UNSPEC,
+		.match      = conntrack_mt_v0,
+		.checkentry = conntrack_mt_check,
+		.destroy    = conntrack_mt_destroy,
+		.matchsize  = sizeof(struct xt_conntrack_info),
+		.me         = THIS_MODULE,
+#ifdef CONFIG_COMPAT
+		.compatsize       = sizeof(struct compat_xt_conntrack_info),
+		.compat_from_user = conntrack_mt_compat_from_user_v0,
+		.compat_to_user   = conntrack_mt_compat_to_user_v0,
+#endif
+	},
+	{
+		.name       = "conntrack",
 		.revision   = 1,
 		.family     = NFPROTO_UNSPEC,
 		.matchsize  = sizeof(struct xt_conntrack_mtinfo1),
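
The FWINV() macro used by conntrack_mt_v0() above reduces every per-field
check to "predicate XOR inversion flag".  A minimal userspace sketch of the
idiom, with a made-up flag value (an illustration, not part of the patch):

#include <stdbool.h>
#include <stdio.h>

#define MY_INV_STATE	0x01		/* hypothetical inversion bit */

static unsigned char invflags;		/* stands in for sinfo->invflags */
#define FWINV(cond, flg) ((bool)(cond) ^ !!(invflags & (flg)))

int main(void)
{
	invflags = 0;			/* no inversion: predicate decides */
	printf("%d\n", FWINV(1, MY_INV_STATE));	/* 1: match */

	invflags = MY_INV_STATE;	/* inverted: true predicate = no match */
	printf("%d\n", FWINV(1, MY_INV_STATE));	/* 0 */
	printf("%d\n", FWINV(0, MY_INV_STATE));	/* 1 */
	return 0;
}
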
--- a/net/netfilter/xt_dscp.c
+++ b/net/netfilter/xt_dscp.c
@@ -54,6 +54,14 @@ static int dscp_mt_check(const struct xt_mtchk_param *par)
 	return 0;
 }
 
+static bool
+tos_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_tos_match_info *info = par->matchinfo;
+
+	return (ip_hdr(skb)->tos == info->tos_value) ^ info->invert;
+}
+
 static bool tos_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_tos_match_info *info = par->matchinfo;
@@ -85,6 +93,14 @@ static struct xt_match dscp_mt_reg[] __read_mostly = {
 	},
 	{
 		.name		= "tos",
+		.revision	= 0,
+		.family		= NFPROTO_IPV4,
+		.match		= tos_mt_v0,
+		.matchsize	= sizeof(struct xt_tos_match_info),
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "tos",
 		.revision	= 1,
 		.family		= NFPROTO_IPV4,
 		.match		= tos_mt,
--- a/net/netfilter/xt_hashlimit.c
+++ b/net/netfilter/xt_hashlimit.c
@@ -262,7 +262,7 @@ static int htable_create(struct net *net, struct xt_hashlimit_mtinfo1 *minfo,
 	}
 	spin_lock_init(&hinfo->lock);
 
-	hinfo->pde = proc_create_data(minfo->name, 0,
+	hinfo->pde = proc_net_create_data(minfo->name, 0,
 		(family == NFPROTO_IPV4) ?
 		hashlimit_net->ipt_hashlimit : hashlimit_net->ip6t_hashlimit,
 		&dl_file_ops, hinfo);
@@ -867,11 +867,11 @@ static int __net_init hashlimit_proc_net_init(struct net *net)
 {
 	struct hashlimit_net *hashlimit_net = hashlimit_pernet(net);
 
-	hashlimit_net->ipt_hashlimit = proc_mkdir("ipt_hashlimit", net->proc_net);
+	hashlimit_net->ipt_hashlimit = proc_net_mkdir(net, "ipt_hashlimit", net->proc_net);
 	if (!hashlimit_net->ipt_hashlimit)
 		return -ENOMEM;
 #if IS_ENABLED(CONFIG_IP6_NF_IPTABLES)
-	hashlimit_net->ip6t_hashlimit = proc_mkdir("ip6t_hashlimit", net->proc_net);
+	hashlimit_net->ip6t_hashlimit = proc_net_mkdir(net, "ip6t_hashlimit", net->proc_net);
 	if (!hashlimit_net->ip6t_hashlimit) {
 		remove_proc_entry("ipt_hashlimit", net->proc_net);
 		return -ENOMEM;
--- a/net/netfilter/xt_helper.c
+++ b/net/netfilter/xt_helper.c
@@ -66,6 +66,7 @@ static int helper_mt_check(const struct xt_mtchk_param *par)
 		return ret;
 	}
 	info->name[29] = '\0';
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_iprange.c
+++ b/net/netfilter/xt_iprange.c
@@ -17,6 +17,39 @@
 #include <linux/netfilter/xt_iprange.h>
 
 static bool
+iprange_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_iprange_mtinfo *info = par->matchinfo;
+	const struct iphdr *iph = ip_hdr(skb);
+
+	if (info->flags & IPRANGE_SRC) {
+		if ((ntohl(iph->saddr) < ntohl(info->src_min.ip)
+			  || ntohl(iph->saddr) > ntohl(info->src_max.ip))
+			 ^ !!(info->flags & IPRANGE_SRC_INV)) {
+			pr_debug("src IP %pI4 NOT in range %s%pI4-%pI4\n",
+				 &iph->saddr,
+				 info->flags & IPRANGE_SRC_INV ? "(INV) " : "",
+				 &info->src_min.ip,
+				 &info->src_max.ip);
+			return false;
+		}
+	}
+	if (info->flags & IPRANGE_DST) {
+		if ((ntohl(iph->daddr) < ntohl(info->dst_min.ip)
+			  || ntohl(iph->daddr) > ntohl(info->dst_max.ip))
+			 ^ !!(info->flags & IPRANGE_DST_INV)) {
+			pr_debug("dst IP %pI4 NOT in range %s%pI4-%pI4\n",
+				 &iph->daddr,
+				 info->flags & IPRANGE_DST_INV ? "(INV) " : "",
+				 &info->dst_min.ip,
+				 &info->dst_max.ip);
+			return false;
+		}
+	}
+	return true;
+}
+
+static bool
 iprange_mt4(const struct sk_buff *skb, struct xt_action_param *par)
 {
 	const struct xt_iprange_mtinfo *info = par->matchinfo;
@@ -104,6 +137,14 @@ iprange_mt6(const struct sk_buff *skb, struct xt_action_param *par)
 static struct xt_match iprange_mt_reg[] __read_mostly = {
 	{
 		.name      = "iprange",
+		.revision  = 0,
+		.family    = NFPROTO_IPV4,
+		.match     = iprange_mt_v0,
+		.matchsize = sizeof(struct xt_iprange_mtinfo),
+		.me        = THIS_MODULE,
+	},
+	{
+		.name      = "iprange",
 		.revision  = 1,
 		.family    = NFPROTO_IPV4,
 		.match     = iprange_mt4,
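
The v0 iprange match above converts addresses to host byte order before
comparing, which is what makes a numeric range test meaningful for
big-endian on-wire addresses.  The comparison in isolation, with assumed
addresses (a sketch, not part of the patch):

#include <arpa/inet.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Range test in host byte order, as iprange_mt_v0() does via ntohl(). */
static bool addr_in_range(uint32_t addr, uint32_t min, uint32_t max)
{
	return ntohl(addr) >= ntohl(min) && ntohl(addr) <= ntohl(max);
}

int main(void)
{
	uint32_t min = inet_addr("10.0.0.1");
	uint32_t max = inet_addr("10.0.0.100");

	printf("%d\n", addr_in_range(inet_addr("10.0.0.50"), min, max)); /* 1 */
	printf("%d\n", addr_in_range(inet_addr("10.0.1.50"), min, max)); /* 0 */
	return 0;
}
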
--- a/net/netfilter/xt_ipvs.c
+++ b/net/netfilter/xt_ipvs.c
@@ -161,6 +161,7 @@ static int ipvs_mt_check(const struct xt_mtchk_param *par)
 		return -EINVAL;
 	}
 
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
--- a/net/netfilter/xt_limit.c
+++ b/net/netfilter/xt_limit.c
@@ -107,8 +107,8 @@ static int limit_mt_check(const struct xt_mtchk_param *par)
 	/* Check for overflow. */
 	if (r->burst == 0
 	    || user2credits(r->avg * r->burst) < user2credits(r->avg)) {
-		pr_info("Overflow, try lower: %u/%u\n",
-			r->avg, r->burst);
+		ve_printk(VE_LOG, "Overflow, try lower: %u/%u\n",
+				  r->avg, r->burst);
 		return -ERANGE;
 	}
 
--- a/net/netfilter/xt_mark.c
+++ b/net/netfilter/xt_mark.c
@@ -24,6 +24,55 @@ MODULE_ALIAS("ip6t_mark");
 MODULE_ALIAS("ipt_MARK");
 MODULE_ALIAS("ip6t_MARK");
 
+static bool
+mark_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_mark_info *info = par->matchinfo;
+
+	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static bool
+mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct xt_mark_mtinfo1 *info = par->matchinfo;
+
+	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+}
+
+static unsigned int
+mark_tg_v0(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_mark_target_info *markinfo = par->targinfo;
+
+	skb->mark = markinfo->mark;
+	return XT_CONTINUE;
+}
+
+static unsigned int
+mark_tg_v1(struct sk_buff *skb, const struct xt_action_param *par)
+{
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+	int mark = 0;
+
+	switch (markinfo->mode) {
+	case XT_MARK_SET:
+		mark = markinfo->mark;
+		break;
+
+	case XT_MARK_AND:
+		mark = skb->mark & markinfo->mark;
+		break;
+
+	case XT_MARK_OR:
+		mark = skb->mark | markinfo->mark;
+		break;
+	}
+
+	skb->mark = mark;
+	return XT_CONTINUE;
+}
+
 static unsigned int
 mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
@@ -33,42 +82,220 @@ mark_tg(struct sk_buff *skb, const struct xt_action_param *par)
 	return XT_CONTINUE;
 }
 
-static bool
-mark_mt(const struct sk_buff *skb, struct xt_action_param *par)
+static int mark_tg_check_v0(const struct xt_tgchk_param *par)
 {
-	const struct xt_mark_mtinfo1 *info = par->matchinfo;
+	const struct xt_mark_target_info *markinfo = par->targinfo;
 
-	return ((skb->mark & info->mask) == info->mark) ^ info->invert;
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return -EINVAL;
+	}
+	return 0;
 }
 
-static struct xt_target mark_tg_reg __read_mostly = {
-	.name           = "MARK",
-	.revision       = 2,
-	.family         = NFPROTO_UNSPEC,
-	.target         = mark_tg,
-	.targetsize     = sizeof(struct xt_mark_tginfo2),
-	.me             = THIS_MODULE,
+static int mark_tg_check_v1(const struct xt_tgchk_param *par)
+{
+	const struct xt_mark_target_info_v1 *markinfo = par->targinfo;
+
+	if (markinfo->mode != XT_MARK_SET
+	    && markinfo->mode != XT_MARK_AND
+	    && markinfo->mode != XT_MARK_OR) {
+		printk(KERN_WARNING "MARK: unknown mode %u\n",
+		       markinfo->mode);
+		return -EINVAL;
+	}
+	if (markinfo->mark > 0xffffffff) {
+		printk(KERN_WARNING "MARK: Only supports 32bit wide mark\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_target_info {
+	compat_ulong_t	mark;
 };
 
-static struct xt_match mark_mt_reg __read_mostly = {
-	.name           = "mark",
-	.revision       = 1,
-	.family         = NFPROTO_UNSPEC,
-	.match          = mark_mt,
-	.matchsize      = sizeof(struct xt_mark_mtinfo1),
-	.me             = THIS_MODULE,
+static void mark_tg_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_mark_target_info *cm = src;
+	struct xt_mark_target_info m = {
+		.mark	= cm->mark,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_mark_target_info *m = src;
+	struct compat_xt_mark_target_info cm = {
+		.mark	= m->mark,
+	};
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+
+struct compat_xt_mark_target_info_v1 {
+	compat_ulong_t	mark;
+	u_int8_t	mode;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void mark_tg_compat_from_user_v1(void *dst, const void *src)
+{
+	const struct compat_xt_mark_target_info_v1 *cm = src;
+	struct xt_mark_target_info_v1 m = {
+		.mark	= cm->mark,
+		.mode	= cm->mode,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_tg_compat_to_user_v1(void __user *dst, const void *src)
+{
+	const struct xt_mark_target_info_v1 *m = src;
+	struct compat_xt_mark_target_info_v1 cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark = m->mark;
+	cm.mode = m->mode;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_target mark_tg_reg[] __read_mostly = {
+	{
+		.name		= "MARK",
+		.family		= NFPROTO_UNSPEC,
+		.revision	= 0,
+		.checkentry	= mark_tg_check_v0,
+		.target		= mark_tg_v0,
+		.targetsize	= sizeof(struct xt_mark_target_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_target_info),
+		.compat_from_user = mark_tg_compat_from_user_v0,
+		.compat_to_user	= mark_tg_compat_to_user_v0,
+#endif
+		/*
+		 * To support RHEL5 containers, which use the iptables 1.3.5
+		 * series (and therefore @revision = 1), we drop @table here
+		 * so that the kernel does not complain when a MARK rule is
+		 * set up in the fashion of the iptables 1.4.2 series (which
+		 * uses @revision = 2).
+		 */
+		/* .table		= "mangle", */
+		.me		= THIS_MODULE,
+	},
+	{
+		.name		= "MARK",
+		.family		= NFPROTO_UNSPEC,
+		.revision	= 1,
+		.checkentry	= mark_tg_check_v1,
+		.target		= mark_tg_v1,
+		.targetsize	= sizeof(struct xt_mark_target_info_v1),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_target_info_v1),
+		.compat_from_user = mark_tg_compat_from_user_v1,
+		.compat_to_user	= mark_tg_compat_to_user_v1,
+#endif
+		/*
+		 * To support RHEL5 containers, which use the iptables 1.3.5
+		 * series (and therefore @revision = 1), we drop @table here
+		 * so that the kernel does not complain when a MARK rule is
+		 * set up in the fashion of the iptables 1.4.2 series (which
+		 * uses @revision = 2).
+		 */
+		/* .table		= "mangle", */
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "MARK",
+		.revision       = 2,
+		.family         = NFPROTO_UNSPEC,
+		.target         = mark_tg,
+		.targetsize     = sizeof(struct xt_mark_tginfo2),
+		.me             = THIS_MODULE,
+	},
+};
+
+static int mark_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct xt_mark_info *minfo = par->matchinfo;
+
+	if (minfo->mark > 0xffffffff || minfo->mask > 0xffffffff) {
+		printk(KERN_WARNING "mark: only supports 32bit mark\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_xt_mark_info {
+	compat_ulong_t	mark, mask;
+	u_int8_t	invert;
+	u_int8_t	__pad1;
+	u_int16_t	__pad2;
+};
+
+static void mark_mt_compat_from_user_v0(void *dst, const void *src)
+{
+	const struct compat_xt_mark_info *cm = src;
+	struct xt_mark_info m = {
+		.mark	= cm->mark,
+		.mask	= cm->mask,
+		.invert	= cm->invert,
+	};
+	memcpy(dst, &m, sizeof(m));
+}
+
+static int mark_mt_compat_to_user_v0(void __user *dst, const void *src)
+{
+	const struct xt_mark_info *m = src;
+	struct compat_xt_mark_info cm;
+
+	memset(&cm, 0, sizeof(cm));
+	cm.mark = m->mark;
+	cm.mask = m->mask;
+	cm.invert = m->invert;
+	return copy_to_user(dst, &cm, sizeof(cm)) ? -EFAULT : 0;
+}
+#endif /* CONFIG_COMPAT */
+
+static struct xt_match mark_mt_reg[] __read_mostly = {
+	{
+		.name		= "mark",
+		.revision	= 0,
+		.family		= NFPROTO_UNSPEC,
+		.checkentry	= mark_mt_check_v0,
+		.match		= mark_mt_v0,
+		.matchsize	= sizeof(struct xt_mark_info),
+#ifdef CONFIG_COMPAT
+		.compatsize	= sizeof(struct compat_xt_mark_info),
+		.compat_from_user = mark_mt_compat_from_user_v0,
+		.compat_to_user	= mark_mt_compat_to_user_v0,
+#endif
+		.me		= THIS_MODULE,
+	},
+	{
+		.name           = "mark",
+		.revision       = 1,
+		.family         = NFPROTO_UNSPEC,
+		.match          = mark_mt,
+		.matchsize      = sizeof(struct xt_mark_mtinfo1),
+		.me             = THIS_MODULE,
+	},
 };
 
 static int __init mark_mt_init(void)
 {
 	int ret;
 
-	ret = xt_register_target(&mark_tg_reg);
+	ret = xt_register_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 	if (ret < 0)
 		return ret;
-	ret = xt_register_match(&mark_mt_reg);
+	ret = xt_register_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
 	if (ret < 0) {
-		xt_unregister_target(&mark_tg_reg);
+		xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 		return ret;
 	}
 	return 0;
@@ -76,8 +303,8 @@ static int __init mark_mt_init(void)
 
 static void __exit mark_mt_exit(void)
 {
-	xt_unregister_match(&mark_mt_reg);
-	xt_unregister_target(&mark_tg_reg);
+	xt_unregister_matches(mark_mt_reg, ARRAY_SIZE(mark_mt_reg));
+	xt_unregister_targets(mark_tg_reg, ARRAY_SIZE(mark_tg_reg));
 }
 
 module_init(mark_mt_init);
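
The restored v1 MARK target is a three-way switch over plain bit operations
on skb->mark.  The same table in a standalone sketch (the enum values mirror
the kernel's xt_mark_targets; expected output in comments):

#include <stdio.h>

enum { XT_MARK_SET = 0, XT_MARK_AND, XT_MARK_OR };

/* Same switch as mark_tg_v1(), applied to a plain value. */
static unsigned long apply_mark(unsigned long old, unsigned long mark, int mode)
{
	switch (mode) {
	case XT_MARK_SET:
		return mark;
	case XT_MARK_AND:
		return old & mark;
	case XT_MARK_OR:
		return old | mark;
	}
	return old;
}

int main(void)
{
	printf("0x%lx\n", apply_mark(0xff, 0x10, XT_MARK_SET)); /* 0x10 */
	printf("0x%lx\n", apply_mark(0xff, 0x0f, XT_MARK_AND)); /* 0xf */
	printf("0x%lx\n", apply_mark(0xf0, 0x0f, XT_MARK_OR));  /* 0xff */
	return 0;
}
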
--- a/net/netfilter/xt_nat.c
+++ b/net/netfilter/xt_nat.c
@@ -23,6 +23,13 @@ static int xt_nat_checkentry_v0(const struct xt_tgchk_param *par)
 			par->target->name);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
+	return 0;
+}
+
+static int xt_nat_checkentry_v1(const struct xt_tgchk_param *par)
+{
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -129,6 +136,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
 	{
 		.name		= "SNAT",
 		.revision	= 1,
+		.checkentry	= xt_nat_checkentry_v1,
 		.target		= xt_snat_target_v1,
 		.targetsize	= sizeof(struct nf_nat_range),
 		.table		= "nat",
@@ -139,6 +147,7 @@ static struct xt_target xt_nat_target_reg[] __read_mostly = {
 	{
 		.name		= "DNAT",
 		.revision	= 1,
+		.checkentry	= xt_nat_checkentry_v1,
 		.target		= xt_dnat_target_v1,
 		.targetsize	= sizeof(struct nf_nat_range),
 		.table		= "nat",
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -422,5 +422,7 @@ module_exit(xt_osf_fini);
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Evgeniy Polyakov <zbr@ioremap.net>");
+MODULE_ALIAS("ipt_osf");
+MODULE_ALIAS("ip6t_osf");
 MODULE_DESCRIPTION("Passive OS fingerprint matching.");
 MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF);
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -17,6 +17,66 @@
 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_owner.h>
 
+static bool
+owner_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ipt_owner_info *info = par->matchinfo;
+	const struct file *filp;
+
+	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+		return false;
+
+	filp = skb->sk->sk_socket->file;
+	if (filp == NULL)
+		return false;
+
+	if (info->match & XT_OWNER_UID) {
+		kuid_t uid = make_kuid(&init_user_ns, info->uid);
+		if ((!uid_eq(filp->f_cred->fsuid, uid)) ^
+		    !!(info->invert & XT_OWNER_UID))
+			return false;
+	}
+
+	if (info->match & XT_OWNER_GID) {
+		kgid_t gid = make_kgid(&init_user_ns, info->gid);
+		if ((!gid_eq(filp->f_cred->fsgid, gid)) ^
+		    !!(info->invert & XT_OWNER_GID))
+			return false;
+	}
+
+	return true;
+}
+
+static bool
+owner_mt6_v0(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	const struct ip6t_owner_info *info = par->matchinfo;
+	const struct file *filp;
+
+	if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+		return false;
+
+	filp = skb->sk->sk_socket->file;
+	if (filp == NULL)
+		return false;
+
+	if (info->match & XT_OWNER_UID) {
+		kuid_t uid = make_kuid(&init_user_ns, info->uid);
+		if ((!uid_eq(filp->f_cred->fsuid, uid)) ^
+		    !!(info->invert & XT_OWNER_UID))
+			return false;
+	}
+
+	if (info->match & XT_OWNER_GID) {
+		kgid_t gid = make_kgid(&init_user_ns, info->gid);
+		if ((!gid_eq(filp->f_cred->fsgid, gid)) ^
+		    !!(info->invert & XT_OWNER_GID))
+			return false;
+	}
+
+	return true;
+}
+
 static int owner_check(const struct xt_mtchk_param *par)
 {
 	struct xt_owner_match_info *info = par->matchinfo;
@@ -69,26 +129,77 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 	return true;
 }
 
-static struct xt_match owner_mt_reg __read_mostly = {
-	.name       = "owner",
-	.revision   = 1,
-	.family     = NFPROTO_UNSPEC,
-	.checkentry = owner_check,
-	.match      = owner_mt,
-	.matchsize  = sizeof(struct xt_owner_match_info),
-	.hooks      = (1 << NF_INET_LOCAL_OUT) |
-	              (1 << NF_INET_POST_ROUTING),
-	.me         = THIS_MODULE,
+static int owner_mt_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct ipt_owner_info *info = par->matchinfo;
+
+	if (info->match & ~(XT_OWNER_UID | XT_OWNER_GID)) {
+		printk(KERN_WARNING KBUILD_MODNAME
+		       ": PID, SID and command matching is not "
+		       "supported anymore\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int owner_mt6_check_v0(const struct xt_mtchk_param *par)
+{
+	const struct ip6t_owner_info *info = par->matchinfo;
+
+	if (info->match & ~(XT_OWNER_UID | XT_OWNER_GID)) {
+		printk(KERN_WARNING KBUILD_MODNAME
+		       ": PID and SID matching is not supported anymore\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+static struct xt_match owner_mt_reg[] __read_mostly = {
+	{
+		.name       = "owner",
+		.revision   = 0,
+		.family     = NFPROTO_IPV4,
+		.match      = owner_mt_v0,
+		.matchsize  = sizeof(struct ipt_owner_info),
+		.checkentry = owner_mt_check_v0,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 0,
+		.family     = NFPROTO_IPV6,
+		.match      = owner_mt6_v0,
+		.matchsize  = sizeof(struct ip6t_owner_info),
+		.checkentry = owner_mt6_check_v0,
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
+	{
+		.name       = "owner",
+		.revision   = 1,
+		.family     = NFPROTO_UNSPEC,
+		.checkentry = owner_check,
+		.match      = owner_mt,
+		.matchsize  = sizeof(struct xt_owner_match_info),
+		.hooks      = (1 << NF_INET_LOCAL_OUT) |
+		              (1 << NF_INET_POST_ROUTING),
+		.me         = THIS_MODULE,
+	},
 };
 
 static int __init owner_mt_init(void)
 {
-	return xt_register_match(&owner_mt_reg);
+	return xt_register_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 static void __exit owner_mt_exit(void)
 {
-	xt_unregister_match(&owner_mt_reg);
+	xt_unregister_matches(owner_mt_reg, ARRAY_SIZE(owner_mt_reg));
 }
 
 module_init(owner_mt_init);
--- a/net/netfilter/xt_recent.c
+++ b/net/netfilter/xt_recent.c
@@ -394,7 +394,7 @@ static int recent_mt_check(const struct xt_mtchk_param *par,
 		ret = -EINVAL;
 		goto out;
 	}
-	pde = proc_create_data(t->name, ip_list_perms, recent_net->xt_recent,
+	pde = proc_net_create_data(t->name, ip_list_perms, recent_net->xt_recent,
 		  &recent_mt_fops, t);
 	if (pde == NULL) {
 		recent_table_free(t);
@@ -618,7 +618,7 @@ static int __net_init recent_proc_net_init(struct net *net)
 {
 	struct recent_net *recent_net = recent_pernet(net);
 
-	recent_net->xt_recent = proc_mkdir("xt_recent", net->proc_net);
+	recent_net->xt_recent = proc_net_mkdir(net, "xt_recent", net->proc_net);
 	if (!recent_net->xt_recent)
 		return -ENOMEM;
 	return 0;
--- a/net/netfilter/xt_set.c
+++ b/net/netfilter/xt_set.c
@@ -81,7 +81,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
 	struct xt_set_info_match_v0 *info = par->matchinfo;
 	ip_set_id_t index;
 
-	index = ip_set_nfnl_get_byindex(info->match_set.index);
+	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
 
 	if (index == IPSET_INVALID_ID) {
 		pr_warning("Cannot find set indentified by id %u to match\n",
@@ -91,7 +91,7 @@ set_match_v0_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.u.flags[IPSET_DIM_MAX-1] != 0) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
-		ip_set_nfnl_put(info->match_set.index);
+		ip_set_nfnl_put(par->net, info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -106,7 +106,7 @@ set_match_v0_destroy(const struct xt_mtdtor_param *par)
 {
 	struct xt_set_info_match_v0 *info = par->matchinfo;
 
-	ip_set_nfnl_put(info->match_set.index);
+	ip_set_nfnl_put(par->net, info->match_set.index);
 }
 
 static unsigned int
@@ -133,7 +133,7 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 	ip_set_id_t index;
 
 	if (info->add_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->add_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find add_set index %u as target\n",
 				   info->add_set.index);
@@ -142,12 +142,12 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 	}
 
 	if (info->del_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->del_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
 			if (info->add_set.index != IPSET_INVALID_ID)
-				ip_set_nfnl_put(info->add_set.index);
+				ip_set_nfnl_put(par->net, info->add_set.index);
 			return -ENOENT;
 		}
 	}
@@ -156,9 +156,9 @@ set_target_v0_checkentry(const struct xt_tgchk_param *par)
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
 		if (info->add_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->add_set.index);
+			ip_set_nfnl_put(par->net, info->add_set.index);
 		if (info->del_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->del_set.index);
+			ip_set_nfnl_put(par->net, info->del_set.index);
 		return -ERANGE;
 	}
 
@@ -175,9 +175,9 @@ set_target_v0_destroy(const struct xt_tgdtor_param *par)
 	const struct xt_set_info_target_v0 *info = par->targinfo;
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->add_set.index);
+		ip_set_nfnl_put(par->net, info->add_set.index);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->del_set.index);
+		ip_set_nfnl_put(par->net, info->del_set.index);
 }
 
 /* Revision 1 match and target */
@@ -202,7 +202,7 @@ set_match_v1_checkentry(const struct xt_mtchk_param *par)
 	struct xt_set_info_match_v1 *info = par->matchinfo;
 	ip_set_id_t index;
 
-	index = ip_set_nfnl_get_byindex(info->match_set.index);
+	index = ip_set_nfnl_get_byindex(par->net, info->match_set.index);
 
 	if (index == IPSET_INVALID_ID) {
 		pr_warning("Cannot find set indentified by id %u to match\n",
@@ -212,7 +212,7 @@ set_match_v1_checkentry(const struct xt_mtchk_param *par)
 	if (info->match_set.dim > IPSET_DIM_MAX) {
 		pr_warning("Protocol error: set match dimension "
 			   "is over the limit!\n");
-		ip_set_nfnl_put(info->match_set.index);
+		ip_set_nfnl_put(par->net, info->match_set.index);
 		return -ERANGE;
 	}
 
@@ -224,7 +224,7 @@ set_match_v1_destroy(const struct xt_mtdtor_param *par)
 {
 	struct xt_set_info_match_v1 *info = par->matchinfo;
 
-	ip_set_nfnl_put(info->match_set.index);
+	ip_set_nfnl_put(par->net, info->match_set.index);
 }
 
 static unsigned int
@@ -251,7 +251,7 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)
 	ip_set_id_t index;
 
 	if (info->add_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->add_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->add_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find add_set index %u as target\n",
 				   info->add_set.index);
@@ -260,12 +260,12 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)
 	}
 
 	if (info->del_set.index != IPSET_INVALID_ID) {
-		index = ip_set_nfnl_get_byindex(info->del_set.index);
+		index = ip_set_nfnl_get_byindex(par->net, info->del_set.index);
 		if (index == IPSET_INVALID_ID) {
 			pr_warning("Cannot find del_set index %u as target\n",
 				   info->del_set.index);
 			if (info->add_set.index != IPSET_INVALID_ID)
-				ip_set_nfnl_put(info->add_set.index);
+				ip_set_nfnl_put(par->net, info->add_set.index);
 			return -ENOENT;
 		}
 	}
@@ -274,9 +274,9 @@ set_target_v1_checkentry(const struct xt_tgchk_param *par)
 		pr_warning("Protocol error: SET target dimension "
 			   "is over the limit!\n");
 		if (info->add_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->add_set.index);
+			ip_set_nfnl_put(par->net, info->add_set.index);
 		if (info->del_set.index != IPSET_INVALID_ID)
-			ip_set_nfnl_put(info->del_set.index);
+			ip_set_nfnl_put(par->net, info->del_set.index);
 		return -ERANGE;
 	}
 
@@ -289,9 +289,9 @@ set_target_v1_destroy(const struct xt_tgdtor_param *par)
 	const struct xt_set_info_target_v1 *info = par->targinfo;
 
 	if (info->add_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->add_set.index);
+		ip_set_nfnl_put(par->net, info->add_set.index);
 	if (info->del_set.index != IPSET_INVALID_ID)
-		ip_set_nfnl_put(info->del_set.index);
+		ip_set_nfnl_put(par->net, info->del_set.index);
 }
 
 /* Revision 2 target */
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -388,6 +388,12 @@ socket_mt6_v1_v2(const struct sk_buff *skb, struct xt_action_param *par)
 }
 #endif
 
+static int socket_mt_v0_check(const struct xt_mtchk_param *par)
+{
+	allow_conntrack_allocation(par->net);
+	return 0;
+}
+
 static int socket_mt_v1_check(const struct xt_mtchk_param *par)
 {
 	const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo;
@@ -396,6 +402,7 @@ static int socket_mt_v1_check(const struct xt_mtchk_param *par)
 		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V1);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -407,6 +414,7 @@ static int socket_mt_v2_check(const struct xt_mtchk_param *par)
 		pr_info("unknown flags 0x%x\n", info->flags & ~XT_SOCKET_FLAGS_V2);
 		return -EINVAL;
 	}
+	allow_conntrack_allocation(par->net);
 	return 0;
 }
 
@@ -416,6 +424,7 @@ static struct xt_match socket_mt_reg[] __read_mostly = {
 		.revision	= 0,
 		.family		= NFPROTO_IPV4,
 		.match		= socket_mt4_v0,
+		.checkentry	= socket_mt_v0_check,
 		.hooks		= (1 << NF_INET_PRE_ROUTING) |
 				  (1 << NF_INET_LOCAL_IN),
 		.me		= THIS_MODULE,
--- a/net/netfilter/xt_state.c
+++ b/net/netfilter/xt_state.c
@@ -47,6 +47,8 @@ static int state_mt_check(const struct xt_mtchk_param *par)
 	if (ret < 0)
 		pr_info("cannot load conntrack support for proto=%u\n",
 			par->family);
+	else
+		allow_conntrack_allocation(par->net);
 	return ret;
 }
 
--- /dev/null
+++ b/net/netfilter/xt_wdog_tmo.c
@@ -0,0 +1,54 @@
+/*
+ *  net/netfilter/xt_wdog_tmo.c
+ *
+ *  Copyright (c) 2013-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/file.h>
+#include <net/sock.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/fence-watchdog.h>
+
+static bool
+wdog_tmo_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+	return fence_wdog_tmo_match();
+}
+
+static int wdog_tmo_mt_check(const struct xt_mtchk_param *par)
+{
+	if (!ve_is_super(get_exec_env()))
+		return -EPERM;
+	return 0;
+}
+
+static struct xt_match wdog_tmo_mt_reg __read_mostly = {
+	.name       = "wdog_tmo",
+	.revision   = 0,
+	.family     = NFPROTO_UNSPEC,
+	.match      = wdog_tmo_mt,
+	.checkentry = wdog_tmo_mt_check,
+	.matchsize  = 0,
+	.me         = THIS_MODULE,
+};
+
+static int __init wdog_tmo_mt_init(void)
+{
+	return xt_register_match(&wdog_tmo_mt_reg);
+}
+
+static void __exit wdog_tmo_mt_exit(void)
+{
+	xt_unregister_match(&wdog_tmo_mt_reg);
+}
+
+module_init(wdog_tmo_mt_init);
+module_exit(wdog_tmo_mt_exit);
+MODULE_AUTHOR("Dmitry Guryanov <dguryanov@parallels.com>");
+MODULE_DESCRIPTION("Xtables: fence watchdog timeout matching");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_wdog_tmo");
+MODULE_ALIAS("ip6t_wdog_tmo");
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -77,13 +77,6 @@ struct listeners {
 /* state bits */
 #define NETLINK_S_CONGESTED		0x0
 
-/* flags */
-#define NETLINK_F_KERNEL_SOCKET		0x1
-#define NETLINK_F_RECV_PKTINFO		0x2
-#define NETLINK_F_BROADCAST_SEND_ERROR	0x4
-#define NETLINK_F_RECV_NO_ENOBUFS	0x8
-#define NETLINK_F_LISTEN_ALL_NSID	0x10
-
 static inline int netlink_is_kernel(struct sock *sk)
 {
 	return nlk_sk(sk)->flags & NETLINK_F_KERNEL_SOCKET;
@@ -1628,7 +1622,13 @@ static struct sk_buff *netlink_alloc_large_skb(unsigned int size,
 	struct sk_buff *skb;
 	void *data;
 
-	if (size <= NLMSG_GOODSIZE || broadcast)
+	if (size <= NLMSG_GOODSIZE || broadcast ||
+			/*
+			 * Once we have a vmalloc_kmem() that charges the
+			 * allocated pages to memcg, this check can be
+			 * removed.
+			 */
+			!ve_is_super(get_exec_env()))
 		return alloc_skb(size, GFP_KERNEL);
 
 	size = SKB_DATA_ALIGN(size) +
@@ -1784,6 +1784,7 @@ static int netlink_unicast_kernel(struct sock *sk, struct sk_buff *skb,
 int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
 		    u32 portid, int nonblock)
 {
+	struct netlink_sock *nlk = nlk_sk(ssk);
 	struct sock *sk;
 	int err;
 	long timeo;
@@ -1792,19 +1793,24 @@ int netlink_unicast(struct sock *ssk, struct sk_buff *skb,
 
 	timeo = sock_sndtimeo(ssk, nonblock);
 retry:
-	sk = netlink_getsockbyportid(ssk, portid);
-	if (IS_ERR(sk)) {
-		kfree_skb(skb);
-		return PTR_ERR(sk);
-	}
-	if (netlink_is_kernel(sk))
-		return netlink_unicast_kernel(sk, skb, ssk);
+	if (nlk->flags & NETLINK_F_REPAIR) {
+		sk = ssk;
+		sock_hold(sk);
+	} else {
+		sk = netlink_getsockbyportid(ssk, portid);
+		if (IS_ERR(sk)) {
+			kfree_skb(skb);
+			return PTR_ERR(sk);
+		}
+		if (netlink_is_kernel(sk))
+			return netlink_unicast_kernel(sk, skb, ssk);
 
-	if (sk_filter(sk, skb)) {
-		err = skb->len;
-		kfree_skb(skb);
-		sock_put(sk);
-		return err;
+		if (sk_filter(sk, skb)) {
+			err = skb->len;
+			kfree_skb(skb);
+			sock_put(sk);
+			return err;
+		}
 	}
 
 	err = netlink_attachskb(sk, skb, &timeo, ssk);
@@ -2166,6 +2172,13 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
 		return -EFAULT;
 
 	switch (optname) {
+	case NETLINK_REPAIR:
+		if (val)
+			nlk->flags |= NETLINK_F_REPAIR;
+		else
+			nlk->flags &= ~NETLINK_F_REPAIR;
+		err = 0;
+		break;
 	case NETLINK_PKTINFO:
 		if (val)
 			nlk->flags |= NETLINK_F_RECV_PKTINFO;
@@ -2330,6 +2343,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	int err;
 	struct scm_cookie scm;
 	u32 netlink_skb_flags = 0;
+	bool repair = nlk->flags & NETLINK_F_REPAIR;
 
 	if (msg->msg_flags&MSG_OOB)
 		return -EOPNOTSUPP;
@@ -2349,7 +2363,8 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		dst_group = ffs(addr->nl_groups);
 		err =  -EPERM;
 		if ((dst_group || dst_portid) &&
-		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND))
+		    !netlink_allowed(sock, NL_CFG_F_NONROOT_SEND) &&
+		    !repair)
 			goto out;
 		netlink_skb_flags |= NETLINK_SKB_DST;
 	} else {
@@ -2381,7 +2396,11 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 	if (skb == NULL)
 		goto out;
 
-	NETLINK_CB(skb).portid	= nlk->portid;
+	if (unlikely(repair))
+		NETLINK_CB(skb).portid = dst_portid;
+	else
+		NETLINK_CB(skb).portid = nlk->portid;
+
 	NETLINK_CB(skb).dst_group = dst_group;
 	NETLINK_CB(skb).creds	= siocb->scm->creds;
 	NETLINK_CB(skb).flags	= netlink_skb_flags;
@@ -2398,7 +2417,7 @@ static int netlink_sendmsg(struct kiocb *kiocb, struct socket *sock,
 		goto out;
 	}
 
-	if (dst_group) {
+	if (dst_group && !repair) {
 		atomic_inc(&skb->users);
 		netlink_broadcast(sk, skb, dst_portid, dst_group, GFP_KERNEL);
 	}
@@ -2417,17 +2436,18 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	struct scm_cookie scm;
 	struct sock *sk = sock->sk;
 	struct netlink_sock *nlk = nlk_sk(sk);
-	int noblock = flags&MSG_DONTWAIT;
 	size_t copied;
 	struct sk_buff *skb, *data_skb;
+	int peeked, skip;
 	int err, ret;
 
 	if (flags&MSG_OOB)
 		return -EOPNOTSUPP;
 
 	copied = 0;
+	skip = sk_peek_offset(sk, flags);
 
-	skb = skb_recv_datagram(sk, flags, noblock, &err);
+	skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
 	if (skb == NULL)
 		goto out;
 
@@ -2455,14 +2475,20 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	nlk->max_recvmsg_len = min_t(size_t, nlk->max_recvmsg_len,
 				     16384);
 
-	copied = data_skb->len;
+	copied = data_skb->len - skip;
 	if (len < copied) {
 		msg->msg_flags |= MSG_TRUNC;
 		copied = len;
 	}
 
 	skb_reset_transport_header(data_skb);
-	err = skb_copy_datagram_iovec(data_skb, 0, msg->msg_iov, copied);
+	err = skb_copy_datagram_iovec(data_skb, skip, msg->msg_iov, copied);
+	if (!err) {
+		if (flags & MSG_PEEK)
+			sk_peek_offset_fwd(sk, copied);
+		else
+			sk_peek_offset_bwd(sk, skb->len);
+	}
 
 	if (msg->msg_name) {
 		struct sockaddr_nl *addr = (struct sockaddr_nl *)msg->msg_name;
@@ -2484,7 +2510,7 @@ static int netlink_recvmsg(struct kiocb *kiocb, struct socket *sock,
 	}
 	siocb->scm->creds = *NETLINK_CREDS(skb);
 	if (flags & MSG_TRUNC)
-		copied = data_skb->len;
+		copied = data_skb->len - skip;
 
 	skb_free_datagram(sk, skb);
 
@@ -2702,6 +2728,7 @@ static int netlink_dump(struct sock *sk)
 	struct netlink_callback *cb;
 	struct sk_buff *skb = NULL;
 	struct nlmsghdr *nlh;
+	struct module *module;
 	int len, err = -ENOBUFS;
 	int alloc_size;
 
@@ -2771,9 +2798,11 @@ static int netlink_dump(struct sock *sk)
 		cb->done(cb);
 
 	nlk->cb_running = false;
+	module = cb->module;
+	skb = cb->skb;
 	mutex_unlock(nlk->cb_mutex);
-	module_put(cb->module);
-	consume_skb(cb->skb);
+	module_put(module);
+	consume_skb(skb);
 	return 0;
 
 errout_skb:
@@ -3132,6 +3161,13 @@ int netlink_unregister_notifier(struct notifier_block *nb)
 }
 EXPORT_SYMBOL(netlink_unregister_notifier);
 
+static int netlink_set_peek_off(struct sock *sk, int val)
+{
+	sk->sk_peek_off = val;
+
+	return 0;
+}
+
 static const struct proto_ops netlink_ops = {
 	.family =	PF_NETLINK,
 	.owner =	THIS_MODULE,
@@ -3151,6 +3187,7 @@ static const struct proto_ops netlink_ops = {
 	.recvmsg =	netlink_recvmsg,
 	.mmap =		netlink_mmap,
 	.sendpage =	sock_no_sendpage,
+	.set_peek_off = netlink_set_peek_off,
 };
 
 static const struct net_proto_family netlink_family_ops = {
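
The netlink_recvmsg() changes above hook netlink into the generic
peek-offset machinery: MSG_PEEK advances a per-socket offset instead of
re-reading from byte 0, and a consuming read rewinds it.  A simplified
model of that bookkeeping (a sketch, not the kernel's implementation):

#include <stdio.h>

struct peek_sock {
	int peek_off;	/* -1 would mean peeking with offset is disabled */
};

static int peek_offset(const struct peek_sock *sk, int msg_peek)
{
	return (msg_peek && sk->peek_off >= 0) ? sk->peek_off : 0;
}

int main(void)
{
	struct peek_sock sk = { .peek_off = 0 };
	int msg_len = 100;

	/* Two consecutive peeks see consecutive parts of the queue. */
	printf("peek at %d\n", peek_offset(&sk, 1));	/* 0 */
	sk.peek_off += msg_len;		/* sk_peek_offset_fwd() */
	printf("peek at %d\n", peek_offset(&sk, 1));	/* 100 */

	/* A consuming read removes data, so the offset is rewound. */
	sk.peek_off -= msg_len;		/* sk_peek_offset_bwd() */
	printf("peek at %d\n", peek_offset(&sk, 1));	/* 0 */
	return 0;
}
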
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -4,6 +4,15 @@
 #include <linux/rhashtable.h>
 #include <net/sock.h>
 
+/* flags */
+#define NETLINK_F_KERNEL_SOCKET		0x1
+#define NETLINK_F_RECV_PKTINFO		0x2
+#define NETLINK_F_BROADCAST_SEND_ERROR	0x4
+#define NETLINK_F_RECV_NO_ENOBUFS	0x8
+#define NETLINK_F_LISTEN_ALL_NSID	0x10
+#define NETLINK_F_CAP_ACK		0x20
+#define NETLINK_F_REPAIR		0x40
+
 #define NLGRPSZ(x)	(ALIGN(x, sizeof(unsigned long) * 8) / 8)
 #define NLGRPLONGS(x)	(NLGRPSZ(x)/sizeof(unsigned long))
 
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -54,6 +54,27 @@ static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
 		       nlk->groups);
 }
 
+static int sk_diag_put_flags(struct sock *sk, struct sk_buff *skb)
+{
+	struct netlink_sock *nlk = nlk_sk(sk);
+	u32 flags = 0;
+
+	if (nlk->cb_running)
+		flags |= NDIAG_FLAG_CB_RUNNING;
+	if (nlk->flags & NETLINK_F_RECV_PKTINFO)
+		flags |= NDIAG_FLAG_PKTINFO;
+	if (nlk->flags & NETLINK_F_BROADCAST_SEND_ERROR)
+		flags |= NDIAG_FLAG_BROADCAST_ERROR;
+	if (nlk->flags & NETLINK_F_RECV_NO_ENOBUFS)
+		flags |= NDIAG_FLAG_NO_ENOBUFS;
+	if (nlk->flags & NETLINK_F_LISTEN_ALL_NSID)
+		flags |= NDIAG_FLAG_LISTEN_ALL_NSID;
+	if (nlk->flags & NETLINK_F_CAP_ACK)
+		flags |= NDIAG_FLAG_CAP_ACK;
+
+	return nla_put_u32(skb, NETLINK_DIAG_FLAGS, flags);
+}
+
 static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 			struct netlink_diag_req *req,
 			u32 portid, u32 seq, u32 flags, int sk_ino)
@@ -91,6 +112,10 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
 	    sk_diag_put_rings_cfg(sk, skb))
 		goto out_nlmsg_trim;
 
+	if ((req->ndiag_show & NDIAG_SHOW_FLAGS) &&
+	    sk_diag_put_flags(sk, skb))
+		goto out_nlmsg_trim;
+
 	nlmsg_end(skb, nlh);
 	return 0;
 
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2504,6 +2505,76 @@ static int packet_sendmsg(struct kiocb *iocb, struct socket *sock,
 		return packet_snd(sock, msg, len);
 }
 
+#ifdef CONFIG_MEMCG_KMEM
+struct packet_sk_charge {
+	struct mem_cgroup	*memcg;
+	unsigned long		nr_pages;
+};
+
+static struct cg_proto *packet_sk_charge(void)
+{
+	struct packet_sk_charge *psc;
+	int err = -ENOMEM;
+
+	psc = kmalloc(sizeof(*psc), GFP_KERNEL);
+	if (!psc)
+		goto out;
+
+	err = 0;
+	psc->memcg = get_mem_cgroup_from_mm(current->mm);
+	if (!psc->memcg)
+		goto out_free_psc;
+	if (!memcg_kmem_is_active(psc->memcg))
+		goto out_put_cg;
+
+	/*
+	 * Forcibly charge the maximum amount of data this socket may have.
+	 * It is typically not huge, and packet sockets are rare guests in
+	 * containers, so we do not disturb the memory consumption much.
+	 */
+	psc->nr_pages = ACCESS_ONCE(sysctl_rmem_max)/PAGE_SIZE;
+
+	err = memcg_charge_kmem(psc->memcg, GFP_KERNEL, psc->nr_pages);
+	if (!err)
+		goto out;
+
+out_put_cg:
+	mem_cgroup_put(psc->memcg);
+out_free_psc:
+	kfree(psc);
+	psc = NULL;
+out:
+	if (err)
+		return ERR_PTR(err);
+
+	/*
+	 * sk->sk_cgrp is not used for packet sockets,
+	 * so we just put our smaller structure into it.
+	 */
+	return (struct cg_proto *)psc;
+}
+
+static void packet_sk_uncharge(struct cg_proto *cg)
+{
+	struct packet_sk_charge *psc = (struct packet_sk_charge *)cg;
+
+	if (psc) {
+		memcg_uncharge_kmem(psc->memcg, psc->nr_pages);
+		mem_cgroup_put(psc->memcg);
+		kfree(psc);
+	}
+}
+#else
+static struct cg_proto *packet_sk_charge(void)
+{
+	return NULL;
+}
+
+static void packet_sk_uncharge(struct cg_proto *cg)
+{
+}
+#endif
+
 /*
  *	Close a PACKET socket. This is fairly simple. We immediately go
  *	to 'closed' state and remove our protocol entry in the device list.
@@ -2553,6 +2624,8 @@ static int packet_release(struct socket *sock)
 	}
 
 	fanout_release(sk);
+	packet_sk_uncharge(sk->sk_cgrp);
+	sk->sk_cgrp = NULL;
 
 	synchronize_net();
 	/*
@@ -2715,6 +2788,7 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 {
 	struct sock *sk;
 	struct packet_sock *po;
+	struct cg_proto *cg;
 	__be16 proto = (__force __be16)protocol; /* weird, but documented */
 	int err;
 
@@ -2725,11 +2799,16 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 		return -ESOCKTNOSUPPORT;
 
 	sock->state = SS_UNCONNECTED;
+	cg = packet_sk_charge();
+	if (IS_ERR(cg)) {
+		err = PTR_ERR(cg);
+		goto out;
+	}
 
 	err = -ENOBUFS;
 	sk = sk_alloc(net, PF_PACKET, GFP_KERNEL, &packet_proto);
 	if (sk == NULL)
-		goto out;
+		goto outu;
 
 	sock->ops = &packet_ops;
 	if (sock->type == SOCK_PACKET)
@@ -2776,9 +2855,13 @@ static int packet_create(struct net *net, struct socket *sock, int protocol,
 	sock_prot_inuse_add(net, &packet_proto, 1);
 	preempt_enable();
 
+	sk->sk_cgrp = cg;
+
 	return 0;
 out2:
 	sk_free(sk);
+outu:
+	packet_sk_uncharge(cg);
 out:
 	return err;
 }
@@ -3287,6 +3370,8 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
 			return -EBUSY;
 		if (copy_from_user(&val, optval, sizeof(val)))
 			return -EFAULT;
+		if (val > INT_MAX)
+			return -EINVAL;
 		po->tp_reserve = val;
 		return 0;
 	}
@@ -3699,7 +3784,7 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
 	struct pgv *pg_vec;
 	int i;
 
-	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL);
+	pg_vec = kcalloc(block_nr, sizeof(struct pgv), GFP_KERNEL_ACCOUNT);
 	if (unlikely(!pg_vec))
 		goto out;
 
@@ -3721,6 +3806,7 @@ out_free_pgvec:
 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		int closing, int tx_ring)
 {
+	struct packet_sk_charge *psc = (struct packet_sk_charge *)sk->sk_cgrp;
 	struct pgv *pg_vec = NULL;
 	struct packet_sock *po = pkt_sk(sk);
 	int was_running, order = 0;
@@ -3734,7 +3820,9 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 	lock_sock(sk);
 	/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
 	if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
-		WARN(1, "Tx-ring is not supported.\n");
+		/* Hide warnings triggered from inside a container (CT) */
+		if (ve_is_super(get_exec_env()))
+			WARN(1, "Tx-ring is not supported.\n");
 		goto out;
 	}
 
@@ -3773,8 +3861,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
 			goto out;
 		if (po->tp_version >= TPACKET_V3 &&
-		    (int)(req->tp_block_size -
-			  BLK_PLUS_PRIV(req_u->req3.tp_sizeof_priv)) <= 0)
+		    req->tp_block_size <=
+			  BLK_PLUS_PRIV((u64)req_u->req3.tp_sizeof_priv))
 			goto out;
 		if (unlikely(req->tp_frame_size < po->tp_hdrlen +
 					po->tp_reserve))
@@ -3785,15 +3873,24 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 		rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
 		if (unlikely(rb->frames_per_block <= 0))
 			goto out;
+		if (unlikely(req->tp_block_size > UINT_MAX / req->tp_block_nr))
+			goto out;
 		if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
 					req->tp_frame_nr))
 			goto out;
 
 		err = -ENOMEM;
 		order = get_order(req->tp_block_size);
+		if (psc && memcg_charge_kmem(psc->memcg, GFP_KERNEL,
+				(1 << order) * req->tp_block_nr))
+			goto out;
 		pg_vec = alloc_pg_vec(req, order);
-		if (unlikely(!pg_vec))
+		if (unlikely(!pg_vec)) {
+			if (psc)
+				memcg_uncharge_kmem(psc->memcg,
+					(1 << order) * req->tp_block_nr);
 			goto out;
+		}
 		switch (po->tp_version) {
 		case TPACKET_V3:
 		/* Transmit path is not supported. We checked
@@ -3862,8 +3959,12 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
 			prb_shutdown_retire_blk_timer(po, tx_ring, rb_queue);
 	}
 
-	if (pg_vec)
+	if (pg_vec) {
+		if (psc)
+			memcg_uncharge_kmem(psc->memcg,
+				(1 << order) * req->tp_block_nr);
 		free_pg_vec(pg_vec, order, req->tp_block_nr);
+	}
 out:
 	release_sock(sk);
 	return err;
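
Two of the packet_set_ring() additions above are overflow guards:
tp_reserve is now capped at INT_MAX, and tp_block_size * tp_block_nr must
fit in an unsigned int, which is checked by division so the test itself
cannot overflow.  The division idiom in isolation (illustrative values):

#include <limits.h>
#include <stdio.h>

/* Returns 1 if a * b would overflow unsigned int, without overflowing. */
static int mul_overflows(unsigned int a, unsigned int b)
{
	return b != 0 && a > UINT_MAX / b;
}

int main(void)
{
	printf("%d\n", mul_overflows(4096, 1024));		/* 0: 4 MiB fits */
	printf("%d\n", mul_overflows(1U << 20, 1U << 13));	/* 1: 2^33 overflows */
	return 0;
}
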
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -190,6 +190,12 @@ new_conn:
 		}
 	}
 
+	if (trans == NULL) {
+		kmem_cache_free(rds_conn_slab, conn);
+		conn = ERR_PTR(-ENODEV);
+		goto out;
+	}
+
 	conn->c_trans = trans;
 
 	ret = trans->conn_alloc(conn, gfp);
--- a/net/sched/em_ipset.c
+++ b/net/sched/em_ipset.c
@@ -24,11 +24,12 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,
 {
 	struct xt_set_info *set = data;
 	ip_set_id_t index;
+	struct net *net = qdisc_dev(tp->q)->nd_net;
 
 	if (data_len != sizeof(*set))
 		return -EINVAL;
 
-	index = ip_set_nfnl_get_byindex(set->index);
+	index = ip_set_nfnl_get_byindex(net, set->index);
 	if (index == IPSET_INVALID_ID)
 		return -ENOENT;
 
@@ -37,7 +38,7 @@ static int em_ipset_change(struct tcf_proto *tp, void *data, int data_len,
 	if (em->data)
 		return 0;
 
-	ip_set_nfnl_put(index);
+	ip_set_nfnl_put(net, index);
 	return -ENOMEM;
 }
 
@@ -45,7 +46,7 @@ static void em_ipset_destroy(struct tcf_proto *p, struct tcf_ematch *em)
 {
 	const struct xt_set_info *set = (const void *) em->data;
 	if (set) {
-		ip_set_nfnl_put(set->index);
+		ip_set_nfnl_put(qdisc_dev(p->q)->nd_net, set->index);
 		kfree((void *) em->data);
 	}
 }
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -159,7 +159,6 @@ struct cbq_sched_data {
 	struct cbq_class	*tx_borrowed;
 	int			tx_len;
 	psched_time_t		now;		/* Cached timestamp */
-	psched_time_t		now_rt;		/* Cached real time */
 	unsigned int		pmask;
 
 	struct hrtimer		delay_timer;
@@ -353,12 +352,7 @@ cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
 	int toplevel = q->toplevel;
 
 	if (toplevel > cl->level && !(qdisc_is_throttled(cl->q))) {
-		psched_time_t now;
-		psched_tdiff_t incr;
-
-		now = psched_get_time();
-		incr = now - q->now_rt;
-		now = q->now + incr;
+		psched_time_t now = psched_get_time();
 
 		do {
 			if (cl->undertime < now) {
@@ -700,8 +694,13 @@ cbq_update(struct cbq_sched_data *q)
 	struct cbq_class *this = q->tx_class;
 	struct cbq_class *cl = this;
 	int len = q->tx_len;
+	psched_time_t now;
 
 	q->tx_class = NULL;
+	/* Time integrator. We calculate EOS time
+	 * by adding the expected packet transmission time.
+	 */
+	now = q->now + L2T(&q->link, len);
 
 	for ( ; cl; cl = cl->share) {
 		long avgidle = cl->avgidle;
@@ -717,7 +716,7 @@ cbq_update(struct cbq_sched_data *q)
 		 *	idle = (now - last) - last_pktlen/rate
 		 */
 
-		idle = q->now - cl->last;
+		idle = now - cl->last;
 		if ((unsigned long)idle > 128*1024*1024) {
 			avgidle = cl->maxidle;
 		} else {
@@ -761,7 +760,7 @@ cbq_update(struct cbq_sched_data *q)
 			idle -= L2T(&q->link, len);
 			idle += L2T(cl, len);
 
-			cl->undertime = q->now + idle;
+			cl->undertime = now + idle;
 		} else {
 			/* Underlimit */
 
@@ -771,7 +770,8 @@ cbq_update(struct cbq_sched_data *q)
 			else
 				cl->avgidle = avgidle;
 		}
-		cl->last = q->now;
+		if ((s64)(now - cl->last) > 0)
+			cl->last = now;
 	}
 
 	cbq_update_toplevel(q, this, q->tx_borrowed);
@@ -872,8 +872,8 @@ cbq_dequeue_prio(struct Qdisc *sch, int prio)
 
 			if (cl->deficit <= 0) {
 				q->active[prio] = cl;
-				cl = cl->next_alive;
 				cl->deficit += cl->quantum;
+				cl = cl->next_alive;
 			}
 			return skb;
 
@@ -943,31 +943,13 @@ cbq_dequeue(struct Qdisc *sch)
 	struct sk_buff *skb;
 	struct cbq_sched_data *q = qdisc_priv(sch);
 	psched_time_t now;
-	psched_tdiff_t incr;
 
 	now = psched_get_time();
-	incr = now - q->now_rt;
-
-	if (q->tx_class) {
-		psched_tdiff_t incr2;
-		/* Time integrator. We calculate EOS time
-		 * by adding expected packet transmission time.
-		 * If real time is greater, we warp artificial clock,
-		 * so that:
-		 *
-		 * cbq_time = max(real_time, work);
-		 */
-		incr2 = L2T(&q->link, q->tx_len);
-		q->now += incr2;
+
+	if (q->tx_class)
 		cbq_update(q);
-		if ((incr -= incr2) < 0)
-			incr = 0;
-		q->now += incr;
-	} else {
-		if (now > q->now)
-			q->now = now;
-	}
-	q->now_rt = now;
+
+	q->now = now;
 
 	for (;;) {
 		q->wd_expires = 0;
@@ -1051,18 +1033,19 @@ static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
 
 	for (h = 0; h < q->clhash.hashsize; h++) {
 		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
+			long mtu;
 			/* BUGGGG... Beware! This expression suffer of
 			 * arithmetic overflows!
 			 */
 			if (cl->priority == prio) {
-				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
-					q->quanta[prio];
-			}
-			if (cl->quantum <= 0 || cl->quantum>32*qdisc_dev(cl->qdisc)->mtu) {
-				pr_warning("CBQ: class %08x has bad quantum==%ld, repaired.\n",
-					   cl->common.classid, cl->quantum);
-				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
+				cl->quantum = (cl->weight * cl->allot) /
+					(q->quanta[prio] / q->nclasses[prio]);
 			}
+			mtu = qdisc_dev(cl->qdisc)->mtu;
+			if (cl->quantum <= mtu/2)
+				cl->quantum = mtu/2 + 1;
+			else if (cl->quantum > 32*mtu)
+				cl->quantum = 32*mtu;
 		}
 	}
 }
@@ -1222,7 +1205,6 @@ cbq_reset(struct Qdisc *sch)
 	hrtimer_cancel(&q->delay_timer);
 	q->toplevel = TC_CBQ_MAXLEVEL;
 	q->now = psched_get_time();
-	q->now_rt = q->now;
 
 	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
 		q->active[prio] = NULL;
@@ -1406,7 +1388,6 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->delay_timer.function = cbq_undelay;
 	q->toplevel = TC_CBQ_MAXLEVEL;
 	q->now = psched_get_time();
-	q->now_rt = q->now;
 
 	cbq_link_class(&q->link);
 
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -215,17 +215,18 @@ static inline int qdisc_restart(struct Qdisc *q)
 	struct net_device *dev;
 	spinlock_t *root_lock;
 	struct sk_buff *skb;
 
 	/* Dequeue packet */
 	skb = dequeue_skb(q);
 	if (unlikely(!skb))
 		return 0;
 
+	WARN_ON_ONCE(skb_dst_is_noref(skb));
 	root_lock = qdisc_lock(q);
 	dev = qdisc_dev(q);
 	txq = skb_get_tx_queue(dev, skb);
 
 	return sch_direct_xmit(skb, q, dev, txq, root_lock);
 }
 
 void __qdisc_run(struct Qdisc *q)
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1107,6 +1107,8 @@ static int htb_dump_class(struct Qdisc *sch, unsigned long arg,
 	opt.buffer = PSCHED_NS2TICKS(cl->buffer);
 	psched_ratecfg_getrate(&opt.ceil, &cl->ceil);
 	opt.cbuffer = PSCHED_NS2TICKS(cl->cbuffer);
+	opt.rate.mpu = cl->rate.mpu;
+	opt.ceil.mpu = cl->ceil.mpu;
 	opt.quantum = cl->quantum;
 	opt.prio = cl->prio;
 	opt.level = cl->level;
@@ -1491,6 +1493,8 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
 
 	psched_ratecfg_precompute(&cl->rate, &hopt->rate);
 	psched_ratecfg_precompute(&cl->ceil, &hopt->ceil);
+	cl->rate.mpu = hopt->rate.mpu;
+	cl->ceil.mpu = hopt->ceil.mpu;
 
 	cl->buffer = PSCHED_TICKS2NS(hopt->buffer);
 	cl->cbuffer = PSCHED_TICKS2NS(hopt->cbuffer);
--- a/net/sched/sch_teql.c
+++ b/net/sched/sch_teql.c
@@ -177,6 +177,9 @@ static int teql_qdisc_init(struct Qdisc *sch, struct nlattr *opt)
 	struct teql_master *m = (struct teql_master *)sch->ops;
 	struct teql_sched_data *q = qdisc_priv(sch);
 
+	if (!capable(CAP_NET_ADMIN))
+		return -EPERM;
+
 	if (dev->hard_header_len > m->dev->hard_header_len)
 		return -EINVAL;
 
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -672,6 +672,9 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
 	newnp = inet6_sk(newsk);
 
 	memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+	newnp->ipv6_mc_list = NULL;
+	newnp->ipv6_ac_list = NULL;
+	newnp->ipv6_fl_list = NULL;
 
 	rcu_read_lock();
 	opt = rcu_dereference(np->opt);
--- a/net/sctp/objcnt.c
+++ b/net/sctp/objcnt.c
@@ -133,8 +133,8 @@ void sctp_dbg_objcnt_init(struct net *net)
 {
 	struct proc_dir_entry *ent;
 
-	ent = proc_create("sctp_dbg_objcnt", 0,
-			  net->sctp.proc_net_sctp, &sctp_objcnt_ops);
+	ent = proc_net_create_data("sctp_dbg_objcnt", 0,
+			net->sctp.proc_net_sctp, &sctp_objcnt_ops, NULL);
 	if (!ent)
 		pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n");
 }
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -110,8 +110,8 @@ int __net_init sctp_snmp_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("snmp", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_snmp_seq_fops);
+	p = proc_net_create_data("snmp", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_snmp_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 
@@ -270,8 +270,8 @@ int __net_init sctp_eps_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("eps", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_eps_seq_fops);
+	p = proc_net_create_data("eps", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_eps_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 
@@ -404,8 +404,8 @@ int __net_init sctp_assocs_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("assocs", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_assocs_seq_fops);
+	p = proc_net_create_data("assocs", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_assocs_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 
@@ -520,8 +520,8 @@ int __net_init sctp_remaddr_proc_init(struct net *net)
 {
 	struct proc_dir_entry *p;
 
-	p = proc_create("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
-			&sctp_remaddr_seq_fops);
+	p = proc_net_create_data("remaddr", S_IRUGO, net->sctp.proc_net_sctp,
+				 &sctp_remaddr_seq_fops, NULL);
 	if (!p)
 		return -ENOMEM;
 	return 0;
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -989,7 +989,7 @@ static int sctp_setsockopt_bindx(struct sock *sk,
 		return -EFAULT;
 
 	/* Alloc space for the address array in kernel memory.  */
-	kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+	kaddrs = kmalloc(addrs_size, GFP_USER | __GFP_NOWARN);
 	if (unlikely(!kaddrs))
 		return -ENOMEM;
 
@@ -1229,9 +1229,12 @@ static int __sctp_connect(struct sock *sk,
 
 	timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK);
 
-	err = sctp_wait_for_connect(asoc, &timeo);
-	if ((err == 0 || err == -EINPROGRESS) && assoc_id)
+	if (assoc_id)
 		*assoc_id = asoc->assoc_id;
+	err = sctp_wait_for_connect(asoc, &timeo);
+	/* Note: the asoc may be freed after the return of
+	 * sctp_wait_for_connect.
+	 */
 
 	/* Don't free association on exit. */
 	asoc = NULL;
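
The reordering above follows directly from the new comment:
sctp_wait_for_connect() may drop the last reference to the association, so
asoc->assoc_id has to be read before the call, never after. A minimal
sketch of the rule, with hypothetical names:

	struct assoc_like { int id; };

	int wait_for_connect(struct assoc_like *asoc);	/* may free asoc */

	static int do_connect(struct assoc_like *asoc, int *idp)
	{
		if (idp)
			*idp = asoc->id;	/* safe: asoc is still live */
		return wait_for_connect(asoc);	/* don't touch asoc after */
	}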
@@ -4638,6 +4641,12 @@ int sctp_do_peeloff(struct sock *sk, sctp_assoc_t id, struct socket **sockp)
 	if (!asoc)
 		return -EINVAL;
 
+	/* If there is a thread waiting on more sndbuf space for
+	 * sending on this asoc, it cannot be peeled.
+	 */
+	if (waitqueue_active(&asoc->wait))
+		return -EBUSY;
+
 	/* An association cannot be branched off from an already peeled-off
 	 * socket, nor is this supported for tcp style sockets.
 	 */
@@ -5142,7 +5151,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
 	to = optval + offsetof(struct sctp_getaddrs, addrs);
 	space_left = len - offsetof(struct sctp_getaddrs, addrs);
 
-	addrs = kmalloc(space_left, GFP_KERNEL);
+	addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN);
 	if (!addrs)
 		return -ENOMEM;
 
@@ -7204,7 +7213,6 @@ static int sctp_wait_for_sndbuf(struct sctp_association *asoc, long *timeo_p,
 		 */
 		release_sock(sk);
 		current_timeo = schedule_timeout(current_timeo);
-		BUG_ON(sk != asoc->base.sk);
 		lock_sock(sk);
 
 		*timeo_p = current_timeo;
--- a/net/socket.c
+++ b/net/socket.c
@@ -84,10 +84,12 @@
 #include <linux/kmod.h>
 #include <linux/audit.h>
 #include <linux/wireless.h>
+#include <linux/in.h>
 #include <linux/nsproxy.h>
 #include <linux/magic.h>
 #include <linux/slab.h>
 #include <linux/xattr.h>
+#include <linux/ve.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -195,6 +197,7 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr_storage *k
 		return -EFAULT;
 	return audit_sockaddr(ulen, kaddr);
 }
+EXPORT_SYMBOL(move_addr_to_kernel);
 
 /**
  *	move_addr_to_user	-	copy an address to user space
@@ -293,7 +296,7 @@ static int init_inodecache(void)
 					      0,
 					      (SLAB_HWCACHE_ALIGN |
 					       SLAB_RECLAIM_ACCOUNT |
-					       SLAB_MEM_SPREAD),
+					       SLAB_MEM_SPREAD | SLAB_ACCOUNT),
 					      init_once);
 	if (sock_inode_cachep == NULL)
 		return -ENOMEM;
@@ -1260,6 +1263,11 @@ int __sock_create(struct net *net, int family, int type, int protocol,
 		family = PF_PACKET;
 	}
 
+	/* VZ compatibility layer */
+	err = vz_security_family_check(net, family, 0);
+	if (err < 0)
+		return err;
+
 	err = security_socket_create(family, type, protocol, kern);
 	if (err)
 		return err;
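
The VZ hook above rejects disallowed socket families before the LSM hook or
any protocol module runs. vz_security_family_check() itself is part of this
patch set; as a rough sketch of what such a per-container policy could look
like (the allow list below is hypothetical, not the actual VZ policy):

	static int family_allowed(int family)
	{
		switch (family) {
		case AF_UNIX:
		case AF_INET:
		case AF_INET6:
		case AF_NETLINK:
		case AF_PACKET:
			return 0;
		default:
			return -EAFNOSUPPORT;
		}
	}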
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -449,12 +449,13 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache);
 /*
  * Remove stale credentials. Avoid sleeping inside the loop.
  */
-static int
+static long
 rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 {
 	spinlock_t *cache_lock;
 	struct rpc_cred *cred, *next;
 	unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM;
+	long freed = 0;
 
 	list_for_each_entry_safe(cred, next, &cred_unused, cr_lru) {
 
@@ -466,10 +467,11 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 		 */
 		if (time_in_range(cred->cr_expire, expired, jiffies) &&
 		    test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0)
-			return 0;
+			break;
 
 		list_del_init(&cred->cr_lru);
 		number_cred_unused--;
+		freed++;
 		if (atomic_read(&cred->cr_count) != 0)
 			continue;
 
@@ -482,7 +484,7 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan)
 		}
 		spin_unlock(cache_lock);
 	}
-	return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+	return freed;
 }
 
 int rpcauth_cache_do_shrinker(int nr_to_scan)
@@ -501,18 +503,18 @@ int rpcauth_cache_do_shrinker(int nr_to_scan)
 /*
  * Run memory cache shrinker.
  */
-static int
-rpcauth_cache_shrinker(struct shrinker *shrink, struct shrink_control *sc)
+static unsigned long
+rpcauth_cache_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
 {
-	int nr_to_scan = sc->nr_to_scan;
-	gfp_t gfp_mask = sc->gfp_mask;
+	if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
+		return SHRINK_STOP;
 
-	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
-		return (nr_to_scan == 0) ? 0 : -1;
+	/* nothing left, don't come back */
 	if (list_empty(&cred_unused))
-		return 0;
+		return SHRINK_STOP;
 
-	return rpcauth_cache_do_shrinker(nr_to_scan);
+	return rpcauth_cache_do_shrinker(sc->nr_to_scan);
 }
 
 static void
@@ -530,6 +532,13 @@ rpcauth_cache_enforce_limit(void)
 	rpcauth_cache_do_shrinker(nr_to_scan);
 }
 
+static unsigned long
+rpcauth_cache_shrink_count(struct shrinker *shrink, struct shrink_control *sc)
+{
+	return (number_cred_unused / 100) * sysctl_vfs_cache_pressure;
+}
+
 /*
  * Look up a process' credentials in the authentication cache
  */
@@ -853,7 +862,8 @@ rpcauth_uptodatecred(struct rpc_task *task)
 }
 
 static struct shrinker rpc_cred_shrinker = {
-	.shrink = rpcauth_cache_shrinker,
+	.count_objects = rpcauth_cache_shrink_count,
+	.scan_objects = rpcauth_cache_shrink_scan,
 	.seeks = DEFAULT_SEEKS,
 };
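
The conversion above moves rpc_cred_shrinker to the split shrinker
interface: ->count_objects gives a cheap estimate of how many objects are
freeable, while ->scan_objects frees up to sc->nr_to_scan of them and
returns the number actually freed, or SHRINK_STOP to abort this pass. A
minimal sketch of the pattern (the demo_* names are hypothetical):

	static unsigned long demo_nr_cached;

	/* hypothetical: evict up to nr objects, return how many went away */
	static unsigned long demo_evict(unsigned long nr);

	static unsigned long demo_count(struct shrinker *s,
					struct shrink_control *sc)
	{
		return demo_nr_cached;		/* cheap, lock-free estimate */
	}

	static unsigned long demo_scan(struct shrinker *s,
				       struct shrink_control *sc)
	{
		if ((sc->gfp_mask & GFP_KERNEL) != GFP_KERNEL)
			return SHRINK_STOP;	/* wrong reclaim context */
		return demo_evict(sc->nr_to_scan);
	}

	static struct shrinker demo_shrinker = {
		.count_objects	= demo_count,
		.scan_objects	= demo_scan,
		.seeks		= DEFAULT_SEEKS,
	};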
 
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -1639,13 +1639,13 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 	struct sunrpc_net *sn;
 
 	sn = net_generic(net, sunrpc_net_id);
-	cd->u.procfs.proc_ent = proc_mkdir(cd->name, sn->proc_net_rpc);
+	cd->u.procfs.proc_ent = proc_net_mkdir(net, cd->name, sn->proc_net_rpc);
 	if (cd->u.procfs.proc_ent == NULL)
 		goto out_nomem;
 	cd->u.procfs.channel_ent = NULL;
 	cd->u.procfs.content_ent = NULL;
 
-	p = proc_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
+	p = proc_net_create_data("flush", S_IFREG|S_IRUSR|S_IWUSR,
 			     cd->u.procfs.proc_ent,
 			     &cache_flush_operations_procfs, cd);
 	cd->u.procfs.flush_ent = p;
@@ -1653,7 +1653,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 		goto out_nomem;
 
 	if (cd->cache_request || cd->cache_parse) {
-		p = proc_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
+		p = proc_net_create_data("channel", S_IFREG|S_IRUSR|S_IWUSR,
 				     cd->u.procfs.proc_ent,
 				     &cache_file_operations_procfs, cd);
 		cd->u.procfs.channel_ent = p;
@@ -1661,7 +1661,7 @@ static int create_cache_proc_entries(struct cache_detail *cd, struct net *net)
 			goto out_nomem;
 	}
 	if (cd->cache_show) {
-		p = proc_create_data("content", S_IFREG|S_IRUSR,
+		p = proc_net_create_data("content", S_IFREG|S_IRUSR,
 				cd->u.procfs.proc_ent,
 				&content_file_operations_procfs, cd);
 		cd->u.procfs.content_ent = p;
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -1487,6 +1487,7 @@ static struct file_system_type rpc_pipe_fs_type = {
 	.name		= "rpc_pipefs",
 	.mount		= rpc_mount,
 	.kill_sb	= rpc_kill_sb,
+	.fs_flags	= FS_VIRTUALIZED,
 };
 MODULE_ALIAS_FS("rpc_pipefs");
 MODULE_ALIAS("rpc_pipefs");
@@ -1509,7 +1510,7 @@ int register_rpc_pipefs(void)
 	rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
 				sizeof(struct rpc_inode),
 				0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
-						SLAB_MEM_SPREAD),
+						SLAB_MEM_SPREAD|SLAB_ACCOUNT),
 				init_once);
 	if (!rpc_inode_cachep)
 		return -ENOMEM;
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -252,7 +252,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
 
 static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
-	freezable_schedule_unsafe();
+	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
 	return 0;
--- a/net/sunrpc/stats.c
+++ b/net/sunrpc/stats.c
@@ -247,7 +247,7 @@ do_register(struct net *net, const char *name, void *data,
 
 	dprintk("RPC:       registering /proc/net/rpc/%s\n", name);
 	sn = net_generic(net, sunrpc_net_id);
-	return proc_create_data(name, 0, sn->proc_net_rpc, fops, data);
+	return proc_net_create_data(name, 0, sn->proc_net_rpc, fops, data);
 }
 
 struct proc_dir_entry *
@@ -290,7 +290,7 @@ int rpc_proc_init(struct net *net)
 
 	dprintk("RPC:       registering /proc/net/rpc\n");
 	sn = net_generic(net, sunrpc_net_id);
-	sn->proc_net_rpc = proc_mkdir("rpc", net->proc_net);
+	sn->proc_net_rpc = proc_net_mkdir(net, "rpc", net->proc_net);
 	if (sn->proc_net_rpc == NULL)
 		return -ENOMEM;
 
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -11,7 +11,6 @@
  */
 
 #include <linux/linkage.h>
-#include <linux/sched.h>
 #include <linux/errno.h>
 #include <linux/net.h>
 #include <linux/in.h>
@@ -21,6 +20,8 @@
 #include <linux/kthread.h>
 #include <linux/slab.h>
 
+#include <linux/ve.h>
+
 #include <linux/sunrpc/types.h>
 #include <linux/sunrpc/xdr.h>
 #include <linux/sunrpc/stats.h>
@@ -731,8 +732,8 @@ svc_set_num_threads(struct svc_serv *serv, struct svc_pool *pool, int nrservs)
 		}
 
 		__module_get(serv->sv_ops->svo_module);
-		task = kthread_create_on_node(serv->sv_ops->svo_function, rqstp,
-					      node, serv->sv_name);
+		task = kthread_create_on_node_ve(get_exec_env(),
+			serv->sv_ops->svo_function, rqstp, node, serv->sv_name);
 		if (IS_ERR(task)) {
 			error = PTR_ERR(task);
 			module_put(serv->sv_ops->svo_module);
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -762,6 +762,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock)
 	lockdep_set_class(&sk->sk_receive_queue.lock,
 				&af_unix_sk_receive_queue_lock_key);
 
+	sk->sk_allocation	= GFP_KERNEL_ACCOUNT;
 	sk->sk_write_space	= unix_write_space;
 	sk->sk_max_ack_backlog	= net->unx.sysctl_max_dgram_qlen;
 	sk->sk_destruct		= unix_sock_destructor;
--- a/net/unix/sysctl_net_unix.c
+++ b/net/unix/sysctl_net_unix.c
@@ -35,7 +35,7 @@ int __net_init unix_sysctl_register(struct net *net)
 		goto err_alloc;
 
 	/* Don't export sysctls to unprivileged users */
-	if (net->user_ns != &init_user_ns)
+	if (ve_net_hide_sysctl(net))
 		table[0].procname = NULL;
 
 	table[0].data = &net->unx.sysctl_max_dgram_qlen;
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -387,7 +387,14 @@ static inline int xfrm_replay_verify_len(struct xfrm_replay_state_esn *replay_es
 	up = nla_data(rp);
 	ulen = xfrm_replay_state_esn_len(up);
 
-	if (nla_len(rp) < ulen || xfrm_replay_state_esn_len(replay_esn) != ulen)
+	/* Check the overall length and the internal bitmap length to avoid
+	 * potential overflow. */
+	if (nla_len(rp) < ulen ||
+	    xfrm_replay_state_esn_len(replay_esn) != ulen ||
+	    replay_esn->bmp_len != up->bmp_len)
+		return -EINVAL;
+
+	if (up->replay_window > up->bmp_len * sizeof(__u32) * 8)
 		return -EINVAL;
 
 	return 0;
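
The second check added above is the arithmetic one: the ESN bitmap holds
bmp_len 32-bit words, i.e. bmp_len * sizeof(__u32) * 8 bits, and
replay_window indexes into those bits, so any larger window would read or
write past the end of the bitmap. As a worked example, bmp_len == 4 gives
4 * 4 * 8 == 128 bits, so a replay_window of 129 must be rejected:

	static int replay_window_fits(unsigned int replay_window,
				      unsigned int bmp_len)
	{
		/* each bitmap word is a __u32, i.e. 32 bits */
		return replay_window <= bmp_len * 32;
	}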
@@ -2133,7 +2140,8 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh,
 	return 0;
 
 bad_policy:
-	WARN(1, "BAD policy passed\n");
+	ve_pr_warn_ratelimited(VE0_LOG, "CT%s: BAD xfrm policy passed\n",
+		net->owner_ve->ve_name);
 free_state:
 	kfree(x);
 nomem:
--- /dev/null
+++ b/scripts/Makefile.kasan
@@ -0,0 +1,29 @@
+ifdef CONFIG_KASAN
+ifdef CONFIG_KASAN_INLINE
+	call_threshold := 10000
+else
+	call_threshold := 0
+endif
+
+CFLAGS_KASAN_MINIMAL := -fsanitize=kernel-address
+
+CFLAGS_KASAN := $(call cc-option, -fsanitize=kernel-address \
+		-fasan-shadow-offset=$(CONFIG_KASAN_SHADOW_OFFSET) \
+		--param asan-stack=1 --param asan-globals=1 \
+		--param asan-instrumentation-with-call-threshold=$(call_threshold))
+
+ifeq ($(call cc-option, $(CFLAGS_KASAN_MINIMAL) -Werror),)
+   ifneq ($(CONFIG_COMPILE_TEST),y)
+        $(warning Cannot use CONFIG_KASAN: \
+            -fsanitize=kernel-address is not supported by compiler)
+   endif
+else
+    ifeq ($(CFLAGS_KASAN),)
+        ifneq ($(CONFIG_COMPILE_TEST),y)
+            $(warning CONFIG_KASAN: compiler does not support all options.\
+                Trying minimal configuration)
+        endif
+        CFLAGS_KASAN := $(CFLAGS_KASAN_MINIMAL)
+    endif
+endif
+endif
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -119,6 +119,22 @@ _c_flags += $(if $(patsubst n%,, \
 		$(CFLAGS_GCOV))
 endif
 
+#
+# Enable address sanitizer flags for the kernel, except for files or directories
+# we opt out of checking (controlled by KASAN_SANITIZE_obj.o and KASAN_SANITIZE)
+#
+ifeq ($(CONFIG_KASAN),y)
+_c_flags += $(if $(patsubst n%,, \
+		$(KASAN_SANITIZE_$(basetarget).o)$(KASAN_SANITIZE)y), \
+		$(CFLAGS_KASAN))
+endif
+
+ifeq ($(CONFIG_KCOV),y)
+_c_flags += $(if $(patsubst n%,, \
+	$(KCOV_INSTRUMENT_$(basetarget).o)$(KCOV_INSTRUMENT)y), \
+	$(CFLAGS_KCOV))
+endif
+
 # If building the kernel in a separate objtree expand all occurrences
 # of -Idir to -I$(srctree)/dir except for absolute paths (starting with '/').
 
--- a/scripts/Makefile.modpost
+++ b/scripts/Makefile.modpost
@@ -77,7 +77,7 @@ modpost = scripts/mod/modpost                    \
  $(if $(KBUILD_EXTRA_SYMBOLS), $(patsubst %, -e %,$(KBUILD_EXTRA_SYMBOLS))) \
  $(if $(KBUILD_EXTMOD),-o $(modulesymfile))      \
  $(if $(CONFIG_DEBUG_SECTION_MISMATCH),,-S)      \
- $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),-w)
+ $(if $(KBUILD_EXTMOD)$(KBUILD_MODPOST_WARN),$(if $(KBUILD_MODPOST_FAIL),,-w))
 
 # We can go over command line length here, so be careful.
 quiet_cmd_modpost = MODPOST $(words $(filter-out vmlinux FORCE, $^)) modules
--- a/scripts/module-common.lds
+++ b/scripts/module-common.lds
@@ -16,4 +16,8 @@ SECTIONS {
 	__kcrctab_unused	0 : { *(SORT(___kcrctab_unused+*)) }
 	__kcrctab_unused_gpl	0 : { *(SORT(___kcrctab_unused_gpl+*)) }
 	__kcrctab_gpl_future	0 : { *(SORT(___kcrctab_gpl_future+*)) }
+
+	. = ALIGN(8);
+	.init_array		0 : { *(SORT(.init_array.*)) *(.init_array) }
 }
--- a/security/Kconfig
+++ b/security/Kconfig
@@ -20,7 +20,7 @@ config SECURITY_DMESG_RESTRICT
 
 config SECURITY
 	bool "Enable different security models"
-	depends on SYSFS
+	depends on SYSFS && !VE
 	help
 	  This allows you to choose different security modules to be
 	  configured into your kernel.
--- a/security/commoncap.c
+++ b/security/commoncap.c
@@ -661,14 +661,14 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
 		       const void *value, size_t size, int flags)
 {
 	if (!strcmp(name, XATTR_NAME_CAPS)) {
-		if (!capable(CAP_SETFCAP))
+		if (!ve_capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -687,14 +687,14 @@ int cap_inode_setxattr(struct dentry *dentry, const char *name,
 int cap_inode_removexattr(struct dentry *dentry, const char *name)
 {
 	if (!strcmp(name, XATTR_NAME_CAPS)) {
-		if (!capable(CAP_SETFCAP))
+		if (!ve_capable(CAP_SETFCAP))
 			return -EPERM;
 		return 0;
 	}
 
 	if (!strncmp(name, XATTR_SECURITY_PREFIX,
 		     sizeof(XATTR_SECURITY_PREFIX) - 1) &&
-	    !capable(CAP_SYS_ADMIN))
+	    !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -13,11 +13,19 @@
 #include <linux/slab.h>
 #include <linux/rcupdate.h>
 #include <linux/mutex.h>
+#include <uapi/linux/vzcalluser.h>
+#include <linux/major.h>
+#include <linux/module.h>
+#include <linux/capability.h>
+#include <linux/ve.h>
 
 #define ACC_MKNOD 1
 #define ACC_READ  2
 #define ACC_WRITE 4
-#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE)
+#define ACC_QUOTA 8	/* deprecated */
+#define ACC_HIDDEN 16
+#define ACC_MOUNT 64
+#define ACC_MASK (ACC_MKNOD | ACC_READ | ACC_WRITE | ACC_MOUNT)
 
 #define DEV_BLOCK 1
 #define DEV_CHAR  2
@@ -75,7 +83,7 @@ static int devcgroup_can_attach(struct cgroup *new_cgrp,
 {
 	struct task_struct *task = cgroup_taskset_first(set);
 
-	if (current != task && !capable(CAP_SYS_ADMIN))
+	if (current != task && !ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 	return 0;
 }
@@ -261,7 +269,7 @@ static void devcgroup_css_free(struct cgroup *cgroup)
 #define DEVCG_LIST 3
 
 #define MAJMINLEN 13
-#define ACCLEN 4
+#define ACCLEN 5
 
 static void set_access(char *acc, short access)
 {
@@ -273,6 +281,8 @@ static void set_access(char *acc, short access)
 		acc[idx++] = 'w';
 	if (access & ACC_MKNOD)
 		acc[idx++] = 'm';
+	if (access & ACC_MOUNT)
+		acc[idx++] = 'M';
 }
 
 static char type_to_char(short type)
@@ -347,6 +357,9 @@ static bool match_exception(struct list_head *exceptions, short type,
 	struct dev_exception_item *ex;
 
 	list_for_each_entry_rcu(ex, exceptions, list) {
+		short mismatched_bits;
+		bool allowed_mount;
+
 		if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
 			continue;
 		if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR))
@@ -356,7 +369,12 @@ static bool match_exception(struct list_head *exceptions, short type,
 		if (ex->minor != ~0 && ex->minor != minor)
 			continue;
 		/* provided access cannot have more than the exception rule */
-		if (access & (~ex->access))
+		mismatched_bits = access & (~ex->access) & ~ACC_MOUNT;
+		allowed_mount = !(mismatched_bits & ~ACC_WRITE) &&
+				(ex->access & ACC_MOUNT) &&
+				(access & ACC_MOUNT);
+
+		if (mismatched_bits && !allowed_mount)
 			continue;
 		return true;
 	}
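
The rewritten check above keeps the old rule - the requested access must be
a subset of what the exception grants - with one relaxation: ACC_MOUNT is
excluded from the mismatch computation, and a missing ACC_WRITE grant alone
is tolerated when both the request and the exception carry ACC_MOUNT, so
mount permission can stand in for write on the device node. A standalone
sketch of the same rule (ACC_* as defined earlier in this patch):

	static int exception_matches(short requested, short granted)
	{
		short missing = requested & ~granted & ~ACC_MOUNT;

		if (!missing)
			return 1;	/* plain subset match */
		/* only a missing write bit may be excused, by mount */
		return !(missing & ~ACC_WRITE) &&
		       (granted & ACC_MOUNT) && (requested & ACC_MOUNT);
	}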
@@ -657,7 +675,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	struct cgroup *p = devcgroup->css.cgroup;
 	struct dev_cgroup *parent = NULL;
 
-	if (!capable(CAP_SYS_ADMIN))
+	if (!ve_capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	if (p->parent)
@@ -673,8 +691,13 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 			if (has_children(devcgroup))
 				return -EINVAL;
 
-			if (!may_allow_all(parent))
-				return -EPERM;
+			if (!may_allow_all(parent)) {
+				if (ve_is_super(get_exec_env()))
+					return -EPERM;
+				else
+					/* Fool Docker inside a CT - report success */
+					return 0;
+			}
 			dev_exception_clean(devcgroup);
 			devcgroup->behavior = DEVCG_DEFAULT_ALLOW;
 			if (!parent)
@@ -750,7 +773,7 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 	}
 	if (!isspace(*b))
 		return -EINVAL;
-	for (b++, count = 0; count < 3; count++, b++) {
+	for (b++, count = 0; count < ACCLEN - 1; count++, b++) {
 		switch (*b) {
 		case 'r':
 			ex.access |= ACC_READ;
@@ -761,9 +784,12 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
 		case 'm':
 			ex.access |= ACC_MKNOD;
 			break;
+		case 'M':
+			ex.access |= ACC_MOUNT;
+			break;
 		case '\n':
 		case '\0':
-			count = 3;
+			count = ACCLEN - 1;
 			break;
 		default:
 			return -EINVAL;
@@ -881,8 +907,24 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
 				     minor, access);
 	rcu_read_unlock();
 
+#ifdef CONFIG_VE
+	/*
+	 * When restoring a container, allow everything while it
+	 * is in the pseudosuper state. We need this for early
+	 * mounting of the second ploop device. Still, don't
+	 * change behaviour in ve0.
+	 */
+	if (!rc) {
+		struct ve_struct *ve = get_exec_env();
+
+		if (!ve_is_super(ve) && ve->is_pseudosuper)
+			return 0;
+		return -EPERM;
+	}
+#else
 	if (!rc)
 		return -EPERM;
+#endif
 
 	return 0;
 }
@@ -899,11 +941,64 @@ int __devcgroup_inode_permission(struct inode *inode, int mask)
 		access |= ACC_WRITE;
 	if (mask & MAY_READ)
 		access |= ACC_READ;
+	if (mask & MAY_MOUNT)
+		access |= ACC_MOUNT;
 
 	return __devcgroup_check_permission(type, imajor(inode), iminor(inode),
 			access);
 }
 
+int devcgroup_device_permission(umode_t mode, dev_t dev, int mask)
+{
+	short type = 0, access = 0;
+
+	if (S_ISBLK(mode))
+		type = DEV_BLOCK;
+	if (S_ISCHR(mode))
+		type = DEV_CHAR;
+	if (mask & MAY_WRITE)
+		access |= ACC_WRITE;
+	if (mask & MAY_READ)
+		access |= ACC_READ;
+
+	return __devcgroup_check_permission(type, MAJOR(dev), MINOR(dev), access);
+}
+
+int devcgroup_device_visible(umode_t mode, int major, int start_minor, int nr_minors)
+{
+	struct dev_cgroup *dev_cgroup;
+	struct dev_exception_item *ex;
+	short access = ACC_READ | ACC_WRITE;
+	bool match = false;
+
+	rcu_read_lock();
+	dev_cgroup = task_devcgroup(current);
+
+	if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
+		match = true;
+		goto out;
+	}
+
+	list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
+		if ((ex->type & DEV_BLOCK) && !S_ISBLK(mode))
+			continue;
+		if ((ex->type & DEV_CHAR) && !S_ISCHR(mode))
+			continue;
+		if (ex->major != ~0 && ex->major != major)
+			continue;
+		if (ex->minor != ~0 && (ex->minor < start_minor ||
+					ex->minor >= start_minor + nr_minors))
+			continue;
+		if (!(access & ex->access))
+			continue;
+		match = true;
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return match;
+}
+
 int devcgroup_inode_mknod(int mode, dev_t dev)
 {
 	short type;
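
The minor comparison in devcgroup_device_visible() above treats
[start_minor, start_minor + nr_minors) as a half-open interval: with
start_minor == 0 and nr_minors == 16, an exception on minor 15 matches
while minor 16 does not. A minimal sketch of the membership test the two
comparisons negate:

	static int minor_in_range(unsigned int minor,
				  unsigned int start, unsigned int nr)
	{
		return minor >= start && minor < start + nr;	/* half-open */
	}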
@@ -920,3 +1015,107 @@ int devcgroup_inode_mknod(int mode, dev_t dev)
 			ACC_MKNOD);
 
 }
+
+#ifdef CONFIG_VE
+
+static unsigned decode_ve_perms(unsigned perm)
+{
+	unsigned mask = 0;
+
+	if (perm & S_IROTH)
+		mask |= ACC_READ;
+	if (perm & S_IWOTH)
+		mask |= ACC_WRITE;
+	if (perm & S_IXUSR)
+		mask |= ACC_MOUNT;
+
+	return mask;
+}
+
+static unsigned encode_ve_perms(unsigned mask)
+{
+	unsigned perm = 0;
+
+	if (mask & ACC_READ)
+		perm |= S_IROTH;
+	if (mask & ACC_WRITE)
+		perm |= S_IWOTH;
+	if (mask & ACC_MOUNT)
+		perm |= S_IXUSR;
+
+	return perm;
+}
+
+int devcgroup_set_perms_ve(struct ve_struct *ve,
+		unsigned type, dev_t dev, unsigned mask)
+{
+	int err = -EINVAL;
+	struct dev_exception_item new;
+	struct cgroup_subsys_state *css;
+
+	if ((type & S_IFMT) == S_IFBLK)
+		new.type = DEV_BLOCK;
+	else if ((type & S_IFMT) == S_IFCHR)
+		new.type = DEV_CHAR;
+	else
+		return -EINVAL;
+
+	new.access = decode_ve_perms(mask) | (mask ? ACC_MKNOD : 0);
+	new.major = new.minor = ~0;
+
+	switch (type & VE_USE_MASK) {
+	default:
+		new.minor = MINOR(dev);
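+		/* fall through */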
+	case VE_USE_MAJOR:
+		new.major = MAJOR(dev);
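+		/* fall through */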
+	case 0:
+		;
+	}
+
+	mutex_lock(&devcgroup_mutex);
+	css = ve_get_init_css(ve, devices_subsys_id);
+	err = dev_exception_add(cgroup_to_devcgroup(css->cgroup), &new);
+	css_put(css);
+	mutex_unlock(&devcgroup_mutex);
+
+	return err;
+}
+EXPORT_SYMBOL(devcgroup_set_perms_ve);
+
+int devcgroup_seq_show_ve(struct ve_struct *ve, struct seq_file *m)
+{
+	struct dev_exception_item *wh;
+	struct dev_cgroup *devcgroup;
+	struct cgroup_subsys_state *css;
+
+	css = ve_get_init_css(ve, devices_subsys_id);
+	devcgroup = cgroup_to_devcgroup(css->cgroup);
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(wh, &devcgroup->exceptions, list) {
+		char maj[MAJMINLEN], min[MAJMINLEN];
+		unsigned perm;
+
+		if (wh->access & ACC_HIDDEN)
+			continue;
+
+		set_majmin(maj, wh->major);
+		set_majmin(min, wh->minor);
+
+		perm = encode_ve_perms(wh->access);
+		if (perm & (S_IROTH | S_IWOTH))
+			perm |= S_IXOTH;
+
+		seq_printf(m, "%10u %c %03o %s:%s\n",
+				ve->veid,
+				type_to_char(wh->type),
+				perm, maj, min);
+	}
+	rcu_read_unlock();
+
+	css_put(css);
+	return 0;
+}
+EXPORT_SYMBOL(devcgroup_seq_show_ve);
+
+#endif /* CONFIG_VE */
--- a/security/keys/encrypted-keys/encrypted.c
+++ b/security/keys/encrypted-keys/encrypted.c
@@ -845,6 +845,8 @@ static int encrypted_update(struct key *key, struct key_preparsed_payload *prep)
 	size_t datalen = prep->datalen;
 	int ret = 0;
 
+	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags))
+		return -ENOKEY;
 	if (datalen <= 0 || datalen > 32767 || !prep->data)
 		return -EINVAL;
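
The guard above (mirrored in the trusted.c and user_defined.c hunks below)
exists because a negatively instantiated key stores an error code where the
payload would normally live, so an update must bail out with -ENOKEY before
dereferencing it. A minimal sketch of the pattern, assuming the usual key
fields:

	static int update_guarded(struct key *key)
	{
		if (test_bit(KEY_FLAG_NEGATIVE, &key->flags))
			return -ENOKEY;	/* no payload to update */
		/* only from here on is key->payload.data safe to touch */
		return 0;
	}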
 
--- a/security/keys/proc.c
+++ b/security/keys/proc.c
@@ -187,7 +187,7 @@ static int proc_keys_show(struct seq_file *m, void *v)
 	struct timespec now;
 	unsigned long timo;
 	key_ref_t key_ref, skey_ref;
-	char xbuf[12];
+	char xbuf[16];
 	int rc;
 
 	struct keyring_search_context ctx = {
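
The bump from xbuf[12] to xbuf[16] is sized by the longest string
proc_keys_show() can format, plausibly a timeout rendered in weeks:
18446744073709551615 seconds (2^64 - 1) divided by 604800 seconds per week
is 30500568904943 weeks, i.e. 14 digits plus the 'w' suffix plus the NUL,
16 bytes in all. A runnable check of that arithmetic:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t secs_per_week = 60ULL * 60 * 24 * 7;	/* 604800 */
		uint64_t max_weeks = UINT64_MAX / secs_per_week;

		/* prints 30500568904943w: 14 digits + 'w' + NUL = 16 */
		printf("%lluw\n", (unsigned long long)max_weeks);
		return 0;
	}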
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -132,6 +132,9 @@ int install_thread_keyring_to_cred(struct cred *new)
 {
 	struct key *keyring;
 
+	if (new->thread_keyring)
+		return 0;
+
 	keyring = keyring_alloc("_tid", new->uid, new->gid, new,
 				KEY_POS_ALL | KEY_USR_VIEW,
 				KEY_ALLOC_QUOTA_OVERRUN, NULL);
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -19,6 +19,8 @@
 #include <linux/slab.h>
 #include "internal.h"
 
+#include <keys/user-type.h>
+
 #define key_negative_timeout	60	/* default timeout on a negative key's existence */
 
 /**
@@ -517,7 +519,7 @@ struct key *request_key_and_link(struct key_type *type,
 		.index_key.type		= type,
 		.index_key.description	= description,
 		.cred			= current_cred(),
-		.match			= type->match,
+		.match			= type->match ? : user_match,
 		.match_data		= description,
 		.flags			= KEYRING_SEARCH_LOOKUP_DIRECT,
 	};
--- a/security/keys/trusted.c
+++ b/security/keys/trusted.c
@@ -1060,13 +1060,16 @@ static void trusted_rcu_free(struct rcu_head *rcu)
  */
 static int trusted_update(struct key *key, struct key_preparsed_payload *prep)
 {
-	struct trusted_key_payload *p = key->payload.data;
+	struct trusted_key_payload *p;
 	struct trusted_key_payload *new_p;
 	struct trusted_key_options *new_o;
 	size_t datalen = prep->datalen;
 	char *datablob;
 	int ret = 0;
 
+	if (test_bit(KEY_FLAG_NEGATIVE, &key->flags))
+		return -ENOKEY;
+	p = key->payload.data;
 	if (!p->migratable)
 		return -EPERM;
 	if (datalen <= 0 || datalen > 32767 || !prep->data)
--- a/security/keys/user_defined.c
+++ b/security/keys/user_defined.c
@@ -121,7 +121,10 @@ int user_update(struct key *key, struct key_preparsed_payload *prep)
 
 	if (ret == 0) {
 		/* attach the new data, displacing the old */
-		zap = key->payload.data;
+		if (!test_bit(KEY_FLAG_NEGATIVE, &key->flags))
+			zap = key->payload.data;
+		else
+			zap = NULL;
 		rcu_assign_keypointer(key, upayload);
 		key->expiry = 0;
 	}
--- a/security/selinux/Kconfig
+++ b/security/selinux/Kconfig
@@ -1,6 +1,6 @@
 config SECURITY_SELINUX
 	bool "NSA SELinux Support"
-	depends on SECURITY_NETWORK && AUDIT && NET && INET
+	depends on SECURITY_NETWORK && AUDIT && NET && INET && !VE
 	select NETWORK_SECMARK
 	default n
 	help
--- a/tools/testing/selftests/Makefile
+++ b/tools/testing/selftests/Makefile
@@ -2,6 +2,7 @@ TARGETS = breakpoints
 TARGETS += cpu-hotplug
 TARGETS += efivarfs
 TARGETS += kcmp
+TARGETS += memfd
 TARGETS += memory-hotplug
 TARGETS += mqueue
 TARGETS += net
--- /dev/null
+++ b/tools/testing/selftests/memfd/.gitignore
@@ -0,0 +1,4 @@
+fuse_mnt
+fuse_test
+memfd_test
+memfd-test-file
--- /dev/null
+++ b/tools/testing/selftests/memfd/Makefile
@@ -0,0 +1,47 @@
+#
+# tools/testing/selftests/memfd/Makefile
+#
+# Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+#
+
+uname_M := $(shell uname -m 2>/dev/null || echo not)
+ARCH ?= $(shell echo $(uname_M) | sed -e s/i.86/i386/)
+ifeq ($(ARCH),i386)
+	ARCH := X86
+endif
+ifeq ($(ARCH),x86_64)
+	ARCH := X86
+endif
+
+CFLAGS += -D_FILE_OFFSET_BITS=64
+CFLAGS += -I../../../../arch/x86/include/generated/uapi/
+CFLAGS += -I../../../../arch/x86/include/uapi/
+CFLAGS += -I../../../../include/uapi/
+CFLAGS += -I../../../../include/
+
+all:
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) memfd_test.c -o memfd_test
+else
+	echo "Not an x86 target, can't build memfd selftest"
+endif
+
+run_tests: all
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) memfd_test.c -o memfd_test
+endif
+	@./memfd_test || echo "memfd_test: [FAIL]"
+
+build_fuse:
+ifeq ($(ARCH),X86)
+	gcc $(CFLAGS) fuse_mnt.c `pkg-config fuse --cflags --libs` -o fuse_mnt
+	gcc $(CFLAGS) fuse_test.c -o fuse_test
+else
+	echo "Not an x86 target, can't build memfd selftest"
+endif
+
+run_fuse: build_fuse
+	@./run_fuse_test.sh || echo "fuse_test: [FAIL]"
+
+clean:
+	$(RM) memfd_test fuse_test fuse_mnt
--- /dev/null
+++ b/tools/testing/selftests/memfd/fuse_mnt.c
@@ -0,0 +1,117 @@
+/*
+ *  tools/testing/selftests/memfd/fuse_mnt.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * memfd test file-system
+ * This file uses FUSE to create a dummy file-system with only one file /memfd.
+ * This file is read-only and takes 1s per read.
+ *
+ * This file-system is used by the memfd test-cases to force the kernel to pin
+ * pages during read(). Due to the 1s delay of this file-system, this is a
+ * nice way to test race-conditions against get_user_pages() in the kernel.
+ *
+ * We use direct_io==1 to force the kernel to use direct-IO for this
+ * file-system.
+ */
+
+#define FUSE_USE_VERSION 26
+
+#include <fuse.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+
+static const char memfd_content[] = "memfd-example-content";
+static const char memfd_path[] = "/memfd";
+
+static int memfd_getattr(const char *path, struct stat *st)
+{
+	memset(st, 0, sizeof(*st));
+
+	if (!strcmp(path, "/")) {
+		st->st_mode = S_IFDIR | 0755;
+		st->st_nlink = 2;
+	} else if (!strcmp(path, memfd_path)) {
+		st->st_mode = S_IFREG | 0444;
+		st->st_nlink = 1;
+		st->st_size = strlen(memfd_content);
+	} else {
+		return -ENOENT;
+	}
+
+	return 0;
+}
+
+static int memfd_readdir(const char *path,
+			 void *buf,
+			 fuse_fill_dir_t filler,
+			 off_t offset,
+			 struct fuse_file_info *fi)
+{
+	if (strcmp(path, "/"))
+		return -ENOENT;
+
+	filler(buf, ".", NULL, 0);
+	filler(buf, "..", NULL, 0);
+	filler(buf, memfd_path + 1, NULL, 0);
+
+	return 0;
+}
+
+static int memfd_open(const char *path, struct fuse_file_info *fi)
+{
+	if (strcmp(path, memfd_path))
+		return -ENOENT;
+
+	if ((fi->flags & 3) != O_RDONLY)
+		return -EACCES;
+
+	/* force direct-IO */
+	fi->direct_io = 1;
+
+	return 0;
+}
+
+static int memfd_read(const char *path,
+		      char *buf,
+		      size_t size,
+		      off_t offset,
+		      struct fuse_file_info *fi)
+{
+	size_t len;
+
+	if (strcmp(path, memfd_path) != 0)
+		return -ENOENT;
+
+	sleep(1);
+
+	len = strlen(memfd_content);
+	if (offset < len) {
+		if (offset + size > len)
+			size = len - offset;
+
+		memcpy(buf, memfd_content + offset, size);
+	} else {
+		size = 0;
+	}
+
+	return size;
+}
+
+static struct fuse_operations memfd_ops = {
+	.getattr	= memfd_getattr,
+	.readdir	= memfd_readdir,
+	.open		= memfd_open,
+	.read		= memfd_read,
+};
+
+int main(int argc, char *argv[])
+{
+	return fuse_main(argc, argv, &memfd_ops, NULL);
+}
--- /dev/null
+++ b/tools/testing/selftests/memfd/fuse_test.c
@@ -0,0 +1,318 @@
+/*
+ *  tools/testing/selftests/memfd/fuse_test.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+/*
+ * memfd GUP test-case
+ * This tests memfd interactions with get_user_pages(). We require the
+ * fuse_mnt.c program to provide a fake direct-IO FUSE mount-point for us. This
+ * file-system delays _all_ reads by 1s and forces direct-IO. This means, any
+ * read() on files in that file-system will pin the receive-buffer pages for at
+ * least 1s via get_user_pages().
+ *
+ * We use this trick to race ADD_SEALS against a write on a memfd object. The
+ * ADD_SEALS must fail if the memfd pages are still pinned. Note that we use
+ * the read() syscall with our memory-mapped memfd object as receive buffer to
+ * force the kernel to write into our memfd object.
+ */
+
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+#define MFD_DEF_SIZE 8192
+#define STACK_SIZE 65535
+
+static int sys_memfd_create(const char *name,
+			    unsigned int flags)
+{
+	return syscall(__NR_memfd_create, name, flags);
+}
+
+static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
+{
+	int r, fd;
+
+	fd = sys_memfd_create(name, flags);
+	if (fd < 0) {
+		printf("memfd_create(\"%s\", %u) failed: %m\n",
+		       name, flags);
+		abort();
+	}
+
+	r = ftruncate(fd, sz);
+	if (r < 0) {
+		printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
+		abort();
+	}
+
+	return fd;
+}
+
+static __u64 mfd_assert_get_seals(int fd)
+{
+	long r;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0) {
+		printf("GET_SEALS(%d) failed: %m\n", fd);
+		abort();
+	}
+
+	return r;
+}
+
+static void mfd_assert_has_seals(int fd, __u64 seals)
+{
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	if (s != seals) {
+		printf("%llu != %llu = GET_SEALS(%d)\n",
+		       (unsigned long long)seals, (unsigned long long)s, fd);
+		abort();
+	}
+}
+
+static void mfd_assert_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r < 0) {
+		printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+}
+
+static int mfd_busy_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0)
+		s = 0;
+	else
+		s = r;
+
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r < 0 && errno != EBUSY) {
+		printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected with EBUSY: %m\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+
+	return r;
+}
+
+static void *mfd_assert_mmap_shared(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static void *mfd_assert_mmap_private(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static int global_mfd = -1;
+static void *global_p = NULL;
+
+static int sealing_thread_fn(void *arg)
+{
+	int r;
+
+	/*
+	 * This thread first waits 200ms so any pending operation in the parent
+	 * is correctly started. After that, it tries to seal @global_mfd as
+	 * SEAL_WRITE. This _must_ fail as the parent thread has a read() into
+	 * that memory mapped object still ongoing.
+	 * We then wait one more second and try sealing again. This time it
+	 * must succeed as there shouldn't be anyone else pinning the pages.
+	 */
+
+	/* wait 200ms for FUSE-request to be active */
+	usleep(200000);
+
+	/* unmap the mapping before sealing to avoid i_mmap_writable failures */
+	munmap(global_p, MFD_DEF_SIZE);
+
+	/* Try sealing the global file; expect EBUSY or success. Current
+	 * kernels will never succeed, but in the future, kernels might
+	 * implement page-replacements or other fancy ways to avoid racing
+	 * writes. */
+	r = mfd_busy_add_seals(global_mfd, F_SEAL_WRITE);
+	if (r >= 0) {
+		printf("HURRAY! This kernel fixed GUP races!\n");
+	} else {
+		/* wait 1s more so the FUSE-request is done */
+		sleep(1);
+
+		/* try sealing the global file again */
+		mfd_assert_add_seals(global_mfd, F_SEAL_WRITE);
+	}
+
+	return 0;
+}
+
+static pid_t spawn_sealing_thread(void)
+{
+	uint8_t *stack;
+	pid_t pid;
+
+	stack = malloc(STACK_SIZE);
+	if (!stack) {
+		printf("malloc(STACK_SIZE) failed: %m\n");
+		abort();
+	}
+
+	pid = clone(sealing_thread_fn,
+		    stack + STACK_SIZE,
+		    SIGCHLD | CLONE_FILES | CLONE_FS | CLONE_VM,
+		    NULL);
+	if (pid < 0) {
+		printf("clone() failed: %m\n");
+		abort();
+	}
+
+	return pid;
+}
+
+static void join_sealing_thread(pid_t pid)
+{
+	waitpid(pid, NULL, 0);
+}
+
+int main(int argc, char **argv)
+{
+	static const char zero[MFD_DEF_SIZE];
+	int fd, mfd, r;
+	void *p;
+	int was_sealed;
+	pid_t pid;
+
+	if (argc < 2) {
+		printf("error: please pass path to file in fuse_mnt mount-point\n");
+		abort();
+	}
+
+	/* open FUSE memfd file for GUP testing */
+	printf("opening: %s\n", argv[1]);
+	fd = open(argv[1], O_RDONLY | O_CLOEXEC);
+	if (fd < 0) {
+		printf("cannot open(\"%s\"): %m\n", argv[1]);
+		abort();
+	}
+
+	/* create new memfd-object */
+	mfd = mfd_assert_new("kern_memfd_fuse",
+			     MFD_DEF_SIZE,
+			     MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	/* mmap memfd-object for writing */
+	p = mfd_assert_mmap_shared(mfd);
+
+	/* pass mfd+mapping to a separate sealing-thread which tries to seal
+	 * the memfd objects with SEAL_WRITE while we write into it */
+	global_mfd = mfd;
+	global_p = p;
+	pid = spawn_sealing_thread();
+
+	/* Use read() on the FUSE file to read into our memory-mapped memfd
+	 * object. This races the other thread which tries to seal the
+	 * memfd-object.
+	 * If @fd is on the memfd-fake-FUSE-FS, the read() is delayed by 1s.
+	 * This guarantees that the receive-buffer is pinned for 1s until the
+	 * data is written into it. The racing ADD_SEALS should thus fail as
+	 * the pages are still pinned. */
+	r = read(fd, p, MFD_DEF_SIZE);
+	if (r < 0) {
+		printf("read() failed: %m\n");
+		abort();
+	} else if (!r) {
+		printf("unexpected EOF on read()\n");
+		abort();
+	}
+
+	was_sealed = mfd_assert_get_seals(mfd) & F_SEAL_WRITE;
+
+	/* Wait for sealing-thread to finish and verify that it
+	 * successfully sealed the file after the second try. */
+	join_sealing_thread(pid);
+	mfd_assert_has_seals(mfd, F_SEAL_WRITE);
+
+	/* *IF* the memfd-object was sealed at the time our read() returned,
+	 * then the kernel did a page-replacement or canceled the read() (or
+	 * whatever magic it did..). In that case, the memfd object is still
+	 * all zero.
+	 * In case the memfd-object was *not* sealed, the read() was successful
+	 * and the memfd object must *not* be all zero.
+	 * Note that in real scenarios, there might be a mixture of both, but
+	 * in this test-case, we have explicit 200ms delays which should be
+	 * enough to avoid any in-flight writes. */
+
+	p = mfd_assert_mmap_private(mfd);
+	if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) {
+		printf("memfd sealed during read() but data not discarded\n");
+		abort();
+	} else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) {
+		printf("memfd sealed after read() but data discarded\n");
+		abort();
+	}
+
+	close(mfd);
+	close(fd);
+
+	printf("fuse: DONE\n");
+
+	return 0;
+}
--- /dev/null
+++ b/tools/testing/selftests/memfd/memfd_test.c
@@ -0,0 +1,920 @@
+/*
+ *  tools/testing/selftests/memfd/memfd_test.c
+ *
+ *  Copyright (c) 2010-2015 Parallels IP Holdings GmbH
+ *
+ */
+
+#define _GNU_SOURCE
+#define __EXPORTED_HEADERS__
+
+#include <errno.h>
+#include <inttypes.h>
+#include <limits.h>
+#include <linux/falloc.h>
+#include <linux/fcntl.h>
+#include <linux/memfd.h>
+#include <sched.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <signal.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/syscall.h>
+#include <unistd.h>
+
+#define MFD_DEF_SIZE 8192
+#define STACK_SIZE 65535
+
+static int sys_memfd_create(const char *name,
+			    unsigned int flags)
+{
+	return syscall(__NR_memfd_create, name, flags);
+}
+
+static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags)
+{
+	int r, fd;
+
+	fd = sys_memfd_create(name, flags);
+	if (fd < 0) {
+		printf("memfd_create(\"%s\", %u) failed: %m\n",
+		       name, flags);
+		abort();
+	}
+
+	r = ftruncate(fd, sz);
+	if (r < 0) {
+		printf("ftruncate(%llu) failed: %m\n", (unsigned long long)sz);
+		abort();
+	}
+
+	return fd;
+}
+
+static void mfd_fail_new(const char *name, unsigned int flags)
+{
+	int r;
+
+	r = sys_memfd_create(name, flags);
+	if (r >= 0) {
+		printf("memfd_create(\"%s\", %u) succeeded, but failure expected\n",
+		       name, flags);
+		close(r);
+		abort();
+	}
+}
+
+static __u64 mfd_assert_get_seals(int fd)
+{
+	long r;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0) {
+		printf("GET_SEALS(%d) failed: %m\n", fd);
+		abort();
+	}
+
+	return r;
+}
+
+static void mfd_assert_has_seals(int fd, __u64 seals)
+{
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	if (s != seals) {
+		printf("%llu != %llu = GET_SEALS(%d)\n",
+		       (unsigned long long)seals, (unsigned long long)s, fd);
+		abort();
+	}
+}
+
+static void mfd_assert_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	s = mfd_assert_get_seals(fd);
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r < 0) {
+		printf("ADD_SEALS(%d, %llu -> %llu) failed: %m\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+}
+
+static void mfd_fail_add_seals(int fd, __u64 seals)
+{
+	long r;
+	__u64 s;
+
+	r = fcntl(fd, F_GET_SEALS);
+	if (r < 0)
+		s = 0;
+	else
+		s = r;
+
+	r = fcntl(fd, F_ADD_SEALS, seals);
+	if (r >= 0) {
+		printf("ADD_SEALS(%d, %llu -> %llu) didn't fail as expected\n",
+		       fd, (unsigned long long)s, (unsigned long long)seals);
+		abort();
+	}
+}
+
+static void mfd_assert_size(int fd, size_t size)
+{
+	struct stat st;
+	int r;
+
+	r = fstat(fd, &st);
+	if (r < 0) {
+		printf("fstat(%d) failed: %m\n", fd);
+		abort();
+	} else if (st.st_size != size) {
+		printf("wrong file size %lld, but expected %lld\n",
+		       (long long)st.st_size, (long long)size);
+		abort();
+	}
+}
+
+static int mfd_assert_dup(int fd)
+{
+	int r;
+
+	r = dup(fd);
+	if (r < 0) {
+		printf("dup(%d) failed: %m\n", fd);
+		abort();
+	}
+
+	return r;
+}
+
+static void *mfd_assert_mmap_shared(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static void *mfd_assert_mmap_private(int fd)
+{
+	void *p;
+
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	return p;
+}
+
+static int mfd_assert_open(int fd, int flags, mode_t mode)
+{
+	char buf[512];
+	int r;
+
+	sprintf(buf, "/proc/self/fd/%d", fd);
+	r = open(buf, flags, mode);
+	if (r < 0) {
+		printf("open(%s) failed: %m\n", buf);
+		abort();
+	}
+
+	return r;
+}
+
+static void mfd_fail_open(int fd, int flags, mode_t mode)
+{
+	char buf[512];
+	int r;
+
+	sprintf(buf, "/proc/self/fd/%d", fd);
+	r = open(buf, flags, mode);
+	if (r >= 0) {
+		printf("open(%s) didn't fail as expected\n");
+		abort();
+	}
+}
+
+static void mfd_assert_read(int fd)
+{
+	char buf[16];
+	void *p;
+	ssize_t l;
+
+	l = read(fd, buf, sizeof(buf));
+	if (l != sizeof(buf)) {
+		printf("read() failed: %m\n");
+		abort();
+	}
+
+	/* verify PROT_READ *is* allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify MAP_PRIVATE is *always* allowed (even writable) */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_PRIVATE,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	munmap(p, MFD_DEF_SIZE);
+}
+
+static void mfd_assert_write(int fd)
+{
+	ssize_t l;
+	void *p;
+	int r;
+
+	/* verify write() succeeds */
+	l = write(fd, "\0\0\0\0", 4);
+	if (l != 4) {
+		printf("write() failed: %m\n");
+		abort();
+	}
+
+	/* verify PROT_READ | PROT_WRITE is allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	*(char *)p = 0;
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify PROT_WRITE is allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+	*(char *)p = 0;
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify PROT_READ with MAP_SHARED is allowed and a following
+	 * mprotect(PROT_WRITE) allows writing */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p == MAP_FAILED) {
+		printf("mmap() failed: %m\n");
+		abort();
+	}
+
+	r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE);
+	if (r < 0) {
+		printf("mprotect() failed: %m\n");
+		abort();
+	}
+
+	*(char *)p = 0;
+	munmap(p, MFD_DEF_SIZE);
+
+	/* verify PUNCH_HOLE works */
+	r = fallocate(fd,
+		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+		      0,
+		      MFD_DEF_SIZE);
+	if (r < 0) {
+		printf("fallocate(PUNCH_HOLE) failed: %m\n");
+		abort();
+	}
+}
+
+static void mfd_fail_write(int fd)
+{
+	ssize_t l;
+	void *p;
+	int r;
+
+	/* verify write() fails */
+	l = write(fd, "data", 4);
+	if (l != -1 || errno != EPERM) {
+		printf("expected EPERM on write(), but got %d: %m\n", (int)l);
+		abort();
+	}
+
+	/* verify PROT_READ | PROT_WRITE is not allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ | PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p != MAP_FAILED) {
+		printf("mmap() didn't fail as expected\n");
+		abort();
+	}
+
+	/* verify PROT_WRITE is not allowed */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_WRITE,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p != MAP_FAILED) {
+		printf("mmap() didn't fail as expected\n");
+		abort();
+	}
+
+	/* Verify PROT_READ with MAP_SHARED with a following mprotect is not
+	 * allowed. Note that for r/w the kernel already prevents the mmap. */
+	p = mmap(NULL,
+		 MFD_DEF_SIZE,
+		 PROT_READ,
+		 MAP_SHARED,
+		 fd,
+		 0);
+	if (p != MAP_FAILED) {
+		r = mprotect(p, MFD_DEF_SIZE, PROT_READ | PROT_WRITE);
+		if (r >= 0) {
+			printf("mmap()+mprotect() didn't fail as expected\n");
+			abort();
+		}
+	}
+
+	/* verify PUNCH_HOLE fails */
+	r = fallocate(fd,
+		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+		      0,
+		      MFD_DEF_SIZE);
+	if (r >= 0) {
+		printf("fallocate(PUNCH_HOLE) didn't fail as expected\n");
+		abort();
+	}
+}
+
+static void mfd_assert_shrink(int fd)
+{
+	int r, fd2;
+
+	r = ftruncate(fd, MFD_DEF_SIZE / 2);
+	if (r < 0) {
+		printf("ftruncate(SHRINK) failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE / 2);
+
+	fd2 = mfd_assert_open(fd,
+			      O_RDWR | O_CREAT | O_TRUNC,
+			      S_IRUSR | S_IWUSR);
+	close(fd2);
+
+	mfd_assert_size(fd, 0);
+}
+
+static void mfd_fail_shrink(int fd)
+{
+	int r;
+
+	r = ftruncate(fd, MFD_DEF_SIZE / 2);
+	if (r >= 0) {
+		printf("ftruncate(SHRINK) didn't fail as expected\n");
+		abort();
+	}
+
+	mfd_fail_open(fd,
+		      O_RDWR | O_CREAT | O_TRUNC,
+		      S_IRUSR | S_IWUSR);
+}
+
+static void mfd_assert_grow(int fd)
+{
+	int r;
+
+	r = ftruncate(fd, MFD_DEF_SIZE * 2);
+	if (r < 0) {
+		printf("ftruncate(GROW) failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE * 2);
+
+	r = fallocate(fd,
+		      0,
+		      0,
+		      MFD_DEF_SIZE * 4);
+	if (r < 0) {
+		printf("fallocate(ALLOC) failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE * 4);
+}
+
+static void mfd_fail_grow(int fd)
+{
+	int r;
+
+	r = ftruncate(fd, MFD_DEF_SIZE * 2);
+	if (r >= 0) {
+		printf("ftruncate(GROW) didn't fail as expected\n");
+		abort();
+	}
+
+	r = fallocate(fd,
+		      0,
+		      0,
+		      MFD_DEF_SIZE * 4);
+	if (r >= 0) {
+		printf("fallocate(ALLOC) didn't fail as expected\n");
+		abort();
+	}
+}
+
+static void mfd_assert_grow_write(int fd)
+{
+	static char buf[MFD_DEF_SIZE * 8];
+	ssize_t l;
+
+	l = pwrite(fd, buf, sizeof(buf), 0);
+	if (l != sizeof(buf)) {
+		printf("pwrite() failed: %m\n");
+		abort();
+	}
+
+	mfd_assert_size(fd, MFD_DEF_SIZE * 8);
+}
+
+static void mfd_fail_grow_write(int fd)
+{
+	static char buf[MFD_DEF_SIZE * 8];
+	ssize_t l;
+
+	l = pwrite(fd, buf, sizeof(buf), 0);
+	if (l == sizeof(buf)) {
+		printf("pwrite() didn't fail as expected\n");
+		abort();
+	}
+}
+
+static int idle_thread_fn(void *arg)
+{
+	sigset_t set;
+	int sig;
+
+	/* dummy waiter; SIGTERM terminates us anyway */
+	sigemptyset(&set);
+	sigaddset(&set, SIGTERM);
+	sigwait(&set, &sig);
+
+	return 0;
+}
+
+static pid_t spawn_idle_thread(unsigned int flags)
+{
+	uint8_t *stack;
+	pid_t pid;
+
+	stack = malloc(STACK_SIZE);
+	if (!stack) {
+		printf("malloc(STACK_SIZE) failed: %m\n");
+		abort();
+	}
+
+	pid = clone(idle_thread_fn,
+		    stack + STACK_SIZE,
+		    SIGCHLD | flags,
+		    NULL);
+	if (pid < 0) {
+		printf("clone() failed: %m\n");
+		abort();
+	}
+
+	return pid;
+}
+
+static void join_idle_thread(pid_t pid)
+{
+	kill(pid, SIGTERM);
+	waitpid(pid, NULL, 0);
+}
+
+/*
+ * Test memfd_create() syscall
+ * Verify syscall-argument validation, including name checks, flag validation
+ * and more.
+ */
+static void test_create(void)
+{
+	char buf[2048];
+	int fd;
+
+	/* test NULL name */
+	mfd_fail_new(NULL, 0);
+
+	/* test over-long name (not zero-terminated) */
+	memset(buf, 0xff, sizeof(buf));
+	mfd_fail_new(buf, 0);
+
+	/* test over-long zero-terminated name */
+	memset(buf, 0xff, sizeof(buf));
+	buf[sizeof(buf) - 1] = 0;
+	mfd_fail_new(buf, 0);
+
+	/* verify "" is a valid name */
+	fd = mfd_assert_new("", 0, 0);
+	close(fd);
+
+	/* verify invalid memfd_create() flags */
+	mfd_fail_new("", 0x0100);
+	mfd_fail_new("", ~MFD_CLOEXEC);
+	mfd_fail_new("", ~MFD_ALLOW_SEALING);
+	mfd_fail_new("", ~0);
+	mfd_fail_new("", 0x80000000U);
+
+	/* verify MFD_CLOEXEC is allowed */
+	fd = mfd_assert_new("", 0, MFD_CLOEXEC);
+	close(fd);
+
+	/* verify MFD_ALLOW_SEALING is allowed */
+	fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING);
+	close(fd);
+
+	/* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */
+	fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC);
+	close(fd);
+}
+
+/*
+ * Test basic sealing
+ * A very basic sealing test to see whether setting/retrieving seals works.
+ */
+static void test_basic(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_basic",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+
+	/* add basic seals */
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+
+	/* add them again */
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_WRITE);
+
+	/* add more seals and seal against sealing */
+	mfd_assert_add_seals(fd, F_SEAL_GROW | F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK |
+				 F_SEAL_GROW |
+				 F_SEAL_WRITE |
+				 F_SEAL_SEAL);
+
+	/* verify that sealing no longer works */
+	mfd_fail_add_seals(fd, F_SEAL_GROW);
+	mfd_fail_add_seals(fd, 0);
+
+	close(fd);
+
+	/* verify sealing does not work without MFD_ALLOW_SEALING */
+	fd = mfd_assert_new("kern_memfd_basic",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+	mfd_fail_add_seals(fd, F_SEAL_SHRINK |
+			       F_SEAL_GROW |
+			       F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+	close(fd);
+}
+
+/*
+ * Test SEAL_WRITE
+ * Test whether SEAL_WRITE actually prevents modifications.
+ */
+static void test_seal_write(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_write",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE);
+
+	mfd_assert_read(fd);
+	mfd_fail_write(fd);
+	mfd_assert_shrink(fd);
+	mfd_assert_grow(fd);
+	mfd_fail_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test SEAL_SHRINK
+ * Test whether SEAL_SHRINK actually prevents shrinking
+ */
+static void test_seal_shrink(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_shrink",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+	mfd_fail_shrink(fd);
+	mfd_assert_grow(fd);
+	mfd_assert_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test SEAL_GROW
+ * Test whether SEAL_GROW actually prevents growing
+ */
+static void test_seal_grow(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_grow",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_GROW);
+	mfd_assert_has_seals(fd, F_SEAL_GROW);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+	mfd_assert_shrink(fd);
+	mfd_fail_grow(fd);
+	mfd_fail_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test SEAL_SHRINK | SEAL_GROW
+ * Test whether SEAL_SHRINK | SEAL_GROW actually prevents resizing
+ */
+static void test_seal_resize(void)
+{
+	int fd;
+
+	fd = mfd_assert_new("kern_memfd_seal_resize",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK | F_SEAL_GROW);
+
+	mfd_assert_read(fd);
+	mfd_assert_write(fd);
+	mfd_fail_shrink(fd);
+	mfd_fail_grow(fd);
+	mfd_fail_grow_write(fd);
+
+	close(fd);
+}
+
+/*
+ * Test sharing via dup()
+ * Test that seals are shared between dupped FDs and they're all equal.
+ */
+static void test_share_dup(void)
+{
+	int fd, fd2;
+
+	fd = mfd_assert_new("kern_memfd_share_dup",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	fd2 = mfd_assert_dup(fd);
+	mfd_assert_has_seals(fd2, 0);
+
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE);
+
+	mfd_assert_add_seals(fd2, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+	mfd_assert_add_seals(fd, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+
+	mfd_fail_add_seals(fd, F_SEAL_GROW);
+	mfd_fail_add_seals(fd2, F_SEAL_GROW);
+	mfd_fail_add_seals(fd, F_SEAL_SEAL);
+	mfd_fail_add_seals(fd2, F_SEAL_SEAL);
+
+	close(fd2);
+
+	mfd_fail_add_seals(fd, F_SEAL_GROW);
+	close(fd);
+}
+
+/*
+ * Test sealing with active mmap()s
+ * Modifying seals is only allowed if no other mmap() refs exist.
+ */
+static void test_share_mmap(void)
+{
+	int fd;
+	void *p;
+
+	fd = mfd_assert_new("kern_memfd_share_mmap",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	/* shared/writable ref prevents sealing WRITE, but allows others */
+	p = mfd_assert_mmap_shared(fd);
+	mfd_fail_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, 0);
+	mfd_assert_add_seals(fd, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_SHRINK);
+	munmap(p, MFD_DEF_SIZE);
+
+	/* readable ref allows sealing */
+	p = mfd_assert_mmap_private(fd);
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	munmap(p, MFD_DEF_SIZE);
+
+	close(fd);
+}
+
+/*
+ * Test sealing with open(/proc/self/fd/%d)
+ * Via /proc we can get access to a separate file-context for the same memfd.
+ * This is *not* like dup(), but like a real separate open(). Make sure the
+ * semantics are as expected and we correctly check for RDONLY / WRONLY / RDWR.
+ */
+static void test_share_open(void)
+{
+	int fd, fd2;
+
+	fd = mfd_assert_new("kern_memfd_share_open",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	fd2 = mfd_assert_open(fd, O_RDWR, 0);
+	mfd_assert_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE);
+
+	mfd_assert_add_seals(fd2, F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+	close(fd);
+	fd = mfd_assert_open(fd2, O_RDONLY, 0);
+
+	mfd_fail_add_seals(fd, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK);
+
+	close(fd2);
+	fd2 = mfd_assert_open(fd, O_RDWR, 0);
+
+	mfd_assert_add_seals(fd2, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+	mfd_assert_has_seals(fd2, F_SEAL_WRITE | F_SEAL_SHRINK | F_SEAL_SEAL);
+
+	close(fd2);
+	close(fd);
+}
+
+/*
+ * Test sharing via fork()
+ * Test whether seal-modifications work as expected with forked childs.
+ */
+static void test_share_fork(void)
+{
+	int fd;
+	pid_t pid;
+
+	fd = mfd_assert_new("kern_memfd_share_fork",
+			    MFD_DEF_SIZE,
+			    MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	mfd_assert_has_seals(fd, 0);
+
+	pid = spawn_idle_thread(0);
+	mfd_assert_add_seals(fd, F_SEAL_SEAL);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+	mfd_fail_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+	join_idle_thread(pid);
+
+	mfd_fail_add_seals(fd, F_SEAL_WRITE);
+	mfd_assert_has_seals(fd, F_SEAL_SEAL);
+
+	close(fd);
+}
+
+int main(int argc, char **argv)
+{
+	pid_t pid;
+
+	printf("memfd: CREATE\n");
+	test_create();
+	printf("memfd: BASIC\n");
+	test_basic();
+
+	printf("memfd: SEAL-WRITE\n");
+	test_seal_write();
+	printf("memfd: SEAL-SHRINK\n");
+	test_seal_shrink();
+	printf("memfd: SEAL-GROW\n");
+	test_seal_grow();
+	printf("memfd: SEAL-RESIZE\n");
+	test_seal_resize();
+
+	printf("memfd: SHARE-DUP\n");
+	test_share_dup();
+	printf("memfd: SHARE-MMAP\n");
+	test_share_mmap();
+	printf("memfd: SHARE-OPEN\n");
+	test_share_open();
+	printf("memfd: SHARE-FORK\n");
+	test_share_fork();
+
+	/* Run test-suite in a multi-threaded environment with a shared
+	 * file-table. */
+	pid = spawn_idle_thread(CLONE_FILES | CLONE_FS | CLONE_VM);
+	printf("memfd: SHARE-DUP (shared file-table)\n");
+	test_share_dup();
+	printf("memfd: SHARE-MMAP (shared file-table)\n");
+	test_share_mmap();
+	printf("memfd: SHARE-OPEN (shared file-table)\n");
+	test_share_open();
+	printf("memfd: SHARE-FORK (shared file-table)\n");
+	test_share_fork();
+	join_idle_thread(pid);
+
+	printf("memfd: DONE\n");
+
+	return 0;
+}
--- /dev/null
+++ b/tools/testing/selftests/memfd/run_fuse_test.sh
@@ -0,0 +1,14 @@
+#!/bin/sh
+
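+# Clean up a stale mount point left behind by a previous, aborted run.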
+if test -d "./mnt" ; then
+	fusermount -u ./mnt
+	rmdir ./mnt
+fi
+
+set -e
+
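+# Mount the FUSE filesystem, run the memfd test against a file inside it,
+# then unmount and remove the mount point.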
+mkdir mnt
+./fuse_mnt ./mnt
+./fuse_test ./mnt/memfd
+fusermount -u ./mnt
+rmdir ./mnt
--- a/virt/kvm/irqchip.c
+++ b/virt/kvm/irqchip.c
@@ -167,6 +167,14 @@ out:
 	return r;
 }
 
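+/*
+ * Weak no-op defaults; architectures that need to react to routing changes
+ * override these.  kvm_arch_irq_routing_update() is called with
+ * kvm->irq_lock held, kvm_arch_post_irq_routing_update() after it has
+ * been dropped (see kvm_set_irq_routing() below).
+ */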
+void __attribute__((weak)) kvm_arch_irq_routing_update(struct kvm *kvm)
+{
+}
+
+void __attribute__((weak)) kvm_arch_post_irq_routing_update(struct kvm *kvm)
+{
+}
+
 int kvm_set_irq_routing(struct kvm *kvm,
 			const struct kvm_irq_routing_entry *ue,
 			unsigned nr,
@@ -220,9 +228,10 @@ int kvm_set_irq_routing(struct kvm *kvm,
 	old = kvm->irq_routing;
 	rcu_assign_pointer(kvm->irq_routing, new);
 	kvm_irq_routing_update(kvm);
+	kvm_arch_irq_routing_update(kvm);
 	mutex_unlock(&kvm->irq_lock);
 
-	kvm_arch_irq_routing_update(kvm);
+	kvm_arch_post_irq_routing_update(kvm);
 
 	synchronize_srcu_expedited(&kvm->irq_srcu);
 
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -66,17 +66,17 @@
 MODULE_AUTHOR("Qumranet");
 MODULE_LICENSE("GPL");
 
-/* halt polling only reduces halt latency by 5-7 us, 400us is enough */
-static unsigned int halt_poll_ns = 400000;
+/*
+ * Architectures should define KVM_HALT_POLL_NS_DEFAULT according to their
+ * typical halt latency.
+ */
+static unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
 module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);
 
 /* Default doubles per-vcpu halt_poll_ns. */
 static unsigned int halt_poll_ns_grow = 2;
-module_param(halt_poll_ns_grow, int, S_IRUGO);
+module_param(halt_poll_ns_grow, uint, S_IRUGO | S_IWUSR);
 
 /* Default resets per-vcpu halt_poll_ns. */
 static unsigned int halt_poll_ns_shrink;
-module_param(halt_poll_ns_shrink, int, S_IRUGO);
+module_param(halt_poll_ns_shrink, uint, S_IRUGO | S_IWUSR);
 
 /*
  * Ordering of locks:
@@ -399,6 +399,35 @@ static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
 	return young;
 }
 
+static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long address)
+{
+	struct kvm *kvm = mmu_notifier_to_kvm(mn);
+	int young, idx;
+
+	idx = srcu_read_lock(&kvm->srcu);
+	spin_lock(&kvm->mmu_lock);
+	/*
+	 * Even though we do not flush TLB, this will still adversely
+	 * affect performance on pre-Haswell Intel EPT, where there is
+	 * no EPT Access Bit to clear, so we have to tear down EPT
+	 * tables instead. If we find this unacceptable, we can always
+	 * add a parameter to kvm_age_hva so that it effectively doesn't
+	 * do anything on clear_young.
+	 *
+	 * Also note that currently we never issue secondary TLB flushes
+	 * from clear_young, leaving this job up to the regular system
+	 * cadence. If we find this inaccurate, we might come up with a
+	 * more sophisticated heuristic later.
+	 */
+	young = kvm_age_hva(kvm, address);
+	spin_unlock(&kvm->mmu_lock);
+	srcu_read_unlock(&kvm->srcu, idx);
+
+	return young;
+}
+
 static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
 				       struct mm_struct *mm,
 				       unsigned long address)
@@ -431,6 +460,7 @@ static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
 	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
 	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
 	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
+	.clear_young		= kvm_mmu_notifier_clear_young,
 	.test_young		= kvm_mmu_notifier_test_young,
 	.change_pte		= kvm_mmu_notifier_change_pte,
 	.release		= kvm_mmu_notifier_release,
@@ -1559,7 +1589,8 @@ void kvm_set_pfn_dirty(kvm_pfn_t pfn)
 	if (!kvm_is_reserved_pfn(pfn)) {
 		struct page *page = pfn_to_page(pfn);
 
-		if (!PageReserved(page))
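+		/* Only dirty pages that are anonymous or have no mapping. */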
+		if (!PageReserved(page) &&
+		    (!page->mapping || PageAnon(page)))
 			SetPageDirty(page);
 	}
 }
@@ -1920,14 +1951,18 @@ EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
 
 static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-	int old, val;
+	unsigned int old, val, grow;
 
 	old = val = vcpu->halt_poll_ns;
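+	/* Read the tunable once; S_IWUSR makes it writable at runtime. */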
+	grow = READ_ONCE(halt_poll_ns_grow);
 	/* 10us base */
-	if (val == 0 && halt_poll_ns_grow)
+	if (val == 0 && grow)
 		val = 10000;
 	else
-		val *= halt_poll_ns_grow;
+		val *= grow;
+
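+	/* Never grow beyond the global halt_poll_ns cap. */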
+	if (val > halt_poll_ns)
+		val = halt_poll_ns;
 
 	vcpu->halt_poll_ns = val;
 	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
@@ -1935,13 +1970,14 @@ static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
 
 static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
 {
-	int old, val;
+	unsigned int old, val, shrink;
 
 	old = val = vcpu->halt_poll_ns;
-	if (halt_poll_ns_shrink == 0)
+	shrink = READ_ONCE(halt_poll_ns_shrink);
+	if (shrink == 0)
 		val = 0;
 	else
-		val /= halt_poll_ns_shrink;
+		val /= shrink;
 
 	vcpu->halt_poll_ns = val;
 	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
