From b14f8ab2844987f013253dd04b708bde7fc1b52d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 18:27:55 +0200
Subject: exofs: Kbuild, Headers and osd utils

This patch includes osd infrastructure that will be used later by
the file system.

Also the declarations of constants, on disk structures,
and prototypes.

And the Kbuild+Kconfig files needed to build the exofs module.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild   |  16 +++++
 fs/exofs/Kconfig  |  13 ++++
 fs/exofs/common.h | 184 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/exofs/exofs.h  | 127 +++++++++++++++++++++++++++++++++++++
 fs/exofs/osd.c    | 153 +++++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 493 insertions(+)
 create mode 100644 fs/exofs/Kbuild
 create mode 100644 fs/exofs/Kconfig
 create mode 100644 fs/exofs/common.h
 create mode 100644 fs/exofs/exofs.h
 create mode 100644 fs/exofs/osd.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
new file mode 100644
index 000000000000..36d23ca79a58
--- /dev/null
+++ b/fs/exofs/Kbuild
@@ -0,0 +1,16 @@
+#
+# Kbuild for the EXOFS module
+#
+# Copyright (C) 2008 Panasas Inc.  All rights reserved.
+#
+# Authors:
+#   Boaz Harrosh <bharrosh@panasas.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License version 2
+#
+# Kbuild - Gets included from the Kernels Makefile and build system
+#
+
+exofs-y := osd.o
+obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/Kconfig b/fs/exofs/Kconfig
new file mode 100644
index 000000000000..86194b2f799d
--- /dev/null
+++ b/fs/exofs/Kconfig
@@ -0,0 +1,13 @@
+config EXOFS_FS
+	tristate "exofs: OSD based file system support"
+	depends on SCSI_OSD_ULD
+	help
+	  EXOFS is a file system that uses an OSD storage device,
+	  as its backing storage.
+
+# Debugging-related stuff
+config EXOFS_DEBUG
+	bool "Enable debugging"
+	depends on EXOFS_FS
+	help
+	  This option enables EXOFS debug prints.
diff --git a/fs/exofs/common.h b/fs/exofs/common.h
new file mode 100644
index 000000000000..b1512c4bb8c7
--- /dev/null
+++ b/fs/exofs/common.h
@@ -0,0 +1,184 @@
+/*
+ * common.h - Common definitions for both Kernel and user-mode utilities
+ *
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#ifndef __EXOFS_COM_H__
+#define __EXOFS_COM_H__
+
+#include <linux/types.h>
+
+#include <scsi/osd_attributes.h>
+#include <scsi/osd_initiator.h>
+#include <scsi/osd_sec.h>
+
+/****************************************************************************
+ * Object ID related defines
+ * NOTE: inode# = object ID - EXOFS_OBJ_OFF
+ ****************************************************************************/
+#define EXOFS_MIN_PID   0x10000	/* Smallest partition ID */
+#define EXOFS_OBJ_OFF	0x10000	/* offset for objects */
+#define EXOFS_SUPER_ID	0x10000	/* object ID for on-disk superblock */
+#define EXOFS_ROOT_ID	0x10002	/* object ID for root directory */
+
+/* exofs Application specific page/attribute */
+# define EXOFS_APAGE_FS_DATA	(OSD_APAGE_APP_DEFINED_FIRST + 3)
+# define EXOFS_ATTR_INODE_DATA	1
+
+/*
+ * The maximum number of files we can have is limited by the size of the
+ * inode number.  This is the largest object ID that the file system supports.
+ * Object IDs 0, 1, and 2 are always in use (see above defines).
+ */
+enum {
+	EXOFS_MAX_INO_ID = (sizeof(ino_t) * 8 == 64) ? ULLONG_MAX :
+					(1ULL << (sizeof(ino_t) * 8ULL - 1ULL)),
+	EXOFS_MAX_ID	 = (EXOFS_MAX_INO_ID - 1 - EXOFS_OBJ_OFF),
+};
+
+/****************************************************************************
+ * Misc.
+ ****************************************************************************/
+#define EXOFS_BLKSHIFT	12
+#define EXOFS_BLKSIZE	(1UL << EXOFS_BLKSHIFT)
+
+/****************************************************************************
+ * superblock-related things
+ ****************************************************************************/
+#define EXOFS_SUPER_MAGIC	0x5DF5
+
+/*
+ * The file system control block - stored in an object's data (mainly, the one
+ * with ID EXOFS_SUPER_ID).  This is where the in-memory superblock is stored
+ * on disk.  Right now it just has a magic value, which is basically a sanity
+ * check on our ability to communicate with the object store.
+ */
+struct exofs_fscb {
+	__le64  s_nextid;	/* Highest object ID used */
+	__le32  s_numfiles;	/* Number of files on fs */
+	__le16  s_magic;	/* Magic signature */
+	__le16  s_newfs;	/* Non-zero if this is a new fs */
+};
+
+/****************************************************************************
+ * inode-related things
+ ****************************************************************************/
+#define EXOFS_IDATA		5
+
+/*
+ * The file control block - stored in an object's attributes.  This is where
+ * the in-memory inode is stored on disk.
+ */
+struct exofs_fcb {
+	__le64  i_size;			/* Size of the file */
+	__le16  i_mode;         	/* File mode */
+	__le16  i_links_count;  	/* Links count */
+	__le32  i_uid;          	/* Owner Uid */
+	__le32  i_gid;          	/* Group Id */
+	__le32  i_atime;        	/* Access time */
+	__le32  i_ctime;        	/* Creation time */
+	__le32  i_mtime;        	/* Modification time */
+	__le32  i_flags;        	/* File flags (unused for now)*/
+	__le32  i_generation;   	/* File version (for NFS) */
+	__le32  i_data[EXOFS_IDATA];	/* Short symlink names and device #s */
+};
+
+#define EXOFS_INO_ATTR_SIZE	sizeof(struct exofs_fcb)
+
+/* This is the Attribute the fcb is stored in */
+static const struct __weak osd_attr g_attr_inode_data = ATTR_DEF(
+	EXOFS_APAGE_FS_DATA,
+	EXOFS_ATTR_INODE_DATA,
+	EXOFS_INO_ATTR_SIZE);
+
+/****************************************************************************
+ * dentry-related things
+ ****************************************************************************/
+#define EXOFS_NAME_LEN	255
+
+/*
+ * The on-disk directory entry
+ */
+struct exofs_dir_entry {
+	__le64		inode_no;		/* inode number           */
+	__le16		rec_len;		/* directory entry length */
+	u8		name_len;		/* name length            */
+	u8		file_type;		/* umm...file type        */
+	char		name[EXOFS_NAME_LEN];	/* file name              */
+};
+
+enum {
+	EXOFS_FT_UNKNOWN,
+	EXOFS_FT_REG_FILE,
+	EXOFS_FT_DIR,
+	EXOFS_FT_CHRDEV,
+	EXOFS_FT_BLKDEV,
+	EXOFS_FT_FIFO,
+	EXOFS_FT_SOCK,
+	EXOFS_FT_SYMLINK,
+	EXOFS_FT_MAX
+};
+
+#define EXOFS_DIR_PAD			4
+#define EXOFS_DIR_ROUND			(EXOFS_DIR_PAD - 1)
+#define EXOFS_DIR_REC_LEN(name_len) \
+	(((name_len) + offsetof(struct exofs_dir_entry, name)  + \
+	  EXOFS_DIR_ROUND) & ~EXOFS_DIR_ROUND)
+
+/*************************
+ * function declarations *
+ *************************/
+/* osd.c                 */
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN],
+			   const struct osd_obj_id *obj);
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid);
+static inline int exofs_check_ok(struct osd_request *or)
+{
+	return exofs_check_ok_resid(or, NULL, NULL);
+}
+int exofs_sync_op(struct osd_request *or, int timeout, u8 *cred);
+int exofs_async_op(struct osd_request *or,
+	osd_req_done_fn *async_done, void *caller_context, u8 *cred);
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr);
+
+int osd_req_read_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+int osd_req_write_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void *buff, u64 len);
+
+#endif /*ifndef __EXOFS_COM_H__*/
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
new file mode 100644
index 000000000000..365376d3f72d
--- /dev/null
+++ b/fs/exofs/exofs.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/fs.h>
+#include <linux/time.h>
+#include "common.h"
+
+#ifndef __EXOFS_H__
+#define __EXOFS_H__
+
+#define EXOFS_ERR(fmt, a...) printk(KERN_ERR "exofs: " fmt, ##a)
+
+#ifdef CONFIG_EXOFS_DEBUG
+#define EXOFS_DBGMSG(fmt, a...) \
+	printk(KERN_NOTICE "exofs @%s:%d: " fmt, __func__, __LINE__, ##a)
+#else
+#define EXOFS_DBGMSG(fmt, a...) \
+	do { if (0) printk(fmt, ##a); } while (0)
+#endif
+
+/* u64 has problems with printk this will cast it to unsigned long long */
+#define _LLU(x) (unsigned long long)(x)
+
+/*
+ * our extension to the in-memory superblock
+ */
+struct exofs_sb_info {
+	struct osd_dev	*s_dev;			/* returned by get_osd_dev    */
+	osd_id		s_pid;			/* partition ID of file system*/
+	int		s_timeout;		/* timeout for OSD operations */
+	uint64_t	s_nextid;		/* highest object ID used     */
+	uint32_t	s_numfiles;		/* number of files on fs      */
+	spinlock_t	s_next_gen_lock;	/* spinlock for gen # update  */
+	u32		s_next_generation;	/* next gen # to use          */
+	atomic_t	s_curr_pending;		/* number of pending commands */
+	uint8_t		s_cred[OSD_CAP_LEN];	/* all-powerful credential    */
+};
+
+/*
+ * our extension to the in-memory inode
+ */
+struct exofs_i_info {
+	unsigned long  i_flags;            /* various atomic flags            */
+	uint32_t       i_data[EXOFS_IDATA];/*short symlink names and device #s*/
+	uint32_t       i_dir_start_lookup; /* which page to start lookup      */
+	wait_queue_head_t i_wq;            /* wait queue for inode            */
+	uint64_t       i_commit_size;      /* the object's written length     */
+	uint8_t        i_cred[OSD_CAP_LEN];/* all-powerful credential         */
+	struct inode   vfs_inode;          /* normal in-memory inode          */
+};
+
+/*
+ * our inode flags
+ */
+#define OBJ_2BCREATED	0	/* object will be created soon*/
+#define OBJ_CREATED	1	/* object has been created on the osd*/
+
+static inline int obj_2bcreated(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline void set_obj_2bcreated(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_2BCREATED, &oi->i_flags);
+}
+
+static inline int obj_created(struct exofs_i_info *oi)
+{
+	return test_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+static inline void set_obj_created(struct exofs_i_info *oi)
+{
+	set_bit(OBJ_CREATED, &oi->i_flags);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi);
+static inline int wait_obj_created(struct exofs_i_info *oi)
+{
+	if (likely(obj_created(oi)))
+		return 0;
+
+	return __exofs_wait_obj_created(oi);
+}
+
+/*
+ * get to our inode from the vfs inode
+ */
+static inline struct exofs_i_info *exofs_i(struct inode *inode)
+{
+	return container_of(inode, struct exofs_i_info, vfs_inode);
+}
+
+#endif
diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c
new file mode 100644
index 000000000000..b249ae97fb15
--- /dev/null
+++ b/fs/exofs/osd.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <scsi/scsi_device.h>
+#include <scsi/osd_sense.h>
+
+#include "exofs.h"
+
+int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid)
+{
+	struct osd_sense_info osi;
+	int ret = osd_req_decode_sense(or, &osi);
+
+	if (ret) { /* translate to Linux codes */
+		if (osi.additional_code == scsi_invalid_field_in_cdb) {
+			if (osi.cdb_field_offset == OSD_CFO_STARTING_BYTE)
+				ret = -EFAULT;
+			if (osi.cdb_field_offset == OSD_CFO_OBJECT_ID)
+				ret = -ENOENT;
+			else
+				ret = -EINVAL;
+		} else if (osi.additional_code == osd_quota_error)
+			ret = -ENOSPC;
+		else
+			ret = -EIO;
+	}
+
+	/* FIXME: should be include in osd_sense_info */
+	if (in_resid)
+		*in_resid = or->in.req ? or->in.req->data_len : 0;
+
+	if (out_resid)
+		*out_resid = or->out.req ? or->out.req->data_len : 0;
+
+	return ret;
+}
+
+void exofs_make_credential(u8 cred_a[OSD_CAP_LEN], const struct osd_obj_id *obj)
+{
+	osd_sec_init_nosec_doall_caps(cred_a, obj, false, true);
+}
+
+/*
+ * Perform a synchronous OSD operation.
+ */
+int exofs_sync_op(struct osd_request *or, int timeout, uint8_t *credential)
+{
+	int ret;
+
+	or->timeout = timeout;
+	ret = osd_finalize_request(or, 0, credential, NULL);
+	if (ret) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		return ret;
+	}
+
+	ret = osd_execute_request(or);
+
+	if (ret)
+		EXOFS_DBGMSG("osd_execute_request() => %d\n", ret);
+	/* osd_req_decode_sense(or, ret); */
+	return ret;
+}
+
+/*
+ * Perform an asynchronous OSD operation.
+ */
+int exofs_async_op(struct osd_request *or, osd_req_done_fn *async_done,
+		   void *caller_context, u8 *cred)
+{
+	int ret;
+
+	ret = osd_finalize_request(or, 0, cred, NULL);
+	if (ret) {
+		EXOFS_DBGMSG("Faild to osd_finalize_request() => %d\n", ret);
+		return ret;
+	}
+
+	ret = osd_execute_request_async(or, async_done, caller_context);
+
+	if (ret)
+		EXOFS_DBGMSG("osd_execute_request_async() => %d\n", ret);
+	return ret;
+}
+
+int extract_attr_from_req(struct osd_request *or, struct osd_attr *attr)
+{
+	struct osd_attr cur_attr = {.attr_page = 0}; /* start with zeros */
+	void *iter = NULL;
+	int nelem;
+
+	do {
+		nelem = 1;
+		osd_req_decode_get_attr_list(or, &cur_attr, &nelem, &iter);
+		if ((cur_attr.attr_page == attr->attr_page) &&
+		    (cur_attr.attr_id == attr->attr_id)) {
+			attr->len = cur_attr.len;
+			attr->val_ptr = cur_attr.val_ptr;
+			return 0;
+		}
+	} while (iter);
+
+	return -EIO;
+}
+
+int osd_req_read_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
+{
+	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+	if (!bio)
+		return -ENOMEM;
+
+	osd_req_read(or, obj, bio, offset);
+	return 0;
+}
+
+int osd_req_write_kern(struct osd_request *or,
+	const struct osd_obj_id *obj, u64 offset, void* buff, u64 len)
+{
+	struct request_queue *req_q = or->osd_dev->scsi_device->request_queue;
+	struct bio *bio = bio_map_kern(req_q, buff, len, GFP_KERNEL);
+
+	if (!bio)
+		return -ENOMEM;
+
+	osd_req_write(or, obj, bio, offset);
+	return 0;
+}
-- 
cgit v1.2.3


From e806271916ed6068a0e3e4e9298dff0688b88e0d Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 18:37:02 +0200
Subject: exofs: file and file_inode operations

implementation of the file_operations and inode_operations for
regular data files.

Most file_operations are generic vfs implementations except:
- exofs_truncate will truncate the OSD object as well
- Generic file_fsync is not good for none_bd devices so open code it
- The default for .flush in Linux is todo nothing so call exofs_fsync
  on the file.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild  |   2 +-
 fs/exofs/exofs.h |  14 ++++++
 fs/exofs/file.c  |  87 ++++++++++++++++++++++++++++++++
 fs/exofs/inode.c | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 250 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/file.c
 create mode 100644 fs/exofs/inode.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 36d23ca79a58..c9546edaddeb 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
 
-exofs-y := osd.o
+exofs-y := osd.o inode.o file.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 365376d3f72d..a735258c7364 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -124,4 +124,18 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 	return container_of(inode, struct exofs_i_info, vfs_inode);
 }
 
+/*************************
+ * function declarations *
+ *************************/
+/* inode.c               */
+void exofs_truncate(struct inode *inode);
+int exofs_setattr(struct dentry *, struct iattr *);
+
+/*********************
+ * operation vectors *
+ *********************/
+/* file.c            */
+extern const struct inode_operations exofs_file_inode_operations;
+extern const struct file_operations exofs_file_operations;
+
 #endif
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
new file mode 100644
index 000000000000..6ed7fe484752
--- /dev/null
+++ b/fs/exofs/file.c
@@ -0,0 +1,87 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/buffer_head.h>
+
+#include "exofs.h"
+
+static int exofs_release_file(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static int exofs_file_fsync(struct file *filp, struct dentry *dentry,
+			    int datasync)
+{
+	int ret;
+	struct address_space *mapping = filp->f_mapping;
+
+	ret = filemap_write_and_wait(mapping);
+	if (ret)
+		return ret;
+
+	/*Note: file_fsync below also calles sync_blockdev, which is a no-op
+	 *      for exofs, but other then that it does sync_inode and
+	 *      sync_superblock which is what we need here.
+	 */
+	return file_fsync(filp, dentry, datasync);
+}
+
+static int exofs_flush(struct file *file, fl_owner_t id)
+{
+	exofs_file_fsync(file, file->f_path.dentry, 1);
+	/* TODO: Flush the OSD target */
+	return 0;
+}
+
+const struct file_operations exofs_file_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= do_sync_read,
+	.write		= do_sync_write,
+	.aio_read	= generic_file_aio_read,
+	.aio_write	= generic_file_aio_write,
+	.mmap		= generic_file_mmap,
+	.open		= generic_file_open,
+	.release	= exofs_release_file,
+	.fsync		= exofs_file_fsync,
+	.flush		= exofs_flush,
+	.splice_read	= generic_file_splice_read,
+	.splice_write	= generic_file_splice_write,
+};
+
+const struct inode_operations exofs_file_inode_operations = {
+	.truncate	= exofs_truncate,
+	.setattr	= exofs_setattr,
+};
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
new file mode 100644
index 000000000000..b0bda1e91225
--- /dev/null
+++ b/fs/exofs/inode.c
@@ -0,0 +1,148 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/writeback.h>
+#include <linux/buffer_head.h>
+
+#include "exofs.h"
+
+#ifdef CONFIG_EXOFS_DEBUG
+#  define EXOFS_DEBUG_OBJ_ISIZE 1
+#endif
+
+/******************************************************************************
+ * INODE OPERATIONS
+ *****************************************************************************/
+
+/*
+ * Test whether an inode is a fast symlink.
+ */
+static inline int exofs_inode_is_fast_symlink(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+
+	return S_ISLNK(inode->i_mode) && (oi->i_data[0] != 0);
+}
+
+/*
+ * get_block_t - Fill in a buffer_head
+ * An OSD takes care of block allocation so we just fake an allocation by
+ * putting in the inode's sector_t in the buffer_head.
+ * TODO: What about the case of create==0 and @iblock does not exist in the
+ * object?
+ */
+static int exofs_get_block(struct inode *inode, sector_t iblock,
+		    struct buffer_head *bh_result, int create)
+{
+	map_bh(bh_result, inode->i_sb, iblock);
+	return 0;
+}
+
+const struct osd_attr g_attr_logical_length = ATTR_DEF(
+	OSD_APAGE_OBJECT_INFORMATION, OSD_ATTR_OI_LOGICAL_LENGTH, 8);
+
+/*
+ * Truncate a file to the specified size - all we have to do is set the size
+ * attribute.  We make sure the object exists first.
+ */
+void exofs_truncate(struct inode *inode)
+{
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or;
+	struct osd_attr attr;
+	loff_t isize = i_size_read(inode);
+	__be64 newsize;
+	int ret;
+
+	if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode)
+	     || S_ISLNK(inode->i_mode)))
+		return;
+	if (exofs_inode_is_fast_symlink(inode))
+		return;
+	if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
+		return;
+	inode->i_mtime = inode->i_ctime = CURRENT_TIME;
+
+	nobh_truncate_page(inode->i_mapping, isize, exofs_get_block);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("ERROR: exofs_truncate: osd_start_request failed\n");
+		goto fail;
+	}
+
+	osd_req_set_attributes(or, &obj);
+
+	newsize = cpu_to_be64((u64)isize);
+	attr = g_attr_logical_length;
+	attr.val_ptr = &newsize;
+	osd_req_add_set_attr_list(or, &attr, 1);
+
+	/* if we are about to truncate an object, and it hasn't been
+	 * created yet, wait
+	 */
+	if (unlikely(wait_obj_created(oi)))
+		goto fail;
+
+	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+	osd_end_request(or);
+	if (ret)
+		goto fail;
+
+out:
+	mark_inode_dirty(inode);
+	return;
+fail:
+	make_bad_inode(inode);
+	goto out;
+}
+
+/*
+ * Set inode attributes - just call generic functions.
+ */
+int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
+{
+	struct inode *inode = dentry->d_inode;
+	int error;
+
+	error = inode_change_ok(inode, iattr);
+	if (error)
+		return error;
+
+	error = inode_setattr(inode, iattr);
+	return error;
+}
-- 
cgit v1.2.3


From 982980d753223fda3864038236b7b94e246895cb Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 19:04:34 +0200
Subject: exofs: symlink_inode and fast_symlink_inode operations

Generic implementation of symlink ops.

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild    |  2 +-
 fs/exofs/exofs.h   |  4 ++++
 fs/exofs/symlink.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 62 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/symlink.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index c9546edaddeb..7c361c92bbdf 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
 
-exofs-y := osd.o inode.o file.o
+exofs-y := osd.o inode.o file.o symlink.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index a735258c7364..825454d76f6c 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -138,4 +138,8 @@ int exofs_setattr(struct dentry *, struct iattr *);
 extern const struct inode_operations exofs_file_inode_operations;
 extern const struct file_operations exofs_file_operations;
 
+/* symlink.c         */
+extern const struct inode_operations exofs_symlink_inode_operations;
+extern const struct inode_operations exofs_fast_symlink_inode_operations;
+
 #endif
diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c
new file mode 100644
index 000000000000..36e2d7bc7f7b
--- /dev/null
+++ b/fs/exofs/symlink.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/namei.h>
+
+#include "exofs.h"
+
+static void *exofs_follow_link(struct dentry *dentry, struct nameidata *nd)
+{
+	struct exofs_i_info *oi = exofs_i(dentry->d_inode);
+
+	nd_set_link(nd, (char *)oi->i_data);
+	return NULL;
+}
+
+const struct inode_operations exofs_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= page_follow_link_light,
+	.put_link	= page_put_link,
+};
+
+const struct inode_operations exofs_fast_symlink_inode_operations = {
+	.readlink	= generic_readlink,
+	.follow_link	= exofs_follow_link,
+};
-- 
cgit v1.2.3


From beaec07ba6af35d387643b76a2920a7a6e22207b Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 27 Oct 2008 19:31:34 +0200
Subject: exofs: address_space_operations

OK Now we start to read and write from osd-objects. We try to
collect at most contiguous pages as possible in a single write/read.
The first page index is the object's offset.

TODO:
   In 64-bit a single bio can carry at most 128 pages.
   Add support of chaining multiple bios

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/exofs.h |   6 +
 fs/exofs/inode.c | 697 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 703 insertions(+)

(limited to 'fs')

diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 825454d76f6c..0276242326ab 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -130,6 +130,9 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 /* inode.c               */
 void exofs_truncate(struct inode *inode);
 int exofs_setattr(struct dentry *, struct iattr *);
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata);
 
 /*********************
  * operation vectors *
@@ -138,6 +141,9 @@ int exofs_setattr(struct dentry *, struct iattr *);
 extern const struct inode_operations exofs_file_inode_operations;
 extern const struct file_operations exofs_file_operations;
 
+/* inode.c           */
+extern const struct address_space_operations exofs_aops;
+
 /* symlink.c         */
 extern const struct inode_operations exofs_symlink_inode_operations;
 extern const struct inode_operations exofs_fast_symlink_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index b0bda1e91225..a3691d8bfb98 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -35,6 +35,7 @@
 
 #include <linux/writeback.h>
 #include <linux/buffer_head.h>
+#include <scsi/scsi_device.h>
 
 #include "exofs.h"
 
@@ -42,6 +43,702 @@
 #  define EXOFS_DEBUG_OBJ_ISIZE 1
 #endif
 
+struct page_collect {
+	struct exofs_sb_info *sbi;
+	struct request_queue *req_q;
+	struct inode *inode;
+	unsigned expected_pages;
+
+	struct bio *bio;
+	unsigned nr_pages;
+	unsigned long length;
+	loff_t pg_first; /* keep 64bit also in 32-arches */
+};
+
+static void _pcol_init(struct page_collect *pcol, unsigned expected_pages,
+		struct inode *inode)
+{
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct request_queue *req_q = sbi->s_dev->scsi_device->request_queue;
+
+	pcol->sbi = sbi;
+	pcol->req_q = req_q;
+	pcol->inode = inode;
+	pcol->expected_pages = expected_pages;
+
+	pcol->bio = NULL;
+	pcol->nr_pages = 0;
+	pcol->length = 0;
+	pcol->pg_first = -1;
+
+	EXOFS_DBGMSG("_pcol_init ino=0x%lx expected_pages=%u\n", inode->i_ino,
+		     expected_pages);
+}
+
+static void _pcol_reset(struct page_collect *pcol)
+{
+	pcol->expected_pages -= min(pcol->nr_pages, pcol->expected_pages);
+
+	pcol->bio = NULL;
+	pcol->nr_pages = 0;
+	pcol->length = 0;
+	pcol->pg_first = -1;
+	EXOFS_DBGMSG("_pcol_reset ino=0x%lx expected_pages=%u\n",
+		     pcol->inode->i_ino, pcol->expected_pages);
+
+	/* this is probably the end of the loop but in writes
+	 * it might not end here. don't be left with nothing
+	 */
+	if (!pcol->expected_pages)
+		pcol->expected_pages = 128;
+}
+
+static int pcol_try_alloc(struct page_collect *pcol)
+{
+	int pages = min_t(unsigned, pcol->expected_pages, BIO_MAX_PAGES);
+
+	for (; pages; pages >>= 1) {
+		pcol->bio = bio_alloc(GFP_KERNEL, pages);
+		if (likely(pcol->bio))
+			return 0;
+	}
+
+	EXOFS_ERR("Failed to kcalloc expected_pages=%u\n",
+		  pcol->expected_pages);
+	return -ENOMEM;
+}
+
+static void pcol_free(struct page_collect *pcol)
+{
+	bio_put(pcol->bio);
+	pcol->bio = NULL;
+}
+
+static int pcol_add_page(struct page_collect *pcol, struct page *page,
+			 unsigned len)
+{
+	int added_len = bio_add_pc_page(pcol->req_q, pcol->bio, page, len, 0);
+	if (unlikely(len != added_len))
+		return -ENOMEM;
+
+	++pcol->nr_pages;
+	pcol->length += len;
+	return 0;
+}
+
+static int update_read_page(struct page *page, int ret)
+{
+	if (ret == 0) {
+		/* Everything is OK */
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+	} else if (ret == -EFAULT) {
+		/* In this case we were trying to read something that wasn't on
+		 * disk yet - return a page full of zeroes.  This should be OK,
+		 * because the object should be empty (if there was a write
+		 * before this read, the read would be waiting with the page
+		 * locked */
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+		ret = 0; /* recovered error */
+		EXOFS_DBGMSG("recovered read error\n");
+	} else /* Error */
+		SetPageError(page);
+
+	return ret;
+}
+
+static void update_write_page(struct page *page, int ret)
+{
+	if (ret) {
+		mapping_set_error(page->mapping, ret);
+		SetPageError(page);
+	}
+	end_page_writeback(page);
+}
+
+/* Called at the end of reads, to optionally unlock pages and update their
+ * status.
+ */
+static int __readpages_done(struct osd_request *or, struct page_collect *pcol,
+			    bool do_unlock)
+{
+	struct bio_vec *bvec;
+	int i;
+	u64 resid;
+	u64 good_bytes;
+	u64 length = 0;
+	int ret = exofs_check_ok_resid(or, &resid, NULL);
+
+	osd_end_request(or);
+
+	if (likely(!ret))
+		good_bytes = pcol->length;
+	else if (!resid)
+		good_bytes = 0;
+	else
+		good_bytes = pcol->length - resid;
+
+	EXOFS_DBGMSG("readpages_done(0x%lx) good_bytes=0x%llx"
+		     " length=0x%lx nr_pages=%u\n",
+		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+		     pcol->nr_pages);
+
+	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
+		struct page *page = bvec->bv_page;
+		struct inode *inode = page->mapping->host;
+		int page_stat;
+
+		if (inode != pcol->inode)
+			continue; /* osd might add more pages at end */
+
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
+
+		EXOFS_DBGMSG("    readpages_done(0x%lx, 0x%lx) %s\n",
+			  inode->i_ino, page->index,
+			  page_stat ? "bad_bytes" : "good_bytes");
+
+		ret = update_read_page(page, page_stat);
+		if (do_unlock)
+			unlock_page(page);
+		length += bvec->bv_len;
+	}
+
+	pcol_free(pcol);
+	EXOFS_DBGMSG("readpages_done END\n");
+	return ret;
+}
+
+/* callback of async reads */
+static void readpages_done(struct osd_request *or, void *p)
+{
+	struct page_collect *pcol = p;
+
+	__readpages_done(or, pcol, true);
+	atomic_dec(&pcol->sbi->s_curr_pending);
+	kfree(p);
+}
+
+static void _unlock_pcol_pages(struct page_collect *pcol, int ret, int rw)
+{
+	struct bio_vec *bvec;
+	int i;
+
+	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
+		struct page *page = bvec->bv_page;
+
+		if (rw == READ)
+			update_read_page(page, ret);
+		else
+			update_write_page(page, ret);
+
+		unlock_page(page);
+	}
+	pcol_free(pcol);
+}
+
+static int read_exec(struct page_collect *pcol, bool is_sync)
+{
+	struct exofs_i_info *oi = exofs_i(pcol->inode);
+	struct osd_obj_id obj = {pcol->sbi->s_pid,
+					pcol->inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or = NULL;
+	struct page_collect *pcol_copy = NULL;
+	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+	int ret;
+
+	if (!pcol->bio)
+		return 0;
+
+	/* see comment in _readpage() about sync reads */
+	WARN_ON(is_sync && (pcol->nr_pages != 1));
+
+	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	osd_req_read(or, &obj, pcol->bio, i_start);
+
+	if (is_sync) {
+		exofs_sync_op(or, pcol->sbi->s_timeout, oi->i_cred);
+		return __readpages_done(or, pcol, false);
+	}
+
+	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+	if (!pcol_copy) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	*pcol_copy = *pcol;
+	ret = exofs_async_op(or, readpages_done, pcol_copy, oi->i_cred);
+	if (unlikely(ret))
+		goto err;
+
+	atomic_inc(&pcol->sbi->s_curr_pending);
+
+	EXOFS_DBGMSG("read_exec obj=0x%llx start=0x%llx length=0x%lx\n",
+		  obj.id, _LLU(i_start), pcol->length);
+
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+	return 0;
+
+err:
+	if (!is_sync)
+		_unlock_pcol_pages(pcol, ret, READ);
+	kfree(pcol_copy);
+	if (or)
+		osd_end_request(or);
+	return ret;
+}
+
+/* readpage_strip is called either directly from readpage() or by the VFS from
+ * within read_cache_pages(), to add one more page to be read. It will try to
+ * collect as many contiguous pages as posible. If a discontinuity is
+ * encountered, or it runs out of resources, it will submit the previous segment
+ * and will start a new collection. Eventually caller must submit the last
+ * segment if present.
+ */
+static int readpage_strip(void *data, struct page *page)
+{
+	struct page_collect *pcol = data;
+	struct inode *inode = pcol->inode;
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	size_t len;
+	int ret;
+
+	/* FIXME: Just for debugging, will be removed */
+	if (PageUptodate(page))
+		EXOFS_ERR("PageUptodate(0x%lx, 0x%lx)\n", pcol->inode->i_ino,
+			  page->index);
+
+	if (page->index < end_index)
+		len = PAGE_CACHE_SIZE;
+	else if (page->index == end_index)
+		len = i_size & ~PAGE_CACHE_MASK;
+	else
+		len = 0;
+
+	if (!len || !obj_created(oi)) {
+		/* this will be out of bounds, or doesn't exist yet.
+		 * Current page is cleared and the request is split
+		 */
+		clear_highpage(page);
+
+		SetPageUptodate(page);
+		if (PageError(page))
+			ClearPageError(page);
+
+		unlock_page(page);
+		EXOFS_DBGMSG("readpage_strip(0x%lx, 0x%lx) empty page,"
+			     " splitting\n", inode->i_ino, page->index);
+
+		return read_exec(pcol, false);
+	}
+
+try_again:
+
+	if (unlikely(pcol->pg_first == -1)) {
+		pcol->pg_first = page->index;
+	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+		   page->index)) {
+		/* Discontinuity detected, split the request */
+		ret = read_exec(pcol, false);
+		if (unlikely(ret))
+			goto fail;
+		goto try_again;
+	}
+
+	if (!pcol->bio) {
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			goto fail;
+	}
+
+	if (len != PAGE_CACHE_SIZE)
+		zero_user(page, len, PAGE_CACHE_SIZE - len);
+
+	EXOFS_DBGMSG("    readpage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+		     inode->i_ino, page->index, len);
+
+	ret = pcol_add_page(pcol, page, len);
+	if (ret) {
+		EXOFS_DBGMSG("Failed pcol_add_page pages[i]=%p "
+			  "this_len=0x%zx nr_pages=%u length=0x%lx\n",
+			  page, len, pcol->nr_pages, pcol->length);
+
+		/* split the request, and start again with current page */
+		ret = read_exec(pcol, false);
+		if (unlikely(ret))
+			goto fail;
+
+		goto try_again;
+	}
+
+	return 0;
+
+fail:
+	/* SetPageError(page); ??? */
+	unlock_page(page);
+	return ret;
+}
+
+static int exofs_readpages(struct file *file, struct address_space *mapping,
+			   struct list_head *pages, unsigned nr_pages)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, nr_pages, mapping->host);
+
+	ret = read_cache_pages(mapping, pages, readpage_strip, &pcol);
+	if (ret) {
+		EXOFS_ERR("read_cache_pages => %d\n", ret);
+		return ret;
+	}
+
+	return read_exec(&pcol, false);
+}
+
+static int _readpage(struct page *page, bool is_sync)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, 1, page->mapping->host);
+
+	/* readpage_strip might call read_exec(,async) inside at several places
+	 * but this is safe for is_async=0 since read_exec will not do anything
+	 * when we have a single page.
+	 */
+	ret = readpage_strip(&pcol, page);
+	if (ret) {
+		EXOFS_ERR("_readpage => %d\n", ret);
+		return ret;
+	}
+
+	return read_exec(&pcol, is_sync);
+}
+
+/*
+ * We don't need the file
+ */
+static int exofs_readpage(struct file *file, struct page *page)
+{
+	return _readpage(page, false);
+}
+
+/* Callback for osd_write. All writes are asynchronouse */
+static void writepages_done(struct osd_request *or, void *p)
+{
+	struct page_collect *pcol = p;
+	struct bio_vec *bvec;
+	int i;
+	u64 resid;
+	u64  good_bytes;
+	u64  length = 0;
+
+	int ret = exofs_check_ok_resid(or, NULL, &resid);
+
+	osd_end_request(or);
+	atomic_dec(&pcol->sbi->s_curr_pending);
+
+	if (likely(!ret))
+		good_bytes = pcol->length;
+	else if (!resid)
+		good_bytes = 0;
+	else
+		good_bytes = pcol->length - resid;
+
+	EXOFS_DBGMSG("writepages_done(0x%lx) good_bytes=0x%llx"
+		     " length=0x%lx nr_pages=%u\n",
+		     pcol->inode->i_ino, _LLU(good_bytes), pcol->length,
+		     pcol->nr_pages);
+
+	__bio_for_each_segment(bvec, pcol->bio, i, 0) {
+		struct page *page = bvec->bv_page;
+		struct inode *inode = page->mapping->host;
+		int page_stat;
+
+		if (inode != pcol->inode)
+			continue; /* osd might add more pages to a bio */
+
+		if (likely(length < good_bytes))
+			page_stat = 0;
+		else
+			page_stat = ret;
+
+		update_write_page(page, page_stat);
+		unlock_page(page);
+		EXOFS_DBGMSG("    writepages_done(0x%lx, 0x%lx) status=%d\n",
+			     inode->i_ino, page->index, page_stat);
+
+		length += bvec->bv_len;
+	}
+
+	pcol_free(pcol);
+	kfree(pcol);
+	EXOFS_DBGMSG("writepages_done END\n");
+}
+
+static int write_exec(struct page_collect *pcol)
+{
+	struct exofs_i_info *oi = exofs_i(pcol->inode);
+	struct osd_obj_id obj = {pcol->sbi->s_pid,
+					pcol->inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or = NULL;
+	struct page_collect *pcol_copy = NULL;
+	loff_t i_start = pcol->pg_first << PAGE_CACHE_SHIFT;
+	int ret;
+
+	if (!pcol->bio)
+		return 0;
+
+	or = osd_start_request(pcol->sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("write_exec: Faild to osd_start_request()\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	pcol_copy = kmalloc(sizeof(*pcol_copy), GFP_KERNEL);
+	if (!pcol_copy) {
+		EXOFS_ERR("write_exec: Faild to kmalloc(pcol)\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	*pcol_copy = *pcol;
+
+	osd_req_write(or, &obj, pcol_copy->bio, i_start);
+	ret = exofs_async_op(or, writepages_done, pcol_copy, oi->i_cred);
+	if (unlikely(ret)) {
+		EXOFS_ERR("write_exec: exofs_async_op() Faild\n");
+		goto err;
+	}
+
+	atomic_inc(&pcol->sbi->s_curr_pending);
+	EXOFS_DBGMSG("write_exec(0x%lx, 0x%llx) start=0x%llx length=0x%lx\n",
+		  pcol->inode->i_ino, pcol->pg_first, _LLU(i_start),
+		  pcol->length);
+	/* pages ownership was passed to pcol_copy */
+	_pcol_reset(pcol);
+	return 0;
+
+err:
+	_unlock_pcol_pages(pcol, ret, WRITE);
+	kfree(pcol_copy);
+	if (or)
+		osd_end_request(or);
+	return ret;
+}
+
+/* writepage_strip is called either directly from writepage() or by the VFS from
+ * within write_cache_pages(), to add one more page to be written to storage.
+ * It will try to collect as many contiguous pages as possible. If a
+ * discontinuity is encountered or it runs out of resources it will submit the
+ * previous segment and will start a new collection.
+ * Eventually caller must submit the last segment if present.
+ */
+static int writepage_strip(struct page *page,
+			   struct writeback_control *wbc_unused, void *data)
+{
+	struct page_collect *pcol = data;
+	struct inode *inode = pcol->inode;
+	struct exofs_i_info *oi = exofs_i(inode);
+	loff_t i_size = i_size_read(inode);
+	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
+	size_t len;
+	int ret;
+
+	BUG_ON(!PageLocked(page));
+
+	ret = wait_obj_created(oi);
+	if (unlikely(ret))
+		goto fail;
+
+	if (page->index < end_index)
+		/* in this case, the page is within the limits of the file */
+		len = PAGE_CACHE_SIZE;
+	else {
+		len = i_size & ~PAGE_CACHE_MASK;
+
+		if (page->index > end_index || !len) {
+			/* in this case, the page is outside the limits
+			 * (truncate in progress)
+			 */
+			ret = write_exec(pcol);
+			if (unlikely(ret))
+				goto fail;
+			if (PageError(page))
+				ClearPageError(page);
+			unlock_page(page);
+			return 0;
+		}
+	}
+
+try_again:
+
+	if (unlikely(pcol->pg_first == -1)) {
+		pcol->pg_first = page->index;
+	} else if (unlikely((pcol->pg_first + pcol->nr_pages) !=
+		   page->index)) {
+		/* Discontinuity detected, split the request */
+		ret = write_exec(pcol);
+		if (unlikely(ret))
+			goto fail;
+		goto try_again;
+	}
+
+	if (!pcol->bio) {
+		ret = pcol_try_alloc(pcol);
+		if (unlikely(ret))
+			goto fail;
+	}
+
+	EXOFS_DBGMSG("    writepage_strip(0x%lx, 0x%lx) len=0x%zx\n",
+		     inode->i_ino, page->index, len);
+
+	ret = pcol_add_page(pcol, page, len);
+	if (unlikely(ret)) {
+		EXOFS_DBGMSG("Failed pcol_add_page "
+			     "nr_pages=%u total_length=0x%lx\n",
+			     pcol->nr_pages, pcol->length);
+
+		/* split the request, next loop will start again */
+		ret = write_exec(pcol);
+		if (unlikely(ret)) {
+			EXOFS_DBGMSG("write_exec faild => %d", ret);
+			goto fail;
+		}
+
+		goto try_again;
+	}
+
+	BUG_ON(PageWriteback(page));
+	set_page_writeback(page);
+
+	return 0;
+
+fail:
+	set_bit(AS_EIO, &page->mapping->flags);
+	unlock_page(page);
+	return ret;
+}
+
+static int exofs_writepages(struct address_space *mapping,
+		       struct writeback_control *wbc)
+{
+	struct page_collect pcol;
+	long start, end, expected_pages;
+	int ret;
+
+	start = wbc->range_start >> PAGE_CACHE_SHIFT;
+	end = (wbc->range_end == LLONG_MAX) ?
+			start + mapping->nrpages :
+			wbc->range_end >> PAGE_CACHE_SHIFT;
+
+	if (start || end)
+		expected_pages = min(end - start + 1, 32L);
+	else
+		expected_pages = mapping->nrpages;
+
+	EXOFS_DBGMSG("inode(0x%lx) wbc->start=0x%llx wbc->end=0x%llx"
+		     " m->nrpages=%lu start=0x%lx end=0x%lx\n",
+		     mapping->host->i_ino, wbc->range_start, wbc->range_end,
+		     mapping->nrpages, start, end);
+
+	_pcol_init(&pcol, expected_pages, mapping->host);
+
+	ret = write_cache_pages(mapping, wbc, writepage_strip, &pcol);
+	if (ret) {
+		EXOFS_ERR("write_cache_pages => %d\n", ret);
+		return ret;
+	}
+
+	return write_exec(&pcol);
+}
+
+static int exofs_writepage(struct page *page, struct writeback_control *wbc)
+{
+	struct page_collect pcol;
+	int ret;
+
+	_pcol_init(&pcol, 1, page->mapping->host);
+
+	ret = writepage_strip(page, NULL, &pcol);
+	if (ret) {
+		EXOFS_ERR("exofs_writepage => %d\n", ret);
+		return ret;
+	}
+
+	return write_exec(&pcol);
+}
+
+int exofs_write_begin(struct file *file, struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	int ret = 0;
+	struct page *page;
+
+	page = *pagep;
+	if (page == NULL) {
+		ret = simple_write_begin(file, mapping, pos, len, flags, pagep,
+					 fsdata);
+		if (ret) {
+			EXOFS_DBGMSG("simple_write_begin faild\n");
+			return ret;
+		}
+
+		page = *pagep;
+	}
+
+	 /* read modify write */
+	if (!PageUptodate(page) && (len != PAGE_CACHE_SIZE)) {
+		ret = _readpage(page, true);
+		if (ret) {
+			/*SetPageError was done by _readpage. Is it ok?*/
+			unlock_page(page);
+			EXOFS_DBGMSG("__readpage_filler faild\n");
+		}
+	}
+
+	return ret;
+}
+
+static int exofs_write_begin_export(struct file *file,
+		struct address_space *mapping,
+		loff_t pos, unsigned len, unsigned flags,
+		struct page **pagep, void **fsdata)
+{
+	*pagep = NULL;
+
+	return exofs_write_begin(file, mapping, pos, len, flags, pagep,
+					fsdata);
+}
+
+const struct address_space_operations exofs_aops = {
+	.readpage	= exofs_readpage,
+	.readpages	= exofs_readpages,
+	.writepage	= exofs_writepage,
+	.writepages	= exofs_writepages,
+	.write_begin	= exofs_write_begin_export,
+	.write_end	= simple_write_end,
+};
+
 /******************************************************************************
  * INODE OPERATIONS
  *****************************************************************************/
-- 
cgit v1.2.3


From e6af00f1d1697ca41ab6a55307066ef3466833a9 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 28 Oct 2008 15:38:12 +0200
Subject: exofs: dir_inode and directory operations

implementation of directory and inode operations.

* A directory is treated as a file, and essentially contains a list
  of <file name, inode #> pairs for files that are found in that
  directory. The object IDs correspond to the files' inode numbers
  and are allocated using a 64bit incrementing global counter.
* Each file's control block (AKA on-disk inode) is stored in its
  object's attributes. This applies to both regular files and other
  types (directories, device files, symlinks, etc.).

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild  |   2 +-
 fs/exofs/dir.c   | 657 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/exofs/exofs.h |  26 +++
 fs/exofs/inode.c | 272 +++++++++++++++++++++++
 fs/exofs/namei.c | 342 +++++++++++++++++++++++++++++
 5 files changed, 1298 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/dir.c
 create mode 100644 fs/exofs/namei.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 7c361c92bbdf..0a23f24db8b2 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
 
-exofs-y := osd.o inode.o file.o symlink.o
+exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
new file mode 100644
index 000000000000..6865605fb6ab
--- /dev/null
+++ b/fs/exofs/dir.c
@@ -0,0 +1,657 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline unsigned exofs_chunk_size(struct inode *inode)
+{
+	return inode->i_sb->s_blocksize;
+}
+
+static inline void exofs_put_page(struct page *page)
+{
+	kunmap(page);
+	page_cache_release(page);
+}
+
+/* Accesses dir's inode->i_size must be called under inode lock */
+static inline unsigned long dir_pages(struct inode *inode)
+{
+	return (inode->i_size + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
+}
+
+static unsigned exofs_last_byte(struct inode *inode, unsigned long page_nr)
+{
+	loff_t last_byte = inode->i_size;
+
+	last_byte -= page_nr << PAGE_CACHE_SHIFT;
+	if (last_byte > PAGE_CACHE_SIZE)
+		last_byte = PAGE_CACHE_SIZE;
+	return last_byte;
+}
+
+static int exofs_commit_chunk(struct page *page, loff_t pos, unsigned len)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *dir = mapping->host;
+	int err = 0;
+
+	dir->i_version++;
+
+	if (!PageUptodate(page))
+		SetPageUptodate(page);
+
+	if (pos+len > dir->i_size) {
+		i_size_write(dir, pos+len);
+		mark_inode_dirty(dir);
+	}
+	set_page_dirty(page);
+
+	if (IS_DIRSYNC(dir))
+		err = write_one_page(page, 1);
+	else
+		unlock_page(page);
+
+	return err;
+}
+
+static void exofs_check_page(struct page *page)
+{
+	struct inode *dir = page->mapping->host;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	char *kaddr = page_address(page);
+	unsigned offs, rec_len;
+	unsigned limit = PAGE_CACHE_SIZE;
+	struct exofs_dir_entry *p;
+	char *error;
+
+	/* if the page is the last one in the directory */
+	if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
+		limit = dir->i_size & ~PAGE_CACHE_MASK;
+		if (limit & (chunk_size - 1))
+			goto Ebadsize;
+		if (!limit)
+			goto out;
+	}
+	for (offs = 0; offs <= limit - EXOFS_DIR_REC_LEN(1); offs += rec_len) {
+		p = (struct exofs_dir_entry *)(kaddr + offs);
+		rec_len = le16_to_cpu(p->rec_len);
+
+		if (rec_len < EXOFS_DIR_REC_LEN(1))
+			goto Eshort;
+		if (rec_len & 3)
+			goto Ealign;
+		if (rec_len < EXOFS_DIR_REC_LEN(p->name_len))
+			goto Enamelen;
+		if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
+			goto Espan;
+	}
+	if (offs != limit)
+		goto Eend;
+out:
+	SetPageChecked(page);
+	return;
+
+Ebadsize:
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"size of directory #%lu is not a multiple of chunk size",
+		dir->i_ino
+	);
+	goto fail;
+Eshort:
+	error = "rec_len is smaller than minimal";
+	goto bad_entry;
+Ealign:
+	error = "unaligned directory entry";
+	goto bad_entry;
+Enamelen:
+	error = "rec_len is too small for name_len";
+	goto bad_entry;
+Espan:
+	error = "directory entry across blocks";
+	goto bad_entry;
+bad_entry:
+	EXOFS_ERR(
+		"ERROR [exofs_check_page]: bad entry in directory #%lu: %s - "
+		"offset=%lu, inode=%llu, rec_len=%d, name_len=%d",
+		dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)),
+		rec_len, p->name_len);
+	goto fail;
+Eend:
+	p = (struct exofs_dir_entry *)(kaddr + offs);
+	EXOFS_ERR("ERROR [exofs_check_page]: "
+		"entry in directory #%lu spans the page boundary"
+		"offset=%lu, inode=%llu",
+		dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
+		_LLU(le64_to_cpu(p->inode_no)));
+fail:
+	SetPageChecked(page);
+	SetPageError(page);
+}
+
+static struct page *exofs_get_page(struct inode *dir, unsigned long n)
+{
+	struct address_space *mapping = dir->i_mapping;
+	struct page *page = read_mapping_page(mapping, n, NULL);
+
+	if (!IS_ERR(page)) {
+		kmap(page);
+		if (!PageChecked(page))
+			exofs_check_page(page);
+		if (PageError(page))
+			goto fail;
+	}
+	return page;
+
+fail:
+	exofs_put_page(page);
+	return ERR_PTR(-EIO);
+}
+
+static inline int exofs_match(int len, const unsigned char *name,
+					struct exofs_dir_entry *de)
+{
+	if (len != de->name_len)
+		return 0;
+	if (!de->inode_no)
+		return 0;
+	return !memcmp(name, de->name, len);
+}
+
+static inline
+struct exofs_dir_entry *exofs_next_entry(struct exofs_dir_entry *p)
+{
+	return (struct exofs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
+}
+
+static inline unsigned
+exofs_validate_entry(char *base, unsigned offset, unsigned mask)
+{
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *)(base + offset);
+	struct exofs_dir_entry *p =
+			(struct exofs_dir_entry *)(base + (offset&mask));
+	while ((char *)p < (char *)de) {
+		if (p->rec_len == 0)
+			break;
+		p = exofs_next_entry(p);
+	}
+	return (char *)p - base;
+}
+
+static unsigned char exofs_filetype_table[EXOFS_FT_MAX] = {
+	[EXOFS_FT_UNKNOWN]	= DT_UNKNOWN,
+	[EXOFS_FT_REG_FILE]	= DT_REG,
+	[EXOFS_FT_DIR]		= DT_DIR,
+	[EXOFS_FT_CHRDEV]	= DT_CHR,
+	[EXOFS_FT_BLKDEV]	= DT_BLK,
+	[EXOFS_FT_FIFO]		= DT_FIFO,
+	[EXOFS_FT_SOCK]		= DT_SOCK,
+	[EXOFS_FT_SYMLINK]	= DT_LNK,
+};
+
+#define S_SHIFT 12
+static unsigned char exofs_type_by_mode[S_IFMT >> S_SHIFT] = {
+	[S_IFREG >> S_SHIFT]	= EXOFS_FT_REG_FILE,
+	[S_IFDIR >> S_SHIFT]	= EXOFS_FT_DIR,
+	[S_IFCHR >> S_SHIFT]	= EXOFS_FT_CHRDEV,
+	[S_IFBLK >> S_SHIFT]	= EXOFS_FT_BLKDEV,
+	[S_IFIFO >> S_SHIFT]	= EXOFS_FT_FIFO,
+	[S_IFSOCK >> S_SHIFT]	= EXOFS_FT_SOCK,
+	[S_IFLNK >> S_SHIFT]	= EXOFS_FT_SYMLINK,
+};
+
+static inline
+void exofs_set_de_type(struct exofs_dir_entry *de, struct inode *inode)
+{
+	mode_t mode = inode->i_mode;
+	de->file_type = exofs_type_by_mode[(mode & S_IFMT) >> S_SHIFT];
+}
+
+static int
+exofs_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+	loff_t pos = filp->f_pos;
+	struct inode *inode = filp->f_path.dentry->d_inode;
+	unsigned int offset = pos & ~PAGE_CACHE_MASK;
+	unsigned long n = pos >> PAGE_CACHE_SHIFT;
+	unsigned long npages = dir_pages(inode);
+	unsigned chunk_mask = ~(exofs_chunk_size(inode)-1);
+	unsigned char *types = NULL;
+	int need_revalidate = (filp->f_version != inode->i_version);
+
+	if (pos > inode->i_size - EXOFS_DIR_REC_LEN(1))
+		return 0;
+
+	types = exofs_filetype_table;
+
+	for ( ; n < npages; n++, offset = 0) {
+		char *kaddr, *limit;
+		struct exofs_dir_entry *de;
+		struct page *page = exofs_get_page(inode, n);
+
+		if (IS_ERR(page)) {
+			EXOFS_ERR("ERROR: "
+				   "bad page in #%lu",
+				   inode->i_ino);
+			filp->f_pos += PAGE_CACHE_SIZE - offset;
+			return PTR_ERR(page);
+		}
+		kaddr = page_address(page);
+		if (unlikely(need_revalidate)) {
+			if (offset) {
+				offset = exofs_validate_entry(kaddr, offset,
+								chunk_mask);
+				filp->f_pos = (n<<PAGE_CACHE_SHIFT) + offset;
+			}
+			filp->f_version = inode->i_version;
+			need_revalidate = 0;
+		}
+		de = (struct exofs_dir_entry *)(kaddr + offset);
+		limit = kaddr + exofs_last_byte(inode, n) -
+							EXOFS_DIR_REC_LEN(1);
+		for (; (char *)de <= limit; de = exofs_next_entry(de)) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: "
+					"zero-length directory entry");
+				exofs_put_page(page);
+				return -EIO;
+			}
+			if (de->inode_no) {
+				int over;
+				unsigned char d_type = DT_UNKNOWN;
+
+				if (types && de->file_type < EXOFS_FT_MAX)
+					d_type = types[de->file_type];
+
+				offset = (char *)de - kaddr;
+				over = filldir(dirent, de->name, de->name_len,
+						(n<<PAGE_CACHE_SHIFT) | offset,
+						le64_to_cpu(de->inode_no),
+						d_type);
+				if (over) {
+					exofs_put_page(page);
+					return 0;
+				}
+			}
+			filp->f_pos += le16_to_cpu(de->rec_len);
+		}
+		exofs_put_page(page);
+	}
+
+	return 0;
+}
+
+struct exofs_dir_entry *exofs_find_entry(struct inode *dir,
+			struct dentry *dentry, struct page **res_page)
+{
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned long start, n;
+	unsigned long npages = dir_pages(dir);
+	struct page *page = NULL;
+	struct exofs_i_info *oi = exofs_i(dir);
+	struct exofs_dir_entry *de;
+
+	if (npages == 0)
+		goto out;
+
+	*res_page = NULL;
+
+	start = oi->i_dir_start_lookup;
+	if (start >= npages)
+		start = 0;
+	n = start;
+	do {
+		char *kaddr;
+		page = exofs_get_page(dir, n);
+		if (!IS_ERR(page)) {
+			kaddr = page_address(page);
+			de = (struct exofs_dir_entry *) kaddr;
+			kaddr += exofs_last_byte(dir, n) - reclen;
+			while ((char *) de <= kaddr) {
+				if (de->rec_len == 0) {
+					EXOFS_ERR(
+						"ERROR: exofs_find_entry: "
+						"zero-length directory entry");
+					exofs_put_page(page);
+					goto out;
+				}
+				if (exofs_match(namelen, name, de))
+					goto found;
+				de = exofs_next_entry(de);
+			}
+			exofs_put_page(page);
+		}
+		if (++n >= npages)
+			n = 0;
+	} while (n != start);
+out:
+	return NULL;
+
+found:
+	*res_page = page;
+	oi->i_dir_start_lookup = n;
+	return de;
+}
+
+struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
+{
+	struct page *page = exofs_get_page(dir, 0);
+	struct exofs_dir_entry *de = NULL;
+
+	if (!IS_ERR(page)) {
+		de = exofs_next_entry(
+				(struct exofs_dir_entry *)page_address(page));
+		*p = page;
+	}
+	return de;
+}
+
+ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
+{
+	ino_t res = 0;
+	struct exofs_dir_entry *de;
+	struct page *page;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (de) {
+		res = le64_to_cpu(de->inode_no);
+		exofs_put_page(page);
+	}
+	return res;
+}
+
+int exofs_set_link(struct inode *dir, struct exofs_dir_entry *de,
+			struct page *page, struct inode *inode)
+{
+	loff_t pos = page_offset(page) +
+			(char *) de - (char *) page_address(page);
+	unsigned len = le16_to_cpu(de->rec_len);
+	int err;
+
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, len,
+				AOP_FLAG_UNINTERRUPTIBLE, &page, NULL);
+	if (err)
+		EXOFS_ERR("exofs_set_link: exofs_write_begin FAILD => %d\n",
+			  err);
+
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	if (likely(!err))
+		err = exofs_commit_chunk(page, pos, len);
+	exofs_put_page(page);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	return err;
+}
+
+int exofs_add_link(struct dentry *dentry, struct inode *inode)
+{
+	struct inode *dir = dentry->d_parent->d_inode;
+	const unsigned char *name = dentry->d_name.name;
+	int namelen = dentry->d_name.len;
+	unsigned chunk_size = exofs_chunk_size(dir);
+	unsigned reclen = EXOFS_DIR_REC_LEN(namelen);
+	unsigned short rec_len, name_len;
+	struct page *page = NULL;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	struct exofs_dir_entry *de;
+	unsigned long npages = dir_pages(dir);
+	unsigned long n;
+	char *kaddr;
+	loff_t pos;
+	int err;
+
+	for (n = 0; n <= npages; n++) {
+		char *dir_end;
+
+		page = exofs_get_page(dir, n);
+		err = PTR_ERR(page);
+		if (IS_ERR(page))
+			goto out;
+		lock_page(page);
+		kaddr = page_address(page);
+		dir_end = kaddr + exofs_last_byte(dir, n);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += PAGE_CACHE_SIZE - reclen;
+		while ((char *)de <= kaddr) {
+			if ((char *)de == dir_end) {
+				name_len = 0;
+				rec_len = chunk_size;
+				de->rec_len = cpu_to_le16(chunk_size);
+				de->inode_no = 0;
+				goto got_it;
+			}
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_add_link: "
+					"zero-length directory entry");
+				err = -EIO;
+				goto out_unlock;
+			}
+			err = -EEXIST;
+			if (exofs_match(namelen, name, de))
+				goto out_unlock;
+			name_len = EXOFS_DIR_REC_LEN(de->name_len);
+			rec_len = le16_to_cpu(de->rec_len);
+			if (!de->inode_no && rec_len >= reclen)
+				goto got_it;
+			if (rec_len >= name_len + reclen)
+				goto got_it;
+			de = (struct exofs_dir_entry *) ((char *) de + rec_len);
+		}
+		unlock_page(page);
+		exofs_put_page(page);
+	}
+
+	EXOFS_ERR("exofs_add_link: BAD dentry=%p or inode=%p", dentry, inode);
+	return -EINVAL;
+
+got_it:
+	pos = page_offset(page) +
+		(char *)de - (char *)page_address(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, rec_len, 0,
+							&page, NULL);
+	if (err)
+		goto out_unlock;
+	if (de->inode_no) {
+		struct exofs_dir_entry *de1 =
+			(struct exofs_dir_entry *)((char *)de + name_len);
+		de1->rec_len = cpu_to_le16(rec_len - name_len);
+		de->rec_len = cpu_to_le16(name_len);
+		de = de1;
+	}
+	de->name_len = namelen;
+	memcpy(de->name, name, namelen);
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+	err = exofs_commit_chunk(page, pos, rec_len);
+	dir->i_mtime = dir->i_ctime = CURRENT_TIME;
+	mark_inode_dirty(dir);
+	sbi->s_numfiles++;
+
+out_put:
+	exofs_put_page(page);
+out:
+	return err;
+out_unlock:
+	unlock_page(page);
+	goto out_put;
+}
+
+int exofs_delete_entry(struct exofs_dir_entry *dir, struct page *page)
+{
+	struct address_space *mapping = page->mapping;
+	struct inode *inode = mapping->host;
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	char *kaddr = page_address(page);
+	unsigned from = ((char *)dir - kaddr) & ~(exofs_chunk_size(inode)-1);
+	unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
+	loff_t pos;
+	struct exofs_dir_entry *pde = NULL;
+	struct exofs_dir_entry *de = (struct exofs_dir_entry *) (kaddr + from);
+	int err;
+
+	while (de < dir) {
+		if (de->rec_len == 0) {
+			EXOFS_ERR("ERROR: exofs_delete_entry:"
+				"zero-length directory entry");
+			err = -EIO;
+			goto out;
+		}
+		pde = de;
+		de = exofs_next_entry(de);
+	}
+	if (pde)
+		from = (char *)pde - (char *)page_address(page);
+	pos = page_offset(page) + from;
+	lock_page(page);
+	err = exofs_write_begin(NULL, page->mapping, pos, to - from, 0,
+							&page, NULL);
+	if (err)
+		EXOFS_ERR("exofs_delete_entry: exofs_write_begin FAILD => %d\n",
+			  err);
+	if (pde)
+		pde->rec_len = cpu_to_le16(to - from);
+	dir->inode_no = 0;
+	if (likely(!err))
+		err = exofs_commit_chunk(page, pos, to - from);
+	inode->i_ctime = inode->i_mtime = CURRENT_TIME;
+	mark_inode_dirty(inode);
+	sbi->s_numfiles--;
+out:
+	exofs_put_page(page);
+	return err;
+}
+
+/* kept aligned on 4 bytes */
+#define THIS_DIR ".\0\0"
+#define PARENT_DIR "..\0"
+
+int exofs_make_empty(struct inode *inode, struct inode *parent)
+{
+	struct address_space *mapping = inode->i_mapping;
+	struct page *page = grab_cache_page(mapping, 0);
+	unsigned chunk_size = exofs_chunk_size(inode);
+	struct exofs_dir_entry *de;
+	int err;
+	void *kaddr;
+
+	if (!page)
+		return -ENOMEM;
+
+	err = exofs_write_begin(NULL, page->mapping, 0, chunk_size, 0,
+							&page, NULL);
+	if (err) {
+		unlock_page(page);
+		goto fail;
+	}
+
+	kaddr = kmap_atomic(page, KM_USER0);
+	de = (struct exofs_dir_entry *)kaddr;
+	de->name_len = 1;
+	de->rec_len = cpu_to_le16(EXOFS_DIR_REC_LEN(1));
+	memcpy(de->name, THIS_DIR, sizeof(THIS_DIR));
+	de->inode_no = cpu_to_le64(inode->i_ino);
+	exofs_set_de_type(de, inode);
+
+	de = (struct exofs_dir_entry *)(kaddr + EXOFS_DIR_REC_LEN(1));
+	de->name_len = 2;
+	de->rec_len = cpu_to_le16(chunk_size - EXOFS_DIR_REC_LEN(1));
+	de->inode_no = cpu_to_le64(parent->i_ino);
+	memcpy(de->name, PARENT_DIR, sizeof(PARENT_DIR));
+	exofs_set_de_type(de, inode);
+	kunmap_atomic(page, KM_USER0);
+	err = exofs_commit_chunk(page, 0, chunk_size);
+fail:
+	page_cache_release(page);
+	return err;
+}
+
+int exofs_empty_dir(struct inode *inode)
+{
+	struct page *page = NULL;
+	unsigned long i, npages = dir_pages(inode);
+
+	for (i = 0; i < npages; i++) {
+		char *kaddr;
+		struct exofs_dir_entry *de;
+		page = exofs_get_page(inode, i);
+
+		if (IS_ERR(page))
+			continue;
+
+		kaddr = page_address(page);
+		de = (struct exofs_dir_entry *)kaddr;
+		kaddr += exofs_last_byte(inode, i) - EXOFS_DIR_REC_LEN(1);
+
+		while ((char *)de <= kaddr) {
+			if (de->rec_len == 0) {
+				EXOFS_ERR("ERROR: exofs_empty_dir: "
+					  "zero-length directory entry"
+					  "kaddr=%p, de=%p\n", kaddr, de);
+				goto not_empty;
+			}
+			if (de->inode_no != 0) {
+				/* check for . and .. */
+				if (de->name[0] != '.')
+					goto not_empty;
+				if (de->name_len > 2)
+					goto not_empty;
+				if (de->name_len < 2) {
+					if (le64_to_cpu(de->inode_no) !=
+					    inode->i_ino)
+						goto not_empty;
+				} else if (de->name[1] != '.')
+					goto not_empty;
+			}
+			de = exofs_next_entry(de);
+		}
+		exofs_put_page(page);
+	}
+	return 1;
+
+not_empty:
+	exofs_put_page(page);
+	return 0;
+}
+
+const struct file_operations exofs_dir_operations = {
+	.llseek		= generic_file_llseek,
+	.read		= generic_read_dir,
+	.readdir	= exofs_readdir,
+};
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 0276242326ab..d11982c67e2a 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -124,6 +124,11 @@ static inline struct exofs_i_info *exofs_i(struct inode *inode)
 	return container_of(inode, struct exofs_i_info, vfs_inode);
 }
 
+/*
+ * Maximum count of links to a file
+ */
+#define EXOFS_LINK_MAX           32000
+
 /*************************
  * function declarations *
  *************************/
@@ -133,10 +138,27 @@ int exofs_setattr(struct dentry *, struct iattr *);
 int exofs_write_begin(struct file *file, struct address_space *mapping,
 		loff_t pos, unsigned len, unsigned flags,
 		struct page **pagep, void **fsdata);
+extern struct inode *exofs_iget(struct super_block *, unsigned long);
+struct inode *exofs_new_inode(struct inode *, int);
+
+/* dir.c:                */
+int exofs_add_link(struct dentry *, struct inode *);
+ino_t exofs_inode_by_name(struct inode *, struct dentry *);
+int exofs_delete_entry(struct exofs_dir_entry *, struct page *);
+int exofs_make_empty(struct inode *, struct inode *);
+struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
+					 struct page **);
+int exofs_empty_dir(struct inode *);
+struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
+int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
+		    struct inode *);
 
 /*********************
  * operation vectors *
  *********************/
+/* dir.c:            */
+extern const struct file_operations exofs_dir_operations;
+
 /* file.c            */
 extern const struct inode_operations exofs_file_inode_operations;
 extern const struct file_operations exofs_file_operations;
@@ -144,6 +166,10 @@ extern const struct file_operations exofs_file_operations;
 /* inode.c           */
 extern const struct address_space_operations exofs_aops;
 
+/* namei.c           */
+extern const struct inode_operations exofs_dir_inode_operations;
+extern const struct inode_operations exofs_special_inode_operations;
+
 /* symlink.c         */
 extern const struct inode_operations exofs_symlink_inode_operations;
 extern const struct inode_operations exofs_fast_symlink_inode_operations;
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index a3691d8bfb98..7c26ba371e1b 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -843,3 +843,275 @@ int exofs_setattr(struct dentry *dentry, struct iattr *iattr)
 	error = inode_setattr(inode, iattr);
 	return error;
 }
+
+/*
+ * Read an inode from the OSD, and return it as is.  We also return the size
+ * attribute in the 'sanity' argument if we got compiled with debugging turned
+ * on.
+ */
+static int exofs_get_inode(struct super_block *sb, struct exofs_i_info *oi,
+		    struct exofs_fcb *inode, uint64_t *sanity)
+{
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_request *or;
+	struct osd_attr attr;
+	struct osd_obj_id obj = {sbi->s_pid,
+				 oi->vfs_inode.i_ino + EXOFS_OBJ_OFF};
+	int ret;
+
+	exofs_make_credential(oi->i_cred, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_get_inode: osd_start_request failed.\n");
+		return -ENOMEM;
+	}
+	osd_req_get_attributes(or, &obj);
+
+	/* we need the inode attribute */
+	osd_req_add_get_attr_list(or, &g_attr_inode_data, 1);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+	/* we get the size attributes to do a sanity check */
+	osd_req_add_get_attr_list(or, &g_attr_logical_length, 1);
+#endif
+
+	ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+	if (ret)
+		goto out;
+
+	attr = g_attr_inode_data;
+	ret = extract_attr_from_req(or, &attr);
+	if (ret) {
+		EXOFS_ERR("exofs_get_inode: extract_attr_from_req failed\n");
+		goto out;
+	}
+
+	WARN_ON(attr.len != EXOFS_INO_ATTR_SIZE);
+	memcpy(inode, attr.val_ptr, EXOFS_INO_ATTR_SIZE);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+	attr = g_attr_logical_length;
+	ret = extract_attr_from_req(or, &attr);
+	if (ret) {
+		EXOFS_ERR("ERROR: extract attr from or failed\n");
+		goto out;
+	}
+	*sanity = get_unaligned_be64(attr.val_ptr);
+#endif
+
+out:
+	osd_end_request(or);
+	return ret;
+}
+
+/*
+ * Fill in an inode read from the OSD and set it up for use
+ */
+struct inode *exofs_iget(struct super_block *sb, unsigned long ino)
+{
+	struct exofs_i_info *oi;
+	struct exofs_fcb fcb;
+	struct inode *inode;
+	uint64_t uninitialized_var(sanity);
+	int ret;
+
+	inode = iget_locked(sb, ino);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+	if (!(inode->i_state & I_NEW))
+		return inode;
+	oi = exofs_i(inode);
+
+	/* read the inode from the osd */
+	ret = exofs_get_inode(sb, oi, &fcb, &sanity);
+	if (ret)
+		goto bad_inode;
+
+	init_waitqueue_head(&oi->i_wq);
+	set_obj_created(oi);
+
+	/* copy stuff from on-disk struct to in-memory struct */
+	inode->i_mode = le16_to_cpu(fcb.i_mode);
+	inode->i_uid = le32_to_cpu(fcb.i_uid);
+	inode->i_gid = le32_to_cpu(fcb.i_gid);
+	inode->i_nlink = le16_to_cpu(fcb.i_links_count);
+	inode->i_ctime.tv_sec = (signed)le32_to_cpu(fcb.i_ctime);
+	inode->i_atime.tv_sec = (signed)le32_to_cpu(fcb.i_atime);
+	inode->i_mtime.tv_sec = (signed)le32_to_cpu(fcb.i_mtime);
+	inode->i_ctime.tv_nsec =
+		inode->i_atime.tv_nsec = inode->i_mtime.tv_nsec = 0;
+	oi->i_commit_size = le64_to_cpu(fcb.i_size);
+	i_size_write(inode, oi->i_commit_size);
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_generation = le32_to_cpu(fcb.i_generation);
+
+#ifdef EXOFS_DEBUG_OBJ_ISIZE
+	if ((inode->i_size != sanity) &&
+		(!exofs_inode_is_fast_symlink(inode))) {
+		EXOFS_ERR("WARNING: Size of object from inode and "
+			  "attributes differ (%lld != %llu)\n",
+			  inode->i_size, _LLU(sanity));
+	}
+#endif
+
+	oi->i_dir_start_lookup = 0;
+
+	if ((inode->i_nlink == 0) && (inode->i_mode == 0)) {
+		ret = -ESTALE;
+		goto bad_inode;
+	}
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (fcb.i_data[0])
+			inode->i_rdev =
+				old_decode_dev(le32_to_cpu(fcb.i_data[0]));
+		else
+			inode->i_rdev =
+				new_decode_dev(le32_to_cpu(fcb.i_data[1]));
+	} else {
+		memcpy(oi->i_data, fcb.i_data, sizeof(fcb.i_data));
+	}
+
+	if (S_ISREG(inode->i_mode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISDIR(inode->i_mode)) {
+		inode->i_op = &exofs_dir_inode_operations;
+		inode->i_fop = &exofs_dir_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+	} else if (S_ISLNK(inode->i_mode)) {
+		if (exofs_inode_is_fast_symlink(inode))
+			inode->i_op = &exofs_fast_symlink_inode_operations;
+		else {
+			inode->i_op = &exofs_symlink_inode_operations;
+			inode->i_mapping->a_ops = &exofs_aops;
+		}
+	} else {
+		inode->i_op = &exofs_special_inode_operations;
+		if (fcb.i_data[0])
+			init_special_inode(inode, inode->i_mode,
+			   old_decode_dev(le32_to_cpu(fcb.i_data[0])));
+		else
+			init_special_inode(inode, inode->i_mode,
+			   new_decode_dev(le32_to_cpu(fcb.i_data[1])));
+	}
+
+	unlock_new_inode(inode);
+	return inode;
+
+bad_inode:
+	iget_failed(inode);
+	return ERR_PTR(ret);
+}
+
+int __exofs_wait_obj_created(struct exofs_i_info *oi)
+{
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+	}
+	return unlikely(is_bad_inode(&oi->vfs_inode)) ? -EIO : 0;
+}
+/*
+ * Callback function from exofs_new_inode().  The important thing is that we
+ * set the obj_created flag so that other methods know that the object exists on
+ * the OSD.
+ */
+static void create_done(struct osd_request *or, void *p)
+{
+	struct inode *inode = p;
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct exofs_sb_info *sbi = inode->i_sb->s_fs_info;
+	int ret;
+
+	ret = exofs_check_ok(or);
+	osd_end_request(or);
+	atomic_dec(&sbi->s_curr_pending);
+
+	if (unlikely(ret)) {
+		EXOFS_ERR("object=0x%llx creation faild in pid=0x%llx",
+			  _LLU(sbi->s_pid), _LLU(inode->i_ino + EXOFS_OBJ_OFF));
+		make_bad_inode(inode);
+	} else
+		set_obj_created(oi);
+
+	atomic_dec(&inode->i_count);
+	wake_up(&oi->i_wq);
+}
+
+/*
+ * Set up a new inode and create an object for it on the OSD
+ */
+struct inode *exofs_new_inode(struct inode *dir, int mode)
+{
+	struct super_block *sb;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+	struct exofs_sb_info *sbi;
+	struct osd_request *or;
+	struct osd_obj_id obj;
+	int ret;
+
+	sb = dir->i_sb;
+	inode = new_inode(sb);
+	if (!inode)
+		return ERR_PTR(-ENOMEM);
+
+	oi = exofs_i(inode);
+
+	init_waitqueue_head(&oi->i_wq);
+	set_obj_2bcreated(oi);
+
+	sbi = sb->s_fs_info;
+
+	sb->s_dirt = 1;
+	inode->i_uid = current->cred->fsuid;
+	if (dir->i_mode & S_ISGID) {
+		inode->i_gid = dir->i_gid;
+		if (S_ISDIR(mode))
+			mode |= S_ISGID;
+	} else {
+		inode->i_gid = current->cred->fsgid;
+	}
+	inode->i_mode = mode;
+
+	inode->i_ino = sbi->s_nextid++;
+	inode->i_blkbits = EXOFS_BLKSHIFT;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
+	oi->i_commit_size = inode->i_size = 0;
+	spin_lock(&sbi->s_next_gen_lock);
+	inode->i_generation = sbi->s_next_generation++;
+	spin_unlock(&sbi->s_next_gen_lock);
+	insert_inode_hash(inode);
+
+	mark_inode_dirty(inode);
+
+	obj.partition = sbi->s_pid;
+	obj.id = inode->i_ino + EXOFS_OBJ_OFF;
+	exofs_make_credential(oi->i_cred, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_new_inode: osd_start_request failed\n");
+		return ERR_PTR(-ENOMEM);
+	}
+
+	osd_req_create_object(or, &obj);
+
+	/* increment the refcount so that the inode will still be around when we
+	 * reach the callback
+	 */
+	atomic_inc(&inode->i_count);
+
+	ret = exofs_async_op(or, create_done, inode, oi->i_cred);
+	if (ret) {
+		atomic_dec(&inode->i_count);
+		osd_end_request(or);
+		return ERR_PTR(-EIO);
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return inode;
+}
diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c
new file mode 100644
index 000000000000..77fdd765e76d
--- /dev/null
+++ b/fs/exofs/namei.c
@@ -0,0 +1,342 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include "exofs.h"
+
+static inline int exofs_add_nondir(struct dentry *dentry, struct inode *inode)
+{
+	int err = exofs_add_link(dentry, inode);
+	if (!err) {
+		d_instantiate(dentry, inode);
+		return 0;
+	}
+	inode_dec_link_count(inode);
+	iput(inode);
+	return err;
+}
+
+static struct dentry *exofs_lookup(struct inode *dir, struct dentry *dentry,
+				   struct nameidata *nd)
+{
+	struct inode *inode;
+	ino_t ino;
+
+	if (dentry->d_name.len > EXOFS_NAME_LEN)
+		return ERR_PTR(-ENAMETOOLONG);
+
+	ino = exofs_inode_by_name(dir, dentry);
+	inode = NULL;
+	if (ino) {
+		inode = exofs_iget(dir->i_sb, ino);
+		if (IS_ERR(inode))
+			return ERR_CAST(inode);
+	}
+	return d_splice_alias(inode, dentry);
+}
+
+static int exofs_create(struct inode *dir, struct dentry *dentry, int mode,
+			 struct nameidata *nd)
+{
+	struct inode *inode = exofs_new_inode(dir, mode);
+	int err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		inode->i_op = &exofs_file_inode_operations;
+		inode->i_fop = &exofs_file_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_mknod(struct inode *dir, struct dentry *dentry, int mode,
+		       dev_t rdev)
+{
+	struct inode *inode;
+	int err;
+
+	if (!new_valid_dev(rdev))
+		return -EINVAL;
+
+	inode = exofs_new_inode(dir, mode);
+	err = PTR_ERR(inode);
+	if (!IS_ERR(inode)) {
+		init_special_inode(inode, inode->i_mode, rdev);
+		mark_inode_dirty(inode);
+		err = exofs_add_nondir(dentry, inode);
+	}
+	return err;
+}
+
+static int exofs_symlink(struct inode *dir, struct dentry *dentry,
+			  const char *symname)
+{
+	struct super_block *sb = dir->i_sb;
+	int err = -ENAMETOOLONG;
+	unsigned l = strlen(symname)+1;
+	struct inode *inode;
+	struct exofs_i_info *oi;
+
+	if (l > sb->s_blocksize)
+		goto out;
+
+	inode = exofs_new_inode(dir, S_IFLNK | S_IRWXUGO);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out;
+
+	oi = exofs_i(inode);
+	if (l > sizeof(oi->i_data)) {
+		/* slow symlink */
+		inode->i_op = &exofs_symlink_inode_operations;
+		inode->i_mapping->a_ops = &exofs_aops;
+		memset(oi->i_data, 0, sizeof(oi->i_data));
+
+		err = page_symlink(inode, symname, l);
+		if (err)
+			goto out_fail;
+	} else {
+		/* fast symlink */
+		inode->i_op = &exofs_fast_symlink_inode_operations;
+		memcpy(oi->i_data, symname, l);
+		inode->i_size = l-1;
+	}
+	mark_inode_dirty(inode);
+
+	err = exofs_add_nondir(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	iput(inode);
+	goto out;
+}
+
+static int exofs_link(struct dentry *old_dentry, struct inode *dir,
+		struct dentry *dentry)
+{
+	struct inode *inode = old_dentry->d_inode;
+
+	if (inode->i_nlink >= EXOFS_LINK_MAX)
+		return -EMLINK;
+
+	inode->i_ctime = CURRENT_TIME;
+	inode_inc_link_count(inode);
+	atomic_inc(&inode->i_count);
+
+	return exofs_add_nondir(dentry, inode);
+}
+
+static int exofs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+	struct inode *inode;
+	int err = -EMLINK;
+
+	if (dir->i_nlink >= EXOFS_LINK_MAX)
+		goto out;
+
+	inode_inc_link_count(dir);
+
+	inode = exofs_new_inode(dir, S_IFDIR | mode);
+	err = PTR_ERR(inode);
+	if (IS_ERR(inode))
+		goto out_dir;
+
+	inode->i_op = &exofs_dir_inode_operations;
+	inode->i_fop = &exofs_dir_operations;
+	inode->i_mapping->a_ops = &exofs_aops;
+
+	inode_inc_link_count(inode);
+
+	err = exofs_make_empty(inode, dir);
+	if (err)
+		goto out_fail;
+
+	err = exofs_add_link(dentry, inode);
+	if (err)
+		goto out_fail;
+
+	d_instantiate(dentry, inode);
+out:
+	return err;
+
+out_fail:
+	inode_dec_link_count(inode);
+	inode_dec_link_count(inode);
+	iput(inode);
+out_dir:
+	inode_dec_link_count(dir);
+	goto out;
+}
+
+static int exofs_unlink(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	struct exofs_dir_entry *de;
+	struct page *page;
+	int err = -ENOENT;
+
+	de = exofs_find_entry(dir, dentry, &page);
+	if (!de)
+		goto out;
+
+	err = exofs_delete_entry(de, page);
+	if (err)
+		goto out;
+
+	inode->i_ctime = dir->i_ctime;
+	inode_dec_link_count(inode);
+	err = 0;
+out:
+	return err;
+}
+
+static int exofs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+	struct inode *inode = dentry->d_inode;
+	int err = -ENOTEMPTY;
+
+	if (exofs_empty_dir(inode)) {
+		err = exofs_unlink(dir, dentry);
+		if (!err) {
+			inode->i_size = 0;
+			inode_dec_link_count(inode);
+			inode_dec_link_count(dir);
+		}
+	}
+	return err;
+}
+
+static int exofs_rename(struct inode *old_dir, struct dentry *old_dentry,
+		struct inode *new_dir, struct dentry *new_dentry)
+{
+	struct inode *old_inode = old_dentry->d_inode;
+	struct inode *new_inode = new_dentry->d_inode;
+	struct page *dir_page = NULL;
+	struct exofs_dir_entry *dir_de = NULL;
+	struct page *old_page;
+	struct exofs_dir_entry *old_de;
+	int err = -ENOENT;
+
+	old_de = exofs_find_entry(old_dir, old_dentry, &old_page);
+	if (!old_de)
+		goto out;
+
+	if (S_ISDIR(old_inode->i_mode)) {
+		err = -EIO;
+		dir_de = exofs_dotdot(old_inode, &dir_page);
+		if (!dir_de)
+			goto out_old;
+	}
+
+	if (new_inode) {
+		struct page *new_page;
+		struct exofs_dir_entry *new_de;
+
+		err = -ENOTEMPTY;
+		if (dir_de && !exofs_empty_dir(new_inode))
+			goto out_dir;
+
+		err = -ENOENT;
+		new_de = exofs_find_entry(new_dir, new_dentry, &new_page);
+		if (!new_de)
+			goto out_dir;
+		inode_inc_link_count(old_inode);
+		err = exofs_set_link(new_dir, new_de, new_page, old_inode);
+		new_inode->i_ctime = CURRENT_TIME;
+		if (dir_de)
+			drop_nlink(new_inode);
+		inode_dec_link_count(new_inode);
+		if (err)
+			goto out_dir;
+	} else {
+		if (dir_de) {
+			err = -EMLINK;
+			if (new_dir->i_nlink >= EXOFS_LINK_MAX)
+				goto out_dir;
+		}
+		inode_inc_link_count(old_inode);
+		err = exofs_add_link(new_dentry, old_inode);
+		if (err) {
+			inode_dec_link_count(old_inode);
+			goto out_dir;
+		}
+		if (dir_de)
+			inode_inc_link_count(new_dir);
+	}
+
+	old_inode->i_ctime = CURRENT_TIME;
+
+	exofs_delete_entry(old_de, old_page);
+	inode_dec_link_count(old_inode);
+
+	if (dir_de) {
+		err = exofs_set_link(old_inode, dir_de, dir_page, new_dir);
+		inode_dec_link_count(old_dir);
+		if (err)
+			goto out_dir;
+	}
+	return 0;
+
+
+out_dir:
+	if (dir_de) {
+		kunmap(dir_page);
+		page_cache_release(dir_page);
+	}
+out_old:
+	kunmap(old_page);
+	page_cache_release(old_page);
+out:
+	return err;
+}
+
+const struct inode_operations exofs_dir_inode_operations = {
+	.create 	= exofs_create,
+	.lookup 	= exofs_lookup,
+	.link   	= exofs_link,
+	.unlink 	= exofs_unlink,
+	.symlink	= exofs_symlink,
+	.mkdir  	= exofs_mkdir,
+	.rmdir  	= exofs_rmdir,
+	.mknod  	= exofs_mknod,
+	.rename 	= exofs_rename,
+	.setattr	= exofs_setattr,
+};
+
+const struct inode_operations exofs_special_inode_operations = {
+	.setattr	= exofs_setattr,
+};
-- 
cgit v1.2.3


From ba9e5e98ca2f808fe92b103a8e6ce5271b10cc89 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 28 Oct 2008 16:11:41 +0200
Subject: exofs: super_operations and file_system_type

This patch ties all operation vectors into a file system superblock
and registers the exofs file_system_type at module's load time.

* The file system control block (AKA on-disk superblock) resides in
  an object with a special ID (defined in common.h).
  Information included in the file system control block is used to
  fill the in-memory superblock structure at mount time. This object
  is created before the file system is used by mkexofs.c It contains
  information such as:
	- The file system's magic number
	- The next inode number to be allocated

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/Kbuild  |   2 +-
 fs/exofs/exofs.h |   2 +
 fs/exofs/inode.c | 186 +++++++++++++++++++
 fs/exofs/super.c | 531 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 720 insertions(+), 1 deletion(-)
 create mode 100644 fs/exofs/super.c

(limited to 'fs')

diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild
index 0a23f24db8b2..cc2d22db119c 100644
--- a/fs/exofs/Kbuild
+++ b/fs/exofs/Kbuild
@@ -12,5 +12,5 @@
 # Kbuild - Gets included from the Kernels Makefile and build system
 #
 
-exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o
+exofs-y := osd.o inode.o file.o symlink.o namei.o dir.o super.o
 obj-$(CONFIG_EXOFS_FS) += exofs.o
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index d11982c67e2a..11e11d5c97ba 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -140,6 +140,8 @@ int exofs_write_begin(struct file *file, struct address_space *mapping,
 		struct page **pagep, void **fsdata);
 extern struct inode *exofs_iget(struct super_block *, unsigned long);
 struct inode *exofs_new_inode(struct inode *, int);
+extern int exofs_write_inode(struct inode *, int);
+extern void exofs_delete_inode(struct inode *);
 
 /* dir.c:                */
 int exofs_add_link(struct dentry *, struct inode *);
diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c
index 7c26ba371e1b..ba8d9fab4693 100644
--- a/fs/exofs/inode.c
+++ b/fs/exofs/inode.c
@@ -1115,3 +1115,189 @@ struct inode *exofs_new_inode(struct inode *dir, int mode)
 
 	return inode;
 }
+
+/*
+ * struct to pass two arguments to update_inode's callback
+ */
+struct updatei_args {
+	struct exofs_sb_info	*sbi;
+	struct exofs_fcb	fcb;
+};
+
+/*
+ * Callback function from exofs_update_inode().
+ */
+static void updatei_done(struct osd_request *or, void *p)
+{
+	struct updatei_args *args = p;
+
+	osd_end_request(or);
+
+	atomic_dec(&args->sbi->s_curr_pending);
+
+	kfree(args);
+}
+
+/*
+ * Write the inode to the OSD.  Just fill up the struct, and set the attribute
+ * synchronously or asynchronously depending on the do_sync flag.
+ */
+static int exofs_update_inode(struct inode *inode, int do_sync)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or;
+	struct osd_attr attr;
+	struct exofs_fcb *fcb;
+	struct updatei_args *args;
+	int ret;
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args)
+		return -ENOMEM;
+
+	fcb = &args->fcb;
+
+	fcb->i_mode = cpu_to_le16(inode->i_mode);
+	fcb->i_uid = cpu_to_le32(inode->i_uid);
+	fcb->i_gid = cpu_to_le32(inode->i_gid);
+	fcb->i_links_count = cpu_to_le16(inode->i_nlink);
+	fcb->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
+	fcb->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
+	fcb->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
+	oi->i_commit_size = i_size_read(inode);
+	fcb->i_size = cpu_to_le64(oi->i_commit_size);
+	fcb->i_generation = cpu_to_le32(inode->i_generation);
+
+	if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
+		if (old_valid_dev(inode->i_rdev)) {
+			fcb->i_data[0] =
+				cpu_to_le32(old_encode_dev(inode->i_rdev));
+			fcb->i_data[1] = 0;
+		} else {
+			fcb->i_data[0] = 0;
+			fcb->i_data[1] =
+				cpu_to_le32(new_encode_dev(inode->i_rdev));
+			fcb->i_data[2] = 0;
+		}
+	} else
+		memcpy(fcb->i_data, oi->i_data, sizeof(fcb->i_data));
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_update_inode: osd_start_request failed.\n");
+		ret = -ENOMEM;
+		goto free_args;
+	}
+
+	osd_req_set_attributes(or, &obj);
+
+	attr = g_attr_inode_data;
+	attr.val_ptr = fcb;
+	osd_req_add_set_attr_list(or, &attr, 1);
+
+	if (!obj_created(oi)) {
+		EXOFS_DBGMSG("!obj_created\n");
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+		EXOFS_DBGMSG("wait_event done\n");
+	}
+
+	if (do_sync) {
+		ret = exofs_sync_op(or, sbi->s_timeout, oi->i_cred);
+		osd_end_request(or);
+		goto free_args;
+	} else {
+		args->sbi = sbi;
+
+		ret = exofs_async_op(or, updatei_done, args, oi->i_cred);
+		if (ret) {
+			osd_end_request(or);
+			goto free_args;
+		}
+		atomic_inc(&sbi->s_curr_pending);
+		goto out; /* deallocation in updatei_done */
+	}
+
+free_args:
+	kfree(args);
+out:
+	EXOFS_DBGMSG("ret=>%d\n", ret);
+	return ret;
+}
+
+int exofs_write_inode(struct inode *inode, int wait)
+{
+	return exofs_update_inode(inode, wait);
+}
+
+/*
+ * Callback function from exofs_delete_inode() - don't have much cleaning up to
+ * do.
+ */
+static void delete_done(struct osd_request *or, void *p)
+{
+	struct exofs_sb_info *sbi;
+	osd_end_request(or);
+	sbi = p;
+	atomic_dec(&sbi->s_curr_pending);
+}
+
+/*
+ * Called when the refcount of an inode reaches zero.  We remove the object
+ * from the OSD here.  We make sure the object was created before we try and
+ * delete it.
+ */
+void exofs_delete_inode(struct inode *inode)
+{
+	struct exofs_i_info *oi = exofs_i(inode);
+	struct super_block *sb = inode->i_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_obj_id obj = {sbi->s_pid, inode->i_ino + EXOFS_OBJ_OFF};
+	struct osd_request *or;
+	int ret;
+
+	truncate_inode_pages(&inode->i_data, 0);
+
+	if (is_bad_inode(inode))
+		goto no_delete;
+
+	mark_inode_dirty(inode);
+	exofs_update_inode(inode, inode_needs_sync(inode));
+
+	inode->i_size = 0;
+	if (inode->i_blocks)
+		exofs_truncate(inode);
+
+	clear_inode(inode);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_delete_inode: osd_start_request failed\n");
+		return;
+	}
+
+	osd_req_remove_object(or, &obj);
+
+	/* if we are deleting an obj that hasn't been created yet, wait */
+	if (!obj_created(oi)) {
+		BUG_ON(!obj_2bcreated(oi));
+		wait_event(oi->i_wq, obj_created(oi));
+	}
+
+	ret = exofs_async_op(or, delete_done, sbi, oi->i_cred);
+	if (ret) {
+		EXOFS_ERR(
+		       "ERROR: @exofs_delete_inode exofs_async_op failed\n");
+		osd_end_request(or);
+		return;
+	}
+	atomic_inc(&sbi->s_curr_pending);
+
+	return;
+
+no_delete:
+	clear_inode(inode);
+}
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
new file mode 100644
index 000000000000..b736b117a2a4
--- /dev/null
+++ b/fs/exofs/super.c
@@ -0,0 +1,531 @@
+/*
+ * Copyright (C) 2005, 2006
+ * Avishay Traeger (avishay@gmail.com) (avishay@il.ibm.com)
+ * Copyright (C) 2005, 2006
+ * International Business Machines
+ * Copyright (C) 2008, 2009
+ * Boaz Harrosh <bharrosh@panasas.com>
+ *
+ * Copyrights for code taken from ext2:
+ *     Copyright (C) 1992, 1993, 1994, 1995
+ *     Remy Card (card@masi.ibp.fr)
+ *     Laboratoire MASI - Institut Blaise Pascal
+ *     Universite Pierre et Marie Curie (Paris VI)
+ *     from
+ *     linux/fs/minix/inode.c
+ *     Copyright (C) 1991, 1992  Linus Torvalds
+ *
+ * This file is part of exofs.
+ *
+ * exofs is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation.  Since it is based on ext2, and the only
+ * valid version of GPL for the Linux kernel is version 2, the only valid
+ * version of GPL for exofs is version 2.
+ *
+ * exofs is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with exofs; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ */
+
+#include <linux/string.h>
+#include <linux/parser.h>
+#include <linux/vfs.h>
+#include <linux/random.h>
+
+#include "exofs.h"
+
+/******************************************************************************
+ * MOUNT OPTIONS
+ *****************************************************************************/
+
+/*
+ * struct to hold what we get from mount options
+ */
+struct exofs_mountopt {
+	const char *dev_name;
+	uint64_t pid;
+	int timeout;
+};
+
+/*
+ * exofs-specific mount-time options.
+ */
+enum { Opt_pid, Opt_to, Opt_mkfs, Opt_format, Opt_err };
+
+/*
+ * Our mount-time options.  These should ideally be 64-bit unsigned, but the
+ * kernel's parsing functions do not currently support that.  32-bit should be
+ * sufficient for most applications now.
+ */
+static match_table_t tokens = {
+	{Opt_pid, "pid=%u"},
+	{Opt_to, "to=%u"},
+	{Opt_err, NULL}
+};
+
+/*
+ * The main option parsing method.  Also makes sure that all of the mandatory
+ * mount options were set.
+ */
+static int parse_options(char *options, struct exofs_mountopt *opts)
+{
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int option;
+	bool s_pid = false;
+
+	EXOFS_DBGMSG("parse_options %s\n", options);
+	/* defaults */
+	memset(opts, 0, sizeof(*opts));
+	opts->timeout = BLK_DEFAULT_SG_TIMEOUT;
+
+	while ((p = strsep(&options, ",")) != NULL) {
+		int token;
+		char str[32];
+
+		if (!*p)
+			continue;
+
+		token = match_token(p, tokens, args);
+		switch (token) {
+		case Opt_pid:
+			if (0 == match_strlcpy(str, &args[0], sizeof(str)))
+				return -EINVAL;
+			opts->pid = simple_strtoull(str, NULL, 0);
+			if (opts->pid < EXOFS_MIN_PID) {
+				EXOFS_ERR("Partition ID must be >= %u",
+					  EXOFS_MIN_PID);
+				return -EINVAL;
+			}
+			s_pid = 1;
+			break;
+		case Opt_to:
+			if (match_int(&args[0], &option))
+				return -EINVAL;
+			if (option <= 0) {
+				EXOFS_ERR("Timout must be > 0");
+				return -EINVAL;
+			}
+			opts->timeout = option * HZ;
+			break;
+		}
+	}
+
+	if (!s_pid) {
+		EXOFS_ERR("Need to specify the following options:\n");
+		EXOFS_ERR("    -o pid=pid_no_to_use\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/******************************************************************************
+ * INODE CACHE
+ *****************************************************************************/
+
+/*
+ * Our inode cache.  Isn't it pretty?
+ */
+static struct kmem_cache *exofs_inode_cachep;
+
+/*
+ * Allocate an inode in the cache
+ */
+static struct inode *exofs_alloc_inode(struct super_block *sb)
+{
+	struct exofs_i_info *oi;
+
+	oi = kmem_cache_alloc(exofs_inode_cachep, GFP_KERNEL);
+	if (!oi)
+		return NULL;
+
+	oi->vfs_inode.i_version = 1;
+	return &oi->vfs_inode;
+}
+
+/*
+ * Remove an inode from the cache
+ */
+static void exofs_destroy_inode(struct inode *inode)
+{
+	kmem_cache_free(exofs_inode_cachep, exofs_i(inode));
+}
+
+/*
+ * Initialize the inode
+ */
+static void exofs_init_once(void *foo)
+{
+	struct exofs_i_info *oi = foo;
+
+	inode_init_once(&oi->vfs_inode);
+}
+
+/*
+ * Create and initialize the inode cache
+ */
+static int init_inodecache(void)
+{
+	exofs_inode_cachep = kmem_cache_create("exofs_inode_cache",
+				sizeof(struct exofs_i_info), 0,
+				SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
+				exofs_init_once);
+	if (exofs_inode_cachep == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Destroy the inode cache
+ */
+static void destroy_inodecache(void)
+{
+	kmem_cache_destroy(exofs_inode_cachep);
+}
+
+/******************************************************************************
+ * SUPERBLOCK FUNCTIONS
+ *****************************************************************************/
+static const struct super_operations exofs_sops;
+
+/*
+ * Write the superblock to the OSD
+ */
+static void exofs_write_super(struct super_block *sb)
+{
+	struct exofs_sb_info *sbi;
+	struct exofs_fscb *fscb;
+	struct osd_request *or;
+	struct osd_obj_id obj;
+	int ret;
+
+	fscb = kzalloc(sizeof(struct exofs_fscb), GFP_KERNEL);
+	if (!fscb) {
+		EXOFS_ERR("exofs_write_super: memory allocation failed.\n");
+		return;
+	}
+
+	lock_kernel();
+	sbi = sb->s_fs_info;
+	fscb->s_nextid = cpu_to_le64(sbi->s_nextid);
+	fscb->s_numfiles = cpu_to_le32(sbi->s_numfiles);
+	fscb->s_magic = cpu_to_le16(sb->s_magic);
+	fscb->s_newfs = 0;
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_ERR("exofs_write_super: osd_start_request failed.\n");
+		goto out;
+	}
+
+	obj.partition = sbi->s_pid;
+	obj.id = EXOFS_SUPER_ID;
+	ret = osd_req_write_kern(or, &obj, 0, fscb, sizeof(*fscb));
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_write_super: osd_req_write_kern failed.\n");
+		goto out;
+	}
+
+	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+	if (unlikely(ret)) {
+		EXOFS_ERR("exofs_write_super: exofs_sync_op failed.\n");
+		goto out;
+	}
+	sb->s_dirt = 0;
+
+out:
+	if (or)
+		osd_end_request(or);
+	unlock_kernel();
+	kfree(fscb);
+}
+
+/*
+ * This function is called when the vfs is freeing the superblock.  We just
+ * need to free our own part.
+ */
+static void exofs_put_super(struct super_block *sb)
+{
+	int num_pend;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+
+	/* make sure there are no pending commands */
+	for (num_pend = atomic_read(&sbi->s_curr_pending); num_pend > 0;
+	     num_pend = atomic_read(&sbi->s_curr_pending)) {
+		wait_queue_head_t wq;
+		init_waitqueue_head(&wq);
+		wait_event_timeout(wq,
+				  (atomic_read(&sbi->s_curr_pending) == 0),
+				  msecs_to_jiffies(100));
+	}
+
+	osduld_put_device(sbi->s_dev);
+	kfree(sb->s_fs_info);
+	sb->s_fs_info = NULL;
+}
+
+/*
+ * Read the superblock from the OSD and fill in the fields
+ */
+static int exofs_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *root;
+	struct exofs_mountopt *opts = data;
+	struct exofs_sb_info *sbi;	/*extended info                  */
+	struct exofs_fscb fscb;		/*on-disk superblock info        */
+	struct osd_request *or = NULL;
+	struct osd_obj_id obj;
+	int ret;
+
+	sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
+	if (!sbi)
+		return -ENOMEM;
+	sb->s_fs_info = sbi;
+
+	/* use mount options to fill superblock */
+	sbi->s_dev = osduld_path_lookup(opts->dev_name);
+	if (IS_ERR(sbi->s_dev)) {
+		ret = PTR_ERR(sbi->s_dev);
+		sbi->s_dev = NULL;
+		goto free_sbi;
+	}
+
+	sbi->s_pid = opts->pid;
+	sbi->s_timeout = opts->timeout;
+
+	/* fill in some other data by hand */
+	memset(sb->s_id, 0, sizeof(sb->s_id));
+	strcpy(sb->s_id, "exofs");
+	sb->s_blocksize = EXOFS_BLKSIZE;
+	sb->s_blocksize_bits = EXOFS_BLKSHIFT;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	atomic_set(&sbi->s_curr_pending, 0);
+	sb->s_bdev = NULL;
+	sb->s_dev = 0;
+
+	/* read data from on-disk superblock object */
+	obj.partition = sbi->s_pid;
+	obj.id = EXOFS_SUPER_ID;
+	exofs_make_credential(sbi->s_cred, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		if (!silent)
+			EXOFS_ERR(
+			       "exofs_fill_super: osd_start_request failed.\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+	ret = osd_req_read_kern(or, &obj, 0, &fscb, sizeof(fscb));
+	if (unlikely(ret)) {
+		if (!silent)
+			EXOFS_ERR(
+			       "exofs_fill_super: osd_req_read_kern failed.\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+
+	ret = exofs_sync_op(or, sbi->s_timeout, sbi->s_cred);
+	if (unlikely(ret)) {
+		if (!silent)
+			EXOFS_ERR("exofs_fill_super: exofs_sync_op failed.\n");
+		ret = -EIO;
+		goto free_sbi;
+	}
+
+	sb->s_magic = le16_to_cpu(fscb.s_magic);
+	sbi->s_nextid = le64_to_cpu(fscb.s_nextid);
+	sbi->s_numfiles = le32_to_cpu(fscb.s_numfiles);
+
+	/* make sure what we read from the object store is correct */
+	if (sb->s_magic != EXOFS_SUPER_MAGIC) {
+		if (!silent)
+			EXOFS_ERR("ERROR: Bad magic value\n");
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	/* start generation numbers from a random point */
+	get_random_bytes(&sbi->s_next_generation, sizeof(u32));
+	spin_lock_init(&sbi->s_next_gen_lock);
+
+	/* set up operation vectors */
+	sb->s_op = &exofs_sops;
+	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
+	if (IS_ERR(root)) {
+		EXOFS_ERR("ERROR: exofs_iget failed\n");
+		ret = PTR_ERR(root);
+		goto free_sbi;
+	}
+	sb->s_root = d_alloc_root(root);
+	if (!sb->s_root) {
+		iput(root);
+		EXOFS_ERR("ERROR: get root inode failed\n");
+		ret = -ENOMEM;
+		goto free_sbi;
+	}
+
+	if (!S_ISDIR(root->i_mode)) {
+		dput(sb->s_root);
+		sb->s_root = NULL;
+		EXOFS_ERR("ERROR: corrupt root inode (mode = %hd)\n",
+		       root->i_mode);
+		ret = -EINVAL;
+		goto free_sbi;
+	}
+
+	ret = 0;
+out:
+	if (or)
+		osd_end_request(or);
+	return ret;
+
+free_sbi:
+	osduld_put_device(sbi->s_dev); /* NULL safe */
+	kfree(sbi);
+	goto out;
+}
+
+/*
+ * Set up the superblock (calls exofs_fill_super eventually)
+ */
+static int exofs_get_sb(struct file_system_type *type,
+			  int flags, const char *dev_name,
+			  void *data, struct vfsmount *mnt)
+{
+	struct exofs_mountopt opts;
+	int ret;
+
+	ret = parse_options(data, &opts);
+	if (ret)
+		return ret;
+
+	opts.dev_name = dev_name;
+	return get_sb_nodev(type, flags, &opts, exofs_fill_super, mnt);
+}
+
+/*
+ * Return information about the file system state in the buffer.  This is used
+ * by the 'df' command, for example.
+ */
+static int exofs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+	struct super_block *sb = dentry->d_sb;
+	struct exofs_sb_info *sbi = sb->s_fs_info;
+	struct osd_obj_id obj = {sbi->s_pid, 0};
+	struct osd_attr attrs[] = {
+		ATTR_DEF(OSD_APAGE_PARTITION_QUOTAS,
+			OSD_ATTR_PQ_CAPACITY_QUOTA, sizeof(__be64)),
+		ATTR_DEF(OSD_APAGE_PARTITION_INFORMATION,
+			OSD_ATTR_PI_USED_CAPACITY, sizeof(__be64)),
+	};
+	uint64_t capacity = ULLONG_MAX;
+	uint64_t used = ULLONG_MAX;
+	struct osd_request *or;
+	uint8_t cred_a[OSD_CAP_LEN];
+	int ret;
+
+	/* get used/capacity attributes */
+	exofs_make_credential(cred_a, &obj);
+
+	or = osd_start_request(sbi->s_dev, GFP_KERNEL);
+	if (unlikely(!or)) {
+		EXOFS_DBGMSG("exofs_statfs: osd_start_request failed.\n");
+		return -ENOMEM;
+	}
+
+	osd_req_get_attributes(or, &obj);
+	osd_req_add_get_attr_list(or, attrs, ARRAY_SIZE(attrs));
+	ret = exofs_sync_op(or, sbi->s_timeout, cred_a);
+	if (unlikely(ret))
+		goto out;
+
+	ret = extract_attr_from_req(or, &attrs[0]);
+	if (likely(!ret))
+		capacity = get_unaligned_be64(attrs[0].val_ptr);
+	else
+		EXOFS_DBGMSG("exofs_statfs: get capacity failed.\n");
+
+	ret = extract_attr_from_req(or, &attrs[1]);
+	if (likely(!ret))
+		used = get_unaligned_be64(attrs[1].val_ptr);
+	else
+		EXOFS_DBGMSG("exofs_statfs: get used-space failed.\n");
+
+	/* fill in the stats buffer */
+	buf->f_type = EXOFS_SUPER_MAGIC;
+	buf->f_bsize = EXOFS_BLKSIZE;
+	buf->f_blocks = (capacity >> EXOFS_BLKSHIFT);
+	buf->f_bfree = ((capacity - used) >> EXOFS_BLKSHIFT);
+	buf->f_bavail = buf->f_bfree;
+	buf->f_files = sbi->s_numfiles;
+	buf->f_ffree = EXOFS_MAX_ID - sbi->s_numfiles;
+	buf->f_namelen = EXOFS_NAME_LEN;
+
+out:
+	osd_end_request(or);
+	return ret;
+}
+
+static const struct super_operations exofs_sops = {
+	.alloc_inode    = exofs_alloc_inode,
+	.destroy_inode  = exofs_destroy_inode,
+	.write_inode    = exofs_write_inode,
+	.delete_inode   = exofs_delete_inode,
+	.put_super      = exofs_put_super,
+	.write_super    = exofs_write_super,
+	.statfs         = exofs_statfs,
+};
+
+/******************************************************************************
+ * INSMOD/RMMOD
+ *****************************************************************************/
+
+/*
+ * struct that describes this file system
+ */
+static struct file_system_type exofs_type = {
+	.owner          = THIS_MODULE,
+	.name           = "exofs",
+	.get_sb         = exofs_get_sb,
+	.kill_sb        = generic_shutdown_super,
+};
+
+static int __init init_exofs(void)
+{
+	int err;
+
+	err = init_inodecache();
+	if (err)
+		goto out;
+
+	err = register_filesystem(&exofs_type);
+	if (err)
+		goto out_d;
+
+	return 0;
+out_d:
+	destroy_inodecache();
+out:
+	return err;
+}
+
+static void __exit exit_exofs(void)
+{
+	unregister_filesystem(&exofs_type);
+	destroy_inodecache();
+}
+
+MODULE_AUTHOR("Avishay Traeger <avishay@gmail.com>");
+MODULE_DESCRIPTION("exofs");
+MODULE_LICENSE("GPL");
+
+module_init(init_exofs)
+module_exit(exit_exofs)
-- 
cgit v1.2.3


From 8cf74b3936f5c65975b68333727f0540ec8350f4 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Sun, 22 Mar 2009 12:47:26 +0200
Subject: exofs: export_operations

implement export_operations and set in superblock.
It is now posible to export exofs via nfs

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/dir.c   | 15 +++++++++++++++
 fs/exofs/exofs.h |  1 +
 fs/exofs/super.c | 53 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 69 insertions(+)

(limited to 'fs')

diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c
index 6865605fb6ab..65b0c8c776a1 100644
--- a/fs/exofs/dir.c
+++ b/fs/exofs/dir.c
@@ -381,6 +381,21 @@ struct exofs_dir_entry *exofs_dotdot(struct inode *dir, struct page **p)
 	return de;
 }
 
+ino_t exofs_parent_ino(struct dentry *child)
+{
+	struct page *page;
+	struct exofs_dir_entry *de;
+	ino_t ino;
+
+	de = exofs_dotdot(child->d_inode, &page);
+	if (!de)
+		return 0;
+
+	ino = le64_to_cpu(de->inode_no);
+	exofs_put_page(page);
+	return ino;
+}
+
 ino_t exofs_inode_by_name(struct inode *dir, struct dentry *dentry)
 {
 	ino_t res = 0;
diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h
index 11e11d5c97ba..0fd4c7859679 100644
--- a/fs/exofs/exofs.h
+++ b/fs/exofs/exofs.h
@@ -152,6 +152,7 @@ struct exofs_dir_entry *exofs_find_entry(struct inode *, struct dentry *,
 					 struct page **);
 int exofs_empty_dir(struct inode *);
 struct exofs_dir_entry *exofs_dotdot(struct inode *, struct page **);
+ino_t exofs_parent_ino(struct dentry *child);
 int exofs_set_link(struct inode *, struct exofs_dir_entry *, struct page *,
 		    struct inode *);
 
diff --git a/fs/exofs/super.c b/fs/exofs/super.c
index b736b117a2a4..9f1985e857e2 100644
--- a/fs/exofs/super.c
+++ b/fs/exofs/super.c
@@ -37,6 +37,7 @@
 #include <linux/parser.h>
 #include <linux/vfs.h>
 #include <linux/random.h>
+#include <linux/exportfs.h>
 
 #include "exofs.h"
 
@@ -194,6 +195,7 @@ static void destroy_inodecache(void)
  * SUPERBLOCK FUNCTIONS
  *****************************************************************************/
 static const struct super_operations exofs_sops;
+static const struct export_operations exofs_export_ops;
 
 /*
  * Write the superblock to the OSD
@@ -358,6 +360,7 @@ static int exofs_fill_super(struct super_block *sb, void *data, int silent)
 
 	/* set up operation vectors */
 	sb->s_op = &exofs_sops;
+	sb->s_export_op = &exofs_export_ops;
 	root = exofs_iget(sb, EXOFS_ROOT_ID - EXOFS_OBJ_OFF);
 	if (IS_ERR(root)) {
 		EXOFS_ERR("ERROR: exofs_iget failed\n");
@@ -484,6 +487,56 @@ static const struct super_operations exofs_sops = {
 	.statfs         = exofs_statfs,
 };
 
+/******************************************************************************
+ * EXPORT OPERATIONS
+ *****************************************************************************/
+
+struct dentry *exofs_get_parent(struct dentry *child)
+{
+	unsigned long ino = exofs_parent_ino(child);
+
+	if (!ino)
+		return NULL;
+
+	return d_obtain_alias(exofs_iget(child->d_inode->i_sb, ino));
+}
+
+static struct inode *exofs_nfs_get_inode(struct super_block *sb,
+		u64 ino, u32 generation)
+{
+	struct inode *inode;
+
+	inode = exofs_iget(sb, ino);
+	if (IS_ERR(inode))
+		return ERR_CAST(inode);
+	if (generation && inode->i_generation != generation) {
+		/* we didn't find the right inode.. */
+		iput(inode);
+		return ERR_PTR(-ESTALE);
+	}
+	return inode;
+}
+
+static struct dentry *exofs_fh_to_dentry(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
+				    exofs_nfs_get_inode);
+}
+
+static struct dentry *exofs_fh_to_parent(struct super_block *sb,
+				struct fid *fid, int fh_len, int fh_type)
+{
+	return generic_fh_to_parent(sb, fid, fh_len, fh_type,
+				    exofs_nfs_get_inode);
+}
+
+static const struct export_operations exofs_export_ops = {
+	.fh_to_dentry = exofs_fh_to_dentry,
+	.fh_to_parent = exofs_fh_to_parent,
+	.get_parent = exofs_get_parent,
+};
+
 /******************************************************************************
  * INSMOD/RMMOD
  *****************************************************************************/
-- 
cgit v1.2.3


From 214c8adb87b880690b103e86baffc46f8288fa62 Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Tue, 28 Oct 2008 17:22:01 +0200
Subject: exofs: Documentation

Added some documentation in exofs.txt, as well as a BUGS file.

For further reading, operation instructions, example scripts
and up to date infomation and code please see:
http://open-osd.org

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/exofs/BUGS | 3 +++
 1 file changed, 3 insertions(+)
 create mode 100644 fs/exofs/BUGS

(limited to 'fs')

diff --git a/fs/exofs/BUGS b/fs/exofs/BUGS
new file mode 100644
index 000000000000..1b2d4c63a579
--- /dev/null
+++ b/fs/exofs/BUGS
@@ -0,0 +1,3 @@
+- Out-of-space may cause a severe problem if the object (and directory entry)
+  were written, but the inode attributes failed. Then if the filesystem was
+  unmounted and mounted the kernel can get into an endless loop doing a readdir.
-- 
cgit v1.2.3


From 0d8fe329a80714e0f729ae48cba8d64cbe5701cb Mon Sep 17 00:00:00 2001
From: Boaz Harrosh <bharrosh@panasas.com>
Date: Mon, 15 Dec 2008 12:53:08 +0200
Subject: fs: Add exofs to Kernel build

- Add exofs to fs/Kconfig under "menu 'Miscellaneous filesystems'"
- Add exofs to fs/Makefile

Signed-off-by: Boaz Harrosh <bharrosh@panasas.com>
---
 fs/Kconfig  | 2 ++
 fs/Makefile | 1 +
 2 files changed, 3 insertions(+)

(limited to 'fs')

diff --git a/fs/Kconfig b/fs/Kconfig
index cef8b18ceaa3..ae3b34a2ea69 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -169,6 +169,8 @@ source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 
+source "fs/exofs/Kconfig"
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd4..ba77ef2d293f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -120,3 +120,4 @@ obj-$(CONFIG_DEBUG_FS)		+= debugfs/
 obj-$(CONFIG_OCFS2_FS)		+= ocfs2/
 obj-$(CONFIG_BTRFS_FS)		+= btrfs/
 obj-$(CONFIG_GFS2_FS)           += gfs2/
+obj-$(CONFIG_EXOFS_FS)          += exofs/
-- 
cgit v1.2.3