path: root/Documentation/filesystems
diff options
authorAnton Arapov <>2012-06-08 12:58:00 +0200
committerAnton Arapov <>2012-06-08 12:58:00 +0200
commit6792a3f47a2e42d7164292bf7f1a55cfc4c91652 (patch)
treeb90c002bfbbeaec92f5d8a2383dcabf6524016f7 /Documentation/filesystems
parentfe2895d3d55146cac65b273c0f83e2c7e543cd0e (diff)
fedora kernel: b920e9b748c595f970bf80ede7832d39f8d567dav3.4.1-2
Signed-off-by: Anton Arapov <>
Diffstat (limited to 'Documentation/filesystems')
11 files changed, 278 insertions, 21 deletions
diff --git a/Documentation/filesystems/debugfs.txt b/Documentation/filesystems/debugfs.txt
index 6872c91bce3..7a34f827989 100644
--- a/Documentation/filesystems/debugfs.txt
+++ b/Documentation/filesystems/debugfs.txt
@@ -14,7 +14,10 @@ Debugfs is typically mounted with a command like:
mount -t debugfs none /sys/kernel/debug
-(Or an equivalent /etc/fstab line).
+(Or an equivalent /etc/fstab line).
+The debugfs root directory is accessible by anyone by default. To
+restrict access to the tree the "uid", "gid" and "mode" mount
+options can be used.
Note that the debugfs API is exported GPL-only to modules.
@@ -133,7 +136,7 @@ file.
void __iomem *base;
- struct dentry *debugfs_create_regset32(const char *name, mode_t mode,
+ struct dentry *debugfs_create_regset32(const char *name, umode_t mode,
struct dentry *parent,
struct debugfs_regset32 *regset);
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 10ec4639f15..1b7f9acbcbb 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -144,9 +144,6 @@ journal_async_commit Commit block can be written to disk without waiting
mount the device. This will enable 'journal_checksum'
-journal=update Update the ext4 file system's journal to the current
- format.
journal_dev=devnum When the external journal device's major/minor numbers
have changed, this option allows the user to specify
the new journal location. The journal device is
@@ -308,7 +305,7 @@ min_batch_time=usec This parameter sets the commit time (as
fast disks, at the cost of increasing latency.
journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
- highest priorty) which should be used for I/O
+ highest priority) which should be used for I/O
operations submitted by kjournald2 during a
commit operation. This defaults to 3, which is
a slightly higher priority than the default I/O
@@ -343,7 +340,7 @@ noinit_itable Do not initialize any uninitialized inode table
init_itable=n The lazy itable init code will wait n times the
number of milliseconds it took to zero out the
previous block group's inode table. This
- minimizes the impact on the systme performance
+ minimizes the impact on the system performance
while file system's inode table is being initialized.
discard Controls whether ext4 should issue discard/TRIM
@@ -356,11 +353,6 @@ nouid32 Disables 32-bit UIDs and GIDs. This is for
interoperability with older kernels which only
store and expect 16-bit values.
-resize Allows to resize filesystem to the end of the last
- existing block group, further resize has to be done
- with resize2fs either online, or offline. It can be
- used only with conjunction with remount.
block_validity This options allows to enables/disables the in-kernel
noblock_validity facility for tracking filesystem metadata blocks
within internal data structures. This allows multi-
diff --git a/Documentation/filesystems/files.txt b/Documentation/filesystems/files.txt
index ac2facc50d2..46dfc6b038c 100644
--- a/Documentation/filesystems/files.txt
+++ b/Documentation/filesystems/files.txt
@@ -113,8 +113,8 @@ the fdtable structure -
if (fd >= 0) {
/* locate_fd() may have expanded fdtable, load the ptr */
fdt = files_fdtable(files);
- FD_SET(fd, fdt->open_fds);
- FD_CLR(fd, fdt->close_on_exec);
+ __set_open_fd(fd, fdt);
+ __clear_close_on_exec(fd, fdt);
diff --git a/Documentation/filesystems/gfs2-uevents.txt b/Documentation/filesystems/gfs2-uevents.txt
index d8188966929..19a19ebebc3 100644
--- a/Documentation/filesystems/gfs2-uevents.txt
+++ b/Documentation/filesystems/gfs2-uevents.txt
@@ -62,7 +62,7 @@ be fixed.
The REMOVE uevent is generated at the end of an unsuccessful mount
or at the end of a umount of the filesystem. All REMOVE uevents will
-have been preceded by at least an ADD uevent for the same fileystem,
+have been preceded by at least an ADD uevent for the same filesystem,
and unlike the other uevents is generated automatically by the kernel's
kobject subsystem.
diff --git a/Documentation/filesystems/nfs/pnfs.txt b/Documentation/filesystems/nfs/pnfs.txt
index 983e14abe7e..c7919c6e3be 100644
--- a/Documentation/filesystems/nfs/pnfs.txt
+++ b/Documentation/filesystems/nfs/pnfs.txt
@@ -53,3 +53,57 @@ lseg maintains an extra reference corresponding to the NFS_LSEG_VALID
bit which holds it in the pnfs_layout_hdr's list. When the final lseg
is removed from the pnfs_layout_hdr's list, the NFS_LAYOUT_DESTROYED
bit is set, preventing any new lsegs from being added.
+layout drivers
+PNFS utilizes what is called layout drivers. The STD defines 3 basic
+layout types: "files" "objects" and "blocks". For each of these types
+there is a layout-driver with a common function-vectors table which
+are called by the nfs-client pnfs-core to implement the different layout
+Files-layout-driver code is in: fs/nfs/nfs4filelayout.c && nfs4filelayoutdev.c
+Objects-layout-driver code is in: fs/nfs/objlayout/.. directory
+Blocks-layout-driver code is in: fs/nfs/blocklayout/.. directory
+objects-layout setup
+As part of the full STD implementation the objlayoutdriver.ko needs, at times,
+to automatically login to yet undiscovered iscsi/osd devices. For this the
+driver makes up-calls to a user-mode script called *osd_login*
+The path_name of the script to use is by default:
+ /sbin/osd_login.
+This name can be overridden by the Kernel module parameter:
+ objlayoutdriver.osd_login_prog
+If Kernel does not find the osd_login_prog path it will zero it out
+and will not attempt further logins. An admin can then write a new value
+to the objlayoutdriver.osd_login_prog Kernel parameter to re-enable it.
+The /sbin/osd_login is part of the nfs-utils package, and should usually
+be installed on distributions that support this Kernel version.
+The API to the login script is as follows:
+ Usage: $0 -u <URI> -o <OSDNAME> -s <SYSTEMID>
+ Options:
+ -u target uri e.g. iscsi://<ip>:<port>
+ (always exists)
+ (More protocols can be defined in the future.
+ The client does not interpret this string it is
+ passed unchanged as received from the Server)
+ -o osdname of the requested target OSD
+ (Might be empty)
+ (A string which denotes the OSD name, there is a
+ limit of 64 chars on this string)
+ -s systemid of the requested target OSD
+ (Might be empty)
+ (This string, if not empty is always a hex
+ representation of the 20 bytes osd_system_id)
+blocks-layout setup
+TODO: Document the setup needs of the blocks layout driver
diff --git a/Documentation/filesystems/pohmelfs/network_protocol.txt b/Documentation/filesystems/pohmelfs/network_protocol.txt
index 65e03dd4482..c680b4b5353 100644
--- a/Documentation/filesystems/pohmelfs/network_protocol.txt
+++ b/Documentation/filesystems/pohmelfs/network_protocol.txt
@@ -20,7 +20,7 @@ Commands can be embedded into transaction command (which in turn has own command
so one can extend protocol as needed without breaking backward compatibility as long
as old commands are supported. All string lengths include tail 0 byte.
-All commands are transferred over the network in big-endian. CPU endianess is used at the end peers.
+All commands are transferred over the network in big-endian. CPU endianness is used at the end peers.
@cmd - command number, which specifies command to be processed. Following
commands are used currently:
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index b4a3d765ff9..74acd961881 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -429,3 +429,9 @@ filemap_write_and_wait_range() so that all dirty pages are synced out properly.
You must also keep in mind that ->fsync() is not called with i_mutex held
anymore, so if you require i_mutex locking you must make sure to take it and
release it yourself.
+ d_alloc_root() is gone, along with a lot of bugs caused by code
+misusing it. Replacement: d_make_root(inode). The difference is,
+d_make_root() drops the reference to inode if dentry allocation fails.
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index a76a26a1db8..b7413cb46dc 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -290,7 +290,7 @@ Table 1-4: Contents of the stat files (as of 2.6.30-rc7)
rsslim current limit in bytes on the rss
start_code address above which program text can run
end_code address below which program text can run
- start_stack address of the start of the stack
+ start_stack address of the start of the main process stack
esp current value of ESP
eip current value of EIP
pending bitmap of pending signals
@@ -325,7 +325,7 @@ address perms offset dev inode pathname
a7cb1000-a7cb2000 ---p 00000000 00:00 0
a7cb2000-a7eb2000 rw-p 00000000 00:00 0
a7eb2000-a7eb3000 ---p 00000000 00:00 0
-a7eb3000-a7ed5000 rw-p 00000000 00:00 0
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack:1001]
a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/
a8008000-a800a000 r--p 00133000 03:00 4222 /lib/
a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/
@@ -357,11 +357,39 @@ is not associated with a file:
[heap] = the heap of the program
[stack] = the stack of the main process
+ [stack:1001] = the stack of the thread with tid 1001
[vdso] = the "virtual dynamic shared object",
the kernel system call handler
or if empty, the mapping is anonymous.
+The /proc/PID/task/TID/maps is a view of the virtual memory from the viewpoint
+of the individual tasks of a process. In this file you will see a mapping marked
+as [stack] if that task sees it as a stack. This is a key difference from the
+content of /proc/PID/maps, where you will see all mappings that are being used
+as stack by all of those tasks. Hence, for the example above, the task-level
+map, i.e. /proc/PID/task/TID/maps for thread 1001 will look like this:
+08048000-08049000 r-xp 00000000 03:00 8312 /opt/test
+08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test
+0804a000-0806b000 rw-p 00000000 00:00 0 [heap]
+a7cb1000-a7cb2000 ---p 00000000 00:00 0
+a7cb2000-a7eb2000 rw-p 00000000 00:00 0
+a7eb2000-a7eb3000 ---p 00000000 00:00 0
+a7eb3000-a7ed5000 rw-p 00000000 00:00 0 [stack]
+a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/
+a8008000-a800a000 r--p 00133000 03:00 4222 /lib/
+a800a000-a800b000 rw-p 00135000 03:00 4222 /lib/
+a800b000-a800e000 rw-p 00000000 00:00 0
+a800e000-a8022000 r-xp 00000000 03:00 14462 /lib/
+a8022000-a8023000 r--p 00013000 03:00 14462 /lib/
+a8023000-a8024000 rw-p 00014000 03:00 14462 /lib/
+a8024000-a8027000 rw-p 00000000 00:00 0
+a8027000-a8043000 r-xp 00000000 03:00 8317 /lib/
+a8043000-a8044000 r--p 0001b000 03:00 8317 /lib/
+a8044000-a8045000 rw-p 0001c000 03:00 8317 /lib/
+aff35000-aff4a000 rw-p 00000000 00:00 0
+ffffe000-fffff000 r-xp 00000000 00:00 0 [vdso]
The /proc/PID/smaps is an extension based on maps, showing the memory
consumption for each of the process's mappings. For each of mappings there
diff --git a/Documentation/filesystems/qnx6.txt b/Documentation/filesystems/qnx6.txt
new file mode 100644
index 00000000000..050223ea03c
--- /dev/null
+++ b/Documentation/filesystems/qnx6.txt
@@ -0,0 +1,174 @@
+The QNX6 Filesystem
+The qnx6fs is used by newer QNX operating system versions. (e.g. Neutrino)
+It got introduced in QNX 6.4.0 and is used by default since 6.4.1.
+mmi_fs Mount filesystem as used for example by Audi MMI 3G system
+qnx6fs shares many properties with traditional Unix filesystems. It has the
+concepts of blocks, inodes and directories.
+On QNX it is possible to create little endian and big endian qnx6 filesystems.
+This feature makes it possible to create and use a different endianness fs
+for the target (QNX is used on quite a range of embedded systems) platform
+running on a different endianness.
+The Linux driver handles endianness transparently. (LE and BE)
+The space in the device or file is split up into blocks. These are a fixed
+size of 512, 1024, 2048 or 4096, which is decided when the filesystem is
+Blockpointers are 32bit, so the maximum space that can be addressed is
+2^32 * 4096 bytes or 16TB
+The superblocks
+The superblock contains all global information about the filesystem.
+Each qnx6fs got two superblocks, each one having a 64bit serial number.
+That serial number is used to identify the "active" superblock.
+In write mode with each new snapshot (after each synchronous write), the
+serial of the new master superblock is increased (old superblock serial + 1)
+So basically the snapshot functionality is realized by an atomic final
+update of the serial number. Before updating that serial, all modifications
+are done by copying all modified blocks during that specific write request
+(or period) and building up a new (stable) filesystem structure under the
+inactive superblock.
+Each superblock holds a set of root inodes for the different filesystem
+parts. (Inode, Bitmap and Longfilenames)
+Each of these root nodes holds information like total size of the stored
+data and the addressing levels in that specific tree.
+If the level value is 0, up to 16 direct blocks can be addressed by each
+Level 1 adds an additional indirect addressing level where each indirect
+addressing block holds up to blocksize / 4 bytes pointers to data blocks.
+Level 2 adds an additional indirect addressing block level (so, already up
+to 16 * 256 * 256 = 1048576 blocks that can be addressed by such a tree)
+Unused block pointers are always set to ~0 - regardless of root node,
+indirect addressing blocks or inodes.
+Data leaves are always on the lowest level. So no data is stored on upper
+tree levels.
+The first Superblock is located at 0x2000. (0x2000 is the bootblock size)
+The Audi MMI 3G first superblock directly starts at byte 0.
+Second superblock position can either be calculated from the superblock
+information (total number of filesystem blocks) or by taking the highest
+device address, zeroing the last 3 bytes and then subtracting 0x1000 from
+that address.
+0x1000 is the size reserved for each superblock - regardless of the
+blocksize of the filesystem.
+Each object in the filesystem is represented by an inode. (index node)
+The inode structure contains pointers to the filesystem blocks which contain
+the data held in the object and all of the metadata about an object except
+its longname. (filenames longer than 27 characters)
+The metadata about an object includes the permissions, owner, group, flags,
+size, number of blocks used, access time, change time and modification time.
+Object mode field is POSIX format. (which makes things easier)
+There are also pointers to the first 16 blocks, if the object data can be
+addressed with 16 direct blocks.
+For more than 16 blocks an indirect addressing in form of another tree is
+used. (scheme is the same as the one used for the superblock root nodes)
+The filesize is stored 64bit. Inode counting starts with 1. (whilst long
+filename inodes start with 0)
+A directory is a filesystem object and has an inode just like a file.
+It is a specially formatted file containing records which associate each
+name with an inode number.
+'.' inode number points to the directory inode
+'..' inode number points to the parent directory inode
+Each filename record additionally got a filename length field.
+One special case are long filenames or subdirectory names.
+These got set a filename length field of 0xff in the corresponding directory
+record plus the longfile inode number also stored in that record.
+With that longfilename inode number, the longfilename tree can be walked
+starting with the superblock longfilename root node pointers.
+Special files
+Symbolic links are also filesystem objects with inodes. They got a specific
+bit in the inode mode field identifying them as symbolic link.
+The directory entry file inode pointer points to the target file inode.
+Hard links got an inode, a directory entry, but a specific mode bit set,
+no block pointers and the directory file record pointing to the target file
+Character and block special devices do not exist in QNX as those files
+are handled by the QNX kernel/drivers and created in /dev independent of the
+underlying filesystem.
+Long filenames
+Long filenames are stored in a separate addressing tree. The starting point
+is the longfilename root node in the active superblock.
+Each data block (tree leaves) holds one long filename. That filename is
+limited to 510 bytes. The first two starting bytes are used as length field
+for the actual filename.
+If that structure shall fit for all allowed blocksizes, it is clear why there
+is a limit of 510 bytes for the actual filename stored.
+The qnx6fs filesystem allocation bitmap is stored in a tree under bitmap
+root node in the superblock and each bit in the bitmap represents one
+filesystem block.
+The first block is block 0, which starts 0x1000 after superblock start.
+So for a normal qnx6fs 0x3000 (bootblock + superblock) is the physical
+address at which block 0 is located.
+Bits at the end of the last bitmap block are set to 1, if the device is
+smaller than addressing space in the bitmap.
+Bitmap system area
+The bitmap itself is divided into three parts.
+First the system area, that is split into two halves.
+Then userspace.
+The requirement for a static, fixed preallocated system area comes from how
+qnx6fs deals with writes.
+Each superblock got its own half of the system area. So superblock #1
+always uses blocks from the lower half whilst superblock #2 just writes to
+blocks represented by the upper half bitmap system area bits.
+Bitmap blocks, Inode blocks and indirect addressing blocks for those two
+tree structures are treated as system blocks.
+The rationale behind that is that a write request can work on a new snapshot
+(system area of the inactive - resp. lower serial numbered superblock) while
+at the same time there is still a complete stable filesystem structure in the
+other half of the system area.
+When finished with writing (a sync write is completed, the maximum sync leap
+time or a filesystem sync is requested), serial of the previously inactive
+superblock atomically is increased and the fs switches over to that - then
+stable declared - superblock.
+For all data outside the system area, blocks are just copied while writing.
diff --git a/Documentation/filesystems/ramfs-rootfs-initramfs.txt b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
index a8273d5fad2..59b4a0962e0 100644
--- a/Documentation/filesystems/ramfs-rootfs-initramfs.txt
+++ b/Documentation/filesystems/ramfs-rootfs-initramfs.txt
@@ -297,7 +297,7 @@ the above threads) is:
either way about the archive format, and there are alternative tools,
such as:
2) The cpio archive format chosen by the kernel is simpler and cleaner (and
thus easier to create and parse) than any of the (literally dozens of)
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 3d9393b845b..0d049202808 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -114,7 +114,7 @@ members are defined:
struct file_system_type {
const char *name;
int fs_flags;
- struct dentry (*mount) (struct file_system_type *, int,
+ struct dentry *(*mount) (struct file_system_type *, int,
const char *, void *);
void (*kill_sb) (struct super_block *);
struct module *owner;
@@ -993,7 +993,7 @@ struct dentry_operations {
If the 'rcu_walk' parameter is true, then the caller is doing a
pathwalk in RCU-walk mode. Sleeping is not permitted in this mode,
- and the caller can be asked to leave it and call again by returing
+ and the caller can be asked to leave it and call again by returning
This function is only used if DCACHE_MANAGE_TRANSIT is set on the