summaryrefslogtreecommitdiffstats
path: root/Documentation/DocBook
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/DocBook')
-rw-r--r--Documentation/DocBook/Makefile195
-rw-r--r--Documentation/DocBook/deviceiobook.tmpl341
-rw-r--r--Documentation/DocBook/gadget.tmpl752
-rw-r--r--Documentation/DocBook/journal-api.tmpl333
-rw-r--r--Documentation/DocBook/kernel-api.tmpl342
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl1349
-rw-r--r--Documentation/DocBook/kernel-locking.tmpl2088
-rw-r--r--Documentation/DocBook/libata.tmpl282
-rw-r--r--Documentation/DocBook/librs.tmpl289
-rw-r--r--Documentation/DocBook/lsm.tmpl265
-rw-r--r--Documentation/DocBook/man/Makefile3
-rw-r--r--Documentation/DocBook/mcabook.tmpl107
-rw-r--r--Documentation/DocBook/mtdnand.tmpl1320
-rw-r--r--Documentation/DocBook/procfs-guide.tmpl591
-rw-r--r--Documentation/DocBook/procfs_example.c224
-rw-r--r--Documentation/DocBook/scsidrivers.tmpl193
-rw-r--r--Documentation/DocBook/sis900.tmpl585
-rw-r--r--Documentation/DocBook/tulip-user.tmpl327
-rw-r--r--Documentation/DocBook/usb.tmpl979
-rw-r--r--Documentation/DocBook/via-audio.tmpl597
-rw-r--r--Documentation/DocBook/videobook.tmpl1663
-rw-r--r--Documentation/DocBook/wanbook.tmpl99
-rw-r--r--Documentation/DocBook/writing_usb_driver.tmpl419
-rw-r--r--Documentation/DocBook/z8530book.tmpl385
24 files changed, 13728 insertions, 0 deletions
diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile
new file mode 100644
index 000000000000..a221039ee4c9
--- /dev/null
+++ b/Documentation/DocBook/Makefile
@@ -0,0 +1,195 @@
+###
+# This makefile is used to generate the kernel documentation,
+# primarily based on in-line comments in various source files.
+# See Documentation/kernel-doc-nano-HOWTO.txt for instructions on how
+# to document the SRC - and how to read it.
+# To add a new book the only step required is to add the book to the
+# list of DOCBOOKS.
+
+DOCBOOKS := wanbook.xml z8530book.xml mcabook.xml videobook.xml \
+ kernel-hacking.xml kernel-locking.xml via-audio.xml \
+ deviceiobook.xml procfs-guide.xml tulip-user.xml \
+ writing_usb_driver.xml scsidrivers.xml sis900.xml \
+ kernel-api.xml journal-api.xml lsm.xml usb.xml \
+ gadget.xml libata.xml mtdnand.xml librs.xml
+
+###
+# The build process is as follows (targets):
+# (xmldocs)
+# file.tmpl --> file.xml +--> file.ps (psdocs)
+# +--> file.pdf (pdfdocs)
+# +--> DIR=file (htmldocs)
+# +--> man/ (mandocs)
+
+###
+# The targets that may be used.
+.PHONY: xmldocs sgmldocs psdocs pdfdocs htmldocs mandocs installmandocs dochelp
+
+BOOKS := $(addprefix $(obj)/,$(DOCBOOKS))
+xmldocs: $(BOOKS)
+sgmldocs: xmldocs
+
+PS := $(patsubst %.xml, %.ps, $(BOOKS))
+psdocs: $(PS)
+
+PDF := $(patsubst %.xml, %.pdf, $(BOOKS))
+pdfdocs: $(PDF)
+
+HTML := $(patsubst %.xml, %.html, $(BOOKS))
+htmldocs: $(HTML)
+
+MAN := $(patsubst %.xml, %.9, $(BOOKS))
+mandocs: $(MAN)
+
+installmandocs: mandocs
+ $(MAKEMAN) install Documentation/DocBook/man
+
+###
+#External programs used
+KERNELDOC = scripts/kernel-doc
+DOCPROC = scripts/basic/docproc
+SPLITMAN = $(PERL) $(srctree)/scripts/split-man
+MAKEMAN = $(PERL) $(srctree)/scripts/makeman
+
+###
+# DOCPROC is used for two purposes:
+# 1) To generate a dependency list for a .tmpl file
+# 2) To preprocess a .tmpl file and call kernel-doc with
+# appropriate parameters.
+# The following rules are used to generate the .xml documentation
+# required to generate the final targets (ps, pdf, html).
+quiet_cmd_docproc = DOCPROC $@
+ cmd_docproc = SRCTREE=$(srctree)/ $(DOCPROC) doc $< >$@
+define rule_docproc
+ set -e; \
+ $(if $($(quiet)cmd_$(1)),echo ' $($(quiet)cmd_$(1))';) \
+ $(cmd_$(1)); \
+ ( \
+ echo 'cmd_$@ := $(cmd_$(1))'; \
+ echo $@: `SRCTREE=$(srctree) $(DOCPROC) depend $<`; \
+ ) > $(dir $@).$(notdir $@).cmd
+endef
+
+%.xml: %.tmpl FORCE
+ $(call if_changed_rule,docproc)
+
+###
+#Read in all saved dependency files
+cmd_files := $(wildcard $(foreach f,$(BOOKS),$(dir $(f)).$(notdir $(f)).cmd))
+
+ifneq ($(cmd_files),)
+ include $(cmd_files)
+endif
+
+###
+# Changes in kernel-doc force a rebuild of all documentation
+$(BOOKS): $(KERNELDOC)
+
+###
+# procfs guide uses a .c file as example code.
+# This requires an explicit dependency
+C-procfs-example = procfs_example.xml
+C-procfs-example2 = $(addprefix $(obj)/,$(C-procfs-example))
+$(obj)/procfs-guide.xml: $(C-procfs-example2)
+
+###
+# Rules to generate postscript, PDF and HTML
+# db2html creates a directory. Generate a html file used for timestamp
+
+quiet_cmd_db2ps = DB2PS $@
+ cmd_db2ps = db2ps -o $(dir $@) $<
+%.ps : %.xml
+ @(which db2ps > /dev/null 2>&1) || \
+ (echo "*** You need to install DocBook stylesheets ***"; \
+ exit 1)
+ $(call cmd,db2ps)
+
+quiet_cmd_db2pdf = DB2PDF $@
+ cmd_db2pdf = db2pdf -o $(dir $@) $<
+%.pdf : %.xml
+ @(which db2pdf > /dev/null 2>&1) || \
+ (echo "*** You need to install DocBook stylesheets ***"; \
+ exit 1)
+ $(call cmd,db2pdf)
+
+quiet_cmd_db2html = DB2HTML $@
+ cmd_db2html = db2html -o $(patsubst %.html,%,$@) $< && \
+ echo '<a HREF="$(patsubst %.html,%,$(notdir $@))/book1.html"> \
+ Goto $(patsubst %.html,%,$(notdir $@))</a><p>' > $@
+
+%.html: %.xml
+ @(which db2html > /dev/null 2>&1) || \
+ (echo "*** You need to install DocBook stylesheets ***"; \
+ exit 1)
+ @rm -rf $@ $(patsubst %.html,%,$@)
+ $(call cmd,db2html)
+ @if [ ! -z "$(PNG-$(basename $(notdir $@)))" ]; then \
+ cp $(PNG-$(basename $(notdir $@))) $(patsubst %.html,%,$@); fi
+
+###
+# Rule to generate man files - output is placed in the man subdirectory
+
+%.9: %.xml
+ifneq ($(KBUILD_SRC),)
+ $(Q)mkdir -p $(objtree)/Documentation/DocBook/man
+endif
+ $(SPLITMAN) $< $(objtree)/Documentation/DocBook/man "$(VERSION).$(PATCHLEVEL).$(SUBLEVEL)"
+ $(MAKEMAN) convert $(objtree)/Documentation/DocBook/man $<
+
+###
+# Rules to generate PostScript and PNG images from .fig format files
+quiet_cmd_fig2eps = FIG2EPS $@
+ cmd_fig2eps = fig2dev -Leps $< $@
+
+%.eps: %.fig
+ @(which fig2dev > /dev/null 2>&1) || \
+ (echo "*** You need to install transfig ***"; \
+ exit 1)
+ $(call cmd,fig2eps)
+
+quiet_cmd_fig2png = FIG2PNG $@
+ cmd_fig2png = fig2dev -Lpng $< $@
+
+%.png: %.fig
+ @(which fig2dev > /dev/null 2>&1) || \
+ (echo "*** You need to install transfig ***"; \
+ exit 1)
+ $(call cmd,fig2png)
+
+###
+# Rule to convert a .c file to inline XML documentation
+%.xml: %.c
+ @echo ' GEN $@'
+ @( \
+ echo "<programlisting>"; \
+ expand --tabs=8 < $< | \
+ sed -e "s/&/\\&amp;/g" \
+ -e "s/</\\&lt;/g" \
+ -e "s/>/\\&gt;/g"; \
+ echo "</programlisting>") > $@
+
+###
+# Help targets as used by the top-level makefile
+dochelp:
+ @echo ' Linux kernel internal documentation in different formats:'
+ @echo ' xmldocs (XML DocBook), psdocs (Postscript), pdfdocs (PDF)'
+ @echo ' htmldocs (HTML), mandocs (man pages, use installmandocs to install)'
+
+###
+# Temporary files left by various tools
+clean-files := $(DOCBOOKS) \
+ $(patsubst %.xml, %.dvi, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.aux, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.tex, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.log, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.out, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.ps, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.pdf, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.html, $(DOCBOOKS)) \
+ $(patsubst %.xml, %.9, $(DOCBOOKS)) \
+ $(C-procfs-example)
+
+clean-dirs := $(patsubst %.xml,%,$(DOCBOOKS))
+
+# Man pages are put in the man subdir - traverse down into it
+subdir- := man/
diff --git a/Documentation/DocBook/deviceiobook.tmpl b/Documentation/DocBook/deviceiobook.tmpl
new file mode 100644
index 000000000000..6f41f2f5c6f6
--- /dev/null
+++ b/Documentation/DocBook/deviceiobook.tmpl
@@ -0,0 +1,341 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="DoingIO">
+ <bookinfo>
+ <title>Bus-Independent Device Accesses</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Matthew</firstname>
+ <surname>Wilcox</surname>
+ <affiliation>
+ <address>
+ <email>matthew@wil.cx</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@redhat.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2001</year>
+ <holder>Matthew Wilcox</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ Linux provides an API which abstracts performing IO across all busses
+ and devices, allowing device drivers to be written independently of
+ bus type.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="mmio">
+ <title>Memory Mapped IO</title>
+ <sect1>
+ <title>Getting Access to the Device</title>
+ <para>
+ The most widely supported form of IO is memory mapped IO.
+ That is, a part of the CPU's address space is interpreted
+ not as accesses to memory, but as accesses to a device. Some
+ architectures define devices to be at a fixed address, but most
+ have some method of discovering devices. The PCI bus walk is a
+ good example of such a scheme. This document does not cover how
+ to receive such an address, but assumes you are starting with one.
+ Physical addresses are of type unsigned long.
+ </para>
+
+ <para>
+ This address should not be used directly. Instead, to get an
+ address suitable for passing to the accessor functions described
+ below, you should call <function>ioremap</function>.
+ An address suitable for accessing the device will be returned to you.
+ </para>
+
+ <para>
+ After you've finished using the device (say, in your module's
+ exit routine), call <function>iounmap</function> in order to return
+ the address space to the kernel. Most architectures allocate new
+ address space each time you call <function>ioremap</function>, and
+ they can run out unless you call <function>iounmap</function>.
+ </para>
+ </sect1>
+
+ <sect1>
+ <title>Accessing the device</title>
+ <para>
+ The part of the interface most used by drivers is reading and
+ writing memory-mapped registers on the device. Linux provides
+ interfaces to read and write 8-bit, 16-bit, 32-bit and 64-bit
+ quantities. Due to a historical accident, these are named byte,
+ word, long and quad accesses. Both read and write accesses are
+ supported; there is no prefetch support at this time.
+ </para>
+
+ <para>
+ The functions are named <function>readb</function>,
+ <function>readw</function>, <function>readl</function>,
+ <function>readq</function>, <function>readb_relaxed</function>,
+ <function>readw_relaxed</function>, <function>readl_relaxed</function>,
+ <function>readq_relaxed</function>, <function>writeb</function>,
+ <function>writew</function>, <function>writel</function> and
+ <function>writeq</function>.
+ </para>
+
+ <para>
+ Some devices (such as framebuffers) would like to use larger
+ transfers than 8 bytes at a time. For these devices, the
+ <function>memcpy_toio</function>, <function>memcpy_fromio</function>
+ and <function>memset_io</function> functions are provided.
+ Do not use memset or memcpy on IO addresses; they
+ are not guaranteed to copy data in order.
+ </para>
+
+ <para>
+ The read and write functions are defined to be ordered. That is, the
+ compiler is not permitted to reorder the I/O sequence. When the
+ ordering can be compiler optimised, you can use <function>
+ __readb</function> and friends to indicate the relaxed ordering. Use
+ this with care.
+ </para>
+
+ <para>
+ While the basic functions are defined to be synchronous with respect
+ to each other and ordered with respect to each other the busses the
+ devices sit on may themselves have asynchronicity. In particular many
+ authors are burned by the fact that PCI bus writes are posted
+ asynchronously. A driver author must issue a read from the same
+ device to ensure that writes have occurred in the specific cases the
+ author cares about. This kind of property cannot be hidden from driver
+ writers in the API. In some cases, the read used to flush the device
+ may be expected to fail (if the card is resetting, for example). In
+ that case, the read should be done from config space, which is
+ guaranteed to soft-fail if the card doesn't respond.
+ </para>
+
+ <para>
+ The following is an example of flushing a write to a device when
+ the driver would like to ensure the write's effects are visible prior
+ to continuing execution.
+ </para>
+
+<programlisting>
+static inline void
+qla1280_disable_intrs(struct scsi_qla_host *ha)
+{
+ struct device_reg *reg;
+
+ reg = ha->iobase;
+ /* disable risc and host interrupts */
+ WRT_REG_WORD(&amp;reg->ictrl, 0);
+ /*
+ * The following read will ensure that the above write
+ * has been received by the device before we return from this
+ * function.
+ */
+ RD_REG_WORD(&amp;reg->ictrl);
+ ha->flags.ints_enabled = 0;
+}
+</programlisting>
+
+ <para>
+ In addition to write posting, on some large multiprocessing systems
+ (e.g. SGI Challenge, Origin and Altix machines) posted writes won't
+ be strongly ordered coming from different CPUs. Thus it's important
+ to properly protect parts of your driver that do memory-mapped writes
+ with locks and use the <function>mmiowb</function> to make sure they
+ arrive in the order intended. Issuing a regular <function>readX
+ </function> will also ensure write ordering, but should only be used
+ when the driver has to be sure that the write has actually arrived
+ at the device (not that it's simply ordered with respect to other
+ writes), since a full <function>readX</function> is a relatively
+ expensive operation.
+ </para>
+
+ <para>
+ Generally, one should use <function>mmiowb</function> prior to
+ releasing a spinlock that protects regions using <function>writeb
+ </function> or similar functions that aren't surrounded by <function>
+ readb</function> calls, which will ensure ordering and flushing. The
+ following pseudocode illustrates what might occur if write ordering
+ isn't guaranteed via <function>mmiowb</function> or one of the
+ <function>readX</function> functions.
+ </para>
+
+<programlisting>
+CPU A: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU A: ...
+CPU A: writel(newval, ring_ptr);
+CPU A: spin_unlock_irqrestore(&amp;dev_lock, flags)
+ ...
+CPU B: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU B: writel(newval2, ring_ptr);
+CPU B: ...
+CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
+</programlisting>
+
+ <para>
+ In the case above, newval2 could be written to ring_ptr before
+ newval. Fixing it is easy though:
+ </para>
+
+<programlisting>
+CPU A: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU A: ...
+CPU A: writel(newval, ring_ptr);
+CPU A: mmiowb(); /* ensure no other writes beat us to the device */
+CPU A: spin_unlock_irqrestore(&amp;dev_lock, flags)
+ ...
+CPU B: spin_lock_irqsave(&amp;dev_lock, flags)
+CPU B: writel(newval2, ring_ptr);
+CPU B: ...
+CPU B: mmiowb();
+CPU B: spin_unlock_irqrestore(&amp;dev_lock, flags)
+</programlisting>
+
+ <para>
+ See tg3.c for a real world example of how to use <function>mmiowb
+ </function>.
+ </para>
+
+ <para>
+ PCI ordering rules also guarantee that PIO read responses arrive
+ after any outstanding DMA writes from that bus, since for some devices
+ the result of a <function>readb</function> call may signal to the
+ driver that a DMA transaction is complete. In many cases, however,
+ the driver may want to indicate that the next
+ <function>readb</function> call has no relation to any previous DMA
+ writes performed by the device. The driver can use
+ <function>readb_relaxed</function> for these cases, although only
+ some platforms will honor the relaxed semantics. Using the relaxed
+ read functions will provide significant performance benefits on
+ platforms that support it. The qla2xxx driver provides examples
+ of how to use <function>readX_relaxed</function>. In many cases,
+ a majority of the driver's <function>readX</function> calls can
+ safely be converted to <function>readX_relaxed</function> calls, since
+ only a few will indicate or depend on DMA completion.
+ </para>
+ </sect1>
+
+ <sect1>
+ <title>ISA legacy functions</title>
+ <para>
+ On older kernels (2.2 and earlier) the ISA bus could be read or
+ written with these functions and without ioremap being used. This is
+ no longer true in Linux 2.4. A set of equivalent functions exist for
+ easy legacy driver porting. The functions available are prefixed
+ with 'isa_' and are <function>isa_readb</function>,
+ <function>isa_writeb</function>, <function>isa_readw</function>,
+ <function>isa_writew</function>, <function>isa_readl</function>,
+ <function>isa_writel</function>, <function>isa_memcpy_fromio</function>
+ and <function>isa_memcpy_toio</function>.
+ </para>
+ <para>
+ These functions should not be used in new drivers, and will
+ eventually be going away.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter>
+ <title>Port Space Accesses</title>
+ <sect1>
+ <title>Port Space Explained</title>
+
+ <para>
+ Another form of IO commonly supported is Port Space. This is a
+ range of addresses separate to the normal memory address space.
+ Access to these addresses is generally not as fast as accesses
+ to the memory mapped addresses, and it also has a potentially
+ smaller address space.
+ </para>
+
+ <para>
+ Unlike memory mapped IO, no preparation is required
+ to access port space.
+ </para>
+
+ </sect1>
+ <sect1>
+ <title>Accessing Port Space</title>
+ <para>
+ Accesses to this space are provided through a set of functions
+ which allow 8-bit, 16-bit and 32-bit accesses; also
+ known as byte, word and long. These functions are
+ <function>inb</function>, <function>inw</function>,
+ <function>inl</function>, <function>outb</function>,
+ <function>outw</function> and <function>outl</function>.
+ </para>
+
+ <para>
+ Some variants are provided for these functions. Some devices
+ require that accesses to their ports are slowed down. This
+ functionality is provided by appending a <function>_p</function>
+ to the end of the function. There are also equivalents to memcpy.
+ The <function>ins</function> and <function>outs</function>
+ functions copy bytes, words or longs to the given port.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Einclude/asm-i386/io.h
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/gadget.tmpl b/Documentation/DocBook/gadget.tmpl
new file mode 100644
index 000000000000..a34442436128
--- /dev/null
+++ b/Documentation/DocBook/gadget.tmpl
@@ -0,0 +1,752 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="USB-Gadget-API">
+ <bookinfo>
+ <title>USB Gadget API for Linux</title>
+ <date>20 August 2004</date>
+ <edition>20 August 2004</edition>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ <copyright>
+ <year>2003-2004</year>
+ <holder>David Brownell</holder>
+ </copyright>
+
+ <author>
+ <firstname>David</firstname>
+ <surname>Brownell</surname>
+ <affiliation>
+ <address><email>dbrownell@users.sourceforge.net</email></address>
+ </affiliation>
+ </author>
+ </bookinfo>
+
+<toc></toc>
+
+<chapter><title>Introduction</title>
+
+<para>This document presents a Linux-USB "Gadget"
+kernel mode
+API, for use within peripherals and other USB devices
+that embed Linux.
+It provides an overview of the API structure,
+and shows how that fits into a system development project.
+This is the first such API released on Linux to address
+a number of important problems, including: </para>
+
+<itemizedlist>
+ <listitem><para>Supports USB 2.0, for high speed devices which
+ can stream data at several dozen megabytes per second.
+ </para></listitem>
+ <listitem><para>Handles devices with dozens of endpoints just as
+ well as ones with just two fixed-function ones. Gadget drivers
+ can be written so they're easy to port to new hardware.
+ </para></listitem>
+ <listitem><para>Flexible enough to expose more complex USB device
+ capabilities such as multiple configurations, multiple interfaces,
+ composite devices,
+ and alternate interface settings.
+ </para></listitem>
+ <listitem><para>USB "On-The-Go" (OTG) support, in conjunction
+ with updates to the Linux-USB host side.
+ </para></listitem>
+ <listitem><para>Sharing data structures and API models with the
+ Linux-USB host side API. This helps the OTG support, and
+ looks forward to more-symmetric frameworks (where the same
+ I/O model is used by both host and device side drivers).
+ </para></listitem>
+ <listitem><para>Minimalist, so it's easier to support new device
+ controller hardware. I/O processing doesn't imply large
+ demands for memory or CPU resources.
+ </para></listitem>
+</itemizedlist>
+
+
+<para>Most Linux developers will not be able to use this API, since they
+have USB "host" hardware in a PC, workstation, or server.
+Linux users with embedded systems are more likely to
+have USB peripheral hardware.
+To distinguish drivers running inside such hardware from the
+more familiar Linux "USB device drivers",
+which are host side proxies for the real USB devices,
+a different term is used:
+the drivers inside the peripherals are "USB gadget drivers".
+In USB protocol interactions, the device driver is the master
+(or "client driver")
+and the gadget driver is the slave (or "function driver").
+</para>
+
+<para>The gadget API resembles the host side Linux-USB API in that both
+use queues of request objects to package I/O buffers, and those requests
+may be submitted or canceled.
+They share common definitions for the standard USB
+<emphasis>Chapter 9</emphasis> messages, structures, and constants.
+Also, both APIs bind and unbind drivers to devices.
+The APIs differ in detail, since the host side's current
+URB framework exposes a number of implementation details
+and assumptions that are inappropriate for a gadget API.
+While the model for control transfers and configuration
+management is necessarily different (one side is a hardware-neutral master,
+the other is a hardware-aware slave), the endpoint I/O API used here
+should also be usable for an overhead-reduced host side API.
+</para>
+
+</chapter>
+
+<chapter id="structure"><title>Structure of Gadget Drivers</title>
+
+<para>A system running inside a USB peripheral
+normally has at least three layers inside the kernel to handle
+USB protocol processing, and may have additional layers in
+user space code.
+The "gadget" API is used by the middle layer to interact
+with the lowest level (which directly handles hardware).
+</para>
+
+<para>In Linux, from the bottom up, these layers are:
+</para>
+
+<variablelist>
+
+ <varlistentry>
+ <term><emphasis>USB Controller Driver</emphasis></term>
+
+ <listitem>
+ <para>This is the lowest software level.
+ It is the only layer that talks to hardware,
+ through registers, fifos, dma, irqs, and the like.
+ The <filename>&lt;linux/usb_gadget.h&gt;</filename> API abstracts
+ the peripheral controller endpoint hardware.
+ That hardware is exposed through endpoint objects, which accept
+ streams of IN/OUT buffers, and through callbacks that interact
+ with gadget drivers.
+ Since normal USB devices only have one upstream
+ port, they only have one of these drivers.
+ The controller driver can support any number of different
+ gadget drivers, but only one of them can be used at a time.
+ </para>
+
+ <para>Examples of such controller hardware include
+ the PCI-based NetChip 2280 USB 2.0 high speed controller,
+ the SA-11x0 or PXA-25x UDC (found within many PDAs),
+ and a variety of other products.
+ </para>
+
+ </listitem></varlistentry>
+
+ <varlistentry>
+ <term><emphasis>Gadget Driver</emphasis></term>
+
+ <listitem>
+ <para>The lower boundary of this driver implements hardware-neutral
+ USB functions, using calls to the controller driver.
+ Because such hardware varies widely in capabilities and restrictions,
+ and is used in embedded environments where space is at a premium,
+ the gadget driver is often configured at compile time
+ to work with endpoints supported by one particular controller.
+ Gadget drivers may be portable to several different controllers,
+ using conditional compilation.
+ (Recent kernels substantially simplify the work involved in
+ supporting new hardware, by <emphasis>autoconfiguring</emphasis>
+ endpoints automatically for many bulk-oriented drivers.)
+ Gadget driver responsibilities include:
+ </para>
+ <itemizedlist>
+ <listitem><para>handling setup requests (ep0 protocol responses)
+ possibly including class-specific functionality
+ </para></listitem>
+ <listitem><para>returning configuration and string descriptors
+ </para></listitem>
+ <listitem><para>(re)setting configurations and interface
+ altsettings, including enabling and configuring endpoints
+ </para></listitem>
+ <listitem><para>handling life cycle events, such as managing
+ bindings to hardware,
+ USB suspend/resume, remote wakeup,
+ and disconnection from the USB host.
+ </para></listitem>
+ <listitem><para>managing IN and OUT transfers on all currently
+ enabled endpoints
+ </para></listitem>
+ </itemizedlist>
+
+ <para>
+ Such drivers may be modules of proprietary code, although
+ that approach is discouraged in the Linux community.
+ </para>
+ </listitem></varlistentry>
+
+ <varlistentry>
+ <term><emphasis>Upper Level</emphasis></term>
+
+ <listitem>
+ <para>Most gadget drivers have an upper boundary that connects
+ to some driver or framework in Linux.
+ Through that boundary flows the data which the gadget driver
+ produces and/or consumes through protocol transfers over USB.
+ Examples include:
+ </para>
+ <itemizedlist>
+ <listitem><para>user mode code, using generic (gadgetfs)
+ or application specific files in
+ <filename>/dev</filename>
+ </para></listitem>
+ <listitem><para>networking subsystem (for network gadgets,
+ like the CDC Ethernet Model gadget driver)
+ </para></listitem>
+ <listitem><para>data capture drivers, perhaps Video4Linux or
+ a scanner driver; or test and measurement hardware.
+ </para></listitem>
+ <listitem><para>input subsystem (for HID gadgets)
+ </para></listitem>
+ <listitem><para>sound subsystem (for audio gadgets)
+ </para></listitem>
+ <listitem><para>file system (for PTP gadgets)
+ </para></listitem>
+ <listitem><para>block i/o subsystem (for usb-storage gadgets)
+ </para></listitem>
+ <listitem><para>... and more </para></listitem>
+ </itemizedlist>
+ </listitem></varlistentry>
+
+ <varlistentry>
+ <term><emphasis>Additional Layers</emphasis></term>
+
+ <listitem>
+ <para>Other layers may exist.
+ These could include kernel layers, such as network protocol stacks,
+ as well as user mode applications building on standard POSIX
+ system call APIs such as
+ <emphasis>open()</emphasis>, <emphasis>close()</emphasis>,
+ <emphasis>read()</emphasis> and <emphasis>write()</emphasis>.
+ On newer systems, POSIX Async I/O calls may be an option.
+ Such user mode code will not necessarily be subject to
+ the GNU General Public License (GPL).
+ </para>
+ </listitem></varlistentry>
+
+
+</variablelist>
+
+<para>OTG-capable systems will also need to include a standard Linux-USB
+host side stack,
+with <emphasis>usbcore</emphasis>,
+one or more <emphasis>Host Controller Drivers</emphasis> (HCDs),
+<emphasis>USB Device Drivers</emphasis> to support
+the OTG "Targeted Peripheral List",
+and so forth.
+There will also be an <emphasis>OTG Controller Driver</emphasis>,
+which is visible to gadget and device driver developers only indirectly.
+That helps the host and device side USB controllers implement the
+two new OTG protocols (HNP and SRP).
+Roles switch (host to peripheral, or vice versa) using HNP
+during USB suspend processing, and SRP can be viewed as a
+more battery-friendly kind of device wakeup protocol.
+</para>
+
+<para>Over time, reusable utilities are evolving to help make some
+gadget driver tasks simpler.
+For example, building configuration descriptors from vectors of
+descriptors for the configurations interfaces and endpoints is
+now automated, and many drivers now use autoconfiguration to
+choose hardware endpoints and initialize their descriptors.
+
+A potential example of particular interest
+is code implementing standard USB-IF protocols for
+HID, networking, storage, or audio classes.
+Some developers are interested in KDB or KGDB hooks, to let
+target hardware be remotely debugged.
+Most such USB protocol code doesn't need to be hardware-specific,
+any more than network protocols like X11, HTTP, or NFS are.
+Such gadget-side interface drivers should eventually be combined,
+to implement composite devices.
+</para>
+
+</chapter>
+
+
+<chapter id="api"><title>Kernel Mode Gadget API</title>
+
+<para>Gadget drivers declare themselves through a
+<emphasis>struct usb_gadget_driver</emphasis>, which is responsible for
+most parts of enumeration for a <emphasis>struct usb_gadget</emphasis>.
+The response to a set_configuration usually involves
+enabling one or more of the <emphasis>struct usb_ep</emphasis> objects
+exposed by the gadget, and submitting one or more
+<emphasis>struct usb_request</emphasis> buffers to transfer data.
+Understand those four data types, and their operations, and
+you will understand how this API works.
+</para>
+
+<note><title>Incomplete Data Type Descriptions</title>
+
+<para>This documentation was prepared using the standard Linux
+kernel <filename>docproc</filename> tool, which turns text
+and in-code comments into SGML DocBook and then into usable
+formats such as HTML or PDF.
+Other than the "Chapter 9" data types, most of the significant
+data types and functions are described here.
+</para>
+
+<para>However, docproc does not understand all the C constructs
+that are used, so some relevant information is likely omitted from
+what you are reading.
+One example of such information is endpoint autoconfiguration.
+You'll have to read the header file, and use example source
+code (such as that for "Gadget Zero"), to fully understand the API.
+</para>
+
+<para>The part of the API implementing some basic
+driver capabilities is specific to the version of the
+Linux kernel that's in use.
+The 2.6 kernel includes a <emphasis>driver model</emphasis>
+framework that has no analogue on earlier kernels;
+so those parts of the gadget API are not fully portable.
+(They are implemented on 2.4 kernels, but in a different way.)
+The driver model state is another part of this API that is
+ignored by the kerneldoc tools.
+</para>
+</note>
+
+<para>The core API does not expose
+every possible hardware feature, only the most widely available ones.
+There are significant hardware features, such as device-to-device DMA
+(without temporary storage in a memory buffer)
+that would be added using hardware-specific APIs.
+</para>
+
+<para>This API allows drivers to use conditional compilation to handle
+endpoint capabilities of different hardware, but doesn't require that.
+Hardware tends to have arbitrary restrictions, relating to
+transfer types, addressing, packet sizes, buffering, and availability.
+As a rule, such differences only matter for "endpoint zero" logic
+that handles device configuration and management.
+The API supports limited run-time
+detection of capabilities, through naming conventions for endpoints.
+Many drivers will be able to at least partially autoconfigure
+themselves.
+In particular, driver init sections will often have endpoint
+autoconfiguration logic that scans the hardware's list of endpoints
+to find ones matching the driver requirements
+(relying on those conventions), to eliminate some of the most
+common reasons for conditional compilation.
+</para>
+
+<para>Like the Linux-USB host side API, this API exposes
+the "chunky" nature of USB messages: I/O requests are in terms
+of one or more "packets", and packet boundaries are visible to drivers.
+Compared to RS-232 serial protocols, USB resembles
+synchronous protocols like HDLC
+(N bytes per frame, multipoint addressing, host as the primary
+station and devices as secondary stations)
+more than asynchronous ones
+(tty style: 8 data bits per frame, no parity, one stop bit).
+So for example the controller drivers won't buffer
+two single byte writes into a single two-byte USB IN packet,
+although gadget drivers may do so when they implement
+protocols where packet boundaries (and "short packets")
+are not significant.
+</para>
+
+<sect1 id="lifecycle"><title>Driver Life Cycle</title>
+
+<para>Gadget drivers make endpoint I/O requests to hardware without
+needing to know many details of the hardware, but driver
+setup/configuration code needs to handle some differences.
+Use the API like this:
+</para>
+
+<orderedlist numeration='arabic'>
+
+<listitem><para>Register a driver for the particular device side
+usb controller hardware,
+such as the net2280 on PCI (USB 2.0),
+sa11x0 or pxa25x as found in Linux PDAs,
+and so on.
+At this point the device is logically in the USB ch9 initial state
+("attached"), drawing no power and not usable
+(since it does not yet support enumeration).
+No host should see the device yet, since it has not
+activated the data line pullup used by the host to
+detect a device, even if VBUS power is available.
+</para></listitem>
+
+<listitem><para>Register a gadget driver that implements some higher level
+device function. That will then bind() to a usb_gadget, which
+activates the data line pullup sometime after detecting VBUS.
+</para></listitem>
+
+<listitem><para>The hardware driver can now start enumerating.
+The steps it handles are to accept USB power and set_address requests.
+Other steps are handled by the gadget driver.
+If the gadget driver module is unloaded before the host starts to
+enumerate, steps before step 7 are skipped.
+</para></listitem>
+
+<listitem><para>The gadget driver's setup() call returns usb descriptors,
+based both on what the bus interface hardware provides and on the
+functionality being implemented.
+That can involve alternate settings or configurations,
+unless the hardware prevents such operation.
+For OTG devices, each configuration descriptor includes
+an OTG descriptor.
+</para></listitem>
+
+<listitem><para>The gadget driver handles the last step of enumeration,
+when the USB host issues a set_configuration call.
+It enables all endpoints used in that configuration,
+with all interfaces in their default settings.
+That involves using a list of the hardware's endpoints, enabling each
+endpoint according to its descriptor.
+It may also involve using <function>usb_gadget_vbus_draw</function>
+to let more power be drawn from VBUS, as allowed by that configuration.
+For OTG devices, setting a configuration may also involve reporting
+HNP capabilities through a user interface.
+</para></listitem>
+
+<listitem><para>Do real work and perform data transfers, possibly involving
+changes to interface settings or switching to new configurations, until the
+device is disconnect()ed from the host.
+Queue any number of transfer requests to each endpoint.
+It may be suspended and resumed several times before being disconnected.
+On disconnect, the drivers go back to step 3 (above).
+</para></listitem>
+
+<listitem><para>When the gadget driver module is being unloaded,
+the driver unbind() callback is issued. That lets the controller
+driver be unloaded.
+</para></listitem>
+
+</orderedlist>
+
+<para>Drivers will normally be arranged so that just loading the
+gadget driver module (or statically linking it into a Linux kernel)
+allows the peripheral device to be enumerated, but some drivers
+will defer enumeration until some higher level component (like
+a user mode daemon) enables it.
+Note that at this lowest level there are no policies about how
+ep0 configuration logic is implemented,
+except that it should obey USB specifications.
+Such issues are in the domain of gadget drivers,
+including knowing about implementation constraints
+imposed by some USB controllers
+or understanding that composite devices might happen to
+be built by integrating reusable components.
+</para>
+
+<para>Note that the lifecycle above can be slightly different
+for OTG devices.
+Other than providing an additional OTG descriptor in each
+configuration, only the HNP-related differences are particularly
+visible to driver code.
+They involve reporting requirements during the SET_CONFIGURATION
+request, and the option to invoke HNP during some suspend callbacks.
+Also, SRP changes the semantics of
+<function>usb_gadget_wakeup</function>
+slightly.
+</para>
+
+</sect1>
+
+<sect1 id="ch9"><title>USB 2.0 Chapter 9 Types and Constants</title>
+
+<para>Gadget drivers
+rely on common USB structures and constants
+defined in the
+<filename>&lt;linux/usb_ch9.h&gt;</filename>
+header file, which is standard in Linux 2.6 kernels.
+These are the same types and constants used by host
+side drivers (and usbcore).
+</para>
+
+!Iinclude/linux/usb_ch9.h
+</sect1>
+
+<sect1 id="core"><title>Core Objects and Methods</title>
+
+<para>These are declared in
+<filename>&lt;linux/usb_gadget.h&gt;</filename>,
+and are used by gadget drivers to interact with
+USB peripheral controller drivers.
+</para>
+
+ <!-- yeech, this is ugly in nsgmls PDF output.
+
+ the PDF bookmark and refentry output nesting is wrong,
+ and the member/argument documentation indents ugly.
+
+ plus something (docproc?) adds whitespace before the
+ descriptive paragraph text, so it can't line up right
+ unless the explanations are trivial.
+ -->
+
+!Iinclude/linux/usb_gadget.h
+</sect1>
+
+<sect1 id="utils"><title>Optional Utilities</title>
+
+<para>The core API is sufficient for writing a USB Gadget Driver,
+but some optional utilities are provided to simplify common tasks.
+These utilities include endpoint autoconfiguration.
+</para>
+
+!Edrivers/usb/gadget/usbstring.c
+!Edrivers/usb/gadget/config.c
+<!-- !Edrivers/usb/gadget/epautoconf.c -->
+</sect1>
+
+</chapter>
+
+<chapter id="controllers"><title>Peripheral Controller Drivers</title>
+
+<para>The first hardware supporting this API was the NetChip 2280
+controller, which supports USB 2.0 high speed and is based on PCI.
+This is the <filename>net2280</filename> driver module.
+The driver supports Linux kernel versions 2.4 and 2.6;
+contact NetChip Technologies for development boards and product
+information.
+</para>
+
+<para>Other hardware working in the "gadget" framework includes:
+Intel's PXA 25x and IXP42x series processors
+(<filename>pxa2xx_udc</filename>),
+Toshiba TC86c001 "Goku-S" (<filename>goku_udc</filename>),
+Renesas SH7705/7727 (<filename>sh_udc</filename>),
+MediaQ 11xx (<filename>mq11xx_udc</filename>),
+Hynix HMS30C7202 (<filename>h7202_udc</filename>),
+National 9603/4 (<filename>n9604_udc</filename>),
+Texas Instruments OMAP (<filename>omap_udc</filename>),
+Sharp LH7A40x (<filename>lh7a40x_udc</filename>),
+and more.
+Most of those are full speed controllers.
+</para>
+
+<para>At this writing, there are people at work on drivers in
+this framework for several other USB device controllers,
+with plans to make many of them be widely available.
+</para>
+
+<!-- !Edrivers/usb/gadget/net2280.c -->
+
+<para>A partial USB simulator,
+the <filename>dummy_hcd</filename> driver, is available.
+It can act like a net2280, a pxa25x, or an sa11x0 in terms
+of available endpoints and device speeds; and it simulates
+control, bulk, and to some extent interrupt transfers.
+That lets you develop some parts of a gadget driver on a normal PC,
+without any special hardware, and perhaps with the assistance
+of tools such as GDB running with User Mode Linux.
+At least one person has expressed interest in adapting that
+approach, hooking it up to a simulator for a microcontroller.
+Such simulators can help debug subsystems where the runtime hardware
+is unfriendly to software development, or is not yet available.
+</para>
+
+<para>Support for other controllers is expected to be developed
+and contributed
+over time, as this driver framework evolves.
+</para>
+
+</chapter>
+
+<chapter id="gadget"><title>Gadget Drivers</title>
+
+<para>In addition to <emphasis>Gadget Zero</emphasis>
+(used primarily for testing and development with drivers
+for usb controller hardware), other gadget drivers exist.
+</para>
+
+<para>There's an <emphasis>ethernet</emphasis> gadget
+driver, which implements one of the most useful
+<emphasis>Communications Device Class</emphasis> (CDC) models.
+One of the standards for cable modem interoperability even
+specifies the use of this ethernet model as one of two
+mandatory options.
+Gadgets using this code look to a USB host as if they're
+an Ethernet adapter.
+It provides access to a network where the gadget's CPU is one host,
+which could easily be bridging, routing, or firewalling
+access to other networks.
+Since some hardware can't fully implement the CDC Ethernet
+requirements, this driver also implements a "good parts only"
+subset of CDC Ethernet.
+(That subset doesn't advertise itself as CDC Ethernet,
+to avoid creating problems.)
+</para>
+
+<para>Support for Microsoft's <emphasis>RNDIS</emphasis>
+protocol has been contributed by Pengutronix and Auerswald GmbH.
+This is like CDC Ethernet, but it runs on slightly more USB hardware
+(but less than the CDC subset).
+However, its main claim to fame is being able to connect directly to
+recent versions of Windows, using drivers that Microsoft bundles
+and supports, making it much simpler to network with Windows.
+</para>
+
+<para>There is also support for user mode gadget drivers,
+using <emphasis>gadgetfs</emphasis>.
+This provides a <emphasis>User Mode API</emphasis> that presents
+each endpoint as a single file descriptor. I/O is done using
+normal <emphasis>read()</emphasis> and <emphasis>write()</emphasis> calls.
+Familiar tools like GDB and pthreads can be used to
+develop and debug user mode drivers, so that once a robust
+controller driver is available many applications for it
+won't require new kernel mode software.
+Linux 2.6 <emphasis>Async I/O (AIO)</emphasis>
+support is available, so that user mode software
+can stream data with only slightly more overhead
+than a kernel driver.
+</para>
+
+<para>There's a USB Mass Storage class driver, which provides
+a different solution for interoperability with systems such
+as MS-Windows and MacOS.
+That <emphasis>File-backed Storage</emphasis> driver uses a
+file or block device as backing store for a drive,
+like the <filename>loop</filename> driver.
+The USB host uses the BBB, CB, or CBI versions of the mass
+storage class specification, using transparent SCSI commands
+to access the data from the backing store.
+</para>
+
+<para>There's a "serial line" driver, useful for TTY style
+operation over USB.
+The latest version of that driver supports CDC ACM style
+operation, like a USB modem, and so on most hardware it can
+interoperate easily with MS-Windows.
+One interesting use of that driver is in boot firmware (like a BIOS),
+which can sometimes use that model with very small systems without
+real serial lines.
+</para>
+
+<para>Support for other kinds of gadget is expected to
+be developed and contributed
+over time, as this driver framework evolves.
+</para>
+
+</chapter>
+
+<chapter id="otg"><title>USB On-The-Go (OTG)</title>
+
+<para>USB OTG support on Linux 2.6 was initially developed
+by Texas Instruments for
+<ulink url="http://www.omap.com">OMAP</ulink> 16xx and 17xx
+series processors.
+Other OTG systems should work in similar ways, but the
+hardware level details could be very different.
+</para>
+
+<para>Systems need specialized hardware support to implement OTG,
+notably including a special <emphasis>Mini-AB</emphasis> jack
+and associated transceiver to support <emphasis>Dual-Role</emphasis>
+operation:
+they can act either as a host, using the standard
+Linux-USB host side driver stack,
+or as a peripheral, using this "gadget" framework.
+To do that, the system software relies on small additions
+to those programming interfaces,
+and on a new internal component (here called an "OTG Controller")
+affecting which driver stack connects to the OTG port.
+In each role, the system can re-use the existing pool of
+hardware-neutral drivers, layered on top of the controller
+driver interfaces (<emphasis>usb_bus</emphasis> or
+<emphasis>usb_gadget</emphasis>).
+Such drivers need at most minor changes, and most of the calls
+added to support OTG can also benefit non-OTG products.
+</para>
+
+<itemizedlist>
+ <listitem><para>Gadget drivers test the <emphasis>is_otg</emphasis>
+ flag, and use it to determine whether or not to include
+ an OTG descriptor in each of their configurations.
+ </para></listitem>
+ <listitem><para>Gadget drivers may need changes to support the
+ two new OTG protocols, exposed in new gadget attributes
+ such as the <emphasis>b_hnp_enable</emphasis> flag.
+ HNP support should be reported through a user interface
+ (two LEDs could suffice), and is triggered in some cases
+ when the host suspends the peripheral.
+ SRP support can be user-initiated just like remote wakeup,
+ probably by pressing the same button.
+ </para></listitem>
+ <listitem><para>On the host side, USB device drivers need
+ to be taught to trigger HNP at appropriate moments, using
+ <function>usb_suspend_device()</function>.
+ That also conserves battery power, which is useful even
+ for non-OTG configurations.
+ </para></listitem>
+ <listitem><para>Also on the host side, a driver must support the
+ OTG "Targeted Peripheral List". That's just a whitelist,
+ used to reject peripherals not supported with a given
+ Linux OTG host.
+ <emphasis>This whitelist is product-specific;
+ each product must modify <filename>otg_whitelist.h</filename>
+ to match its interoperability specification.
+ </emphasis>
+ </para>
+ <para>Non-OTG Linux hosts, like PCs and workstations,
+ normally have some solution for adding drivers, so that
+ peripherals that aren't recognized can eventually be supported.
+ That approach is unreasonable for consumer products that may
+ never have their firmware upgraded, and where it's usually
+ unrealistic to expect traditional PC/workstation/server kinds
+ of support model to work.
+ For example, it's often impractical to change device firmware
+ once the product has been distributed, so driver bugs can't
+ normally be fixed if they're found after shipment.
+ </para></listitem>
+</itemizedlist>
+
+<para>
+Additional changes are needed below those hardware-neutral
+<emphasis>usb_bus</emphasis> and <emphasis>usb_gadget</emphasis>
+driver interfaces; those aren't discussed here in any detail.
+Those affect the hardware-specific code for each USB Host or Peripheral
+controller, and how the HCD initializes (since OTG can be active only
+on a single port).
+They also involve what may be called an <emphasis>OTG Controller
+Driver</emphasis>, managing the OTG transceiver and the OTG state
+machine logic as well as much of the root hub behavior for the
+OTG port.
+The OTG controller driver needs to activate and deactivate USB
+controllers depending on the relevant device role.
+Some related changes were needed inside usbcore, so that it
+can identify OTG-capable devices and respond appropriately
+to HNP or SRP protocols.
+</para>
+
+</chapter>
+
+</book>
+<!--
+ vim:syntax=sgml:sw=4
+-->
diff --git a/Documentation/DocBook/journal-api.tmpl b/Documentation/DocBook/journal-api.tmpl
new file mode 100644
index 000000000000..1ef6f43c6d8f
--- /dev/null
+++ b/Documentation/DocBook/journal-api.tmpl
@@ -0,0 +1,333 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LinuxJBDAPI">
+ <bookinfo>
+ <title>The Linux Journalling API</title>
+ <authorgroup>
+ <author>
+ <firstname>Roger</firstname>
+ <surname>Gammans</surname>
+ <affiliation>
+ <address>
+ <email>rgammans@computer-surgery.co.uk</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <authorgroup>
+ <author>
+ <firstname>Stephen</firstname>
+ <surname>Tweedie</surname>
+ <affiliation>
+ <address>
+ <email>sct@redhat.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2002</year>
+ <holder>Roger Gammans</holder>
+ </copyright>
+
+<legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="Overview">
+ <title>Overview</title>
+ <sect1>
+ <title>Details</title>
+<para>
+The journalling layer is easy to use. You need to
+first of all create a journal_t data structure. There are
+two calls to do this dependent on how you decide to allocate the physical
+media on which the journal resides. The journal_init_inode() call
+is for journals stored in filesystem inodes, or the journal_init_dev()
+call can be used for journals stored on a raw device (in a continuous range
+of blocks). A journal_t is a typedef for a struct pointer, so when
+you are finally finished make sure you call journal_destroy() on it
+to free up any used kernel memory.
+</para>
+
+<para>
+Once you have got your journal_t object you need to 'mount' or load the journal
+file, unless of course you haven't initialised it yet - in which case you
+need to call journal_create().
+</para>
+
+<para>
+Most of the time however your journal file will already have been created, but
+before you load it you must call journal_wipe() to empty the journal file.
+Hang on, you say, what if the filesystem wasn't cleanly umount()'d? Well, it is the
+job of the client file system to detect this and skip the call to journal_wipe().
+</para>
+
+<para>
+In either case the next call should be to journal_load() which prepares the
+journal file for use. Note that journal_wipe(..,0) calls journal_skip_recovery()
+for you if it detects any outstanding transactions in the journal and similarly
+journal_load() will call journal_recover() if necessary.
+I would advise reading fs/ext3/super.c for examples on this stage.
+[RGG: Why is the journal_wipe() call necessary - doesn't this needlessly
+complicate the API? Or isn't it a good idea for the journal layer to hide
+dirty mounts from the client fs?]
+</para>
+
+<para>
+Now you can go ahead and start modifying the underlying
+filesystem. Almost.
+</para>
+
+
+<para>
+
+You still need to actually journal your filesystem changes, this
+is done by wrapping them into transactions. Additionally you
+also need to wrap the modification of each of the buffers
+with calls to the journal layer, so it knows what the modifications
+you are actually making are. To do this use journal_start() which
+returns a transaction handle.
+</para>
+
+<para>
+journal_start()
+and its counterpart journal_stop(), which indicates the end of a transaction
+are nestable calls, so you can reenter a transaction if necessary,
+but remember you must call journal_stop() the same number of times as
+journal_start() before the transaction is completed (or more accurately
+leaves the update phase). Ext3/VFS makes use of this feature to simplify
+quota support.
+</para>
+
+<para>
+Inside each transaction you need to wrap the modifications to the
+individual buffers (blocks). Before you start to modify a buffer you
+need to call journal_get_{create,write,undo}_access() as appropriate,
+this allows the journalling layer to copy the unmodified data if it
+needs to. After all the buffer may be part of a previously uncommitted
+transaction.
+At this point you are at last ready to modify a buffer, and once
+you have done so you need to call journal_dirty_{meta,}data().
+Or if you've asked for access to a buffer you now know is no longer
+required to be pushed back on the device you can call journal_forget()
+in much the same way as you might have used bforget() in the past.
+</para>
+
+<para>
+A journal_flush() may be called at any time to commit and checkpoint
+all your transactions.
+</para>
+
+<para>
+Then at umount time, in your put_super() (2.4) or write_super() (2.5)
+you can then call journal_destroy() to clean up your in-core journal object.
+</para>
+
+
+<para>
+Unfortunately there are a couple of ways the journal layer can cause a deadlock.
+The first thing to note is that each task can only have
+a single outstanding transaction at any one time, remember nothing
+commits until the outermost journal_stop(). This means
+you must complete the transaction at the end of each file/inode/address
+etc. operation you perform, so that the journalling system isn't re-entered
+on another journal: transactions can't be nested/batched
+across differing journals, and another filesystem other than
+yours (say ext3) may be modified in a later syscall.
+</para>
+
+<para>
+The second case to bear in mind is that journal_start() can
+block if there isn't enough space in the journal for your transaction
+(based on the passed nblocks param) - when it blocks it merely(!) needs to
+wait for transactions to complete and be committed from other tasks,
+so essentially we are waiting for journal_stop(). So to avoid
+deadlocks you must treat journal_start/stop() as if they
+were semaphores and include them in your semaphore ordering rules to prevent
+deadlocks. Note that journal_extend() has similar blocking behaviour to
+journal_start() so you can deadlock here just as easily as on journal_start().
+</para>
+
+<para>
+Try to reserve the right number of blocks the first time. ;-). This will
+be the maximum number of blocks you are going to touch in this transaction.
+I advise having a look at least at ext3_jbd.h to see the basis on which
+ext3 makes these decisions.
+</para>
+
+<para>
+Another wriggle to watch out for is your on-disk block allocation strategy.
+Why? Because, if you undo a delete, you need to ensure you haven't reused any
+of the freed blocks in a later transaction. One simple way of doing this
+is make sure any blocks you allocate only have checkpointed transactions
+listed against them. Ext3 does this in ext3_test_allocatable().
+</para>
+
+<para>
+Locking is also provided through journal_{un,}lock_updates(),
+ext3 uses this when it wants a window with a clean and stable fs for a moment.
+eg.
+</para>
+
+<programlisting>
+
+ journal_lock_updates() //stop new stuff happening..
+ journal_flush() // checkpoint everything.
+ ..do stuff on stable fs
+ journal_unlock_updates() // carry on with filesystem use.
+</programlisting>
+
+<para>
+The opportunities for abuse and DOS attacks with this should be obvious,
+if you allow unprivileged userspace to trigger codepaths containing these
+calls.
+</para>
+
+<para>
+A new feature of jbd since 2.5.25 is commit callbacks: with the new
+journal_callback_set() function you can now ask the journalling layer
+to call you back when the transaction is finally committed to disk, so that
+you can do some of your own management. The key to this is the journal_callback
+struct, this maintains the internal callback information but you can
+extend it like this:-
+</para>
+<programlisting>
+ struct myfs_callback_s {
+ //Data structure element required by jbd..
+ struct journal_callback for_jbd;
+ // Stuff for myfs allocated together.
+ myfs_inode* i_commited;
+
+ }
+</programlisting>
+
+<para>
+this would be useful if you needed to know when data was committed to a
+particular inode.
+</para>
+
+</sect1>
+
+<sect1>
+<title>Summary</title>
+<para>
+Using the journal is a matter of wrapping the different context changes,
+being each mount, each modification (transaction) and each changed buffer
+to tell the journalling layer about them.
+</para>
+
+<para>
+Here is some pseudo code to give you an idea of how it works, as
+an example.
+</para>
+
+<programlisting>
+ journal_t* my_jnrl = journal_create();
+ journal_init_{dev,inode}(my_jnrl,...)
+ if (clean) journal_wipe();
+ journal_load();
+
+ foreach(transaction) { /*transactions must be
+ completed before
+ a syscall returns to
+ userspace*/
+
+ handle_t * xct=journal_start(my_jnrl);
+ foreach(bh) {
+ journal_get_{create,write,undo}_access(xct,bh);
+ if ( myfs_modify(bh) ) { /* returns true
+ if makes changes */
+ journal_dirty_{meta,}data(xct,bh);
+ } else {
+ journal_forget(bh);
+ }
+ }
+ journal_stop(xct);
+ }
+ journal_destroy(my_jnrl);
+</programlisting>
+</sect1>
+
+</chapter>
+
+ <chapter id="adt">
+ <title>Data Types</title>
+ <para>
+ The journalling layer uses typedefs to 'hide' the concrete definitions
+ of the structures used. As a client of the JBD layer you can
+ just rely on using the pointer as a magic cookie of some sort.
+
+ Obviously the hiding is not enforced as this is 'C'.
+ </para>
+ <sect1><title>Structures</title>
+!Iinclude/linux/jbd.h
+ </sect1>
+</chapter>
+
+ <chapter id="calls">
+ <title>Functions</title>
+ <para>
+ The functions here are split into two groups: those that
+ affect a journal as a whole, and those which are used to
+ manage transactions.
+</para>
+ <sect1><title>Journal Level</title>
+!Efs/jbd/journal.c
+!Efs/jbd/recovery.c
+ </sect1>
+ <sect1><title>Transaction Level</title>
+!Efs/jbd/transaction.c
+ </sect1>
+</chapter>
+<chapter>
+ <title>See also</title>
+ <para>
+ <citation>
+ <ulink url="ftp://ftp.uk.linux.org/pub/linux/sct/fs/jfs/journal-design.ps.gz">
+ Journaling the Linux ext2fs Filesystem, LinuxExpo 98, Stephen Tweedie
+ </ulink>
+ </citation>
+ </para>
+ <para>
+ <citation>
+ <ulink url="http://olstrans.sourceforge.net/release/OLS2000-ext3/OLS2000-ext3.html">
+ Ext3 Journalling FileSystem, OLS 2000, Dr. Stephen Tweedie
+ </ulink>
+ </citation>
+ </para>
+</chapter>
+
+</book>
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
new file mode 100644
index 000000000000..1bd20c860285
--- /dev/null
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -0,0 +1,342 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LinuxKernelAPI">
+ <bookinfo>
+ <title>The Linux Kernel API</title>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="Basics">
+ <title>Driver Basics</title>
+ <sect1><title>Driver Entry and Exit points</title>
+!Iinclude/linux/init.h
+ </sect1>
+
+ <sect1><title>Atomic and pointer manipulation</title>
+!Iinclude/asm-i386/atomic.h
+!Iinclude/asm-i386/unaligned.h
+ </sect1>
+
+<!-- FIXME:
+ kernel/sched.c has no docs, which stuffs up the sgml. Comment
+ out until somebody adds docs. KAO
+ <sect1><title>Delaying, scheduling, and timer routines</title>
+X!Ekernel/sched.c
+ </sect1>
+KAO -->
+ </chapter>
+
+ <chapter id="adt">
+ <title>Data Types</title>
+ <sect1><title>Doubly Linked Lists</title>
+!Iinclude/linux/list.h
+ </sect1>
+ </chapter>
+
+ <chapter id="libc">
+ <title>Basic C Library Functions</title>
+
+ <para>
+ When writing drivers, you cannot in general use routines which are
+ from the C Library. Some of the functions have been found generally
+ useful and they are listed below. The behaviour of these functions
+ may vary slightly from those defined by ANSI, and these deviations
+ are noted in the text.
+ </para>
+
+ <sect1><title>String Conversions</title>
+!Ilib/vsprintf.c
+!Elib/vsprintf.c
+ </sect1>
+ <sect1><title>String Manipulation</title>
+!Ilib/string.c
+!Elib/string.c
+ </sect1>
+ <sect1><title>Bit Operations</title>
+!Iinclude/asm-i386/bitops.h
+ </sect1>
+ </chapter>
+
+ <chapter id="mm">
+ <title>Memory Management in Linux</title>
+ <sect1><title>The Slab Cache</title>
+!Emm/slab.c
+ </sect1>
+ <sect1><title>User Space Memory Access</title>
+!Iinclude/asm-i386/uaccess.h
+!Iarch/i386/lib/usercopy.c
+ </sect1>
+ </chapter>
+
+ <chapter id="kfifo">
+ <title>FIFO Buffer</title>
+ <sect1><title>kfifo interface</title>
+!Iinclude/linux/kfifo.h
+!Ekernel/kfifo.c
+ </sect1>
+ </chapter>
+
+ <chapter id="proc">
+ <title>The proc filesystem</title>
+
+ <sect1><title>sysctl interface</title>
+!Ekernel/sysctl.c
+ </sect1>
+ </chapter>
+
+ <chapter id="debugfs">
+ <title>The debugfs filesystem</title>
+
+ <sect1><title>debugfs interface</title>
+!Efs/debugfs/inode.c
+!Efs/debugfs/file.c
+ </sect1>
+ </chapter>
+
+ <chapter id="vfs">
+ <title>The Linux VFS</title>
+ <sect1><title>The Directory Cache</title>
+!Efs/dcache.c
+!Iinclude/linux/dcache.h
+ </sect1>
+ <sect1><title>Inode Handling</title>
+!Efs/inode.c
+!Efs/bad_inode.c
+ </sect1>
+ <sect1><title>Registration and Superblocks</title>
+!Efs/super.c
+ </sect1>
+ <sect1><title>File Locks</title>
+!Efs/locks.c
+!Ifs/locks.c
+ </sect1>
+ </chapter>
+
+ <chapter id="netcore">
+ <title>Linux Networking</title>
+ <sect1><title>Socket Buffer Functions</title>
+!Iinclude/linux/skbuff.h
+!Enet/core/skbuff.c
+ </sect1>
+ <sect1><title>Socket Filter</title>
+!Enet/core/filter.c
+ </sect1>
+ <sect1><title>Generic Network Statistics</title>
+!Iinclude/linux/gen_stats.h
+!Enet/core/gen_stats.c
+!Enet/core/gen_estimator.c
+ </sect1>
+ </chapter>
+
+ <chapter id="netdev">
+ <title>Network device support</title>
+ <sect1><title>Driver Support</title>
+!Enet/core/dev.c
+ </sect1>
+ <sect1><title>8390 Based Network Cards</title>
+!Edrivers/net/8390.c
+ </sect1>
+ <sect1><title>Synchronous PPP</title>
+!Edrivers/net/wan/syncppp.c
+ </sect1>
+ </chapter>
+
+ <chapter id="modload">
+ <title>Module Support</title>
+ <sect1><title>Module Loading</title>
+!Ekernel/kmod.c
+ </sect1>
+ <sect1><title>Inter Module support</title>
+ <para>
+ Refer to the file kernel/module.c for more information.
+ </para>
+<!-- FIXME: Removed for now since no structured comments in source
+X!Ekernel/module.c
+-->
+ </sect1>
+ </chapter>
+
+ <chapter id="hardware">
+ <title>Hardware Interfaces</title>
+ <sect1><title>Interrupt Handling</title>
+!Iarch/i386/kernel/irq.c
+ </sect1>
+
+ <sect1><title>MTRR Handling</title>
+!Earch/i386/kernel/cpu/mtrr/main.c
+ </sect1>
+ <sect1><title>PCI Support Library</title>
+!Edrivers/pci/pci.c
+ </sect1>
+ <sect1><title>PCI Hotplug Support Library</title>
+!Edrivers/pci/hotplug/pci_hotplug_core.c
+ </sect1>
+ <sect1><title>MCA Architecture</title>
+ <sect2><title>MCA Device Functions</title>
+ <para>
+ Refer to the file arch/i386/kernel/mca.c for more information.
+ </para>
+<!-- FIXME: Removed for now since no structured comments in source
+X!Earch/i386/kernel/mca.c
+-->
+ </sect2>
+ <sect2><title>MCA Bus DMA</title>
+!Iinclude/asm-i386/mca_dma.h
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="devfs">
+ <title>The Device File System</title>
+!Efs/devfs/base.c
+ </chapter>
+
+ <chapter id="security">
+ <title>Security Framework</title>
+!Esecurity/security.c
+ </chapter>
+
+ <chapter id="pmfuncs">
+ <title>Power Management</title>
+!Ekernel/power/pm.c
+ </chapter>
+
+ <chapter id="blkdev">
+ <title>Block Devices</title>
+!Edrivers/block/ll_rw_blk.c
+ </chapter>
+
+ <chapter id="miscdev">
+ <title>Miscellaneous Devices</title>
+!Edrivers/char/misc.c
+ </chapter>
+
+ <chapter id="viddev">
+ <title>Video4Linux</title>
+!Edrivers/media/video/videodev.c
+ </chapter>
+
+ <chapter id="snddev">
+ <title>Sound Devices</title>
+!Esound/sound_core.c
+<!-- FIXME: Removed for now since no structured comments in source
+X!Isound/sound_firmware.c
+-->
+ </chapter>
+
+ <chapter id="uart16x50">
+ <title>16x50 UART Driver</title>
+!Edrivers/serial/serial_core.c
+!Edrivers/serial/8250.c
+ </chapter>
+
+ <chapter id="z85230">
+ <title>Z85230 Support Library</title>
+!Edrivers/net/wan/z85230.c
+ </chapter>
+
+ <chapter id="fbdev">
+ <title>Frame Buffer Library</title>
+
+ <para>
+ The frame buffer drivers depend heavily on four data structures.
+ These structures are declared in include/linux/fb.h. They are
+ fb_info, fb_var_screeninfo, fb_fix_screeninfo and fb_monospecs.
+ The last three can be made available to and from userland.
+ </para>
+
+ <para>
+ fb_info defines the current state of a particular video card.
+ Inside fb_info, there exists a fb_ops structure which is a
+ collection of needed functions to make fbdev and fbcon work.
+ fb_info is only visible to the kernel.
+ </para>
+
+ <para>
+ fb_var_screeninfo is used to describe the features of a video card
+ that are user defined. With fb_var_screeninfo, things such as
+ depth and the resolution may be defined.
+ </para>
+
+ <para>
+ The next structure is fb_fix_screeninfo. This defines the
+ properties of a card that are created when a mode is set and can't
+ be changed otherwise. A good example of this is the start of the
+ frame buffer memory. This "locks" the address of the frame buffer
+ memory, so that it cannot be changed or moved.
+ </para>
+
+ <para>
+ The last structure is fb_monospecs. In the old API, there was
+ little importance for fb_monospecs. This allowed for forbidden things
+ such as setting a mode of 800x600 on a fixed frequency monitor. With
+ the new API, fb_monospecs prevents such things, and if used
+ correctly, can prevent a monitor from being cooked. fb_monospecs
+ will not be useful until kernels 2.5.x.
+ </para>
+
+ <sect1><title>Frame Buffer Memory</title>
+!Edrivers/video/fbmem.c
+ </sect1>
+ <sect1><title>Frame Buffer Console</title>
+!Edrivers/video/console/fbcon.c
+ </sect1>
+ <sect1><title>Frame Buffer Colormap</title>
+!Edrivers/video/fbcmap.c
+ </sect1>
+<!-- FIXME:
+ drivers/video/fbgen.c has no docs, which stuffs up the sgml. Comment
+ out until somebody adds docs. KAO
+ <sect1><title>Frame Buffer Generic Functions</title>
+X!Idrivers/video/fbgen.c
+ </sect1>
+KAO -->
+ <sect1><title>Frame Buffer Video Mode Database</title>
+!Idrivers/video/modedb.c
+!Edrivers/video/modedb.c
+ </sect1>
+ <sect1><title>Frame Buffer Macintosh Video Mode Database</title>
+!Idrivers/video/macmodes.c
+ </sect1>
+ <sect1><title>Frame Buffer Fonts</title>
+ <para>
+ Refer to the file drivers/video/console/fonts.c for more information.
+ </para>
+<!-- FIXME: Removed for now since no structured comments in source
+X!Idrivers/video/console/fonts.c
+-->
+ </sect1>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
new file mode 100644
index 000000000000..49a9ef82d575
--- /dev/null
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -0,0 +1,1349 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="lk-hacking-guide">
+ <bookinfo>
+ <title>Unreliable Guide To Hacking The Linux Kernel</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Paul</firstname>
+ <othername>Rusty</othername>
+ <surname>Russell</surname>
+ <affiliation>
+ <address>
+ <email>rusty@rustcorp.com.au</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2001</year>
+ <holder>Rusty Russell</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+
+ <releaseinfo>
+ This is the first release of this document as part of the kernel tarball.
+ </releaseinfo>
+
+ </bookinfo>
+
+ <toc></toc>
+
+ <chapter id="introduction">
+ <title>Introduction</title>
+ <para>
+ Welcome, gentle reader, to Rusty's Unreliable Guide to Linux
+ Kernel Hacking. This document describes the common routines and
+ general requirements for kernel code: its goal is to serve as a
+ primer for Linux kernel development for experienced C
+ programmers. I avoid implementation details: that's what the
+ code is for, and I ignore whole tracts of useful routines.
+ </para>
+ <para>
+ Before you read this, please understand that I never wanted to
+ write this document, being grossly under-qualified, but I always
+ wanted to read it, and this was the only way. I hope it will
+ grow into a compendium of best practice, common starting points
+ and random information.
+ </para>
+ </chapter>
+
+ <chapter id="basic-players">
+ <title>The Players</title>
+
+ <para>
+ At any time each of the CPUs in a system can be:
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ not associated with any process, serving a hardware interrupt;
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ not associated with any process, serving a softirq, tasklet or bh;
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ running in kernel space, associated with a process;
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ running a process in user space.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ There is a strict ordering between these: other than the last
+ category (userspace) each can only be pre-empted by those above.
+ For example, while a softirq is running on a CPU, no other
+ softirq will pre-empt it, but a hardware interrupt can. However,
+ any other CPUs in the system execute independently.
+ </para>
+
+ <para>
+ We'll see a number of ways that the user context can block
+ interrupts, to become truly non-preemptable.
+ </para>
+
+ <sect1 id="basics-usercontext">
+ <title>User Context</title>
+
+ <para>
+ User context is when you are coming in from a system call or
+ other trap: you can sleep, and you own the CPU (except for
+ interrupts) until you call <function>schedule()</function>.
+ In other words, user context (unlike userspace) is not pre-emptable.
+ </para>
+
+ <note>
+ <para>
+ You are always in user context on module load and unload,
+ and on operations on the block device layer.
+ </para>
+ </note>
+
+ <para>
+ In user context, the <varname>current</varname> pointer (indicating
+ the task we are currently executing) is valid, and
+ <function>in_interrupt()</function>
+ (<filename>include/linux/interrupt.h</filename>) is <returnvalue>false
+ </returnvalue>.
+ </para>
+
+ <caution>
+ <para>
+ Beware that if you have interrupts or bottom halves disabled
+ (see below), <function>in_interrupt()</function> will return a
+ false positive.
+ </para>
+ </caution>
+ </sect1>
+
+ <sect1 id="basics-hardirqs">
+ <title>Hardware Interrupts (Hard IRQs)</title>
+
+ <para>
+ Timer ticks, <hardware>network cards</hardware> and
+ <hardware>keyboard</hardware> are examples of real
+ hardware which produce interrupts at any time. The kernel runs
+ interrupt handlers, which service the hardware. The kernel
+ guarantees that this handler is never re-entered: if another
+ interrupt arrives, it is queued (or dropped). Because it
+ disables interrupts, this handler has to be fast: frequently it
+ simply acknowledges the interrupt, marks a `software interrupt'
+ for execution and exits.
+ </para>
+
+ <para>
+ You can tell you are in a hardware interrupt, because
+ <function>in_irq()</function> returns <returnvalue>true</returnvalue>.
+ </para>
+ <caution>
+ <para>
+ Beware that this will return a false positive if interrupts are disabled
+ (see below).
+ </para>
+ </caution>
+ </sect1>
+
+ <sect1 id="basics-softirqs">
+ <title>Software Interrupt Context: Bottom Halves, Tasklets, softirqs</title>
+
+ <para>
+ Whenever a system call is about to return to userspace, or a
+ hardware interrupt handler exits, any `software interrupts'
+ which are marked pending (usually by hardware interrupts) are
+ run (<filename>kernel/softirq.c</filename>).
+ </para>
+
+ <para>
+ Much of the real interrupt handling work is done here. Early in
+ the transition to <acronym>SMP</acronym>, there were only `bottom
+ halves' (BHs), which didn't take advantage of multiple CPUs. Shortly
+ after we switched from wind-up computers made of match-sticks and snot,
+ we abandoned this limitation.
+ </para>
+
+ <para>
+ <filename class="headerfile">include/linux/interrupt.h</filename> lists the
+ different BH's. No matter how many CPUs you have, no two BHs will run at
+ the same time. This made the transition to SMP simpler, but sucks hard for
+ scalable performance. A very important bottom half is the timer
+ BH (<filename class="headerfile">include/linux/timer.h</filename>): you
+ can register to have it call functions for you in a given length of time.
+ </para>
+
+ <para>
+ 2.3.43 introduced softirqs, and re-implemented the (now
+ deprecated) BHs underneath them. Softirqs are fully-SMP
+ versions of BHs: they can run on as many CPUs at once as
+ required. This means they need to deal with any races in shared
+ data using their own locks. A bitmask is used to keep track of
+ which are enabled, so the 32 available softirqs should not be
+ used up lightly. (<emphasis>Yes</emphasis>, people will
+ notice).
+ </para>
+
+ <para>
+ Tasklets (<filename class="headerfile">include/linux/interrupt.h</filename>)
+ are like softirqs, except they are dynamically-registrable (meaning you
+ can have as many as you want), and they also guarantee that any tasklet
+ will only run on one CPU at any time, although different tasklets can
+ run simultaneously (unlike different BHs).
+ </para>
+ <caution>
+ <para>
+ The name `tasklet' is misleading: they have nothing to do with `tasks',
+ and probably more to do with some bad vodka Alexey Kuznetsov had at the
+ time.
+ </para>
+ </caution>
+
+ <para>
+ You can tell you are in a softirq (or bottom half, or tasklet)
+ using the <function>in_softirq()</function> macro
+ (<filename class="headerfile">include/linux/interrupt.h</filename>).
+ </para>
+ <caution>
+ <para>
+ Beware that this will return a false positive if a bh lock (see below)
+ is held.
+ </para>
+ </caution>
+ </sect1>
+ </chapter>
+
+ <chapter id="basic-rules">
+ <title>Some Basic Rules</title>
+
+ <variablelist>
+ <varlistentry>
+ <term>No memory protection</term>
+ <listitem>
+ <para>
+ If you corrupt memory, whether in user context or
+ interrupt context, the whole machine will crash. Are you
+ sure you can't do what you want in userspace?
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>No floating point or <acronym>MMX</acronym></term>
+ <listitem>
+ <para>
+ The <acronym>FPU</acronym> context is not saved; even in user
+ context the <acronym>FPU</acronym> state probably won't
+ correspond with the current process: you would mess with some
+ user process' <acronym>FPU</acronym> state. If you really want
+ to do this, you would have to explicitly save/restore the full
+ <acronym>FPU</acronym> state (and avoid context switches). It
+ is generally a bad idea; use fixed point arithmetic first.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>A rigid stack limit</term>
+ <listitem>
+ <para>
+ The kernel stack is about 6K in 2.2 (for most
+ architectures: it's about 14K on the Alpha), and shared
+ with interrupts so you can't use it all. Avoid deep
+ recursion and huge local arrays on the stack (allocate
+ them dynamically instead).
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>The Linux kernel is portable</term>
+ <listitem>
+ <para>
+ Let's keep it that way. Your code should be 64-bit clean,
+ and endian-independent. You should also minimize CPU
+ specific stuff, e.g. inline assembly should be cleanly
+ encapsulated and minimized to ease porting. Generally it
+ should be restricted to the architecture-dependent part of
+ the kernel tree.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+ </chapter>
+
+ <chapter id="ioctls">
+ <title>ioctls: Not writing a new system call</title>
+
+ <para>
+ A system call generally looks like this
+ </para>
+
+ <programlisting>
+asmlinkage long sys_mycall(int arg)
+{
+ return 0;
+}
+ </programlisting>
+
+ <para>
+ First, in most cases you don't want to create a new system call.
+ You create a character device and implement an appropriate ioctl
+ for it. This is much more flexible than system calls, doesn't have
+ to be entered in every architecture's
+ <filename class="headerfile">include/asm/unistd.h</filename> and
+ <filename>arch/kernel/entry.S</filename> file, and is much more
+ likely to be accepted by Linus.
+ </para>
+
+ <para>
+ If all your routine does is read or write some parameter, consider
+ implementing a <function>sysctl</function> interface instead.
+ </para>
+
+ <para>
+ Inside the ioctl you're in user context to a process. When an
+ error occurs you return a negated errno (see
+ <filename class="headerfile">include/linux/errno.h</filename>),
+ otherwise you return <returnvalue>0</returnvalue>.
+ </para>
+
+ <para>
+ After you have slept you should check if a signal occurred: the
+ Unix/Linux way of handling signals is to temporarily exit the
+ system call with the <constant>-ERESTARTSYS</constant> error. The
+ system call entry code will switch back to user context, process
+ the signal handler and then your system call will be restarted
+ (unless the user disabled that). So you should be prepared to
+ process the restart, e.g. if you're in the middle of manipulating
+ some data structure.
+ </para>
+
+ <programlisting>
+if (signal_pending(current))
+ return -ERESTARTSYS;
+ </programlisting>
+
+ <para>
+ If you're doing longer computations: first think userspace. If you
+ <emphasis>really</emphasis> want to do it in kernel you should
+ regularly check if you need to give up the CPU (remember there is
+ cooperative multitasking per CPU). Idiom:
+ </para>
+
+ <programlisting>
+cond_resched(); /* Will sleep */
+ </programlisting>
+
+ <para>
+ A short note on interface design: the UNIX system call motto is
+ "Provide mechanism not policy".
+ </para>
+ </chapter>
+
+ <chapter id="deadlock-recipes">
+ <title>Recipes for Deadlock</title>
+
+ <para>
+ You cannot call any routines which may sleep, unless:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ You are in user context.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ You do not own any spinlocks.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ You have interrupts enabled (actually, Andi Kleen says
+ that the scheduling code will enable them for you, but
+ that's probably not what you wanted).
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ Note that some functions may sleep implicitly: common ones are
+ the user space access functions (*_user) and memory allocation
+ functions without <symbol>GFP_ATOMIC</symbol>.
+ </para>
+
+ <para>
+ You will eventually lock up your box if you break these rules.
+ </para>
+
+ <para>
+ Really.
+ </para>
+ </chapter>
+
+ <chapter id="common-routines">
+ <title>Common Routines</title>
+
+ <sect1 id="routines-printk">
+ <title>
+ <function>printk()</function>
+ <filename class="headerfile">include/linux/kernel.h</filename>
+ </title>
+
+ <para>
+ <function>printk()</function> feeds kernel messages to the
+ console, dmesg, and the syslog daemon. It is useful for debugging
+ and reporting errors, and can be used inside interrupt context,
+ but use with caution: a machine which has its console flooded with
+ printk messages is unusable. It uses a format string mostly
+ compatible with ANSI C printf, and C string concatenation to give
+ it a first "priority" argument:
+ </para>
+
+ <programlisting>
+printk(KERN_INFO "i = %u\n", i);
+ </programlisting>
+
+ <para>
+ See <filename class="headerfile">include/linux/kernel.h</filename>
+ for other KERN_ values; these are interpreted by syslog as the
+ level. Special case: for printing an IP address use
+ </para>
+
+ <programlisting>
+__u32 ipaddress;
+printk(KERN_INFO "my ip: %d.%d.%d.%d\n", NIPQUAD(ipaddress));
+ </programlisting>
+
+ <para>
+ <function>printk()</function> internally uses a 1K buffer and does
+ not catch overruns. Make sure that will be enough.
+ </para>
+
+ <note>
+ <para>
+ You will know when you are a real kernel hacker
+ when you start typoing printf as printk in your user programs :)
+ </para>
+ </note>
+
+ <!--- From the Lions book reader department -->
+
+ <note>
+ <para>
+ Another sidenote: the original Unix Version 6 sources had a
+ comment on top of its printf function: "Printf should not be
+ used for chit-chat". You should follow that advice.
+ </para>
+ </note>
+ </sect1>
+
+ <sect1 id="routines-copy">
+ <title>
+ <function>copy_[to/from]_user()</function>
+ /
+ <function>get_user()</function>
+ /
+ <function>put_user()</function>
+ <filename class="headerfile">include/asm/uaccess.h</filename>
+ </title>
+
+ <para>
+ <emphasis>[SLEEPS]</emphasis>
+ </para>
+
+ <para>
+ <function>put_user()</function> and <function>get_user()</function>
+ are used to get and put single values (such as an int, char, or
+ long) from and to userspace. A pointer into userspace should
+ never be simply dereferenced: data should be copied using these
+ routines. Both return <constant>-EFAULT</constant> or 0.
+ </para>
+ <para>
+ <function>copy_to_user()</function> and
+ <function>copy_from_user()</function> are more general: they copy
+ an arbitrary amount of data to and from userspace.
+ <caution>
+ <para>
+ Unlike <function>put_user()</function> and
+ <function>get_user()</function>, they return the amount of
+ uncopied data (i.e. <returnvalue>0</returnvalue> still means
+ success).
+ </para>
+ </caution>
+ [Yes, this moronic interface makes me cringe. Please submit a
+ patch and become my hero --RR.]
+ </para>
+ <para>
+ These functions may sleep implicitly. They should never be called
+ outside user context (it makes no sense), with interrupts
+ disabled, or with a spinlock held.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-kmalloc">
+ <title><function>kmalloc()</function>/<function>kfree()</function>
+ <filename class="headerfile">include/linux/slab.h</filename></title>
+
+ <para>
+ <emphasis>[MAY SLEEP: SEE BELOW]</emphasis>
+ </para>
+
+ <para>
+ These routines are used to dynamically request pointer-aligned
+ chunks of memory, like malloc and free do in userspace, but
+ <function>kmalloc()</function> takes an extra flag word.
+ Important values:
+ </para>
+
+ <variablelist>
+ <varlistentry>
+ <term>
+ <constant>
+ GFP_KERNEL
+ </constant>
+ </term>
+ <listitem>
+ <para>
+ May sleep and swap to free memory. Only allowed in user
+ context, but is the most reliable way to allocate memory.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>
+ <constant>
+ GFP_ATOMIC
+ </constant>
+ </term>
+ <listitem>
+ <para>
+ Don't sleep. Less reliable than <constant>GFP_KERNEL</constant>,
+ but may be called from interrupt context. You should
+ <emphasis>really</emphasis> have a good out-of-memory
+ error-handling strategy.
+ </para>
+ </listitem>
+ </varlistentry>
+
+ <varlistentry>
+ <term>
+ <constant>
+ GFP_DMA
+ </constant>
+ </term>
+ <listitem>
+ <para>
+ Allocate ISA DMA lower than 16MB. If you don't know what that
+ is you don't need it. Very unreliable.
+ </para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
+
+ <para>
+ If you see a <errorname>kmem_grow: Called nonatomically from int
+ </errorname> warning message you called a memory allocation function
+ from interrupt context without <constant>GFP_ATOMIC</constant>.
+ You should really fix that. Run, don't walk.
+ </para>
+
+ <para>
+ If you are allocating at least <constant>PAGE_SIZE</constant>
+ (<filename class="headerfile">include/asm/page.h</filename>) bytes,
+ consider using <function>__get_free_pages()</function>
+
+ (<filename class="headerfile">include/linux/mm.h</filename>). It
+ takes an order argument (0 for page sized, 1 for double page, 2
+ for four pages etc.) and the same memory priority flag word as
+ above.
+ </para>
+
+ <para>
+ If you are allocating more than a page worth of bytes you can use
+ <function>vmalloc()</function>. It'll allocate virtual memory in
+ the kernel map. This block is not contiguous in physical memory,
+ but the <acronym>MMU</acronym> makes it look like it is for you
+ (so it'll only look contiguous to the CPUs, not to external device
+ drivers). If you really need large physically contiguous memory
+ for some weird device, you have a problem: it is poorly supported
+ in Linux because after some time memory fragmentation in a running
+ kernel makes it hard. The best way is to allocate the block early
+ in the boot process via the <function>alloc_bootmem()</function>
+ routine.
+ </para>
+
+ <para>
+ Before inventing your own cache of often-used objects consider
+ using a slab cache in
+ <filename class="headerfile">include/linux/slab.h</filename>.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-current">
+ <title><function>current</function>
+ <filename class="headerfile">include/asm/current.h</filename></title>
+
+ <para>
+ This global variable (really a macro) contains a pointer to
+ the current task structure, so is only valid in user context.
+ For example, when a process makes a system call, this will
+ point to the task structure of the calling process. It is
+ <emphasis>not NULL</emphasis> in interrupt context.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-udelay">
+ <title><function>udelay()</function>/<function>mdelay()</function>
+ <filename class="headerfile">include/asm/delay.h</filename>
+ <filename class="headerfile">include/linux/delay.h</filename>
+ </title>
+
+ <para>
+ The <function>udelay()</function> function can be used for small pauses.
+ Do not use large values with <function>udelay()</function> as you risk
+ overflow - the helper function <function>mdelay()</function> is useful
+ here, or even consider <function>schedule_timeout()</function>.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-endian">
+ <title><function>cpu_to_be32()</function>/<function>be32_to_cpu()</function>/<function>cpu_to_le32()</function>/<function>le32_to_cpu()</function>
+ <filename class="headerfile">include/asm/byteorder.h</filename>
+ </title>
+
+ <para>
+ The <function>cpu_to_be32()</function> family (where the "32" can
+ be replaced by 64 or 16, and the "be" can be replaced by "le") are
+ the general way to do endian conversions in the kernel: they
+ return the converted value. All variations supply the reverse as
+ well: <function>be32_to_cpu()</function>, etc.
+ </para>
+
+ <para>
+ There are two major variations of these functions: the pointer
+ variation, such as <function>cpu_to_be32p()</function>, which take
+ a pointer to the given type, and return the converted value. The
+ other variation is the "in-situ" family, such as
+ <function>cpu_to_be32s()</function>, which convert the value referred
+ to by the pointer, and return void.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-local-irqs">
+ <title><function>local_irq_save()</function>/<function>local_irq_restore()</function>
+ <filename class="headerfile">include/asm/system.h</filename>
+ </title>
+
+ <para>
+ These routines disable hard interrupts on the local CPU, and
+ restore them. They are reentrant; saving the previous state in
+ their one <varname>unsigned long flags</varname> argument. If you
+ know that interrupts are enabled, you can simply use
+ <function>local_irq_disable()</function> and
+ <function>local_irq_enable()</function>.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-softirqs">
+ <title><function>local_bh_disable()</function>/<function>local_bh_enable()</function>
+ <filename class="headerfile">include/linux/interrupt.h</filename></title>
+
+ <para>
+ These routines disable soft interrupts on the local CPU, and
+ restore them. They are reentrant; if soft interrupts were
+ disabled before, they will still be disabled after this pair
+ of functions has been called. They prevent softirqs, tasklets
+ and bottom halves from running on the current CPU.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-processorids">
+ <title><function>smp_processor_id()</function>
+ <filename class="headerfile">include/asm/smp.h</filename></title>
+
+ <para>
+ <function>smp_processor_id()</function> returns the current
+ processor number, between 0 and <symbol>NR_CPUS</symbol> (the
+ maximum number of CPUs supported by Linux, currently 32). These
+ values are not necessarily contiguous.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-init">
+ <title><type>__init</type>/<type>__exit</type>/<type>__initdata</type>
+ <filename class="headerfile">include/linux/init.h</filename></title>
+
+ <para>
+ After boot, the kernel frees up a special section; functions
+ marked with <type>__init</type> and data structures marked with
+ <type>__initdata</type> are dropped after boot is complete (within
+ modules this directive is currently ignored). <type>__exit</type>
+ is used to declare a function which is only required on exit: the
+ function will be dropped if this file is not compiled as a module.
+ See the header file for use. Note that it makes no sense for a function
+ marked with <type>__init</type> to be exported to modules with
+ <function>EXPORT_SYMBOL()</function> - this will break.
+ </para>
+ <para>
+ Static data structures marked as <type>__initdata</type> must be initialised
+ (as opposed to ordinary static data which is zeroed BSS) and cannot be
+ <type>const</type>.
+ </para>
+
+ </sect1>
+
+ <sect1 id="routines-init-again">
+ <title><function>__initcall()</function>/<function>module_init()</function>
+ <filename class="headerfile">include/linux/init.h</filename></title>
+ <para>
+ Many parts of the kernel are well served as a module
+ (dynamically-loadable parts of the kernel). Using the
+ <function>module_init()</function> and
+ <function>module_exit()</function> macros it is easy to write code
+ without #ifdefs which can operate either as a module or built into
+ the kernel.
+ </para>
+
+ <para>
+ The <function>module_init()</function> macro defines which
+ function is to be called at module insertion time (if the file is
+ compiled as a module), or at boot time: if the file is not
+ compiled as a module the <function>module_init()</function> macro
+ becomes equivalent to <function>__initcall()</function>, which
+ through linker magic ensures that the function is called on boot.
+ </para>
+
+ <para>
+ The function can return a negative error number to cause
+ module loading to fail (unfortunately, this has no effect if
+ the module is compiled into the kernel). For modules, this is
+ called in user context, with interrupts enabled, and the
+ kernel lock held, so it can sleep.
+ </para>
+ </sect1>
+
+ <sect1 id="routines-moduleexit">
+ <title> <function>module_exit()</function>
+ <filename class="headerfile">include/linux/init.h</filename> </title>
+
+ <para>
+ This macro defines the function to be called at module removal
+ time (or never, in the case of the file compiled into the
+ kernel). It will only be called if the module usage count has
+ reached zero. This function can also sleep, but cannot fail:
+ everything must be cleaned up by the time it returns.
+ </para>
+ </sect1>
+
+ <!-- add info on new-style module refcounting here -->
+ </chapter>
+
+ <chapter id="queues">
+ <title>Wait Queues
+ <filename class="headerfile">include/linux/wait.h</filename>
+ </title>
+ <para>
+ <emphasis>[SLEEPS]</emphasis>
+ </para>
+
+ <para>
+ A wait queue is used to wait for someone to wake you up when a
+ certain condition is true. They must be used carefully to ensure
+ there is no race condition. You declare a
+ <type>wait_queue_head_t</type>, and then processes which want to
+ wait for that condition declare a <type>wait_queue_t</type>
+ referring to themselves, and place that in the queue.
+ </para>
+
+ <sect1 id="queue-declaring">
+ <title>Declaring</title>
+
+ <para>
+ You declare a <type>wait_queue_head_t</type> using the
+ <function>DECLARE_WAIT_QUEUE_HEAD()</function> macro, or using the
+ <function>init_waitqueue_head()</function> routine in your
+ initialization code.
+ </para>
+ </sect1>
+
+ <sect1 id="queue-waitqueue">
+ <title>Queuing</title>
+
+ <para>
+ Placing yourself in the waitqueue is fairly complex, because you
+ must put yourself in the queue before checking the condition.
+ There is a macro to do this:
+ <function>wait_event_interruptible()</function>
+
+ (<filename class="headerfile">include/linux/sched.h</filename>). The
+ first argument is the wait queue head, and the second is an
+ expression which is evaluated; the macro returns
+ <returnvalue>0</returnvalue> when this expression is true, or
+ <returnvalue>-ERESTARTSYS</returnvalue> if a signal is received.
+ The <function>wait_event()</function> version ignores signals.
+ </para>
+ <para>
+ Do not use the <function>sleep_on()</function> function family -
+ it is very easy to accidentally introduce races; almost certainly
+ one of the <function>wait_event()</function> family will do, or a
+ loop around <function>schedule_timeout()</function>. If you choose
+ to loop around <function>schedule_timeout()</function> remember
+ you must set the task state (with
+ <function>set_current_state()</function>) on each iteration to avoid
+ busy-looping.
+ </para>
+
+ </sect1>
+
+ <sect1 id="queue-waking">
+ <title>Waking Up Queued Tasks</title>
+
+ <para>
+ Call <function>wake_up()</function>
+
+ (<filename class="headerfile">include/linux/sched.h</filename>),
+ which will wake up every process in the queue. The exception is
+ if one has <constant>TASK_EXCLUSIVE</constant> set, in which case
+ the remainder of the queue will not be woken.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="atomic-ops">
+ <title>Atomic Operations</title>
+
+ <para>
+ Certain operations are guaranteed atomic on all platforms. The
+ first class of operations work on <type>atomic_t</type>
+
+ <filename class="headerfile">include/asm/atomic.h</filename>; this
+ contains a signed integer (at least 24 bits long), and you must use
+ these functions to manipulate or read atomic_t variables.
+ <function>atomic_read()</function> and
+ <function>atomic_set()</function> get and set the counter,
+ <function>atomic_add()</function>,
+ <function>atomic_sub()</function>,
+ <function>atomic_inc()</function>,
+ <function>atomic_dec()</function>, and
+ <function>atomic_dec_and_test()</function> (returns
+ <returnvalue>true</returnvalue> if it was decremented to zero).
+ </para>
+
+ <para>
+ Yes. It returns <returnvalue>true</returnvalue> (i.e. != 0) if the
+ atomic variable is zero.
+ </para>
+
+ <para>
+ Note that these functions are slower than normal arithmetic, and
+ so should not be used unnecessarily. On some platforms they
+ are much slower, like 32-bit Sparc where they use a spinlock.
+ </para>
+
+ <para>
+ The second class of atomic operations is atomic bit operations on a
+ <type>long</type>, defined in
+
+ <filename class="headerfile">include/linux/bitops.h</filename>. These
+ operations generally take a pointer to the bit pattern, and a bit
+ number: 0 is the least significant bit.
+ <function>set_bit()</function>, <function>clear_bit()</function>
+ and <function>change_bit()</function> set, clear, and flip the
+ given bit. <function>test_and_set_bit()</function>,
+ <function>test_and_clear_bit()</function> and
+ <function>test_and_change_bit()</function> do the same thing,
+ except return true if the bit was previously set; these are
+ particularly useful for very simple locking.
+ </para>
+
+ <para>
+ It is possible to call these operations with bit indices greater
+ than BITS_PER_LONG. The resulting behavior is strange on big-endian
+ platforms though so it is a good idea not to do this.
+ </para>
+
+ <para>
+ Note that the order of bits depends on the architecture, and in
+ particular, the bitfield passed to these operations must be at
+ least as large as a <type>long</type>.
+ </para>
+ </chapter>
+
+ <chapter id="symbols">
+ <title>Symbols</title>
+
+ <para>
+ Within the kernel proper, the normal linking rules apply
+ (ie. unless a symbol is declared to be file scope with the
+ <type>static</type> keyword, it can be used anywhere in the
+ kernel). However, for modules, a special exported symbol table is
+ kept which limits the entry points to the kernel proper. Modules
+ can also export symbols.
+ </para>
+
+ <sect1 id="sym-exportsymbols">
+ <title><function>EXPORT_SYMBOL()</function>
+ <filename class="headerfile">include/linux/module.h</filename></title>
+
+ <para>
+ This is the classic method of exporting a symbol, and it works
+ for both modules and non-modules. In the kernel all these
+ declarations are often bundled into a single file to help
+ genksyms (which searches source files for these declarations).
+ See the comment on genksyms and Makefiles below.
+ </para>
+ </sect1>
+
+ <sect1 id="sym-exportsymbols-gpl">
+ <title><function>EXPORT_SYMBOL_GPL()</function>
+ <filename class="headerfile">include/linux/module.h</filename></title>
+
+ <para>
+ Similar to <function>EXPORT_SYMBOL()</function> except that the
+ symbols exported by <function>EXPORT_SYMBOL_GPL()</function> can
+ only be seen by modules with a
+ <function>MODULE_LICENSE()</function> that specifies a GPL
+ compatible license.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="conventions">
+ <title>Routines and Conventions</title>
+
+ <sect1 id="conventions-doublelinkedlist">
+ <title>Double-linked lists
+ <filename class="headerfile">include/linux/list.h</filename></title>
+
+ <para>
+ There are three sets of linked-list routines in the kernel
+ headers, but this one seems to be winning out (and Linus has
+ used it). If you don't have some particular pressing need for
+ a singly-linked list, it's a good choice. In fact, I don't care
+ whether it's a good choice or not, just use it so we can get
+ rid of the others.
+ </para>
+ </sect1>
+
+ <sect1 id="convention-returns">
+ <title>Return Conventions</title>
+
+ <para>
+ For code called in user context, it's very common to defy C
+ convention, and return <returnvalue>0</returnvalue> for success,
+ and a negative error number
+ (eg. <returnvalue>-EFAULT</returnvalue>) for failure. This can be
+ unintuitive at first, but it's fairly widespread in the networking
+ code, for example.
+ </para>
+
+ <para>
+ The filesystem code uses <function>ERR_PTR()</function>
+
+ (<filename class="headerfile">include/linux/fs.h</filename>) to
+ encode a negative error number into a pointer, and
+ <function>IS_ERR()</function> and <function>PTR_ERR()</function>
+ to get it back out again: avoids a separate pointer parameter for
+ the error number. Icky, but in a good way.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-borkedcompile">
+ <title>Breaking Compilation</title>
+
+ <para>
+ Linus and the other developers sometimes change function or
+ structure names in development kernels; this is not done just to
+ keep everyone on their toes: it reflects a fundamental change
+ (eg. can no longer be called with interrupts on, or does extra
+ checks, or doesn't do checks which were caught before). Usually
+ this is accompanied by a fairly complete note to the linux-kernel
+ mailing list; search the archive. Simply doing a global replace
+ on the file usually makes things <emphasis>worse</emphasis>.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-initialising">
+ <title>Initializing structure members</title>
+
+ <para>
+ The preferred method of initializing structures is to use
+ designated initialisers, as defined by ISO C99, eg:
+ </para>
+ <programlisting>
+static struct block_device_operations opt_fops = {
+ .open = opt_open,
+ .release = opt_release,
+ .ioctl = opt_ioctl,
+ .check_media_change = opt_media_change,
+};
+ </programlisting>
+ <para>
+ This makes it easy to grep for, and makes it clear which
+ structure fields are set. You should do this because it looks
+ cool.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-gnu-extns">
+ <title>GNU Extensions</title>
+
+ <para>
+ GNU Extensions are explicitly allowed in the Linux kernel.
+ Note that some of the more complex ones are not very well
+ supported, due to lack of general use, but the following are
+ considered standard (see the GCC info page section "C
+ Extensions" for more details - Yes, really the info page, the
+ man page is only a short summary of the stuff in info):
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ Inline functions
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Statement expressions (ie. the ({ and }) constructs).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Declaring attributes of a function / variable / type
+ (__attribute__)
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ typeof
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Zero length arrays
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Macro varargs
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Arithmetic on void pointers
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Non-Constant initializers
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Assembler Instructions (not outside arch/ and include/asm/)
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Function names as strings (__FUNCTION__)
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ __builtin_constant_p()
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+ Be wary when using long long in the kernel: the code gcc generates for
+ it is horrible, and worse, division and multiplication do not work
+ on i386 because the GCC runtime functions for them are missing from
+ the kernel environment.
+ </para>
+
+ <!-- FIXME: add a note about ANSI aliasing cleanness -->
+ </sect1>
+
+ <sect1 id="conventions-cplusplus">
+ <title>C++</title>
+
+ <para>
+ Using C++ in the kernel is usually a bad idea, because the
+ kernel does not provide the necessary runtime environment
+ and the include files are not tested for it. It is still
+ possible, but not recommended. If you really want to do
+ this, forget about exceptions at least.
+ </para>
+ </sect1>
+
+ <sect1 id="conventions-ifdef">
+ <title>&num;if</title>
+
+ <para>
+ It is generally considered cleaner to use macros in header files
+ (or at the top of .c files) to abstract away functions rather than
+ using `#if' pre-processor statements throughout the source code.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="submitting">
+ <title>Putting Your Stuff in the Kernel</title>
+
+ <para>
+ In order to get your stuff into shape for official inclusion, or
+ even to make a neat patch, there's administrative work to be
+ done:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ Figure out whose pond you've been pissing in. Look at the top of
+ the source files, inside the <filename>MAINTAINERS</filename>
+ file, and last of all in the <filename>CREDITS</filename> file.
+ You should coordinate with this person to make sure you're not
+ duplicating effort, or trying something that's already been
+ rejected.
+ </para>
+
+ <para>
+ Make sure you put your name and EMail address at the top of
+ any files you create or mangle significantly. This is the
+ first place people will look when they find a bug, or when
+ <emphasis>they</emphasis> want to make a change.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Usually you want a configuration option for your kernel hack.
+ Edit <filename>Config.in</filename> in the appropriate directory
+ (but under <filename>arch/</filename> it's called
+ <filename>config.in</filename>). The Config Language used is not
+ bash, even though it looks like bash; the safe way is to use only
+ the constructs that you already see in
+ <filename>Config.in</filename> files (see
+ <filename>Documentation/kbuild/kconfig-language.txt</filename>).
+ It's good to run "make xconfig" at least once to test (because
+ it's the only one with a static parser).
+ </para>
+
+ <para>
+ Variables which can be Y or N use <type>bool</type> followed by a
+ tagline and the config define name (which must start with
+ CONFIG_). The <type>tristate</type> function is the same, but
+ allows the answer M (which defines
+ <symbol>CONFIG_FOO_MODULE</symbol> in your source, instead of
+ <symbol>CONFIG_FOO</symbol>) if <symbol>CONFIG_MODULES</symbol>
+ is enabled.
+ </para>
+
+ <para>
+ You may well want to make your CONFIG option only visible if
+ <symbol>CONFIG_EXPERIMENTAL</symbol> is enabled: this serves as a
+ warning to users. There are many other fancy things you can do: see
+ the various <filename>Config.in</filename> files for ideas.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Edit the <filename>Makefile</filename>: the CONFIG variables are
+ exported here so you can conditionalize compilation with `ifeq'.
+ If your file exports symbols then add the names to
+ <varname>export-objs</varname> so that genksyms will find them.
+ <caution>
+ <para>
+ There is a restriction on the kernel build system that objects
+ which export symbols must have globally unique names.
+ If your object does not have a globally unique name then the
+ standard fix is to move the
+ <function>EXPORT_SYMBOL()</function> statements to their own
+ object with a unique name.
+ This is why several systems have separate exporting objects,
+ usually suffixed with ksyms.
+ </para>
+ </caution>
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Document your option in Documentation/Configure.help. Mention
+ incompatibilities and issues here. <emphasis> Definitely
+ </emphasis> end your description with <quote> if in doubt, say N
+ </quote> (or, occasionally, `Y'); this is for people who have no
+ idea what you are talking about.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Put yourself in <filename>CREDITS</filename> if you've done
+ something noteworthy, usually beyond a single file (your name
+ should be at the top of the source files anyway).
+ <filename>MAINTAINERS</filename> means you want to be consulted
+ when changes are made to a subsystem, and hear about bugs; it
+ implies a more-than-passing commitment to some part of the code.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Finally, don't forget to read <filename>Documentation/SubmittingPatches</filename>
+ and possibly <filename>Documentation/SubmittingDrivers</filename>.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </chapter>
+
+ <chapter id="cantrips">
+ <title>Kernel Cantrips</title>
+
+ <para>
+ Some favorites from browsing the source. Feel free to add to this
+ list.
+ </para>
+
+ <para>
+ <filename>include/linux/brlock.h:</filename>
+ </para>
+ <programlisting>
+extern inline void br_read_lock (enum brlock_indices idx)
+{
+ /*
+ * This causes a link-time bug message if an
+ * invalid index is used:
+ */
+ if (idx >= __BR_END)
+ __br_lock_usage_bug();
+
+ read_lock(&amp;__brlock_array[smp_processor_id()][idx]);
+}
+ </programlisting>
+
+ <para>
+ <filename>include/linux/fs.h</filename>:
+ </para>
+ <programlisting>
+/*
+ * Kernel pointers have redundant information, so we can use a
+ * scheme where we can return either an error code or a dentry
+ * pointer with the same return value.
+ *
+ * This should be a per-architecture thing, to allow different
+ * error and pointer decisions.
+ */
+ #define ERR_PTR(err) ((void *)((long)(err)))
+ #define PTR_ERR(ptr) ((long)(ptr))
+ #define IS_ERR(ptr) ((unsigned long)(ptr) > (unsigned long)(-1000))
+</programlisting>
+
+ <para>
+ <filename>include/asm-i386/uaccess.h:</filename>
+ </para>
+
+ <programlisting>
+#define copy_to_user(to,from,n) \
+ (__builtin_constant_p(n) ? \
+ __constant_copy_to_user((to),(from),(n)) : \
+ __generic_copy_to_user((to),(from),(n)))
+ </programlisting>
+
+ <para>
+ <filename>arch/sparc/kernel/head.S:</filename>
+ </para>
+
+ <programlisting>
+/*
+ * Sun people can't spell worth damn. "compatability" indeed.
+ * At least we *know* we can't spell, and use a spell-checker.
+ */
+
+/* Uh, actually Linus it is I who cannot spell. Too much murky
+ * Sparc assembly will do this to ya.
+ */
+C_LABEL(cputypvar):
+ .asciz "compatability"
+
+/* Tested on SS-5, SS-10. Probably someone at Sun applied a spell-checker. */
+ .align 4
+C_LABEL(cputypvar_sun4m):
+ .asciz "compatible"
+ </programlisting>
+
+ <para>
+ <filename>arch/sparc/lib/checksum.S:</filename>
+ </para>
+
+ <programlisting>
+ /* Sun, you just can't beat me, you just can't. Stop trying,
+ * give up. I'm serious, I am going to kick the living shit
+ * out of you, game over, lights out.
+ */
+ </programlisting>
+ </chapter>
+
+ <chapter id="credits">
+ <title>Thanks</title>
+
+ <para>
+ Thanks to Andi Kleen for the idea, answering my questions, fixing
+ my mistakes, filling content, etc. Philipp Rumpf for more spelling
+ and clarity fixes, and some excellent non-obvious points. Werner
+ Almesberger for giving me a great summary of
+ <function>disable_irq()</function>, and Jes Sorensen and Andrea
+ Arcangeli added caveats. Michael Elizabeth Chastain for checking
+ and adding to the Configure section. <!-- Rusty insisted on this
+ bit; I didn't do it! --> Telsa Gwynne for teaching me DocBook.
+ </para>
+ </chapter>
+</book>
+
diff --git a/Documentation/DocBook/kernel-locking.tmpl b/Documentation/DocBook/kernel-locking.tmpl
new file mode 100644
index 000000000000..90dc2de8e0af
--- /dev/null
+++ b/Documentation/DocBook/kernel-locking.tmpl
@@ -0,0 +1,2088 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="LKLockingGuide">
+ <bookinfo>
+ <title>Unreliable Guide To Locking</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Rusty</firstname>
+ <surname>Russell</surname>
+ <affiliation>
+ <address>
+ <email>rusty@rustcorp.com.au</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2003</year>
+ <holder>Rusty Russell</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ Welcome to Rusty's Remarkably Unreliable Guide to Kernel
+ Locking issues. This document describes the locking systems in
+ the Linux Kernel in 2.6.
+ </para>
+ <para>
+ With the wide availability of HyperThreading, and <firstterm
+ linkend="gloss-preemption">preemption </firstterm> in the Linux
+ Kernel, everyone hacking on the kernel needs to know the
+ fundamentals of concurrency and locking for
+ <firstterm linkend="gloss-smp"><acronym>SMP</acronym></firstterm>.
+ </para>
+ </chapter>
+
+ <chapter id="races">
+ <title>The Problem With Concurrency</title>
+ <para>
+ (Skip this if you know what a Race Condition is).
+ </para>
+ <para>
+ In a normal program, you can increment a counter like so:
+ </para>
+ <programlisting>
+ very_important_count++;
+ </programlisting>
+
+ <para>
+ This is what you would expect to happen:
+ </para>
+
+ <table>
+ <title>Expected Results</title>
+
+ <tgroup cols="2" align="left">
+
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (6)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (7)</entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (7)</entry>
+ </row>
+ </tbody>
+
+ </tgroup>
+ </table>
+
+ <para>
+ This is what might happen:
+ </para>
+
+ <table>
+ <title>Possible Results</title>
+
+ <tgroup cols="2" align="left">
+ <thead>
+ <row>
+ <entry>Instance 1</entry>
+ <entry>Instance 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>read very_important_count (5)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>read very_important_count (5)</entry>
+ </row>
+ <row>
+ <entry>add 1 (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>add 1 (6)</entry>
+ </row>
+ <row>
+ <entry>write very_important_count (6)</entry>
+ <entry></entry>
+ </row>
+ <row>
+ <entry></entry>
+ <entry>write very_important_count (6)</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <sect1 id="race-condition">
+ <title>Race Conditions and Critical Regions</title>
+ <para>
+ This overlap, where the result depends on the
+ relative timing of multiple tasks, is called a <firstterm>race condition</firstterm>.
+ The piece of code containing the concurrency issue is called a
+ <firstterm>critical region</firstterm>. And especially since Linux started running
+ on SMP machines, they became one of the major issues in kernel
+ design and implementation.
+ </para>
+ <para>
+ Preemption can have the same effect, even if there is only one
+ CPU: by preempting one task during the critical region, we have
+ exactly the same race condition. In this case the thread which
+ preempts might run the critical region itself.
+ </para>
+ <para>
+ The solution is to recognize when these simultaneous accesses
+ occur, and use locks to make sure that only one instance can
+ enter the critical region at any time. There are many
+ friendly primitives in the Linux kernel to help you do this.
+ And then there are the unfriendly primitives, but I'll pretend
+ they don't exist.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="locks">
+ <title>Locking in the Linux Kernel</title>
+
+ <para>
+ If I could give you one piece of advice: never sleep with anyone
+ crazier than yourself. But if I had to give you advice on
+ locking: <emphasis>keep it simple</emphasis>.
+ </para>
+
+ <para>
+ Be reluctant to introduce new locks.
+ </para>
+
+ <para>
+ Strangely enough, this last one is the exact reverse of my advice when
+ you <emphasis>have</emphasis> slept with someone crazier than yourself.
+ And you should think about getting a big dog.
+ </para>
+
+ <sect1 id="lock-intro">
+ <title>Two Main Types of Kernel Locks: Spinlocks and Semaphores</title>
+
+ <para>
+ There are two main types of kernel locks. The fundamental type
+ is the spinlock
+ (<filename class="headerfile">include/asm/spinlock.h</filename>),
+ which is a very simple single-holder lock: if you can't get the
+ spinlock, you keep trying (spinning) until you can. Spinlocks are
+ very small and fast, and can be used anywhere.
+ </para>
+ <para>
+ The second type is a semaphore
+ (<filename class="headerfile">include/asm/semaphore.h</filename>): it
+ can have more than one holder at any time (the number decided at
+ initialization time), although it is most commonly used as a
+ single-holder lock (a mutex). If you can't get a semaphore,
+ your task will put itself on the queue, and be woken up when the
+ semaphore is released. This means the CPU will do something
+ else while you are waiting, but there are many cases when you
+ simply can't sleep (see <xref linkend="sleeping-things"/>), and so
+ have to use a spinlock instead.
+ </para>
+ <para>
+ Neither type of lock is recursive: see
+ <xref linkend="deadlock"/>.
+ </para>
+ </sect1>
+
+ <sect1 id="uniprocessor">
+ <title>Locks and Uniprocessor Kernels</title>
+
+ <para>
+ For kernels compiled without <symbol>CONFIG_SMP</symbol>, and
+ without <symbol>CONFIG_PREEMPT</symbol> spinlocks do not exist at
+ all. This is an excellent design decision: when no-one else can
+ run at the same time, there is no reason to have a lock.
+ </para>
+
+ <para>
+ If the kernel is compiled without <symbol>CONFIG_SMP</symbol>,
+ but <symbol>CONFIG_PREEMPT</symbol> is set, then spinlocks
+ simply disable preemption, which is sufficient to prevent any
+ races. For most purposes, we can think of preemption as
+ equivalent to SMP, and not worry about it separately.
+ </para>
+
+ <para>
+ You should always test your locking code with <symbol>CONFIG_SMP</symbol>
+ and <symbol>CONFIG_PREEMPT</symbol> enabled, even if you don't have an SMP test box, because it
+ will still catch some kinds of locking bugs.
+ </para>
+
+ <para>
+ Semaphores still exist, because they are required for
+ synchronization between <firstterm linkend="gloss-usercontext">user
+ contexts</firstterm>, as we will see below.
+ </para>
+ </sect1>
+
+ <sect1 id="usercontextlocking">
+ <title>Locking Only In User Context</title>
+
+ <para>
+ If you have a data structure which is only ever accessed from
+ user context, then you can use a simple semaphore
+ (<filename class="headerfile">include/asm/semaphore.h</filename>) to protect it. This
+ is the most trivial case: you initialize the semaphore to the number
+ of resources available (usually 1), and call
+ <function>down_interruptible()</function> to grab the semaphore, and
+ <function>up()</function> to release it. There is also a
+ <function>down()</function>, which should be avoided, because it
+ will not return if a signal is received.
+ </para>
+
+ <para>
+ Example: <filename>linux/net/core/netfilter.c</filename> allows
+ registration of new <function>setsockopt()</function> and
+ <function>getsockopt()</function> calls, with
+ <function>nf_register_sockopt()</function>. Registration and
+ de-registration are only done on module load and unload (and boot
+ time, where there is no concurrency), and the list of registrations
+ is only consulted for an unknown <function>setsockopt()</function>
+ or <function>getsockopt()</function> system call. The
+ <varname>nf_sockopt_mutex</varname> is perfect to protect this,
+ especially since the setsockopt and getsockopt calls may well
+ sleep.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-bh">
+ <title>Locking Between User Context and Softirqs</title>
+
+ <para>
+ If a <firstterm linkend="gloss-softirq">softirq</firstterm> shares
+ data with user context, you have two problems. Firstly, the current
+ user context can be interrupted by a softirq, and secondly, the
+ critical region could be entered from another CPU. This is where
+ <function>spin_lock_bh()</function>
+ (<filename class="headerfile">include/linux/spinlock.h</filename>) is
+ used. It disables softirqs on that CPU, then grabs the lock.
+ <function>spin_unlock_bh()</function> does the reverse. (The
+ '_bh' suffix is a historical reference to "Bottom Halves", the
+ old name for software interrupts. It should really be
+ called spin_lock_softirq() in a perfect world).
+ </para>
+
+ <para>
+ Note that you can also use <function>spin_lock_irq()</function>
+ or <function>spin_lock_irqsave()</function> here, which stop
+ hardware interrupts as well: see <xref linkend="hardirq-context"/>.
+ </para>
+
+ <para>
+ This works perfectly for <firstterm linkend="gloss-up"><acronym>UP
+ </acronym></firstterm> as well: the spin lock vanishes, and this macro
+ simply becomes <function>local_bh_disable()</function>
+ (<filename class="headerfile">include/linux/interrupt.h</filename>), which
+ protects you from the softirq being run.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-tasklet">
+ <title>Locking Between User Context and Tasklets</title>
+
+ <para>
+ This is exactly the same as above, because <firstterm
+ linkend="gloss-tasklet">tasklets</firstterm> are actually run
+ from a softirq.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-user-timers">
+ <title>Locking Between User Context and Timers</title>
+
+ <para>
+ This, too, is exactly the same as above, because <firstterm
+ linkend="gloss-timers">timers</firstterm> are actually run from
+ a softirq. From a locking point of view, tasklets and timers
+ are identical.
+ </para>
+ </sect1>
+
+ <sect1 id="lock-tasklets">
+ <title>Locking Between Tasklets/Timers</title>
+
+ <para>
+ Sometimes a tasklet or timer might want to share data with
+ another tasklet or timer.
+ </para>
+
+ <sect2 id="lock-tasklets-same">
+ <title>The Same Tasklet/Timer</title>
+ <para>
+ Since a tasklet is never run on two CPUs at once, you don't
+ need to worry about your tasklet being reentrant (running
+ twice at once), even on SMP.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-tasklets-different">
+ <title>Different Tasklets/Timers</title>
+ <para>
+ If another tasklet/timer wants
+ to share data with your tasklet or timer, you will both need to use
+ <function>spin_lock()</function> and
+ <function>spin_unlock()</function> calls.
+ <function>spin_lock_bh()</function> is
+ unnecessary here, as you are already in a tasklet, and
+ none will be run on the same CPU.
+ </para>
+ </sect2>
+ </sect1>
+
+ <sect1 id="lock-softirqs">
+ <title>Locking Between Softirqs</title>
+
+ <para>
+ Often a softirq might
+ want to share data with itself or a tasklet/timer.
+ </para>
+
+ <sect2 id="lock-softirqs-same">
+ <title>The Same Softirq</title>
+
+ <para>
+ The same softirq can run on the other CPUs: you can use a
+ per-CPU array (see <xref linkend="per-cpu"/>) for better
+ performance. If you're going so far as to use a softirq,
+ you probably care about scalable performance enough
+ to justify the extra complexity.
+ </para>
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data.
+ </para>
+ </sect2>
+
+ <sect2 id="lock-softirqs-different">
+ <title>Different Softirqs</title>
+
+ <para>
+ You'll need to use <function>spin_lock()</function> and
+ <function>spin_unlock()</function> for shared data, whether it
+ be a timer, tasklet, different softirq or the same or another
+ softirq: any of them could be running on a different CPU.
+ </para>
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="hardirq-context">
+ <title>Hard IRQ Context</title>
+
+ <para>
+ Hardware interrupts usually communicate with a
+ tasklet or softirq. Frequently this involves putting work in a
+ queue, which the softirq will take out.
+ </para>
+
+ <sect1 id="hardirq-softirq">
+ <title>Locking Between Hard IRQ and Softirqs/Tasklets</title>
+
+ <para>
+ If a hardware irq handler shares data with a softirq, you have
+ two concerns. Firstly, the softirq processing can be
+ interrupted by a hardware interrupt, and secondly, the
+ critical region could be entered by a hardware interrupt on
+ another CPU. This is where <function>spin_lock_irq()</function> is
+ used. It is defined to disable interrupts on that cpu, then grab
+ the lock. <function>spin_unlock_irq()</function> does the reverse.
+ </para>
+
+ <para>
+ The irq handler does not need to use
+ <function>spin_lock_irq()</function>, because the softirq cannot
+ run while the irq handler is running: it can use
+ <function>spin_lock()</function>, which is slightly faster. The
+ only exception would be if a different hardware irq handler uses
+ the same lock: <function>spin_lock_irq()</function> will stop
+ that from interrupting us.
+ </para>
+
+ <para>
+ This works perfectly for UP as well: the spin lock vanishes,
+ and this macro simply becomes <function>local_irq_disable()</function>
+ (<filename class="headerfile">include/asm/smp.h</filename>), which
+ protects you from the softirq/tasklet/BH being run.
+ </para>
+
+ <para>
+ <function>spin_lock_irqsave()</function>
+ (<filename>include/linux/spinlock.h</filename>) is a variant
+ which saves whether interrupts were on or off in a flags word,
+ which is passed to <function>spin_unlock_irqrestore()</function>. This
+ means that the same code can be used inside a hard irq handler (where
+ interrupts are already off) and in softirqs (where the irq
+ disabling is required).
+ </para>
+
+ <para>
+ Note that softirqs (and hence tasklets and timers) are run on
+ return from hardware interrupts, so
+ <function>spin_lock_irq()</function> also stops these. In that
+ sense, <function>spin_lock_irqsave()</function> is the most
+ general and powerful locking function.
+ </para>
+
+ </sect1>
+ <sect1 id="hardirq-hardirq">
+ <title>Locking Between Two Hard IRQ Handlers</title>
+ <para>
+ It is rare to have to share data between two IRQ handlers, but
+ if you do, <function>spin_lock_irqsave()</function> should be
+ used: it is architecture-specific whether all interrupts are
+ disabled inside irq handlers themselves.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="cheatsheet">
+ <title>Cheat Sheet For Locking</title>
+ <para>
+ Pete Zaitcev gives the following summary:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ If you are in a process context (any syscall) and want to
+ lock other processes out, use a semaphore. You can take a semaphore
+ and sleep (<function>copy_from_user*()</function> or
+ <function>kmalloc(x,GFP_KERNEL)</function>).
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Otherwise (== data can be touched in an interrupt), use
+ <function>spin_lock_irqsave()</function> and
+ <function>spin_unlock_irqrestore()</function>.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Avoid holding spinlock for more than 5 lines of code and
+ across any function call (except accessors like
+ <function>readb</function>).
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <sect1 id="minimum-lock-reqirements">
+ <title>Table of Minimum Requirements</title>
+
+ <para> The following table lists the <emphasis>minimum</emphasis>
+ locking requirements between various contexts. In some cases,
+ the same context can only be running on one CPU at a time, so
+ no locking is required for that context (eg. a particular
+ thread can only run on one CPU at a time, but if it needs
+ to share data with another thread, locking is required).
+ </para>
+ <para>
+ Remember the advice above: you can always use
+ <function>spin_lock_irqsave()</function>, which is a superset
+ of all other spinlock primitives.
+ </para>
+ <table>
+<title>Table of Locking Requirements</title>
+<tgroup cols="11">
+<tbody>
+<row>
+<entry></entry>
+<entry>IRQ Handler A</entry>
+<entry>IRQ Handler B</entry>
+<entry>Softirq A</entry>
+<entry>Softirq B</entry>
+<entry>Tasklet A</entry>
+<entry>Tasklet B</entry>
+<entry>Timer A</entry>
+<entry>Timer B</entry>
+<entry>User Context A</entry>
+<entry>User Context B</entry>
+</row>
+
+<row>
+<entry>IRQ Handler A</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>IRQ Handler B</entry>
+<entry>spin_lock_irqsave</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Softirq A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+</row>
+
+<row>
+<entry>Softirq B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+</row>
+
+<row>
+<entry>Tasklet A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Tasklet B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>Timer B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>spin_lock</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context A</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>None</entry>
+</row>
+
+<row>
+<entry>User Context B</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_irq</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>spin_lock_bh</entry>
+<entry>down_interruptible</entry>
+<entry>None</entry>
+</row>
+
+</tbody>
+</tgroup>
+</table>
+</sect1>
+</chapter>
+
+ <chapter id="Examples">
+ <title>Common Examples</title>
+ <para>
+Let's step through a simple example: a cache of number to name
+mappings. The cache keeps a count of how often each of the objects is
+used, and when it gets full, throws out the least used one.
+
+ </para>
+
+ <sect1 id="examples-usercontext">
+ <title>All In User Context</title>
+ <para>
+For our first example, we assume that all operations are in user
+context (ie. from system calls), so we can sleep. This means we can
+use a semaphore to protect the cache and all the objects within
+it. Here's the code:
+ </para>
+
+ <programlisting>
+#include &lt;linux/list.h&gt;
+#include &lt;linux/slab.h&gt;
+#include &lt;linux/string.h&gt;
+#include &lt;asm/semaphore.h&gt;
+#include &lt;asm/errno.h&gt;
+
+struct object
+{
+ struct list_head list;
+ int id;
+ char name[32];
+ int popularity;
+};
+
+/* Protects the cache, cache_num, and the objects within it */
+static DECLARE_MUTEX(cache_lock);
+static LIST_HEAD(cache);
+static unsigned int cache_num = 0;
+#define MAX_CACHE_SIZE 10
+
+/* Must be holding cache_lock */
+static struct object *__cache_find(int id)
+{
+ struct object *i;
+
+ list_for_each_entry(i, &amp;cache, list)
+ if (i-&gt;id == id) {
+ i-&gt;popularity++;
+ return i;
+ }
+ return NULL;
+}
+
+/* Must be holding cache_lock */
+static void __cache_delete(struct object *obj)
+{
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
+ kfree(obj);
+ cache_num--;
+}
+
+/* Must be holding cache_lock */
+static void __cache_add(struct object *obj)
+{
+ list_add(&amp;obj-&gt;list, &amp;cache);
+ if (++cache_num > MAX_CACHE_SIZE) {
+ struct object *i, *outcast = NULL;
+ list_for_each_entry(i, &amp;cache, list) {
+ if (!outcast || i-&gt;popularity &lt; outcast-&gt;popularity)
+ outcast = i;
+ }
+ __cache_delete(outcast);
+ }
+}
+
+int cache_add(int id, const char *name)
+{
+ struct object *obj;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+ down(&amp;cache_lock);
+ __cache_add(obj);
+ up(&amp;cache_lock);
+ return 0;
+}
+
+void cache_delete(int id)
+{
+ down(&amp;cache_lock);
+ __cache_delete(__cache_find(id));
+ up(&amp;cache_lock);
+}
+
+int cache_find(int id, char *name)
+{
+ struct object *obj;
+ int ret = -ENOENT;
+
+ down(&amp;cache_lock);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+ up(&amp;cache_lock);
+ return ret;
+}
+</programlisting>
+
+ <para>
+Note that we always make sure we have the cache_lock when we add,
+delete, or look up the cache: both the cache infrastructure itself and
+the contents of the objects are protected by the lock. In this case
+it's easy, since we copy the data for the user, and never let them
+access the objects directly.
+ </para>
+ <para>
+There is a slight (and common) optimization here: in
+<function>cache_add</function> we set up the fields of the object
+before grabbing the lock. This is safe, as no-one else can access it
+until we put it in cache.
+ </para>
+ </sect1>
+
+ <sect1 id="examples-interrupt">
+ <title>Accessing From Interrupt Context</title>
+ <para>
+Now consider the case where <function>cache_find</function> can be
+called from interrupt context: either a hardware interrupt or a
+softirq. An example would be a timer which deletes objects from the
+cache.
+ </para>
+ <para>
+The change is shown below, in standard patch format: the
+<symbol>-</symbol> are lines which are taken away, and the
+<symbol>+</symbol> are lines which are added.
+ </para>
+<programlisting>
+--- cache.c.usercontext 2003-12-09 13:58:54.000000000 +1100
++++ cache.c.interrupt 2003-12-09 14:07:49.000000000 +1100
+@@ -12,7 +12,7 @@
+ int popularity;
+ };
+
+-static DECLARE_MUTEX(cache_lock);
++static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+ static LIST_HEAD(cache);
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+@@ -55,6 +55,7 @@
+ int cache_add(int id, const char *name)
+ {
+ struct object *obj;
++ unsigned long flags;
+
+ if ((obj = kmalloc(sizeof(*obj), GFP_KERNEL)) == NULL)
+ return -ENOMEM;
+@@ -63,30 +64,33 @@
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+
+- down(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return 0;
+ }
+
+ void cache_delete(int id)
+ {
+- down(&amp;cache_lock);
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_delete(__cache_find(id));
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+ int cache_find(int id, char *name)
+ {
+ struct object *obj;
+ int ret = -ENOENT;
++ unsigned long flags;
+
+- down(&amp;cache_lock);
++ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+ if (obj) {
+ ret = 0;
+ strcpy(name, obj-&gt;name);
+ }
+- up(&amp;cache_lock);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return ret;
+ }
+</programlisting>
+
+ <para>
+Note that the <function>spin_lock_irqsave</function> will turn off
+interrupts if they are on, otherwise does nothing (if we are already
+in an interrupt handler), hence these functions are safe to call from
+any context.
+ </para>
+ <para>
+Unfortunately, <function>cache_add</function> calls
+<function>kmalloc</function> with the <symbol>GFP_KERNEL</symbol>
+flag, which is only legal in user context. I have assumed that
+<function>cache_add</function> is still only called in user context,
+otherwise this should become a parameter to
+<function>cache_add</function>.
+ </para>
+ </sect1>
+ <sect1 id="examples-refcnt">
+ <title>Exposing Objects Outside This File</title>
+ <para>
+If our objects contained more information, it might not be sufficient
+to copy the information in and out: other parts of the code might want
+to keep pointers to these objects, for example, rather than looking up
+the id every time. This produces two problems.
+ </para>
+ <para>
+The first problem is that we use the <symbol>cache_lock</symbol> to
+protect objects: we'd need to make this non-static so the rest of the
+code can use it. This makes locking trickier, as it is no longer all
+in one place.
+ </para>
+ <para>
+The second problem is the lifetime problem: if another structure keeps
+a pointer to an object, it presumably expects that pointer to remain
+valid. Unfortunately, this is only guaranteed while you hold the
+lock, otherwise someone might call <function>cache_delete</function>
+and even worse, add another object, re-using the same address.
+ </para>
+ <para>
+As there is only one lock, you can't hold it forever: no-one else would
+get any work done.
+ </para>
+ <para>
+The solution to this problem is to use a reference count: everyone who
+has a pointer to the object increases it when they first get the
+object, and drops the reference count when they're finished with it.
+Whoever drops it to zero knows it is unused, and can actually delete it.
+ </para>
+ <para>
+Here is the code:
+ </para>
+
+<programlisting>
+--- cache.c.interrupt 2003-12-09 14:25:43.000000000 +1100
++++ cache.c.refcnt 2003-12-09 14:33:05.000000000 +1100
+@@ -7,6 +7,7 @@
+ struct object
+ {
+ struct list_head list;
++ unsigned int refcnt;
+ int id;
+ char name[32];
+ int popularity;
+@@ -17,6 +18,35 @@
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+
++static void __object_put(struct object *obj)
++{
++ if (--obj-&gt;refcnt == 0)
++ kfree(obj);
++}
++
++static void __object_get(struct object *obj)
++{
++ obj-&gt;refcnt++;
++}
++
++void object_put(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_put(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
++void object_get(struct object *obj)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&amp;cache_lock, flags);
++ __object_get(obj);
++ spin_unlock_irqrestore(&amp;cache_lock, flags);
++}
++
+ /* Must be holding cache_lock */
+ static struct object *__cache_find(int id)
+ {
+@@ -35,6 +65,7 @@
+ {
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
++ __object_put(obj);
+ cache_num--;
+ }
+
+@@ -63,6 +94,7 @@
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
++ obj-&gt;refcnt = 1; /* The cache holds a reference */
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -79,18 +111,15 @@
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ }
+
+-int cache_find(int id, char *name)
++struct object *cache_find(int id)
+ {
+ struct object *obj;
+- int ret = -ENOENT;
+ unsigned long flags;
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+- if (obj) {
+- ret = 0;
+- strcpy(name, obj-&gt;name);
+- }
++ if (obj)
++ __object_get(obj);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+- return ret;
++ return obj;
+ }
+</programlisting>
+
+<para>
+We encapsulate the reference counting in the standard 'get' and 'put'
+functions. Now we can return the object itself from
+<function>cache_find</function> which has the advantage that the user
+can now sleep holding the object (eg. to
+<function>copy_to_user</function> the name to userspace).
+</para>
+<para>
+The other point to note is that I said a reference should be held for
+every pointer to the object: thus the reference count is 1 when first
+inserted into the cache. In some versions the framework does not hold
+a reference count, but they are more complicated.
+</para>
+
+ <sect2 id="examples-refcnt-atomic">
+ <title>Using Atomic Operations For The Reference Count</title>
+<para>
+In practice, <type>atomic_t</type> would usually be used for
+<structfield>refcnt</structfield>. There are a number of atomic
+operations defined in
+
+<filename class="headerfile">include/asm/atomic.h</filename>: these are
+guaranteed to be seen atomically from all CPUs in the system, so no
+lock is required. In this case, it is simpler than using spinlocks,
+although for anything non-trivial using spinlocks is clearer. The
+<function>atomic_inc</function> and
+<function>atomic_dec_and_test</function> are used instead of the
+standard increment and decrement operators, and the lock is no longer
+used to protect the reference count itself.
+</para>
+
+<programlisting>
+--- cache.c.refcnt 2003-12-09 15:00:35.000000000 +1100
++++ cache.c.refcnt-atomic 2003-12-11 15:49:42.000000000 +1100
+@@ -7,7 +7,7 @@
+ struct object
+ {
+ struct list_head list;
+- unsigned int refcnt;
++ atomic_t refcnt;
+ int id;
+ char name[32];
+ int popularity;
+@@ -18,33 +18,15 @@
+ static unsigned int cache_num = 0;
+ #define MAX_CACHE_SIZE 10
+
+-static void __object_put(struct object *obj)
+-{
+- if (--obj-&gt;refcnt == 0)
+- kfree(obj);
+-}
+-
+-static void __object_get(struct object *obj)
+-{
+- obj-&gt;refcnt++;
+-}
+-
+ void object_put(struct object *obj)
+ {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&amp;cache_lock, flags);
+- __object_put(obj);
+- spin_unlock_irqrestore(&amp;cache_lock, flags);
++ if (atomic_dec_and_test(&amp;obj-&gt;refcnt))
++ kfree(obj);
+ }
+
+ void object_get(struct object *obj)
+ {
+- unsigned long flags;
+-
+- spin_lock_irqsave(&amp;cache_lock, flags);
+- __object_get(obj);
+- spin_unlock_irqrestore(&amp;cache_lock, flags);
++ atomic_inc(&amp;obj-&gt;refcnt);
+ }
+
+ /* Must be holding cache_lock */
+@@ -65,7 +47,7 @@
+ {
+ BUG_ON(!obj);
+ list_del(&amp;obj-&gt;list);
+- __object_put(obj);
++ object_put(obj);
+ cache_num--;
+ }
+
+@@ -94,7 +76,7 @@
+ strlcpy(obj-&gt;name, name, sizeof(obj-&gt;name));
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+- obj-&gt;refcnt = 1; /* The cache holds a reference */
++ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -119,7 +101,7 @@
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ obj = __cache_find(id);
+ if (obj)
+- __object_get(obj);
++ object_get(obj);
+ spin_unlock_irqrestore(&amp;cache_lock, flags);
+ return obj;
+ }
+</programlisting>
+</sect2>
+</sect1>
+
+ <sect1 id="examples-lock-per-obj">
+ <title>Protecting The Objects Themselves</title>
+ <para>
+In these examples, we assumed that the objects (except the reference
+counts) never changed once they are created. If we wanted to allow
+the name to change, there are three possibilities:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+You can make <symbol>cache_lock</symbol> non-static, and tell people
+to grab that lock before changing the name in any object.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+You can provide a <function>cache_obj_rename</function> which grabs
+this lock and changes the name for the caller, and tell everyone to
+use that function.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+You can make the <symbol>cache_lock</symbol> protect only the cache
+itself, and use another lock to protect the name.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+ <para>
+Theoretically, you can make the locks as fine-grained as one lock for
+every field, for every object. In practice, the most common variants
+are:
+</para>
+ <itemizedlist>
+ <listitem>
+ <para>
+One lock which protects the infrastructure (the <symbol>cache</symbol>
+list in this example) and all the objects. This is what we have done
+so far.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+One lock which protects the infrastructure (including the list
+pointers inside the objects), and one lock inside the object which
+protects the rest of that object.
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+Multiple locks to protect the infrastructure (eg. one lock per hash
+chain), possibly with a separate per-object lock.
+ </para>
+ </listitem>
+ </itemizedlist>
+
+<para>
+Here is the "lock-per-object" implementation:
+</para>
+<programlisting>
+--- cache.c.refcnt-atomic 2003-12-11 15:50:54.000000000 +1100
++++ cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
+@@ -6,11 +6,17 @@
+
+ struct object
+ {
++ /* These two protected by cache_lock. */
+ struct list_head list;
++ int popularity;
++
+ atomic_t refcnt;
++
++ /* Doesn't change once created. */
+ int id;
++
++ spinlock_t lock; /* Protects the name */
+ char name[32];
+- int popularity;
+ };
+
+ static spinlock_t cache_lock = SPIN_LOCK_UNLOCKED;
+@@ -77,6 +84,7 @@
+ obj-&gt;id = id;
+ obj-&gt;popularity = 0;
+ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
++ spin_lock_init(&amp;obj-&gt;lock);
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+</programlisting>
+
+<para>
+Note that I decided that the <structfield>popularity</structfield>
+count should be protected by the <symbol>cache_lock</symbol> rather
+than the per-object lock: this is because it (like the
+<structname>struct list_head</structname> inside the object) is
+logically part of the infrastructure. This way, I don't need to grab
+the lock of every object in <function>__cache_add</function> when
+seeking the least popular.
+</para>
+
+<para>
+I also decided that the <structfield>id</structfield> member is
+unchangeable, so I don't need to grab each object lock in
+<function>__cache_find()</function> to examine the
+<structfield>id</structfield>: the object lock is only used by a
+caller who wants to read or write the <structfield>name</structfield>
+field.
+</para>
+
+<para>
+Note also that I added a comment describing what data was protected by
+which locks. This is extremely important, as it describes the runtime
+behavior of the code, and can be hard to gain from just reading. And
+as Alan Cox says, <quote>Lock data, not code</quote>.
+</para>
+</sect1>
+</chapter>
+
+ <chapter id="common-problems">
+ <title>Common Problems</title>
+ <sect1 id="deadlock">
+ <title>Deadlock: Simple and Advanced</title>
+
+ <para>
+ There is a coding bug where a piece of code tries to grab a
+ spinlock twice: it will spin forever, waiting for the lock to
+ be released (spinlocks, rwlocks and semaphores are not
+ recursive in Linux). This is trivial to diagnose: not a
+ stay-up-five-nights-talk-to-fluffy-code-bunnies kind of
+ problem.
+ </para>
+
+ <para>
+ For a slightly more complex case, imagine you have a region
+ shared by a softirq and user context. If you use a
+ <function>spin_lock()</function> call to protect it, it is
+ possible that the user context will be interrupted by the softirq
+ while it holds the lock, and the softirq will then spin
+ forever trying to get the same lock.
+ </para>
+
+ <para>
+ Both of these are called deadlock, and as shown above, it can
+ occur even with a single CPU (although not on UP compiles,
+ since spinlocks vanish on kernel compiles with
+ <symbol>CONFIG_SMP</symbol>=n. You'll still get data corruption
+ in the second example).
+ </para>
+
+ <para>
+ This complete lockup is easy to diagnose: on SMP boxes the
+ watchdog timer or compiling with <symbol>DEBUG_SPINLOCKS</symbol> set
+ (<filename>include/linux/spinlock.h</filename>) will show this up
+ immediately when it happens.
+ </para>
+
+ <para>
+ A more complex problem is the so-called 'deadly embrace',
+ involving two or more locks. Say you have a hash table: each
+ entry in the table is a spinlock, and a chain of hashed
+ objects. Inside a softirq handler, you sometimes want to
+ alter an object from one place in the hash to another: you
+ grab the spinlock of the old hash chain and the spinlock of
+ the new hash chain, and delete the object from the old one,
+ and insert it in the new one.
+ </para>
+
+ <para>
+ There are two problems here. First, if your code ever
+ tries to move the object to the same chain, it will deadlock
+ with itself as it tries to lock it twice. Secondly, if the
+ same softirq on another CPU is trying to move another object
+ in the reverse direction, the following could happen:
+ </para>
+
+ <table>
+ <title>Consequences</title>
+
+ <tgroup cols="2" align="left">
+
+ <thead>
+ <row>
+ <entry>CPU 1</entry>
+ <entry>CPU 2</entry>
+ </row>
+ </thead>
+
+ <tbody>
+ <row>
+ <entry>Grab lock A -&gt; OK</entry>
+ <entry>Grab lock B -&gt; OK</entry>
+ </row>
+ <row>
+ <entry>Grab lock B -&gt; spin</entry>
+ <entry>Grab lock A -&gt; spin</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <para>
+ The two CPUs will spin forever, waiting for the other to give up
+ their lock. It will look, smell, and feel like a crash.
+ </para>
+ </sect1>
+
+ <sect1 id="techs-deadlock-prevent">
+ <title>Preventing Deadlock</title>
+
+ <para>
+ Textbooks will tell you that if you always lock in the same
+ order, you will never get this kind of deadlock. Practice
+ will tell you that this approach doesn't scale: when I
+ create a new lock, I don't understand enough of the kernel
+ to figure out where in the 5000 lock hierarchy it will fit.
+ </para>
+
+ <para>
+ The best locks are encapsulated: they never get exposed in
+ headers, and are never held around calls to non-trivial
+ functions outside the same file. You can read through this
+ code and see that it will never deadlock, because it never
+ tries to grab another lock while it has that one. People
+ using your code don't even need to know you are using a
+ lock.
+ </para>
+
+ <para>
+ A classic problem here is when you provide callbacks or
+ hooks: if you call these with the lock held, you risk simple
+ deadlock, or a deadly embrace (who knows what the callback
+ will do?). Remember, the other programmers are out to get
+ you, so don't do this.
+ </para>
+
+ <sect2 id="techs-deadlock-overprevent">
+ <title>Overzealous Prevention Of Deadlocks</title>
+
+ <para>
+ Deadlocks are problematic, but not as bad as data
+ corruption. Code which grabs a read lock, searches a list,
+ fails to find what it wants, drops the read lock, grabs a
+ write lock and inserts the object has a race condition.
+ </para>
+
+ <para>
+ If you don't see why, please stay the fuck away from my code.
+ </para>
+ </sect2>
+ </sect1>
+
+ <sect1 id="racing-timers">
+ <title>Racing Timers: A Kernel Pastime</title>
+
+ <para>
+ Timers can produce their own special problems with races.
+ Consider a collection of objects (list, hash, etc) where each
+ object has a timer which is due to destroy it.
+ </para>
+
+ <para>
+ If you want to destroy the entire collection (say on module
+ removal), you might do the following:
+ </para>
+
+ <programlisting>
+ /* THIS CODE BAD BAD BAD BAD: IF IT WAS ANY WORSE IT WOULD USE
+ HUNGARIAN NOTATION */
+ spin_lock_bh(&amp;list_lock);
+
+ while (list) {
+ struct foo *next = list-&gt;next;
+ del_timer(&amp;list-&gt;timer);
+ kfree(list);
+ list = next;
+ }
+
+ spin_unlock_bh(&amp;list_lock);
+ </programlisting>
+
+ <para>
+ Sooner or later, this will crash on SMP, because a timer can
+ have just gone off before the <function>spin_lock_bh()</function>,
+ and it will only get the lock after we
+ <function>spin_unlock_bh()</function>, and then try to free
+ the element (which has already been freed!).
+ </para>
+
+ <para>
+ This can be avoided by checking the result of
+ <function>del_timer()</function>: if it returns
+ <returnvalue>1</returnvalue>, the timer has been deleted.
+ If <returnvalue>0</returnvalue>, it means (in this
+ case) that it is currently running, so we can do:
+ </para>
+
+ <programlisting>
+ retry:
+ spin_lock_bh(&amp;list_lock);
+
+ while (list) {
+ struct foo *next = list-&gt;next;
+ if (!del_timer(&amp;list-&gt;timer)) {
+ /* Give timer a chance to delete this */
+ spin_unlock_bh(&amp;list_lock);
+ goto retry;
+ }
+ kfree(list);
+ list = next;
+ }
+
+ spin_unlock_bh(&amp;list_lock);
+ </programlisting>
+
+ <para>
+ Another common problem is deleting timers which restart
+ themselves (by calling <function>add_timer()</function> at the end
+ of their timer function). Because this is a fairly common case
+ which is prone to races, you should use <function>del_timer_sync()</function>
+ (<filename class="headerfile">include/linux/timer.h</filename>)
+ to handle this case. It returns the number of times the timer
+ had to be deleted before we finally stopped it from adding itself back
+ in.
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="Efficiency">
+ <title>Locking Speed</title>
+
+ <para>
+There are three main things to worry about when considering speed of
+some code which does locking. First is concurrency: how many things
+are going to be waiting while someone else is holding a lock. Second
+is the time taken to actually acquire and release an uncontended lock.
+Third is using fewer, or smarter locks. I'm assuming that the lock is
+used fairly often: otherwise, you wouldn't be concerned about
+efficiency.
+</para>
+ <para>
+Concurrency depends on how long the lock is usually held: you should
+hold the lock for as long as needed, but no longer. In the cache
+example, we always create the object without the lock held, and then
+grab the lock only when we are ready to insert it in the list.
+</para>
+ <para>
+Acquisition times depend on how much damage the lock operations do to
+the pipeline (pipeline stalls) and how likely it is that this CPU was
+the last one to grab the lock (ie. is the lock cache-hot for this
+CPU): on a machine with more CPUs, this likelihood drops fast.
+Consider a 700MHz Intel Pentium III: an instruction takes about 0.7ns,
+an atomic increment takes about 58ns, a lock which is cache-hot on
+this CPU takes 160ns, and a cacheline transfer from another CPU takes
+an additional 170 to 360ns. (These figures from Paul McKenney's
+<ulink url="http://www.linuxjournal.com/article.php?sid=6993"> Linux
+Journal RCU article</ulink>).
+</para>
+ <para>
+These two aims conflict: holding a lock for a short time might be done
+by splitting locks into parts (such as in our final per-object-lock
+example), but this increases the number of lock acquisitions, and the
+results are often slower than having a single lock. This is another
+reason to advocate locking simplicity.
+</para>
+ <para>
+The third concern is addressed below: there are some methods to reduce
+the amount of locking which needs to be done.
+</para>
+
+ <sect1 id="efficiency-rwlocks">
+ <title>Read/Write Lock Variants</title>
+
+ <para>
+ Both spinlocks and semaphores have read/write variants:
+ <type>rwlock_t</type> and <structname>struct rw_semaphore</structname>.
+ These divide users into two classes: the readers and the writers. If
+ you are only reading the data, you can get a read lock, but to write to
+ the data you need the write lock. Many people can hold a read lock,
+ but a writer must be sole holder.
+ </para>
+
+ <para>
+ If your code divides neatly along reader/writer lines (as our
+ cache code does), and the lock is held by readers for
+ significant lengths of time, using these locks can help. They
+ are slightly slower than the normal locks though, so in practice
+ <type>rwlock_t</type> is not usually worthwhile.
+ </para>
+ </sect1>
+
+ <sect1 id="efficiency-read-copy-update">
+ <title>Avoiding Locks: Read Copy Update</title>
+
+ <para>
+ There is a special method of read/write locking called Read Copy
+ Update. Using RCU, the readers can avoid taking a lock
+ altogether: as we expect our cache to be read more often than
+ updated (otherwise the cache is a waste of time), it is a
+ candidate for this optimization.
+ </para>
+
+ <para>
+ How do we get rid of read locks? Getting rid of read locks
+ means that writers may be changing the list underneath the
+ readers. That is actually quite simple: we can read a linked
+ list while an element is being added if the writer adds the
+ element very carefully. For example, adding
+ <symbol>new</symbol> to a single linked list called
+ <symbol>list</symbol>:
+ </para>
+
+ <programlisting>
+ new-&gt;next = list-&gt;next;
+ wmb();
+ list-&gt;next = new;
+ </programlisting>
+
+ <para>
+ The <function>wmb()</function> is a write memory barrier. It
+ ensures that the first operation (setting the new element's
+ <symbol>next</symbol> pointer) is complete and will be seen by
+ all CPUs before the second operation (putting the new
+ element into the list) is seen. This is important, since modern
+ compilers and modern CPUs can both reorder instructions unless
+ told otherwise: we want a reader to either not see the new
+ element at all, or see the new element with the
+ <symbol>next</symbol> pointer correctly pointing at the rest of
+ the list.
+ </para>
+ <para>
+ Fortunately, there is a function to do this for standard
+ <structname>struct list_head</structname> lists:
+ <function>list_add_rcu()</function>
+ (<filename>include/linux/list.h</filename>).
+ </para>
+ <para>
+ Removing an element from the list is even simpler: we replace
+ the pointer to the old element with a pointer to its successor,
+ and readers will either see it, or skip over it.
+ </para>
+ <programlisting>
+ list-&gt;next = old-&gt;next;
+ </programlisting>
+ <para>
+ There is <function>list_del_rcu()</function>
+ (<filename>include/linux/list.h</filename>) which does this (the
+ normal version poisons the old object, which we don't want).
+ </para>
+ <para>
+ The reader must also be careful: some CPUs can look through the
+ <symbol>next</symbol> pointer to start reading the contents of
+ the next element early, but don't realize that the pre-fetched
+ contents are wrong when the <symbol>next</symbol> pointer changes
+ underneath them. Once again, there is a
+ <function>list_for_each_entry_rcu()</function>
+ (<filename>include/linux/list.h</filename>) to help you. Of
+ course, writers can just use
+ <function>list_for_each_entry()</function>, since there cannot
+ be two simultaneous writers.
+ </para>
+ <para>
+ Our final dilemma is this: when can we actually destroy the
+ removed element? Remember, a reader might be stepping through
+ this element in the list right now: if we free this element and
+ the <symbol>next</symbol> pointer changes, the reader will jump
+ off into garbage and crash. We need to wait until we know that
+ all the readers who were traversing the list when we deleted the
+ element are finished. We use <function>call_rcu()</function> to
+ register a callback which will actually destroy the object once
+ the readers are finished.
+ </para>
+ <para>
+ But how does Read Copy Update know when the readers are
+ finished? The method is this: firstly, the readers always
+ traverse the list inside
+ <function>rcu_read_lock()</function>/<function>rcu_read_unlock()</function>
+ pairs: these simply disable preemption so the reader won't go to
+ sleep while reading the list.
+ </para>
+ <para>
+ RCU then waits until every other CPU has slept at least once:
+ since readers cannot sleep, we know that any readers which were
+ traversing the list during the deletion are finished, and the
+ callback is triggered. The real Read Copy Update code is a
+ little more optimized than this, but this is the fundamental
+ idea.
+ </para>
+
+<programlisting>
+--- cache.c.perobjectlock 2003-12-11 17:15:03.000000000 +1100
++++ cache.c.rcupdate 2003-12-11 17:55:14.000000000 +1100
+@@ -1,15 +1,18 @@
+ #include &lt;linux/list.h&gt;
+ #include &lt;linux/slab.h&gt;
+ #include &lt;linux/string.h&gt;
++#include &lt;linux/rcupdate.h&gt;
+ #include &lt;asm/semaphore.h&gt;
+ #include &lt;asm/errno.h&gt;
+
+ struct object
+ {
+- /* These two protected by cache_lock. */
++ /* This is protected by RCU */
+ struct list_head list;
+ int popularity;
+
++ struct rcu_head rcu;
++
+ atomic_t refcnt;
+
+ /* Doesn't change once created. */
+@@ -40,7 +43,7 @@
+ {
+ struct object *i;
+
+- list_for_each_entry(i, &amp;cache, list) {
++ list_for_each_entry_rcu(i, &amp;cache, list) {
+ if (i-&gt;id == id) {
+ i-&gt;popularity++;
+ return i;
+@@ -49,19 +52,25 @@
+ return NULL;
+ }
+
++/* Final discard done once we know no readers are looking. */
++static void cache_delete_rcu(void *arg)
++{
++ object_put(arg);
++}
++
+ /* Must be holding cache_lock */
+ static void __cache_delete(struct object *obj)
+ {
+ BUG_ON(!obj);
+- list_del(&amp;obj-&gt;list);
+- object_put(obj);
++ list_del_rcu(&amp;obj-&gt;list);
+ cache_num--;
++ call_rcu(&amp;obj-&gt;rcu, cache_delete_rcu, obj);
+ }
+
+ /* Must be holding cache_lock */
+ static void __cache_add(struct object *obj)
+ {
+- list_add(&amp;obj-&gt;list, &amp;cache);
++ list_add_rcu(&amp;obj-&gt;list, &amp;cache);
+ if (++cache_num &gt; MAX_CACHE_SIZE) {
+ struct object *i, *outcast = NULL;
+ list_for_each_entry(i, &amp;cache, list) {
+@@ -85,6 +94,7 @@
+ obj-&gt;popularity = 0;
+ atomic_set(&amp;obj-&gt;refcnt, 1); /* The cache holds a reference */
+ spin_lock_init(&amp;obj-&gt;lock);
++ INIT_RCU_HEAD(&amp;obj-&gt;rcu);
+
+ spin_lock_irqsave(&amp;cache_lock, flags);
+ __cache_add(obj);
+@@ -104,12 +114,11 @@
+ struct object *cache_find(int id)
+ {
+ struct object *obj;
+- unsigned long flags;
+
+- spin_lock_irqsave(&amp;cache_lock, flags);
++ rcu_read_lock();
+ obj = __cache_find(id);
+ if (obj)
+ object_get(obj);
+- spin_unlock_irqrestore(&amp;cache_lock, flags);
++ rcu_read_unlock();
+ return obj;
+ }
+</programlisting>
+
+<para>
+Note that the reader will alter the
+<structfield>popularity</structfield> member in
+<function>__cache_find()</function>, and now it doesn't hold a lock.
+One solution would be to make it an <type>atomic_t</type>, but for
+this usage, we don't really care about races: an approximate result is
+good enough, so I didn't change it.
+</para>
+
+<para>
+The result is that <function>cache_find()</function> requires no
+synchronization with any other functions, so is almost as fast on SMP
+as it would be on UP.
+</para>
+
+<para>
+There is a further optimization possible here: remember our original
+cache code, where there were no reference counts and the caller simply
+held the lock whenever using the object? This is still possible: if
+you hold the lock, no one can delete the object, so you don't need to
+get and put the reference count.
+</para>
+
+<para>
+Now, because the 'read lock' in RCU is simply disabling preemption, a
+caller which always has preemption disabled between calling
+<function>cache_find()</function> and
+<function>object_put()</function> does not need to actually get and
+put the reference count: we could expose
+<function>__cache_find()</function> by making it non-static, and
+such callers could simply call that.
+</para>
+<para>
+The benefit here is that the reference count is not written to: the
+object is not altered in any way, which is much faster on SMP
+machines due to caching.
+</para>
+ </sect1>
+
+ <sect1 id="per-cpu">
+ <title>Per-CPU Data</title>
+
+ <para>
+ Another technique for avoiding locking which is used fairly
+ widely is to duplicate information for each CPU. For example,
+ if you wanted to keep a count of a common condition, you could
+ use a spin lock and a single counter. Nice and simple.
+ </para>
+
+ <para>
+ If that was too slow (it's usually not, but if you've got a
+ really big machine to test on and can show that it is), you
+ could instead use a counter for each CPU, then none of them need
+ an exclusive lock. See <function>DEFINE_PER_CPU()</function>,
+ <function>get_cpu_var()</function> and
+ <function>put_cpu_var()</function>
+ (<filename class="headerfile">include/linux/percpu.h</filename>).
+ </para>
+
+ <para>
+ Of particular use for simple per-cpu counters is the
+ <type>local_t</type> type, and the
+ <function>cpu_local_inc()</function> and related functions,
+ which are more efficient than simple code on some architectures
+ (<filename class="headerfile">include/asm/local.h</filename>).
+ </para>
+
+ <para>
+ Note that there is no simple, reliable way of getting an exact
+ value of such a counter, without introducing more locks. This
+ is not a problem for some uses.
+ </para>
+ </sect1>
+
+ <sect1 id="mostly-hardirq">
+ <title>Data Which Is Mostly Used By An IRQ Handler</title>
+
+ <para>
+ If data is always accessed from within the same IRQ handler, you
+ don't need a lock at all: the kernel already guarantees that the
+ irq handler will not run simultaneously on multiple CPUs.
+ </para>
+ <para>
+ Manfred Spraul points out that you can still do this, even if
+ the data is very occasionally accessed in user context or
+ softirqs/tasklets. The irq handler doesn't use a lock, and
+ all other accesses are done as so:
+ </para>
+
+<programlisting>
+ spin_lock(&amp;lock);
+ disable_irq(irq);
+ ...
+ enable_irq(irq);
+ spin_unlock(&amp;lock);
+</programlisting>
+ <para>
+ The <function>disable_irq()</function> prevents the irq handler
+ from running (and waits for it to finish if it's currently
+ running on other CPUs). The spinlock prevents any other
+ accesses happening at the same time. Naturally, this is slower
+ than just a <function>spin_lock_irq()</function> call, so it
+ only makes sense if this type of access happens extremely
+ rarely.
+ </para>
+ </sect1>
+ </chapter>
+
+ <chapter id="sleeping-things">
+ <title>What Functions Are Safe To Call From Interrupts?</title>
+
+ <para>
+ Many functions in the kernel sleep (ie. call schedule())
+ directly or indirectly: you can never call them while holding a
+ spinlock, or with preemption disabled. This also means you need
+ to be in user context: calling them from an interrupt is illegal.
+ </para>
+
+ <sect1 id="sleeping">
+ <title>Some Functions Which Sleep</title>
+
+ <para>
+ The most common ones are listed below, but you usually have to
+ read the code to find out if other calls are safe. If everyone
+ else who calls it can sleep, you probably need to be able to
+ sleep, too. In particular, registration and deregistration
+ functions usually expect to be called from user context, and can
+ sleep.
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ Accesses to
+ <firstterm linkend="gloss-userspace">userspace</firstterm>:
+ </para>
+ <itemizedlist>
+ <listitem>
+ <para>
+ <function>copy_from_user()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>copy_to_user()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>get_user()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>put_user()</function>
+ </para>
+ </listitem>
+ </itemizedlist>
+ </listitem>
+
+ <listitem>
+ <para>
+ <function>kmalloc(GFP_KERNEL)</function>
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ <function>down_interruptible()</function> and
+ <function>down()</function>
+ </para>
+ <para>
+ There is a <function>down_trylock()</function> which can be
+ used inside interrupt context, as it will not sleep.
+ <function>up()</function> will also never sleep.
+ </para>
+ </listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1 id="dont-sleep">
+ <title>Some Functions Which Don't Sleep</title>
+
+ <para>
+ Some functions are safe to call from any context, or holding
+ almost any lock.
+ </para>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ <function>printk()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>kfree()</function>
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ <function>add_timer()</function> and <function>del_timer()</function>
+ </para>
+ </listitem>
+ </itemizedlist>
+ </sect1>
+ </chapter>
+
+ <chapter id="references">
+ <title>Further reading</title>
+
+ <itemizedlist>
+ <listitem>
+ <para>
+ <filename>Documentation/spinlocks.txt</filename>:
+ Linus Torvalds' spinlocking tutorial in the kernel sources.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Unix Systems for Modern Architectures: Symmetric
+ Multiprocessing and Caching for Kernel Programmers:
+ </para>
+
+ <para>
+ Curt Schimmel's very good introduction to kernel level
+ locking (not written for Linux, but nearly everything
+ applies). The book is expensive, but really worth every
+ penny to understand SMP locking. [ISBN: 0201633388]
+ </para>
+ </listitem>
+ </itemizedlist>
+ </chapter>
+
+ <chapter id="thanks">
+ <title>Thanks</title>
+
+ <para>
+ Thanks to Telsa Gwynne for DocBooking, neatening and adding
+ style.
+ </para>
+
+ <para>
+ Thanks to Martin Pool, Philipp Rumpf, Stephen Rothwell, Paul
+ Mackerras, Ruedi Aschwanden, Alan Cox, Manfred Spraul, Tim
+ Waugh, Pete Zaitcev, James Morris, Robert Love, Paul McKenney,
+ John Ashby for proofreading, correcting, flaming, commenting.
+ </para>
+
+ <para>
+ Thanks to the cabal for having no influence on this document.
+ </para>
+ </chapter>
+
+ <glossary id="glossary">
+ <title>Glossary</title>
+
+ <glossentry id="gloss-preemption">
+ <glossterm>preemption</glossterm>
+ <glossdef>
+ <para>
+ Prior to 2.5, or when <symbol>CONFIG_PREEMPT</symbol> is
+ unset, processes in user context inside the kernel would not
+ preempt each other (ie. you had that CPU until you gave it up,
+ except for interrupts). With the addition of
+ <symbol>CONFIG_PREEMPT</symbol> in 2.5.4, this changed: when
+ in user context, higher priority tasks can "cut in": spinlocks
+ were changed to disable preemption, even on UP.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-bh">
+ <glossterm>bh</glossterm>
+ <glossdef>
+ <para>
+ Bottom Half: for historical reasons, functions with
+ '_bh' in them often now refer to any software interrupt, e.g.
+ <function>spin_lock_bh()</function> blocks any software interrupt
+ on the current CPU. Bottom halves are deprecated, and will
+ eventually be replaced by tasklets. Only one bottom half will be
+ running at any time.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-hwinterrupt">
+ <glossterm>Hardware Interrupt / Hardware IRQ</glossterm>
+ <glossdef>
+ <para>
+ Hardware interrupt request. <function>in_irq()</function> returns
+ <returnvalue>true</returnvalue> in a hardware interrupt handler.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-interruptcontext">
+ <glossterm>Interrupt Context</glossterm>
+ <glossdef>
+ <para>
+ Not user context: processing a hardware irq or software irq.
+ Indicated by the <function>in_interrupt()</function> macro
+ returning <returnvalue>true</returnvalue>.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-smp">
+ <glossterm><acronym>SMP</acronym></glossterm>
+ <glossdef>
+ <para>
+ Symmetric Multi-Processor: kernels compiled for multiple-CPU
+ machines. (CONFIG_SMP=y).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-softirq">
+ <glossterm>Software Interrupt / softirq</glossterm>
+ <glossdef>
+ <para>
+ Software interrupt handler. <function>in_irq()</function> returns
+ <returnvalue>false</returnvalue>; <function>in_softirq()</function>
+ returns <returnvalue>true</returnvalue>. Tasklets and softirqs
+ both fall into the category of 'software interrupts'.
+ </para>
+ <para>
+ Strictly speaking a softirq is one of up to 32 enumerated software
+ interrupts which can run on multiple CPUs at once.
+ Sometimes used to refer to tasklets as
+ well (ie. all software interrupts).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-tasklet">
+ <glossterm>tasklet</glossterm>
+ <glossdef>
+ <para>
+ A dynamically-registrable software interrupt,
+ which is guaranteed to only run on one CPU at a time.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-timers">
+ <glossterm>timer</glossterm>
+ <glossdef>
+ <para>
+ A dynamically-registrable software interrupt, which is run at
+ (or close to) a given time. When running, it is just like a
+ tasklet (in fact, they are called from the TIMER_SOFTIRQ).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-up">
+ <glossterm><acronym>UP</acronym></glossterm>
+ <glossdef>
+ <para>
+ Uni-Processor: Non-SMP. (CONFIG_SMP=n).
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-usercontext">
+ <glossterm>User Context</glossterm>
+ <glossdef>
+ <para>
+ The kernel executing on behalf of a particular process (ie. a
+ system call or trap) or kernel thread. (You can tell which
+ process with the <symbol>current</symbol> macro.) Not to
+ be confused with userspace. Can be interrupted by software or
+ hardware interrupts.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ <glossentry id="gloss-userspace">
+ <glossterm>Userspace</glossterm>
+ <glossdef>
+ <para>
+ A process executing its own code outside the kernel.
+ </para>
+ </glossdef>
+ </glossentry>
+
+ </glossary>
+</book>
+
diff --git a/Documentation/DocBook/libata.tmpl b/Documentation/DocBook/libata.tmpl
new file mode 100644
index 000000000000..cf2fce7707da
--- /dev/null
+++ b/Documentation/DocBook/libata.tmpl
@@ -0,0 +1,282 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="libataDevGuide">
+ <bookinfo>
+ <title>libATA Developer's Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Jeff</firstname>
+ <surname>Garzik</surname>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2003</year>
+ <holder>Jeff Garzik</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ The contents of this file are subject to the Open
+ Software License version 1.1 that can be found at
+ <ulink url="http://www.opensource.org/licenses/osl-1.1.txt">http://www.opensource.org/licenses/osl-1.1.txt</ulink> and is included herein
+ by reference.
+ </para>
+
+ <para>
+ Alternatively, the contents of this file may be used under the terms
+ of the GNU General Public License version 2 (the "GPL") as distributed
+ in the kernel source COPYING file, in which case the provisions of
+ the GPL are applicable instead of the above. If you wish to allow
+ the use of your version of this file only under the terms of the
+ GPL and not to allow others to use your version of this file under
+ the OSL, indicate your decision by deleting the provisions above and
+ replace them with the notice and other provisions required by the GPL.
+ If you do not delete the provisions above, a recipient may use your
+ version of this file under either the OSL or the GPL.
+ </para>
+
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="libataThanks">
+ <title>Thanks</title>
+ <para>
+ The bulk of the ATA knowledge comes thanks to long conversations with
+ Andre Hedrick (www.linux-ide.org).
+ </para>
+ <para>
+ Thanks to Alan Cox for pointing out similarities
+ between SATA and SCSI, and in general for motivation to hack on
+ libata.
+ </para>
+ <para>
+ libata's device detection
+ method, ata_pio_devchk, and in general all the early probing was
+ based on extensive study of Hale Landis's probe/reset code in his
+ ATADRVR driver (www.ata-atapi.com).
+ </para>
+ </chapter>
+
+ <chapter id="libataDriverApi">
+ <title>libata Driver API</title>
+ <sect1>
+ <title>struct ata_port_operations</title>
+
+ <programlisting>
+void (*port_disable) (struct ata_port *);
+ </programlisting>
+
+ <para>
+ Called from ata_bus_probe() and ata_bus_reset() error paths,
+ as well as when unregistering from the SCSI module (rmmod, hot
+ unplug).
+ </para>
+
+ <programlisting>
+void (*dev_config) (struct ata_port *, struct ata_device *);
+ </programlisting>
+
+ <para>
+ Called after IDENTIFY [PACKET] DEVICE is issued to each device
+ found. Typically used to apply device-specific fixups prior to
+ issue of SET FEATURES - XFER MODE, and prior to operation.
+ </para>
+
+ <programlisting>
+void (*set_piomode) (struct ata_port *, struct ata_device *);
+void (*set_dmamode) (struct ata_port *, struct ata_device *);
+void (*post_set_mode) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+ Hooks called prior to the issue of SET FEATURES - XFER MODE
+ command. dev->pio_mode is guaranteed to be valid when
+ ->set_piomode() is called, and dev->dma_mode is guaranteed to be
+ valid when ->set_dmamode() is called. ->post_set_mode() is
+ called unconditionally, after the SET FEATURES - XFER MODE
+ command completes successfully.
+ </para>
+
+ <para>
+ ->set_piomode() is always called (if present), but
+ ->set_dmamode() is only called if DMA is possible.
+ </para>
+
+ <programlisting>
+void (*tf_load) (struct ata_port *ap, struct ata_taskfile *tf);
+void (*tf_read) (struct ata_port *ap, struct ata_taskfile *tf);
+ </programlisting>
+
+ <para>
+ ->tf_load() is called to load the given taskfile into hardware
+ registers / DMA buffers. ->tf_read() is called to read the
+ hardware registers / DMA buffers, to obtain the current set of
+ taskfile register values.
+ </para>
+
+ <programlisting>
+void (*exec_command)(struct ata_port *ap, struct ata_taskfile *tf);
+ </programlisting>
+
+ <para>
+ Causes an ATA command, previously loaded with
+ ->tf_load(), to be initiated in hardware.
+ </para>
+
+ <programlisting>
+u8 (*check_status)(struct ata_port *ap);
+ </programlisting>
+
+ <para>
+ Reads the Status ATA shadow register from hardware. On some
+ hardware, this has the side effect of clearing the interrupt
+ condition.
+ </para>
+
+ <programlisting>
+void (*dev_select)(struct ata_port *ap, unsigned int device);
+ </programlisting>
+
+ <para>
+ Issues the low-level hardware command(s) that causes one of N
+ hardware devices to be considered 'selected' (active and
+ available for use) on the ATA bus.
+ </para>
+
+ <programlisting>
+void (*phy_reset) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+ The very first step in the probe phase. Actions typically vary
+ depending on the bus type. After waking up the device and probing
+ for device presence (PATA and SATA), typically a soft reset
+ (SRST) will be performed. Drivers typically use the helper
+ functions ata_bus_reset() or sata_phy_reset() for this hook.
+ </para>
+
+ <programlisting>
+void (*bmdma_setup) (struct ata_queued_cmd *qc);
+void (*bmdma_start) (struct ata_queued_cmd *qc);
+ </programlisting>
+
+ <para>
+ When setting up an IDE BMDMA transaction, these hooks arm
+ (->bmdma_setup) and fire (->bmdma_start) the hardware's DMA
+ engine.
+ </para>
+
+ <programlisting>
+void (*qc_prep) (struct ata_queued_cmd *qc);
+int (*qc_issue) (struct ata_queued_cmd *qc);
+ </programlisting>
+
+ <para>
+ Higher-level hooks, these two hooks can potentially supersede
+ several of the above taskfile/DMA engine hooks. ->qc_prep is
+ called after the buffers have been DMA-mapped, and is typically
+ used to populate the hardware's DMA scatter-gather table.
+ Most drivers use the standard ata_qc_prep() helper function, but
+ more advanced drivers roll their own.
+ </para>
+ <para>
+ ->qc_issue is used to make a command active, once the hardware
+ and S/G tables have been prepared. IDE BMDMA drivers use the
+ helper function ata_qc_issue_prot() for taskfile protocol-based
+ dispatch. More advanced drivers roll their own ->qc_issue
+ implementation, using this as the "issue new ATA command to
+ hardware" hook.
+ </para>
+
+ <programlisting>
+void (*eng_timeout) (struct ata_port *ap);
+ </programlisting>
+
+ <para>
+ This is a high level error handling function, called from the
+ error handling thread, when a command times out.
+ </para>
+
+ <programlisting>
+irqreturn_t (*irq_handler)(int, void *, struct pt_regs *);
+void (*irq_clear) (struct ata_port *);
+ </programlisting>
+
+ <para>
+ ->irq_handler is the interrupt handling routine registered with
+ the system, by libata. ->irq_clear is called during probe just
+ before the interrupt handler is registered, to be sure hardware
+ is quiet.
+ </para>
+
+ <programlisting>
+u32 (*scr_read) (struct ata_port *ap, unsigned int sc_reg);
+void (*scr_write) (struct ata_port *ap, unsigned int sc_reg,
+ u32 val);
+ </programlisting>
+
+ <para>
+ Read and write standard SATA phy registers. Currently only used
+ if ->phy_reset hook called the sata_phy_reset() helper function.
+ </para>
+
+ <programlisting>
+int (*port_start) (struct ata_port *ap);
+void (*port_stop) (struct ata_port *ap);
+void (*host_stop) (struct ata_host_set *host_set);
+ </programlisting>
+
+ <para>
+ ->port_start() is called just after the data structures for each
+ port are initialized. Typically this is used to alloc per-port
+ DMA buffers / tables / rings, enable DMA engines, and similar
+ tasks.
+ </para>
+ <para>
+ ->host_stop() is called when the rmmod or hot unplug process
+ begins. The hook must stop all hardware interrupts, DMA
+ engines, etc.
+ </para>
+ <para>
+ ->port_stop() is called after ->host_stop(). Its sole function
+ is to release DMA/memory resources, now that they are no longer
+ actively being used.
+ </para>
+
+ </sect1>
+ </chapter>
+
+ <chapter id="libataExt">
+ <title>libata Library</title>
+!Edrivers/scsi/libata-core.c
+ </chapter>
+
+ <chapter id="libataInt">
+ <title>libata Core Internals</title>
+!Idrivers/scsi/libata-core.c
+ </chapter>
+
+ <chapter id="libataScsiInt">
+ <title>libata SCSI translation/emulation</title>
+!Edrivers/scsi/libata-scsi.c
+!Idrivers/scsi/libata-scsi.c
+ </chapter>
+
+ <chapter id="PiixInt">
+ <title>ata_piix Internals</title>
+!Idrivers/scsi/ata_piix.c
+ </chapter>
+
+ <chapter id="SILInt">
+ <title>sata_sil Internals</title>
+!Idrivers/scsi/sata_sil.c
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/librs.tmpl b/Documentation/DocBook/librs.tmpl
new file mode 100644
index 000000000000..3ff39bafc00e
--- /dev/null
+++ b/Documentation/DocBook/librs.tmpl
@@ -0,0 +1,289 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Reed-Solomon-Library-Guide">
+ <bookinfo>
+ <title>Reed-Solomon Library Programming Interface</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Thomas</firstname>
+ <surname>Gleixner</surname>
+ <affiliation>
+ <address>
+ <email>tglx@linutronix.de</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2004</year>
+ <holder>Thomas Gleixner</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The generic Reed-Solomon Library provides encoding, decoding
+ and error correction functions.
+ </para>
+ <para>
+ Reed-Solomon codes are used in communication and storage
+ applications to ensure data integrity.
+ </para>
+ <para>
+ This documentation is provided for developers who want to utilize
+ the functions provided by the library.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="usage">
+ <title>Usage</title>
+ <para>
+ This chapter provides examples of how to use the library.
+ </para>
+ <sect1>
+ <title>Initializing</title>
+ <para>
+ The init function init_rs returns a pointer to a
+ rs decoder structure, which holds the necessary
+ information for encoding, decoding and error correction
+ with the given polynomial. It either uses an existing
+ matching decoder or creates a new one. On creation all
+ the lookup tables for fast en/decoding are created.
+ The function may take a while, so make sure not to
+ call it in critical code paths.
+ </para>
+ <programlisting>
+/* the Reed Solomon control structure */
+static struct rs_control *rs_decoder;
+
+/* Symbolsize is 10 (bits)
+ * Primitive polynomial is x^10+x^3+1
+ * first consecutive root is 0
+ * primitive element to generate roots = 1
+ * generator polynomial degree (number of roots) = 6
+ */
+rs_decoder = init_rs (10, 0x409, 0, 1, 6);
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Encoding</title>
+ <para>
+ The encoder calculates the Reed-Solomon code over
+ the given data length and stores the result in
+ the parity buffer. Note that the parity buffer must
+ be initialized before calling the encoder.
+ </para>
+ <para>
+ The expanded data can be inverted on the fly by
+ providing a non zero inversion mask. The expanded data is
+ XOR'ed with the mask. This is used e.g. for FLASH
+ ECC, where the all 0xFF is inverted to an all 0x00.
+ The Reed-Solomon code for all 0x00 is all 0x00. The
+ code is inverted before storing to FLASH so it is 0xFF
+ too. This prevents reads from an erased FLASH from
+ resulting in ECC errors.
+ </para>
+ <para>
+ The databytes are expanded to the given symbol size
+ on the fly. There is no support for encoding continuous
+ bitstreams with a symbol size != 8 at the moment. If
+ it is necessary it should not be a big deal to implement
+ such functionality.
+ </para>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6];
+/* Initialize the parity buffer */
+memset(par, 0, sizeof(par));
+/* Encode 512 byte in data8. Store parity in buffer par */
+encode_rs8 (rs_decoder, data8, 512, par, 0);
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Decoding</title>
+ <para>
+ The decoder calculates the syndrome over
+ the given data length and the received parity symbols
+ and corrects errors in the data.
+ </para>
+ <para>
+ If a syndrome is available from a hardware decoder
+ then the syndrome calculation is skipped.
+ </para>
+ <para>
+ The correction of the data buffer can be suppressed
+ by providing a correction pattern buffer and an error
+ location buffer to the decoder. The decoder stores the
+ calculated error location and the correction bitmask
+ in the given buffers. This is useful for hardware
+ decoders which use a weird bit ordering scheme.
+ </para>
+ <para>
+ The databytes are expanded to the given symbol size
+ on the fly. There is no support for decoding continuous
+ bitstreams with a symbolsize != 8 at the moment. If
+ it is necessary it should not be a big deal to implement
+ such functionality.
+ </para>
+
+ <sect2>
+ <title>
+ Decoding with syndrome calculation, direct data correction
+ </title>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6];
+uint8_t data8[512];
+int numerr;
+/* Receive data */
+.....
+/* Receive parity */
+.....
+/* Decode 512 byte in data8.*/
+numerr = decode_rs8 (rs_decoder, data8, par, 512, NULL, 0, NULL, 0, NULL);
+ </programlisting>
+ </sect2>
+
+ <sect2>
+ <title>
+ Decoding with syndrome given by hardware decoder, direct data correction
+ </title>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6], syn[6];
+uint8_t data8[512];
+int numerr;
+/* Receive data */
+.....
+/* Receive parity */
+.....
+/* Get syndrome from hardware decoder */
+.....
+/* Decode 512 byte in data8.*/
+numerr = decode_rs8 (rs_decoder, data8, par, 512, syn, 0, NULL, 0, NULL);
+ </programlisting>
+ </sect2>
+
+ <sect2>
+ <title>
+ Decoding with syndrome given by hardware decoder, no direct data correction.
+ </title>
+ <para>
+ Note: It's not necessary to give data and received parity to the decoder.
+ </para>
+ <programlisting>
+/* Parity buffer. Size = number of roots */
+uint16_t par[6], syn[6], corr[8];
+uint8_t data8[512];
+int i, numerr, errpos[8];
+/* Receive data */
+.....
+/* Receive parity */
+.....
+/* Get syndrome from hardware decoder */
+.....
+/* Decode 512 byte in data8.*/
+numerr = decode_rs8 (rs_decoder, NULL, NULL, 512, syn, 0, errpos, 0, corr);
+for (i = 0; i &lt; numerr; i++) {
+ do_error_correction_in_your_buffer(errpos[i], corr[i]);
+}
+ </programlisting>
+ </sect2>
+ </sect1>
+ <sect1>
+ <title>Cleanup</title>
+ <para>
+ The function free_rs frees the allocated resources,
+ if the caller is the last user of the decoder.
+ </para>
+ <programlisting>
+/* Release resources */
+free_rs(rs_decoder);
+ </programlisting>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="structs">
+ <title>Structures</title>
+ <para>
+ This chapter contains the autogenerated documentation of the structures which are
+ used in the Reed-Solomon Library and are relevant for a developer.
+ </para>
+!Iinclude/linux/rslib.h
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the Reed-Solomon functions
+ which are exported.
+ </para>
+!Elib/reed_solomon/reed_solomon.c
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The library code for encoding and decoding was written by Phil Karn.
+ </para>
+ <programlisting>
+ Copyright 2002, Phil Karn, KA9Q
+ May be used under the terms of the GNU General Public License (GPL)
+ </programlisting>
+ <para>
+ The wrapper functions and interfaces are written by Thomas Gleixner
+ </para>
+ <para>
+ Many users have provided bugfixes, improvements and helping hands for testing.
+ Thanks a lot.
+ </para>
+ <para>
+ The following people have contributed to this document:
+ </para>
+ <para>
+ Thomas Gleixner <email>tglx@linutronix.de</email>
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/lsm.tmpl b/Documentation/DocBook/lsm.tmpl
new file mode 100644
index 000000000000..f63822195871
--- /dev/null
+++ b/Documentation/DocBook/lsm.tmpl
@@ -0,0 +1,265 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<article class="whitepaper" id="LinuxSecurityModule" lang="en">
+ <articleinfo>
+ <title>Linux Security Modules: General Security Hooks for Linux</title>
+ <authorgroup>
+ <author>
+ <firstname>Stephen</firstname>
+ <surname>Smalley</surname>
+ <affiliation>
+ <orgname>NAI Labs</orgname>
+ <address><email>ssmalley@nai.com</email></address>
+ </affiliation>
+ </author>
+ <author>
+ <firstname>Timothy</firstname>
+ <surname>Fraser</surname>
+ <affiliation>
+ <orgname>NAI Labs</orgname>
+ <address><email>tfraser@nai.com</email></address>
+ </affiliation>
+ </author>
+ <author>
+ <firstname>Chris</firstname>
+ <surname>Vance</surname>
+ <affiliation>
+ <orgname>NAI Labs</orgname>
+ <address><email>cvance@nai.com</email></address>
+ </affiliation>
+ </author>
+ </authorgroup>
+ </articleinfo>
+
+<sect1><title>Introduction</title>
+
+<para>
+In March 2001, the National Security Agency (NSA) gave a presentation
+about Security-Enhanced Linux (SELinux) at the 2.5 Linux Kernel
+Summit. SELinux is an implementation of flexible and fine-grained
+nondiscretionary access controls in the Linux kernel, originally
+implemented as its own particular kernel patch. Several other
+security projects (e.g. RSBAC, Medusa) have also developed flexible
+access control architectures for the Linux kernel, and various
+projects have developed particular access control models for Linux
+(e.g. LIDS, DTE, SubDomain). Each project has developed and
+maintained its own kernel patch to support its security needs.
+</para>
+
+<para>
+In response to the NSA presentation, Linus Torvalds made a set of
+remarks that described a security framework he would be willing to
+consider for inclusion in the mainstream Linux kernel. He described a
+general framework that would provide a set of security hooks to
+control operations on kernel objects and a set of opaque security
+fields in kernel data structures for maintaining security attributes.
+This framework could then be used by loadable kernel modules to
+implement any desired model of security. Linus also suggested the
+possibility of migrating the Linux capabilities code into such a
+module.
+</para>
+
+<para>
+The Linux Security Modules (LSM) project was started by WireX to
+develop such a framework. LSM is a joint development effort by
+several security projects, including Immunix, SELinux, SGI and Janus,
+and several individuals, including Greg Kroah-Hartman and James
+Morris, to develop a Linux kernel patch that implements this
+framework. The patch is currently tracking the 2.4 series and is
+targeted for integration into the 2.5 development series. This
+technical report provides an overview of the framework and the example
+capabilities security module provided by the LSM kernel patch.
+</para>
+
+</sect1>
+
+<sect1 id="framework"><title>LSM Framework</title>
+
+<para>
+The LSM kernel patch provides a general kernel framework to support
+security modules. In particular, the LSM framework is primarily
+focused on supporting access control modules, although future
+development is likely to address other security needs such as
+auditing. By itself, the framework does not provide any additional
+security; it merely provides the infrastructure to support security
+modules. The LSM kernel patch also moves most of the capabilities
+logic into an optional security module, with the system defaulting
+to the traditional superuser logic. This capabilities module
+is discussed further in <xref linkend="cap"/>.
+</para>
+
+<para>
+The LSM kernel patch adds security fields to kernel data structures
+and inserts calls to hook functions at critical points in the kernel
+code to manage the security fields and to perform access control. It
+also adds functions for registering and unregistering security
+modules, and adds a general <function>security</function> system call
+to support new system calls for security-aware applications.
+</para>
+
+<para>
+The LSM security fields are simply <type>void*</type> pointers. For
+process and program execution security information, security fields
+were added to <structname>struct task_struct</structname> and
+<structname>struct linux_binprm</structname>. For filesystem security
+information, a security field was added to
+<structname>struct super_block</structname>. For pipe, file, and socket
+security information, security fields were added to
+<structname>struct inode</structname> and
+<structname>struct file</structname>. For packet and network device security
+information, security fields were added to
+<structname>struct sk_buff</structname> and
+<structname>struct net_device</structname>. For System V IPC security
+information, security fields were added to
+<structname>struct kern_ipc_perm</structname> and
+<structname>struct msg_msg</structname>; additionally, the definitions
+for <structname>struct msg_msg</structname>, <structname>struct
+msg_queue</structname>, and <structname>struct
+shmid_kernel</structname> were moved to header files
+(<filename>include/linux/msg.h</filename> and
+<filename>include/linux/shm.h</filename> as appropriate) to allow
+the security modules to use these definitions.
+</para>
+
+<para>
+Each LSM hook is a function pointer in a global table,
+security_ops. This table is a
+<structname>security_operations</structname> structure as defined by
+<filename>include/linux/security.h</filename>. Detailed documentation
+for each hook is included in this header file. At present, this
+structure consists of a collection of substructures that group related
+hooks based on the kernel object (e.g. task, inode, file, sk_buff,
+etc) as well as some top-level hook function pointers for system
+operations. This structure is likely to be flattened in the future
+for performance. The placement of the hook calls in the kernel code
+is described by the "called:" lines in the per-hook documentation in
+the header file. The hook calls can also be easily found in the
+kernel code by looking for the string "security_ops->".
+
+</para>
+
+<para>
+Linus mentioned per-process security hooks in his original remarks as a
+possible alternative to global security hooks. However, if LSM were
+to start from the perspective of per-process hooks, then the base
+framework would have to deal with how to handle operations that
+involve multiple processes (e.g. kill), since each process might have
+its own hook for controlling the operation. This would require a
+general mechanism for composing hooks in the base framework.
+Additionally, LSM would still need global hooks for operations that
+have no process context (e.g. network input operations).
+Consequently, LSM provides global security hooks, but a security
+module is free to implement per-process hooks (where that makes sense)
+by storing a security_ops table in each process' security field and
+then invoking these per-process hooks from the global hooks.
+The problem of composition is thus deferred to the module.
+</para>
+
+<para>
+The global security_ops table is initialized to a set of hook
+functions provided by a dummy security module that provides
+traditional superuser logic. A <function>register_security</function>
+function (in <filename>security/security.c</filename>) is provided to
+allow a security module to set security_ops to refer to its own hook
+functions, and an <function>unregister_security</function> function is
+provided to revert security_ops to the dummy module hooks. This
+mechanism is used to set the primary security module, which is
+responsible for making the final decision for each hook.
+</para>
+
+<para>
+LSM also provides a simple mechanism for stacking additional security
+modules with the primary security module. It defines
+<function>register_security</function> and
+<function>unregister_security</function> hooks in the
+<structname>security_operations</structname> structure and provides
+<function>mod_reg_security</function> and
+<function>mod_unreg_security</function> functions that invoke these
+hooks after performing some sanity checking. A security module can
+call these functions in order to stack with other modules. However,
+the actual details of how this stacking is handled are deferred to the
+module, which can implement these hooks in any way it wishes
+(including always returning an error if it does not wish to support
+stacking). In this manner, LSM again defers the problem of
+composition to the module.
+</para>
+
+<para>
+Although the LSM hooks are organized into substructures based on
+kernel object, all of the hooks can be viewed as falling into two
+major categories: hooks that are used to manage the security fields
+and hooks that are used to perform access control. Examples of the
+first category of hooks include the
+<function>alloc_security</function> and
+<function>free_security</function> hooks defined for each kernel data
+structure that has a security field. These hooks are used to allocate
+and free security structures for kernel objects. The first category
+of hooks also includes hooks that set information in the security
+field after allocation, such as the <function>post_lookup</function>
+hook in <structname>struct inode_security_ops</structname>. This hook
+is used to set security information for inodes after successful lookup
+operations. An example of the second category of hooks is the
+<function>permission</function> hook in
+<structname>struct inode_security_ops</structname>. This hook checks
+permission when accessing an inode.
+</para>
+
+</sect1>
+
+<sect1 id="cap"><title>LSM Capabilities Module</title>
+
+<para>
+The LSM kernel patch moves most of the existing POSIX.1e capabilities
+logic into an optional security module stored in the file
+<filename>security/capability.c</filename>. This change allows
+users who do not want to use capabilities to omit this code entirely
+from their kernel, instead using the dummy module for traditional
+superuser logic or any other module that they desire. This change
+also allows the developers of the capabilities logic to maintain and
+enhance their code more freely, without needing to integrate patches
+back into the base kernel.
+</para>
+
+<para>
+In addition to moving the capabilities logic, the LSM kernel patch
+could move the capability-related fields from the kernel data
+structures into the new security fields managed by the security
+modules. However, at present, the LSM kernel patch leaves the
+capability fields in the kernel data structures. In his original
+remarks, Linus suggested that this might be preferable so that other
+security modules can be easily stacked with the capabilities module
+without needing to chain multiple security structures on the security field.
+It also avoids imposing extra overhead on the capabilities module
+to manage the security fields. However, the LSM framework could
+certainly support such a move if it is determined to be desirable,
+with only a few additional changes described below.
+</para>
+
+<para>
+At present, the capabilities logic for computing process capabilities
+on <function>execve</function> and <function>set*uid</function>,
+checking capabilities for a particular process, saving and checking
+capabilities for netlink messages, and handling the
+<function>capget</function> and <function>capset</function> system
+calls have been moved into the capabilities module. There are still a
+few locations in the base kernel where capability-related fields are
+directly examined or modified, but the current version of the LSM
+patch does allow a security module to completely replace the
+assignment and testing of capabilities. These few locations would
+need to be changed if the capability-related fields were moved into
+the security field. The following is a list of known locations that
+still perform such direct examination or modification of
+capability-related fields:
+<itemizedlist>
+<listitem><para><filename>fs/open.c</filename>:<function>sys_access</function></para></listitem>
+<listitem><para><filename>fs/lockd/host.c</filename>:<function>nlm_bind_host</function></para></listitem>
+<listitem><para><filename>fs/nfsd/auth.c</filename>:<function>nfsd_setuser</function></para></listitem>
+<listitem><para><filename>fs/proc/array.c</filename>:<function>task_cap</function></para></listitem>
+</itemizedlist>
+</para>
+
+</sect1>
+
+</article>
diff --git a/Documentation/DocBook/man/Makefile b/Documentation/DocBook/man/Makefile
new file mode 100644
index 000000000000..4fb7ea0f7ac8
--- /dev/null
+++ b/Documentation/DocBook/man/Makefile
@@ -0,0 +1,3 @@
+# Rules are put in Documentation/DocBook
+
+clean-files := *.9.gz *.sgml manpage.links manpage.refs
diff --git a/Documentation/DocBook/mcabook.tmpl b/Documentation/DocBook/mcabook.tmpl
new file mode 100644
index 000000000000..4367f4642f3d
--- /dev/null
+++ b/Documentation/DocBook/mcabook.tmpl
@@ -0,0 +1,107 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="MCAGuide">
+ <bookinfo>
+ <title>MCA Driver Programming Interface</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@redhat.com</email>
+ </address>
+ </affiliation>
+ </author>
+ <author>
+ <firstname>David</firstname>
+ <surname>Weinehall</surname>
+ </author>
+ <author>
+ <firstname>Chris</firstname>
+ <surname>Beauregard</surname>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2000</year>
+ <holder>Alan Cox</holder>
+ <holder>David Weinehall</holder>
+ <holder>Chris Beauregard</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The MCA bus functions provide a generalised interface to find MCA
+ bus cards, to claim them for a driver, and to read and manipulate POS
+ registers without being aware of the motherboard internals or
+ certain deep magic specific to onboard devices.
+ </para>
+ <para>
+ The basic interface to the MCA bus devices is the slot. Each slot
+ is numbered and virtual slot numbers are assigned to the internal
+ devices. Using a pci_dev as other busses do does not really make
+ sense in the MCA context as the MCA bus resources require card
+ specific interpretation.
+ </para>
+ <para>
+ Finally the MCA bus functions provide a parallel set of DMA
+ functions mimicking the ISA bus DMA functions as closely as possible,
+ although also supporting the additional DMA functionality on the
+ MCA bus controllers.
+ </para>
+ </chapter>
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Earch/i386/kernel/mca.c
+ </chapter>
+
+ <chapter id="dmafunctions">
+ <title>DMA Functions Provided</title>
+!Iinclude/asm-i386/mca_dma.h
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/mtdnand.tmpl b/Documentation/DocBook/mtdnand.tmpl
new file mode 100644
index 000000000000..6e463d0db266
--- /dev/null
+++ b/Documentation/DocBook/mtdnand.tmpl
@@ -0,0 +1,1320 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="MTD-NAND-Guide">
+ <bookinfo>
+ <title>MTD NAND Driver Programming Interface</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Thomas</firstname>
+ <surname>Gleixner</surname>
+ <affiliation>
+ <address>
+ <email>tglx@linutronix.de</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2004</year>
+ <holder>Thomas Gleixner</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License version 2 as published by the Free Software Foundation.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The generic NAND driver supports almost all NAND and AG-AND based
+ chips and connects them to the Memory Technology Devices (MTD)
+ subsystem of the Linux Kernel.
+ </para>
+ <para>
+ This documentation is provided for developers who want to implement
+ board drivers or filesystem drivers suitable for NAND devices.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ None.
+ </para>
+ </chapter>
+
+ <chapter id="dochints">
+ <title>Documentation hints</title>
+ <para>
+ The function and structure docs are autogenerated. Each function and
+ struct member has a short description which is marked with an [XXX] identifier.
+ The following chapters explain the meaning of those identifiers.
+ </para>
+ <sect1>
+ <title>Function identifiers [XXX]</title>
+ <para>
+ The functions are marked with [XXX] identifiers in the short
+ comment. The identifiers explain the usage and scope of the
+ functions. Following identifiers are used:
+ </para>
+ <itemizedlist>
+ <listitem><para>
+ [MTD Interface]</para><para>
+ These functions provide the interface to the MTD kernel API.
+ They are not replaceable and provide functionality
+ which is completely hardware independent.
+ </para></listitem>
+ <listitem><para>
+ [NAND Interface]</para><para>
+ These functions are exported and provide the interface to the NAND kernel API.
+ </para></listitem>
+ <listitem><para>
+ [GENERIC]</para><para>
+ Generic functions are not replaceable and provide functionality
+ which is completely hardware independent.
+ </para></listitem>
+ <listitem><para>
+ [DEFAULT]</para><para>
+ Default functions provide hardware related functionality which is suitable
+ for most of the implementations. These functions can be replaced by the
+ board driver if necessary. Those functions are called via pointers in the
+ NAND chip description structure. The board driver can set the functions which
+ should be replaced by board dependent functions before calling nand_scan().
+ If the function pointer is NULL on entry to nand_scan() then the pointer
+ is set to the default function which is suitable for the detected chip type.
+ </para></listitem>
+ </itemizedlist>
+ </sect1>
+ <sect1>
+ <title>Struct member identifiers [XXX]</title>
+ <para>
+ The struct members are marked with [XXX] identifiers in the
+ comment. The identifiers explain the usage and scope of the
+ members. Following identifiers are used:
+ </para>
+ <itemizedlist>
+ <listitem><para>
+ [INTERN]</para><para>
+ These members are for NAND driver internal use only and must not be
+ modified. Most of these values are calculated from the chip geometry
+ information which is evaluated during nand_scan().
+ </para></listitem>
+ <listitem><para>
+ [REPLACEABLE]</para><para>
+ Replaceable members hold hardware related functions which can be
+ provided by the board driver. The board driver can set the functions which
+ should be replaced by board dependent functions before calling nand_scan().
+ If the function pointer is NULL on entry to nand_scan() then the pointer
+ is set to the default function which is suitable for the detected chip type.
+ </para></listitem>
+ <listitem><para>
+ [BOARDSPECIFIC]</para><para>
+ Board specific members hold hardware related information which must
+ be provided by the board driver. The board driver must set the function
+ pointers and datafields before calling nand_scan().
+ </para></listitem>
+ <listitem><para>
+ [OPTIONAL]</para><para>
+ Optional members can hold information relevant for the board driver. The
+ generic NAND driver code does not use this information.
+ </para></listitem>
+ </itemizedlist>
+ </sect1>
+ </chapter>
+
+ <chapter id="basicboarddriver">
+ <title>Basic board driver</title>
+ <para>
+ For most boards it will be sufficient to provide just the
+ basic functions and fill out some really board dependent
+ members in the nand chip description structure.
+ See drivers/mtd/nand/skeleton for reference.
+ </para>
+ <sect1>
+ <title>Basic defines</title>
+ <para>
+ At least you have to provide a mtd structure and
+ a storage for the ioremap'ed chip address.
+ You can allocate the mtd structure using kmalloc
+ or you can allocate it statically.
+ In case of static allocation you have to allocate
+ a nand_chip structure too.
+ </para>
+ <para>
+ Kmalloc based example
+ </para>
+ <programlisting>
+static struct mtd_info *board_mtd;
+static unsigned long baseaddr;
+ </programlisting>
+ <para>
+ Static example
+ </para>
+ <programlisting>
+static struct mtd_info board_mtd;
+static struct nand_chip board_chip;
+static unsigned long baseaddr;
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Partition defines</title>
+ <para>
+ If you want to divide your device into partitions, then
+ enable the configuration switch CONFIG_MTD_PARTITIONS and define
+ a partitioning scheme suitable to your board.
+ </para>
+ <programlisting>
+#define NUM_PARTITIONS 2
+static struct mtd_partition partition_info[] = {
+ { .name = "Flash partition 1",
+ .offset = 0,
+ .size = 8 * 1024 * 1024 },
+ { .name = "Flash partition 2",
+ .offset = MTDPART_OFS_NEXT,
+ .size = MTDPART_SIZ_FULL },
+};
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Hardware control function</title>
+ <para>
+ The hardware control function provides access to the
+ control pins of the NAND chip(s).
+ The access can be done by GPIO pins or by address lines.
+ If you use address lines, make sure that the timing
+ requirements are met.
+ </para>
+ <para>
+ <emphasis>GPIO based example</emphasis>
+ </para>
+ <programlisting>
+static void board_hwcontrol(struct mtd_info *mtd, int cmd)
+{
+ switch(cmd){
+ case NAND_CTL_SETCLE: /* Set CLE pin high */ break;
+ case NAND_CTL_CLRCLE: /* Set CLE pin low */ break;
+ case NAND_CTL_SETALE: /* Set ALE pin high */ break;
+ case NAND_CTL_CLRALE: /* Set ALE pin low */ break;
+ case NAND_CTL_SETNCE: /* Set nCE pin low */ break;
+ case NAND_CTL_CLRNCE: /* Set nCE pin high */ break;
+ }
+}
+ </programlisting>
+ <para>
+ <emphasis>Address lines based example.</emphasis> It's assumed that the
+ nCE pin is driven by a chip select decoder.
+ </para>
+ <programlisting>
+static void board_hwcontrol(struct mtd_info *mtd, int cmd)
+{
+ struct nand_chip *this = (struct nand_chip *) mtd->priv;
+ switch(cmd){
+ case NAND_CTL_SETCLE: this->IO_ADDR_W |= CLE_ADRR_BIT; break;
+ case NAND_CTL_CLRCLE: this->IO_ADDR_W &amp;= ~CLE_ADRR_BIT; break;
+ case NAND_CTL_SETALE: this->IO_ADDR_W |= ALE_ADRR_BIT; break;
+ case NAND_CTL_CLRALE: this->IO_ADDR_W &amp;= ~ALE_ADRR_BIT; break;
+ }
+}
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Device ready function</title>
+ <para>
+ If the hardware interface has the ready busy pin of the NAND chip connected to a
+ GPIO or other accessible I/O pin, this function is used to read back the state of the
+ pin. The function has no arguments and should return 0, if the device is busy (R/B pin
+ is low) and 1, if the device is ready (R/B pin is high).
+ If the hardware interface does not give access to the ready busy pin, then
+ the function must not be defined and the function pointer this->dev_ready is set to NULL.
+ </para>
+ </sect1>
+ <sect1>
+ <title>Init function</title>
+ <para>
+ The init function allocates memory and sets up all the board
+ specific parameters and function pointers. When everything
+ is set up nand_scan() is called. This function tries to
+ detect and identify the chip. If a chip is found all the
+ internal data fields are initialized accordingly.
+ The structure(s) have to be zeroed out first and then filled with the necessary
+ information about the device.
+ </para>
+ <programlisting>
+int __init board_init (void)
+{
+ struct nand_chip *this;
+ int err = 0;
+
+ /* Allocate memory for MTD device structure and private data */
+ board_mtd = kmalloc (sizeof(struct mtd_info) + sizeof (struct nand_chip), GFP_KERNEL);
+ if (!board_mtd) {
+ printk ("Unable to allocate NAND MTD device structure.\n");
+ err = -ENOMEM;
+ goto out;
+ }
+
+ /* Initialize structures */
+ memset ((char *) board_mtd, 0, sizeof(struct mtd_info) + sizeof(struct nand_chip));
+
+ /* map physical address */
+ baseaddr = (unsigned long)ioremap(CHIP_PHYSICAL_ADDRESS, 1024);
+ if(!baseaddr){
+ printk("Ioremap to access NAND chip failed\n");
+ err = -EIO;
+ goto out_mtd;
+ }
+
+ /* Get pointer to private data */
+ this = (struct nand_chip *) (&amp;board_mtd[1]);
+ /* Link the private data with the MTD structure */
+ board_mtd->priv = this;
+
+ /* Set address of NAND IO lines */
+ this->IO_ADDR_R = baseaddr;
+ this->IO_ADDR_W = baseaddr;
+ /* Reference hardware control function */
+ this->hwcontrol = board_hwcontrol;
+ /* Set command delay time, see datasheet for correct value */
+ this->chip_delay = CHIP_DEPENDEND_COMMAND_DELAY;
+ /* Assign the device ready function, if available */
+ this->dev_ready = board_dev_ready;
+ this->eccmode = NAND_ECC_SOFT;
+
+ /* Scan to find existence of the device */
+ if (nand_scan (board_mtd, 1)) {
+ err = -ENXIO;
+ goto out_ior;
+ }
+
+ add_mtd_partitions(board_mtd, partition_info, NUM_PARTITIONS);
+ goto out;
+
+out_ior:
+ iounmap((void *)baseaddr);
+out_mtd:
+ kfree (board_mtd);
+out:
+ return err;
+}
+module_init(board_init);
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Exit function</title>
+ <para>
+ The exit function is only necessary if the driver is
+ compiled as a module. It releases all resources which
+ are held by the chip driver and unregisters the partitions
+ in the MTD layer.
+ </para>
+ <programlisting>
+#ifdef MODULE
+static void __exit board_cleanup (void)
+{
+ /* Release resources, unregister device */
+ nand_release (board_mtd);
+
+ /* unmap physical address */
+ iounmap((void *)baseaddr);
+
+ /* Free the MTD device structure */
+ kfree (board_mtd);
+}
+module_exit(board_cleanup);
+#endif
+ </programlisting>
+ </sect1>
+ </chapter>
+
+ <chapter id="boarddriversadvanced">
+ <title>Advanced board driver functions</title>
+ <para>
+ This chapter describes the advanced functionality of the NAND
+ driver. For a list of functions which can be overridden by the board
+ driver see the documentation of the nand_chip structure.
+ </para>
+ <sect1>
+ <title>Multiple chip control</title>
+ <para>
+ The nand driver can control chip arrays. Therefore the
+ board driver must provide its own select_chip function. This
+ function must (de)select the requested chip.
+ The function pointer in the nand_chip structure must
+ be set before calling nand_scan(). The maxchip parameter
+ of nand_scan() defines the maximum number of chips to
+ scan for. Make sure that the select_chip function can
+ handle the requested number of chips.
+ </para>
+ <para>
+ The nand driver concatenates the chips to one virtual
+ chip and provides this virtual chip to the MTD layer.
+ </para>
+ <para>
+ <emphasis>Note: The driver can only handle linear chip arrays
+ of equally sized chips. There is no support for
+ parallel arrays which extend the buswidth.</emphasis>
+ </para>
+ <para>
+ <emphasis>GPIO based example</emphasis>
+ </para>
+ <programlisting>
+static void board_select_chip (struct mtd_info *mtd, int chip)
+{
+ /* Deselect all chips, set all nCE pins high */
+ GPIO(BOARD_NAND_NCE) |= 0xff;
+ if (chip >= 0)
+ GPIO(BOARD_NAND_NCE) &amp;= ~ (1 &lt;&lt; chip);
+}
+ </programlisting>
+ <para>
+ <emphasis>Address lines based example.</emphasis>
+ It's assumed that the nCE pins are connected to an
+ address decoder.
+ </para>
+ <programlisting>
+static void board_select_chip (struct mtd_info *mtd, int chip)
+{
+ struct nand_chip *this = (struct nand_chip *) mtd->priv;
+
+ /* Deselect all chips */
+ this->IO_ADDR_R &amp;= ~BOARD_NAND_ADDR_MASK;
+ this->IO_ADDR_W &amp;= ~BOARD_NAND_ADDR_MASK;
+ switch (chip) {
+ case 0:
+ this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIP0;
+ this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIP0;
+ break;
+ ....
+ case n:
+ this->IO_ADDR_R |= BOARD_NAND_ADDR_CHIPn;
+ this->IO_ADDR_W |= BOARD_NAND_ADDR_CHIPn;
+ break;
+ }
+}
+ </programlisting>
+ </sect1>
+ <sect1>
+ <title>Hardware ECC support</title>
+ <sect2>
+ <title>Functions and constants</title>
+ <para>
+ The nand driver supports four different types of
+ hardware ECC.
+ <itemizedlist>
+ <listitem><para>NAND_ECC_HW3_256</para><para>
+ Hardware ECC generator providing 3 bytes ECC per
+ 256 byte.
+ </para> </listitem>
+ <listitem><para>NAND_ECC_HW3_512</para><para>
+ Hardware ECC generator providing 3 bytes ECC per
+ 512 byte.
+ </para> </listitem>
+ <listitem><para>NAND_ECC_HW6_512</para><para>
+ Hardware ECC generator providing 6 bytes ECC per
+ 512 byte.
+ </para> </listitem>
+ <listitem><para>NAND_ECC_HW8_512</para><para>
+ Hardware ECC generator providing 8 bytes ECC per
+ 512 byte.
+ </para> </listitem>
+ </itemizedlist>
+ If your hardware generator has a different functionality
+ add it at the appropriate place in nand_base.c
+ </para>
+ <para>
+ The board driver must provide following functions:
+ <itemizedlist>
+ <listitem><para>enable_hwecc</para><para>
+ This function is called before reading / writing to
+ the chip. Reset or initialize the hardware generator
+ in this function. The function is called with an
+ argument which lets you distinguish between read
+ and write operations.
+ </para> </listitem>
+ <listitem><para>calculate_ecc</para><para>
+ This function is called after read / write from / to
+ the chip. Transfer the ECC from the hardware to
+ the buffer. If the option NAND_HWECC_SYNDROME is set
+ then the function is only called on write. See below.
+ </para> </listitem>
+ <listitem><para>correct_data</para><para>
+ In case of an ECC error this function is called for
+ error detection and correction. Return 1 respectively 2
+ in case the error can be corrected. If the error is
+ not correctable return -1. If your hardware generator
+ matches the default algorithm of the nand_ecc software
+ generator then use the correction function provided
+ by nand_ecc instead of implementing duplicated code.
+ </para> </listitem>
+ </itemizedlist>
+ </para>
+ </sect2>
+ <sect2>
+ <title>Hardware ECC with syndrome calculation</title>
+ <para>
+ Many hardware ECC implementations provide Reed-Solomon
+ codes and calculate an error syndrome on read. The syndrome
+ must be converted to a standard Reed-Solomon syndrome
+ before calling the error correction code in the generic
+ Reed-Solomon library.
+ </para>
+ <para>
+ The ECC bytes must be placed immediately after the data
+ bytes in order to make the syndrome generator work. This
+ is contrary to the usual layout used by software ECC. The
+ separation of data and out of band area is no longer
+ possible. The nand driver code handles this layout and
+ the remaining free bytes in the oob area are managed by
+ the autoplacement code. Provide a matching oob-layout
+ in this case. See rts_from4.c and diskonchip.c for
+ implementation reference. In those cases we must also
+ use bad block tables on FLASH, because the ECC layout is
+ interfering with the bad block marker positions.
+ See bad block table support for details.
+ </para>
+ </sect2>
+ </sect1>
+ <sect1>
+ <title>Bad block table support</title>
+ <para>
+ Most NAND chips mark the bad blocks at a defined
+ position in the spare area. Those blocks must
+ not be erased under any circumstances as the bad
+ block information would be lost.
+ It is possible to check the bad block mark each
+ time when the blocks are accessed by reading the
+ spare area of the first page in the block. This
+ is time consuming so a bad block table is used.
+ </para>
+ <para>
+ The nand driver supports various types of bad block
+ tables.
+ <itemizedlist>
+ <listitem><para>Per device</para><para>
+ The bad block table contains all bad block information
+ of the device which can consist of multiple chips.
+ </para> </listitem>
+ <listitem><para>Per chip</para><para>
+ A bad block table is used per chip and contains the
+ bad block information for this particular chip.
+ </para> </listitem>
+ <listitem><para>Fixed offset</para><para>
+ The bad block table is located at a fixed offset
+ in the chip (device). This applies to various
+ DiskOnChip devices.
+ </para> </listitem>
+ <listitem><para>Automatically placed</para><para>
+ The bad block table is automatically placed and
+ detected either at the end or at the beginning
+ of a chip (device)
+ </para> </listitem>
+ <listitem><para>Mirrored tables</para><para>
+ The bad block table is mirrored on the chip (device) to
+ allow updates of the bad block table without data loss.
+ </para> </listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ nand_scan() calls the function nand_default_bbt().
+ nand_default_bbt() selects appropriate default
+ bad block table descriptors depending on the chip information
+ which was retrieved by nand_scan().
+ </para>
+ <para>
+ The standard policy is scanning the device for bad
+ blocks and building a ram based bad block table which
+ allows faster access than always checking the
+ bad block information on the flash chip itself.
+ </para>
+ <sect2>
+ <title>Flash based tables</title>
+ <para>
+ It may be desired or necessary to keep a bad block table in FLASH.
+ For AG-AND chips this is mandatory, as they have no factory marked
+ bad blocks. They have factory marked good blocks. The marker pattern
+ is erased when the block is erased to be reused. So in case of
+ powerloss before writing the pattern back to the chip this block
+ would be lost and added to the bad blocks. Therefore we scan the
+ chip(s) when we detect them the first time for good blocks and
+ store this information in a bad block table before erasing any
+ of the blocks.
+ </para>
+ <para>
+ The blocks in which the tables are stored are protected against
+ accidental access by marking them bad in the memory bad block
+ table. The bad block table management functions are allowed
+ to circumvent this protection.
+ </para>
+ <para>
+ The simplest way to activate the FLASH based bad block table support
+ is to set the option NAND_USE_FLASH_BBT in the option field of
+ the nand chip structure before calling nand_scan(). For AG-AND
+ chips this is done by default.
+ This activates the default FLASH based bad block table functionality
+ of the NAND driver. The default bad block table options are
+ <itemizedlist>
+ <listitem><para>Store bad block table per chip</para></listitem>
+ <listitem><para>Use 2 bits per block</para></listitem>
+ <listitem><para>Automatic placement at the end of the chip</para></listitem>
+ <listitem><para>Use mirrored tables with version numbers</para></listitem>
+ <listitem><para>Reserve 4 blocks at the end of the chip</para></listitem>
+ </itemizedlist>
+ </para>
+ </sect2>
+ <sect2>
+ <title>User defined tables</title>
+ <para>
+ User defined tables are created by filling out a
+ nand_bbt_descr structure and storing the pointer in the
+ nand_chip structure member bbt_td before calling nand_scan().
+ If a mirror table is necessary a second structure must be
+ created and a pointer to this structure must be stored
+ in bbt_md inside the nand_chip structure. If the bbt_md
+ member is set to NULL then only the main table is used
+ and no scan for the mirrored table is performed.
+ </para>
+ <para>
+ The most important field in the nand_bbt_descr structure
+ is the options field. The options define most of the
+ table properties. Use the predefined constants from
+ nand.h to define the options.
+ <itemizedlist>
+ <listitem><para>Number of bits per block</para>
+ <para>The supported number of bits is 1, 2, 4, 8.</para></listitem>
+ <listitem><para>Table per chip</para>
+ <para>Setting the constant NAND_BBT_PERCHIP selects that
+ a bad block table is managed for each chip in a chip array.
+ If this option is not set then a per device bad block table
+ is used.</para></listitem>
+ <listitem><para>Table location is absolute</para>
+ <para>Use the option constant NAND_BBT_ABSPAGE and
+ define the absolute page number where the bad block
+ table starts in the field pages. If you have selected bad block
+ tables per chip and you have a multi chip array then the start page
+ must be given for each chip in the chip array. Note: there is no scan
+ for a table ident pattern performed, so the fields
+ pattern, veroffs, offs, len can be left uninitialized</para></listitem>
+ <listitem><para>Table location is automatically detected</para>
+ <para>The table can either be located in the first or the last good
+ blocks of the chip (device). Set NAND_BBT_LASTBLOCK to place
+ the bad block table at the end of the chip (device). The
+ bad block tables are marked and identified by a pattern which
+ is stored in the spare area of the first page in the block which
+ holds the bad block table. Store a pointer to the pattern
+ in the pattern field. Further the length of the pattern has to be
+ stored in len and the offset in the spare area must be given
+ in the offs member of the nand_bbt_descr structure. For mirrored
+ bad block tables different patterns are mandatory.</para></listitem>
+ <listitem><para>Table creation</para>
+ <para>Set the option NAND_BBT_CREATE to enable the table creation
+ if no table can be found during the scan. Usually this is done only
+ once if a new chip is found. </para></listitem>
+ <listitem><para>Table write support</para>
+ <para>Set the option NAND_BBT_WRITE to enable the table write support.
+ This allows the update of the bad block table(s) in case a block has
+ to be marked bad due to wear. The MTD interface function block_markbad
+ is calling the update function of the bad block table. If the write
+ support is enabled then the table is updated on FLASH.</para>
+ <para>
+ Note: Write support should only be enabled for mirrored tables with
+ version control.
+ </para></listitem>
+ <listitem><para>Table version control</para>
+ <para>Set the option NAND_BBT_VERSION to enable the table version control.
+ It's highly recommended to enable this for mirrored tables with write
+ support. It makes sure that the risk of losing the bad block
+ table information is reduced to the loss of the information about the
+ one worn out block which should be marked bad. The version is stored in
+ 4 consecutive bytes in the spare area of the device. The position of
+ the version number is defined by the member veroffs in the bad block table
+ descriptor.</para></listitem>
+ <listitem><para>Save block contents on write</para>
+ <para>
+ In case that the block which holds the bad block table does contain
+ other useful information, set the option NAND_BBT_SAVECONTENT. When
+ the bad block table is written then the whole block is read, the bad
+ block table is updated, and the block is erased and everything is
+ written back. If this option is not set only the bad block table
+ is written and everything else in the block is ignored and erased.
+ </para></listitem>
+ <listitem><para>Number of reserved blocks</para>
+ <para>
+ For automatic placement some blocks must be reserved for
+ bad block table storage. The number of reserved blocks is defined
+ in the maxblocks member of the bad block table description structure.
+ Reserving 4 blocks for mirrored tables should be a reasonable number.
+ This also limits the number of blocks which are scanned for the bad
+ block table ident pattern.
+ </para></listitem>
+ </itemizedlist>
+ </para>
+ </sect2>
+ </sect1>
+ <sect1>
+ <title>Spare area (auto)placement</title>
+ <para>
+ The nand driver implements different possibilities for
+ placement of filesystem data in the spare area,
+ <itemizedlist>
+ <listitem><para>Placement defined by fs driver</para></listitem>
+ <listitem><para>Automatic placement</para></listitem>
+ </itemizedlist>
+ The default placement function is automatic placement. The
+ nand driver has built in default placement schemes for the
+ various chiptypes. If due to hardware ECC functionality the
+ default placement does not fit then the board driver can
+ provide its own placement scheme.
+ </para>
+ <para>
+ File system drivers can provide their own placement scheme which
+ is used instead of the default placement scheme.
+ </para>
+ <para>
+ Placement schemes are defined by a nand_oobinfo structure
+ <programlisting>
+struct nand_oobinfo {
+ int useecc;
+ int eccbytes;
+ int eccpos[24];
+ int oobfree[8][2];
+};
+ </programlisting>
+ <itemizedlist>
+ <listitem><para>useecc</para><para>
+ The useecc member controls the ecc and placement function. The header
+ file include/mtd/mtd-abi.h contains constants to select ecc and
+ placement. MTD_NANDECC_OFF switches off the ecc completely. This is
+ not recommended and available for testing and diagnosis only.
+ MTD_NANDECC_PLACE selects caller defined placement, MTD_NANDECC_AUTOPLACE
+ selects automatic placement.
+ </para></listitem>
+ <listitem><para>eccbytes</para><para>
+ The eccbytes member defines the number of ecc bytes per page.
+ </para></listitem>
+ <listitem><para>eccpos</para><para>
+ The eccpos array holds the byte offsets in the spare area where
+ the ecc codes are placed.
+ </para></listitem>
+ <listitem><para>oobfree</para><para>
+ The oobfree array defines the areas in the spare area which can be
+ used for automatic placement. The information is given in the format
+ {offset, size}. offset defines the start of the usable area, size the
+ length in bytes. More than one area can be defined. The list is terminated
+ by an {0, 0} entry.
+ </para></listitem>
+ </itemizedlist>
+ </para>
+ <sect2>
+ <title>Placement defined by fs driver</title>
+ <para>
+ The calling function provides a pointer to a nand_oobinfo
+ structure which defines the ecc placement. For writes the
+ caller must provide a spare area buffer along with the
+ data buffer. The spare area buffer size is (number of pages) *
+ (size of spare area). For reads the buffer size is
+ (number of pages) * ((size of spare area) + (number of ecc
+ steps per page) * sizeof (int)). The driver stores the
+ result of the ecc check for each tuple in the spare buffer.
+ The storage sequence is
+ </para>
+ <para>
+ &lt;spare data page 0&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
+ </para>
+ <para>
+ ...
+ </para>
+ <para>
+ &lt;spare data page n&gt;&lt;ecc result 0&gt;...&lt;ecc result n&gt;
+ </para>
+ <para>
+ This is a legacy mode used by YAFFS1.
+ </para>
+ <para>
+ If the spare area buffer is NULL then only the ECC placement is
+ done according to the given scheme in the nand_oobinfo structure.
+ </para>
+ </sect2>
+ <sect2>
+ <title>Automatic placement</title>
+ <para>
+ Automatic placement uses the built in defaults to place the
+ ecc bytes in the spare area. If filesystem data have to be stored /
+ read into the spare area then the calling function must provide a
+ buffer. The buffer size per page is determined by the oobfree array in
+ the nand_oobinfo structure.
+ </para>
+ <para>
+ If the spare area buffer is NULL then only the ECC placement is
+ done according to the default builtin scheme.
+ </para>
+ </sect2>
+ <sect2>
+ <title>User space placement selection</title>
+ <para>
+ All non ecc functions like mtd->read and mtd->write use an internal
+ structure, which can be set by an ioctl. This structure is preset
+ to the autoplacement default.
+ <programlisting>
+ ioctl (fd, MEMSETOOBSEL, oobsel);
+ </programlisting>
+ oobsel is a pointer to a user supplied structure of type
+ nand_oobinfo. The contents of this structure must match the
+ criteria of the filesystem, which will be used. See an example in utils/nandwrite.c.
+ </para>
+ </sect2>
+ </sect1>
+ <sect1>
+ <title>Spare area autoplacement default schemes</title>
+ <sect2>
+ <title>256 byte pagesize</title>
+<informaltable><tgroup cols="3"><tbody>
+<row>
+<entry>Offset</entry>
+<entry>Content</entry>
+<entry>Comment</entry>
+</row>
+<row>
+<entry>0x00</entry>
+<entry>ECC byte 0</entry>
+<entry>Error correction code byte 0</entry>
+</row>
+<row>
+<entry>0x01</entry>
+<entry>ECC byte 1</entry>
+<entry>Error correction code byte 1</entry>
+</row>
+<row>
+<entry>0x02</entry>
+<entry>ECC byte 2</entry>
+<entry>Error correction code byte 2</entry>
+</row>
+<row>
+<entry>0x03</entry>
+<entry>Autoplace 0</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x04</entry>
+<entry>Autoplace 1</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x05</entry>
+<entry>Bad block marker</entry>
+<entry>If any bit in this byte is zero, then this block is bad.
+This applies only to the first page in a block. In the remaining
+pages this byte is reserved</entry>
+</row>
+<row>
+<entry>0x06</entry>
+<entry>Autoplace 2</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x07</entry>
+<entry>Autoplace 3</entry>
+<entry></entry>
+</row>
+</tbody></tgroup></informaltable>
+ </sect2>
+ <sect2>
+ <title>512 byte pagesize</title>
+<informaltable><tgroup cols="3"><tbody>
+<row>
+<entry>Offset</entry>
+<entry>Content</entry>
+<entry>Comment</entry>
+</row>
+<row>
+<entry>0x00</entry>
+<entry>ECC byte 0</entry>
+<entry>Error correction code byte 0 of the lower 256 Byte data in
+this page</entry>
+</row>
+<row>
+<entry>0x01</entry>
+<entry>ECC byte 1</entry>
+<entry>Error correction code byte 1 of the lower 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x02</entry>
+<entry>ECC byte 2</entry>
+<entry>Error correction code byte 2 of the lower 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x03</entry>
+<entry>ECC byte 3</entry>
+<entry>Error correction code byte 0 of the upper 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x04</entry>
+<entry>reserved</entry>
+<entry>reserved</entry>
+</row>
+<row>
+<entry>0x05</entry>
+<entry>Bad block marker</entry>
+<entry>If any bit in this byte is zero, then this block is bad.
+This applies only to the first page in a block. In the remaining
+pages this byte is reserved</entry>
+</row>
+<row>
+<entry>0x06</entry>
+<entry>ECC byte 4</entry>
+<entry>Error correction code byte 1 of the upper 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x07</entry>
+<entry>ECC byte 5</entry>
+<entry>Error correction code byte 2 of the upper 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x08 - 0x0F</entry>
+<entry>Autoplace 0 - 7</entry>
+<entry></entry>
+</row>
+</tbody></tgroup></informaltable>
+ </sect2>
+ <sect2>
+ <title>2048 byte pagesize</title>
+<informaltable><tgroup cols="3"><tbody>
+<row>
+<entry>Offset</entry>
+<entry>Content</entry>
+<entry>Comment</entry>
+</row>
+<row>
+<entry>0x00</entry>
+<entry>Bad block marker</entry>
+<entry>If any bit in this byte is zero, then this block is bad.
+This applies only to the first page in a block. In the remaining
+pages this byte is reserved</entry>
+</row>
+<row>
+<entry>0x01</entry>
+<entry>Reserved</entry>
+<entry>Reserved</entry>
+</row>
+<row>
+<entry>0x02-0x27</entry>
+<entry>Autoplace 0 - 37</entry>
+<entry></entry>
+</row>
+<row>
+<entry>0x28</entry>
+<entry>ECC byte 0</entry>
+<entry>Error correction code byte 0 of the first 256 Byte data in
+this page</entry>
+</row>
+<row>
+<entry>0x29</entry>
+<entry>ECC byte 1</entry>
+<entry>Error correction code byte 1 of the first 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2A</entry>
+<entry>ECC byte 2</entry>
+<entry>Error correction code byte 2 of the first 256 Bytes data in
+this page</entry>
+</row>
+<row>
+<entry>0x2B</entry>
+<entry>ECC byte 3</entry>
+<entry>Error correction code byte 0 of the second 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2C</entry>
+<entry>ECC byte 4</entry>
+<entry>Error correction code byte 1 of the second 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2D</entry>
+<entry>ECC byte 5</entry>
+<entry>Error correction code byte 2 of the second 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2E</entry>
+<entry>ECC byte 6</entry>
+<entry>Error correction code byte 0 of the third 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x2F</entry>
+<entry>ECC byte 7</entry>
+<entry>Error correction code byte 1 of the third 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x30</entry>
+<entry>ECC byte 8</entry>
+<entry>Error correction code byte 2 of the third 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x31</entry>
+<entry>ECC byte 9</entry>
+<entry>Error correction code byte 0 of the fourth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x32</entry>
+<entry>ECC byte 10</entry>
+<entry>Error correction code byte 1 of the fourth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x33</entry>
+<entry>ECC byte 11</entry>
+<entry>Error correction code byte 2 of the fourth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x34</entry>
+<entry>ECC byte 12</entry>
+<entry>Error correction code byte 0 of the fifth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x35</entry>
+<entry>ECC byte 13</entry>
+<entry>Error correction code byte 1 of the fifth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x36</entry>
+<entry>ECC byte 14</entry>
+<entry>Error correction code byte 2 of the fifth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x37</entry>
+<entry>ECC byte 15</entry>
+<entry>Error correction code byte 0 of the sixth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x38</entry>
+<entry>ECC byte 16</entry>
+<entry>Error correction code byte 1 of the sixth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x39</entry>
+<entry>ECC byte 17</entry>
+<entry>Error correction code byte 2 of the sixth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x3A</entry>
+<entry>ECC byte 18</entry>
+<entry>Error correction code byte 0 of the seventh 256 Bytes of
+data in this page</entry>
+</row>
+<row>
+<entry>0x3B</entry>
+<entry>ECC byte 19</entry>
+<entry>Error correction code byte 1 of the seventh 256 Bytes of
+data in this page</entry>
+</row>
+<row>
+<entry>0x3C</entry>
+<entry>ECC byte 20</entry>
+<entry>Error correction code byte 2 of the seventh 256 Bytes of
+data in this page</entry>
+</row>
+<row>
+<entry>0x3D</entry>
+<entry>ECC byte 21</entry>
+<entry>Error correction code byte 0 of the eighth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x3E</entry>
+<entry>ECC byte 22</entry>
+<entry>Error correction code byte 1 of the eighth 256 Bytes of data
+in this page</entry>
+</row>
+<row>
+<entry>0x3F</entry>
+<entry>ECC byte 23</entry>
+<entry>Error correction code byte 2 of the eighth 256 Bytes of data
+in this page</entry>
+</row>
+</tbody></tgroup></informaltable>
+ </sect2>
+ </sect1>
+ </chapter>
+
+ <chapter id="filesystems">
+ <title>Filesystem support</title>
+ <para>
+ The NAND driver provides all necessary functions for a
+ filesystem via the MTD interface.
+ </para>
+ <para>
+ Filesystems must be aware of the NAND peculiarities and
+ restrictions. One major restriction of NAND Flash is, that you cannot
+ write as often as you want to a page. The consecutive writes to a page,
+ before erasing it again, are restricted to 1-3 writes, depending on the
+ manufacturers specifications. This applies similarly to the spare area.
+ </para>
+ <para>
+ Therefore NAND aware filesystems must either write in page size chunks
+ or hold a writebuffer to collect smaller writes until they sum up to
+ pagesize. Available NAND aware filesystems: JFFS2, YAFFS.
+ </para>
+ <para>
+ The spare area usage to store filesystem data is controlled by
+ the spare area placement functionality which is described in one
+ of the earlier chapters.
+ </para>
+ </chapter>
+ <chapter id="tools">
+ <title>Tools</title>
+ <para>
+ The MTD project provides a couple of helpful tools to handle NAND Flash.
+ <itemizedlist>
+ <listitem><para>flasherase, flasheraseall: Erase and format FLASH partitions</para></listitem>
+ <listitem><para>nandwrite: write filesystem images to NAND FLASH</para></listitem>
+ <listitem><para>nanddump: dump the contents of NAND FLASH partitions</para></listitem>
+ </itemizedlist>
+ </para>
+ <para>
+ These tools are aware of the NAND restrictions. Please use those tools
+ instead of complaining about errors which are caused by non NAND aware
+ access methods.
+ </para>
+ </chapter>
+
+ <chapter id="defines">
+ <title>Constants</title>
+ <para>
+ This chapter describes the constants which might be relevant for a driver developer.
+ </para>
+ <sect1>
+ <title>Chip option constants</title>
+ <sect2>
+ <title>Constants for chip id table</title>
+ <para>
+ These constants are defined in nand.h. They are ored together to describe
+ the chip functionality.
+ <programlisting>
+/* Chip can not auto increment pages */
+#define NAND_NO_AUTOINCR 0x00000001
+/* Buswidth is 16 bit */
+#define NAND_BUSWIDTH_16 0x00000002
+/* Device supports partial programming without padding */
+#define NAND_NO_PADDING 0x00000004
+/* Chip has cache program function */
+#define NAND_CACHEPRG 0x00000008
+/* Chip has copy back function */
+#define NAND_COPYBACK 0x00000010
+/* AND Chip which has 4 banks and a confusing page / block
+ * assignment. See Renesas datasheet for further information */
+#define NAND_IS_AND 0x00000020
+/* Chip has an array of 4 pages which can be read without
+ * additional ready /busy waits */
+#define NAND_4PAGE_ARRAY 0x00000040
+ </programlisting>
+ </para>
+ </sect2>
+ <sect2>
+ <title>Constants for runtime options</title>
+ <para>
+ These constants are defined in nand.h. They are ored together to describe
+ the functionality.
+ <programlisting>
+/* Use a flash based bad block table. This option is parsed by the
+ * default bad block table function (nand_default_bbt). */
+#define NAND_USE_FLASH_BBT 0x00010000
+/* The hw ecc generator provides a syndrome instead of an ecc value on read
+ * This can only work if we have the ecc bytes directly behind the
+ * data bytes. Applies for DOC and AG-AND Renesas HW Reed Solomon generators */
+#define NAND_HWECC_SYNDROME 0x00020000
+ </programlisting>
+ </para>
+ </sect2>
+ </sect1>
+
+ <sect1>
+ <title>ECC selection constants</title>
+ <para>
+ Use these constants to select the ECC algorithm.
+ <programlisting>
+/* No ECC. Usage is not recommended ! */
+#define NAND_ECC_NONE 0
+/* Software ECC 3 byte ECC per 256 Byte data */
+#define NAND_ECC_SOFT 1
+/* Hardware ECC 3 byte ECC per 256 Byte data */
+#define NAND_ECC_HW3_256 2
+/* Hardware ECC 3 byte ECC per 512 Byte data */
+#define NAND_ECC_HW3_512 3
+/* Hardware ECC 6 byte ECC per 512 Byte data */
+#define NAND_ECC_HW6_512 4
+/* Hardware ECC 8 byte ECC per 512 Byte data */
+#define NAND_ECC_HW8_512 6
+ </programlisting>
+ </para>
+ </sect1>
+
+ <sect1>
+ <title>Hardware control related constants</title>
+ <para>
+ These constants describe the requested hardware access function when
+ the boardspecific hardware control function is called
+ <programlisting>
+/* Select the chip by setting nCE to low */
+#define NAND_CTL_SETNCE 1
+/* Deselect the chip by setting nCE to high */
+#define NAND_CTL_CLRNCE 2
+/* Select the command latch by setting CLE to high */
+#define NAND_CTL_SETCLE 3
+/* Deselect the command latch by setting CLE to low */
+#define NAND_CTL_CLRCLE 4
+/* Select the address latch by setting ALE to high */
+#define NAND_CTL_SETALE 5
+/* Deselect the address latch by setting ALE to low */
+#define NAND_CTL_CLRALE 6
+/* Set write protection by setting WP to high. Not used! */
+#define NAND_CTL_SETWP 7
+/* Clear write protection by setting WP to low. Not used! */
+#define NAND_CTL_CLRWP 8
+ </programlisting>
+ </para>
+ </sect1>
+
+ <sect1>
+ <title>Bad block table related constants</title>
+ <para>
+ These constants describe the options used for bad block
+ table descriptors.
+ <programlisting>
+/* Options for the bad block table descriptors */
+
+/* The number of bits used per block in the bbt on the device */
+#define NAND_BBT_NRBITS_MSK 0x0000000F
+#define NAND_BBT_1BIT 0x00000001
+#define NAND_BBT_2BIT 0x00000002
+#define NAND_BBT_4BIT 0x00000004
+#define NAND_BBT_8BIT 0x00000008
+/* The bad block table is in the last good block of the device */
+#define NAND_BBT_LASTBLOCK 0x00000010
+/* The bbt is at the given page, else we must scan for the bbt */
+#define NAND_BBT_ABSPAGE 0x00000020
+/* The bbt is at the given page, else we must scan for the bbt */
+#define NAND_BBT_SEARCH 0x00000040
+/* bbt is stored per chip on multichip devices */
+#define NAND_BBT_PERCHIP 0x00000080
+/* bbt has a version counter at offset veroffs */
+#define NAND_BBT_VERSION 0x00000100
+/* Create a bbt if none exists */
+#define NAND_BBT_CREATE 0x00000200
+/* Search good / bad pattern through all pages of a block */
+#define NAND_BBT_SCANALLPAGES 0x00000400
+/* Scan block empty during good / bad block scan */
+#define NAND_BBT_SCANEMPTY 0x00000800
+/* Write bbt if necessary */
+#define NAND_BBT_WRITE 0x00001000
+/* Read and write back block contents when writing bbt */
+#define NAND_BBT_SAVECONTENT 0x00002000
+ </programlisting>
+ </para>
+ </sect1>
+
+ </chapter>
+
+ <chapter id="structs">
+ <title>Structures</title>
+ <para>
+ This chapter contains the autogenerated documentation of the structures which are
+ used in the NAND driver and might be relevant for a driver developer. Each
+ struct member has a short description which is marked with an [XXX] identifier.
+ See the chapter "Documentation hints" for an explanation.
+ </para>
+!Iinclude/linux/mtd/nand.h
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the NAND kernel API functions
+ which are exported. Each function has a short description which is marked with an [XXX] identifier.
+ See the chapter "Documentation hints" for an explanation.
+ </para>
+!Edrivers/mtd/nand/nand_base.c
+!Edrivers/mtd/nand/nand_bbt.c
+!Edrivers/mtd/nand/nand_ecc.c
+ </chapter>
+
+ <chapter id="intfunctions">
+ <title>Internal Functions Provided</title>
+ <para>
+ This chapter contains the autogenerated documentation of the NAND driver internal functions.
+ Each function has a short description which is marked with an [XXX] identifier.
+ See the chapter "Documentation hints" for an explanation.
+ The functions marked with [DEFAULT] might be relevant for a board driver developer.
+ </para>
+!Idrivers/mtd/nand/nand_base.c
+!Idrivers/mtd/nand/nand_bbt.c
+!Idrivers/mtd/nand/nand_ecc.c
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+ <para>
+ The following people have contributed to the NAND driver:
+ <orderedlist>
+ <listitem><para>Steven J. Hill<email>sjhill@realitydiluted.com</email></para></listitem>
+ <listitem><para>David Woodhouse<email>dwmw2@infradead.org</email></para></listitem>
+ <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
+ </orderedlist>
+ A lot of users have provided bugfixes, improvements and helping hands for testing.
+ Thanks a lot.
+ </para>
+ <para>
+ The following people have contributed to this document:
+ <orderedlist>
+ <listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
+ </orderedlist>
+ </para>
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/procfs-guide.tmpl b/Documentation/DocBook/procfs-guide.tmpl
new file mode 100644
index 000000000000..45cad23efefa
--- /dev/null
+++ b/Documentation/DocBook/procfs-guide.tmpl
@@ -0,0 +1,591 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" [
+<!ENTITY procfsexample SYSTEM "procfs_example.xml">
+]>
+
+<book id="LKProcfsGuide">
+ <bookinfo>
+ <title>Linux Kernel Procfs Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Erik</firstname>
+ <othername>(J.A.K.)</othername>
+ <surname>Mouw</surname>
+ <affiliation>
+ <orgname>Delft University of Technology</orgname>
+ <orgdiv>Faculty of Information Technology and Systems</orgdiv>
+ <address>
+ <email>J.A.K.Mouw@its.tudelft.nl</email>
+ <pob>PO BOX 5031</pob>
+ <postcode>2600 GA</postcode>
+ <city>Delft</city>
+ <country>The Netherlands</country>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <revhistory>
+ <revision>
+ <revnumber>1.0&nbsp;</revnumber>
+ <date>May 30, 2001</date>
+ <revremark>Initial revision posted to linux-kernel</revremark>
+ </revision>
+ <revision>
+ <revnumber>1.1&nbsp;</revnumber>
+ <date>June 3, 2001</date>
+ <revremark>Revised after comments from linux-kernel</revremark>
+ </revision>
+ </revhistory>
+
+ <copyright>
+ <year>2001</year>
+ <holder>Erik Mouw</holder>
+ </copyright>
+
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute it
+ and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This documentation is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ PURPOSE. See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+
+
+
+ <toc>
+ </toc>
+
+
+
+
+ <preface>
+ <title>Preface</title>
+
+ <para>
+ This guide describes the use of the procfs file system from
+ within the Linux kernel. The idea to write this guide came up on
+ the #kernelnewbies IRC channel (see <ulink
+ url="http://www.kernelnewbies.org/">http://www.kernelnewbies.org/</ulink>),
+ when Jeff Garzik explained the use of procfs and forwarded me a
+ message Alexander Viro wrote to the linux-kernel mailing list. I
+ agreed to write it up nicely, so here it is.
+ </para>
+
+ <para>
+ I'd like to thank Jeff Garzik
+ <email>jgarzik@pobox.com</email> and Alexander Viro
+ <email>viro@parcelfarce.linux.theplanet.co.uk</email> for their input,
+ Tim Waugh <email>twaugh@redhat.com</email> for his <ulink
+ url="http://people.redhat.com/twaugh/docbook/selfdocbook/">Selfdocbook</ulink>,
+ and Marc Joosen <email>marcj@historia.et.tudelft.nl</email> for
+ proofreading.
+ </para>
+
+ <para>
+ This documentation was written while working on the LART
+ computing board (<ulink
+ url="http://www.lart.tudelft.nl/">http://www.lart.tudelft.nl/</ulink>),
+ which is sponsored by the Mobile Multi-media Communications
+ (<ulink
+ url="http://www.mmc.tudelft.nl/">http://www.mmc.tudelft.nl/</ulink>)
+ and Ubiquitous Communications (<ulink
+ url="http://www.ubicom.tudelft.nl/">http://www.ubicom.tudelft.nl/</ulink>)
+ projects.
+ </para>
+
+ <para>
+ Erik
+ </para>
+ </preface>
+
+
+
+
+ <chapter id="intro">
+ <title>Introduction</title>
+
+ <para>
+ The <filename class="directory">/proc</filename> file system
+ (procfs) is a special file system in the Linux kernel. It's a
+ virtual file system: it is not associated with a block device
+ but exists only in memory. The files in the procfs are there to
+ allow userland programs access to certain information from the
+ kernel (like process information in <filename
+ class="directory">/proc/[0-9]+/</filename>), but also for debug
+ purposes (like <filename>/proc/ksyms</filename>).
+ </para>
+
+ <para>
+ This guide describes the use of the procfs file system from
+ within the Linux kernel. It starts by introducing all relevant
+ functions to manage the files within the file system. After that
+ it shows how to communicate with userland, and some tips and
+ tricks will be pointed out. Finally a complete example will be
+ shown.
+ </para>
+
+ <para>
+ Note that the files in <filename
+ class="directory">/proc/sys</filename> are sysctl files: they
+ don't belong to procfs and are governed by a completely
+ different API described in the Kernel API book.
+ </para>
+ </chapter>
+
+
+
+
+ <chapter id="managing">
+ <title>Managing procfs entries</title>
+
+ <para>
+ This chapter describes the functions that various kernel
+ components use to populate the procfs with files, symlinks,
+ device nodes, and directories.
+ </para>
+
+ <para>
+ A minor note before we start: if you want to use any of the
+ procfs functions, be sure to include the correct header file!
+ This should be one of the first lines in your code:
+ </para>
+
+ <programlisting>
+#include &lt;linux/proc_fs.h&gt;
+ </programlisting>
+
+
+
+
+ <sect1 id="regularfile">
+ <title>Creating a regular file</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry* <function>create_proc_entry</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>mode_t <parameter>mode</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ This function creates a regular file with the name
+ <parameter>name</parameter>, file mode
+ <parameter>mode</parameter> in the directory
+ <parameter>parent</parameter>. To create a file in the root of
+ the procfs, use <constant>NULL</constant> as
+ <parameter>parent</parameter> parameter. When successful, the
+ function will return a pointer to the freshly created
+ <structname>struct proc_dir_entry</structname>; otherwise it
+ will return <constant>NULL</constant>. <xref
+ linkend="userland"/> describes how to do something useful with
+ regular files.
+ </para>
+
+ <para>
+ Note that it is specifically supported that you can pass a
+ path that spans multiple directories. For example
+ <function>create_proc_entry</function>(<parameter>"drivers/via0/info"</parameter>)
+ will create the <filename class="directory">via0</filename>
+ directory if necessary, with standard
+ <constant>0755</constant> permissions.
+ </para>
+
+ <para>
+ If you only want to be able to read the file, the function
+ <function>create_proc_read_entry</function> described in <xref
+ linkend="convenience"/> may be used to create and initialise
+ the procfs entry in one single call.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1>
+ <title>Creating a symlink</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry*
+ <function>proc_symlink</function></funcdef> <paramdef>const
+ char* <parameter>name</parameter></paramdef>
+ <paramdef>struct proc_dir_entry*
+ <parameter>parent</parameter></paramdef> <paramdef>const
+ char* <parameter>dest</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ This creates a symlink in the procfs directory
+ <parameter>parent</parameter> that points from
+ <parameter>name</parameter> to
+ <parameter>dest</parameter>. This translates in userland to
+ <literal>ln -s</literal> <parameter>dest</parameter>
+ <parameter>name</parameter>.
+ </para>
+ </sect1>
+
+ <sect1>
+ <title>Creating a directory</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry* <function>proc_mkdir</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ Create a directory <parameter>name</parameter> in the procfs
+ directory <parameter>parent</parameter>.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1>
+ <title>Removing an entry</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>void <function>remove_proc_entry</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ Removes the entry <parameter>name</parameter> in the directory
+ <parameter>parent</parameter> from the procfs. Entries are
+ removed by their <emphasis>name</emphasis>, not by the
+ <structname>struct proc_dir_entry</structname> returned by the
+ various create functions. Note that this function doesn't
+ recursively remove entries.
+ </para>
+
+ <para>
+ Be sure to free the <structfield>data</structfield> entry from
+ the <structname>struct proc_dir_entry</structname> before
+ <function>remove_proc_entry</function> is called (that is: if
+ there was some <structfield>data</structfield> allocated, of
+ course). See <xref linkend="usingdata"/> for more information
+ on using the <structfield>data</structfield> entry.
+ </para>
+ </sect1>
+ </chapter>
+
+
+
+
+ <chapter id="userland">
+ <title>Communicating with userland</title>
+
+ <para>
+ Instead of reading (or writing) information directly from
+ kernel memory, procfs works with <emphasis>call back
+ functions</emphasis> for files: functions that are called when
+ a specific file is being read or written. Such functions have
+ to be initialised after the procfs file is created by setting
+ the <structfield>read_proc</structfield> and/or
+ <structfield>write_proc</structfield> fields in the
+ <structname>struct proc_dir_entry*</structname> that the
+ function <function>create_proc_entry</function> returned:
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+
+entry->read_proc = read_proc_foo;
+entry->write_proc = write_proc_foo;
+ </programlisting>
+
+ <para>
+ If you only want to use the
+ <structfield>read_proc</structfield>, the function
+ <function>create_proc_read_entry</function> described in <xref
+ linkend="convenience"/> may be used to create and initialise the
+ procfs entry in one single call.
+ </para>
+
+
+
+ <sect1>
+ <title>Reading data</title>
+
+ <para>
+ The read function is a call back function that allows userland
+ processes to read data from the kernel. The read function
+ should have the following format:
+ </para>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>int <function>read_func</function></funcdef>
+ <paramdef>char* <parameter>page</parameter></paramdef>
+ <paramdef>char** <parameter>start</parameter></paramdef>
+ <paramdef>off_t <parameter>off</parameter></paramdef>
+ <paramdef>int <parameter>count</parameter></paramdef>
+ <paramdef>int* <parameter>eof</parameter></paramdef>
+ <paramdef>void* <parameter>data</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ The read function should write its information into the
+ <parameter>page</parameter>. For proper use, the function
+ should start writing at an offset of
+ <parameter>off</parameter> in <parameter>page</parameter> and
+ write at most <parameter>count</parameter> bytes, but because
+ most read functions are quite simple and only return a small
+ amount of information, these two parameters are usually
+ ignored (it breaks pagers like <literal>more</literal> and
+ <literal>less</literal>, but <literal>cat</literal> still
+ works).
+ </para>
+
+ <para>
+ If the <parameter>off</parameter> and
+ <parameter>count</parameter> parameters are properly used,
+ <parameter>eof</parameter> should be used to signal that the
+ end of the file has been reached by writing
+ <literal>1</literal> to the memory location
+ <parameter>eof</parameter> points to.
+ </para>
+
+ <para>
+ The parameter <parameter>start</parameter> doesn't seem to be
+ used anywhere in the kernel. The <parameter>data</parameter>
+ parameter can be used to create a single call back function for
+ several files, see <xref linkend="usingdata"/>.
+ </para>
+
+ <para>
+ The <function>read_func</function> function must return the
+ number of bytes written into the <parameter>page</parameter>.
+ </para>
+
+ <para>
+ <xref linkend="example"/> shows how to use a read call back
+ function.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1>
+ <title>Writing data</title>
+
+ <para>
+ The write call back function allows a userland process to write
+ data to the kernel, so it has some kind of control over the
+ kernel. The write function should have the following format:
+ </para>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>int <function>write_func</function></funcdef>
+ <paramdef>struct file* <parameter>file</parameter></paramdef>
+ <paramdef>const char* <parameter>buffer</parameter></paramdef>
+ <paramdef>unsigned long <parameter>count</parameter></paramdef>
+ <paramdef>void* <parameter>data</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ The write function should read <parameter>count</parameter>
+ bytes at maximum from the <parameter>buffer</parameter>. Note
+ that the <parameter>buffer</parameter> doesn't live in the
+ kernel's memory space, so it should first be copied to kernel
+ space with <function>copy_from_user</function>. The
+ <parameter>file</parameter> parameter is usually
+ ignored. <xref linkend="usingdata"/> shows how to use the
+ <parameter>data</parameter> parameter.
+ </para>
+
+ <para>
+ Again, <xref linkend="example"/> shows how to use this call back
+ function.
+ </para>
+ </sect1>
+
+
+
+
+ <sect1 id="usingdata">
+ <title>A single call back for many files</title>
+
+ <para>
+ When a large number of almost identical files is used, it's
+ quite inconvenient to use a separate call back function for
+ each file. A better approach is to have a single call back
+ function that distinguishes between the files by using the
+ <structfield>data</structfield> field in <structname>struct
+ proc_dir_entry</structname>. First of all, the
+ <structfield>data</structfield> field has to be initialised:
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+struct my_file_data *file_data;
+
+file_data = kmalloc(sizeof(struct my_file_data), GFP_KERNEL);
+entry->data = file_data;
+ </programlisting>
+
+ <para>
+ The <structfield>data</structfield> field is a <type>void
+ *</type>, so it can be initialised with anything.
+ </para>
+
+ <para>
+ Now that the <structfield>data</structfield> field is set, the
+ <function>read_proc</function> and
+ <function>write_proc</function> can use it to distinguish
+ between files because they get it passed into their
+ <parameter>data</parameter> parameter:
+ </para>
+
+ <programlisting>
+int foo_read_func(char *page, char **start, off_t off,
+ int count, int *eof, void *data)
+{
+ int len;
+
+ if(data == file_data) {
+ /* special case for this file */
+ } else {
+ /* normal processing */
+ }
+
+ return len;
+}
+ </programlisting>
+
+ <para>
+ Be sure to free the <structfield>data</structfield> field
+ when removing the procfs entry.
+ </para>
+ </sect1>
+ </chapter>
+
+
+
+
+ <chapter id="tips">
+ <title>Tips and tricks</title>
+
+
+
+
+ <sect1 id="convenience">
+ <title>Convenience functions</title>
+
+ <funcsynopsis>
+ <funcprototype>
+ <funcdef>struct proc_dir_entry* <function>create_proc_read_entry</function></funcdef>
+ <paramdef>const char* <parameter>name</parameter></paramdef>
+ <paramdef>mode_t <parameter>mode</parameter></paramdef>
+ <paramdef>struct proc_dir_entry* <parameter>parent</parameter></paramdef>
+ <paramdef>read_proc_t* <parameter>read_proc</parameter></paramdef>
+ <paramdef>void* <parameter>data</parameter></paramdef>
+ </funcprototype>
+ </funcsynopsis>
+
+ <para>
+ This function creates a regular file in exactly the same way
+ as <function>create_proc_entry</function> from <xref
+ linkend="regularfile"/> does, but also allows you to set the read
+ function <parameter>read_proc</parameter> in one call. This
+ function can set the <parameter>data</parameter> as well, as
+ explained in <xref linkend="usingdata"/>.
+ </para>
+ </sect1>
+
+
+
+ <sect1>
+ <title>Modules</title>
+
+ <para>
+ If procfs is being used from within a module, be sure to set
+ the <structfield>owner</structfield> field in the
+ <structname>struct proc_dir_entry</structname> to
+ <constant>THIS_MODULE</constant>.
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+
+entry->owner = THIS_MODULE;
+ </programlisting>
+ </sect1>
+
+
+
+
+ <sect1>
+ <title>Mode and ownership</title>
+
+ <para>
+ Sometimes it is useful to change the mode and/or ownership of
+ a procfs entry. Here is an example that shows how to achieve
+ that:
+ </para>
+
+ <programlisting>
+struct proc_dir_entry* entry;
+
+entry->mode = S_IWUSR |S_IRUSR | S_IRGRP | S_IROTH;
+entry->uid = 0;
+entry->gid = 100;
+ </programlisting>
+
+ </sect1>
+ </chapter>
+
+
+
+
+ <chapter id="example">
+ <title>Example</title>
+
+ <!-- be careful with the example code: it shouldn't be wider than
+ approx. 60 columns, or otherwise it won't fit properly on a page
+ -->
+
+&procfsexample;
+
+ </chapter>
+</book>
diff --git a/Documentation/DocBook/procfs_example.c b/Documentation/DocBook/procfs_example.c
new file mode 100644
index 000000000000..7064084c1c5e
--- /dev/null
+++ b/Documentation/DocBook/procfs_example.c
@@ -0,0 +1,224 @@
+/*
+ * procfs_example.c: an example proc interface
+ *
+ * Copyright (C) 2001, Erik Mouw (J.A.K.Mouw@its.tudelft.nl)
+ *
+ * This file accompanies the procfs-guide in the Linux kernel
+ * source. Its main use is to demonstrate the concepts and
+ * functions described in the guide.
+ *
+ * This software has been developed while working on the LART
+ * computing board (http://www.lart.tudelft.nl/), which is
+ * sponsored by the Mobile Multi-media Communications
+ * (http://www.mmc.tudelft.nl/) and Ubiquitous Communications
+ * (http://www.ubicom.tudelft.nl/) projects.
+ *
+ * The author can be reached at:
+ *
+ * Erik Mouw
+ * Information and Communication Theory Group
+ * Faculty of Information Technology and Systems
+ * Delft University of Technology
+ * P.O. Box 5031
+ * 2600 GA Delft
+ * The Netherlands
+ *
+ *
+ * This program is free software; you can redistribute
+ * it and/or modify it under the terms of the GNU General
+ * Public License as published by the Free Software
+ * Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * This program is distributed in the hope that it will be
+ * useful, but WITHOUT ANY WARRANTY; without even the implied
+ * warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
+ * PURPOSE. See the GNU General Public License for more
+ * details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place,
+ * Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/proc_fs.h>
+#include <linux/jiffies.h>
+#include <asm/uaccess.h>
+
+
+#define MODULE_VERS "1.0"
+#define MODULE_NAME "procfs_example"
+
+/* Maximum number of characters kept in each fb_data_t string field. */
+#define FOOBAR_LEN 8
+
+/*
+ * Per-file state: one instance backs the "foo" entry and another backs
+ * the "bar" entry. Each buffer is FOOBAR_LEN + 1 bytes so that a
+ * full-length string still has room for its terminating NUL.
+ */
+struct fb_data_t {
+ char name[FOOBAR_LEN + 1]; /* label printed by proc_read_foobar() */
+ char value[FOOBAR_LEN + 1]; /* text stored by proc_write_foobar() */
+};
+
+
+/*
+ * Handles for the directory, the three files and the symlink that
+ * init_procfs_example() creates.
+ */
+static struct proc_dir_entry *example_dir, *foo_file,
+ *bar_file, *jiffies_file, *symlink;
+
+
+/* Storage attached to the foo and bar entries through their data field. */
+struct fb_data_t foo_data, bar_data;
+
+
+/*
+ * read_proc callback for the "jiffies" entry.
+ *
+ * Formats the current value of the jiffies counter into @page and
+ * returns the number of bytes written. The output is a single short
+ * line, so @start, @off, @count, @eof and @data are not used.
+ */
+static int proc_read_jiffies(char *page, char **start,
+			     off_t off, int count,
+			     int *eof, void *data)
+{
+	/* jiffies is an unsigned long, so it has to be printed with %lu */
+	return sprintf(page, "jiffies = %lu\n", jiffies);
+}
+
+
+/*
+ * read_proc callback shared by the "foo" and "bar" entries.
+ *
+ * @data is the struct fb_data_t that was attached to the entry being
+ * read. Its name and value are formatted into @page as
+ * "name = 'value'" followed by a newline, and the number of bytes
+ * written is returned. @start, @off, @count and @eof are not used.
+ */
+static int proc_read_foobar(char *page, char **start,
+			    off_t off, int count,
+			    int *eof, void *data)
+{
+	struct fb_data_t *entry = data;
+
+	/*
+	 * DON'T DO THAT - buffer overruns are bad. An unbounded sprintf()
+	 * is a poor habit to copy, even though both strings printed here
+	 * are at most FOOBAR_LEN characters long.
+	 */
+	return sprintf(page, "%s = '%s'\n", entry->name, entry->value);
+}
+
+
+/*
+ * write_proc callback shared by the "foo" and "bar" entries.
+ *
+ * Copies at most FOOBAR_LEN bytes from the userland @buffer into the
+ * value field of the struct fb_data_t passed in @data and
+ * NUL-terminates it. @file is not used.
+ *
+ * Returns the number of bytes stored, which is smaller than @count
+ * when the write was truncated, or -EFAULT if the copy from userland
+ * fails.
+ */
+static int proc_write_foobar(struct file *file,
+			     const char *buffer,
+			     unsigned long count,
+			     void *data)
+{
+	struct fb_data_t *entry = data;
+	/* clamp so the copy plus the terminator always fits in value[] */
+	int nbytes = (count > FOOBAR_LEN) ? FOOBAR_LEN : count;
+
+	if (copy_from_user(entry->value, buffer, nbytes))
+		return -EFAULT;
+
+	entry->value[nbytes] = '\0';
+
+	return nbytes;
+}
+
+
+/*
+ * Module entry point: builds the example procfs tree.
+ *
+ * Creates a directory named MODULE_NAME with a NULL parent and
+ * populates it with:
+ *   - "jiffies"      read-only file served by proc_read_jiffies()
+ *   - "foo", "bar"   read/write files sharing proc_read_foobar() and
+ *                    proc_write_foobar(), told apart by their data field
+ *   - "jiffies_too"  symlink to "jiffies"
+ *
+ * Returns 0 on success. If any step fails, every entry created so far
+ * is removed again in reverse order of creation and -ENOMEM is returned.
+ */
+static int __init init_procfs_example(void)
+{
+	int rv = 0;
+
+	/* create directory */
+	example_dir = proc_mkdir(MODULE_NAME, NULL);
+	if(example_dir == NULL) {
+		rv = -ENOMEM;
+		goto out;
+	}
+
+	example_dir->owner = THIS_MODULE;
+
+	/* create jiffies using convenience function */
+	jiffies_file = create_proc_read_entry("jiffies",
+					      0444, example_dir,
+					      proc_read_jiffies,
+					      NULL);
+	if(jiffies_file == NULL) {
+		rv = -ENOMEM;
+		goto no_jiffies;
+	}
+
+	jiffies_file->owner = THIS_MODULE;
+
+	/* create foo and bar files using same callback
+	 * functions
+	 */
+	foo_file = create_proc_entry("foo", 0644, example_dir);
+	if(foo_file == NULL) {
+		rv = -ENOMEM;
+		goto no_foo;
+	}
+
+	strcpy(foo_data.name, "foo");
+	strcpy(foo_data.value, "foo");
+	foo_file->data = &foo_data;
+	foo_file->read_proc = proc_read_foobar;
+	foo_file->write_proc = proc_write_foobar;
+	foo_file->owner = THIS_MODULE;
+
+	bar_file = create_proc_entry("bar", 0644, example_dir);
+	if(bar_file == NULL) {
+		rv = -ENOMEM;
+		goto no_bar;
+	}
+
+	strcpy(bar_data.name, "bar");
+	strcpy(bar_data.value, "bar");
+	bar_file->data = &bar_data;
+	bar_file->read_proc = proc_read_foobar;
+	bar_file->write_proc = proc_write_foobar;
+	bar_file->owner = THIS_MODULE;
+
+	/* create symlink */
+	symlink = proc_symlink("jiffies_too", example_dir,
+			       "jiffies");
+	if(symlink == NULL) {
+		rv = -ENOMEM;
+		goto no_symlink;
+	}
+
+	symlink->owner = THIS_MODULE;
+
+	/* everything OK */
+	printk(KERN_INFO "%s %s initialised\n",
+	       MODULE_NAME, MODULE_VERS);
+	return 0;
+
+	/*
+	 * Error unwinding: each label undoes the entries that existed
+	 * before the failing step, newest first.
+	 */
+no_symlink:
+	remove_proc_entry("bar", example_dir);
+no_bar:
+	remove_proc_entry("foo", example_dir);
+no_foo:
+	remove_proc_entry("jiffies", example_dir);
+no_jiffies:
+	remove_proc_entry(MODULE_NAME, NULL);
+out:
+	return rv;
+}
+
+
+/*
+ * Module exit point: tears down the example procfs tree.
+ *
+ * Entries are removed by name, children first, because
+ * remove_proc_entry() does not remove entries recursively. The
+ * directory itself goes last.
+ */
+static void __exit cleanup_procfs_example(void)
+{
+	remove_proc_entry("jiffies_too", example_dir);
+	remove_proc_entry("bar", example_dir);
+	remove_proc_entry("foo", example_dir);
+	remove_proc_entry("jiffies", example_dir);
+	remove_proc_entry(MODULE_NAME, NULL);
+
+	printk(KERN_INFO "%s %s removed\n",
+	       MODULE_NAME, MODULE_VERS);
+}
+
+
+/* Hook the init and exit functions into module load and unload. */
+module_init(init_procfs_example);
+module_exit(cleanup_procfs_example);
+
+MODULE_AUTHOR("Erik Mouw");
+MODULE_DESCRIPTION("procfs examples");
+/* Matches the "GPL version 2 or later" grant in the header of this file. */
+MODULE_LICENSE("GPL");
diff --git a/Documentation/DocBook/scsidrivers.tmpl b/Documentation/DocBook/scsidrivers.tmpl
new file mode 100644
index 000000000000..d058e65daf19
--- /dev/null
+++ b/Documentation/DocBook/scsidrivers.tmpl
@@ -0,0 +1,193 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="scsidrivers">
+ <bookinfo>
+ <title>SCSI Subsystem Interfaces</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Douglas</firstname>
+ <surname>Gilbert</surname>
+ <affiliation>
+ <address>
+ <email>dgilbert@interlog.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+ <pubdate>2003-08-11</pubdate>
+
+ <copyright>
+ <year>2002</year>
+ <year>2003</year>
+ <holder>Douglas Gilbert</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+This document outlines the interface between the Linux scsi mid level
+and lower level drivers. Lower level drivers are variously called HBA
+(host bus adapter) drivers, host drivers (HD) or pseudo adapter drivers.
+The latter alludes to the fact that a lower level driver may be a
+bridge to another IO subsystem (and the "ide-scsi" driver is an example
+of this). There can be many lower level drivers active in a running
+system, but only one per hardware type. For example, the aic7xxx driver
+controls adaptec controllers based on the 7xxx chip series. Most lower
+level drivers can control one or more scsi hosts (a.k.a. scsi initiators).
+ </para>
+<para>
+This document can also be found as an ASCII text file in the Linux kernel
+source: <filename>Documentation/scsi/scsi_mid_low_api.txt</filename> .
+That file currently holds a little more information than this document. The
+<filename>drivers/scsi/hosts.h</filename> and <filename>
+drivers/scsi/scsi.h</filename> headers contain descriptions of members
+of important structures for the scsi subsystem.
+</para>
+ </chapter>
+
+ <chapter id="driver-struct">
+ <title>Driver structure</title>
+ <para>
+Traditionally a lower level driver for the scsi subsystem has been
+at least two files in the drivers/scsi directory. For example, a
+driver called "xyz" has a header file "xyz.h" and a source file
+"xyz.c". [Actually there is no good reason why this couldn't all
+be in one file.] Some drivers that have been ported to several operating
+systems (e.g. aic7xxx which has separate files for generic and
+OS-specific code) have more than two files. Such drivers tend to have
+their own directory under the drivers/scsi directory.
+ </para>
+ <para>
+scsi_module.c is normally included at the end of a lower
+level driver. For it to work a declaration like this is needed before
+it is included:
+<programlisting>
+ static Scsi_Host_Template driver_template = DRIVER_TEMPLATE;
+ /* DRIVER_TEMPLATE should contain pointers to supported interface
+ functions. Scsi_Host_Template is defined in hosts.h */
+ #include "scsi_module.c"
+</programlisting>
+ </para>
+ <para>
+The scsi_module.c assumes the name "driver_template" is appropriately
+defined. It contains 2 functions:
+<orderedlist>
+<listitem><para>
+ init_this_scsi_driver() called during builtin and module driver
+ initialization: invokes mid level's scsi_register_host()
+</para></listitem>
+<listitem><para>
+ exit_this_scsi_driver() called during closedown: invokes
+ mid level's scsi_unregister_host()
+</para></listitem>
+</orderedlist>
+ </para>
+<para>
+When a new, lower level driver is being added to Linux, the following
+files (all found in the drivers/scsi directory) will need some attention:
+Makefile, Config.help and Config.in . It is probably best to look at what
+an existing lower level driver does in this regard.
+</para>
+ </chapter>
+
+ <chapter id="intfunctions">
+ <title>Interface Functions</title>
+!EDocumentation/scsi/scsi_mid_low_api.txt
+ </chapter>
+
+ <chapter id="locks">
+ <title>Locks</title>
+<para>
+Each Scsi_Host instance has a spin_lock called Scsi_Host::default_lock
+which is initialized in scsi_register() [found in hosts.c]. Within the
+same function the Scsi_Host::host_lock pointer is initialized to point
+at default_lock with the scsi_assign_lock() function. Thereafter
+lock and unlock operations performed by the mid level use the
+Scsi_Host::host_lock pointer.
+</para>
+<para>
+Lower level drivers can override the use of Scsi_Host::default_lock by
+using scsi_assign_lock(). The earliest opportunity to do this would
+be in the detect() function after it has invoked scsi_register(). It
+could be replaced by a coarser grain lock (e.g. per driver) or a
+lock of equal granularity (i.e. per host). Using finer grain locks
+(e.g. per scsi device) may be possible by juggling locks in
+queuecommand().
+</para>
+ </chapter>
+
+ <chapter id="changes">
+ <title>Changes since lk 2.4 series</title>
+<para>
+io_request_lock has been replaced by several finer grained locks. The lock
+relevant to lower level drivers is Scsi_Host::host_lock and there is one
+per scsi host.
+</para>
+<para>
+The older error handling mechanism has been removed. This means the
+lower level interface functions abort() and reset() have been removed.
+</para>
+<para>
+In the 2.4 series the scsi subsystem configuration descriptions were
+aggregated with the configuration descriptions from all other Linux
+subsystems in the Documentation/Configure.help file. In the 2.5 series,
+the scsi subsystem now has its own (much smaller) drivers/scsi/Config.help
+file.
+</para>
+ </chapter>
+
+ <chapter id="credits">
+ <title>Credits</title>
+<para>
+The following people have contributed to this document:
+<orderedlist>
+<listitem><para>
+Mike Anderson <email>andmike@us.ibm.com</email>
+</para></listitem>
+<listitem><para>
+James Bottomley <email>James.Bottomley@steeleye.com</email>
+</para></listitem>
+<listitem><para>
+Patrick Mansfield <email>patmans@us.ibm.com</email>
+</para></listitem>
+</orderedlist>
+</para>
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/sis900.tmpl b/Documentation/DocBook/sis900.tmpl
new file mode 100644
index 000000000000..6c2cbac93c3f
--- /dev/null
+++ b/Documentation/DocBook/sis900.tmpl
@@ -0,0 +1,585 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="SiS900Guide">
+
+<bookinfo>
+
+<title>SiS 900/7016 Fast Ethernet Device Driver</title>
+
+<authorgroup>
+<author>
+<firstname>Ollie</firstname>
+<surname>Lho</surname>
+</author>
+
+<author>
+<firstname>Lei Chun</firstname>
+<surname>Chang</surname>
+</author>
+</authorgroup>
+
+<edition>Document Revision: 0.3 for SiS900 driver v1.06 &amp; v1.07</edition>
+<pubdate>November 16, 2000</pubdate>
+
+<copyright>
+ <year>1999</year>
+ <holder>Silicon Integrated System Corp.</holder>
+</copyright>
+
+<legalnotice>
+ <para>
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software
+ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ </para>
+</legalnotice>
+
+<abstract>
+<para>
+This document gives some information on installation and usage of SiS 900/7016
+device driver under Linux.
+</para>
+</abstract>
+
+</bookinfo>
+
+<toc></toc>
+
+<chapter id="intro">
+ <title>Introduction</title>
+
+<para>
+This document describes the revision 1.06 and 1.07 of SiS 900/7016 Fast Ethernet
+device driver under Linux. The driver is developed by Silicon Integrated
+System Corp. and distributed freely under the GNU General Public License (GPL).
+The driver can be compiled as a loadable module and used under Linux kernel
+version 2.2.x. (rev. 1.06)
+With minimal changes, the driver can also be used under 2.3.x and 2.4.x kernel
+(rev. 1.07), please see
+<xref linkend="install"/>. If you intend to
+use the driver for earlier kernels, you are on your own.
+</para>
+
+<para>
+The driver is tested with usual TCP/IP applications including
+FTP, Telnet, Netscape etc. and is used constantly by the developers.
+</para>
+
+<para>
+Please send all comments/fixes/questions to
+<ulink url="mailto:lcchang@sis.com.tw">Lei-Chun Chang</ulink>.
+</para>
+</chapter>
+
+<chapter id="changes">
+ <title>Changes</title>
+
+<para>
+Changes made in Revision 1.07
+
+<orderedlist>
+<listitem>
+<para>
+Separation of sis900.c and sis900.h in order to move most
+constant definition to sis900.h (many of those constants were
+corrected)
+</para>
+</listitem>
+
+<listitem>
+<para>
+Clean up PCI detection, the pci-scan from Donald Becker was not used,
+just simple pci&lowbar;find&lowbar;*.
+</para>
+</listitem>
+
+<listitem>
+<para>
+MII detection is modified to support multiple MII transceivers.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Bugs in read&lowbar;eeprom, mdio&lowbar;* were removed.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Lots of comments irrelevant to sis900 were removed/changed and
+more comments were added to reflect the real situation.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Clean up of physical/virtual address space mess in buffer
+descriptors.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Better transmit/receive error handling.
+</para>
+</listitem>
+
+<listitem>
+<para>
+The driver now uses zero-copy single buffer management
+scheme to improve performance.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Names of variables were changed to be more consistent.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Clean up of auto-negotiation and timer code.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Automatic detection and change of PHY on the fly.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Bug in mac probing fixed.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Fix 630E equalizer problem by modifying the equalizer workaround rule.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Support for ICS1893 10/100 Integrated PHYceiver.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Support for media select by ifconfig.
+</para>
+</listitem>
+
+<listitem>
+<para>
+Added kernel-doc extractable documentation.
+</para>
+</listitem>
+
+</orderedlist>
+</para>
+</chapter>
+
+<chapter id="tested">
+ <title>Tested Environment</title>
+
+<para>
+This driver is developed on the following hardware
+
+<itemizedlist>
+<listitem>
+
+<para>
+Intel Celeron 500 with SiS 630 (rev 02) chipset
+</para>
+</listitem>
+<listitem>
+
+<para>
+SiS 900 (rev 01) and SiS 7016/7014 Fast Ethernet Card
+</para>
+</listitem>
+
+</itemizedlist>
+
+and tested with these software environments
+
+<itemizedlist>
+<listitem>
+
+<para>
+Red Hat Linux version 6.2
+</para>
+</listitem>
+<listitem>
+
+<para>
+Linux kernel version 2.4.0
+</para>
+</listitem>
+<listitem>
+
+<para>
+Netscape version 4.6
+</para>
+</listitem>
+<listitem>
+
+<para>
+NcFTP 3.0.0 beta 18
+</para>
+</listitem>
+<listitem>
+
+<para>
+Samba version 2.0.3
+</para>
+</listitem>
+
+</itemizedlist>
+
+</para>
+
+</chapter>
+
+<chapter id="files">
+<title>Files in This Package</title>
+
+<para>
+In the package you can find these files:
+</para>
+
+<para>
+<variablelist>
+
+<varlistentry>
+<term>sis900.c</term>
+<listitem>
+<para>
+Driver source file in C
+</para>
+</listitem>
+</varlistentry>
+
+<varlistentry>
+<term>sis900.h</term>
+<listitem>
+<para>
+Header file for sis900.c
+</para>
+</listitem>
+</varlistentry>
+
+<varlistentry>
+<term>sis900.sgml</term>
+<listitem>
+<para>
+DocBook SGML source of the document
+</para>
+</listitem>
+</varlistentry>
+
+<varlistentry>
+<term>sis900.txt</term>
+<listitem>
+<para>
+Driver document in plain text
+</para>
+</listitem>
+</varlistentry>
+
+</variablelist>
+</para>
+</chapter>
+
+<chapter id="install">
+ <title>Installation</title>
+
+<para>
+Silicon Integrated System Corp. is cooperating closely with core Linux Kernel
+developers. The revisions of SiS 900 driver are distributed by the usual channels
+for kernel tar files and patches. Those kernel tar files for official kernel and
+patches for kernel pre-release can be downloaded from the
+<ulink url="http://ftp.kernel.org/pub/linux/kernel/">official kernel ftp site</ulink>
+and its mirrors.
+The 1.06 revision can be found in kernel version later than 2.3.15 and pre-2.2.14,
+and 1.07 revision can be found in kernel version 2.4.0.
+If you have no prior experience in networking under Linux, please read
+<ulink url="http://www.tldp.org/">Ethernet HOWTO</ulink> and
+<ulink url="http://www.tldp.org/">Networking HOWTO</ulink> available from
+Linux Documentation Project (LDP).
+</para>
+
+<para>
+The driver is bundled in release later than 2.2.11 and 2.3.15 so this
+is the easiest case.
+Be sure you have the appropriate packages for compiling kernel source.
+Those packages are listed in Documentation/Changes in kernel source
+distribution. If you have to install the driver other than those bundled
+in kernel release, you should have your driver file
+<filename>sis900.c</filename> and <filename>sis900.h</filename>
+copied into <filename class="directory">/usr/src/linux/drivers/net/</filename> first.
+There are two alternative ways to install the driver
+</para>
+
+<sect1>
+<title>Building the driver as loadable module</title>
+
+<para>
+To build the driver as a loadable kernel module you have to reconfigure
+the kernel to activate network support by
+</para>
+
+<para><screen>
+make menuconfig
+</screen></para>
+
+<para>
+Choose <quote>Loadable module support ---></quote>,
+then select <quote>Enable loadable module support</quote>.
+</para>
+
+<para>
+Choose <quote>Network Device Support ---></quote>, select
+<quote>Ethernet (10 or 100Mbit)</quote>.
+Then select <quote>EISA, VLB, PCI and on board controllers</quote>,
+and choose <quote>SiS 900/7016 PCI Fast Ethernet Adapter support</quote>
+to <quote>M</quote>.
+</para>
+
+<para>
+After reconfiguring the kernel, you can make the driver module by
+</para>
+
+<para><screen>
+make modules
+</screen></para>
+
+<para>
+The driver should be compiled with no errors. After compiling the driver,
+the driver can be installed to proper place by
+</para>
+
+<para><screen>
+make modules_install
+</screen></para>
+
+<para>
+Load the driver into kernel by
+</para>
+
+<para><screen>
+insmod sis900
+</screen></para>
+
+<para>
+When loading the driver into memory, some informational messages can be viewed by
+</para>
+
+<para>
+<screen>
+dmesg
+</screen>
+
+or
+
+<screen>
+cat /var/log/messages
+</screen>
+</para>
+
+<para>
+If the driver is loaded properly you will have messages similar to this:
+</para>
+
+<para><screen>
+sis900.c: v1.07.06 11/07/2000
+eth0: SiS 900 PCI Fast Ethernet at 0xd000, IRQ 10, 00:00:e8:83:7f:a4.
+eth0: SiS 900 Internal MII PHY transceiver found at address 1.
+eth0: Using SiS 900 Internal MII PHY as default
+</screen></para>
+
+<para>
+showing the version of the driver and the results of probing routine.
+</para>
+
+<para>
+Once the driver is loaded, network can be brought up by
+</para>
+
+<para><screen>
+/sbin/ifconfig eth0 IPADDR broadcast BROADCAST netmask NETMASK media TYPE
+</screen></para>
+
+<para>
+where IPADDR, BROADCAST, NETMASK are your IP address, broadcast address and
+netmask respectively. TYPE is used to set medium type used by the device.
+Typical values are "10baseT"(twisted-pair 10Mbps Ethernet) or "100baseT"
+(twisted-pair 100Mbps Ethernet). For more information on how to configure
+network interface, please refer to
+<ulink url="http://www.tldp.org/">Networking HOWTO</ulink>.
+</para>
+
+<para>
+The link status is also shown by kernel messages. For example, after the
+network interface is activated, you may have the message:
+</para>
+
+<para><screen>
+eth0: Media Link On 100mbps full-duplex
+</screen></para>
+
+<para>
+If you try to unplug the twisted pair (TP) cable you will get
+</para>
+
+<para><screen>
+eth0: Media Link Off
+</screen></para>
+
+<para>
+indicating that the link has failed.
+</para>
+</sect1>
+
+<sect1>
+<title>Building the driver into kernel</title>
+
+<para>
+If you want to make the driver into kernel, choose <quote>Y</quote>
+rather than <quote>M</quote> on
+<quote>SiS 900/7016 PCI Fast Ethernet Adapter support</quote>
+when configuring the kernel. Build the kernel image in the usual way
+</para>
+
+<para><screen>
+make clean
+
+make bzlilo
+</screen></para>
+
+<para>
+Next time the system reboots, you have the driver in memory.
+</para>
+
+</sect1>
+</chapter>
+
+<chapter id="problems">
+ <title>Known Problems and Bugs</title>
+
+<para>
+There are some known problems and bugs. If you find any other bugs please
+mail to <ulink url="mailto:lcchang@sis.com.tw">lcchang@sis.com.tw</ulink>.
+
+<orderedlist>
+
+<listitem>
+<para>
+AM79C901 HomePNA PHY is not thoroughly tested, there may be some
+bugs in the <quote>on the fly</quote> change of transceiver.
+</para>
+</listitem>
+
+<listitem>
+<para>
+A bug is hidden somewhere in the receive buffer management code,
+the bug causes NULL pointer reference in the kernel. This fault is
+caught before bad things happen and reported with the message:
+
+<computeroutput>
+eth0: NULL pointer encountered in Rx ring, skipping
+</computeroutput>
+
+which can be viewed with <literal remap="tt">dmesg</literal> or
+<literal remap="tt">cat /var/log/messages</literal>.
+</para>
+</listitem>
+
+<listitem>
+<para>
+The media type change from 10Mbps to 100Mbps twisted-pair ethernet
+by ifconfig causes the media link down.
+</para>
+</listitem>
+
+</orderedlist>
+</para>
+</chapter>
+
+<chapter id="RHistory">
+ <title>Revision History</title>
+
+<para>
+<itemizedlist>
+
+<listitem>
+<para>
+November 13, 2000, Revision 1.07, seventh release, 630E problem fixed
+and further clean up.
+</para>
+</listitem>
+
+<listitem>
+<para>
+November 4, 1999, Revision 1.06, Second release, lots of clean up
+and optimization.
+</para>
+</listitem>
+
+<listitem>
+<para>
+August 8, 1999, Revision 1.05, Initial Public Release
+</para>
+</listitem>
+
+</itemizedlist>
+</para>
+</chapter>
+
+<chapter id="acknowledgements">
+ <title>Acknowledgements</title>
+
+<para>
+This driver was originally derived from
+<ulink url="mailto:becker@cesdis1.gsfc.nasa.gov">Donald Becker</ulink>'s
+<ulink url="ftp://cesdis.gsfc.nasa.gov/pub/linux/drivers/kern-2.3/pci-skeleton.c"
+>pci-skeleton</ulink> and
+<ulink url="ftp://cesdis.gsfc.nasa.gov/pub/linux/drivers/kern-2.3/rtl8139.c"
+>rtl8139</ulink> drivers. Donald also provided various suggestions
+regarding improvements made in revision 1.06.
+</para>
+
+<para>
+The 1.05 revision was created by
+<ulink url="mailto:cmhuang@sis.com.tw">Jim Huang</ulink>, AMD 79c901
+support was added by <ulink url="mailto:lcs@sis.com.tw">Chin-Shan Li</ulink>.
+</para>
+</chapter>
+
+<chapter id="functions">
+<title>List of Functions</title>
+!Idrivers/net/sis900.c
+</chapter>
+
+</book>
diff --git a/Documentation/DocBook/tulip-user.tmpl b/Documentation/DocBook/tulip-user.tmpl
new file mode 100644
index 000000000000..6520d7a1b132
--- /dev/null
+++ b/Documentation/DocBook/tulip-user.tmpl
@@ -0,0 +1,327 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="TulipUserGuide">
+ <bookinfo>
+ <title>Tulip Driver User's Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Jeff</firstname>
+ <surname>Garzik</surname>
+ <affiliation>
+ <address>
+ <email>jgarzik@pobox.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2001</year>
+ <holder>Jeff Garzik</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+<para>
+The Tulip Ethernet Card Driver
+is maintained by Jeff Garzik (<email>jgarzik@pobox.com</email>).
+</para>
+
+<para>
+The Tulip driver was developed by Donald Becker and changed by
+Jeff Garzik, Takashi Manabe and a cast of thousands.
+</para>
+
+<para>
+For 2.4.x and later kernels, the Linux Tulip driver is available at
+<ulink url="http://sourceforge.net/projects/tulip/">http://sourceforge.net/projects/tulip/</ulink>
+</para>
+
+<para>
+ This driver is for the Digital "Tulip" Ethernet adapter interface.
+ It should work with most DEC 21*4*-based chips/ethercards, as well as
+ with work-alike chips from Lite-On (PNIC) and Macronix (MXIC) and ASIX.
+</para>
+
+<para>
+ The original author may be reached as becker@scyld.com, or C/O
+ Scyld Computing Corporation,
+ 410 Severn Ave., Suite 210,
+ Annapolis MD 21403
+</para>
+
+<para>
+ Additional information on Donald Becker's tulip.c
+ is available at <ulink url="http://www.scyld.com/network/tulip.html">http://www.scyld.com/network/tulip.html</ulink>
+</para>
+
+ </chapter>
+
+ <chapter id="drvr-compat">
+ <title>Driver Compatibility</title>
+
+<para>
+This device driver is designed for the DECchip "Tulip", Digital's
+single-chip ethernet controllers for PCI (now owned by Intel).
+Supported members of the family
+are the 21040, 21041, 21140, 21140A, 21142, and 21143. Similar work-alike
+chips from Lite-On, Macronix, ASIX, Compex and others listed below are also
+supported.
+</para>
+
+<para>
+These chips are used on at least 140 unique PCI board designs. The great
+number of chips and board designs supported is the reason for the
+driver size and complexity. Almost all of the increasing complexity is in the
+board configuration and media selection code. There is very little
+increase in the operational critical path length.
+</para>
+ </chapter>
+
+ <chapter id="board-settings">
+ <title>Board-specific Settings</title>
+
+<para>
+PCI bus devices are configured by the system at boot time, so no jumpers
+need to be set on the board. The system BIOS preferably should assign the
+PCI INTA signal to an otherwise unused system IRQ line.
+</para>
+
+<para>
+Some boards have EEPROM tables with a default media entry. The factory default
+is usually "autoselect". This should only be overridden when using
+transceiver connections without link beat e.g. 10base2 or AUI, or (rarely!)
+for forcing full-duplex when used with old link partners that do not do
+autonegotiation.
+</para>
+ </chapter>
+
+ <chapter id="driver-operation">
+ <title>Driver Operation</title>
+
+<sect1><title>Ring buffers</title>
+
+<para>
+The Tulip can use either ring buffers or lists of Tx and Rx descriptors.
+This driver uses statically allocated rings of Rx and Tx descriptors, set at
+compile time by RX/TX_RING_SIZE. This version of the driver allocates skbuffs
+for the Rx ring buffers at open() time and passes the skb->data field to the
+Tulip as receive data buffers. When an incoming frame is less than
+RX_COPYBREAK bytes long, a fresh skbuff is allocated and the frame is
+copied to the new skbuff. When the incoming frame is larger, the skbuff is
+passed directly up the protocol stack and replaced by a newly allocated
+skbuff.
+</para>
+
+<para>
+The RX_COPYBREAK value is chosen to trade-off the memory wasted by
+using a full-sized skbuff for small frames vs. the copying costs of larger
+frames. For small frames the copying cost is negligible (esp. considering
+that we are pre-loading the cache with immediately useful header
+information). For large frames the copying cost is non-trivial, and the
+larger copy might flush the cache of useful data. A subtle aspect of this
+choice is that the Tulip only receives into longword aligned buffers, thus
+the IP header at offset 14 isn't longword aligned for further processing.
+Copied frames are put into the new skbuff at an offset of "+2", thus copying
+has the beneficial effect of aligning the IP header and preloading the
+cache.
+</para>
+
+</sect1>
+
+<sect1><title>Synchronization</title>
+<para>
+The driver runs as two independent, single-threaded flows of control. One
+is the send-packet routine, which enforces single-threaded use by the
+dev->tbusy flag. The other thread is the interrupt handler, which is single
+threaded by the hardware and other software.
+</para>
+
+<para>
+The send packet thread has partial control over the Tx ring and 'dev->tbusy'
+flag. It sets the tbusy flag whenever it's queuing a Tx packet. If the next
+queue slot is empty, it clears the tbusy flag when finished; otherwise it sets
+the 'tp->tx_full' flag.
+</para>
+
+<para>
+The interrupt handler has exclusive control over the Rx ring and records stats
+from the Tx ring. (The Tx-done interrupt can't be selectively turned off, so
+we can't avoid the interrupt overhead by having the Tx routine reap the Tx
+stats.) After reaping the stats, it marks the queue entry as empty by setting
+the 'base' to zero. Iff the 'tp->tx_full' flag is set, it clears both the
+tx_full and tbusy flags.
+</para>
+
+</sect1>
+
+ </chapter>
+
+ <chapter id="errata">
+ <title>Errata</title>
+
+<para>
+The old DEC databooks were light on details.
+The 21040 databook claims that CSR13, CSR14, and CSR15 should each be the last
+register of the set CSR12-15 written. Hmmm, now how is that possible?
+</para>
+
+<para>
+The DEC SROM format is very badly designed and not precisely defined, leading to
+part of the media selection junkheap below. Some boards do not have EEPROM
+media tables and need to be patched up. Worse, other boards use the DEC
+design kit media table when it isn't correct for their board.
+</para>
+
+<para>
+We cannot use MII interrupts because there is no defined GPIO pin to attach
+them. The MII transceiver status is polled using a kernel timer.
+</para>
+ </chapter>
+
+ <chapter id="changelog">
+ <title>Driver Change History</title>
+
+ <sect1><title>Version 0.9.14 (February 20, 2001)</title>
+ <itemizedlist>
+ <listitem><para>Fix PNIC problems (Manfred Spraul)</para></listitem>
+ <listitem><para>Add new PCI id for Accton comet</para></listitem>
+ <listitem><para>Support Davicom tulips</para></listitem>
+ <listitem><para>Fix oops in eeprom parsing</para></listitem>
+ <listitem><para>Enable workarounds for early PCI chipsets</para></listitem>
+ <listitem><para>IA64, hppa csr0 support</para></listitem>
+ <listitem><para>Support media types 5, 6</para></listitem>
+ <listitem><para>Interpret a bit more of the 21142 SROM extended media type 3</para></listitem>
+ <listitem><para>Add missing delay in eeprom reading</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.11 (November 3, 2000)</title>
+ <itemizedlist>
+ <listitem><para>Eliminate extra bus accesses when sharing interrupts (prumpf)</para></listitem>
+ <listitem><para>Barrier following ownership descriptor bit flip (prumpf)</para></listitem>
+ <listitem><para>Endianness fixes for >14 addresses in setup frames (prumpf)</para></listitem>
+ <listitem><para>Report link beat to kernel/userspace via netif_carrier_*. (kuznet)</para></listitem>
+ <listitem><para>Better spinlocking in set_rx_mode.</para></listitem>
+ <listitem><para>Fix I/O resource request failure error messages (DaveM catch)</para></listitem>
+ <listitem><para>Handle DMA allocation failure.</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.10 (September 6, 2000)</title>
+ <itemizedlist>
+ <listitem><para>Simple interrupt mitigation (via jamal)</para></listitem>
+ <listitem><para>More PCI ids</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.9 (August 11, 2000)</title>
+ <itemizedlist>
+ <listitem><para>More PCI ids</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.8 (July 13, 2000)</title>
+ <itemizedlist>
+ <listitem><para>Correct signed/unsigned comparison for dummy frame index</para></listitem>
+ <listitem><para>Remove outdated references to struct enet_statistics</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.7 (June 17, 2000)</title>
+ <itemizedlist>
+ <listitem><para>Timer cleanups (Andrew Morton)</para></listitem>
+ <listitem><para>Alpha compile fix (somebody?)</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.6 (May 31, 2000)</title>
+ <itemizedlist>
+ <listitem><para>Revert 21143-related support flag patch</para></listitem>
+ <listitem><para>Add HPPA/media-table debugging printk</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.5 (May 30, 2000)</title>
+ <itemizedlist>
+ <listitem><para>HPPA support (willy@puffingroup)</para></listitem>
+ <listitem><para>CSR6 bits and tulip.h cleanup (Chris Smith)</para></listitem>
+ <listitem><para>Improve debugging messages a bit</para></listitem>
+ <listitem><para>Add delay after CSR13 write in t21142_start_nway</para></listitem>
+ <listitem><para>Remove unused ETHER_STATS code</para></listitem>
+ <listitem><para>Convert 'extern inline' to 'static inline' in tulip.h (Chris Smith)</para></listitem>
+ <listitem><para>Update DS21143 support flags in tulip_chip_info[]</para></listitem>
+ <listitem><para>Use spin_lock_irq, not _irqsave/restore, in tulip_start_xmit()</para></listitem>
+ <listitem><para>Add locking to set_rx_mode()</para></listitem>
+ <listitem><para>Fix race with chip setting DescOwned bit (Hal Murray)</para></listitem>
+ <listitem><para>Request 100% of PIO and MMIO resource space assigned to card</para></listitem>
+ <listitem><para>Remove error message from pci_enable_device failure</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.4.3 (April 14, 2000)</title>
+ <itemizedlist>
+ <listitem><para>mod_timer fix (Hal Murray)</para></listitem>
+ <listitem><para>PNIC2 resuscitation (Chris Smith)</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.4.2 (March 21, 2000)</title>
+ <itemizedlist>
+ <listitem><para>Fix 21041 CSR7, CSR13/14/15 handling</para></listitem>
+ <listitem><para>Merge some PCI ids from tulip 0.91x</para></listitem>
+ <listitem><para>Merge some HAS_xxx flags and flag settings from tulip 0.91x</para></listitem>
+ <listitem><para>asm/io.h fix (submitted by many) and cleanup</para></listitem>
+ <listitem><para>s/HAS_NWAY143/HAS_NWAY/</para></listitem>
+ <listitem><para>Cleanup 21041 mode reporting</para></listitem>
+ <listitem><para>Small code cleanups</para></listitem>
+ </itemizedlist>
+ </sect1>
+
+ <sect1><title>Version 0.9.4.1 (March 18, 2000)</title>
+ <itemizedlist>
+ <listitem><para>Finish PCI DMA conversion (davem)</para></listitem>
+ <listitem><para>Do not netif_start_queue() at end of tulip_tx_timeout() (kuznet)</para></listitem>
+ <listitem><para>PCI DMA fix (kuznet)</para></listitem>
+ <listitem><para>eeprom.c code cleanup</para></listitem>
+ <listitem><para>Remove Xircom Tulip crud</para></listitem>
+ </itemizedlist>
+ </sect1>
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/usb.tmpl b/Documentation/DocBook/usb.tmpl
new file mode 100644
index 000000000000..f3ef0bf435e9
--- /dev/null
+++ b/Documentation/DocBook/usb.tmpl
@@ -0,0 +1,979 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Linux-USB-API">
+ <bookinfo>
+ <title>The Linux-USB Host Side API</title>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+<chapter id="intro">
+ <title>Introduction to USB on Linux</title>
+
+ <para>A Universal Serial Bus (USB) is used to connect a host,
+ such as a PC or workstation, to a number of peripheral
+ devices. USB uses a tree structure, with the host at the
+ root (the system's master), hubs as interior nodes, and
+ peripheral devices as leaves (and slaves).
+ Modern PCs support several such trees of USB devices, usually
+ one USB 2.0 tree (480 Mbit/sec each) with
+ a few USB 1.1 trees (12 Mbit/sec each) that are used when you
+ connect a USB 1.1 device directly to the machine's "root hub".
+ </para>
+
+ <para>That master/slave asymmetry was designed in part for
+ ease of use. It is not physically possible to assemble
+ (legal) USB cables incorrectly: all upstream "to-the-host"
+ connectors are the rectangular type, matching the sockets on
+ root hubs, and the downstream type are the squarish type
+ (or they are built in to the peripheral).
+ Software doesn't need to deal with distributed autoconfiguration
+ since the pre-designated master node manages all that.
+ At the electrical level, bus protocol overhead is reduced by
+ eliminating arbitration and moving scheduling into host software.
+ </para>
+
+ <para>USB 1.0 was announced in January 1996, and was revised
+ as USB 1.1 (with improvements in hub specification and
+ support for interrupt-out transfers) in September 1998.
+ USB 2.0 was released in April 2000, including high speed
+ transfers and transaction translating hubs (used for USB 1.1
+ and 1.0 backward compatibility).
+ </para>
+
+ <para>USB support was added to Linux early in the 2.2 kernel series
+ shortly before the 2.3 development forked off. Updates
+ from 2.3 were regularly folded back into 2.2 releases, bringing
+ new features such as <filename>/sbin/hotplug</filename> support,
+ more drivers, and more robustness.
+ The 2.5 kernel series continued such improvements, and also
+ worked on USB 2.0 support,
+ higher performance,
+ better consistency between host controller drivers,
+ API simplification (to make bugs less likely),
+ and providing internal "kerneldoc" documentation.
+ </para>
+
+ <para>Linux can run inside USB devices as well as on
+ the hosts that control the devices.
+ Because the Linux 2.x USB support evolved to support mass market
+ platforms such as Apple Macintosh or PC-compatible systems,
+ it didn't address design concerns for those types of USB systems.
+ So it can't be used inside mass-market PDAs, or other peripherals.
+ USB device drivers running inside those Linux peripherals
+ don't do the same things as the ones running inside hosts,
+ and so they've been given a different name:
+ they're called <emphasis>gadget drivers</emphasis>.
+ This document does not present gadget drivers.
+ </para>
+
+ </chapter>
+
+<chapter id="host">
+ <title>USB Host-Side API Model</title>
+
+ <para>Within the kernel,
+ host-side drivers for USB devices talk to the "usbcore" APIs.
+ There are two types of public "usbcore" APIs, targeted at two different
+ layers of USB driver. Those are
+ <emphasis>general purpose</emphasis> drivers, exposed through
+ driver frameworks such as block, character, or network devices;
+ and drivers that are <emphasis>part of the core</emphasis>,
+ which are involved in managing a USB bus.
+ Such core drivers include the <emphasis>hub</emphasis> driver,
+ which manages trees of USB devices, and several different kinds
+ of <emphasis>host controller driver (HCD)</emphasis>,
+ which control individual busses.
+ </para>
+
+ <para>The device model seen by USB drivers is relatively complex.
+ </para>
+
+ <itemizedlist>
+
+ <listitem><para>USB supports four kinds of data transfer
+ (control, bulk, interrupt, and isochronous). Two transfer
+ types use bandwidth as it's available (control and bulk),
+ while the other two types of transfer (interrupt and isochronous)
+ are scheduled to provide guaranteed bandwidth.
+ </para></listitem>
+
+ <listitem><para>The device description model includes one or more
+ "configurations" per device, only one of which is active at a time.
+ Devices that are capable of high speed operation must also support
+ full speed configurations, along with a way to ask about the
+ "other speed" configurations that might be used.
+ </para></listitem>
+
+ <listitem><para>Configurations have one or more "interfaces", each
+ of which may have "alternate settings". Interfaces may be
+ standardized by USB "Class" specifications, or may be specific to
+ a vendor or device.</para>
+
+ <para>USB device drivers actually bind to interfaces, not devices.
+ Think of them as "interface drivers", though you
+ may not see many devices where the distinction is important.
+ <emphasis>Most USB devices are simple, with only one configuration,
+ one interface, and one alternate setting.</emphasis>
+ </para></listitem>
+
+ <listitem><para>Interfaces have one or more "endpoints", each of
+ which supports one type and direction of data transfer such as
+ "bulk out" or "interrupt in". The entire configuration may have
+ up to sixteen endpoints in each direction, allocated as needed
+ among all the interfaces.
+ </para></listitem>
+
+ <listitem><para>Data transfer on USB is packetized; each endpoint
+ has a maximum packet size.
+ Drivers must often be aware of conventions such as flagging the end
+ of bulk transfers using "short" (including zero length) packets.
+ </para></listitem>
+
+ <listitem><para>The Linux USB API supports synchronous calls for
+ control and bulk messaging.
+ It also supports asynchronous calls for all kinds of data transfer,
+ using request structures called "URBs" (USB Request Blocks).
+ </para></listitem>
+
+ </itemizedlist>
+
+ <para>Accordingly, the USB Core API exposed to device drivers
+ covers quite a lot of territory. You'll probably need to consult
+ the USB 2.0 specification, available online from www.usb.org at
+ no cost, as well as class or device specifications.
+ </para>
+
+ <para>The only host-side drivers that actually touch hardware
+ (reading/writing registers, handling IRQs, and so on) are the HCDs.
+ In theory, all HCDs provide the same functionality through the same
+ API. In practice, that's becoming more true on the 2.5 kernels,
+ but there are still differences that crop up especially with
+ fault handling. Different controllers don't necessarily report
+ the same aspects of failures, and recovery from faults (including
+ software-induced ones like unlinking an URB) isn't yet fully
+ consistent.
+ Device driver authors should make a point of doing disconnect
+ testing (while the device is active) with each different host
+ controller driver, to make sure drivers don't have bugs of
+ their own as well as to make sure they aren't relying on some
+ HCD-specific behavior.
+ (You will need external USB 1.1 and/or
+ USB 2.0 hubs to perform all those tests.)
+ </para>
+
+ </chapter>
+
+<chapter><title>USB-Standard Types</title>
+
+ <para>In <filename>&lt;linux/usb_ch9.h&gt;</filename> you will find
+ the USB data types defined in chapter 9 of the USB specification.
+ These data types are used throughout USB, and in APIs including
+ this host side API, gadget APIs, and usbfs.
+ </para>
+
+!Iinclude/linux/usb_ch9.h
+
+ </chapter>
+
+<chapter><title>Host-Side Data Types and Macros</title>
+
+ <para>The host side API exposes several layers to drivers, some of
+ which are more necessary than others.
+ These support lifecycle models for host side drivers
+ and devices, and support passing buffers through usbcore to
+ some HCD that performs the I/O for the device driver.
+ </para>
+
+
+!Iinclude/linux/usb.h
+
+ </chapter>
+
+ <chapter><title>USB Core APIs</title>
+
+ <para>There are two basic I/O models in the USB API.
+ The most elemental one is asynchronous: drivers submit requests
+ in the form of an URB, and the URB's completion callback
+ handles the next step.
+ All USB transfer types support that model, although there
+ are special cases for control URBs (which always have setup
+ and status stages, but may not have a data stage) and
+ isochronous URBs (which allow large packets and include
+ per-packet fault reports).
+ Built on top of that is synchronous API support, where a
+ driver calls a routine that allocates one or more URBs,
+ submits them, and waits until they complete.
+ There are synchronous wrappers for single-buffer control
+ and bulk transfers (which are awkward to use in some
+ driver disconnect scenarios), and for scatterlist based
+ streaming i/o (bulk or interrupt).
+ </para>
+
+ <para>USB drivers need to provide buffers that can be
+ used for DMA, although they don't necessarily need to
+ provide the DMA mapping themselves.
+ There are APIs to use when allocating DMA buffers,
+ which can prevent use of bounce buffers on some systems.
+ In some cases, drivers may be able to rely on 64bit DMA
+ to eliminate another kind of bounce buffer.
+ </para>
+
+!Edrivers/usb/core/urb.c
+!Edrivers/usb/core/message.c
+!Edrivers/usb/core/file.c
+!Edrivers/usb/core/usb.c
+!Edrivers/usb/core/hub.c
+ </chapter>
+
+ <chapter><title>Host Controller APIs</title>
+
+ <para>These APIs are only for use by host controller drivers,
+ most of which implement standard register interfaces such as
+ EHCI, OHCI, or UHCI.
+ UHCI was one of the first interfaces, designed by Intel and
+ also used by VIA; it doesn't do much in hardware.
+ OHCI was designed later, to have the hardware do more work
+ (bigger transfers, tracking protocol state, and so on).
+ EHCI was designed with USB 2.0; its design has features that
+ resemble OHCI (hardware does much more work) as well as
+ UHCI (some parts of ISO support, TD list processing).
+ </para>
+
+ <para>There are host controllers other than the "big three",
+ although most PCI based controllers (and a few non-PCI based
+ ones) use one of those interfaces.
+ Not all host controllers use DMA; some use PIO, and there
+ is also a simulator.
+ </para>
+
+ <para>The same basic APIs are available to drivers for all
+ those controllers.
+ For historical reasons they are in two layers:
+ <structname>struct usb_bus</structname> is a rather thin
+ layer that became available in the 2.2 kernels, while
+ <structname>struct usb_hcd</structname> is a more featureful
+ layer (available in later 2.4 kernels and in 2.5) that
+ lets HCDs share common code, to shrink driver size
+ and significantly reduce hcd-specific behaviors.
+ </para>
+
+!Edrivers/usb/core/hcd.c
+!Edrivers/usb/core/hcd-pci.c
+!Edrivers/usb/core/buffer.c
+ </chapter>
+
+ <chapter>
+ <title>The USB Filesystem (usbfs)</title>
+
+ <para>This chapter presents the Linux <emphasis>usbfs</emphasis>.
+ You may prefer to avoid writing new kernel code for your
+ USB driver; that's the problem that usbfs set out to solve.
+ User mode device drivers are usually packaged as applications
+ or libraries, and may use usbfs through some programming library
+ that wraps it. Such libraries include
+ <ulink url="http://libusb.sourceforge.net">libusb</ulink>
+ for C/C++, and
+ <ulink url="http://jUSB.sourceforge.net">jUSB</ulink> for Java.
+ </para>
+
+ <note><title>Unfinished</title>
+ <para>This particular documentation is incomplete,
+ especially with respect to the asynchronous mode.
+ As of kernel 2.5.66 the code and this (new) documentation
+ need to be cross-reviewed.
+ </para>
+ </note>
+
+ <para>Configure usbfs into Linux kernels by enabling the
+ <emphasis>USB filesystem</emphasis> option (CONFIG_USB_DEVICEFS),
+ and you get basic support for user mode USB device drivers.
+ Until relatively recently it was often (confusingly) called
+ <emphasis>usbdevfs</emphasis> although it wasn't solving what
+ <emphasis>devfs</emphasis> was.
+ Every USB device will appear in usbfs, regardless of whether or
+ not it has a kernel driver; but only devices with kernel drivers
+ show up in devfs.
+ </para>
+
+ <sect1>
+ <title>What files are in "usbfs"?</title>
+
+ <para>Conventionally mounted at
+ <filename>/proc/bus/usb</filename>, usbfs
+ features include:
+ <itemizedlist>
+ <listitem><para><filename>/proc/bus/usb/devices</filename>
+ ... a text file
+ showing each of the USB devices known to the kernel,
+ and their configuration descriptors.
+ You can also poll() this to learn about new devices.
+ </para></listitem>
+ <listitem><para><filename>/proc/bus/usb/BBB/DDD</filename>
+ ... magic files
+ exposing each device's configuration descriptors, and
+ supporting a series of ioctls for making device requests,
+ including I/O to devices. (Purely for access by programs.)
+ </para></listitem>
+ </itemizedlist>
+ </para>
+
+ <para> Each bus is given a number (BBB) based on when it was
+ enumerated; within each bus, each device is given a similar
+ number (DDD).
+ Those BBB/DDD paths are not "stable" identifiers;
+ expect them to change even if you always leave the devices
+ plugged in to the same hub port.
+ <emphasis>Don't even think of saving these in application
+ configuration files.</emphasis>
+ Stable identifiers are available, for user mode applications
+ that want to use them. HID and networking devices expose
+ these stable IDs, so that for example you can be sure that
+ you told the right UPS to power down its second server.
+ "usbfs" doesn't (yet) expose those IDs.
+ </para>
+
+ </sect1>
+
+ <sect1>
+ <title>Mounting and Access Control</title>
+
+ <para>There are a number of mount options for usbfs, which will
+ be of most interest to you if you need to override the default
+ access control policy.
+ That policy is that only root may read or write device files
+ (<filename>/proc/bus/usb/BBB/DDD</filename>) although anyone may read
+ the <filename>devices</filename>
+ or <filename>drivers</filename> files.
+ I/O requests to the device also need the CAP_SYS_RAWIO capability.
+ </para>
+
+ <para>The significance of that is that by default, all user mode
+ device drivers need super-user privileges.
+ You can change modes or ownership in a driver setup
+ when the device hotplugs, or maybe just start the
+ driver right then, as a privileged server (or some activity
+ within one).
+ That's the most secure approach for multi-user systems,
+ but for single user systems ("trusted" by that user)
+ it's more convenient just to grant everyone all access
+ (using the <emphasis>devmode=0666</emphasis> option)
+ so the driver can start whenever it's needed.
+ </para>
+
+ <para>The mount options for usbfs, usable in /etc/fstab or
+ in command line invocations of <emphasis>mount</emphasis>, are:
+
+ <variablelist>
+ <varlistentry>
+ <term><emphasis>busgid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the GID used for the
+ /proc/bus/usb/BBB
+ directories. (Default: 0)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>busmode</emphasis>=MMM</term>
+ <listitem><para>Controls the file mode used for the
+ /proc/bus/usb/BBB
+ directories. (Default: 0555)
+ </para></listitem></varlistentry>
+ <varlistentry><term><emphasis>busuid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the UID used for the
+ /proc/bus/usb/BBB
+ directories. (Default: 0)</para></listitem></varlistentry>
+
+ <varlistentry><term><emphasis>devgid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the GID used for the
+ /proc/bus/usb/BBB/DDD
+ files. (Default: 0)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>devmode</emphasis>=MMM</term>
+ <listitem><para>Controls the file mode used for the
+ /proc/bus/usb/BBB/DDD
+ files. (Default: 0644)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>devuid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the UID used for the
+ /proc/bus/usb/BBB/DDD
+ files. (Default: 0)</para></listitem></varlistentry>
+
+ <varlistentry><term><emphasis>listgid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the GID used for the
+ /proc/bus/usb/devices and drivers files.
+ (Default: 0)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>listmode</emphasis>=MMM</term>
+ <listitem><para>Controls the file mode used for the
+ /proc/bus/usb/devices and drivers files.
+ (Default: 0444)</para></listitem></varlistentry>
+ <varlistentry><term><emphasis>listuid</emphasis>=NNNNN</term>
+ <listitem><para>Controls the UID used for the
+ /proc/bus/usb/devices and drivers files.
+ (Default: 0)</para></listitem></varlistentry>
+ </variablelist>
+
+ </para>
+
+ <para>Note that many Linux distributions hard-wire the mount options
+ for usbfs in their init scripts, such as
+ <filename>/etc/rc.d/rc.sysinit</filename>,
+ rather than making it easy to set this per-system
+ policy in <filename>/etc/fstab</filename>.
+ </para>
+
+ </sect1>
+
+ <sect1>
+ <title>/proc/bus/usb/devices</title>
+
+ <para>This file is handy for status viewing tools in user
+ mode, which can scan the text format and ignore most of it.
+ More detailed device status (including class and vendor
+ status) is available from device-specific files.
+ For information about the current format of this file,
+ see the
+ <filename>Documentation/usb/proc_usb_info.txt</filename>
+ file in your Linux kernel sources.
+ </para>
+
+ <para>Otherwise the main use for this file from programs
+ is to poll() it to get notifications of usb devices
+ as they're plugged or unplugged.
+ To see what changed, you'd need to read the file and
+ compare "before" and "after" contents, scan the filesystem,
+ or see its hotplug event.
+ </para>
+
+ </sect1>
+
+ <sect1>
+ <title>/proc/bus/usb/BBB/DDD</title>
+
+ <para>Use these files in one of these basic ways:
+ </para>
+
+ <para><emphasis>They can be read,</emphasis>
+ producing first the device descriptor
+ (18 bytes) and then the descriptors for the current configuration.
+ See the USB 2.0 spec for details about those binary data formats.
+ You'll need to convert most multibyte values from little endian
+ format to your native host byte order, although a few of the
+ fields in the device descriptor (both of the BCD-encoded fields,
+ and the vendor and product IDs) will be byteswapped for you.
+ Note that configuration descriptors include descriptors for
+ interfaces, altsettings, endpoints, and maybe additional
+ class descriptors.
+ </para>
+
+ <para><emphasis>Perform USB operations</emphasis> using
+ <emphasis>ioctl()</emphasis> requests to make endpoint I/O
+ requests (synchronously or asynchronously) or manage
+ the device.
+ These requests need the CAP_SYS_RAWIO capability,
+ as well as filesystem access permissions.
+ Only one ioctl request can be made on one of these
+ device files at a time.
+ This means that if you are synchronously reading an endpoint
+ from one thread, you won't be able to write to a different
+ endpoint from another thread until the read completes.
+ This works for <emphasis>half duplex</emphasis> protocols,
+ but otherwise you'd use asynchronous i/o requests.
+ </para>
+
+ </sect1>
+
+
+ <sect1>
+ <title>Life Cycle of User Mode Drivers</title>
+
+ <para>Such a driver first needs to find a device file
+ for a device it knows how to handle.
+ Maybe it was told about it because a
+ <filename>/sbin/hotplug</filename> event handling agent
+ chose that driver to handle the new device.
+ Or maybe it's an application that scans all the
+ /proc/bus/usb device files, and ignores most devices.
+ In either case, it should <function>read()</function> all
+ the descriptors from the device file,
+ and check them against what it knows how to handle.
+ It might just reject everything except a particular
+ vendor and product ID, or need a more complex policy.
+ </para>
+
+ <para>Never assume there will only be one such device
+ on the system at a time!
+ If your code can't handle more than one device at
+ a time, at least detect when there's more than one, and
+ have your users choose which device to use.
+ </para>
+
+ <para>Once your user mode driver knows what device to use,
+ it interacts with it in either of two styles.
+ The simple style is to make only control requests; some
+ devices don't need more complex interactions than those.
+ (An example might be software using vendor-specific control
+ requests for some initialization or configuration tasks,
+ with a kernel driver for the rest.)
+ </para>
+
+ <para>More likely, you need a more complex style driver:
+ one using non-control endpoints, reading or writing data
+ and claiming exclusive use of an interface.
+ <emphasis>Bulk</emphasis> transfers are easiest to use,
+ but only their sibling <emphasis>interrupt</emphasis> transfers
+ work with low speed devices.
+ Both interrupt and <emphasis>isochronous</emphasis> transfers
+ offer service guarantees because their bandwidth is reserved.
+ Such "periodic" transfers are awkward to use through usbfs,
+ unless you're using the asynchronous calls. However, interrupt
+ transfers can also be used in a synchronous "one shot" style.
+ </para>
+
+ <para>Your user-mode driver should never need to worry
+ about cleaning up request state when the device is
+ disconnected, although it should close its open file
+ descriptors as soon as it starts seeing the ENODEV
+ errors.
+ </para>
+
+ </sect1>
+
+ <sect1><title>The ioctl() Requests</title>
+
+ <para>To use these ioctls, you need to include the following
+ headers in your userspace program:
+<programlisting>#include &lt;linux/usb.h&gt;
+#include &lt;linux/usbdevice_fs.h&gt;
+#include &lt;asm/byteorder.h&gt;</programlisting>
+ The standard USB device model requests, from "Chapter 9" of
+ the USB 2.0 specification, are automatically included from
+ the <filename>&lt;linux/usb_ch9.h&gt;</filename> header.
+ </para>
+
+ <para>Unless noted otherwise, the ioctl requests
+ described here will
+ update the modification time on the usbfs file to which
+ they are applied (unless they fail).
+ A return of zero indicates success; otherwise, a
+ standard USB error code is returned. (These are
+ documented in
+ <filename>Documentation/usb/error-codes.txt</filename>
+ in your kernel sources.)
+ </para>
+
+ <para>Each of these files multiplexes access to several
+ I/O streams, one per endpoint.
+ Each device has one control endpoint (endpoint zero)
+ which supports limited RPC style access.
+ Devices are configured
+ by khubd (in the kernel) setting a device-wide
+ <emphasis>configuration</emphasis> that affects things
+ like power consumption and basic functionality.
+ The endpoints are part of USB <emphasis>interfaces</emphasis>,
+ which may have <emphasis>altsettings</emphasis>
+ affecting things like which endpoints are available.
+ Many devices only have a single configuration and interface,
+ so drivers for them will ignore configurations and altsettings.
+ </para>
+
+
+ <sect2>
+ <title>Management/Status Requests</title>
+
+ <para>A number of usbfs requests don't deal very directly
+ with device I/O.
+ They mostly relate to device management and status.
+ These are all synchronous requests.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>USBDEVFS_CLAIMINTERFACE</term>
+ <listitem><para>This is used to force usbfs to
+ claim a specific interface,
+ which has not previously been claimed by usbfs or any other
+ kernel driver.
+ The ioctl parameter is an integer holding the number of
+ the interface (bInterfaceNumber from descriptor).
+ </para><para>
+ Note that if your driver doesn't claim an interface
+ before trying to use one of its endpoints, and no
+ other driver has bound to it, then the interface is
+ automatically claimed by usbfs.
+ </para><para>
+ This claim will be released by a RELEASEINTERFACE ioctl,
+ or by closing the file descriptor.
+ File modification time is not updated by this request.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_CONNECTINFO</term>
+ <listitem><para>Says whether the device is lowspeed.
+ The ioctl parameter points to a structure like this:
+<programlisting>struct usbdevfs_connectinfo {
+ unsigned int devnum;
+ unsigned char slow;
+}; </programlisting>
+ File modification time is not updated by this request.
+ </para><para>
+ <emphasis>You can't tell whether a "not slow"
+ device is connected at high speed (480 MBit/sec)
+ or just full speed (12 MBit/sec).</emphasis>
+ You should know the devnum value already,
+ it's the DDD value of the device file name.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_GETDRIVER</term>
+ <listitem><para>Returns the name of the kernel driver
+ bound to a given interface (a string). Parameter
+ is a pointer to this structure, which is modified:
+<programlisting>struct usbdevfs_getdriver {
+ unsigned int interface;
+ char driver[USBDEVFS_MAXDRIVERNAME + 1];
+};</programlisting>
+ File modification time is not updated by this request.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_IOCTL</term>
+ <listitem><para>Passes a request from userspace through
+ to a kernel driver that has an ioctl entry in the
+ <emphasis>struct usb_driver</emphasis> it registered.
+<programlisting>struct usbdevfs_ioctl {
+ int ifno;
+ int ioctl_code;
+ void *data;
+};
+
+/* user mode call looks like this.
+ * 'request' becomes the driver->ioctl() 'code' parameter.
+ * the size of 'param' is encoded in 'request', and that data
+ * is copied to or from the driver->ioctl() 'buf' parameter.
+ */
+static int
+usbdev_ioctl (int fd, int ifno, unsigned request, void *param)
+{
+ struct usbdevfs_ioctl wrapper;
+
+ wrapper.ifno = ifno;
+ wrapper.ioctl_code = request;
+ wrapper.data = param;
+
+ return ioctl (fd, USBDEVFS_IOCTL, &amp;wrapper);
+} </programlisting>
+ File modification time is not updated by this request.
+ </para><para>
+ This request lets kernel drivers talk to user mode code
+ through filesystem operations even when they don't create
+ a character or block special device.
+ It's also been used to do things like ask devices what
+ device special file should be used.
+ Two pre-defined ioctls are used
+ to disconnect and reconnect kernel drivers, so
+ that user mode code can completely manage binding
+ and configuration of devices.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_RELEASEINTERFACE</term>
+ <listitem><para>This is used to release the claim usbfs
+ made on an interface, either implicitly or because of a
+ USBDEVFS_CLAIMINTERFACE call, before the file
+ descriptor is closed.
+ The ioctl parameter is an integer holding the number of
+ the interface (bInterfaceNumber from descriptor).
+ File modification time is not updated by this request.
+ </para><warning><para>
+ <emphasis>No security check is made to ensure
+ that the task which made the claim is the one
+ which is releasing it.
+ This means that a user mode driver may interfere
+ with other ones. </emphasis>
+ </para></warning></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_RESETEP</term>
+ <listitem><para>Resets the data toggle value for an endpoint
+ (bulk or interrupt) to DATA0.
+ The ioctl parameter is an integer endpoint number
+ (1 to 15, as identified in the endpoint descriptor),
+ with USB_DIR_IN added if the device's endpoint sends
+ data to the host.
+ </para><warning><para>
+ <emphasis>Avoid using this request.
+ It should probably be removed.</emphasis>
+ Using it typically means the device and driver will lose
+ toggle synchronization. If you really lost synchronization,
+ you likely need to completely handshake with the device,
+ using a request like CLEAR_HALT
+ or SET_INTERFACE.
+ </para></warning></listitem></varlistentry>
+
+ </variablelist>
+
+ </sect2>
+
+ <sect2>
+ <title>Synchronous I/O Support</title>
+
+ <para>Synchronous requests involve the kernel blocking
+ until the user mode request completes, either by
+ finishing successfully or by reporting an error.
+ In most cases this is the simplest way to use usbfs,
+ although as noted above it does prevent performing I/O
+ to more than one endpoint at a time.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>USBDEVFS_BULK</term>
+ <listitem><para>Issues a bulk read or write request to the
+ device.
+ The ioctl parameter is a pointer to this structure:
+<programlisting>struct usbdevfs_bulktransfer {
+ unsigned int ep;
+ unsigned int len;
+ unsigned int timeout; /* in milliseconds */
+ void *data;
+};</programlisting>
+ </para><para>The "ep" value identifies a
+ bulk endpoint number (1 to 15, as identified in an endpoint
+ descriptor),
+ ORed with USB_DIR_IN when referring to an endpoint which
+ sends data to the host from the device.
+ The length of the data buffer is identified by "len".
+ Recent kernels support requests up to about 128KBytes.
+ <emphasis>FIXME say how read length is returned,
+ and how short reads are handled.</emphasis>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_CLEAR_HALT</term>
+ <listitem><para>Clears endpoint halt (stall) and
+ resets the endpoint toggle. This is only
+ meaningful for bulk or interrupt endpoints.
+ The ioctl parameter is an integer endpoint number
+ (1 to 15, as identified in an endpoint descriptor),
+ ORed with USB_DIR_IN when referring to an endpoint which
+ sends data to the host from the device.
+ </para><para>
+ Use this on bulk or interrupt endpoints which have
+ stalled, returning <emphasis>-EPIPE</emphasis> status
+ to a data transfer request.
+ Do not issue the control request directly, since
+ that could invalidate the host's record of the
+ data toggle.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_CONTROL</term>
+ <listitem><para>Issues a control request to the device.
+ The ioctl parameter points to a structure like this:
+<programlisting>struct usbdevfs_ctrltransfer {
+ __u8 bRequestType;
+ __u8 bRequest;
+ __u16 wValue;
+ __u16 wIndex;
+ __u16 wLength;
+ __u32 timeout; /* in milliseconds */
+ void *data;
+};</programlisting>
+ </para><para>
+ The first eight bytes of this structure are the contents
+ of the SETUP packet to be sent to the device; see the
+ USB 2.0 specification for details.
+ The bRequestType value is composed by combining a
+ USB_TYPE_* value, a USB_DIR_* value, and a
+ USB_RECIP_* value (from
+ <emphasis>&lt;linux/usb.h&gt;</emphasis>).
+ If wLength is nonzero, it describes the length of the data
+ buffer, which is either written to the device
+ (USB_DIR_OUT) or read from the device (USB_DIR_IN).
+ </para><para>
+ At this writing, you can't transfer more than 4 KBytes
+ of data to or from a device; usbfs has a limit, and
+ some host controller drivers have a limit.
+ (That's not usually a problem.)
+ <emphasis>Also</emphasis> there's no way to say it's
+ not OK to get a short read back from the device.
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_RESET</term>
+ <listitem><para>Does a USB level device reset.
+ The ioctl parameter is ignored.
+ After the reset, this rebinds all device interfaces.
+ File modification time is not updated by this request.
+ </para><warning><para>
+ <emphasis>Avoid using this call</emphasis>
+ until some usbcore bugs get fixed,
+ since it does not fully synchronize device, interface,
+ and driver (not just usbfs) state.
+ </para></warning></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_SETINTERFACE</term>
+ <listitem><para>Sets the alternate setting for an
+ interface. The ioctl parameter is a pointer to a
+ structure like this:
+<programlisting>struct usbdevfs_setinterface {
+ unsigned int interface;
+ unsigned int altsetting;
+}; </programlisting>
+ File modification time is not updated by this request.
+ </para><para>
+ Those struct members are from some interface descriptor
+ applying to the current configuration.
+ The interface number is the bInterfaceNumber value, and
+ the altsetting number is the bAlternateSetting value.
+ (This resets each endpoint in the interface.)
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_SETCONFIGURATION</term>
+ <listitem><para>Issues the
+ <function>usb_set_configuration</function> call
+ for the device.
+ The parameter is an integer holding the number of
+ a configuration (bConfigurationValue from descriptor).
+ File modification time is not updated by this request.
+ </para><warning><para>
+ <emphasis>Avoid using this call</emphasis>
+ until some usbcore bugs get fixed,
+ since it does not fully synchronize device, interface,
+ and driver (not just usbfs) state.
+ </para></warning></listitem></varlistentry>
+
+ </variablelist>
+ </sect2>
+
+ <sect2>
+ <title>Asynchronous I/O Support</title>
+
+ <para>As mentioned above, there are situations where it may be
+ important to initiate concurrent operations from user mode code.
+ This is particularly important for periodic transfers
+ (interrupt and isochronous), but it can be used for other
+ kinds of USB requests too.
+ In such cases, the asynchronous requests described here
+ are essential. Rather than submitting one request and having
+ the kernel block until it completes, the blocking is separate.
+ </para>
+
+ <para>These requests are packaged into a structure that
+ resembles the URB used by kernel device drivers.
+ (No POSIX Async I/O support here, sorry.)
+ It identifies the endpoint type (USBDEVFS_URB_TYPE_*),
+ endpoint (number, bitwise ORed with USB_DIR_IN as appropriate),
+ buffer and length, and a user "context" value serving to
+ uniquely identify each request.
+ (It's usually a pointer to per-request data.)
+ Flags can modify requests (not as many as supported for
+ kernel drivers).
+ </para>
+
+ <para>Each request can specify a realtime signal number
+ (between SIGRTMIN and SIGRTMAX, inclusive) to request a
+ signal be sent when the request completes.
+ </para>
+
+ <para>When usbfs returns these urbs, the status value
+ is updated, and the buffer may have been modified.
+ Except for isochronous transfers, the actual_length is
+ updated to say how many bytes were transferred; if the
+ USBDEVFS_URB_DISABLE_SPD flag is set
+ ("short packets are not OK") and fewer bytes were read
+ than were requested, then you get an error report.
+ </para>
+
+<programlisting>struct usbdevfs_iso_packet_desc {
+ unsigned int length;
+ unsigned int actual_length;
+ unsigned int status;
+};
+
+struct usbdevfs_urb {
+ unsigned char type;
+ unsigned char endpoint;
+ int status;
+ unsigned int flags;
+ void *buffer;
+ int buffer_length;
+ int actual_length;
+ int start_frame;
+ int number_of_packets;
+ int error_count;
+ unsigned int signr;
+ void *usercontext;
+ struct usbdevfs_iso_packet_desc iso_frame_desc[];
+};</programlisting>
+
+ <para> For these asynchronous requests, the file modification
+ time reflects when the request was initiated.
+ This contrasts with the synchronous requests,
+ where it reflects when requests complete.
+ </para>
+
+ <variablelist>
+
+ <varlistentry><term>USBDEVFS_DISCARDURB</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_DISCSIGNAL</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_REAPURB</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_REAPURBNDELAY</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ File modification time is not updated by this request.
+ </para><para>
+ </para></listitem></varlistentry>
+
+ <varlistentry><term>USBDEVFS_SUBMITURB</term>
+ <listitem><para>
+ <emphasis>TBS</emphasis>
+ </para><para>
+ </para></listitem></varlistentry>
+
+ </variablelist>
+ </sect2>
+
+ </sect1>
+
+ </chapter>
+
+</book>
+<!-- vim:syntax=sgml:sw=4
+-->
diff --git a/Documentation/DocBook/via-audio.tmpl b/Documentation/DocBook/via-audio.tmpl
new file mode 100644
index 000000000000..36e642147d6b
--- /dev/null
+++ b/Documentation/DocBook/via-audio.tmpl
@@ -0,0 +1,597 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="ViaAudioGuide">
+ <bookinfo>
+ <title>Via 686 Audio Driver for Linux</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Jeff</firstname>
+ <surname>Garzik</surname>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>1999-2001</year>
+ <holder>Jeff Garzik</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The Via VT82C686A "super southbridge" chips contain
+ AC97-compatible audio logic which features dual 16-bit stereo
+ PCM sound channels (full duplex), plus a third PCM channel intended for use
+ in hardware-assisted FM synthesis.
+ </para>
+ <para>
+ The current Linux kernel audio driver for this family of chips
+ supports audio playback and recording, but hardware-assisted
+ FM features, and hardware buffer direct-access (mmap)
+ support are not yet available.
+ </para>
+ <para>
+ This driver supports any Linux kernel version after 2.4.10.
+ </para>
+ <para>
+ Please send bug reports to the mailing list <email>linux-via@gtf.org</email>.
+ To subscribe, e-mail <email>majordomo@gtf.org</email> with
+ </para>
+ <programlisting>
+ subscribe linux-via
+ </programlisting>
+ <para>
+ in the body of the message.
+ </para>
+ </chapter>
+
+ <chapter id="install">
+ <title>Driver Installation</title>
+ <para>
+ To use this audio driver, select the
+ CONFIG_SOUND_VIA82CXXX option in the section Sound during kernel configuration.
+ Follow the usual kernel procedures for rebuilding the kernel,
+ or building and installing driver modules.
+ </para>
+ <para>
+ To make this driver the default audio driver, you can add the
+ following to your /etc/conf.modules file:
+ </para>
+ <programlisting>
+ alias sound via82cxxx_audio
+ </programlisting>
+ <para>
+ Note that soundcore and ac97_codec support modules
+ are also required for working audio, in addition to
+ the via82cxxx_audio module itself.
+ </para>
+ </chapter>
+
+ <chapter id="reportbug">
+ <title>Submitting a bug report</title>
+ <sect1 id="bugrepdesc"><title>Description of problem</title>
+ <para>
+ Describe the application you were using to play/record sound, and how
+ to reproduce the problem.
+ </para>
+ </sect1>
+ <sect1 id="bugrepdiag"><title>Diagnostic output</title>
+ <para>
+ Obtain the via-audio-diag diagnostics program from
+ http://sf.net/projects/gkernel/ and provide a dump of the
+ audio chip's registers while the problem is occurring. Sample command line:
+ </para>
+ <programlisting>
+ ./via-audio-diag -aps > diag-output.txt
+ </programlisting>
+ </sect1>
+ <sect1 id="bugrepdebug"><title>Driver debug output</title>
+ <para>
+ Define <constant>VIA_DEBUG</constant> at the beginning of the driver, then capture and email
+ the kernel log output. This can be viewed in the system kernel log (if
+ enabled), or via the dmesg program. Sample command line:
+ </para>
+ <programlisting>
+ dmesg > /tmp/dmesg-output.txt
+ </programlisting>
+ </sect1>
+ <sect1 id="bugrepprintk"><title>Bigger kernel message buffer</title>
+ <para>
+ If you wish to increase the size of the buffer displayed by dmesg, then
+ change the <constant>LOG_BUF_LEN</constant> macro at the top of linux/kernel/printk.c, recompile
+ your kernel, and pass the <constant>LOG_BUF_LEN</constant> value to dmesg. Sample command line with
+ <constant>LOG_BUF_LEN</constant> == 32768:
+ </para>
+ <programlisting>
+ dmesg -s 32768 > /tmp/dmesg-output.txt
+ </programlisting>
+ </sect1>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ <variablelist>
+ <varlistentry><term>Low volume</term>
+ <listitem>
+ <para>
+ Volume too low on many systems. Workaround: use mixer program
+ such as xmixer to increase volume.
+ </para>
+ </listitem></varlistentry>
+
+ </variablelist>
+
+ </para>
+ </chapter>
+
+ <chapter id="thanks">
+ <title>Thanks</title>
+ <para>
+ Via for providing e-mail support, specs, and NDA'd source code.
+ </para>
+ <para>
+ MandrakeSoft for providing hacking time.
+ </para>
+ <para>
+ AC97 mixer interface fixes and debugging by Ron Cemer <email>roncemer@gte.net</email>.
+ </para>
+ <para>
+ Rui Sousa <email>rui.sousa@conexant.com</email>, for bugfixing
+ MMAP support, and several other notable fixes that resulted from
+ his hard work and testing.
+ </para>
+ <para>
+ Adrian Cox <email>adrian@humboldt.co.uk</email>, for bugfixing
+ MMAP support, and several other notable fixes that resulted from
+ his hard work and testing.
+ </para>
+ <para>
+ Thomas Sailer for further bugfixes.
+ </para>
+ </chapter>
+
+ <chapter id="notes">
+ <title>Random Notes</title>
+ <para>
+ Two /proc pseudo-files provide diagnostic information. This is generally
+ not useful to most users. Power users can disable CONFIG_SOUND_VIA82CXXX_PROCFS,
+ and remove the /proc support code. Once
+ version 2.0.0 is released, the /proc support code will be disabled by
+ default. Available /proc pseudo-files:
+ </para>
+ <programlisting>
+ /proc/driver/via/0/info
+ /proc/driver/via/0/ac97
+ </programlisting>
+ <para>
+ This driver by default supports all PCI audio devices which report
+ a vendor id of 0x1106, and a device id of 0x3058. Subsystem vendor
+ and device ids are not examined.
+ </para>
+ <para>
+ GNU indent formatting options:
+ <programlisting>
+-kr -i8 -ts8 -br -ce -bap -sob -l80 -pcs -cs -ss -bs -di1 -nbc -lp -psl
+ </programlisting>
+ </para>
+ <para>
+ Via has graciously donated e-mail support and source code to help further
+ the development of this driver. Their assistance has been invaluable
+ in the design and coding of the next major version of this driver.
+ </para>
+ <para>
+ The Via audio chip apparently provides a second PCM scatter-gather
+ DMA channel just for FM data, but does not have a full hardware MIDI
+ processor. I haven't put much thought towards a solution here, but it
+ might involve using SoftOSS midi wave table, or simply disabling MIDI
+ support altogether and using the FM PCM channel as a second (input? output?)
+ </para>
+ </chapter>
+
+ <chapter id="changelog">
+ <title>Driver ChangeLog</title>
+
+<sect1 id="version191"><title>
+Version 1.9.1
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ DSP read/write bugfixes from Thomas Sailer.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Add new PCI id for single-channel use of Via 8233.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Other bug fixes, tweaks, new ioctls.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version1115"><title>
+Version 1.1.15
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Support for variable fragment size and variable fragment number (Rui
+ Sousa)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Fixes for the SPEED, STEREO, CHANNELS, FMT ioctls when in read &amp;
+ write mode (Rui Sousa)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Mmaped sound is now fully functional. (Rui Sousa)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Make sure to enable PCI device before reading any of its PCI
+ config information. (fixes potential hotplug problems)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Clean up code a bit and add more internal function documentation.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ AC97 codec access fixes (Adrian Cox)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Big endian fixes (Adrian Cox)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ MIDI support (Adrian Cox)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Detect and report locked-rate AC97 codecs. If your hardware only
+ supports 48kHz (locked rate), then your recording/playback software
+ must upsample or downsample accordingly. The hardware cannot do it.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Use new pci_request_regions and pci_disable_device functions in
+ kernel 2.4.6.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version1114"><title>
+Version 1.1.14
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Use VM_RESERVE when available, to eliminate unnecessary page faults.
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version1112"><title>
+Version 1.1.12
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ mmap bug fixes from Linus.
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version1111"><title>
+Version 1.1.11
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Many more bug fixes. mmap enabled by default, but may still be buggy.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Uses new and spiffy method of mmap'ing the DMA buffer, based
+ on a suggestion from Linus.
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version1110"><title>
+Version 1.1.10
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Many bug fixes. mmap enabled by default, but may still be buggy.
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version119"><title>
+Version 1.1.9
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Redesign and rewrite audio playback implementation. (faster and smaller, hopefully)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Implement recording and full duplex (DSP_CAP_DUPLEX) support.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Make procfs support optional.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Quick interrupt status check, to lessen overhead in interrupt
+ sharing situations.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Add mmap(2) support. Disabled for now, it is still buggy and experimental.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Surround all syscalls with a semaphore for cheap and easy SMP protection.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Fix bug in channel shutdown (hardware channel reset) code.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Remove unnecessary spinlocks (better performance).
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Eliminate "unknown AFMT" message by using a different method
+ of selecting the best AFMT_xxx sound sample format for use.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Support for realtime hardware pointer position reporting
+ (DSP_CAP_REALTIME, SNDCTL_DSP_GETxPTR ioctls)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Support for capture/playback triggering
+ (DSP_CAP_TRIGGER, SNDCTL_DSP_SETTRIGGER ioctls)
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ SNDCTL_DSP_SETDUPLEX and SNDCTL_DSP_POST ioctls now handled.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Rewrite open(2) and close(2) logic to allow only one user at
+ a time. All other open(2) attempts will sleep until they succeed.
+ FIXME: open(O_RDONLY) and open(O_WRONLY) should be allowed to succeed.
+ </para>
+ </listitem>
+
+ <listitem>
+ <para>
+ Reviewed code to ensure that SMP and multiple audio devices
+ are fully supported.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version118"><title>
+Version 1.1.8
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Clean up interrupt handler output. Fixes the following kernel error message:
+ </para>
+ <programlisting>
+ unhandled interrupt ...
+ </programlisting>
+ </listitem>
+
+ <listitem>
+ <para>
+ Convert documentation to DocBook, so that PDF, HTML and PostScript (.ps) output is readily
+ available.
+ </para>
+ </listitem>
+
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version117"><title>
+Version 1.1.7
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Fix module unload bug where mixer device left registered
+ after driver exit
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version116"><title>
+Version 1.1.6
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Rewrite via_set_rate to mimic ALSA basic AC97 rate setting
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Remove much dead code
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Complete spin_lock_irqsave -> spin_lock_irq conversion in via_dsp_ioctl
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Fix build problem in via_dsp_ioctl
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Optimize included headers to eliminate headers found in linux/sound
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version115"><title>
+Version 1.1.5
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Disable some overly-verbose debugging code
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Remove unnecessary sound locks
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Fix some ioctls for better time resolution
+ </para>
+ </listitem>
+ <listitem>
+ <para>
+ Begin spin_lock_irqsave -> spin_lock_irq conversion in via_dsp_ioctl
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+<sect1 id="version114"><title>
+Version 1.1.4
+</title>
+ <itemizedlist spacing="compact">
+ <listitem>
+ <para>
+ Completed rewrite of driver. Eliminated SoundBlaster compatibility
+ completely, and now uses the much-faster scatter-gather DMA engine.
+ </para>
+ </listitem>
+ </itemizedlist>
+</sect1>
+
+ </chapter>
+
+ <chapter id="intfunctions">
+ <title>Internal Functions</title>
+!Isound/oss/via82cxxx_audio.c
+ </chapter>
+
+</book>
+
+
diff --git a/Documentation/DocBook/videobook.tmpl b/Documentation/DocBook/videobook.tmpl
new file mode 100644
index 000000000000..3ec6c875588a
--- /dev/null
+++ b/Documentation/DocBook/videobook.tmpl
@@ -0,0 +1,1663 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="V4LGuide">
+ <bookinfo>
+ <title>Video4Linux Programming</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@redhat.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2000</year>
+ <holder>Alan Cox</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ Parts of this document first appeared in Linux Magazine under a
+ ninety day exclusivity.
+ </para>
+ <para>
+ Video4Linux is intended to provide a common programming interface
+ for the many TV and capture cards now on the market, as well as
+ parallel port and USB video cameras. Radio, teletext decoders and
+ vertical blanking data interfaces are also provided.
+ </para>
+ </chapter>
+ <chapter id="radio">
+ <title>Radio Devices</title>
+ <para>
+ There are a wide variety of radio interfaces available for PCs, and these
+ are generally very simple to program. The biggest problem with supporting
+ such devices is normally extracting documentation from the vendor.
+ </para>
+ <para>
+ The radio interface supports a simple set of control ioctls standardised
+ across all radio and tv interfaces. It does not support read or write, which
+ are used for video streams. The reason radio cards do not allow you to read
+ the audio stream into an application is that without exception they provide
+ a connection on to a soundcard. Soundcards can be used to read the radio
+ data just fine.
+ </para>
+ <sect1 id="registerradio">
+ <title>Registering Radio Devices</title>
+ <para>
+ The Video4linux core provides an interface for registering devices. The
+ first step in writing our radio card driver is to register it.
+ </para>
+ <programlisting>
+
+
+static struct video_device my_radio =
+{
+ "My radio",
+ VID_TYPE_TUNER,
+ VID_HARDWARE_MYRADIO,
+ radio_open,
+ radio_close,
+ NULL, /* no read */
+ NULL, /* no write */
+ NULL, /* no poll */
+ radio_ioctl,
+ NULL, /* no special init function */
+ NULL /* no private data */
+};
+
+
+ </programlisting>
+ <para>
+ This declares our video4linux device driver interface. The VID_TYPE_ value
+ defines what kind of an interface we are, and defines basic capabilities.
+ </para>
+ <para>
+ The only defined value relevant for a radio card is VID_TYPE_TUNER which
+ indicates that the device can be tuned. Clearly our radio is going to have some
+ way to change channel so it is tuneable.
+ </para>
+ <para>
+ The VID_HARDWARE_ types are unique to each device. Numbers are assigned by
+ <email>alan@redhat.com</email> when device drivers are going to be released. Until then you
+ can pull a suitably large number out of your hat and use it. 10000 should be
+ safe for a very long time even allowing for the huge number of vendors
+ making new and different radio cards at the moment.
+ </para>
+ <para>
+ We declare an open and close routine, but we do not need read or write,
+ which are used to read and write video data to or from the card itself. As
+ we have no read or write there is no poll function.
+ </para>
+ <para>
+ The private initialise function is run when the device is registered. In
+ this driver we've already done all the work needed. The final pointer is a
+ private data pointer that can be used by the device driver to attach and
+ retrieve private data structures. We set this field "priv" to NULL for
+ the moment.
+ </para>
+ <para>
+ Having the structure defined is all very well but we now need to register it
+ with the kernel.
+ </para>
+ <programlisting>
+
+
+static int io = 0x320;
+
+int __init myradio_init(struct video_init *v)
+{
+ if(!request_region(io, MY_IO_SIZE, "myradio"))
+ {
+ printk(KERN_ERR
+ "myradio: port 0x%03X is in use.\n", io);
+ return -EBUSY;
+ }
+
+ if(video_device_register(&amp;my_radio, VFL_TYPE_RADIO)==-1) {
+ release_region(io, MY_IO_SIZE);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+ </programlisting>
+ <para>
+ The first stage of the initialisation, as is normally the case, is to check
+ that the I/O space we are about to fiddle with doesn't belong to some other
+ driver. If it does, we leave well alone. If the user gives the address of the
+ wrong device then we will spot this. These policies will generally avoid
+ crashing the machine.
+ </para>
+ <para>
+ Now we ask the Video4Linux layer to register the device for us. We hand it
+ our carefully designed video_device structure and also tell it which group
+ of devices we want it registered with. In this case VFL_TYPE_RADIO.
+ </para>
+ <para>
+ The types available are
+ </para>
+ <table frame="all"><title>Device Types</title>
+ <tgroup cols="3" align="left">
+ <tbody>
+ <row>
+ <entry>VFL_TYPE_RADIO</entry><entry>/dev/radio{n}</entry><entry>
+
+ Radio devices are assigned in this block. As with all of these
+ selections the actual number assignment is done by the video layer
+ according to what is free.</entry>
+ </row><row>
+ <entry>VFL_TYPE_GRABBER</entry><entry>/dev/video{n}</entry><entry>
+ Video capture devices and also -- counter-intuitively for the name --
+ hardware video playback devices such as MPEG2 cards.</entry>
+ </row><row>
+ <entry>VFL_TYPE_VBI</entry><entry>/dev/vbi{n}</entry><entry>
+ The VBI devices capture the hidden lines on a television picture
+ that carry further information like closed caption data, teletext
+ (primarily in Europe) and now Intercast and the ATVEC internet
+ television encodings.</entry>
+ </row><row>
+ <entry>VFL_TYPE_VTX</entry><entry>/dev/vtx{n}</entry><entry>
+ VTX is 'Videotext' also known as 'Teletext'. This is a system for
+ sending numbered, 40x25, mostly textual page images over the hidden
+ lines. Unlike the /dev/vbi interfaces, this is for 'smart' decoder
+ chips. (The use of the word smart here has to be taken in context,
+ the smartest teletext chips are fairly dumb pieces of technology).
+ </entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ We are most definitely a radio.
+ </para>
+ <para>
+ Finally we allocate our I/O space so that nobody treads on us and return 0
+ to signify general happiness with the state of the universe.
+ </para>
+ </sect1>
+ <sect1 id="openradio">
+ <title>Opening And Closing The Radio</title>
+
+ <para>
+ The functions we declared in our video_device are mostly very simple.
+ Firstly we can drop in what is basically standard code for open and close.
+ </para>
+ <programlisting>
+
+
+static int users = 0;
+
+static int radio_open(struct video_device *dev, int flags)
+{
+ if(users)
+ return -EBUSY;
+ users++;
+ return 0;
+}
+
+ </programlisting>
+ <para>
+ At open time we need to do nothing but check if someone else is also using
+ the radio card. If nobody is using it we make a note that we are using it,
+ then we ensure that nobody unloads our driver on us.
+ </para>
+ <programlisting>
+
+
+static int radio_close(struct video_device *dev)
+{
+ users--;
+}
+
+ </programlisting>
+ <para>
+ At close time we simply need to reduce the user count and allow the module
+ to become unloadable.
+ </para>
+ <para>
+ If you are sharp you will have noticed neither the open nor the close
+ routines attempt to reset or change the radio settings. This is intentional.
+ It allows an application to set up the radio and exit. It avoids a user
+ having to leave an application running all the time just to listen to the
+ radio.
+ </para>
+ </sect1>
+ <sect1 id="ioctlradio">
+ <title>The Ioctl Interface</title>
+ <para>
+ This leaves the ioctl routine, without which the driver will not be
+ terribly useful to anyone.
+ </para>
+ <programlisting>
+
+
+static int radio_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
+{
+ switch(cmd)
+ {
+ case VIDIOCGCAP:
+ {
+ struct video_capability v;
+ v.type = VID_TYPE_TUNER;
+ v.channels = 1;
+ v.audios = 1;
+ v.maxwidth = 0;
+ v.minwidth = 0;
+ v.maxheight = 0;
+ v.minheight = 0;
+ strcpy(v.name, "My Radio");
+ if(copy_to_user(arg, &amp;v, sizeof(v)))
+ return -EFAULT;
+ return 0;
+ }
+
+ </programlisting>
+ <para>
+ VIDIOCGCAP is the first ioctl all video4linux devices must support. It
+ allows the applications to find out what sort of a card they have found and
+ to figure out what they want to do about it. The fields in the structure are
+ </para>
+ <table frame="all"><title>struct video_capability fields</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>name</entry><entry>The device text name. This is intended for the user.</entry>
+ </row><row>
+ <entry>channels</entry><entry>The number of different channels you can tune on
+ this card. It could even be zero for a card that has
+ no tuning capability. For our simple FM radio it is 1.
+ An AM/FM radio would report 2.</entry>
+ </row><row>
+ <entry>audios</entry><entry>The number of audio inputs on this device. For our
+ radio there is only one audio input.</entry>
+ </row><row>
+ <entry>minwidth,minheight</entry><entry>The smallest size the card is capable of capturing
+ images in. We set these to zero. Radios do not
+ capture pictures.</entry>
+ </row><row>
+ <entry>maxwidth,maxheight</entry><entry>The largest image size the card is capable of
+ capturing. For our radio we report 0.
+ </entry>
+ </row><row>
+ <entry>type</entry><entry>This reports the capabilities of the device, and
+ matches the field we filled in in the struct
+ video_device when registering.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ Having filled in the fields, we use copy_to_user to copy the structure into
+ the user's buffer. If the copy fails we return an EFAULT to the application
+ so that it knows it tried to feed us garbage.
+ </para>
+ <para>
+ The next pair of ioctl operations select which tuner is to be used and let
+ the application find the tuner properties. We have only a single FM band
+ tuner in our example device.
+ </para>
+ <programlisting>
+
+
+ case VIDIOCGTUNER:
+ {
+ struct video_tuner v;
+ if(copy_from_user(&amp;v, arg, sizeof(v))!=0)
+ return -EFAULT;
+ if(v.tuner)
+ return -EINVAL;
+ v.rangelow=(87*16000);
+ v.rangehigh=(108*16000);
+ v.flags = VIDEO_TUNER_LOW;
+ v.mode = VIDEO_MODE_AUTO;
+ v.signal = 0xFFFF;
+ strcpy(v.name, "FM");
+ if(copy_to_user(arg, &amp;v, sizeof(v))!=0)
+ return -EFAULT;
+ return 0;
+ }
+
+ </programlisting>
+ <para>
+ The VIDIOCGTUNER ioctl allows applications to query a tuner. The application
+ sets the tuner field to the tuner number it wishes to query. The query does
+ not change the tuner that is being used, it merely enquires about the tuner
+ in question.
+ </para>
+ <para>
+ We have exactly one tuner so after copying the user buffer to our temporary
+ structure we complain if they asked for a tuner other than tuner 0.
+ </para>
+ <para>
+ The video_tuner structure has the following fields
+ </para>
+ <table frame="all"><title>struct video_tuner fields</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>int tuner</entry><entry>The number of the tuner in question</entry>
+ </row><row>
+ <entry>char name[32]</entry><entry>A text description of this tuner. "FM" will do fine.
+ This is intended for the application.</entry>
+ </row><row>
+ <entry>u32 flags</entry>
+ <entry>Tuner capability flags</entry>
+ </row>
+ <row>
+ <entry>u16 mode</entry><entry>The current reception mode</entry>
+
+ </row><row>
+ <entry>u16 signal</entry><entry>The signal strength scaled between 0 and 65535. If
+ a device cannot tell the signal strength it should
+ report 65535. Many simple cards contain only a
+ signal/no signal bit. Such cards will report either
+ 0 or 65535.</entry>
+
+ </row><row>
+ <entry>u32 rangelow, rangehigh</entry><entry>
+ The range of frequencies supported by the radio
+ or TV. It is scaled according to the VIDEO_TUNER_LOW
+ flag.</entry>
+
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <table frame="all"><title>struct video_tuner flags</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>VIDEO_TUNER_PAL</entry><entry>A PAL TV tuner</entry>
+ </row><row>
+ <entry>VIDEO_TUNER_NTSC</entry><entry>An NTSC (US) TV tuner</entry>
+ </row><row>
+ <entry>VIDEO_TUNER_SECAM</entry><entry>A SECAM (French) TV tuner</entry>
+ </row><row>
+ <entry>VIDEO_TUNER_LOW</entry><entry>
+ The tuner frequency is scaled in 1/16th of a KHz
+ steps. If not it is in 1/16th of a MHz steps
+ </entry>
+ </row><row>
+ <entry>VIDEO_TUNER_NORM</entry><entry>The tuner can set its format</entry>
+ </row><row>
+ <entry>VIDEO_TUNER_STEREO_ON</entry><entry>The tuner is currently receiving a stereo signal</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <table frame="all"><title>struct video_tuner modes</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>VIDEO_MODE_PAL</entry><entry>PAL Format</entry>
+ </row><row>
+ <entry>VIDEO_MODE_NTSC</entry><entry>NTSC Format (USA)</entry>
+ </row><row>
+ <entry>VIDEO_MODE_SECAM</entry><entry>French Format</entry>
+ </row><row>
+ <entry>VIDEO_MODE_AUTO</entry><entry>A device that does not need to do
+ TV format switching</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ The settings for the radio card are thus fairly simple. We report that we
+ are a tuner called "FM" for FM radio. In order to get the best tuning
+ resolution we report VIDEO_TUNER_LOW and select tuning to 1/16th of a KHz. It's
+ unlikely our card can do that resolution but it is a fair bet the card can
+ do better than 1/16th of a MHz. VIDEO_TUNER_LOW is appropriate to almost all
+ radio usage.
+ </para>
+ <para>
+ We report that the tuner automatically handles deciding what format it is
+ receiving - true enough as it only handles FM radio. Our example card is
+ also incapable of detecting stereo or signal strengths so it reports a
+ strength of 0xFFFF (maximum) and no stereo detected.
+ </para>
+ <para>
+ To finish off we set the range that can be tuned to be 87-108MHz, the normal
+ FM broadcast radio range. It is important to find out what the card is
+ actually capable of tuning. It is easy enough to simply use the FM broadcast
+ range. Unfortunately if you do this you will discover the FM broadcast
+ ranges in the USA, Europe and Japan are all subtly different and some users
+ cannot receive all the stations they wish.
+ </para>
+ <para>
+ The application also needs to be able to set the tuner it wishes to use. In
+ our case, with a single tuner this is rather simple to arrange.
+ </para>
+ <programlisting>
+
+ case VIDIOCSTUNER:
+ {
+ struct video_tuner v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.tuner != 0)
+ return -EINVAL;
+ return 0;
+ }
+
+ </programlisting>
+ <para>
+ We copy the user supplied structure into kernel memory so we can examine it.
+ If the user has selected a tuner other than zero we reject the request. If
+ they wanted tuner 0 then, surprisingly enough, that is the current tuner already.
+ </para>
+ <para>
+ The next two ioctls we need to provide are to get and set the frequency of
+ the radio. These both use an unsigned long argument which is the frequency.
+ The scale of the frequency depends on the VIDEO_TUNER_LOW flag as I
+ mentioned earlier on. Since we have VIDEO_TUNER_LOW set this will be in
+ 1/16ths of a KHz.
+ </para>
+ <programlisting>
+
+static unsigned long current_freq;
+
+
+
+ case VIDIOCGFREQ:
+ /* Hand the remembered frequency back to the caller */
+ if(copy_to_user(arg, &amp;current_freq,
+ sizeof(unsigned long)))
+ return -EFAULT;
+ return 0;
+
+ </programlisting>
+ <para>
+ Querying the frequency in our case is relatively simple. Our radio card is
+ too dumb to let us query the frequency it is tuned to so we remember our setting if
+ we know it. All we have to do is copy it to the user.
+ </para>
+ <programlisting>
+
+
+ case VIDIOCSFREQ:
+ {
+ /* unsigned long so it matches current_freq and the size copied */
+ unsigned long freq;
+ /* copy_from_user takes the kernel destination first */
+ if(copy_from_user(&amp;freq, arg,
+ sizeof(unsigned long))!=0)
+ return -EFAULT;
+ if(hardware_set_freq(freq)&lt;0)
+ return -EINVAL;
+ /* Only remember the setting once the hardware accepted it */
+ current_freq = freq;
+ return 0;
+ }
+
+ </programlisting>
+ <para>
+ Setting the frequency is a little more complex. We begin by copying the
+ desired frequency into kernel space. Next we call a hardware specific routine
+ to set the radio up. This might be as simple as some scaling and a few
+ writes to an I/O port. For most radio cards it turns out a good deal more
+ complicated and may involve programming things like a phase locked loop on
+ the card. This is what documentation is for.
+ </para>
+ <para>
+ The final set of operations we need to provide for our radio are the
+ volume controls. Not all radio cards can even do volume control. After all
+ there is a perfectly good volume control on the sound card. We will assume
+ our radio card has a simple 4 step volume control.
+ </para>
+ <para>
+ There are two ioctls with audio we need to support
+ </para>
+ <programlisting>
+
+static int current_volume=0;
+
+ case VIDIOCGAUDIO:
+ {
+ /* Describe audio input 0, the only input we accept */
+ struct video_audio v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.audio != 0)
+ return -EINVAL;
+ v.volume = 16384*current_volume;
+ v.step = 16384;
+ strcpy(v.name, "Radio");
+ v.mode = VIDEO_SOUND_MONO;
+ v.balance = 0;
+ v.bass = 0; /* the structure member is named bass */
+ v.treble = 0;
+
+ /* copy_to_user takes the user space destination first */
+ if(copy_to_user(arg, &amp;v, sizeof(v)))
+ return -EFAULT;
+ return 0;
+ }
+
+ </programlisting>
+ <para>
+ Much like the tuner we start by copying the user structure into kernel
+ space. Again we check if the user has asked for a valid audio input. We have
+ only input 0 and we punt if they ask for another input.
+ </para>
+ <para>
+ Then we fill in the video_audio structure. This has the following format
+ </para>
+ <table frame="all"><title>struct video_audio fields</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>audio</entry><entry>The input the user wishes to query</entry>
+ </row><row>
+ <entry>volume</entry><entry>The volume setting on a scale of 0-65535</entry>
+ </row><row>
+ <entry>bass</entry><entry>The bass level on a scale of 0-65535</entry>
+ </row><row>
+ <entry>treble</entry><entry>The treble level on a scale of 0-65535</entry>
+ </row><row>
+ <entry>flags</entry><entry>The features this audio device supports
+ </entry>
+ </row><row>
+ <entry>name</entry><entry>A text name to display to the user. We picked
+ "Radio" as it explains things quite nicely.</entry>
+ </row><row>
+ <entry>mode</entry><entry>The current reception mode for the audio
+
+ We report MONO because our card is too stupid to know if it is in
+ mono or stereo.
+ </entry>
+ </row><row>
+ <entry>balance</entry><entry>The stereo balance on a scale of 0-65535, 32768 is
+ middle.</entry>
+ </row><row>
+ <entry>step</entry><entry>The step by which the volume control jumps. This is
+ used to help make it easy for applications to set
+ slider behaviour.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <table frame="all"><title>struct video_audio flags</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>VIDEO_AUDIO_MUTE</entry><entry>The audio is currently muted. We
+ could fake this in our driver but we
+ choose not to bother.</entry>
+ </row><row>
+ <entry>VIDEO_AUDIO_MUTABLE</entry><entry>The input has a mute option</entry>
+ </row><row>
+ <entry>VIDEO_AUDIO_TREBLE</entry><entry>The input has a treble control</entry>
+ </row><row>
+ <entry>VIDEO_AUDIO_BASS</entry><entry>The input has a bass control</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+
+ <table frame="all"><title>struct video_audio modes</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>VIDEO_SOUND_MONO</entry><entry>Mono sound</entry>
+ </row><row>
+ <entry>VIDEO_SOUND_STEREO</entry><entry>Stereo sound</entry>
+ </row><row>
+ <entry>VIDEO_SOUND_LANG1</entry><entry>Alternative language 1 (TV specific)</entry>
+ </row><row>
+ <entry>VIDEO_SOUND_LANG2</entry><entry>Alternative language 2 (TV specific)</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ Having filled in the structure we copy it back to user space.
+ </para>
+ <para>
+ The VIDIOCSAUDIO ioctl allows the user to set the audio parameters in the
+ video_audio structure. The driver does its best to honour the request.
+ </para>
+ <programlisting>
+
+ case VIDIOCSAUDIO:
+ {
+ struct video_audio v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.audio)
+ return -EINVAL;
+ /* Scale the requested volume down to a hardware step number */
+ current_volume = v.volume/16384;
+ hardware_set_volume(current_volume);
+ return 0;
+ }
+
+ </programlisting>
+ <para>
+ In our case there is very little that the user can set. The volume is
+ basically the limit. Note that we could pretend to have a mute feature
+ by rewriting this to
+ </para>
+ <programlisting>
+
+ case VIDIOCSAUDIO:
+ {
+ struct video_audio v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.audio)
+ return -EINVAL;
+ /* Scale the requested volume down to a hardware step number */
+ current_volume = v.volume/16384;
+ /* Fake a mute by dropping the hardware volume to zero */
+ if(v.flags&amp;VIDEO_AUDIO_MUTE)
+ hardware_set_volume(0);
+ else
+ hardware_set_volume(current_volume);
+ /* current_muted is assumed to be declared next to current_volume */
+ current_muted = v.flags &amp;
+ VIDEO_AUDIO_MUTE;
+ return 0;
+ }
+
+ </programlisting>
+ <para>
+ This with the corresponding changes to the VIDIOCGAUDIO code to report the
+ state of the mute flag we save and to report the card has a mute function,
+ will allow applications to use a mute facility with this card. It is
+ questionable whether this is a good idea however. User applications can already
+ fake this themselves and kernel space is precious.
+ </para>
+ <para>
+ We now have a working radio ioctl handler. So we just wrap up the function
+ </para>
+ <programlisting>
+
+
+ }
+ return -ENOIOCTLCMD;
+}
+
+ </programlisting>
+ <para>
+ and pass the Video4Linux layer back an error so that it knows we did not
+ understand the request we got passed.
+ </para>
+ </sect1>
+ <sect1 id="modradio">
+ <title>Module Wrapper</title>
+ <para>
+ Finally we add in the usual module wrapping and the driver is done.
+ </para>
+ <programlisting>
+
+#ifndef MODULE
+
+static int io = 0x300;
+
+#else
+
+static int io = -1;
+
+#endif
+
+MODULE_AUTHOR("Alan Cox");
+MODULE_DESCRIPTION("A driver for an imaginary radio card.");
+module_param(io, int, 0444);
+MODULE_PARM_DESC(io, "I/O address of the card.");
+
+/* Entry point: fail with -EINVAL if no I/O address was given, otherwise hand over to myradio_init() */
+static int __init init(void)
+{
+ if(io==-1)
+ {
+ printk(KERN_ERR
+ "You must set an I/O address with io=0x???\n");
+ return -EINVAL;
+ }
+ return myradio_init(NULL);
+}
+
+/* Exit point: unregister the video device and release the I/O region */
+static void __exit cleanup(void)
+{
+ video_unregister_device(&amp;my_radio);
+ release_region(io, MY_IO_SIZE);
+}
+
+module_init(init);
+module_exit(cleanup);
+
+ </programlisting>
+ <para>
+ In this example we set the IO base by default if the driver is compiled into
+ the kernel: you can still set it using "my_radio.io" if this file is called <filename>my_radio.c</filename>. For the module we require the
+ user sets the parameter. We set io to a nonsense port (-1) so that we can
+ tell if the user supplied an io parameter or not.
+ </para>
+ <para>
+ We use MODULE_ defines to give an author for the card driver and a
+ description. We also use them to declare that io is an integer and it is the
+ address of the card, and can be read by anyone from sysfs.
+ </para>
+ <para>
+ The clean-up routine unregisters the video_device we registered, and frees
+ up the I/O space. Note that the unregister takes the actual video_device
+ structure as its argument. Unlike the file operations structure which can be
+ shared by all instances of a device, a video_device structure is an actual
+ instance of the device. If you are registering multiple radio devices you
+ need to fill in one structure per device (most likely by setting up a
+ template and copying it to each of the actual device structures).
+ </para>
+ </sect1>
+ </chapter>
+ <chapter>
+ <title>Video Capture Devices</title>
+ <sect1 id="introvid">
+ <title>Video Capture Device Types</title>
+ <para>
+ The video capture devices share the same interfaces as radio devices. In
+ order to explain the video capture interface I will use the example of a
+ camera that has no tuners or audio input. This keeps the example relatively
+ clean. To get both combine the two driver examples.
+ </para>
+ <para>
+ Video capture devices divide into four categories. A little technology
+ backgrounder. Full motion video even at television resolution (which is
+ actually fairly low) is pretty resource-intensive. You are continually
+ passing megabytes of data every second from the capture card to the display.
+ Several alternative approaches have emerged because copying this through the
+ processor and the user program is a particularly bad idea.
+ </para>
+ <para>
+ The first is to add the television image onto the video output directly.
+ This is also how some 3D cards work. These basic cards can generally drop the
+ video into any chosen rectangle of the display. Cards like this, which
+ include most mpeg1 cards that used the feature connector, aren't very
+ friendly in a windowing environment. They don't understand windows or
+ clipping. The video window is always on the top of the display.
+ </para>
+ <para>
+ Chroma keying is a technique used by cards to get around this. It is an old
+ television mixing trick where you mark all the areas you wish to replace
+ with a single clear colour that isn't used in the image - TV people use an
+ incredibly bright blue while computing people often use a particularly
+ virulent purple. Bright blue occurs on the desktop. Anyone with virulent
+ purple windows has another problem besides their TV overlay.
+ </para>
+ <para>
+ The third approach is to copy the data from the capture card to the video
+ card, but to do it directly across the PCI bus. This relieves the processor
+ from doing the work but does require some smartness on the part of the video
+ capture chip, as well as a suitable video card. Programming this kind of
+ card and more so debugging it can be extremely tricky. There are some quite
+ complicated interactions with the display and you may also have to cope with
+ various chipset bugs that show up when PCI cards start talking to each
+ other.
+ </para>
+ <para>
+ To keep our example fairly simple we will assume a card that supports
+ overlaying a flat rectangular image onto the frame buffer output, and which
+ can also capture stuff into processor memory.
+ </para>
+ </sect1>
+ <sect1 id="regvid">
+ <title>Registering Video Capture Devices</title>
+ <para>
+ This time we need to add more functions for our camera device.
+ </para>
+ <programlisting>
+/* Device description handed to the Video4Linux layer for our camera */
+static struct video_device my_camera =
+{
+ "My Camera",
+ VID_TYPE_OVERLAY|VID_TYPE_SCALES|\
+ VID_TYPE_CAPTURE|VID_TYPE_CHROMAKEY,
+ VID_HARDWARE_MYCAMERA,
+ camera_open,
+ camera_close,
+ camera_read, /* read returns captured frames */
+ NULL, /* no write */
+ camera_poll, /* poll waits for the next frame */
+ camera_ioctl,
+ NULL, /* no special init function */
+ NULL /* no private data */
+};
+ </programlisting>
+ <para>
+ We need a read() function which is used for capturing data from
+ the card, and we need a poll function so that a driver can wait for the next
+ frame to be captured.
+ </para>
+ <para>
+ We use the extra video capability flags that did not apply to the
+ radio interface. The video related flags are
+ </para>
+ <table frame="all"><title>Capture Capabilities</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+<entry>VID_TYPE_CAPTURE</entry><entry>We support image capture</entry>
+</row><row>
+<entry>VID_TYPE_TELETEXT</entry><entry>A teletext capture device (vbi[n])</entry>
+</row><row>
+<entry>VID_TYPE_OVERLAY</entry><entry>The image can be directly overlaid onto the
+ frame buffer</entry>
+</row><row>
+<entry>VID_TYPE_CHROMAKEY</entry><entry>Chromakey can be used to select which parts
+ of the image to display</entry>
+</row><row>
+<entry>VID_TYPE_CLIPPING</entry><entry>It is possible to give the board a list of
+ rectangles to draw around. </entry>
+</row><row>
+<entry>VID_TYPE_FRAMERAM</entry><entry>The video capture goes into the video memory
+ and actually changes it. Applications need
+ to know this so they can clean up after the
+ card</entry>
+</row><row>
+<entry>VID_TYPE_SCALES</entry><entry>The image can be scaled to various sizes,
+ rather than being a single fixed size.</entry>
+</row><row>
+<entry>VID_TYPE_MONOCHROME</entry><entry>The capture will be monochrome. This isn't a
+ complete answer to the question since a mono
+ camera on a colour capture card will still
+ produce mono output.</entry>
+</row><row>
+<entry>VID_TYPE_SUBCAPTURE</entry><entry>The card allows only part of its field of
+ view to be captured. This enables
+ applications to avoid copying all of a large
+ image into memory when only some section is
+ relevant.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ We set VID_TYPE_CAPTURE so that we are seen as a capture card,
+ VID_TYPE_CHROMAKEY so the application knows it is time to draw in virulent
+ purple, and VID_TYPE_SCALES because we can be resized.
+ </para>
+ <para>
+ Our setup is fairly similar. This time we also want an interrupt line
+ for the 'frame captured' signal. Not all cards have this so some of them
+ cannot handle poll().
+ </para>
+ <programlisting>
+
+
+static int io = 0x320;
+static int irq = 11;
+
+/* Claim the I/O region and register the camera with the Video4Linux layer */
+int __init mycamera_init(struct video_init *v)
+{
+ if(!request_region(io, MY_IO_SIZE, "mycamera"))
+ {
+ printk(KERN_ERR
+ "mycamera: port 0x%03X is in use.\n", io);
+ return -EBUSY;
+ }
+
+ /* Any negative return is a failure; give the ports back before bailing out */
+ if(video_register_device(&amp;my_camera,
+ VFL_TYPE_GRABBER)&lt;0) {
+ release_region(io, MY_IO_SIZE);
+ return -EINVAL;
+ }
+ return 0;
+}
+
+ </programlisting>
+ <para>
+ This is little changed from the needs of the radio card. We specify
+ VFL_TYPE_GRABBER this time as we want to be allocated a /dev/video name.
+ </para>
+ </sect1>
+ <sect1 id="opvid">
+ <title>Opening And Closing The Capture Device</title>
+ <programlisting>
+
+
+static int users = 0;
+
+/* Open the camera: allow a single user and claim the interrupt line */
+static int camera_open(struct video_device *dev, int flags)
+{
+ if(users)
+ return -EBUSY;
+ /* dev is passed as the cookie so free_irq can name the same handler later */
+ if(request_irq(irq, camera_irq, 0, "camera", dev)&lt;0)
+ return -EBUSY;
+ users++;
+ return 0;
+}
+
+
+/* Close the camera: drop the user count and give the interrupt line back */
+static int camera_close(struct video_device *dev)
+{
+ users--;
+ free_irq(irq, dev);
+ return 0; /* the function is declared int, so it must return a value */
+}
+ </programlisting>
+ <para>
+ The open and close routines are also quite similar. The only real change is
+ that we now request an interrupt for the camera device interrupt line. If we
+ cannot get the interrupt we report EBUSY to the application and give up.
+ </para>
+ </sect1>
+ <sect1 id="irqvid">
+ <title>Interrupt Handling</title>
+ <para>
+ Our example handler is for an ISA bus device. If it was PCI you would be
+ able to share the interrupt and would have set SA_SHIRQ to indicate a
+ shared IRQ. We pass the device pointer as the interrupt routine argument. We
+ don't need to since we only support one card but doing this will make it
+ easier to upgrade the driver for multiple devices in the future.
+ </para>
+ <para>
+ Our interrupt routine needs to do little if we assume the card can simply
+ queue one frame to be read after it captures it.
+ </para>
+ <programlisting>
+
+
+static struct wait_queue *capture_wait;
+static int capture_ready = 0;
+
+/* Interrupt handler: flag that a frame is ready and wake anyone sleeping on capture_wait */
+static void camera_irq(int irq, void *dev_id,
+ struct pt_regs *regs)
+{
+ capture_ready=1;
+ wake_up_interruptible(&amp;capture_wait);
+}
+ </programlisting>
+ <para>
+ The interrupt handler is nice and simple for this card as we are assuming
+ the card is buffering the frame for us. This means we have little to do but
+ wake up anybody interested. We also set a capture_ready flag, as we may
+ capture a frame before an application needs it. In this case we need to know
+ that a frame is ready. If we had to collect the frame on the interrupt life
+ would be more complex.
+ </para>
+ <para>
+ The two new routines we need to supply are camera_read which returns a
+ frame, and camera_poll which waits for a frame to become ready.
+ </para>
+ <programlisting>
+
+
+/* Poll support: register on capture_wait and report readable once a frame is ready */
+static int camera_poll(struct video_device *dev,
+ struct file *file, struct poll_table *wait)
+{
+ poll_wait(file, &amp;capture_wait, wait);
+ /* capture_ready is the flag set by camera_irq */
+ if(capture_ready)
+ return POLLIN|POLLRDNORM;
+ return 0;
+}
+
+ </programlisting>
+ <para>
+ Our wait queue for polling is the capture_wait queue. This will cause the
+ task to be woken up by our camera_irq routine. We check capture_ready to see
+ if there is an image present and if so report that it is readable.
+ </para>
+ </sect1>
+ <sect1 id="rdvid">
+ <title>Reading The Video Image</title>
+ <programlisting>
+
+
+static long camera_read(struct video_device *dev, char *buf,
+ unsigned long count)
+{
+ struct wait_queue wait = { current, NULL };
+ u8 *ptr;
+ int len;
+ int i;
+
+ add_wait_queue(&amp;capture_wait, &amp;wait);
+
+ while(!capture_ready)
+ {
+ if(file->flags&amp;O_NDELAY)
+ {
+ remove_wait_queue(&amp;capture_wait, &amp;wait);
+ current->state = TASK_RUNNING;
+ return -EWOULDBLOCK;
+ }
+ if(signal_pending(current))
+ {
+ remove_wait_queue(&amp;capture_wait, &amp;wait);
+ current->state = TASK_RUNNING;
+ return -ERESTARTSYS;
+ }
+ schedule();
+ current->state = TASK_INTERRUPTIBLE;
+ }
+ remove_wait_queue(&amp;capture_wait, &amp;wait);
+ current->state = TASK_RUNNING;
+
+ </programlisting>
+ <para>
+ The first thing we have to do is to ensure that the application waits until
+ the next frame is ready. The code here is almost identical to the mouse code
+ we used earlier in this chapter. It is one of the common building blocks of
+ Linux device driver code and probably one which you will find occurs in any
+ drivers you write.
+ </para>
+ <para>
+ We wait for a frame to be ready, or for a signal to interrupt our waiting. If a
+ signal occurs we need to return from the system call so that the signal can
+ be sent to the application itself. We also check to see if the user actually
+ wanted to avoid waiting - ie if they are using non-blocking I/O and have other things
+ to get on with.
+ </para>
+ <para>
+ Next we copy the data from the card to the user application. This is rarely
+ as easy as our example makes out. We will add capture_w, and capture_h here
+ to hold the width and height of the captured image. We assume the card only
+ supports 24bit RGB for now.
+ </para>
+ <programlisting>
+
+
+
+ capture_ready = 0;
+
+ ptr=(u8 *)buf;
+ len = capture_w * 3 * capture_h; /* 24bit RGB */
+
+ if(len>count)
+ len=count; /* Doesn't all fit */
+
+ for(i=0; i&lt;len; i++)
+ {
+ put_user(inb(io+IMAGE_DATA), ptr);
+ ptr++;
+ }
+
+ hardware_restart_capture();
+
+ return i;
+}
+
+ </programlisting>
+ <para>
+ For a real hardware device you would try to avoid the loop with put_user().
+ Each call to put_user() has a time overhead checking whether the accesses to user
+ space are allowed. It would be better to read a line into a temporary buffer
+ then copy this to user space in one go.
+ </para>
+ <para>
+ Having captured the image and put it into user space we can kick the card to
+ get the next frame acquired.
+ </para>
+ </sect1>
+ <sect1 id="iocvid">
+ <title>Video Ioctl Handling</title>
+ <para>
+ As with the radio driver the major control interface is via the ioctl()
+ function. Video capture devices support the same tuner calls as a radio
+ device and also support additional calls to control how the video functions
+ are handled. In this simple example the card has no tuners to avoid making
+ the code complex.
+ </para>
+ <programlisting>
+
+
+
+static int camera_ioctl(struct video_device *dev, unsigned int cmd, void *arg)
+{
+ switch(cmd)
+ {
+ case VIDIOCGCAP:
+ {
+ struct video_capability v;
+ v.type = VID_TYPE_CAPTURE|\
+ VID_TYPE_CHROMAKEY|\
+ VID_TYPE_SCALES|\
+ VID_TYPE_OVERLAY;
+ v.channels = 1;
+ v.audios = 0;
+ v.maxwidth = 640;
+ v.minwidth = 16;
+ v.maxheight = 480;
+ v.minheight = 16;
+ strcpy(v.name, "My Camera");
+ if(copy_to_user(arg, &amp;v, sizeof(v)))
+ return -EFAULT;
+ return 0;
+ }
+
+
+ </programlisting>
+ <para>
+ The first ioctl we must support and which all video capture and radio
+ devices are required to support is VIDIOCGCAP. This behaves exactly the same
+ as with a radio device. This time, however, we report the extra capabilities
+ we outlined earlier on when defining our video_device structure.
+ </para>
+ <para>
+ We now set the video flags saying that we support overlay, capture,
+ scaling and chromakey. We also report size limits - our smallest image is
+ 16x16 pixels, our largest is 640x480.
+ </para>
+ <para>
+ To keep things simple we report no audio and no tuning capabilities at all.
+ </para>
+ <programlisting>
+
+ case VIDIOCGCHAN:
+ {
+ /* Describe channel 0, our single camera input */
+ struct video_channel v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.channel != 0)
+ return -EINVAL;
+ v.flags = 0;
+ v.tuners = 0;
+ v.type = VIDEO_TYPE_CAMERA;
+ v.norm = VIDEO_MODE_AUTO;
+ strcpy(v.name, "Camera Input");
+ /* copy_to_user takes the user space destination first */
+ if(copy_to_user(arg, &amp;v, sizeof(v)))
+ return -EFAULT;
+ return 0;
+ }
+
+
+ </programlisting>
+ <para>
+ This follows what is very much the standard way an ioctl handler looks
+ in Linux. We copy the data into a kernel space variable and we check that the
+ request is valid (in this case that the input is 0). Finally we copy the
+ camera info back to the user.
+ </para>
+ <para>
+ The VIDIOCGCHAN ioctl allows a user to ask about video channels (that is
+ inputs to the video card). Our example card has a single camera input. The
+ fields in the structure are
+ </para>
+ <table frame="all"><title>struct video_channel fields</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+
+ <entry>channel</entry><entry>The channel number we are selecting</entry>
+ </row><row>
+ <entry>name</entry><entry>The name for this channel. This is intended
+ to describe the port to the user.
+ Appropriate names are therefore things like
+ "Camera" "SCART input"</entry>
+ </row><row>
+ <entry>flags</entry><entry>Channel properties</entry>
+ </row><row>
+ <entry>type</entry><entry>Input type</entry>
+ </row><row>
+ <entry>norm</entry><entry>The current television encoding being used
+ if relevant for this channel.
+ </entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <table frame="all"><title>struct video_channel flags</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>VIDEO_VC_TUNER</entry><entry>Channel has a tuner.</entry>
+ </row><row>
+ <entry>VIDEO_VC_AUDIO</entry><entry>Channel has audio.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <table frame="all"><title>struct video_channel types</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>VIDEO_TYPE_TV</entry><entry>Television input.</entry>
+ </row><row>
+ <entry>VIDEO_TYPE_CAMERA</entry><entry>Fixed camera input.</entry>
+ </row><row>
+ <entry>0</entry><entry>Type is unknown.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <table frame="all"><title>struct video_channel norms</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>VIDEO_MODE_PAL</entry><entry>PAL encoded Television</entry>
+ </row><row>
+ <entry>VIDEO_MODE_NTSC</entry><entry>NTSC (US) encoded Television</entry>
+ </row><row>
+ <entry>VIDEO_MODE_SECAM</entry><entry>SECAM (French) Television </entry>
+ </row><row>
+ <entry>VIDEO_MODE_AUTO</entry><entry>Automatic switching, or format does not
+ matter</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ The corresponding VIDIOCSCHAN ioctl allows a user to change channel and to
+ request the norm is changed - for example to switch between a PAL or an NTSC
+ format camera.
+ </para>
+ <programlisting>
+
+
+ case VIDIOCSCHAN:
+ {
+ struct video_channel v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.channel != 0)
+ return -EINVAL;
+ if(v.norm != VIDEO_MODE_AUTO)
+ return -EINVAL;
+ return 0;
+ }
+
+
+ </programlisting>
+ <para>
+ The implementation of this call in our driver is remarkably easy. Because we
+ are assuming fixed format hardware we need only check that the user has not
+ tried to change anything.
+ </para>
+ <para>
+ The user also needs to be able to configure and adjust the picture they are
+ seeing. This is much like adjusting a television set. A user application
+ also needs to know the palette being used so that it knows how to display
+ the image that has been captured. The VIDIOCGPICT and VIDIOCSPICT ioctl
+ calls provide this information.
+ </para>
+ <programlisting>
+
+
+ case VIDIOCGPICT:
+ {
+ /* Report the current picture controls and our fixed RGB24 format */
+ struct video_picture v;
+ v.brightness = hardware_brightness();
+ v.hue = hardware_hue();
+ v.colour = hardware_saturation();
+ v.contrast = hardware_contrast();
+ /* Not settable */
+ v.whiteness = 32768;
+ v.depth = 24; /* 24bit */
+ v.palette = VIDEO_PALETTE_RGB24;
+ /* copy_to_user takes the user space destination first */
+ if(copy_to_user(arg, &amp;v,
+ sizeof(v)))
+ return -EFAULT;
+ return 0;
+ }
+
+
+ </programlisting>
+ <para>
+ The brightness, hue, color, and contrast provide the picture controls that
+ are akin to a conventional television. Whiteness provides additional
+ control for greyscale images. All of these values are scaled between 0-65535
+ and have 32768 as the mid point setting. The scaling means that applications
+ do not have to worry about the capability range of the hardware but can let
+ it make a best effort attempt.
+ </para>
+ <para>
+ Our depth is 24, as this is in bits. We will be returning RGB24 format. This
+ has one byte of red, then one of green, then one of blue. This then repeats
+ for every other pixel in the image. The other common formats the interface
+ defines are
+ </para>
+ <table frame="all"><title>Framebuffer Encodings</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>GREY</entry><entry>Linear greyscale. This is for simple cameras and the
+ like</entry>
+ </row><row>
+ <entry>RGB565</entry><entry>The top 5 bits hold 32 red levels, the next six bits
+ hold green and the low 5 bits hold blue. </entry>
+ </row><row>
+ <entry>RGB555</entry><entry>The top bit is clear. The red green and blue levels
+ each occupy five bits.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ Additional modes are supported for YUV capture formats. These are common for
+ TV and video conferencing applications.
+ </para>
+ <para>
+ The VIDIOCSPICT ioctl allows a user to set some of the picture parameters.
+ Exactly which ones are supported depends heavily on the card itself. It is
+ possible to support many modes and effects in software. In general doing
+ this in the kernel is a bad idea. Video capture is a performance-sensitive
+ application and the programs can often do better if they aren't being
+ 'helped' by an overkeen driver writer. Thus for our device we will report
+ RGB24 only and refuse to allow a change.
+ </para>
+ <programlisting>
+
+
+ case VIDIOCSPICT:
+ {
+ struct video_picture v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ /* Reject format changes before touching any hardware setting */
+ if(v.depth!=24 ||
+ v.palette != VIDEO_PALETTE_RGB24)
+ return -EINVAL;
+ set_hardware_brightness(v.brightness);
+ set_hardware_hue(v.hue);
+ set_hardware_saturation(v.colour);
+ set_hardware_contrast(v.contrast);
+ return 0;
+ }
+
+
+ </programlisting>
+ <para>
+ We check the user has not tried to change the palette or the depth. We do
+ not want to carry out some of the changes and then return an error. This may
+ confuse the application which will be assuming no change occurred.
+ </para>
+ <para>
+ In much the same way as you need to be able to set the picture controls to
+ get the right capture images, many cards need to know what they are
+ displaying onto when generating overlay output. In some cases getting this
+ wrong even makes a nasty mess or may crash the computer. For that reason
+ the VIDIOCSFBUF ioctl used to set up the frame buffer information may well
+ only be usable by root.
+ </para>
+ <para>
+ We will assume our card is one of the old ISA devices with feature connector
+ and only supports a couple of standard video modes. This is very common for
+ older cards, although the PCI devices are way smarter than this.
+ </para>
+ <programlisting>
+
+
+static struct video_buffer capture_fb;
+
+ case VIDIOCGFBUF:
+ {
+ if(copy_to_user(arg, &amp;capture_fb,
+ sizeof(capture_fb)))
+ return -EFAULT;
+ return 0;
+
+ }
+
+
+ </programlisting>
+ <para>
+ We keep the frame buffer information in the format the ioctl uses. This
+ makes it nice and easy to work with in the ioctl calls.
+ </para>
+ <programlisting>
+
+ case VIDIOCSFBUF:
+ {
+ struct video_buffer v;
+
+ if(!capable(CAP_SYS_ADMIN))
+ return -EPERM;
+
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.width!=320 &amp;&amp; v.width!=640)
+ return -EINVAL;
+ if(v.height!=200 &amp;&amp; v.height!=240
+ &amp;&amp; v.height!=400
+ &amp;&amp; v.height !=480)
+ return -EINVAL;
+ memcpy(&amp;capture_fb, &amp;v, sizeof(v));
+ hardware_set_fb(&amp;v);
+ return 0;
+ }
+
+
+
+ </programlisting>
+ <para>
+ The capable() function checks a user has the required capability. The Linux
+ operating system has a set of about 30 capabilities indicating privileged
+ access to services. The default set up gives the superuser (uid 0) all of
+ them and nobody else has any.
+ </para>
+ <para>
+ We check that the user has the SYS_ADMIN capability, that is they are
+ allowed to operate as the machine administrator. We don't want anyone but
+ the administrator making a mess of the display.
+ </para>
+ <para>
+ Next we check for standard PC video modes (320 or 640 wide with either
+ EGA or VGA depths). If the mode is not a standard video mode we reject it as
+ not supported by our card. If the mode is acceptable we save it so that
+ VIDIOCGFBUF will give the right answer next time it is called. The
+ hardware_set_fb() function is some undescribed card specific function to
+ program the card for the desired mode.
+ </para>
+ <para>
+ Before the driver can display an overlay window it needs to know where the
+ window should be placed, and also how large it should be. If the card
+ supports clipping it needs to know which rectangles to omit from the
+ display. The video_window structure is used to describe the way the image
+ should be displayed.
+ </para>
+ <table frame="all"><title>struct video_window fields</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>width</entry><entry>The width in pixels of the desired image. The card
+ may use a smaller size if this size is not available</entry>
+ </row><row>
+ <entry>height</entry><entry>The height of the image. The card may use a smaller
+ size if this size is not available.</entry>
+ </row><row>
+ <entry>x</entry><entry> The X position of the top left of the window. This
+ is in pixels relative to the left hand edge of the
+ picture. Not all cards can display images aligned on
+ any pixel boundary. If the position is unsuitable
+ the card adjusts the image right and reduces the
+ width.</entry>
+ </row><row>
+ <entry>y</entry><entry> The Y position of the top left of the window. This
+ is counted in pixels relative to the top edge of the
+ picture. As with the width if the card cannot
+ display starting on this line it will adjust the
+ values.</entry>
+ </row><row>
+ <entry>chromakey</entry><entry>The colour (expressed in RGB32 format) for the
+ chromakey colour if chroma keying is being used. </entry>
+ </row><row>
+ <entry>clips</entry><entry>An array of rectangles that must not be drawn
+ over.</entry>
+ </row><row>
+ <entry>clipcount</entry><entry>The number of clips in this array.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ Each clip is a struct video_clip which has the following fields
+ </para>
+ <table frame="all"><title>video_clip fields</title>
+ <tgroup cols="2" align="left">
+ <tbody>
+ <row>
+ <entry>x, y</entry><entry>Co-ordinates relative to the display</entry>
+ </row><row>
+ <entry>width, height</entry><entry>Width and height in pixels</entry>
+ </row><row>
+ <entry>next</entry><entry>A spare field for the application to use</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ <para>
+ The driver is required to ensure it always draws in the area requested or a smaller area, and that it never draws in any of the areas that are clipped.
+ This may well mean it has to leave alone small areas the application wished to be
+ drawn.
+ </para>
+ <para>
+ Our example card uses chromakey so does not have to address most of the
+ clipping. We will add a video_window structure to our global variables to
+ remember our parameters, as we did with the frame buffer.
+ </para>
+ <programlisting>
+
+
+ case VIDIOCGWIN:
+ {
+ if(copy_to_user(arg, &amp;capture_win,
+ sizeof(capture_win)))
+ return -EFAULT;
+ return 0;
+ }
+
+
+ case VIDIOCSWIN:
+ {
+ struct video_window v;
+ if(copy_from_user(&amp;v, arg, sizeof(v)))
+ return -EFAULT;
+ if(v.width &gt; 640 || v.height &gt; 480)
+ return -EINVAL;
+ if(v.width &lt; 16 || v.height &lt; 16)
+ return -EINVAL;
+ hardware_set_key(v.chromakey);
+ hardware_set_window(v);
+ memcpy(&amp;capture_win, &amp;v, sizeof(v));
+ capture_w = v.width;
+ capture_h = v.height;
+ return 0;
+ }
+
+
+ </programlisting>
+ <para>
+ Because we are using Chromakey our setup is fairly simple. Mostly we have to
+ check the values are sane and load them into the capture card.
+ </para>
+ <para>
+ With all the setup done we can now turn on the actual capture/overlay. This
+ is done with the VIDIOCCAPTURE ioctl. This takes a single integer argument
+ where 0 is off and any non-zero value is on.
+ </para>
+ <programlisting>
+
+
+ case VIDIOCCAPTURE:
+ {
+ int v;
+ if(get_user(v, (int *)arg))
+ return -EFAULT;
+ if(v==0)
+ hardware_capture_off();
+ else
+ {
+ if(capture_fb.width == 0
+ || capture_w == 0)
+ return -EINVAL;
+ hardware_capture_on();
+ }
+ return 0;
+ }
+
+
+ </programlisting>
+ <para>
+ We grab the flag from user space and either enable or disable according to
+ its value. There is one small corner case we have to consider here. Suppose
+ that the capture was requested before the video window or the frame buffer
+ had been set up. In those cases there will be unconfigured fields in our
+ card data, as well as unconfigured hardware settings. We check for this case and
+ return an error if the frame buffer or the capture window width is zero.
+ </para>
+ <programlisting>
+
+
+ default:
+ return -ENOIOCTLCMD;
+ }
+}
+ </programlisting>
+ <para>
+
+ We don't need to support any other ioctls, so if we get this far, it is time
+ to tell the video layer that we don't know what the user is talking about.
+ </para>
+ </sect1>
+ <sect1 id="endvid">
+ <title>Other Functionality</title>
+ <para>
+ The Video4Linux layer supports additional features, including a high
+ performance mmap() based capture mode and capturing part of the image.
+ These features are out of the scope of the book. You should however have enough
+ example code to implement most simple video4linux devices for radio and TV
+ cards.
+ </para>
+ </sect1>
+ </chapter>
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ <variablelist>
+ <varlistentry><term>Multiple Opens</term>
+ <listitem>
+ <para>
+ The driver assumes multiple opens should not be allowed. A driver
+ can work around this but not cleanly.
+ </para>
+ </listitem></varlistentry>
+
+ <varlistentry><term>API Deficiencies</term>
+ <listitem>
+ <para>
+ The existing API poorly reflects compression capable devices. There
+ are plans afoot to merge V4L, V4L2 and some other ideas into a
+ better interface.
+ </para>
+ </listitem></varlistentry>
+ </variablelist>
+
+ </para>
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Edrivers/media/video/videodev.c
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/wanbook.tmpl b/Documentation/DocBook/wanbook.tmpl
new file mode 100644
index 000000000000..9eebcc304de4
--- /dev/null
+++ b/Documentation/DocBook/wanbook.tmpl
@@ -0,0 +1,99 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="WANGuide">
+ <bookinfo>
+ <title>Synchronous PPP and Cisco HDLC Programming Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@redhat.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2000</year>
+ <holder>Alan Cox</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The syncppp drivers in Linux provide a fairly complete
+ implementation of Cisco HDLC and a minimal implementation of
+ PPP. The longer term goal is to switch the PPP layer to the
+ generic PPP interface that is new in Linux 2.3.x. The API should
+ remain unchanged when this is done, but support will then be
+ available for IPX, compression and other PPP features.
+ </para>
+ </chapter>
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ <variablelist>
+ <varlistentry><term>PPP is minimal</term>
+ <listitem>
+ <para>
+ The current PPP implementation is very basic, although sufficient
+ for most WAN usages.
+ </para>
+ </listitem></varlistentry>
+
+ <varlistentry><term>Cisco HDLC Quirks</term>
+ <listitem>
+ <para>
+ Currently we do not send all packets with the correct Cisco multicast
+ or unicast flags. Nothing appears to mind too much but this should
+ be corrected.
+ </para>
+ </listitem></varlistentry>
+ </variablelist>
+
+ </para>
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Edrivers/net/wan/syncppp.c
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/writing_usb_driver.tmpl b/Documentation/DocBook/writing_usb_driver.tmpl
new file mode 100644
index 000000000000..51f3bfb6fb6e
--- /dev/null
+++ b/Documentation/DocBook/writing_usb_driver.tmpl
@@ -0,0 +1,419 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="USBDeviceDriver">
+ <bookinfo>
+ <title>Writing USB Device Drivers</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Greg</firstname>
+ <surname>Kroah-Hartman</surname>
+ <affiliation>
+ <address>
+ <email>greg@kroah.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2001-2002</year>
+ <holder>Greg Kroah-Hartman</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+
+ <para>
+ This documentation is based on an article published in
+ Linux Journal Magazine, October 2001, Issue 90.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The Linux USB subsystem has grown from supporting only two different
+ types of devices in the 2.2.7 kernel (mice and keyboards), to over 20
+ different types of devices in the 2.4 kernel. Linux currently supports
+ almost all USB class devices (standard types of devices like keyboards,
+ mice, modems, printers and speakers) and an ever-growing number of
+ vendor-specific devices (such as USB to serial converters, digital
+ cameras, Ethernet devices and MP3 players). For a full list of the
+ different USB devices currently supported, see Resources.
+ </para>
+ <para>
+ The remaining kinds of USB devices that do not have support on Linux are
+ almost all vendor-specific devices. Each vendor decides to implement a
+ custom protocol to talk to their device, so a custom driver usually needs
+ to be created. Some vendors are open with their USB protocols and help
+ with the creation of Linux drivers, while others do not publish them, and
+ developers are forced to reverse-engineer. See Resources for some links
+ to handy reverse-engineering tools.
+ </para>
+ <para>
+ Because each different protocol causes a new driver to be created, I have
+ written a generic USB driver skeleton, modeled after the pci-skeleton.c
+ file in the kernel source tree upon which many PCI network drivers have
+ been based. This USB skeleton can be found at drivers/usb/usb-skeleton.c
+ in the kernel source tree. In this article I will walk through the basics
+ of the skeleton driver, explaining the different pieces and what needs to
+ be done to customize it to your specific device.
+ </para>
+ </chapter>
+
+ <chapter id="basics">
+ <title>Linux USB Basics</title>
+ <para>
+ If you are going to write a Linux USB driver, please become familiar with
+ the USB protocol specification. It can be found, along with many other
+ useful documents, at the USB home page (see Resources). An excellent
+ introduction to the Linux USB subsystem can be found at the USB Working
+ Devices List (see Resources). It explains how the Linux USB subsystem is
+ structured and introduces the reader to the concept of USB urbs, which
+ are essential to USB drivers.
+ </para>
+ <para>
+ The first thing a Linux USB driver needs to do is register itself with
+ the Linux USB subsystem, giving it some information about which devices
+ the driver supports and which functions to call when a device supported
+ by the driver is inserted or removed from the system. All of this
+ information is passed to the USB subsystem in the usb_driver structure.
+ The skeleton driver declares a usb_driver as:
+ </para>
+ <programlisting>
+static struct usb_driver skel_driver = {
+ .name = "skeleton",
+ .probe = skel_probe,
+ .disconnect = skel_disconnect,
+ .fops = &amp;skel_fops,
+ .minor = USB_SKEL_MINOR_BASE,
+ .id_table = skel_table,
+};
+ </programlisting>
+ <para>
+ The variable name is a string that describes the driver. It is used in
+ informational messages printed to the system log. The probe and
+ disconnect function pointers are called when a device that matches the
+ information provided in the id_table variable is either seen or removed.
+ </para>
+ <para>
+ The fops and minor variables are optional. Most USB drivers hook into
+ another kernel subsystem, such as the SCSI, network or TTY subsystem.
+ These types of drivers register themselves with the other kernel
+ subsystem, and any user-space interactions are provided through that
+ interface. But for drivers that do not have a matching kernel subsystem,
+ such as MP3 players or scanners, a method of interacting with user space
+ is needed. The USB subsystem provides a way to register a minor device
+ number and a set of file_operations function pointers that enable this
+ user-space interaction. The skeleton driver needs this kind of interface,
+ so it provides a minor starting number and a pointer to its
+ file_operations functions.
+ </para>
+ <para>
+ The USB driver is then registered with a call to usb_register, usually in
+ the driver's init function, as shown here:
+ </para>
+ <programlisting>
+static int __init usb_skel_init(void)
+{
+ int result;
+
+ /* register this driver with the USB subsystem */
+ result = usb_register(&amp;skel_driver);
+ if (result &lt; 0) {
+ err(&quot;usb_register failed for the &quot;__FILE__ &quot;driver.&quot;
+ &quot;Error number %d&quot;, result);
+ return -1;
+ }
+
+ return 0;
+}
+module_init(usb_skel_init);
+ </programlisting>
+ <para>
+ When the driver is unloaded from the system, it needs to unregister
+ itself with the USB subsystem. This is done with the usb_deregister
+ function:
+ </para>
+ <programlisting>
+static void __exit usb_skel_exit(void)
+{
+ /* deregister this driver with the USB subsystem */
+ usb_deregister(&amp;skel_driver);
+}
+module_exit(usb_skel_exit);
+ </programlisting>
+ <para>
+ To enable the linux-hotplug system to load the driver automatically when
+ the device is plugged in, you need to create a MODULE_DEVICE_TABLE. The
+ following code tells the hotplug scripts that this module supports a
+ single device with a specific vendor and product ID:
+ </para>
+ <programlisting>
+/* table of devices that work with this driver */
+static struct usb_device_id skel_table [] = {
+ { USB_DEVICE(USB_SKEL_VENDOR_ID, USB_SKEL_PRODUCT_ID) },
+ { } /* Terminating entry */
+};
+MODULE_DEVICE_TABLE (usb, skel_table);
+ </programlisting>
+ <para>
+ There are other macros that can be used in describing a usb_device_id for
+ drivers that support a whole class of USB devices. See usb.h for more
+ information on this.
+ </para>
+ </chapter>
+
+ <chapter id="device">
+ <title>Device operation</title>
+ <para>
+ When a device is plugged into the USB bus that matches the device ID
+ pattern that your driver registered with the USB core, the probe function
+ is called. A pointer to the usb_interface structure and a pointer to the
+ matching usb_device_id entry are passed to the function:
+ </para>
+ <programlisting>
+static int skel_probe(struct usb_interface *interface,
+ const struct usb_device_id *id)
+ </programlisting>
+ <para>
+ The driver now needs to verify that this device is actually one that it
+ can accept. If so, it returns 0.
+ If not, or if any error occurs during initialization, an errorcode
+ (such as <literal>-ENOMEM</literal> or <literal>-ENODEV</literal>)
+ is returned from the probe function.
+ </para>
+ <para>
+ In the skeleton driver, we determine what end points are marked as bulk-in
+ and bulk-out. We create buffers to hold the data that will be sent and
+ received from the device, and a USB urb to write data to the device is
+ initialized.
+ </para>
+ <para>
+ Conversely, when the device is removed from the USB bus, the disconnect
+ function is called with the device pointer. The driver needs to clean any
+ private data that has been allocated at this time and to shut down any
+ pending urbs that are in the USB system. The driver also unregisters
+ itself from the devfs subsystem with the call:
+ </para>
+ <programlisting>
+/* remove our devfs node */
+devfs_unregister(skel->devfs);
+ </programlisting>
+ <para>
+ Now that the device is plugged into the system and the driver is bound to
+ the device, any of the functions in the file_operations structure that
+ were passed to the USB subsystem will be called from a user program trying
+ to talk to the device. The first function called will be open, as the
+ program tries to open the device for I/O. We increment our private usage
+ count and save off a pointer to our internal structure in the file
+ structure. This is done so that future calls to file operations will
+ enable the driver to determine which device the user is addressing. All
+ of this is done with the following code:
+ </para>
+ <programlisting>
+/* increment our usage count for the module */
+++skel->open_count;
+
+/* save our object in the file's private structure */
+file->private_data = skel;
+ </programlisting>
+ <para>
+ After the open function is called, the read and write functions are called
+ to receive and send data to the device. In the skel_write function, we
+ receive a pointer to some data that the user wants to send to the device
+ and the size of the data. The function determines how much data it can
+ send to the device based on the size of the write urb it has created (this
+ size depends on the size of the bulk out end point that the device has).
+ Then it copies the data from user space to kernel space, points the urb to
+ the data and submits the urb to the USB subsystem. This can be shown in
+ the following code:
+ </para>
+ <programlisting>
+/* we can only write as much as 1 urb will hold */
+bytes_written = (count > skel->bulk_out_size) ? skel->bulk_out_size : count;
+
+/* copy the data from user space into our urb */
+copy_from_user(skel->write_urb->transfer_buffer, buffer, bytes_written);
+
+/* set up our urb */
+usb_fill_bulk_urb(skel->write_urb,
+ skel->dev,
+ usb_sndbulkpipe(skel->dev, skel->bulk_out_endpointAddr),
+ skel->write_urb->transfer_buffer,
+ bytes_written,
+ skel_write_bulk_callback,
+ skel);
+
+/* send the data out the bulk port */
+result = usb_submit_urb(skel->write_urb);
+if (result) {
+ err(&quot;Failed submitting write urb, error %d&quot;, result);
+}
+ </programlisting>
+ <para>
+ When the write urb is filled up with the proper information using the
+ usb_fill_bulk_urb function, we point the urb's completion callback to call our
+ own skel_write_bulk_callback function. This function is called when the
+ urb is finished by the USB subsystem. The callback function is called in
+ interrupt context, so caution must be taken not to do very much processing
+ at that time. Our implementation of skel_write_bulk_callback merely
+ reports if the urb was completed successfully or not and then returns.
+ </para>
+ <para>
+ The read function works a bit differently from the write function in that
+ we do not use an urb to transfer data from the device to the driver.
+ Instead we call the usb_bulk_msg function, which can be used to send or
+ receive data from a device without having to create urbs and handle
+ urb completion callback functions. We call the usb_bulk_msg function,
+ giving it a buffer into which to place any data received from the device
+ and a timeout value. If the timeout period expires without receiving any
+ data from the device, the function will fail and return an error message.
+ This can be shown with the following code:
+ </para>
+ <programlisting>
+/* do an immediate bulk read to get data from the device */
+retval = usb_bulk_msg (skel->dev,
+ usb_rcvbulkpipe (skel->dev,
+ skel->bulk_in_endpointAddr),
+ skel->bulk_in_buffer,
+ skel->bulk_in_size,
+ &amp;count, HZ*10);
+/* if the read was successful, copy the data to user space */
+if (!retval) {
+ if (copy_to_user (buffer, skel->bulk_in_buffer, count))
+ retval = -EFAULT;
+ else
+ retval = count;
+}
+ </programlisting>
+ <para>
+ The usb_bulk_msg function can be very useful for doing single reads or
+ writes to a device; however, if you need to read or write constantly to a
+ device, it is recommended to set up your own urbs and submit them to the
+ USB subsystem.
+ </para>
+ <para>
+ When the user program releases the file handle that it has been using to
+ talk to the device, the release function in the driver is called. In this
+ function we decrement our private usage count and wait for possible
+ pending writes:
+ </para>
+ <programlisting>
+/* decrement our usage count for the device */
+--skel->open_count;
+ </programlisting>
+ <para>
+ One of the more difficult problems that USB drivers must be able to handle
+ smoothly is the fact that the USB device may be removed from the system at
+ any point in time, even if a program is currently talking to it. It needs
+ to be able to shut down any current reads and writes and notify the
+ user-space programs that the device is no longer there. The following
+ code (function <function>skel_delete</function>)
+ is an example of how to do this: </para>
+ <programlisting>
+static inline void skel_delete (struct usb_skel *dev)
+{
+ if (dev->bulk_in_buffer != NULL)
+ kfree (dev->bulk_in_buffer);
+ if (dev->bulk_out_buffer != NULL)
+ usb_buffer_free (dev->udev, dev->bulk_out_size,
+ dev->bulk_out_buffer,
+ dev->write_urb->transfer_dma);
+ if (dev->write_urb != NULL)
+ usb_free_urb (dev->write_urb);
+ kfree (dev);
+}
+ </programlisting>
+ <para>
+ If a program currently has an open handle to the device, we reset the flag
+ <literal>device_present</literal>. For
+ every read, write, release and other functions that expect a device to be
+ present, the driver first checks this flag to see if the device is
+ still present. If not, it realizes that the device has disappeared, and a
+ -ENODEV error is returned to the user-space program. When the release
+ function is eventually called, it determines whether the device is still
+ present and, if it is not, it does the cleanup that the skel_disconnect
+ function normally does if there are no open files on the device (see
+ Listing 5).
+ </para>
+ </chapter>
+
+ <chapter id="iso">
+ <title>Isochronous Data</title>
+ <para>
+ This usb-skeleton driver does not have any examples of interrupt or
+ isochronous data being sent to or from the device. Interrupt data is sent
+ almost exactly as bulk data is, with a few minor exceptions. Isochronous
+ data works differently with continuous streams of data being sent to or
+ from the device. The audio and video camera drivers are very good examples
+ of drivers that handle isochronous data and will be useful if you also
+ need to do this.
+ </para>
+ </chapter>
+
+ <chapter id="Conclusion">
+ <title>Conclusion</title>
+ <para>
+ Writing Linux USB device drivers is not a difficult task as the
+ usb-skeleton driver shows. This driver, combined with the other current
+ USB drivers, should provide enough examples to help a beginning author
+ create a working driver in a minimal amount of time. The linux-usb-devel
+ mailing list archives also contain a lot of helpful information.
+ </para>
+ </chapter>
+
+ <chapter id="resources">
+ <title>Resources</title>
+ <para>
+ The Linux USB Project: <ulink url="http://www.linux-usb.org">http://www.linux-usb.org/</ulink>
+ </para>
+ <para>
+ Linux Hotplug Project: <ulink url="http://linux-hotplug.sourceforge.net">http://linux-hotplug.sourceforge.net/</ulink>
+ </para>
+ <para>
+ Linux USB Working Devices List: <ulink url="http://www.qbik.ch/usb/devices">http://www.qbik.ch/usb/devices/</ulink>
+ </para>
+ <para>
+ linux-usb-devel Mailing List Archives: <ulink url="http://marc.theaimsgroup.com/?l=linux-usb-devel">http://marc.theaimsgroup.com/?l=linux-usb-devel</ulink>
+ </para>
+ <para>
+ Programming Guide for Linux USB Device Drivers: <ulink url="http://usb.cs.tum.edu/usbdoc">http://usb.cs.tum.edu/usbdoc</ulink>
+ </para>
+ <para>
+ USB Home Page: <ulink url="http://www.usb.org">http://www.usb.org</ulink>
+ </para>
+ </chapter>
+
+</book>
diff --git a/Documentation/DocBook/z8530book.tmpl b/Documentation/DocBook/z8530book.tmpl
new file mode 100644
index 000000000000..a507876447aa
--- /dev/null
+++ b/Documentation/DocBook/z8530book.tmpl
@@ -0,0 +1,385 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+ "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Z85230Guide">
+ <bookinfo>
+ <title>Z8530 Programming Guide</title>
+
+ <authorgroup>
+ <author>
+ <firstname>Alan</firstname>
+ <surname>Cox</surname>
+ <affiliation>
+ <address>
+ <email>alan@redhat.com</email>
+ </address>
+ </affiliation>
+ </author>
+ </authorgroup>
+
+ <copyright>
+ <year>2000</year>
+ <holder>Alan Cox</holder>
+ </copyright>
+
+ <legalnotice>
+ <para>
+ This documentation is free software; you can redistribute
+ it and/or modify it under the terms of the GNU General Public
+ License as published by the Free Software Foundation; either
+ version 2 of the License, or (at your option) any later
+ version.
+ </para>
+
+ <para>
+ This program is distributed in the hope that it will be
+ useful, but WITHOUT ANY WARRANTY; without even the implied
+ warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ See the GNU General Public License for more details.
+ </para>
+
+ <para>
+ You should have received a copy of the GNU General Public
+ License along with this program; if not, write to the Free
+ Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+ MA 02111-1307 USA
+ </para>
+
+ <para>
+ For more details see the file COPYING in the source
+ distribution of Linux.
+ </para>
+ </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+ <chapter id="intro">
+ <title>Introduction</title>
+ <para>
+ The Z85x30 family synchronous/asynchronous controller chips are
+ used on a large number of cheap network interface cards. The
+ kernel provides a core interface layer that is designed to make
+ it easy to provide WAN services using this chip.
+ </para>
+ <para>
+ The current driver only supports synchronous operation. Merging the
+ asynchronous driver support into this code to allow any Z85x30
+ device to be used as both a tty interface and as a synchronous
+ controller is a project for Linux post the 2.4 release.
+ </para>
+ <para>
+ The support code handles most common card configurations and
+ supports running both Cisco HDLC and Synchronous PPP. With extra
+ glue the frame relay and X.25 protocols can also be used with this
+ driver.
+ </para>
+ </chapter>
+
+ <chapter>
+ <title>Driver Modes</title>
+ <para>
+ The Z85230 driver layer can drive Z8530, Z85C30 and Z85230 devices
+ in three different modes. Each mode can be applied to an individual
+ channel on the chip (each chip has two channels).
+ </para>
+ <para>
+ The PIO synchronous mode supports the most common Z8530 wiring. Here
+ the chip is interfaced to the I/O and interrupt facilities of the
+ host machine but not to the DMA subsystem. When running PIO the
+ Z8530 has extremely tight timing requirements. Doing high speeds,
+ even with a Z85230 will be tricky. Typically you should expect to
+ achieve at best 9600 baud with a Z8C530 and 64Kbits with a Z85230.
+ </para>
+ <para>
+ The DMA mode supports the chip when it is configured to use dual DMA
+ channels on an ISA bus. The better cards tend to support this mode
+ of operation for a single channel. With DMA running the Z85230 tops
+ out when it starts to hit ISA DMA constraints at about 512Kbits. It
+ is worth noting here that many PC machines hang or crash when the
+ chip is driven fast enough to hold the ISA bus solid.
+ </para>
+ <para>
+ Transmit DMA mode uses a single DMA channel. The DMA channel is used
+ for transmission as the transmit FIFO is smaller than the receive
+ FIFO. It gives better performance than pure PIO mode but is nowhere
+ near as ideal as pure DMA mode.
+ </para>
+ </chapter>
+
+ <chapter>
+ <title>Using the Z85230 driver</title>
+ <para>
+ The Z85230 driver provides the back end interface to your board. To
+ configure a Z8530 interface you need to detect the board and to
+ identify its ports and interrupt resources. It is also your problem
+ to verify the resources are available.
+ </para>
+ <para>
+ Having identified the chip you need to fill in a struct z8530_dev,
+ which describes each chip. This object must exist until you finally
+ shut down the board. Firstly zero the active field. This ensures
+ nothing goes off without you intending it. The irq field should
+ be set to the interrupt number of the chip. (Each chip has a single
+ interrupt source rather than each channel). You are responsible
+ for allocating the interrupt line. The interrupt handler should be
+ set to <function>z8530_interrupt</function>. The device id should
+ be set to the z8530_dev structure pointer. Whether the interrupt can
+ be shared or not is board dependent, and up to you to initialise.
+ </para>
+ <para>
+ The structure holds two channel structures.
+ Initialise chanA.ctrlio and chanA.dataio with the address of the
+ control and data ports. You can OR this with Z8530_PORT_SLEEP to
+ indicate your interface needs the 5uS delay for chip settling done
+ in software. The PORT_SLEEP option is architecture specific. Other
+ flags may become available on future platforms, e.g. for MMIO.
+ Initialise the chanA.irqs to &amp;z8530_nop to start the chip up
+ as disabled and discarding interrupt events. This ensures that
+ stray interrupts will be mopped up and not hang the bus. Set
+ chanA.dev to point to the device structure itself. The
+ private and name field you may use as you wish. The private field
+ is unused by the Z85230 layer. The name is used for error reporting
+ and it may thus make sense to make it match the network name.
+ </para>
+ <para>
+ Repeat the same operation with the B channel if your chip has
+ both channels wired to something useful. This isn't always the
+ case. If it is not wired then the I/O values do not matter, but
+ you must initialise chanB.dev.
+ </para>
+ <para>
+ If your board has DMA facilities then initialise the txdma and
+ rxdma fields for the relevant channels. You must also allocate the
+ ISA DMA channels and do any necessary board level initialisation
+ to configure them. The low level driver will do the Z8530 and
+ DMA controller programming but not board specific magic.
+ </para>
+ <para>
+ Having initialised the device you can then call
+ <function>z8530_init</function>. This will probe the chip and
+ reset it into a known state. An identification sequence is then
+ run to identify the chip type. If the checks fail to pass the
+ function returns a non zero error code. Typically this indicates
+ that the port given is not valid. After this call the
+ type field of the z8530_dev structure is initialised to either
+ Z8530, Z85C30 or Z85230 according to the chip found.
+ </para>
+ <para>
+ Once you have called z8530_init you can also make use of the utility
+ function <function>z8530_describe</function>. This provides a
+ consistent reporting format for the Z8530 devices, and allows all
+ the drivers to provide consistent reporting.
+ </para>
+ </chapter>
+
+ <chapter>
+ <title>Attaching Network Interfaces</title>
+ <para>
+ If you wish to use the network interface facilities of the driver,
+ then you need to attach a network device to each channel that is
+ present and in use. In addition, to use SyncPPP and Cisco HDLC
+ you need to follow some additional plumbing rules. They may seem
+ complex but a look at the example hostess_sv11 driver should
+ reassure you.
+ </para>
+ <para>
+ The network device used for each channel should be pointed to by
+ the netdevice field of each channel. The dev-&gt;priv field of the
+ network device points to your private data - you will need to be
+ able to find your ppp device from this. In addition, to use the
+ sync ppp layer the private data must start with a void * pointer
+ to the syncppp structures.
+ </para>
+ <para>
+ The way most drivers approach this particular problem is to
+ create a structure holding the Z8530 device definition and
+ put that and the syncppp pointer into the private field of
+ the network device. The network device fields of the channels
+ then point back to the network devices. The ppp_device can also
+ be put in the private structure conveniently.
+ </para>
+ <para>
+ If you wish to use the synchronous ppp then you need to attach
+ the syncppp layer to the network device. You should do this before
+ you register the network device. The
+ <function>sppp_attach</function> requires that the first void *
+ pointer in your private data is pointing to an empty struct
+ ppp_device. The function fills in the initial data for the
+ ppp/hdlc layer.
+ </para>
+ <para>
+ Before you register your network device you will also need to
+ provide suitable handlers for most of the network device callbacks.
+ See the network device documentation for more details on this.
+ </para>
+ </chapter>
+
+ <chapter>
+ <title>Configuring And Activating The Port</title>
+ <para>
+ The Z85230 driver provides helper functions and tables to load the
+ port registers on the Z8530 chips. When programming the register
+ settings for a channel be aware that the documentation recommends
+ initialisation orders. Strange things happen when these are not
+ followed.
+ </para>
+ <para>
+ <function>z8530_channel_load</function> takes an array of
+ pairs of initialisation values in an array of u8 type. The first
+ value is the Z8530 register number. Add 16 to indicate the alternate
+ register bank on the later chips. The array is terminated by a 255.
+ </para>
+ <para>
+ The driver provides a pair of public tables. The
+ z8530_hdlc_kilostream table is for the UK 'Kilostream' service and
+ also happens to cover most other end host configurations. The
+ z8530_hdlc_kilostream_85230 table is the same configuration using
+ the enhancements of the 85230 chip. The configuration loaded is
+ standard NRZ encoded synchronous data with HDLC bitstuffing. All
+ of the timing is taken from the other end of the link.
+ </para>
+ <para>
+ When writing your own tables be aware that the driver internally
+ tracks register values. It may need to reload values. You should
+ therefore be sure to set registers 1-7, 9-11, 14 and 15 in all
+ configurations. Where the register settings depend on DMA selection
+ the driver will update the bits itself when you open or close.
+ Loading a new table with the interface open is not recommended.
+ </para>
+ <para>
+ There are three standard configurations supported by the core
+ code. In PIO mode the interface is programmed up to use
+ interrupt driven PIO. This places high demands on the host processor
+ to avoid latency. The driver is written to take account of latency
+ issues but it cannot avoid latencies caused by other drivers,
+ notably IDE in PIO mode. Because the drivers allocate buffers you
+ must also prevent MTU changes while the port is open.
+ </para>
+ <para>
+ Once the port is open it will call the rx_function of each channel
+ whenever a completed packet arrives. This is invoked from
+ interrupt context and passes you the channel and a network
+ buffer (struct sk_buff) holding the data. The data includes
+ the CRC bytes so most users will want to trim the last two
+ bytes before processing the data. This function is very timing
+ critical. When you wish to simply discard data the support
+ code provides the function <function>z8530_null_rx</function>
+ to discard the data.
+ </para>
+ <para>
+ To activate PIO mode sending and receiving,
+ <function>z8530_sync_open</function> is called. This expects to be passed
+ the network device and the channel. Typically this is called from
+ your network device open callback. On a failure a non zero error
+ status is returned. The <function>z8530_sync_close</function>
+ function shuts down a PIO channel. This must be done before the
+ channel is opened again and before the driver shuts down
+ and unloads.
+ </para>
+ <para>
+ The ideal mode of operation is dual channel DMA mode. Here the
+ kernel driver will configure the board for DMA in both directions.
+ The driver also handles ISA DMA issues such as controller
+ programming and the memory range limit for you. This mode is
+ activated by calling the <function>z8530_sync_dma_open</function>
+ function. On failure a non zero error value is returned.
+ Once this mode is activated it can be shut down by calling the
+ <function>z8530_sync_dma_close</function>. You must call the close
+ function matching the open mode you used.
+ </para>
+ <para>
+ The final supported mode uses a single DMA channel to drive the
+ transmit side. As the Z85C30 has a larger FIFO on the receive
+ channel this tends to increase the maximum speed a little.
+ This is activated by calling the <function>z8530_sync_txdma_open
+ </function>. This returns a non zero error code on failure. The
+ <function>z8530_sync_txdma_close</function> function closes down
+ the Z8530 interface from this mode.
+ </para>
+ </chapter>
+
+ <chapter>
+ <title>Network Layer Functions</title>
+ <para>
+ The Z8530 layer provides functions to queue packets for
+ transmission. The driver internally buffers the frame currently
+ being transmitted and one further frame (in order to keep back
+ to back transmission running). Any further buffering is up to
+ the caller.
+ </para>
+ <para>
+ The function <function>z8530_queue_xmit</function> takes a network
+ buffer in sk_buff format and queues it for transmission. The
+ caller must provide the entire packet with the exception of the
+ bitstuffing and CRC. This is normally done by the caller via
+ the syncppp interface layer. It returns 0 if the buffer has been
+ queued and non zero values for queue full. If the function accepts
+ the buffer it becomes property of the Z8530 layer and the caller
+ should not free it.
+ </para>
+ <para>
+ The function <function>z8530_get_stats</function> returns a pointer
+ to an internally maintained per interface statistics block. This
+ provides most of the interface code needed to implement the network
+ layer get_stats callback.
+ </para>
+ </chapter>
+
+ <chapter>
+ <title>Porting The Z8530 Driver</title>
+ <para>
+ The Z8530 driver is written to be portable. In DMA mode it makes
+ assumptions about the use of ISA DMA. These are probably warranted
+ in most cases as the Z85230 in particular was designed to glue to PC
+ type machines. The PIO mode makes no real assumptions.
+ </para>
+ <para>
+ Should you need to retarget the Z8530 driver to another architecture
+ the only code that should need changing are the port I/O functions.
+ At the moment these assume PC I/O port accesses. This may not be
+ appropriate for all platforms. Replacing
+ <function>z8530_read_port</function> and <function>z8530_write_port
+ </function> is intended to be all that is required to port this
+ driver layer.
+ </para>
+ </chapter>
+
+ <chapter id="bugs">
+ <title>Known Bugs And Assumptions</title>
+ <para>
+ <variablelist>
+ <varlistentry><term>Interrupt Locking</term>
+ <listitem>
+ <para>
+ The locking in the driver is done via the global cli/sti lock. This
+ makes for relatively poor SMP performance. Switching this to use a
+ per device spin lock would probably materially improve performance.
+ </para>
+ </listitem></varlistentry>
+
+ <varlistentry><term>Occasional Failures</term>
+ <listitem>
+ <para>
+ We have reports of occasional failures when run for very long
+ periods of time and the driver starts to receive junk frames. At
+ the moment the cause of this is not clear.
+ </para>
+ </listitem></varlistentry>
+ </variablelist>
+
+ </para>
+ </chapter>
+
+ <chapter id="pubfunctions">
+ <title>Public Functions Provided</title>
+!Edrivers/net/wan/z85230.c
+ </chapter>
+
+ <chapter id="intfunctions">
+ <title>Internal Functions</title>
+!Idrivers/net/wan/z85230.c
+ </chapter>
+
+</book>