Initial commit.

git-svn-id: file:///home/svnrepos/software/liblbfgs/trunk@2 ecf4c44f-38d1-4fa4-9757-a0b4dd0349fc
2008-04-24 15:04:59 +00:00
parent 3166ac548b
commit 85c2de4384
16 changed files with 3959 additions and 0 deletions
--- a/1
+++ b/1
@@ -0,0 +1 @@
+Naoaki Okazaki <okazaki at chokkan org>
--- a/22
+++ b/22
@@ -0,0 +1,22 @@
+The MIT License
+
+Copyright (c) 1990 Jorge Nocedal
+Copyright (c) 2007 Naoaki Okazaki
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
--- a/29
+++ b/29
@@ -0,0 +1,29 @@
+2008-03-05  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* CRFsuite 0.4 (the first public release):
+	- Website and documentation for CRFsuite.
+	- Tutorial on the CoNLL 2000 chunking shared task.
+	- Performance comparison on the CoNLL 2000 chunking shared task.
+	- Bug fix in L2 regularization.
+	- A number of small improvements for the public release.
+
+
+2007-12-12  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* CRFsuite 0.3 (internal release):
+	- Implemented scaling method for forward/backward algorithm.
+	- Removed the code for computing the forward/backward algorithm in logarithm domain.
+	
+
+2007-11-30  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* CRFsuite 0.2 (internal release):
+	- Orthant-Wise Limited-memory Quasi-Newton (OW-LQN) method for L1 regularization.
+	- Configurable L-BFGS parameters (number of limited memories, epsilon).
+	
+	
+2007-10-29  Naoaki Okazaki  <okazaki at chokkan org>
+
+	* CRFsuite 0.1 (internal release):
+	- Initial release.
+
--- a/Makefile.am
+++ b/Makefile.am
@@ -0,0 +1,32 @@
+# $Id$
+
+docdir = $(prefix)/share/doc/@PACKAGE@
+doc_DATA = README INSTALL COPYING AUTHORS ChangeLog
+
+liblbfgsincludedir = $(includedir)
+liblbfgsinclude_HEADERS = \
+	include/lbfgs.h
+
+EXTRA_DIST = \
+	autogen.sh
+
+lib_LTLIBRARIES = liblbfgs.la
+noinst_PROGRAMS = lbfgssample
+
+liblbfgs_la_SOURCES = \
+	lib/arithmetic_ansi.h \
+	lib/arithmetic_sse_double.h \
+	lib/arithmetic_sse_float.h \
+	lib/lbfgs.c
+
+liblbfgs_la_LDFLAGS = \
+	-no-undefined \
+	-release @VERSION@
+
+lbfgssample_SOURCES = \
+	sample/sample.c
+
+lbfgssample_LDADD = ./liblbfgs.la
+
+AM_CFLAGS = @CFLAGS@
+INCLUDES = @INCLUDES@
--- a/0
+++ b/0
--- a/42
+++ b/42
@@ -0,0 +1,42 @@
+
+           libLBFGS: C library of limited-memory BFGS (L-BFGS)
+
+                                       Copyright (c) 1990, Jorge Nocedal 
+                                       Copyright (c) 2007, Naoaki Okazaki
+
+=========================================================================
+1. Introduction
+=========================================================================
+libLBFGS is a C port of the implementation of Limited-memory
+Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
+The original FORTRAN source code is available at:
+http://www.ece.northwestern.edu/~nocedal/lbfgs.html
+
+The L-BFGS method solves the unconstrained minimization problem:
+    minimize F(x), x = (x1, x2, ..., xN),
+only if the objective function F(x) and its gradient G(x) are computable.
+
+Refer to the libLBFGS web site for more information.
+http://www.chokkan.org/software/liblbfgs/
+
+
+
+=========================================================================
+2. How to build the sample program
+=========================================================================
+[Microsoft Visual Studio 2005]
+Open the solution file "test/lbfgs.sln" and build it.
+
+[GCC]
+$ cd test
+$ ./build.sh
+
+
+
+=========================================================================
+3. License
+=========================================================================
+libLBFGS is distributed under the terms of the MIT license.
+Please refer to COPYING file in the distribution.
+
+$Id$
--- a/autogen.sh
+++ b/autogen.sh
@@ -0,0 +1,38 @@
+#!/bin/sh
+# $Id$
+
+if [ "$1" = "--force" ];  # translate an optional leading --force into the per-tool flag variables below
+then
+    FORCE=--force
+    NOFORCE=
+    FORCE_MISSING=--force-missing  # NOTE(review): assigned but never referenced later in this script
+else
+    FORCE=
+    NOFORCE=--no-force
+    FORCE_MISSING=
+fi
+
+libtoolize --copy $FORCE 2>&1 | sed '/^You should/d' || {  # NOTE(review): || tests the pipeline status, i.e. sed's, so a libtoolize failure is not caught here
+    echo "libtoolize failed!"
+    exit 1
+}
+
+aclocal $FORCE || {  # each remaining step aborts the script with status 1 on failure
+    echo "aclocal failed!"
+    exit 1
+}
+
+autoheader $FORCE || {
+    echo "autoheader failed!"
+    exit 1
+}
+
+automake -a -c $NOFORCE || {  # -a: add missing standard files, -c: copy them rather than symlink
+    echo "automake failed!"
+    exit 1
+}
+
+autoconf $FORCE || {
+    echo "autoconf failed!"
+    exit 1
+}
--- a/configure.in
+++ b/configure.in
@@ -0,0 +1,93 @@
+dnl $Id:$
+dnl
+dnl
+dnl Exported and configured variables:
+dnl CFLAGS
+dnl LDFLAGS
+dnl INCLUDES
+
+
+dnl ------------------------------------------------------------------
+dnl Initialization for autoconf
+dnl ------------------------------------------------------------------
+AC_PREREQ(2.59)
+AC_INIT
+AC_CONFIG_SRCDIR([lib/lbfgs.c])
+
+
+dnl ------------------------------------------------------------------
+dnl Initialization for automake
+dnl ------------------------------------------------------------------
+AM_INIT_AUTOMAKE(liblbfgs, 1.4)
+AC_CONFIG_HEADERS(config.h)
+AM_MAINTAINER_MODE
+
+
+dnl ------------------------------------------------------------------
+dnl Checks for program
+dnl ------------------------------------------------------------------
+AC_PROG_LIBTOOL
+AC_PROG_INSTALL
+AC_PROG_LN_S
+AC_PROG_MAKE_SET
+
+
+dnl ------------------------------------------------------------------
+dnl Initialization for variables
+dnl ------------------------------------------------------------------
+CFLAGS="${ac_save_CFLAGS}"
+LDFLAGS="${ac_save_LDFLAGS}"
+INCLUDES="-I\$(top_srcdir) -I\$(top_srcdir)/include"
+
+
+dnl ------------------------------------------------------------------
+dnl Checks for header files.
+dnl ------------------------------------------------------------------
+AC_HEADER_STDC
+AC_CHECK_HEADERS(fcntl.h limits.h malloc.h strings.h unistd.h stdint.h)
+
+
+dnl ------------------------------------------------------------------
+dnl Checks for debugging mode
+dnl ------------------------------------------------------------------
+AC_ARG_ENABLE(
+  debug,
+  [AS_HELP_STRING([--enable-debug],[Turn on debugging])]
+)
+
+if test "x$enable_debug" = "xyes"; then
+   CFLAGS="-DDEBUG -O -g ${CFLAGS}"
+else
+   CFLAGS="-O3 -ffast-math ${CFLAGS}"
+fi
+
+dnl ------------------------------------------------------------------
+dnl Checks for profiling mode
+dnl ------------------------------------------------------------------
+AC_ARG_ENABLE(
+  profile,
+  [AS_HELP_STRING([--enable-profile],[Turn on profiling])]
+)
+
+if test "x$enable_profile" = "xyes"; then
+   CFLAGS="-DPROFILE -pg ${CFLAGS}"
+fi
+
+
+dnl ------------------------------------------------------------------
+dnl Checks for library functions.
+dnl ------------------------------------------------------------------
+AC_CHECK_LIB(m, fabs)
+
+dnl ------------------------------------------------------------------
+dnl Export variables
+dnl ------------------------------------------------------------------
+AC_SUBST(CFLAGS)
+AC_SUBST(LDFLAGS)
+AC_SUBST(INCLUDES)
+
+dnl ------------------------------------------------------------------
+dnl Output the configure results.
+dnl ------------------------------------------------------------------
+AC_CONFIG_FILES(Makefile)
+AC_OUTPUT
--- a/doc/doxyfile
+++ b/doc/doxyfile
--- a/doc/footer.html
+++ b/doc/footer.html
@@ -0,0 +1,7 @@
+<hr/>
+<div>
+Copyright (c) 2002-2008 by Naoaki Okazaki
+<br /><i>$datetime</i>
+</div>
+</body>
+</html>
--- a/include/lbfgs.h
+++ b/include/lbfgs.h
@@ -0,0 +1,539 @@
+/*
+ *      C library of Limited memory BFGS (L-BFGS).
+ *
+ * Copyright (c) 1990, Jorge Nocedal
+ * Copyright (c) 2007,2008, Naoaki Okazaki
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/* $Id$ */
+
+#ifndef __LBFGS_H__
+#define __LBFGS_H__
+
+#ifdef  __cplusplus
+extern "C" {
+#endif/*__cplusplus*/
+
+/*
+ * The default precision of floating point values is 64bit (double).
+ */
+#ifndef LBFGS_FLOAT
+#define LBFGS_FLOAT     64
+#endif/*LBFGS_FLOAT*/
+
+/*
+ * Activate optimization routines for IEEE754 floating point values.
+ */
+#ifndef LBFGS_IEEE_FLOAT
+#define LBFGS_IEEE_FLOAT    1
+#endif/*LBFGS_IEEE_FLOAT*/
+
+#if     LBFGS_FLOAT == 32
+typedef float lbfgsfloatval_t;
+
+#elif   LBFGS_FLOAT == 64
+typedef double lbfgsfloatval_t;
+
+#else
+#error "liblbfgs supports single (float; LBFGS_FLOAT = 32) or double (double; LBFGS_FLOAT=64) precision only."
+
+#endif
+
+
+/** 
+ * \addtogroup liblbfgs_api libLBFGS API
+ * @{
+ *
+ *  The libLBFGS API.
+ */
+
+/**
+ * Return values of lbfgs().
+ */
+enum {
+    /** False value. */
+    LBFGSFALSE = 0,
+    /** True value. */
+    LBFGSTRUE,
+
+    /** Unknown error. */
+    LBFGSERR_UNKNOWNERROR = -1024,
+    /** Logic error. */
+    LBFGSERR_LOGICERROR,
+    /** Insufficient memory. */
+    LBFGSERR_OUTOFMEMORY,
+    /** The minimization process has been canceled. */
+    LBFGSERR_CANCELED,
+    /** Invalid number of variables specified. */
+    LBFGSERR_INVALID_N,
+    /** Invalid number of variables (for SSE) specified. */
+    LBFGSERR_INVALID_N_SSE,
+    /** Invalid parameter lbfgs_parameter_t::linesearch specified. */
+    LBFGSERR_INVALID_LINESEARCH,
+    /** Invalid parameter lbfgs_parameter_t::min_step specified. */
+    LBFGSERR_INVALID_MINSTEP,
+    /** Invalid parameter lbfgs_parameter_t::max_step specified. */
+    LBFGSERR_INVALID_MAXSTEP,
+    /** Invalid parameter lbfgs_parameter_t::ftol specified. */
+    LBFGSERR_INVALID_FTOL,
+    /** Invalid parameter lbfgs_parameter_t::gtol specified. */
+    LBFGSERR_INVALID_GTOL,
+    /** Invalid parameter lbfgs_parameter_t::xtol specified. */
+    LBFGSERR_INVALID_XTOL,
+    /** Invalid parameter lbfgs_parameter_t::max_linesearch specified. */
+    LBFGSERR_INVALID_MAXLINESEARCH,
+    /** Invalid parameter lbfgs_parameter_t::orthantwise_c specified. */
+    LBFGSERR_INVALID_ORTHANTWISE,
+    /** The line-search step went out of the interval of uncertainty. */
+    LBFGSERR_OUTOFINTERVAL,
+    /** A logic error occurred; alternatively, the interval of uncertainty
+        became too small. */
+    LBFGSERR_INCORRECT_TMINMAX,
+    /** A rounding error occurred; alternatively, no line-search step
+        satisfies the sufficient decrease and curvature conditions. */
+    LBFGSERR_ROUNDING_ERROR,
+    /** The line-search step became smaller than lbfgs_parameter_t::min_step. */
+    LBFGSERR_MINIMUMSTEP,
+    /** The line-search step became larger than lbfgs_parameter_t::max_step. */
+    LBFGSERR_MAXIMUMSTEP,
+    /** The line-search routine reaches the maximum number of evaluations. */
+    LBFGSERR_MAXIMUMLINESEARCH,
+    /** The algorithm routine reaches the maximum number of iterations. */
+    LBFGSERR_MAXIMUMITERATION,
+    /** Relative width of the interval of uncertainty is at most
+        lbfgs_parameter_t::xtol. */
+    LBFGSERR_WIDTHTOOSMALL,
+    /** A logic error (negative line-search step) occurred. */
+    LBFGSERR_INVALIDPARAMETERS,
+    /** The current search direction increases the objective function value. */
+    LBFGSERR_INCREASEGRADIENT,
+};
+
+/**
+ * Line search algorithms.
+ */
+enum {
+    /** The default algorithm (MoreThuente method). */
+    LBFGS_LINESEARCH_DEFAULT = 0,
+    /** MoreThuente method proposed by More and Thuente. */
+    LBFGS_LINESEARCH_MORETHUENTE = 0,
+    /** Backtracking method. */
+    LBFGS_LINESEARCH_BACKTRACKING,
+};
+
+/**
+ * L-BFGS optimization parameters.
+ *  Call lbfgs_parameter_init() function to initialize parameters to the
+ *  default values.
+ */
+typedef struct {
+    /**
+     * The number of corrections to approximate the inverse hessian matrix.
+     *  The L-BFGS routine stores the computation results of previous \ref m
+     *  iterations to approximate the inverse hessian matrix of the current
+     *  iteration. This parameter controls the size of the limited memories
+     *  (corrections). The default value is \c 6. Values less than \c 3 are
+     *  not recommended. Large values will result in excessive computing time.
+     */
+    int             m;
+
+    /**
+     * Epsilon for convergence test.
+     *  This parameter determines the accuracy with which the solution is to
+     *  be found. A minimization terminates when
+     *      ||g|| < \ref epsilon * max(1, ||x||),
+     *  where ||.|| denotes the Euclidean (L2) norm. The default value is
+     *  \c 1e-5.
+     */
+    lbfgsfloatval_t epsilon;
+
+    /**
+     * The maximum number of iterations.
+     *  The lbfgs() function terminates an optimization process with
+     *  ::LBFGSERR_MAXIMUMITERATION status code when the iteration count
+     *  exceeds this parameter. Setting this parameter to zero continues an
+     *  optimization process until a convergence or error. The default value
+     *  is \c 0.
+     */
+    int             max_iterations;
+
+    /**
+     * The line search algorithm.
+     *  This parameter specifies a line search algorithm to be used by the
+     *  L-BFGS routine.
+     */
+    int             linesearch;
+
+    /**
+     * The maximum number of trials for the line search.
+     *  This parameter controls the number of function and gradient evaluations
+     *  per iteration for the line search routine. The default value is \c 20.
+     */
+    int             max_linesearch;
+
+    /**
+     * The minimum step of the line search routine.
+     *  The default value is \c 1e-20. This value need not be modified unless
+     *  the exponents are too large for the machine being used, or unless the
+     *  problem is extremely badly scaled (in which case the exponents should
+     *  be increased).
+     */
+    lbfgsfloatval_t min_step;
+
+    /**
+     * The maximum step of the line search.
+     *  The default value is \c 1e+20. This value need not be modified unless
+     *  the exponents are too large for the machine being used, or unless the
+     *  problem is extremely badly scaled (in which case the exponents should
+     *  be increased).
+     */
+    lbfgsfloatval_t max_step;
+
+    /**
+     * A parameter to control the accuracy of the line search routine.
+     *  The default value is \c 1e-4. This parameter should be greater
+     *  than zero and smaller than \c 0.5.
+     */
+    lbfgsfloatval_t ftol;
+
+    /**
+     * A parameter to control the accuracy of the line search routine.
+     *  The default value is \c 0.9. If the function and gradient
+     *  evaluations are inexpensive with respect to the cost of the
+     *  iteration (which is sometimes the case when solving very large
+     *  problems) it may be advantageous to set this parameter to a small
+     *  value. A typical small value is \c 0.1. This parameter should be
+     *  greater than the \ref ftol parameter (\c 1e-4) and smaller than
+     *  \c 1.0.
+     */
+    lbfgsfloatval_t gtol;
+
+    /**
+     * The machine precision for floating-point values.
+     *  This parameter must be a positive value set by a client program to
+     *  estimate the machine precision. The line search routine will terminate
+     *  with the status code (::LBFGSERR_ROUNDING_ERROR) if the relative width
+     *  of the interval of uncertainty is less than this parameter.
+     */
+    lbfgsfloatval_t xtol;
+
+    /**
+     * Coefficient for the L1 norm of variables.
+     *  This parameter should be set to zero for standard minimization
+     *  problems. Setting this parameter to a positive value minimizes the
+     *  objective function F(x) combined with the L1 norm |x| of the variables,
+     *  {F(x) + C |x|}. This parameter is the coefficient for |x|, i.e.,
+     *  C. As the L1 norm |x| is not differentiable at zero, the library
+     *  modifies function and gradient evaluations from a client program
+     *  suitably; a client program thus has only to return the function value
+     *  F(x) and gradients G(x) as usual. The default value is zero.
+     */
+    lbfgsfloatval_t orthantwise_c;
+} lbfgs_parameter_t;
+
+
+/**
+ * Callback interface to provide objective function and gradient evaluations.
+ *
+ *  The lbfgs() function calls this function to obtain the values of the
+ *  objective function and its gradients when needed. A client program must
+ *  implement this function to evaluate the values of the objective function
+ *  and its gradients, given the current values of variables.
+ *  
+ *  @param  instance    The user data sent for lbfgs() function by the client.
+ *  @param  x           The current values of variables.
+ *  @param  g           The gradient vector. The callback function must compute
+ *                      the gradient values for the current variables.
+ *  @param  n           The number of variables.
+ *  @param  step        The current step of the line search routine.
+ *  @retval lbfgsfloatval_t The value of the objective function for the current
+ *                          variables.
+ */
+typedef lbfgsfloatval_t (*lbfgs_evaluate_t)(
+    void *instance,
+    const lbfgsfloatval_t *x,
+    lbfgsfloatval_t *g,
+    const int n,
+    const lbfgsfloatval_t step
+    );
+
+/**
+ * Callback interface to receive the progress of the optimization process.
+ *
+ *  The lbfgs() function calls this function for each iteration. By
+ *  implementing this function, a client program can store or display the
+ *  current progress of the optimization process.
+ *
+ *  @param  instance    The user data sent for lbfgs() function by the client.
+ *  @param  x           The current values of variables.
+ *  @param  g           The current gradient values of variables.
+ *  @param  fx          The current value of the objective function.
+ *  @param  xnorm       The Euclidean norm of the variables.
+ *  @param  gnorm       The Euclidean norm of the gradients.
+ *  @param  step        The line-search step used for this iteration.
+ *  @param  n           The number of variables.
+ *  @param  k           The iteration count.
+ *  @param  ls          The number of evaluations called for this iteration.
+ *  @retval int         Zero to continue the optimization process. Returning a
+ *                      non-zero value will cancel the optimization process.
+ */
+typedef int (*lbfgs_progress_t)(
+    void *instance,
+    const lbfgsfloatval_t *x,
+    const lbfgsfloatval_t *g,
+    const lbfgsfloatval_t fx,
+    const lbfgsfloatval_t xnorm,
+    const lbfgsfloatval_t gnorm,
+    const lbfgsfloatval_t step,
+    int n,
+    int k,
+    int ls
+    );
+
+/*
+A user must implement a function compatible with ::lbfgs_evaluate_t (evaluation
+callback) and pass the pointer to the callback function to lbfgs() arguments.
+Similarly, a user can implement a function compatible with ::lbfgs_progress_t
+(progress callback) to obtain the current progress (e.g., variables, function
+value, ||G||, etc) and to cancel the iteration process if necessary.
+Implementation of a progress callback is optional: a user can pass \c NULL if
+progress notification is not necessary.
+
+In addition, a user must preserve two requirements:
+    - The number of variables must be multiples of 16 (this is not 4).
+    - The memory block of variable array ::x must be aligned to 16.
+
+This algorithm terminates an optimization
+when:
+
+    ||G|| < \epsilon \cdot \max(1, ||x||) .
+
+In this formula, ||.|| denotes the Euclidean norm.
+*/
+
+/**
+ * Start an L-BFGS optimization.
+ *
+ *  @param  n           The number of variables.
+ *  @param  x           The array of variables. A client program can set
+ *                      default values for the optimization and receive the
+ *                      optimization result through this array.
+ *  @param  ptr_fx      The pointer to the variable that receives the final
+ *                      value of the objective function for the variables.
+ *                      This argument can be set to \c NULL if the final
+ *                      value of the objective function is unnecessary.
+ *  @param  proc_evaluate   The callback function to provide function and
+ *                          gradient evaluations given the current values of
+ *                          variables. A client program must implement a
+ *                          callback function compatible with \ref
+ *                          lbfgs_evaluate_t and pass the pointer to the
+ *                          callback function.
+ *  @param  proc_progress   The callback function to receive the progress
+ *                          (the number of iterations, the current value of
+ *                          the objective function) of the minimization
+ *                          process. This argument can be set to \c NULL if
+ *                          a progress report is unnecessary.
+ *  @param  instance    User data for the client program. The callback
+ *                      functions will receive the value of this argument.
+ *  @param  param       The pointer to a structure representing parameters for
+ *                      L-BFGS optimization. A client program can set this
+ *                      parameter to \c NULL to use the default parameters.
+ *                      Call lbfgs_parameter_init() function to fill a
+ *                      structure with the default values.
+ *  @retval int         The status code. This function returns zero if the
+ *                      minimization process terminates without an error. A
+ *                      non-zero value indicates an error.
+ */
+int lbfgs(
+    const int n,
+    lbfgsfloatval_t *x,
+    lbfgsfloatval_t *ptr_fx,
+    lbfgs_evaluate_t proc_evaluate,
+    lbfgs_progress_t proc_progress,
+    void *instance,
+    lbfgs_parameter_t *param
+    );
+
+/**
+ * Initialize L-BFGS parameters to the default values.
+ *
+ *  Call this function to fill a parameter structure with the default values
+ *  and overwrite parameter values if necessary.
+ *
+ *  @param  param       The pointer to the parameter structure.
+ */
+void lbfgs_parameter_init(lbfgs_parameter_t *param);
+
+/** @} */
+
+#ifdef  __cplusplus
+}
+#endif/*__cplusplus*/
+
+
+
+/**
+@mainpage A library of Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS)
+
+@section intro Introduction
+
+This library is a C port of the implementation of Limited-memory
+Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
+The original FORTRAN source code is available at:
+http://www.ece.northwestern.edu/~nocedal/lbfgs.html
+
+The L-BFGS method solves the unconstrained minimization problem,
+
+<pre>
+    minimize F(x), x = (x1, x2, ..., xN),
+</pre>
+
+only if the objective function F(x) and its gradient G(x) are computable. The
+well-known Newton's method requires computation of the inverse of the hessian
+matrix of the objective function. However, the computational cost for the
+inverse hessian matrix is expensive especially when the objective function
+takes a large number of variables. The L-BFGS method iteratively finds a
+minimizer by approximating the inverse hessian matrix with information from
+the last m iterations. This innovation saves memory storage and computational time
+drastically for large-scaled problems.
+
+Among the various ports of L-BFGS, this library provides several features:
+- <b>Optimization with L1-norm (orthant-wise L-BFGS)</b>:
+  In addition to standard minimization problems, the library can minimize
+  a function F(x) combined with L1-norm |x| of the variables,
+  {F(x) + C |x|}, where C is a constant scalar parameter. This feature is
+  useful for estimating parameters of log-linear models (e.g., logistic
+  regression and maximum entropy) with L1-regularization.
+- <b>Clean C code</b>:
+  Unlike C codes generated automatically by f2c (Fortran 77 into C converter),
+  this port includes changes based on my interpretations, improvements,
+  optimizations, and clean-ups so that the ported code would be well-suited
+  for a C code. In addition to comments inherited from the original code,
+  a number of comments were added through my interpretations.
+- <b>Callback interface</b>:
+  The library receives function and gradient values via a callback interface.
+  The library also notifies the progress of the optimization by invoking a
+  callback function. In the original implementation, a user had to set
+  function and gradient values every time the function returns for obtaining
+  updated values.
+- <b>Thread safe</b>:
+  The library is thread-safe, which is the secondary gain from the callback
+  interface.
+- <b>Cross platform.</b> The source code can be compiled on Microsoft Visual
+  Studio 2005, GNU C Compiler (gcc), etc.
+- <b>Configurable precision</b>: A user can choose single-precision (float)
+  or double-precision (double) accuracy by changing ::LBFGS_FLOAT macro.
+- <b>SSE/SSE2 optimization</b>:
+  This library includes SSE/SSE2 optimization (written in compiler intrinsics)
+  for vector arithmetic operations on Intel/AMD processors. The library uses
+  SSE for float values and SSE2 for double values. The SSE/SSE2 optimization
+  routine is disabled by default; compile the library with __SSE__ symbol
+  defined to activate the optimization routine.
+
+This library is used by:
+- <a href="http://www.chokkan.org/software/crfsuite/">CRFsuite: A fast implementation of Conditional Random Fields (CRFs)</a>
+- <a href="http://www.public.iastate.edu/~gdancik/mlegp/">mlegp: an R package for maximum likelihood estimates for Gaussian processes</a>
+
+@section download Download
+
+- <a href="http://www.chokkan.org/software/dist/liblbfgs-1.3.tar.gz">Source code</a>
+
+libLBFGS is distributed under the terms of the
+<a href="http://opensource.org/licenses/mit-license.php">MIT license</a>.
+
+@section changelog History
+- Version 1.4 (2008-04-25):
+    - Configurable line search algorithms. A member variable
+      ::lbfgs_parameter_t::linesearch was added to choose either MoreThuente
+      method (::LBFGS_LINESEARCH_MORETHUENTE) or backtracking algorithm
+      (::LBFGS_LINESEARCH_BACKTRACKING).
+    - Fixed a serious bug: the previous version did not compute
+      pseudo-gradients properly in the line search routine. This bug might
+      quit an iteration process too early when the orthant-wise L-BFGS routine
+      was activated (0 < ::lbfgs_parameter_t::orthantwise_c).
+- Version 1.3 (2007-12-16):
+    - An API change. An argument was added to lbfgs() function to receive the
+      final value of the objective function. This argument can be set to
+      \c NULL if the final value is unnecessary.
+    - Fixed a null-pointer bug in the sample code (reported by Takashi Imamichi).
+    - Added build scripts for Microsoft Visual Studio 2005 and GCC.
+    - Added README file.
+- Version 1.2 (2007-12-13):
+    - Fixed a serious bug in orthant-wise L-BFGS.
+      An important variable was used without initialization.
+- Version 1.1 (2007-12-01):
+    - Implemented orthant-wise L-BFGS.
+    - Implemented lbfgs_parameter_init() function.
+    - Fixed several bugs.
+    - API documentation.
+- Version 1.0 (2007-09-20):
+    - Initial release.
+
+@section api Documentation
+
+- @ref liblbfgs_api "libLBFGS API"
+
+@section sample Sample code
+
+@include sample.c
+
+@section ack Acknowledgements
+
+The L-BFGS algorithm is described in:
+    - Jorge Nocedal.
+      Updating Quasi-Newton Matrices with Limited Storage.
+      <i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
+    - Dong C. Liu and Jorge Nocedal.
+      On the limited memory BFGS method for large scale optimization.
+      <i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
+
+The line search algorithms used in this implementation are described in:
+    - John E. Dennis and Robert B. Schnabel.
+      <i>Numerical Methods for Unconstrained Optimization and Nonlinear
+      Equations</i>, Englewood Cliffs, 1983.
+    - Jorge J. More and David J. Thuente.
+      Line search algorithm with guaranteed sufficient decrease.
+      <i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
+      pp. 286-307, 1994.
+
+This library also implements Orthant-Wise Limited-memory Quasi-Newton (OW-LQN)
+method presented in:
+    - Galen Andrew and Jianfeng Gao.
+      Scalable training of L1-regularized log-linear models.
+      In <i>Proceedings of the 24th International Conference on Machine
+      Learning (ICML 2007)</i>, pp. 33-40, 2007.
+
+Finally I would like to thank the original author, Jorge Nocedal, who has been
+distributing the efficient and explanatory implementation under an open source
+licence.
+
+@section reference Reference
+
+- <a href="http://www.ece.northwestern.edu/~nocedal/lbfgs.html">L-BFGS</a> by Jorge Nocedal.
+- <a href="http://research.microsoft.com/research/downloads/Details/3f1840b2-dbb3-45e5-91b0-5ecd94bb73cf/Details.aspx">OWL-QN</a> by Galen Andrew.
+- <a href="http://chasen.org/~taku/software/misc/lbfgs/">C port (via f2c)</a> by Taku Kudo.
+- <a href="http://www.alglib.net/optimization/lbfgs.php">C#/C++/Delphi/VisualBasic6 port</a> in ALGLIB.
+- <a href="http://cctbx.sourceforge.net/">Computational Crystallography Toolbox</a> includes
+  <a href="http://cctbx.sourceforge.net/current_cvs/c_plus_plus/namespacescitbx_1_1lbfgs.html">scitbx::lbfgs</a>.
+*/
+
+#endif/*__LBFGS_H__*/
--- a/lib/arithmetic_ansi.h
+++ b/lib/arithmetic_ansi.h
@@ -0,0 +1,133 @@
+/*
+ *      ANSI C implementation of vector operations.
+ *
+ * Copyright (c) 2007,2008, Naoaki Okazaki
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/* $Id$ */
+
+#include <stdlib.h>
+#include <memory.h>
+
+/*
+ * fsigndiff(x, y): nonzero when the values pointed to by x and y differ in
+ * sign. Both arguments are pointers.
+ *
+ * With 32-bit IEEE floats the two bit patterns are XOR-ed and only the top
+ * bit is kept. Otherwise *x is multiplied by *y / fabs(*y) and the product
+ * is tested for being negative.
+ * NOTE(review): the fallback divides by fabs(*y); *y == 0 presumably yields
+ * NaN and therefore a false result -- confirm callers never pass zero.
+ * NOTE(review): uint32_t and fabs() are not declared by the headers included
+ * above; presumably the including file provides them -- confirm.
+ */
+#if     LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
+#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
+#else
+#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
+#endif/*LBFGS_IEEE_FLOAT*/
+
+/*
+ * Allocate a zero-filled buffer of `size` bytes.
+ * Returns NULL when malloc() fails; the buffer is released with vecfree().
+ */
+inline static void* vecalloc(size_t size)
+{
+    void *buffer = malloc(size);
+
+    if (buffer == NULL) {
+        return NULL;
+    }
+    memset(buffer, 0, size);
+    return buffer;
+}
+
+/* Release a buffer obtained from vecalloc(); forwards directly to free(). */
+inline static void vecfree(void *memblock)
+{
+    free(memblock);
+}
+
+/* Assign the constant c to each of the first n elements of x. */
+inline static void vecset(lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
+{
+    int k = 0;
+
+    while (k < n) {
+        x[k] = c;
+        ++k;
+    }
+}
+
+/* Copy the first n elements of x into y, in ascending index order. */
+inline static void veccpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
+{
+    int k = 0;
+
+    while (k < n) {
+        y[k] = x[k];
+        ++k;
+    }
+}
+
+/* Negated copy: y[k] = -x[k] for the first n elements. */
+inline static void vecncpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
+{
+    int k = 0;
+
+    while (k < n) {
+        y[k] = -x[k];
+        ++k;
+    }
+}
+
+/* Scaled accumulate: y[k] += c * x[k] for the first n elements. */
+inline static void vecadd(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
+{
+    int k = 0;
+
+    while (k < n) {
+        y[k] += c * x[k];
+        ++k;
+    }
+}
+
+/* Element-wise difference: z[k] = x[k] - y[k] for the first n elements. */
+inline static void vecdiff(lbfgsfloatval_t *z, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
+{
+    int k = 0;
+
+    while (k < n) {
+        z[k] = x[k] - y[k];
+        ++k;
+    }
+}
+
+/* Scale in place: y[k] *= c for the first n elements. */
+inline static void vecscale(lbfgsfloatval_t *y, const lbfgsfloatval_t c, const int n)
+{
+    int k = 0;
+
+    while (k < n) {
+        y[k] *= c;
+        ++k;
+    }
+}
+
+/* Element-wise product in place: y[k] *= x[k] for the first n elements. */
+inline static void vecmul(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
+{
+    int k = 0;
+
+    while (k < n) {
+        y[k] *= x[k];
+        ++k;
+    }
+}
+
+/*
+ * Dot product of the first n elements of x and y, written to *s.
+ * The sum is accumulated directly in *s, which is reset to zero first.
+ */
+inline static void vecdot(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
+{
+    int k = 0;
+
+    *s = 0.;
+    while (k < n) {
+        *s += x[k] * y[k];
+        ++k;
+    }
+}
+
+/*
+ * Store the Euclidean norm of the first n elements of x in *s, computed as
+ * the square root of vecdot(x, x).
+ * NOTE(review): sqrt() is not declared by the headers included above;
+ * presumably the including file provides <math.h> -- confirm.
+ */
+inline static void vecnorm(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
+{
+    vecdot(s, x, x, n);
+    *s = (lbfgsfloatval_t)sqrt(*s);
+}
+
+/*
+ * Store the reciprocal of the Euclidean norm of x (1.0 / vecnorm) in *s.
+ * The norm is not checked for zero before the division.
+ */
+inline static void vecrnorm(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
+{
+    vecnorm(s, x, n);
+    *s = (lbfgsfloatval_t)(1.0 / *s);
+}
--- a/lib/arithmetic_sse_double.h
+++ b/lib/arithmetic_sse_double.h
@@ -0,0 +1,275 @@
+/*
+ *      SSE2 implementation of vector operations (64bit double).
+ *
+ * Copyright (c) 2007,2008, Naoaki Okazaki
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/* $Id$ */
+
+#include <stdlib.h>
+#include <malloc.h>
+#include <memory.h>
+
+#if     1400 <= _MSC_VER
+#include <intrin.h>
+#endif
+
+/*
+ * Allocate a zero-filled buffer of `size` bytes through _aligned_malloc()
+ * with an alignment argument of 16. Returns NULL on allocation failure;
+ * the buffer is released with vecfree().
+ * NOTE(review): _aligned_malloc() looks like the Microsoft CRT allocator --
+ * confirm what provides it on other toolchains.
+ */
+inline static void* vecalloc(size_t size)
+{
+    void *memblock = _aligned_malloc(size, 16);
+    if (memblock != NULL) {
+        memset(memblock, 0, size);
+    }
+    return memblock;
+}
+
+/* Release a buffer obtained from vecalloc(); forwards to _aligned_free(). */
+inline static void vecfree(void *memblock)
+{
+    _aligned_free(memblock);
+}
+
+/*
+ * fsigndiff(x, y): nonzero when the doubles pointed to by x and y have
+ * different sign bits.
+ * _mm_movemask_pd collects the two sign bits into a value 0..3; adding 1 and
+ * masking with 2 leaves 2 for the mixed-sign masks (1 and 2) and 0 for the
+ * equal-sign masks (0 and 3).
+ */
+#define fsigndiff(x, y) \
+    ((_mm_movemask_pd(_mm_set_pd(*(x), *(y))) + 1) & 0x002)
+
+/*
+ * vecset(x, c, n): store the constant c into the double array x.
+ * Eight elements are written per iteration and there is no tail loop, so
+ * the stores run up to the next multiple of 8 at or above n.
+ * NOTE(review): _mm_store_pd presumably requires x to be 16-byte aligned --
+ * confirm against the intrinsics reference.
+ * The expansion declares its own `i` and `XMM0`; arguments must not refer to
+ * caller variables with those names.
+ */
+#define vecset(x, c, n) \
+{ \
+    int i; \
+    __m128d XMM0 = _mm_set1_pd(c); \
+    for (i = 0;i < (n);i += 8) { \
+        _mm_store_pd((x)+i  , XMM0); \
+        _mm_store_pd((x)+i+2, XMM0); \
+        _mm_store_pd((x)+i+4, XMM0); \
+        _mm_store_pd((x)+i+6, XMM0); \
+    } \
+}
+
+/*
+ * veccpy(y, x, n): copy the double array x into y.
+ * Eight elements are moved per iteration with no tail loop, so both arrays
+ * are accessed up to the next multiple of 8 at or above n.
+ * NOTE(review): _mm_load_pd/_mm_store_pd presumably require 16-byte aligned
+ * pointers -- confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM3`.
+ */
+#define veccpy(y, x, n) \
+{ \
+    int i; \
+    for (i = 0;i < (n);i += 8) { \
+        __m128d XMM0 = _mm_load_pd((x)+i  ); \
+        __m128d XMM1 = _mm_load_pd((x)+i+2); \
+        __m128d XMM2 = _mm_load_pd((x)+i+4); \
+        __m128d XMM3 = _mm_load_pd((x)+i+6); \
+        _mm_store_pd((y)+i  , XMM0); \
+        _mm_store_pd((y)+i+2, XMM1); \
+        _mm_store_pd((y)+i+4, XMM2); \
+        _mm_store_pd((y)+i+6, XMM3); \
+    } \
+}
+
+/*
+ * vecncpy(y, x, n): store the negation of the double array x into y.
+ * Each element is computed as 0 - x[k] (zero registers minus the loaded
+ * values). Eight elements per iteration, no tail loop: both arrays are
+ * accessed up to the next multiple of 8 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably both pointers
+ * must be 16-byte aligned; confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM7`.
+ */
+#define vecncpy(y, x, n) \
+{ \
+    int i; \
+    for (i = 0;i < (n);i += 8) { \
+        __m128d XMM0 = _mm_setzero_pd(); \
+        __m128d XMM1 = _mm_setzero_pd(); \
+        __m128d XMM2 = _mm_setzero_pd(); \
+        __m128d XMM3 = _mm_setzero_pd(); \
+        __m128d XMM4 = _mm_load_pd((x)+i  ); \
+        __m128d XMM5 = _mm_load_pd((x)+i+2); \
+        __m128d XMM6 = _mm_load_pd((x)+i+4); \
+        __m128d XMM7 = _mm_load_pd((x)+i+6); \
+        XMM0 = _mm_sub_pd(XMM0, XMM4); \
+        XMM1 = _mm_sub_pd(XMM1, XMM5); \
+        XMM2 = _mm_sub_pd(XMM2, XMM6); \
+        XMM3 = _mm_sub_pd(XMM3, XMM7); \
+        _mm_store_pd((y)+i  , XMM0); \
+        _mm_store_pd((y)+i+2, XMM1); \
+        _mm_store_pd((y)+i+4, XMM2); \
+        _mm_store_pd((y)+i+6, XMM3); \
+    } \
+}
+
+/*
+ * vecadd(y, x, c, n): y[k] += c * x[k] on double arrays.
+ * Four elements per iteration with no tail loop, so both arrays are
+ * accessed up to the next multiple of 4 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably both pointers
+ * must be 16-byte aligned; confirm.
+ * The expansion declares its own `i`, `XMM0`..`XMM3` and `XMM7`.
+ */
+#define vecadd(y, x, c, n) \
+{ \
+    int i; \
+    __m128d XMM7 = _mm_set1_pd(c); \
+    for (i = 0;i < (n);i += 4) { \
+        __m128d XMM0 = _mm_load_pd((x)+i  ); \
+        __m128d XMM1 = _mm_load_pd((x)+i+2); \
+        __m128d XMM2 = _mm_load_pd((y)+i  ); \
+        __m128d XMM3 = _mm_load_pd((y)+i+2); \
+        XMM0 = _mm_mul_pd(XMM0, XMM7); \
+        XMM1 = _mm_mul_pd(XMM1, XMM7); \
+        XMM2 = _mm_add_pd(XMM2, XMM0); \
+        XMM3 = _mm_add_pd(XMM3, XMM1); \
+        _mm_store_pd((y)+i  , XMM2); \
+        _mm_store_pd((y)+i+2, XMM3); \
+    } \
+}
+
+/*
+ * vecdiff(z, x, y, n): z[k] = x[k] - y[k] on double arrays.
+ * Eight elements per iteration with no tail loop, so all three arrays are
+ * accessed up to the next multiple of 8 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably all pointers
+ * must be 16-byte aligned; confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM7`.
+ */
+#define vecdiff(z, x, y, n) \
+{ \
+    int i; \
+    for (i = 0;i < (n);i += 8) { \
+        __m128d XMM0 = _mm_load_pd((x)+i  ); \
+        __m128d XMM1 = _mm_load_pd((x)+i+2); \
+        __m128d XMM2 = _mm_load_pd((x)+i+4); \
+        __m128d XMM3 = _mm_load_pd((x)+i+6); \
+        __m128d XMM4 = _mm_load_pd((y)+i  ); \
+        __m128d XMM5 = _mm_load_pd((y)+i+2); \
+        __m128d XMM6 = _mm_load_pd((y)+i+4); \
+        __m128d XMM7 = _mm_load_pd((y)+i+6); \
+        XMM0 = _mm_sub_pd(XMM0, XMM4); \
+        XMM1 = _mm_sub_pd(XMM1, XMM5); \
+        XMM2 = _mm_sub_pd(XMM2, XMM6); \
+        XMM3 = _mm_sub_pd(XMM3, XMM7); \
+        _mm_store_pd((z)+i  , XMM0); \
+        _mm_store_pd((z)+i+2, XMM1); \
+        _mm_store_pd((z)+i+4, XMM2); \
+        _mm_store_pd((z)+i+6, XMM3); \
+    } \
+}
+
+/*
+ * vecscale(y, c, n): y[k] *= c on a double array.
+ * Four elements per iteration with no tail loop, so y is accessed up to the
+ * next multiple of 4 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably y must be
+ * 16-byte aligned; confirm.
+ * The expansion declares its own `i`, `XMM0`, `XMM1` and `XMM7`.
+ */
+#define vecscale(y, c, n) \
+{ \
+    int i; \
+    __m128d XMM7 = _mm_set1_pd(c); \
+    for (i = 0;i < (n);i += 4) { \
+        __m128d XMM0 = _mm_load_pd((y)+i  ); \
+        __m128d XMM1 = _mm_load_pd((y)+i+2); \
+        XMM0 = _mm_mul_pd(XMM0, XMM7); \
+        XMM1 = _mm_mul_pd(XMM1, XMM7); \
+        _mm_store_pd((y)+i  , XMM0); \
+        _mm_store_pd((y)+i+2, XMM1); \
+    } \
+}
+
+/*
+ * vecmul(y, x, n): y[k] *= x[k] on double arrays.
+ * Eight elements per iteration with no tail loop, so both arrays are
+ * accessed up to the next multiple of 8 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably both pointers
+ * must be 16-byte aligned; confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM7`.
+ */
+#define vecmul(y, x, n) \
+{ \
+    int i; \
+    for (i = 0;i < (n);i += 8) { \
+        __m128d XMM0 = _mm_load_pd((x)+i  ); \
+        __m128d XMM1 = _mm_load_pd((x)+i+2); \
+        __m128d XMM2 = _mm_load_pd((x)+i+4); \
+        __m128d XMM3 = _mm_load_pd((x)+i+6); \
+        __m128d XMM4 = _mm_load_pd((y)+i  ); \
+        __m128d XMM5 = _mm_load_pd((y)+i+2); \
+        __m128d XMM6 = _mm_load_pd((y)+i+4); \
+        __m128d XMM7 = _mm_load_pd((y)+i+6); \
+        XMM4 = _mm_mul_pd(XMM4, XMM0); \
+        XMM5 = _mm_mul_pd(XMM5, XMM1); \
+        XMM6 = _mm_mul_pd(XMM6, XMM2); \
+        XMM7 = _mm_mul_pd(XMM7, XMM3); \
+        _mm_store_pd((y)+i  , XMM4); \
+        _mm_store_pd((y)+i+2, XMM5); \
+        _mm_store_pd((y)+i+4, XMM6); \
+        _mm_store_pd((y)+i+6, XMM7); \
+    } \
+}
+
+
+
+/*
+ * __horizontal_sum(r, rw): horizontal add built from single-precision
+ * (_ps) intrinsics.
+ * NOTE(review): none of the macros visible in this double-precision header
+ * expand __horizontal_sum; vecdot/vecnorm/vecrnorm below reduce with
+ * _mm_shuffle_pd instead.
+ * NOTE(review): the guard compares __SSE__ against 3; compilers presumably
+ * define __SSE__ as 1 and advertise SSE3 through __SSE3__, in which case the
+ * first branch is never selected -- confirm.
+ */
+#if     3 <= __SSE__
+/*
+    Horizontal add with haddps SSE3 instruction. The work register (rw)
+    is unused.
+ */
+#define __horizontal_sum(r, rw) \
+    r = _mm_hadd_ps(r, r); \
+    r = _mm_hadd_ps(r, r);
+
+#else
+/*
+    Horizontal add with SSE instruction. The work register (rw) is used.
+ */
+#define __horizontal_sum(r, rw) \
+    rw = r; \
+    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
+    r = _mm_add_ps(r, rw); \
+    rw = r; \
+    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
+    r = _mm_add_ps(r, rw);
+
+#endif
+
+/*
+ * vecdot(s, x, y, n): store the dot product of the double arrays x and y
+ * in *s (s is a pointer to double).
+ * Two accumulators (XMM0, XMM1) each collect two lanes per iteration; they
+ * are added together, then the high lane is shuffled down and added so the
+ * low lane holds the total that _mm_store_sd writes.
+ * Four elements per iteration with no tail loop: elements up to the next
+ * multiple of 4 at or above n are read and included in the sum.
+ * NOTE(review): _mm_load_pd presumably requires 16-byte aligned pointers --
+ * confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM5`.
+ */
+#define vecdot(s, x, y, n) \
+{ \
+    int i; \
+    __m128d XMM0 = _mm_setzero_pd(); \
+    __m128d XMM1 = _mm_setzero_pd(); \
+    __m128d XMM2, XMM3, XMM4, XMM5; \
+    for (i = 0;i < (n);i += 4) { \
+        XMM2 = _mm_load_pd((x)+i  ); \
+        XMM3 = _mm_load_pd((x)+i+2); \
+        XMM4 = _mm_load_pd((y)+i  ); \
+        XMM5 = _mm_load_pd((y)+i+2); \
+        XMM2 = _mm_mul_pd(XMM2, XMM4); \
+        XMM3 = _mm_mul_pd(XMM3, XMM5); \
+        XMM0 = _mm_add_pd(XMM0, XMM2); \
+        XMM1 = _mm_add_pd(XMM1, XMM3); \
+    } \
+    XMM0 = _mm_add_pd(XMM0, XMM1); \
+    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
+    XMM0 = _mm_add_pd(XMM0, XMM1); \
+    _mm_store_sd((s), XMM0); \
+}
+
+/*
+ * vecnorm(s, x, n): store the Euclidean norm of the double array x in *s.
+ * XMM4/XMM5 are plain copies of XMM2/XMM3, so each product is an element
+ * squared; the lanes are summed as in vecdot and _mm_sqrt_pd takes the
+ * square root before the low lane is stored.
+ * Four elements per iteration with no tail loop: elements up to the next
+ * multiple of 4 at or above n are read and included in the sum.
+ * NOTE(review): _mm_load_pd presumably requires a 16-byte aligned pointer --
+ * confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM5`.
+ */
+#define vecnorm(s, x, n) \
+{ \
+    int i; \
+    __m128d XMM0 = _mm_setzero_pd(); \
+    __m128d XMM1 = _mm_setzero_pd(); \
+    __m128d XMM2, XMM3, XMM4, XMM5; \
+    for (i = 0;i < (n);i += 4) { \
+        XMM2 = _mm_load_pd((x)+i  ); \
+        XMM3 = _mm_load_pd((x)+i+2); \
+        XMM4 = XMM2; \
+        XMM5 = XMM3; \
+        XMM2 = _mm_mul_pd(XMM2, XMM4); \
+        XMM3 = _mm_mul_pd(XMM3, XMM5); \
+        XMM0 = _mm_add_pd(XMM0, XMM2); \
+        XMM1 = _mm_add_pd(XMM1, XMM3); \
+    } \
+    XMM0 = _mm_add_pd(XMM0, XMM1); \
+    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
+    XMM0 = _mm_add_pd(XMM0, XMM1); \
+    XMM0 = _mm_sqrt_pd(XMM0); \
+    _mm_store_sd((s), XMM0); \
+}
+
+
+/*
+ * vecrnorm(s, x, n): store the reciprocal of the Euclidean norm of the
+ * double array x in *s, computed as 1.0 / sqrt(sum of squares). The sum is
+ * not checked for zero before the division.
+ * Four elements per iteration with no tail loop: elements up to the next
+ * multiple of 4 at or above n are read and included in the sum.
+ * NOTE(review): _mm_load_pd presumably requires a 16-byte aligned pointer --
+ * confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM5`.
+ */
+#define vecrnorm(s, x, n) \
+{ \
+    int i; \
+    __m128d XMM0 = _mm_setzero_pd(); \
+    __m128d XMM1 = _mm_setzero_pd(); \
+    __m128d XMM2, XMM3, XMM4, XMM5; \
+    for (i = 0;i < (n);i += 4) { \
+        XMM2 = _mm_load_pd((x)+i  ); \
+        XMM3 = _mm_load_pd((x)+i+2); \
+        XMM4 = XMM2; \
+        XMM5 = XMM3; \
+        XMM2 = _mm_mul_pd(XMM2, XMM4); \
+        XMM3 = _mm_mul_pd(XMM3, XMM5); \
+        XMM0 = _mm_add_pd(XMM0, XMM2); \
+        XMM1 = _mm_add_pd(XMM1, XMM3); \
+    } \
+    XMM2 = _mm_set1_pd(1.0); \
+    XMM0 = _mm_add_pd(XMM0, XMM1); \
+    XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
+    XMM0 = _mm_add_pd(XMM0, XMM1); \
+    XMM0 = _mm_sqrt_pd(XMM0); \
+    XMM2 = _mm_div_pd(XMM2, XMM0); \
+    _mm_store_sd((s), XMM2); \
+}
--- a/lib/arithmetic_sse_float.h
+++ b/lib/arithmetic_sse_float.h
@@ -0,0 +1,283 @@
+/*
+ *      SSE/SSE3 implementation of vector operations (32bit float).
+ *
+ * Copyright (c) 2007,2008, Naoaki Okazaki
+ * All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+/* $Id$ */
+
+#include <stdlib.h>
+#include <malloc.h>
+#include <memory.h>
+
+#if     1400 <= _MSC_VER
+#include <intrin.h>
+#endif
+
+/*
+ * fsigndiff(x, y): nonzero when the values pointed to by x and y differ in
+ * sign. Both arguments are pointers.
+ *
+ * With 32-bit IEEE floats the two bit patterns are XOR-ed and only the top
+ * bit is kept. Otherwise *x is multiplied by *y / fabs(*y) and the product
+ * is tested for being negative.
+ * NOTE(review): the fallback divides by fabs(*y); *y == 0 presumably yields
+ * NaN and therefore a false result -- confirm callers never pass zero.
+ * NOTE(review): uint32_t and fabs() are not declared by the headers included
+ * above; presumably the including file provides them -- confirm.
+ */
+#if     LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
+#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
+#else
+#define fsigndiff(x, y) (*(x) * (*(y) / fabs(*(y))) < 0.)
+#endif/*LBFGS_IEEE_FLOAT*/
+
+/*
+ * Allocate a zero-filled buffer of `size` bytes through _aligned_malloc()
+ * with an alignment argument of 16. Returns NULL on allocation failure;
+ * the buffer is released with vecfree().
+ * NOTE(review): _aligned_malloc() looks like the Microsoft CRT allocator --
+ * confirm what provides it on other toolchains.
+ */
+inline static void* vecalloc(size_t size)
+{
+    void *memblock = _aligned_malloc(size, 16);
+    if (memblock != NULL) {
+        memset(memblock, 0, size);
+    }
+    return memblock;
+}
+
+/* Release a buffer obtained from vecalloc(); forwards to _aligned_free(). */
+inline static void vecfree(void *memblock)
+{
+    _aligned_free(memblock);
+}
+
+/*
+ * vecset(x, c, n): store the constant c into the float array x.
+ * Sixteen elements are written per iteration and there is no tail loop, so
+ * the stores run up to the next multiple of 16 at or above n.
+ * NOTE(review): _mm_store_ps presumably requires x to be 16-byte aligned --
+ * confirm against the intrinsics reference.
+ * The expansion declares its own `i` and `XMM0`; arguments must not refer to
+ * caller variables with those names.
+ */
+#define vecset(x, c, n) \
+{ \
+    int i; \
+    __m128 XMM0 = _mm_set_ps1(c); \
+    for (i = 0;i < (n);i += 16) { \
+        _mm_store_ps((x)+i   , XMM0); \
+        _mm_store_ps((x)+i+ 4, XMM0); \
+        _mm_store_ps((x)+i+ 8, XMM0); \
+        _mm_store_ps((x)+i+12, XMM0); \
+    } \
+}
+
+/*
+ * veccpy(y, x, n): copy the float array x into y.
+ * Sixteen elements are moved per iteration with no tail loop, so both
+ * arrays are accessed up to the next multiple of 16 at or above n.
+ * NOTE(review): _mm_load_ps/_mm_store_ps presumably require 16-byte aligned
+ * pointers -- confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM3`.
+ */
+#define veccpy(y, x, n) \
+{ \
+    int i; \
+    for (i = 0;i < (n);i += 16) { \
+        __m128 XMM0 = _mm_load_ps((x)+i   ); \
+        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
+        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
+        __m128 XMM3 = _mm_load_ps((x)+i+12); \
+        _mm_store_ps((y)+i   , XMM0); \
+        _mm_store_ps((y)+i+ 4, XMM1); \
+        _mm_store_ps((y)+i+ 8, XMM2); \
+        _mm_store_ps((y)+i+12, XMM3); \
+    } \
+}
+
+/*
+ * vecncpy(y, x, n): store the negation of the float array x into y.
+ * Negation is done by XOR-ing every 32-bit lane with 0x80000000, i.e. by
+ * flipping the top bit of each element.
+ * Sixteen elements per iteration with no tail loop, so both arrays are
+ * accessed up to the next multiple of 16 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably both pointers
+ * must be 16-byte aligned; confirm.
+ * NOTE(review): uint32_t is not declared by the headers included above;
+ * presumably the including file provides it -- confirm.
+ * The expansion declares its own `i`, `mask` and `XMM0`..`XMM4`.
+ */
+#define vecncpy(y, x, n) \
+{ \
+    int i; \
+    const uint32_t mask = 0x80000000; \
+    __m128 XMM4 = _mm_load_ps1((float*)&mask); \
+    for (i = 0;i < (n);i += 16) { \
+        __m128 XMM0 = _mm_load_ps((x)+i   ); \
+        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
+        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
+        __m128 XMM3 = _mm_load_ps((x)+i+12); \
+        XMM0 = _mm_xor_ps(XMM0, XMM4); \
+        XMM1 = _mm_xor_ps(XMM1, XMM4); \
+        XMM2 = _mm_xor_ps(XMM2, XMM4); \
+        XMM3 = _mm_xor_ps(XMM3, XMM4); \
+        _mm_store_ps((y)+i   , XMM0); \
+        _mm_store_ps((y)+i+ 4, XMM1); \
+        _mm_store_ps((y)+i+ 8, XMM2); \
+        _mm_store_ps((y)+i+12, XMM3); \
+    } \
+}
+
+/*
+ * vecadd(y, x, c, n): y[k] += c * x[k] on float arrays.
+ * Eight elements per iteration with no tail loop, so both arrays are
+ * accessed up to the next multiple of 8 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably both pointers
+ * must be 16-byte aligned; confirm.
+ * The expansion declares its own `i`, `XMM0`..`XMM3` and `XMM7`.
+ */
+#define vecadd(y, x, c, n) \
+{ \
+    int i; \
+    __m128 XMM7 = _mm_set_ps1(c); \
+    for (i = 0;i < (n);i += 8) { \
+        __m128 XMM0 = _mm_load_ps((x)+i  ); \
+        __m128 XMM1 = _mm_load_ps((x)+i+4); \
+        __m128 XMM2 = _mm_load_ps((y)+i  ); \
+        __m128 XMM3 = _mm_load_ps((y)+i+4); \
+        XMM0 = _mm_mul_ps(XMM0, XMM7); \
+        XMM1 = _mm_mul_ps(XMM1, XMM7); \
+        XMM2 = _mm_add_ps(XMM2, XMM0); \
+        XMM3 = _mm_add_ps(XMM3, XMM1); \
+        _mm_store_ps((y)+i  , XMM2); \
+        _mm_store_ps((y)+i+4, XMM3); \
+    } \
+}
+
+/*
+ * vecdiff(z, x, y, n): z[k] = x[k] - y[k] on float arrays.
+ * Sixteen elements per iteration with no tail loop, so all three arrays are
+ * accessed up to the next multiple of 16 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably all pointers
+ * must be 16-byte aligned; confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM7`.
+ */
+#define vecdiff(z, x, y, n) \
+{ \
+    int i; \
+    for (i = 0;i < (n);i += 16) { \
+        __m128 XMM0 = _mm_load_ps((x)+i   ); \
+        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
+        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
+        __m128 XMM3 = _mm_load_ps((x)+i+12); \
+        __m128 XMM4 = _mm_load_ps((y)+i   ); \
+        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
+        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
+        __m128 XMM7 = _mm_load_ps((y)+i+12); \
+        XMM0 = _mm_sub_ps(XMM0, XMM4); \
+        XMM1 = _mm_sub_ps(XMM1, XMM5); \
+        XMM2 = _mm_sub_ps(XMM2, XMM6); \
+        XMM3 = _mm_sub_ps(XMM3, XMM7); \
+        _mm_store_ps((z)+i   , XMM0); \
+        _mm_store_ps((z)+i+ 4, XMM1); \
+        _mm_store_ps((z)+i+ 8, XMM2); \
+        _mm_store_ps((z)+i+12, XMM3); \
+    } \
+}
+
+/*
+ * vecscale(y, c, n): y[k] *= c on a float array.
+ * Eight elements per iteration with no tail loop, so y is accessed up to
+ * the next multiple of 8 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably y must be
+ * 16-byte aligned; confirm.
+ * The expansion declares its own `i`, `XMM0`, `XMM1` and `XMM7`.
+ */
+#define vecscale(y, c, n) \
+{ \
+    int i; \
+    __m128 XMM7 = _mm_set_ps1(c); \
+    for (i = 0;i < (n);i += 8) { \
+        __m128 XMM0 = _mm_load_ps((y)+i  ); \
+        __m128 XMM1 = _mm_load_ps((y)+i+4); \
+        XMM0 = _mm_mul_ps(XMM0, XMM7); \
+        XMM1 = _mm_mul_ps(XMM1, XMM7); \
+        _mm_store_ps((y)+i  , XMM0); \
+        _mm_store_ps((y)+i+4, XMM1); \
+    } \
+}
+
+/*
+ * vecmul(y, x, n): y[k] *= x[k] on float arrays.
+ * Sixteen elements per iteration with no tail loop, so both arrays are
+ * accessed up to the next multiple of 16 at or above n.
+ * NOTE(review): aligned load/store intrinsics -- presumably both pointers
+ * must be 16-byte aligned; confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM7`.
+ */
+#define vecmul(y, x, n) \
+{ \
+    int i; \
+    for (i = 0;i < (n);i += 16) { \
+        __m128 XMM0 = _mm_load_ps((x)+i   ); \
+        __m128 XMM1 = _mm_load_ps((x)+i+ 4); \
+        __m128 XMM2 = _mm_load_ps((x)+i+ 8); \
+        __m128 XMM3 = _mm_load_ps((x)+i+12); \
+        __m128 XMM4 = _mm_load_ps((y)+i   ); \
+        __m128 XMM5 = _mm_load_ps((y)+i+ 4); \
+        __m128 XMM6 = _mm_load_ps((y)+i+ 8); \
+        __m128 XMM7 = _mm_load_ps((y)+i+12); \
+        XMM4 = _mm_mul_ps(XMM4, XMM0); \
+        XMM5 = _mm_mul_ps(XMM5, XMM1); \
+        XMM6 = _mm_mul_ps(XMM6, XMM2); \
+        XMM7 = _mm_mul_ps(XMM7, XMM3); \
+        _mm_store_ps((y)+i   , XMM4); \
+        _mm_store_ps((y)+i+ 4, XMM5); \
+        _mm_store_ps((y)+i+ 8, XMM6); \
+        _mm_store_ps((y)+i+12, XMM7); \
+    } \
+}
+
+
+
+/*
+ * __horizontal_sum(r, rw): replace every lane of the __m128 register r with
+ * the sum of its four lanes. rw is a scratch __m128 register.
+ * The macro expands to several statements, so it must be used as a full
+ * statement inside a braced block.
+ *
+ * The SSE3 variant is selected when the compiler advertises SSE3 through
+ * __SSE3__; the original `3 <= __SSE__` test is kept for compatibility.
+ */
+#if     3 <= __SSE__ || defined(__SSE3__)
+#include <pmmintrin.h>
+/*
+    Horizontal add with haddps SSE3 instruction. The work register (rw)
+    is unused.
+ */
+#define __horizontal_sum(r, rw) \
+    r = _mm_hadd_ps(r, r); \
+    r = _mm_hadd_ps(r, r);
+
+#else
+/*
+    Horizontal add with SSE instruction. The work register (rw) is used.
+ */
+#define __horizontal_sum(r, rw) \
+    rw = r; \
+    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
+    r = _mm_add_ps(r, rw); \
+    rw = r; \
+    r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
+    r = _mm_add_ps(r, rw);
+
+#endif
+
+/*
+ * vecdot(s, x, y, n): store the dot product of the float arrays x and y in
+ * *s (s is a pointer to float).
+ * Two accumulators (XMM0, XMM1) each collect four lanes per iteration; they
+ * are added together and reduced with __horizontal_sum before the low lane
+ * is stored.
+ * Eight elements per iteration with no tail loop: elements up to the next
+ * multiple of 8 at or above n are read and included in the sum.
+ * NOTE(review): _mm_load_ps presumably requires 16-byte aligned pointers --
+ * confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM5`.
+ */
+#define vecdot(s, x, y, n) \
+{ \
+    int i; \
+    __m128 XMM0 = _mm_setzero_ps(); \
+    __m128 XMM1 = _mm_setzero_ps(); \
+    __m128 XMM2, XMM3, XMM4, XMM5; \
+    for (i = 0;i < (n);i += 8) { \
+        XMM2 = _mm_load_ps((x)+i  ); \
+        XMM3 = _mm_load_ps((x)+i+4); \
+        XMM4 = _mm_load_ps((y)+i  ); \
+        XMM5 = _mm_load_ps((y)+i+4); \
+        XMM2 = _mm_mul_ps(XMM2, XMM4); \
+        XMM3 = _mm_mul_ps(XMM3, XMM5); \
+        XMM0 = _mm_add_ps(XMM0, XMM2); \
+        XMM1 = _mm_add_ps(XMM1, XMM3); \
+    } \
+    XMM0 = _mm_add_ps(XMM0, XMM1); \
+    __horizontal_sum(XMM0, XMM1); \
+    _mm_store_ss((s), XMM0); \
+}
+
+/*
+ * vecnorm(s, x, n): store the Euclidean norm of the float array x in *s
+ * (s is a pointer to float).
+ * Squares are accumulated in two registers, combined, and reduced with
+ * __horizontal_sum; the square root of the low lane is then stored.
+ * The root is taken with _mm_sqrt_ss rather than as sum * rsqrt(sum),
+ * because the latter evaluates 0 * inf = NaN for an all-zero vector.
+ * Eight elements per iteration with no tail loop: elements up to the next
+ * multiple of 8 at or above n are read and included in the sum.
+ * NOTE(review): _mm_load_ps presumably requires a 16-byte aligned pointer --
+ * confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM3`.
+ */
+#define vecnorm(s, x, n) \
+{ \
+    int i; \
+    __m128 XMM0 = _mm_setzero_ps(); \
+    __m128 XMM1 = _mm_setzero_ps(); \
+    __m128 XMM2, XMM3; \
+    for (i = 0;i < (n);i += 8) { \
+        XMM2 = _mm_load_ps((x)+i  ); \
+        XMM3 = _mm_load_ps((x)+i+4); \
+        XMM2 = _mm_mul_ps(XMM2, XMM2); \
+        XMM3 = _mm_mul_ps(XMM3, XMM3); \
+        XMM0 = _mm_add_ps(XMM0, XMM2); \
+        XMM1 = _mm_add_ps(XMM1, XMM3); \
+    } \
+    XMM0 = _mm_add_ps(XMM0, XMM1); \
+    __horizontal_sum(XMM0, XMM1); \
+    XMM0 = _mm_sqrt_ss(XMM0); \
+    _mm_store_ss((s), XMM0); \
+}
+
+/*
+ * vecrnorm(s, x, n): store the reciprocal of the Euclidean norm of the
+ * float array x in *s (s is a pointer to float).
+ * Squares are accumulated in two registers, combined, and reduced with
+ * __horizontal_sum. The reciprocal square root starts from the _mm_rsqrt_ss
+ * estimate r and applies one Newton-Raphson step: 1.5 * r - 0.5 * sum * r^3.
+ * Each iteration consumes eight floats (two registers), so the index
+ * advances by 8; a larger stride would skip part of the vector.
+ * There is no tail loop: elements up to the next multiple of 8 at or above
+ * n are read and included in the sum.
+ * NOTE(review): for an all-zero vector the estimate is presumably +inf and
+ * the refinement then yields NaN -- confirm whether callers can pass one.
+ * NOTE(review): _mm_load_ps presumably requires a 16-byte aligned pointer --
+ * confirm.
+ * The expansion declares its own `i` and `XMM0`..`XMM3`.
+ */
+#define vecrnorm(s, x, n) \
+{ \
+    int i; \
+    __m128 XMM0 = _mm_setzero_ps(); \
+    __m128 XMM1 = _mm_setzero_ps(); \
+    __m128 XMM2, XMM3; \
+    for (i = 0;i < (n);i += 8) { \
+        XMM2 = _mm_load_ps((x)+i  ); \
+        XMM3 = _mm_load_ps((x)+i+4); \
+        XMM2 = _mm_mul_ps(XMM2, XMM2); \
+        XMM3 = _mm_mul_ps(XMM3, XMM3); \
+        XMM0 = _mm_add_ps(XMM0, XMM2); \
+        XMM1 = _mm_add_ps(XMM1, XMM3); \
+    } \
+    XMM0 = _mm_add_ps(XMM0, XMM1); \
+    __horizontal_sum(XMM0, XMM1); \
+    XMM2 = XMM0; \
+    XMM1 = _mm_rsqrt_ss(XMM0); \
+    XMM3 = XMM1; \
+    XMM1 = _mm_mul_ss(XMM1, XMM1); \
+    XMM1 = _mm_mul_ss(XMM1, XMM3); \
+    XMM1 = _mm_mul_ss(XMM1, XMM0); \
+    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
+    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
+    XMM3 = _mm_add_ss(XMM3, XMM1); \
+    _mm_store_ss((s), XMM3); \
+}
--- a/lib/lbfgs.c
+++ b/lib/lbfgs.c
--- a/sample/main.c
+++ b/sample/main.c
@@ -0,0 +1,69 @@
+#include <stdio.h>
+#include <lbfgs.h>
+
+/*
+ * Objective callback handed to lbfgs().
+ * For every pair (x[k], x[k+1]) it adds (1 - x[k])^2 +
+ * (10 * (x[k+1] - x[k]^2))^2 to the returned value and writes the matching
+ * partial derivatives to g[k] and g[k+1].
+ * The loop advances two elements at a time and reads x[k+1], so n is
+ * expected to be even. `instance` and `step` are not used.
+ */
+static lbfgsfloatval_t evaluate(
+    void *instance,
+    const lbfgsfloatval_t *x,
+    lbfgsfloatval_t *g,
+    const int n,
+    const lbfgsfloatval_t step
+    )
+{
+    lbfgsfloatval_t total = 0.0;
+    int k = 0;
+
+    while (k < n) {
+        const lbfgsfloatval_t a = 1.0 - x[k];
+        const lbfgsfloatval_t b = 10.0 * (x[k+1] - x[k] * x[k]);
+
+        /* g[k+1] is stored first because g[k] is derived from it. */
+        g[k+1] = 20.0 * b;
+        g[k] = -2.0 * (x[k] * g[k+1] + a);
+        total += a * a + b * b;
+        k += 2;
+    }
+    return total;
+}
+
+/*
+ * Progress callback handed to lbfgs().
+ * Prints the iteration number k, the objective value fx, the first two
+ * variables, and the xnorm/gnorm/step values to stdout.
+ * Always returns 0; g, n, ls and instance are not used.
+ */
+static int progress(
+    void *instance,
+    const lbfgsfloatval_t *x,
+    const lbfgsfloatval_t *g,
+    const lbfgsfloatval_t fx,
+    const lbfgsfloatval_t xnorm,
+    const lbfgsfloatval_t gnorm,
+    const lbfgsfloatval_t step,
+    int n,
+    int k,
+    int ls
+    )
+{
+    printf("Iteration %d:\n", k);
+    printf("  fx = %f, x[0] = %f, x[1] = %f\n", fx, x[0], x[1]);
+    printf("  xnorm = %f, gnorm = %f, step = %f\n", xnorm, gnorm, step);
+    printf("\n");
+    return 0;
+}
+
+#define N   8
+
+/*
+ * Sample driver: minimizes the function implemented by evaluate() over N
+ * variables, starting from the point (-1.2, 1.0, -1.2, 1.0, ...), and prints
+ * the status code returned by lbfgs() together with the final objective
+ * value and the first two variables.
+ * The command-line arguments are not used; the process always exits with 0.
+ */
+int main(int argc, char *argv[])
+{
+    int i, ret = 0;
+    lbfgsfloatval_t x[N], fx;
+
+    /* Initialize the variables. */
+    for (i = 0;i < N;i += 2) {
+        x[i] = -1.2;
+        x[i+1] = 1.0;
+    }
+
+    /*
+        Start the L-BFGS optimization; this will invoke the callback functions
+        evaluate() and progress() when necessary.
+     */
+    ret = lbfgs(N, x, &fx, evaluate, progress, NULL, NULL);
+
+    /* Report the result. */
+    printf("L-BFGS optimization terminated with status code = %d\n", ret);
+    printf("  fx = %f, x[0] = %f, x[1] = %f\n", fx, x[0], x[1]);
+
+    return 0;
+}