Initial commit.
git-svn-id: file:///home/svnrepos/software/liblbfgs/trunk@2 ecf4c44f-38d1-4fa4-9757-a0b4dd0349fc
This commit is contained in:
parent
3166ac548b
commit
85c2de4384
22
COPYING
Normal file
22
COPYING
Normal file
@ -0,0 +1,22 @@
|
||||
The MIT License
|
||||
|
||||
Copyright (c) 1990 Jorge Nocedal
|
||||
Copyright (c) 2007 Naoaki Okazaki
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a
|
||||
copy of this software and associated documentation files (the "Software"),
|
||||
to deal in the Software without restriction, including without limitation
|
||||
the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
and/or sell copies of the Software, and to permit persons to whom the
|
||||
Software is furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
29
ChangeLog
Normal file
29
ChangeLog
Normal file
@ -0,0 +1,29 @@
|
||||
2008-03-05 Naoaki Okazaki <okazaki at chokkan org>
|
||||
|
||||
* CRFsuite 0.4 (the first public release):
|
||||
- Website and documentation for CRFsuite.
|
||||
- Tutorial on the CoNLL 2000 chunking shared task.
|
||||
- Performance comparison on the CoNLL 2000 chunking shared task.
|
||||
- Bug fix in L2 regularization.
|
||||
- A number of small improvements for the public release.
|
||||
|
||||
|
||||
2007-12-12 Naoaki Okazaki <okazaki at chokkan org>
|
||||
|
||||
* CRFsuite 0.3 (internal release):
|
||||
- Implemented scaling method for forward/backward algorithm.
|
||||
- Removed the code for computing the forward/backward algorithm in logarithm domain.
|
||||
|
||||
|
||||
2007-11-30 Naoaki Okazaki <okazaki at chokkan org>
|
||||
|
||||
* CRFsuite 0.2 (internal release):
|
||||
- Orthant-Wise Limited-memory Quasi-Newton (OW-LQN) method for L1 regularization.
|
||||
- Configurable L-BFGS parameters (number of limited memories, epsilon).
|
||||
|
||||
|
||||
2007-10-29 Naoaki Okazaki <okazaki at chokkan org>
|
||||
|
||||
* CRFsuite 0.1 (internal release):
|
||||
- Initial release.
|
||||
|
32
Makefile.am
Normal file
32
Makefile.am
Normal file
@ -0,0 +1,32 @@
|
||||
# $Id$
|
||||
|
||||
docdir = $(prefix)/share/doc/@PACKAGE@
|
||||
doc_DATA = README INSTALL COPYING AUTHORS ChangeLog
|
||||
|
||||
liblbfgsincludedir = $(includedir)
|
||||
liblbfgsinclude_HEADERS = \
|
||||
include/lbfgs.h
|
||||
|
||||
EXTRA_DIST = \
|
||||
autogen.sh
|
||||
|
||||
lib_LTLIBRARIES = liblbfgs.la
|
||||
noinst_PROGRAMS = lbfgssample
|
||||
|
||||
liblbfgs_la_SOURCES = \
|
||||
lib/arithmetic_ansi.h \
|
||||
lib/arithmetic_sse_double.h \
|
||||
lib/arithmetic_sse_float.h \
|
||||
lib/lbfgs.c
|
||||
|
||||
liblbfgs_la_LDFLAGS = \
|
||||
-no-undefined \
|
||||
-release @VERSION@
|
||||
|
||||
lbfgssample_SOURCES = \
|
||||
sample/sample.c
|
||||
|
||||
lbfgssample_LDADD = ./liblbfgs.la
|
||||
|
||||
AM_CFLAGS = @CFLAGS@
|
||||
INCLUDES = @INCLUDES@
|
42
README
Normal file
42
README
Normal file
@ -0,0 +1,42 @@
|
||||
|
||||
libLBFGS: C library of limited-memory BFGS (L-BFGS)
|
||||
|
||||
Copyright (c) 1990, Jorge Nocedal
|
||||
Copyright (c) 2007, Naoaki Okazaki
|
||||
|
||||
=========================================================================
|
||||
1. Introduction
|
||||
=========================================================================
|
||||
libLBFGS is a C port of the implementation of Limited-memory
|
||||
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
|
||||
The original FORTRAN source code is available at:
|
||||
http://www.ece.northwestern.edu/~nocedal/lbfgs.html
|
||||
|
||||
The L-BFGS method solves the unconstrained minimization problem:
|
||||
minimize F(x), x = (x1, x2, ..., xN),
|
||||
only if the objective function F(x) and its gradient G(x) are computable.
|
||||
|
||||
Refer to the libLBFGS web site for more information.
|
||||
http://www.chokkan.org/software/liblbfgs/
|
||||
|
||||
|
||||
|
||||
=========================================================================
|
||||
2. How to build the sample program
|
||||
=========================================================================
|
||||
[Microsoft Visual Studio 2005]
|
||||
Open the solution file "test/lbfgs.sln" and build it.
|
||||
|
||||
[GCC]
|
||||
$ cd test
|
||||
$ ./build.sh
|
||||
|
||||
|
||||
|
||||
=========================================================================
|
||||
3. License
|
||||
=========================================================================
|
||||
libLBFGS is distributed under the terms of the MIT license.
|
||||
Please refer to COPYING file in the distribution.
|
||||
|
||||
$Id$
|
38
autogen.sh
Executable file
38
autogen.sh
Executable file
@ -0,0 +1,38 @@
|
||||
#!/bin/sh
|
||||
# $Id$
|
||||
|
||||
if [ "$1" = "--force" ];
|
||||
then
|
||||
FORCE=--force
|
||||
NOFORCE=
|
||||
FORCE_MISSING=--force-missing
|
||||
else
|
||||
FORCE=
|
||||
NOFORCE=--no-force
|
||||
FORCE_MISSING=
|
||||
fi
|
||||
|
||||
# Run libtoolize, hiding its "You should ..." hints from the output.
# The output is captured before filtering because the exit status of a
# pipeline is that of its last command (sed); piping directly would mask a
# libtoolize failure and let the script continue with a broken setup.
libtoolize_output=`libtoolize --copy $FORCE 2>&1`
libtoolize_status=$?
if [ -n "$libtoolize_output" ]; then
    printf '%s\n' "$libtoolize_output" | sed '/^You should/d'
fi
if [ $libtoolize_status -ne 0 ]; then
    echo "libtoolize failed!"
    exit 1
fi
|
||||
|
||||
aclocal $FORCE || {
|
||||
echo "aclocal failed!"
|
||||
exit 1
|
||||
}
|
||||
|
||||
autoheader $FORCE || {
|
||||
echo "autoheader failed!"
|
||||
exit 1
|
||||
}
|
||||
|
||||
automake -a -c $NOFORCE || {
|
||||
echo "automake failed!"
|
||||
exit 1
|
||||
}
|
||||
|
||||
autoconf $FORCE || {
|
||||
echo "autoconf failed!"
|
||||
exit 1
|
||||
}
|
93
configure.in
Normal file
93
configure.in
Normal file
@ -0,0 +1,93 @@
|
||||
dnl $Id:$
|
||||
dnl
|
||||
dnl
|
||||
dnl Exported and configured variables:
|
||||
dnl CFLAGS
|
||||
dnl LDFLAGS
|
||||
dnl INCLUDES
|
||||
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Initialization for autoconf
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_PREREQ(2.59)
|
||||
AC_INIT
|
||||
AC_CONFIG_SRCDIR([lib/lbfgs.c])
|
||||
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Initialization for automake
|
||||
dnl ------------------------------------------------------------------
|
||||
AM_INIT_AUTOMAKE(liblbfgs, 1.4)
|
||||
AC_CONFIG_HEADERS(config.h)
|
||||
AM_MAINTAINER_MODE
|
||||
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Checks for program
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_PROG_LIBTOOL
|
||||
AC_PROG_INSTALL
|
||||
AC_PROG_LN_S
|
||||
AC_PROG_MAKE_SET
|
||||
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Initialization for variables
|
||||
dnl ------------------------------------------------------------------
|
||||
CFLAGS="${ac_save_CFLAGS}"
|
||||
LDFLAGS="${ac_save_LDFLAGS}"
|
||||
INCLUDES="-I\$(top_srcdir) -I\$(top_srcdir)/include"
|
||||
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Checks for header files.
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_HEADER_STDC
|
||||
AC_CHECK_HEADERS(fcntl.h limits.h malloc.h strings.h unistd.h stdint.h)
|
||||
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Checks for debugging mode
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_ARG_ENABLE(
|
||||
debug,
|
||||
[AS_HELP_STRING([--enable-debug],[Turn on debugging])]
|
||||
)
|
||||
|
||||
if test "x$enable_debug" = "xyes"; then
|
||||
CFLAGS="-DDEBUG -O -g ${CFLAGS}"
|
||||
else
|
||||
CFLAGS="-O3 -ffast-math ${CFLAGS}"
|
||||
fi
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Checks for profiling mode
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_ARG_ENABLE(
|
||||
profile,
|
||||
[AS_HELP_STRING([--enable-profile],[Turn on profiling])]
|
||||
)
|
||||
|
||||
if test "x$enable_profile" = "xyes"; then
|
||||
CFLAGS="-DPROFILE -pg ${CFLAGS}"
|
||||
fi
|
||||
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Checks for library functions.
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_CHECK_LIB(m, fabs)
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Export variables
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_SUBST(CFLAGS)
|
||||
AC_SUBST(LDFLAGS)
|
||||
AC_SUBST(INCLUDES)
|
||||
|
||||
dnl ------------------------------------------------------------------
|
||||
dnl Output the configure results.
|
||||
dnl ------------------------------------------------------------------
|
||||
AC_CONFIG_FILES(Makefile)
|
||||
AC_OUTPUT
|
1252
doc/doxyfile
Normal file
1252
doc/doxyfile
Normal file
File diff suppressed because it is too large
Load Diff
7
doc/footer.html
Normal file
7
doc/footer.html
Normal file
@ -0,0 +1,7 @@
|
||||
<hr/>
|
||||
<div>
|
||||
Copyright (c) 2002-2008 by Naoaki Okazaki
|
||||
<br /><i>$datetime</i>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
539
include/lbfgs.h
Normal file
539
include/lbfgs.h
Normal file
@ -0,0 +1,539 @@
|
||||
/*
|
||||
* C library of Limited memory BFGS (L-BFGS).
|
||||
*
|
||||
* Copyright (c) 1990, Jorge Nocedal
|
||||
* Copyright (c) 2007,2008, Naoaki Okazaki
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* $Id$ */
|
||||
|
||||
#ifndef __LBFGS_H__
|
||||
#define __LBFGS_H__
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif/*__cplusplus*/
|
||||
|
||||
/*
|
||||
* The default precision of floating point values is 64bit (double).
|
||||
*/
|
||||
#ifndef LBFGS_FLOAT
|
||||
#define LBFGS_FLOAT 64
|
||||
#endif/*LBFGS_FLOAT*/
|
||||
|
||||
/*
|
||||
* Activate optimization routines for IEEE754 floating point values.
|
||||
*/
|
||||
#ifndef LBFGS_IEEE_FLOAT
|
||||
#define LBFGS_IEEE_FLOAT 1
|
||||
#endif/*LBFGS_IEEE_FLOAT*/
|
||||
|
||||
#if LBFGS_FLOAT == 32
|
||||
typedef float lbfgsfloatval_t;
|
||||
|
||||
#elif LBFGS_FLOAT == 64
|
||||
typedef double lbfgsfloatval_t;
|
||||
|
||||
#else
|
||||
#error "liblbfgs supports single (float; LBFGS_FLOAT = 32) or double (double; LBFGS_FLOAT=64) precision only."
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
/**
|
||||
* \addtogroup liblbfgs_api libLBFGS API
|
||||
* @{
|
||||
*
|
||||
* The libLBFGS API.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Return values of lbfgs().
|
||||
*/
|
||||
enum {
|
||||
/** False value. */
|
||||
LBFGSFALSE = 0,
|
||||
/** True value. */
|
||||
LBFGSTRUE,
|
||||
|
||||
/** Unknown error. */
|
||||
LBFGSERR_UNKNOWNERROR = -1024,
|
||||
/** Logic error. */
|
||||
LBFGSERR_LOGICERROR,
|
||||
/** Insufficient memory. */
|
||||
LBFGSERR_OUTOFMEMORY,
|
||||
/** The minimization process has been canceled. */
|
||||
LBFGSERR_CANCELED,
|
||||
/** Invalid number of variables specified. */
|
||||
LBFGSERR_INVALID_N,
|
||||
/** Invalid number of variables (for SSE) specified. */
|
||||
LBFGSERR_INVALID_N_SSE,
|
||||
/** Invalid parameter lbfgs_parameter_t::linesearch specified. */
|
||||
LBFGSERR_INVALID_LINESEARCH,
|
||||
/** Invalid parameter lbfgs_parameter_t::min_step specified. */
|
||||
LBFGSERR_INVALID_MINSTEP,
|
||||
/** Invalid parameter lbfgs_parameter_t::max_step specified. */
|
||||
LBFGSERR_INVALID_MAXSTEP,
|
||||
/** Invalid parameter lbfgs_parameter_t::ftol specified. */
|
||||
LBFGSERR_INVALID_FTOL,
|
||||
/** Invalid parameter lbfgs_parameter_t::gtol specified. */
|
||||
LBFGSERR_INVALID_GTOL,
|
||||
/** Invalid parameter lbfgs_parameter_t::xtol specified. */
|
||||
LBFGSERR_INVALID_XTOL,
|
||||
/** Invalid parameter lbfgs_parameter_t::max_linesearch specified. */
|
||||
LBFGSERR_INVALID_MAXLINESEARCH,
|
||||
/** Invalid parameter lbfgs_parameter_t::orthantwise_c specified. */
|
||||
LBFGSERR_INVALID_ORTHANTWISE,
|
||||
/** The line-search step went out of the interval of uncertainty. */
|
||||
LBFGSERR_OUTOFINTERVAL,
|
||||
/** A logic error occurred; alternatively, the interval of uncertainty
|
||||
became too small. */
|
||||
LBFGSERR_INCORRECT_TMINMAX,
|
||||
/** A rounding error occurred; alternatively, no line-search step
|
||||
satisfies the sufficient decrease and curvature conditions. */
|
||||
LBFGSERR_ROUNDING_ERROR,
|
||||
/** The line-search step became smaller than lbfgs_parameter_t::min_step. */
|
||||
LBFGSERR_MINIMUMSTEP,
|
||||
/** The line-search step became larger than lbfgs_parameter_t::max_step. */
|
||||
LBFGSERR_MAXIMUMSTEP,
|
||||
/** The line-search routine reaches the maximum number of evaluations. */
|
||||
LBFGSERR_MAXIMUMLINESEARCH,
|
||||
/** The algorithm routine reaches the maximum number of iterations. */
|
||||
LBFGSERR_MAXIMUMITERATION,
|
||||
/** Relative width of the interval of uncertainty is at most
|
||||
lbfgs_parameter_t::xtol. */
|
||||
LBFGSERR_WIDTHTOOSMALL,
|
||||
/** A logic error (negative line-search step) occurred. */
|
||||
LBFGSERR_INVALIDPARAMETERS,
|
||||
/** The current search direction increases the objective function value. */
|
||||
LBFGSERR_INCREASEGRADIENT,
|
||||
};
|
||||
|
||||
/**
|
||||
* Line search algorithms.
|
||||
*/
|
||||
enum {
|
||||
/** The default algorithm (MoreThuente method). */
|
||||
LBFGS_LINESEARCH_DEFAULT = 0,
|
||||
/** MoreThuente method proposed by More and Thuente. */
|
||||
LBFGS_LINESEARCH_MORETHUENTE = 0,
|
||||
/** Backtracking method. */
|
||||
LBFGS_LINESEARCH_BACKTRACKING,
|
||||
};
|
||||
|
||||
/**
|
||||
* L-BFGS optimization parameters.
|
||||
* Call lbfgs_parameter_init() function to initialize parameters to the
|
||||
* default values.
|
||||
*/
|
||||
typedef struct {
|
||||
/**
|
||||
* The number of corrections to approximate the inverse hessian matrix.
|
||||
* The L-BFGS routine stores the computation results of previous \ref m
|
||||
* iterations to approximate the inverse hessian matrix of the current
|
||||
* iteration. This parameter controls the size of the limited memories
|
||||
* (corrections). The default value is \c 6. Values less than \c 3 are
|
||||
* not recommended. Large values will result in excessive computing time.
|
||||
*/
|
||||
int m;
|
||||
|
||||
/**
|
||||
* Epsilon for convergence test.
|
||||
* This parameter determines the accuracy with which the solution is to
|
||||
* be found. A minimization terminates when
|
||||
* ||g|| < \ref epsilon * max(1, ||x||),
|
||||
* where ||.|| denotes the Euclidean (L2) norm. The default value is
|
||||
* \c 1e-5.
|
||||
*/
|
||||
lbfgsfloatval_t epsilon;
|
||||
|
||||
/**
|
||||
* The maximum number of iterations.
|
||||
* The lbfgs() function terminates an optimization process with
|
||||
* ::LBFGSERR_MAXIMUMITERATION status code when the iteration count
|
||||
* exceeds this parameter. Setting this parameter to zero continues an
|
||||
* optimization process until a convergence or error. The default value
|
||||
* is \c 0.
|
||||
*/
|
||||
int max_iterations;
|
||||
|
||||
/**
|
||||
* The line search algorithm.
|
||||
* This parameter specifies a line search algorithm to be used by the
|
||||
* L-BFGS routine.
|
||||
*/
|
||||
int linesearch;
|
||||
|
||||
/**
|
||||
* The maximum number of trials for the line search.
|
||||
* This parameter controls the number of function and gradient evaluations
|
||||
* per iteration for the line search routine. The default value is \c 20.
|
||||
*/
|
||||
int max_linesearch;
|
||||
|
||||
/**
|
||||
* The minimum step of the line search routine.
|
||||
* The default value is \c 1e-20. This value need not be modified unless
|
||||
* the exponents are too large for the machine being used, or unless the
|
||||
* problem is extremely badly scaled (in which case the exponents should
|
||||
* be increased).
|
||||
*/
|
||||
lbfgsfloatval_t min_step;
|
||||
|
||||
/**
|
||||
* The maximum step of the line search.
|
||||
* The default value is \c 1e+20. This value need not be modified unless
|
||||
* the exponents are too large for the machine being used, or unless the
|
||||
* problem is extremely badly scaled (in which case the exponents should
|
||||
* be increased).
|
||||
*/
|
||||
lbfgsfloatval_t max_step;
|
||||
|
||||
/**
|
||||
* A parameter to control the accuracy of the line search routine.
|
||||
* The default value is \c 1e-4. This parameter should be greater
|
||||
* than zero and smaller than \c 0.5.
|
||||
*/
|
||||
lbfgsfloatval_t ftol;
|
||||
|
||||
/**
|
||||
* A parameter to control the accuracy of the line search routine.
|
||||
* The default value is \c 0.9. If the function and gradient
|
||||
* evaluations are inexpensive with respect to the cost of the
|
||||
* iteration (which is sometimes the case when solving very large
|
||||
* problems) it may be advantageous to set this parameter to a small
|
||||
* value. A typical small value is \c 0.1. This parameter should be
|
||||
* greater than the \ref ftol parameter (\c 1e-4) and smaller than
|
||||
* \c 1.0.
|
||||
*/
|
||||
lbfgsfloatval_t gtol;
|
||||
|
||||
/**
|
||||
* The machine precision for floating-point values.
|
||||
* This parameter must be a positive value set by a client program to
|
||||
* estimate the machine precision. The line search routine will terminate
|
||||
* with the status code (::LBFGSERR_WIDTHTOOSMALL) if the relative width
|
||||
* of the interval of uncertainty is less than this parameter.
|
||||
*/
|
||||
lbfgsfloatval_t xtol;
|
||||
|
||||
/**
|
||||
* Coefficient for the L1 norm of variables.
|
||||
* This parameter should be set to zero for standard minimization
|
||||
* problems. Setting this parameter to a positive value minimizes the
|
||||
* objective function F(x) combined with the L1 norm |x| of the variables,
|
||||
* {F(x) + C |x|}. This parameter is the coefficient for the |x|, i.e.,
|
||||
* C. As the L1 norm |x| is not differentiable at zero, the library
|
||||
* modifies function and gradient evaluations from a client program
|
||||
* suitably; a client program thus has only to return the function value
|
||||
* F(x) and gradients G(x) as usual. The default value is zero.
|
||||
*/
|
||||
lbfgsfloatval_t orthantwise_c;
|
||||
} lbfgs_parameter_t;
|
||||
|
||||
|
||||
/**
|
||||
* Callback interface to provide objective function and gradient evaluations.
|
||||
*
|
||||
* The lbfgs() function calls this function to obtain the values of objective
|
||||
* function and its gradients when needed. A client program must implement
|
||||
* this function to evaluate the values of the objective function and its
|
||||
* gradients, given current values of variables.
|
||||
*
|
||||
* @param instance The user data sent for lbfgs() function by the client.
|
||||
* @param x The current values of variables.
|
||||
* @param g The gradient vector. The callback function must compute
|
||||
* the gradient values for the current variables.
|
||||
* @param n The number of variables.
|
||||
* @param step The current step of the line search routine.
|
||||
* @retval lbfgsfloatval_t The value of the objective function for the current
|
||||
* variables.
|
||||
*/
|
||||
typedef lbfgsfloatval_t (*lbfgs_evaluate_t)(
|
||||
void *instance,
|
||||
const lbfgsfloatval_t *x,
|
||||
lbfgsfloatval_t *g,
|
||||
const int n,
|
||||
const lbfgsfloatval_t step
|
||||
);
|
||||
|
||||
/**
|
||||
* Callback interface to receive the progress of the optimization process.
|
||||
*
|
||||
* The lbfgs() function calls this function for each iteration. Implementing
|
||||
* this function, a client program can store or display the current progress
|
||||
* of the optimization process.
|
||||
*
|
||||
* @param instance The user data sent for lbfgs() function by the client.
|
||||
* @param x The current values of variables.
|
||||
* @param g The current gradient values of variables.
|
||||
* @param fx The current value of the objective function.
|
||||
* @param xnorm The Euclidean norm of the variables.
|
||||
* @param gnorm The Euclidean norm of the gradients.
|
||||
* @param step The line-search step used for this iteration.
|
||||
* @param n The number of variables.
|
||||
* @param k The iteration count.
|
||||
* @param ls The number of evaluations called for this iteration.
|
||||
* @retval int Zero to continue the optimization process. Returning a
|
||||
* non-zero value will cancel the optimization process.
|
||||
*/
|
||||
typedef int (*lbfgs_progress_t)(
|
||||
void *instance,
|
||||
const lbfgsfloatval_t *x,
|
||||
const lbfgsfloatval_t *g,
|
||||
const lbfgsfloatval_t fx,
|
||||
const lbfgsfloatval_t xnorm,
|
||||
const lbfgsfloatval_t gnorm,
|
||||
const lbfgsfloatval_t step,
|
||||
int n,
|
||||
int k,
|
||||
int ls
|
||||
);
|
||||
|
||||
/*
|
||||
A user must implement a function compatible with ::lbfgs_evaluate_t (evaluation
|
||||
callback) and pass the pointer to the callback function to lbfgs() arguments.
|
||||
Similarly, a user can implement a function compatible with ::lbfgs_progress_t
|
||||
(progress callback) to obtain the current progress (e.g., variables, function
|
||||
value, ||G||, etc) and to cancel the iteration process if necessary.
|
||||
Implementation of a progress callback is optional: a user can pass \c NULL if
|
||||
progress notification is not necessary.
|
||||
|
||||
In addition, a user must preserve two requirements:
|
||||
- The number of variables must be a multiple of 16 (not 4).
|
||||
- The memory block of variable array ::x must be aligned to 16.
|
||||
|
||||
This algorithm terminates an optimization
|
||||
when:
|
||||
|
||||
||G|| < \epsilon \cdot \max(1, ||x||) .
|
||||
|
||||
In this formula, ||.|| denotes the Euclidean norm.
|
||||
*/
|
||||
|
||||
/**
|
||||
* Start a L-BFGS optimization.
|
||||
*
|
||||
* @param n The number of variables.
|
||||
* @param x The array of variables. A client program can set
|
||||
* default values for the optimization and receive the
|
||||
* optimization result through this array.
|
||||
* @param ptr_fx The pointer to the variable that receives the final
|
||||
* value of the objective function for the variables.
|
||||
* This argument can be set to \c NULL if the final
|
||||
* value of the objective function is unnecessary.
|
||||
* @param proc_evaluate The callback function to provide function and
|
||||
* gradient evaluations given the current values of
|
||||
* variables. A client program must implement a
|
||||
* callback function compatible with \ref
|
||||
* lbfgs_evaluate_t and pass the pointer to the
|
||||
* callback function.
|
||||
* @param proc_progress The callback function to receive the progress
|
||||
* (the number of iterations, the current value of
|
||||
* the objective function) of the minimization
|
||||
* process. This argument can be set to \c NULL if
|
||||
* a progress report is unnecessary.
|
||||
* @param instance A user data for the client program. The callback
|
||||
* functions will receive the value of this argument.
|
||||
* @param param The pointer to a structure representing parameters for
|
||||
* L-BFGS optimization. A client program can set this
|
||||
* parameter to \c NULL to use the default parameters.
|
||||
* Call lbfgs_parameter_init() function to fill a
|
||||
* structure with the default values.
|
||||
* @retval int The status code. This function returns zero if the
|
||||
* minimization process terminates without an error. A
|
||||
* non-zero value indicates an error.
|
||||
*/
|
||||
int lbfgs(
|
||||
const int n,
|
||||
lbfgsfloatval_t *x,
|
||||
lbfgsfloatval_t *ptr_fx,
|
||||
lbfgs_evaluate_t proc_evaluate,
|
||||
lbfgs_progress_t proc_progress,
|
||||
void *instance,
|
||||
lbfgs_parameter_t *param
|
||||
);
|
||||
|
||||
/**
|
||||
* Initialize L-BFGS parameters to the default values.
|
||||
*
|
||||
* Call this function to fill a parameter structure with the default values
|
||||
* and overwrite parameter values if necessary.
|
||||
*
|
||||
* @param param The pointer to the parameter structure.
|
||||
*/
|
||||
void lbfgs_parameter_init(lbfgs_parameter_t *param);
|
||||
|
||||
/** @} */
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif/*__cplusplus*/
|
||||
|
||||
|
||||
|
||||
/**
|
||||
@mainpage A library of Limited-memory Broyden-Fletcher-Goldfarb-Shanno (L-BFGS)
|
||||
|
||||
@section intro Introduction
|
||||
|
||||
This library is a C port of the implementation of Limited-memory
|
||||
Broyden-Fletcher-Goldfarb-Shanno (L-BFGS) method written by Jorge Nocedal.
|
||||
The original FORTRAN source code is available at:
|
||||
http://www.ece.northwestern.edu/~nocedal/lbfgs.html
|
||||
|
||||
The L-BFGS method solves the unconstrained minimization problem,
|
||||
|
||||
<pre>
|
||||
minimize F(x), x = (x1, x2, ..., xN),
|
||||
</pre>
|
||||
|
||||
only if the objective function F(x) and its gradient G(x) are computable. The
|
||||
well-known Newton's method requires computation of the inverse of the hessian
|
||||
matrix of the objective function. However, the computational cost for the
|
||||
inverse hessian matrix is expensive especially when the objective function
|
||||
takes a large number of variables. The L-BFGS method iteratively finds a
|
||||
minimizer by approximating the inverse hessian matrix by information from last
|
||||
m iterations. This innovation saves the memory storage and computational time
|
||||
drastically for large-scale problems.
|
||||
|
||||
Among the various ports of L-BFGS, this library provides several features:
|
||||
- <b>Optimization with L1-norm (orthant-wise L-BFGS)</b>:
|
||||
In addition to standard minimization problems, the library can minimize
|
||||
a function F(x) combined with L1-norm |x| of the variables,
|
||||
{F(x) + C |x|}, where C is a constant scalar parameter. This feature is
|
||||
useful for estimating parameters of log-linear models (e.g., logistic
|
||||
regression and maximum entropy) with L1-regularization.
|
||||
- <b>Clean C code</b>:
|
||||
Unlike C codes generated automatically by f2c (Fortran 77 into C converter),
|
||||
this port includes changes based on my interpretations, improvements,
|
||||
optimizations, and clean-ups so that the ported code would be well-suited
|
||||
for a C code. In addition to comments inherited from the original code,
|
||||
a number of comments were added through my interpretations.
|
||||
- <b>Callback interface</b>:
|
||||
The library receives function and gradient values via a callback interface.
|
||||
The library also notifies the progress of the optimization by invoking a
|
||||
callback function. In the original implementation, a user had to set
|
||||
function and gradient values every time the function returns for obtaining
|
||||
updated values.
|
||||
- <b>Thread safe</b>:
|
||||
The library is thread-safe, which is the secondary gain from the callback
|
||||
interface.
|
||||
- <b>Cross platform.</b> The source code can be compiled on Microsoft Visual
|
||||
Studio 2005, GNU C Compiler (gcc), etc.
|
||||
- <b>Configurable precision</b>: A user can choose single-precision (float)
|
||||
or double-precision (double) accuracy by changing ::LBFGS_FLOAT macro.
|
||||
- <b>SSE/SSE2 optimization</b>:
|
||||
This library includes SSE/SSE2 optimization (written in compiler intrinsics)
|
||||
for vector arithmetic operations on Intel/AMD processors. The library uses
|
||||
SSE for float values and SSE2 for double values. The SSE/SSE2 optimization
|
||||
routine is disabled by default; compile the library with __SSE__ symbol
|
||||
defined to activate the optimization routine.
|
||||
|
||||
This library is used by:
|
||||
- <a href="http://www.chokkan.org/software/crfsuite/">CRFsuite: A fast implementation of Conditional Random Fields (CRFs)</a>
|
||||
- <a href="http://www.public.iastate.edu/~gdancik/mlegp/">mlegp: an R package for maximum likelihood estimates for Gaussian processes</a>
|
||||
|
||||
@section download Download
|
||||
|
||||
- <a href="http://www.chokkan.org/software/dist/liblbfgs-1.3.tar.gz">Source code</a>
|
||||
|
||||
libLBFGS is distributed under the terms of the
|
||||
<a href="http://opensource.org/licenses/mit-license.php">MIT license</a>.
|
||||
|
||||
@section changelog History
|
||||
- Version 1.4 (2008-04-25):
|
||||
- Configurable line search algorithms. A member variable
|
||||
::lbfgs_parameter_t::linesearch was added to choose either MoreThuente
|
||||
method (::LBFGS_LINESEARCH_MORETHUENTE) or backtracking algorithm
|
||||
(::LBFGS_LINESEARCH_BACKTRACKING).
|
||||
- Fixed a serious bug: the previous version did not compute
|
||||
pseudo-gradients properly in the line search routine. This bug might
|
||||
quit an iteration process too early when the orthant-wise L-BFGS routine
|
||||
was activated (0 < ::lbfgs_parameter_t::orthantwise_c).
|
||||
- Version 1.3 (2007-12-16):
|
||||
- An API change. An argument was added to lbfgs() function to receive the
|
||||
final value of the objective function. This argument can be set to
|
||||
\c NULL if the final value is unnecessary.
|
||||
- Fixed a null-pointer bug in the sample code (reported by Takashi Imamichi).
|
||||
- Added build scripts for Microsoft Visual Studio 2005 and GCC.
|
||||
- Added README file.
|
||||
- Version 1.2 (2007-12-13):
|
||||
- Fixed a serious bug in orthant-wise L-BFGS.
|
||||
An important variable was used without initialization.
|
||||
- Version 1.1 (2007-12-01):
|
||||
- Implemented orthant-wise L-BFGS.
|
||||
- Implemented lbfgs_parameter_init() function.
|
||||
- Fixed several bugs.
|
||||
- API documentation.
|
||||
- Version 1.0 (2007-09-20):
|
||||
- Initial release.
|
||||
|
||||
@section api Documentation
|
||||
|
||||
- @ref liblbfgs_api "libLBFGS API"
|
||||
|
||||
@section sample Sample code
|
||||
|
||||
@include sample.c
|
||||
|
||||
@section ack Acknowledgements
|
||||
|
||||
The L-BFGS algorithm is described in:
|
||||
- Jorge Nocedal.
|
||||
Updating Quasi-Newton Matrices with Limited Storage.
|
||||
<i>Mathematics of Computation</i>, Vol. 35, No. 151, pp. 773--782, 1980.
|
||||
- Dong C. Liu and Jorge Nocedal.
|
||||
On the limited memory BFGS method for large scale optimization.
|
||||
<i>Mathematical Programming</i> B, Vol. 45, No. 3, pp. 503-528, 1989.
|
||||
|
||||
The line search algorithms used in this implementation are described in:
|
||||
- John E. Dennis and Robert B. Schnabel.
|
||||
<i>Numerical Methods for Unconstrained Optimization and Nonlinear
|
||||
Equations</i>, Englewood Cliffs, 1983.
|
||||
- Jorge J. More and David J. Thuente.
|
||||
Line search algorithm with guaranteed sufficient decrease.
|
||||
<i>ACM Transactions on Mathematical Software (TOMS)</i>, Vol. 20, No. 3,
|
||||
pp. 286-307, 1994.
|
||||
|
||||
This library also implements Orthant-Wise Limited-memory Quasi-Newton (OW-LQN)
|
||||
method presented in:
|
||||
- Galen Andrew and Jianfeng Gao.
|
||||
Scalable training of L1-regularized log-linear models.
|
||||
In <i>Proceedings of the 24th International Conference on Machine
|
||||
Learning (ICML 2007)</i>, pp. 33-40, 2007.
|
||||
|
||||
Finally I would like to thank the original author, Jorge Nocedal, who has been
|
||||
distributing the efficient and explanatory implementation under an open source
|
||||
licence.
|
||||
|
||||
@section reference Reference
|
||||
|
||||
- <a href="http://www.ece.northwestern.edu/~nocedal/lbfgs.html">L-BFGS</a> by Jorge Nocedal.
|
||||
- <a href="http://research.microsoft.com/research/downloads/Details/3f1840b2-dbb3-45e5-91b0-5ecd94bb73cf/Details.aspx">OWL-QN</a> by Galen Andrew.
|
||||
- <a href="http://chasen.org/~taku/software/misc/lbfgs/">C port (via f2c)</a> by Taku Kudo.
|
||||
- <a href="http://www.alglib.net/optimization/lbfgs.php">C#/C++/Delphi/VisualBasic6 port</a> in ALGLIB.
|
||||
- <a href="http://cctbx.sourceforge.net/">Computational Crystallography Toolbox</a> includes
|
||||
<a href="http://cctbx.sourceforge.net/current_cvs/c_plus_plus/namespacescitbx_1_1lbfgs.html">scitbx::lbfgs</a>.
|
||||
*/
|
||||
|
||||
#endif/*__LBFGS_H__*/
|
133
lib/arithmetic_ansi.h
Normal file
133
lib/arithmetic_ansi.h
Normal file
@ -0,0 +1,133 @@
|
||||
/*
|
||||
* ANSI C implementation of vector operations.
|
||||
*
|
||||
* Copyright (c) 2007,2008, Naoaki Okazaki
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* $Id$ */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <memory.h>
|
||||
|
||||
/*
 * fsigndiff(x, y): nonzero when the values pointed to by x and y have
 * different signs.
 *
 * The 32-bit IEEE variant XORs the raw bit patterns of the two values and
 * tests the sign bit.
 *
 * The generic variant compares each value against zero.  It needs neither
 * fabs() nor a division, so it stays well defined when *y is zero (the result
 * is 0) or infinite; a zero or NaN operand never counts as a sign difference.
 * Both arguments may be evaluated more than once.
 */
#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) \
    ((*(x) < 0. && 0. < *(y)) || (0. < *(x) && *(y) < 0.))
#endif/*LBFGS_IEEE_FLOAT*/
|
||||
|
||||
/*
 * Allocate size bytes with malloc() and clear them to zero.
 * Returns the block, or NULL when malloc() fails.
 */
inline static void* vecalloc(size_t size)
{
    void *block = malloc(size);

    if (block == NULL) {
        return NULL;
    }
    memset(block, 0, size);
    return block;
}
|
||||
|
||||
/*
 * Release a block by handing it to free().
 */
inline static void vecfree(void *memblock)
{
    free(memblock);
}
|
||||
|
||||
/*
 * Set x[0] .. x[n-1] to the constant c.  Nothing is written when n <= 0.
 */
inline static void vecset(lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
{
    lbfgsfloatval_t *dst = x;
    int left;

    for (left = n;0 < left;--left) {
        *dst++ = c;
    }
}
|
||||
|
||||
/*
 * Copy x[0] .. x[n-1] into y, element by element in ascending order.
 */
inline static void veccpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
{
    lbfgsfloatval_t *dst = y;
    const lbfgsfloatval_t *src = x;
    int left;

    for (left = n;0 < left;--left) {
        *dst++ = *src++;
    }
}
|
||||
|
||||
/*
 * Negated copy: y[i] = -x[i] for i = 0 .. n-1, in ascending order.
 */
inline static void vecncpy(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
{
    lbfgsfloatval_t *dst = y;
    const lbfgsfloatval_t *src = x;
    int left;

    for (left = n;0 < left;--left) {
        *dst++ = -*src++;
    }
}
|
||||
|
||||
/*
 * Scaled accumulation: y[i] += c * x[i] for i = 0 .. n-1.
 */
inline static void vecadd(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const lbfgsfloatval_t c, const int n)
{
    lbfgsfloatval_t *dst = y;
    const lbfgsfloatval_t *src = x;
    int left;

    for (left = n;0 < left;--left) {
        *dst += c * *src;
        ++dst;
        ++src;
    }
}
|
||||
|
||||
/*
 * Element-wise difference: z[i] = x[i] - y[i] for i = 0 .. n-1.
 */
inline static void vecdiff(lbfgsfloatval_t *z, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
{
    int k = 0;

    while (k < n) {
        z[k] = x[k] - y[k];
        ++k;
    }
}
|
||||
|
||||
/*
 * Scale in place: y[i] *= c for i = 0 .. n-1.
 */
inline static void vecscale(lbfgsfloatval_t *y, const lbfgsfloatval_t c, const int n)
{
    lbfgsfloatval_t *dst = y;
    int left;

    for (left = n;0 < left;--left) {
        *dst *= c;
        ++dst;
    }
}
|
||||
|
||||
/*
 * Element-wise product in place: y[i] *= x[i] for i = 0 .. n-1.
 */
inline static void vecmul(lbfgsfloatval_t *y, const lbfgsfloatval_t *x, const int n)
{
    lbfgsfloatval_t *dst = y;
    const lbfgsfloatval_t *src = x;
    int left;

    for (left = n;0 < left;--left) {
        *dst *= *src;
        ++dst;
        ++src;
    }
}
|
||||
|
||||
/*
 * Dot product: *s = x[0]*y[0] + ... + x[n-1]*y[n-1].
 * *s is cleared first and then accumulated in place, so it is 0 when n <= 0.
 */
inline static void vecdot(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const lbfgsfloatval_t *y, const int n)
{
    const lbfgsfloatval_t *a = x;
    const lbfgsfloatval_t *b = y;
    int left;

    *s = 0.;
    for (left = n;0 < left;--left) {
        *s += *a * *b;
        ++a;
        ++b;
    }
}
|
||||
|
||||
/*
 * Euclidean norm: *s = sqrt(x[0]^2 + ... + x[n-1]^2).
 * The sum of squares is accumulated directly in *s before the square root
 * is taken.
 */
inline static void vecnorm(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    int k;

    *s = 0.;
    for (k = 0;k < n;++k) {
        *s += x[k] * x[k];
    }
    *s = (lbfgsfloatval_t)sqrt(*s);
}
|
||||
|
||||
/*
 * Reciprocal of the Euclidean norm: *s = 1 / ||x||.
 * The norm comes from vecnorm(); no check for a zero norm is made here.
 */
inline static void vecrnorm(lbfgsfloatval_t* s, const lbfgsfloatval_t *x, const int n)
{
    lbfgsfloatval_t norm;

    vecnorm(s, x, n);
    norm = *s;
    *s = (lbfgsfloatval_t)(1.0 / norm);
}
|
275
lib/arithmetic_sse_double.h
Normal file
275
lib/arithmetic_sse_double.h
Normal file
@ -0,0 +1,275 @@
|
||||
/*
|
||||
* SSE2 implementation of vector operations (64bit double).
|
||||
*
|
||||
* Copyright (c) 2007,2008, Naoaki Okazaki
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* $Id$ */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <malloc.h>
|
||||
#include <memory.h>
|
||||
|
||||
#if 1400 <= _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
/*
 * Allocate size bytes on a 16-byte boundary with _aligned_malloc() and clear
 * them to zero.  Returns the block, or NULL when the allocation fails.
 */
inline static void* vecalloc(size_t size)
{
    void *block = _aligned_malloc(size, 16);

    if (block == NULL) {
        return NULL;
    }
    memset(block, 0, size);
    return block;
}
|
||||
|
||||
/*
 * Release a block by handing it to _aligned_free().
 */
inline static void vecfree(void *memblock)
{
    _aligned_free(memblock);
}
|
||||
|
||||
/*
 * fsigndiff(x, y): nonzero when the doubles pointed to by x and y have
 * different sign bits.  _mm_movemask_pd() packs the two sign bits into bits
 * 0 and 1 of an int; after adding 1, bit 1 is set only for the mask values
 * 1 and 2, i.e. when exactly one of the two sign bits is set.
 */
#define fsigndiff(x, y) \
|
||||
((_mm_movemask_pd(_mm_set_pd(*(x), *(y))) + 1) & 0x002)
|
||||
|
||||
/*
 * vecset(x, c, n): store the double c into x, eight elements (four aligned
 * 2-double stores) per loop iteration.  _mm_store_pd() requires 16-byte
 * aligned addresses.  Whole groups of 8 are always written, so indices up to
 * the next multiple of 8 at or above n are touched -- presumably callers pad
 * their buffers accordingly; verify against the call sites.
 */
#define vecset(x, c, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128d XMM0 = _mm_set1_pd(c); \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
_mm_store_pd((x)+i , XMM0); \
|
||||
_mm_store_pd((x)+i+2, XMM0); \
|
||||
_mm_store_pd((x)+i+4, XMM0); \
|
||||
_mm_store_pd((x)+i+6, XMM0); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * veccpy(y, x, n): copy doubles from x to y, eight elements per loop
 * iteration, using aligned loads and stores (both pointers must be 16-byte
 * aligned).  Whole groups of 8 are processed, so indices up to the next
 * multiple of 8 at or above n are read and written -- presumably the buffers
 * are padded; verify against the call sites.
 */
#define veccpy(y, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
__m128d XMM0 = _mm_load_pd((x)+i ); \
|
||||
__m128d XMM1 = _mm_load_pd((x)+i+2); \
|
||||
__m128d XMM2 = _mm_load_pd((x)+i+4); \
|
||||
__m128d XMM3 = _mm_load_pd((x)+i+6); \
|
||||
_mm_store_pd((y)+i , XMM0); \
|
||||
_mm_store_pd((y)+i+2, XMM1); \
|
||||
_mm_store_pd((y)+i+4, XMM2); \
|
||||
_mm_store_pd((y)+i+6, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecncpy(y, x, n): negated copy, computed as y = 0 - x, eight doubles per
 * loop iteration with aligned loads and stores (both pointers must be
 * 16-byte aligned).  Whole groups of 8 are processed, so indices up to the
 * next multiple of 8 at or above n are touched -- presumably the buffers are
 * padded; verify against the call sites.
 */
#define vecncpy(y, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
__m128d XMM0 = _mm_setzero_pd(); \
|
||||
__m128d XMM1 = _mm_setzero_pd(); \
|
||||
__m128d XMM2 = _mm_setzero_pd(); \
|
||||
__m128d XMM3 = _mm_setzero_pd(); \
|
||||
__m128d XMM4 = _mm_load_pd((x)+i ); \
|
||||
__m128d XMM5 = _mm_load_pd((x)+i+2); \
|
||||
__m128d XMM6 = _mm_load_pd((x)+i+4); \
|
||||
__m128d XMM7 = _mm_load_pd((x)+i+6); \
|
||||
XMM0 = _mm_sub_pd(XMM0, XMM4); \
|
||||
XMM1 = _mm_sub_pd(XMM1, XMM5); \
|
||||
XMM2 = _mm_sub_pd(XMM2, XMM6); \
|
||||
XMM3 = _mm_sub_pd(XMM3, XMM7); \
|
||||
_mm_store_pd((y)+i , XMM0); \
|
||||
_mm_store_pd((y)+i+2, XMM1); \
|
||||
_mm_store_pd((y)+i+4, XMM2); \
|
||||
_mm_store_pd((y)+i+6, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecadd(y, x, c, n): y += c * x on doubles, four elements per loop
 * iteration, using aligned loads and stores (both pointers must be 16-byte
 * aligned).  Whole groups of 4 are processed, so indices up to the next
 * multiple of 4 at or above n are touched -- presumably the buffers are
 * padded; verify against the call sites.
 */
#define vecadd(y, x, c, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128d XMM7 = _mm_set1_pd(c); \
|
||||
for (i = 0;i < (n);i += 4) { \
|
||||
__m128d XMM0 = _mm_load_pd((x)+i ); \
|
||||
__m128d XMM1 = _mm_load_pd((x)+i+2); \
|
||||
__m128d XMM2 = _mm_load_pd((y)+i ); \
|
||||
__m128d XMM3 = _mm_load_pd((y)+i+2); \
|
||||
XMM0 = _mm_mul_pd(XMM0, XMM7); \
|
||||
XMM1 = _mm_mul_pd(XMM1, XMM7); \
|
||||
XMM2 = _mm_add_pd(XMM2, XMM0); \
|
||||
XMM3 = _mm_add_pd(XMM3, XMM1); \
|
||||
_mm_store_pd((y)+i , XMM2); \
|
||||
_mm_store_pd((y)+i+2, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecdiff(z, x, y, n): z = x - y on doubles, eight elements per loop
 * iteration, using aligned loads and stores (all three pointers must be
 * 16-byte aligned).  Whole groups of 8 are processed, so indices up to the
 * next multiple of 8 at or above n are touched -- presumably the buffers are
 * padded; verify against the call sites.
 */
#define vecdiff(z, x, y, n) \
|
||||
{ \
|
||||
int i; \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
__m128d XMM0 = _mm_load_pd((x)+i ); \
|
||||
__m128d XMM1 = _mm_load_pd((x)+i+2); \
|
||||
__m128d XMM2 = _mm_load_pd((x)+i+4); \
|
||||
__m128d XMM3 = _mm_load_pd((x)+i+6); \
|
||||
__m128d XMM4 = _mm_load_pd((y)+i ); \
|
||||
__m128d XMM5 = _mm_load_pd((y)+i+2); \
|
||||
__m128d XMM6 = _mm_load_pd((y)+i+4); \
|
||||
__m128d XMM7 = _mm_load_pd((y)+i+6); \
|
||||
XMM0 = _mm_sub_pd(XMM0, XMM4); \
|
||||
XMM1 = _mm_sub_pd(XMM1, XMM5); \
|
||||
XMM2 = _mm_sub_pd(XMM2, XMM6); \
|
||||
XMM3 = _mm_sub_pd(XMM3, XMM7); \
|
||||
_mm_store_pd((z)+i , XMM0); \
|
||||
_mm_store_pd((z)+i+2, XMM1); \
|
||||
_mm_store_pd((z)+i+4, XMM2); \
|
||||
_mm_store_pd((z)+i+6, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecscale(y, c, n): y *= c on doubles, four elements per loop iteration,
 * using aligned loads and stores (y must be 16-byte aligned).  Whole groups
 * of 4 are processed, so indices up to the next multiple of 4 at or above n
 * are touched -- presumably the buffer is padded; verify against the call
 * sites.
 */
#define vecscale(y, c, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128d XMM7 = _mm_set1_pd(c); \
|
||||
for (i = 0;i < (n);i += 4) { \
|
||||
__m128d XMM0 = _mm_load_pd((y)+i ); \
|
||||
__m128d XMM1 = _mm_load_pd((y)+i+2); \
|
||||
XMM0 = _mm_mul_pd(XMM0, XMM7); \
|
||||
XMM1 = _mm_mul_pd(XMM1, XMM7); \
|
||||
_mm_store_pd((y)+i , XMM0); \
|
||||
_mm_store_pd((y)+i+2, XMM1); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecmul(y, x, n): element-wise y *= x on doubles, eight elements per loop
 * iteration, using aligned loads and stores (both pointers must be 16-byte
 * aligned).  Whole groups of 8 are processed, so indices up to the next
 * multiple of 8 at or above n are touched -- presumably the buffers are
 * padded; verify against the call sites.
 */
#define vecmul(y, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
__m128d XMM0 = _mm_load_pd((x)+i ); \
|
||||
__m128d XMM1 = _mm_load_pd((x)+i+2); \
|
||||
__m128d XMM2 = _mm_load_pd((x)+i+4); \
|
||||
__m128d XMM3 = _mm_load_pd((x)+i+6); \
|
||||
__m128d XMM4 = _mm_load_pd((y)+i ); \
|
||||
__m128d XMM5 = _mm_load_pd((y)+i+2); \
|
||||
__m128d XMM6 = _mm_load_pd((y)+i+4); \
|
||||
__m128d XMM7 = _mm_load_pd((y)+i+6); \
|
||||
XMM4 = _mm_mul_pd(XMM4, XMM0); \
|
||||
XMM5 = _mm_mul_pd(XMM5, XMM1); \
|
||||
XMM6 = _mm_mul_pd(XMM6, XMM2); \
|
||||
XMM7 = _mm_mul_pd(XMM7, XMM3); \
|
||||
_mm_store_pd((y)+i , XMM4); \
|
||||
_mm_store_pd((y)+i+2, XMM5); \
|
||||
_mm_store_pd((y)+i+4, XMM6); \
|
||||
_mm_store_pd((y)+i+6, XMM7); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
 * __horizontal_sum(r, rw): add up the four float lanes of the __m128 value r
 * and leave the total in r; rw is a scratch __m128 that only the
 * shuffle-based variant writes.  The macro expands to several statements
 * that are not wrapped in braces.
 *
 * NOTE(review): it works on single-precision registers (_ps intrinsics) and
 * no macro in the visible part of this double-precision header refers to it.
 * NOTE(review): GCC and Clang define __SSE__ as 1 and announce SSE3 through
 * __SSE3__, so "3 <= __SSE__" presumably never selects the haddps variant
 * with those compilers -- confirm the intended feature test.
 */
#if 3 <= __SSE__
|
||||
/*
|
||||
Horizontal add with haddps SSE3 instruction. The work register (rw)
|
||||
is unused.
|
||||
*/
|
||||
#define __horizontal_sum(r, rw) \
|
||||
r = _mm_hadd_ps(r, r); \
|
||||
r = _mm_hadd_ps(r, r);
|
||||
|
||||
#else
|
||||
/*
|
||||
Horizontal add with SSE instruction. The work register (rw) is used.
|
||||
*/
|
||||
#define __horizontal_sum(r, rw) \
|
||||
rw = r; \
|
||||
r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
r = _mm_add_ps(r, rw); \
|
||||
rw = r; \
|
||||
r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
|
||||
r = _mm_add_ps(r, rw);
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * vecdot(s, x, y, n): dot product of the double vectors x and y, stored
 * through the pointer s.  Four elements are consumed per loop iteration into
 * two independent 2-lane accumulators (XMM0, XMM1); after the loop the
 * accumulators are added, the high lane is folded onto the low lane with a
 * shuffle, and the low lane is written to *s.  Loads are aligned, so x and y
 * must be 16-byte aligned; indices up to the next multiple of 4 at or above
 * n are read -- presumably the padding holds zeros; verify against the call
 * sites.
 */
#define vecdot(s, x, y, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128d XMM0 = _mm_setzero_pd(); \
|
||||
__m128d XMM1 = _mm_setzero_pd(); \
|
||||
__m128d XMM2, XMM3, XMM4, XMM5; \
|
||||
for (i = 0;i < (n);i += 4) { \
|
||||
XMM2 = _mm_load_pd((x)+i ); \
|
||||
XMM3 = _mm_load_pd((x)+i+2); \
|
||||
XMM4 = _mm_load_pd((y)+i ); \
|
||||
XMM5 = _mm_load_pd((y)+i+2); \
|
||||
XMM2 = _mm_mul_pd(XMM2, XMM4); \
|
||||
XMM3 = _mm_mul_pd(XMM3, XMM5); \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM2); \
|
||||
XMM1 = _mm_add_pd(XMM1, XMM3); \
|
||||
} \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM1); \
|
||||
XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM1); \
|
||||
_mm_store_sd((s), XMM0); \
|
||||
}
|
||||
|
||||
/*
 * vecnorm(s, x, n): Euclidean norm of the double vector x, stored through
 * the pointer s.  The sum of squares is built four elements per loop
 * iteration in two 2-lane accumulators, reduced to the low lane, and passed
 * through _mm_sqrt_pd() before the low lane is written to *s.  Loads are
 * aligned, so x must be 16-byte aligned; indices up to the next multiple of
 * 4 at or above n are read -- presumably the padding holds zeros; verify
 * against the call sites.
 */
#define vecnorm(s, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128d XMM0 = _mm_setzero_pd(); \
|
||||
__m128d XMM1 = _mm_setzero_pd(); \
|
||||
__m128d XMM2, XMM3, XMM4, XMM5; \
|
||||
for (i = 0;i < (n);i += 4) { \
|
||||
XMM2 = _mm_load_pd((x)+i ); \
|
||||
XMM3 = _mm_load_pd((x)+i+2); \
|
||||
XMM4 = XMM2; \
|
||||
XMM5 = XMM3; \
|
||||
XMM2 = _mm_mul_pd(XMM2, XMM4); \
|
||||
XMM3 = _mm_mul_pd(XMM3, XMM5); \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM2); \
|
||||
XMM1 = _mm_add_pd(XMM1, XMM3); \
|
||||
} \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM1); \
|
||||
XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM1); \
|
||||
XMM0 = _mm_sqrt_pd(XMM0); \
|
||||
_mm_store_sd((s), XMM0); \
|
||||
}
|
||||
|
||||
|
||||
/*
 * vecrnorm(s, x, n): reciprocal of the Euclidean norm of the double vector
 * x, stored through the pointer s.  The sum of squares is built exactly as
 * in vecnorm (four elements per iteration, two accumulators), then
 * 1.0 / sqrt(sum) is computed with _mm_sqrt_pd() and _mm_div_pd() and the
 * low lane is written to *s.  No check for a zero norm is made.  Loads are
 * aligned, so x must be 16-byte aligned.
 */
#define vecrnorm(s, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128d XMM0 = _mm_setzero_pd(); \
|
||||
__m128d XMM1 = _mm_setzero_pd(); \
|
||||
__m128d XMM2, XMM3, XMM4, XMM5; \
|
||||
for (i = 0;i < (n);i += 4) { \
|
||||
XMM2 = _mm_load_pd((x)+i ); \
|
||||
XMM3 = _mm_load_pd((x)+i+2); \
|
||||
XMM4 = XMM2; \
|
||||
XMM5 = XMM3; \
|
||||
XMM2 = _mm_mul_pd(XMM2, XMM4); \
|
||||
XMM3 = _mm_mul_pd(XMM3, XMM5); \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM2); \
|
||||
XMM1 = _mm_add_pd(XMM1, XMM3); \
|
||||
} \
|
||||
XMM2 = _mm_set1_pd(1.0); \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM1); \
|
||||
XMM1 = _mm_shuffle_pd(XMM0, XMM0, _MM_SHUFFLE2(1, 1)); \
|
||||
XMM0 = _mm_add_pd(XMM0, XMM1); \
|
||||
XMM0 = _mm_sqrt_pd(XMM0); \
|
||||
XMM2 = _mm_div_pd(XMM2, XMM0); \
|
||||
_mm_store_sd((s), XMM2); \
|
||||
}
|
283
lib/arithmetic_sse_float.h
Normal file
283
lib/arithmetic_sse_float.h
Normal file
@ -0,0 +1,283 @@
|
||||
/*
|
||||
* SSE/SSE3 implementation of vector operations (32bit float).
|
||||
*
|
||||
* Copyright (c) 2007,2008, Naoaki Okazaki
|
||||
* All rights reserved.
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
* of this software and associated documentation files (the "Software"), to deal
|
||||
* in the Software without restriction, including without limitation the rights
|
||||
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
* copies of the Software, and to permit persons to whom the Software is
|
||||
* furnished to do so, subject to the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be included in
|
||||
* all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
* THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/* $Id$ */
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <malloc.h>
|
||||
#include <memory.h>
|
||||
|
||||
#if 1400 <= _MSC_VER
|
||||
#include <intrin.h>
|
||||
#endif
|
||||
|
||||
/*
 * fsigndiff(x, y): nonzero when the values pointed to by x and y have
 * different signs.
 *
 * The 32-bit IEEE variant XORs the raw bit patterns of the two values and
 * tests the sign bit.
 *
 * The generic variant compares each value against zero.  It needs neither
 * fabs() nor a division, so it stays well defined when *y is zero (the result
 * is 0) or infinite; a zero or NaN operand never counts as a sign difference.
 * Both arguments may be evaluated more than once.
 */
#if LBFGS_FLOAT == 32 && LBFGS_IEEE_FLOAT
#define fsigndiff(x, y) (((*(uint32_t*)(x)) ^ (*(uint32_t*)(y))) & 0x80000000U)
#else
#define fsigndiff(x, y) \
    ((*(x) < 0. && 0. < *(y)) || (0. < *(x) && *(y) < 0.))
#endif/*LBFGS_IEEE_FLOAT*/
|
||||
|
||||
/*
 * Allocate size bytes on a 16-byte boundary with _aligned_malloc() and clear
 * them to zero.  Returns the block, or NULL when the allocation fails.
 */
inline static void* vecalloc(size_t size)
{
    void *block = _aligned_malloc(size, 16);

    if (block == NULL) {
        return NULL;
    }
    memset(block, 0, size);
    return block;
}
|
||||
|
||||
/*
 * Release a block by handing it to _aligned_free().
 */
inline static void vecfree(void *memblock)
{
    _aligned_free(memblock);
}
|
||||
|
||||
/*
 * vecset(x, c, n): store the float c into x, sixteen elements (four aligned
 * 4-float stores) per loop iteration.  _mm_store_ps() requires 16-byte
 * aligned addresses.  Whole groups of 16 are always written, so indices up
 * to the next multiple of 16 at or above n are touched -- presumably callers
 * pad their buffers accordingly; verify against the call sites.
 */
#define vecset(x, c, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128 XMM0 = _mm_set_ps1(c); \
|
||||
for (i = 0;i < (n);i += 16) { \
|
||||
_mm_store_ps((x)+i , XMM0); \
|
||||
_mm_store_ps((x)+i+ 4, XMM0); \
|
||||
_mm_store_ps((x)+i+ 8, XMM0); \
|
||||
_mm_store_ps((x)+i+12, XMM0); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * veccpy(y, x, n): copy floats from x to y, sixteen elements per loop
 * iteration, using aligned loads and stores (both pointers must be 16-byte
 * aligned).  Whole groups of 16 are processed, so indices up to the next
 * multiple of 16 at or above n are read and written -- presumably the
 * buffers are padded; verify against the call sites.
 */
#define veccpy(y, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
for (i = 0;i < (n);i += 16) { \
|
||||
__m128 XMM0 = _mm_load_ps((x)+i ); \
|
||||
__m128 XMM1 = _mm_load_ps((x)+i+ 4); \
|
||||
__m128 XMM2 = _mm_load_ps((x)+i+ 8); \
|
||||
__m128 XMM3 = _mm_load_ps((x)+i+12); \
|
||||
_mm_store_ps((y)+i , XMM0); \
|
||||
_mm_store_ps((y)+i+ 4, XMM1); \
|
||||
_mm_store_ps((y)+i+ 8, XMM2); \
|
||||
_mm_store_ps((y)+i+12, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecncpy(y, x, n): negated copy of floats from x to y, sixteen elements per
 * loop iteration.  Negation is done by XOR-ing every lane with 0x80000000,
 * which flips the sign bit of each 32-bit float.  Loads and stores are
 * aligned (both pointers must be 16-byte aligned), and whole groups of 16
 * are processed, so indices up to the next multiple of 16 at or above n are
 * touched -- presumably the buffers are padded; verify against the call
 * sites.
 */
#define vecncpy(y, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
const uint32_t mask = 0x80000000; \
|
||||
__m128 XMM4 = _mm_load_ps1((float*)&mask); \
|
||||
for (i = 0;i < (n);i += 16) { \
|
||||
__m128 XMM0 = _mm_load_ps((x)+i ); \
|
||||
__m128 XMM1 = _mm_load_ps((x)+i+ 4); \
|
||||
__m128 XMM2 = _mm_load_ps((x)+i+ 8); \
|
||||
__m128 XMM3 = _mm_load_ps((x)+i+12); \
|
||||
XMM0 = _mm_xor_ps(XMM0, XMM4); \
|
||||
XMM1 = _mm_xor_ps(XMM1, XMM4); \
|
||||
XMM2 = _mm_xor_ps(XMM2, XMM4); \
|
||||
XMM3 = _mm_xor_ps(XMM3, XMM4); \
|
||||
_mm_store_ps((y)+i , XMM0); \
|
||||
_mm_store_ps((y)+i+ 4, XMM1); \
|
||||
_mm_store_ps((y)+i+ 8, XMM2); \
|
||||
_mm_store_ps((y)+i+12, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecadd(y, x, c, n): y += c * x on floats, eight elements per loop
 * iteration, using aligned loads and stores (both pointers must be 16-byte
 * aligned).  Whole groups of 8 are processed, so indices up to the next
 * multiple of 8 at or above n are touched -- presumably the buffers are
 * padded; verify against the call sites.
 */
#define vecadd(y, x, c, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128 XMM7 = _mm_set_ps1(c); \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
__m128 XMM0 = _mm_load_ps((x)+i ); \
|
||||
__m128 XMM1 = _mm_load_ps((x)+i+4); \
|
||||
__m128 XMM2 = _mm_load_ps((y)+i ); \
|
||||
__m128 XMM3 = _mm_load_ps((y)+i+4); \
|
||||
XMM0 = _mm_mul_ps(XMM0, XMM7); \
|
||||
XMM1 = _mm_mul_ps(XMM1, XMM7); \
|
||||
XMM2 = _mm_add_ps(XMM2, XMM0); \
|
||||
XMM3 = _mm_add_ps(XMM3, XMM1); \
|
||||
_mm_store_ps((y)+i , XMM2); \
|
||||
_mm_store_ps((y)+i+4, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecdiff(z, x, y, n): z = x - y on floats, sixteen elements per loop
 * iteration, using aligned loads and stores (all three pointers must be
 * 16-byte aligned).  Whole groups of 16 are processed, so indices up to the
 * next multiple of 16 at or above n are touched -- presumably the buffers
 * are padded; verify against the call sites.
 */
#define vecdiff(z, x, y, n) \
|
||||
{ \
|
||||
int i; \
|
||||
for (i = 0;i < (n);i += 16) { \
|
||||
__m128 XMM0 = _mm_load_ps((x)+i ); \
|
||||
__m128 XMM1 = _mm_load_ps((x)+i+ 4); \
|
||||
__m128 XMM2 = _mm_load_ps((x)+i+ 8); \
|
||||
__m128 XMM3 = _mm_load_ps((x)+i+12); \
|
||||
__m128 XMM4 = _mm_load_ps((y)+i ); \
|
||||
__m128 XMM5 = _mm_load_ps((y)+i+ 4); \
|
||||
__m128 XMM6 = _mm_load_ps((y)+i+ 8); \
|
||||
__m128 XMM7 = _mm_load_ps((y)+i+12); \
|
||||
XMM0 = _mm_sub_ps(XMM0, XMM4); \
|
||||
XMM1 = _mm_sub_ps(XMM1, XMM5); \
|
||||
XMM2 = _mm_sub_ps(XMM2, XMM6); \
|
||||
XMM3 = _mm_sub_ps(XMM3, XMM7); \
|
||||
_mm_store_ps((z)+i , XMM0); \
|
||||
_mm_store_ps((z)+i+ 4, XMM1); \
|
||||
_mm_store_ps((z)+i+ 8, XMM2); \
|
||||
_mm_store_ps((z)+i+12, XMM3); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecscale(y, c, n): y *= c on floats, eight elements per loop iteration,
 * using aligned loads and stores (y must be 16-byte aligned).  Whole groups
 * of 8 are processed, so indices up to the next multiple of 8 at or above n
 * are touched -- presumably the buffer is padded; verify against the call
 * sites.
 */
#define vecscale(y, c, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128 XMM7 = _mm_set_ps1(c); \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
__m128 XMM0 = _mm_load_ps((y)+i ); \
|
||||
__m128 XMM1 = _mm_load_ps((y)+i+4); \
|
||||
XMM0 = _mm_mul_ps(XMM0, XMM7); \
|
||||
XMM1 = _mm_mul_ps(XMM1, XMM7); \
|
||||
_mm_store_ps((y)+i , XMM0); \
|
||||
_mm_store_ps((y)+i+4, XMM1); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
 * vecmul(y, x, n): element-wise y *= x on floats, sixteen elements per loop
 * iteration, using aligned loads and stores (both pointers must be 16-byte
 * aligned).  Whole groups of 16 are processed, so indices up to the next
 * multiple of 16 at or above n are touched -- presumably the buffers are
 * padded; verify against the call sites.
 */
#define vecmul(y, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
for (i = 0;i < (n);i += 16) { \
|
||||
__m128 XMM0 = _mm_load_ps((x)+i ); \
|
||||
__m128 XMM1 = _mm_load_ps((x)+i+ 4); \
|
||||
__m128 XMM2 = _mm_load_ps((x)+i+ 8); \
|
||||
__m128 XMM3 = _mm_load_ps((x)+i+12); \
|
||||
__m128 XMM4 = _mm_load_ps((y)+i ); \
|
||||
__m128 XMM5 = _mm_load_ps((y)+i+ 4); \
|
||||
__m128 XMM6 = _mm_load_ps((y)+i+ 8); \
|
||||
__m128 XMM7 = _mm_load_ps((y)+i+12); \
|
||||
XMM4 = _mm_mul_ps(XMM4, XMM0); \
|
||||
XMM5 = _mm_mul_ps(XMM5, XMM1); \
|
||||
XMM6 = _mm_mul_ps(XMM6, XMM2); \
|
||||
XMM7 = _mm_mul_ps(XMM7, XMM3); \
|
||||
_mm_store_ps((y)+i , XMM4); \
|
||||
_mm_store_ps((y)+i+ 4, XMM5); \
|
||||
_mm_store_ps((y)+i+ 8, XMM6); \
|
||||
_mm_store_ps((y)+i+12, XMM7); \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
|
||||
/*
 * __horizontal_sum(r, rw): add up the four float lanes of the __m128 value r
 * and leave the total in r; rw is a scratch __m128 that only the
 * shuffle-based variant writes.  The macro expands to several statements
 * that are not wrapped in braces.
 *
 * NOTE(review): GCC and Clang define __SSE__ as 1 and announce SSE3 through
 * __SSE3__, so "3 <= __SSE__" presumably never selects the haddps variant
 * with those compilers -- confirm the intended feature test.
 */
#if 3 <= __SSE__
|
||||
/*
|
||||
Horizontal add with haddps SSE3 instruction. The work register (rw)
|
||||
is unused.
|
||||
*/
|
||||
#define __horizontal_sum(r, rw) \
|
||||
r = _mm_hadd_ps(r, r); \
|
||||
r = _mm_hadd_ps(r, r);
|
||||
|
||||
#else
|
||||
/*
|
||||
Horizontal add with SSE instruction. The work register (rw) is used.
|
||||
*/
|
||||
#define __horizontal_sum(r, rw) \
|
||||
rw = r; \
|
||||
r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(1, 0, 3, 2)); \
|
||||
r = _mm_add_ps(r, rw); \
|
||||
rw = r; \
|
||||
r = _mm_shuffle_ps(r, rw, _MM_SHUFFLE(2, 3, 0, 1)); \
|
||||
r = _mm_add_ps(r, rw);
|
||||
|
||||
#endif
|
||||
|
||||
/*
 * vecdot(s, x, y, n): dot product of the float vectors x and y, stored
 * through the pointer s.  Eight elements are consumed per loop iteration
 * into two independent 4-lane accumulators (XMM0, XMM1); after the loop the
 * accumulators are added, reduced with __horizontal_sum(), and the low lane
 * is written to *s.  Loads are aligned, so x and y must be 16-byte aligned;
 * indices up to the next multiple of 8 at or above n are read -- presumably
 * the padding holds zeros; verify against the call sites.
 */
#define vecdot(s, x, y, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128 XMM0 = _mm_setzero_ps(); \
|
||||
__m128 XMM1 = _mm_setzero_ps(); \
|
||||
__m128 XMM2, XMM3, XMM4, XMM5; \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
XMM2 = _mm_load_ps((x)+i ); \
|
||||
XMM3 = _mm_load_ps((x)+i+4); \
|
||||
XMM4 = _mm_load_ps((y)+i ); \
|
||||
XMM5 = _mm_load_ps((y)+i+4); \
|
||||
XMM2 = _mm_mul_ps(XMM2, XMM4); \
|
||||
XMM3 = _mm_mul_ps(XMM3, XMM5); \
|
||||
XMM0 = _mm_add_ps(XMM0, XMM2); \
|
||||
XMM1 = _mm_add_ps(XMM1, XMM3); \
|
||||
} \
|
||||
XMM0 = _mm_add_ps(XMM0, XMM1); \
|
||||
__horizontal_sum(XMM0, XMM1); \
|
||||
_mm_store_ss((s), XMM0); \
|
||||
}
|
||||
|
||||
/*
 * vecnorm(s, x, n): Euclidean norm of the float vector x, stored through the
 * pointer s.  The sum of squares t is built eight elements per loop
 * iteration in two 4-lane accumulators and reduced with __horizontal_sum().
 * The square root is not taken directly: y = _mm_rsqrt_ss(t) gives an
 * estimate of 1/sqrt(t), one refinement step computes 1.5*y - 0.5*t*y^3,
 * and the result is multiplied by t to obtain sqrt(t).  Loads are aligned,
 * so x must be 16-byte aligned.
 *
 * NOTE(review): for an all-zero x the sum t is 0, the reciprocal square root
 * estimate is infinite and the products with 0 give NaN, so *s would be NaN
 * rather than 0 -- confirm callers never pass a zero vector.
 */
#define vecnorm(s, x, n) \
|
||||
{ \
|
||||
int i; \
|
||||
__m128 XMM0 = _mm_setzero_ps(); \
|
||||
__m128 XMM1 = _mm_setzero_ps(); \
|
||||
__m128 XMM2, XMM3; \
|
||||
for (i = 0;i < (n);i += 8) { \
|
||||
XMM2 = _mm_load_ps((x)+i ); \
|
||||
XMM3 = _mm_load_ps((x)+i+4); \
|
||||
XMM2 = _mm_mul_ps(XMM2, XMM2); \
|
||||
XMM3 = _mm_mul_ps(XMM3, XMM3); \
|
||||
XMM0 = _mm_add_ps(XMM0, XMM2); \
|
||||
XMM1 = _mm_add_ps(XMM1, XMM3); \
|
||||
} \
|
||||
XMM0 = _mm_add_ps(XMM0, XMM1); \
|
||||
__horizontal_sum(XMM0, XMM1); \
|
||||
XMM2 = XMM0; \
|
||||
XMM1 = _mm_rsqrt_ss(XMM0); \
|
||||
XMM3 = XMM1; \
|
||||
XMM1 = _mm_mul_ss(XMM1, XMM1); \
|
||||
XMM1 = _mm_mul_ss(XMM1, XMM3); \
|
||||
XMM1 = _mm_mul_ss(XMM1, XMM0); \
|
||||
XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
|
||||
XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
|
||||
XMM3 = _mm_add_ss(XMM3, XMM1); \
|
||||
XMM3 = _mm_mul_ss(XMM3, XMM2); \
|
||||
_mm_store_ss((s), XMM3); \
|
||||
}
|
||||
|
||||
/*
 * vecrnorm(s, x, n): reciprocal of the Euclidean norm of the float vector x,
 * stored through the pointer s.
 *
 * The sum of squares t is accumulated eight elements per loop iteration (two
 * aligned 4-float loads feeding two accumulators), so the loop index must
 * advance by 8; advancing by 16 would leave every second group of eight
 * elements out of the norm.  The accumulators are then added and reduced
 * with __horizontal_sum().
 *
 * 1/sqrt(t) starts from the estimate y = _mm_rsqrt_ss(t) and is refined by
 * one step, 1.5*y - 0.5*t*y^3.  No check for a zero norm is made.
 *
 * x must be 16-byte aligned (_mm_load_ps), and indices up to the next
 * multiple of 8 at or above n are read.
 */
#define vecrnorm(s, x, n) \
{ \
    int i; \
    __m128 XMM0 = _mm_setzero_ps(); \
    __m128 XMM1 = _mm_setzero_ps(); \
    __m128 XMM2, XMM3; \
    for (i = 0;i < (n);i += 8) { \
        XMM2 = _mm_load_ps((x)+i  ); \
        XMM3 = _mm_load_ps((x)+i+4); \
        XMM2 = _mm_mul_ps(XMM2, XMM2); \
        XMM3 = _mm_mul_ps(XMM3, XMM3); \
        XMM0 = _mm_add_ps(XMM0, XMM2); \
        XMM1 = _mm_add_ps(XMM1, XMM3); \
    } \
    XMM0 = _mm_add_ps(XMM0, XMM1); \
    __horizontal_sum(XMM0, XMM1); \
    XMM1 = _mm_rsqrt_ss(XMM0); \
    XMM3 = XMM1; \
    XMM1 = _mm_mul_ss(XMM1, XMM1); \
    XMM1 = _mm_mul_ss(XMM1, XMM3); \
    XMM1 = _mm_mul_ss(XMM1, XMM0); \
    XMM1 = _mm_mul_ss(XMM1, _mm_set_ss(-0.5f)); \
    XMM3 = _mm_mul_ss(XMM3, _mm_set_ss(1.5f)); \
    XMM3 = _mm_add_ss(XMM3, XMM1); \
    _mm_store_ss((s), XMM3); \
}
|
1144
lib/lbfgs.c
Normal file
1144
lib/lbfgs.c
Normal file
File diff suppressed because it is too large
Load Diff
69
sample/main.c
Normal file
69
sample/main.c
Normal file
@ -0,0 +1,69 @@
|
||||
#include <stdio.h>
|
||||
#include <lbfgs.h>
|
||||
|
||||
/*
 * Objective callback handed to lbfgs().
 *
 * Walks x in pairs (x[k], x[k+1]) and, for each pair, adds
 *     (1 - x[k])^2 + (10 * (x[k+1] - x[k]^2))^2
 * to the returned value while writing the matching partial derivatives to
 * g[k] and g[k+1].  n is expected to be even, because each step reads and
 * writes index k+1.  instance and step are not used.
 */
static lbfgsfloatval_t evaluate(
    void *instance,
    const lbfgsfloatval_t *x,
    lbfgsfloatval_t *g,
    const int n,
    const lbfgsfloatval_t step
    )
{
    lbfgsfloatval_t total = 0.0;
    int k = 0;

    while (k < n) {
        const lbfgsfloatval_t a = 1.0 - x[k];
        const lbfgsfloatval_t b = 10.0 * (x[k+1] - x[k] * x[k]);

        g[k+1] = 20.0 * b;
        g[k] = -2.0 * (x[k] * g[k+1] + a);
        total += a * a + b * b;
        k += 2;
    }
    return total;
}
|
||||
|
||||
static int progress(
|
||||
void *instance,
|
||||
const lbfgsfloatval_t *x,
|
||||
const lbfgsfloatval_t *g,
|
||||
const lbfgsfloatval_t fx,
|
||||
const lbfgsfloatval_t xnorm,
|
||||
const lbfgsfloatval_t gnorm,
|
||||
const lbfgsfloatval_t step,
|
||||
int n,
|
||||
int k,
|
||||
int ls
|
||||
)
|
||||
{
|
||||
printf("Iteration %d:\n", k);
|
||||
printf(" fx = %f, x[0] = %f, x[1] = %f\n", fx, x[0], x[1]);
|
||||
printf(" xnorm = %f, gnorm = %f, step = %f\n", xnorm, gnorm, step);
|
||||
printf("\n");
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define N 8
|
||||
|
||||
int main(int argc, char *argv)
|
||||
{
|
||||
int i, ret = 0;
|
||||
lbfgsfloatval_t x[N], fx;
|
||||
|
||||
/* Initialize the variables. */
|
||||
for (i = 0;i < N;i += 2) {
|
||||
x[i] = -1.2;
|
||||
x[i+1] = 1.0;
|
||||
}
|
||||
|
||||
/*
|
||||
Start the L-BFGS optimization; this will invoke the callback functions
|
||||
evaluate() and progress() when necessary.
|
||||
*/
|
||||
ret = lbfgs(N, x, &fx, evaluate, progress, NULL, NULL);
|
||||
|
||||
/* Report the result. */
|
||||
printf("L-BFGS optimization terminated with status code = %d\n", ret);
|
||||
printf(" fx = %f, x[0] = %f, x[1] = %f\n", fx, x[0], x[1]);
|
||||
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue
Block a user