#Copyright (c) 2010, Roland Memisevic
#All rights reserved.
#
#Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
#
#    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
#    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
#
#            THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

""" This module contains three classes and some supporting functions for 
    computing Kernel Information Embeddings and shared Kernel Information
    Embeddings. Graphics processing units (GPU) are used for fast learning 
    and inference through the gnumpy package.  

    Kernel Information Embeddings are a family of probabilistic non-parametric 
    dimensionality reduction algorithms.

    Unlike most standard embedding methods (such as LLE, kernel PCA, UKR, 
    Laplacian Eigenmaps and many others) Kernel Information Embeddings come 
    with both forward- and backward mappings. Thus, after computing embeddings 
    with KIE, it is possible to compute low- dimensional codes for new, 
    previously unseen test-data; to produce 'fantasy-data' by projecting 
    latent-space elements into the data-space, or to efficiently project 
    data-cases onto a learned manifold, for example in order to perform 
    de-noising. 

    To instantiate these classes you need to provide data, a kernel bandwidth 
    and a regularization parameter. Each class contains a params-attribute, 
    a cost()- and a grad()-function, which can be passed to any non-linear, 
    unconstrained optimizer for training. A train()-method that performs
    simple gradient descent (not very fast) is provided for convenience. 
    A faster train_cg()-method is also provided which uses conjugate gradients.

    Latent space elements reside in the attribute Z (which, internally, is 
    just a view onto the params-array). To compute the forward-, or backward-
    mapping or to project new cases onto a learned manifold, use the methods 
    forward() and backward() and project(). 

    For more information on Kernel Information Embeddings see:
    2008. Memisevic, R. "Non-linear latent factor models for revealing 
    structure in high-dimensional data". PhD-thesis, University of Toronto. 
    2006. Memisevic, R. (2006). "Kernel information embeddings." In: 
    Proceedings of the 23rd International Conference on Machine learning 
    (ICML 2006). 

    The provided classes are:

    Kie -        Implements the basic kernel information embedding model.  
    SharedKie -  Implements shared kernel information embedding,
    SharedKie2 - Implements shared kernel information embedding for exactly _two_ 
                 datasets. In contrast SharedKie, this class allows for sharing of 
                 only a subset of points. In contrast to Kie and SharedKie it does 
                 not currently come with GPU support, and is therefore slower. 

    For more documentation see the class definitions themselves below. 

    For a usage example see the end of this file (under "if __name__=='__main__' ". 

    Dependencies:
    This module depends on numpy and Tijmen Tieleman's gnumpy package. 
    Some learning methods further make use of either scipy or of minimze.py for the 
    optimization. The latter is available at the website as kie_cuda.py 
    The usage example uses matplotlib for plotting. 

"""

from numpy import zeros, ones, sum, dot, diag, newaxis, exp, log, eye, \
                  hstack, vstack, pi, sqrt, array, arange, argmin, argmax, \
                  kron, random, where, inf, isnan, sort , argsort, outer, double, mean
from numpy.linalg import pinv
from numpy.random import randn
import random as pyrandom

import gnumpy
gnumpy.board_id_to_use = 0

LARGE = 10.**8
SMALL = 10.**-8

random.seed(1)
pyrandom.seed(1)

def drawDiscrete(probs, numdraws=1):
    """ Draw from a discrete distributions."""
    result = zeros(numdraws, dtype=int)
    for d in range(numdraws):
        r = pyrandom.random()
        accumulator = 0.0
        for i,x in enumerate(probs): 
            accumulator += x 
            if r < accumulator: 
                result[d] = i 
                break
    return result


def normalizeshape(x):
    if len(x.shape) < 2:
        return x[:,newaxis]
    else:
        return x


def listify(x):
    if type(x) != type([]):
        return [x]
    else:
        return x


def distmatrix(x, y=None, missing=None):
    """ Compute distance matrix for the points in (columnwise) data-matrices 
        x and y.
    """
    if missing is not None and y is not None:
        raise 'missing values allowed only if x=y (single data-set)' 
    if y is None: y = x
    if len(x.shape)<2:
        x = x[:,newaxis]
    if len(y.shape)<2:
        y = y[:,newaxis]
    if missing is None:
        x2 = sum(x**2, 0)
        y2 = sum(y**2, 0)
        return x2[:,newaxis] + y2[newaxis,:] - 2*dot(x.T,y)
    else: #missing is given, so we know x==y
        y = y*(1.0-missing)
        y2 = sum(y**2, 0)
        return y2[:,newaxis] + y2[newaxis, :] - 2*dot(y.T,y)


def distmatrix_gpu(x, y=None):
    """ Compute distance matrix for the points in (columnwise) data-matrices 
        x and y.
    """
    if y is None: y = x
    if len(x.shape)<2:
        x = x[:,newaxis]
    if len(y.shape)<2:
        y = y[:,newaxis]
    x2 = (x*x).sum(0)
    y2 = (y*y).sum(0)
    return x2[:,newaxis] + y2[newaxis,:] - 2*gnumpy.dot(x.T,y)


def logsumexp(x, dim=0):
    """Compute log(sum(exp(x))) in numerically stable way."""
    #xmax = x.max()
    #return xmax + log(exp(x-xmax).sum())
    if dim==0:
        xmax = x.max(0)
        return xmax + log(exp(x-xmax).sum(0))
    elif dim==1:
        xmax = x.max(1)
        return xmax + log(exp(x-xmax[:,newaxis]).sum(1))
    else: 
        raise 'dim ' + str(dim) + 'not supported'


def logsumexp_gpu(x, dim=0):
    """Compute log(sum(exp(x))) for a gpu array."""
    if dim==0:
        xmax = x.max(0)
        return xmax + gnumpy.log(gnumpy.exp(x-xmax).sum(0))
    elif dim==1:
        xmax = x.max(1)[:,newaxis]
        return xmax + gnumpy.log(gnumpy.exp(x-xmax).sum(1))[:,newaxis]
    else: 
        raise 'dim ' + str(dim) + 'not supported'


def dens1d(data1, data2, h):
    d1_ = (data1**2).sum(0)[newaxis,:]
    d2_ = (data2**2).sum(0)[newaxis,:]
    D = d1_.T + d2_ - 2*dot(data1.T,data2)
    normalizer = 1./(sqrt(pi*h)*data2.shape[1])
    return exp(-D/h).sum(1)*normalizer


def dens2d(data, h, numvals):
    """ Return the 2d-kernel density estimate evaluated on a grid, using data
        as the training data and h as the bandwidths. numvals determines how 
        fine the grid is. 
        
        Returns also the values of the grid which is useful for contour plots.

        Only for 2d-data. 
    """
    dens = []
    span0 = (data[0].max()-data[0].min())
    span1 = (data[1].max()-data[1].min())
    xrange = arange(data[0].min()-span0/5.,
                        data[0].max()+span0/5., span0/numvals)
    yrange = arange(data[1].min()-span1/5.,
                        data[1].max()+span1/5., span1/numvals)
    testdata = vstack((kron(ones((1,len(yrange))),xrange[newaxis,:]),
                       kron(yrange[newaxis,:],ones((1,len(xrange))))))
    d1_ = (testdata**2).sum(0)[newaxis,:]
    d2_ = (data**2).sum(0)[newaxis,:]
    D = d1_.T + d2_ - 2*dot(testdata.T,data)
    normalizer = 1./(pi*h*data.shape[1])
    density = exp(-D/h).sum(1)*normalizer
    count = 0
    for x in xrange:
        for y in yrange:
            dens.append(density[count])
            count += 1
    
    dens = array(dens).reshape(xrange.shape[0], yrange.shape[0]).T
    return xrange, yrange, dens


def bandwidthFromNnDistance(data):
    D = distmatrix(data)
    D *= (D>SMALL)
    return 2.0*mean(sort(sqrt(D),1)[:,1])**2


def bandwidthFromLikelihood(data, hmin, hmax, numsteps):
    numdims, numcases = data.shape
    #bandwidths = arange(data.var()/10, 10*data.var(), data.var())
    bandwidths = arange(hmin, hmax, (hmax-hmin)/numsteps)
    gram = dot(data.T, data)
    D = diag(gram)[:,newaxis] + diag(gram)[newaxis,:] - 2 * gram
    E = eye(numcases)*LARGE
    dens = []
    lognumcases = log(numcases)
    for h in bandwidths:
        lognormalizer = lognumcases+0.5*numdims*log(pi*h)
        dens.append( (logsumexp(-(D/h+E),1) - lognormalizer).sum()/numcases )
        #print "bandwidth: %f : log-density %f" % (h, dens[-1])
    #print "best: %f at %d of %d" % (bandwidths[argmax(dens)], argmax(dens)+1, len(bandwidths))
    return bandwidths[argmax(dens)], dens


optbandwidth = bandwidthFromLikelihood


def bandwidthFromValidationLikelihood(data, validdata, hmin, hmax, numsteps):
    numdims, numtrain = data.shape
    numdims_, numvalid = validdata.shape
    assert numdims == numdims_
    bandwidths = arange(hmin, hmax, (hmax-hmin)/numsteps)
    D = distmatrix(validdata, data)
    dens = []
    lognumtrain = log(numtrain)
    for h in bandwidths:
        lognormalizer = lognumtrain+0.5*numdims*log(pi*h)
        dens.append( (logsumexp(-(D/h),1) - lognormalizer).sum()/numvalid)
        #print "bandwidth: %f : log-density %f" % (h, dens[-1])
    #print "best: %f at %d of %d" % (bandwidths[argmax(dens)], argmax(dens)+1, len(bandwidths))
    return bandwidths[argmax(dens)], dens


def bandwidthFromPerplexity(data, k, hmin, hmax, numsteps, loo=False):
    import bisect
    def _log(M):
        M = M
        M[M<10**-10]=1
        return log(M)
    numdims, numcases = data.shape
    bandwidths = arange(hmin, hmax, (hmax-hmin)/numsteps)
    averageEntropy = zeros(len(bandwidths), dtype=float)
    D = distmatrix(data)
    for i, bandwidth in enumerate(bandwidths):
        K = exp(-(1.0/bandwidth)*D)
        if loo:
            K = K-eye(numcases)
        K = K/sum(K,1)[:,newaxis]
        averageEntropy[i] = (-sum(sum(K*_log(K))))/numcases
    #averageEntropy[where(isnan(averageEntropy))[0]]=-inf
    return bandwidths[bisect.bisect(list(averageEntropy), log(k))]


def bandwidthFromRuleofthumb(data):
    return 1.06 * data.std() / (double(data.shape[1])**0.2)


def dens1dlat(model, numvals, args=(None,)):
    """ Return the data for plotting latent space density evaluated on 
        a grid. (Returns both the grid values and the associated densities,
        so that the result can be simply passed on to plot(). 

        Only for 1d-latent spaces. 
    """
    span = (Z.max()-Z.min())
    zrange = arange(Z.min()-span/5., Z.max()+span/5., span/numvals)
    return zrange, model.latdensity(zrange[newaxis,:], *args)


def contour2dlat(model, numvals, args=(None,)):
    """ Return the data for countour-plot of the latent density.
        Only for 2d-latent spaces. 
    """
    dens = []
    Z = model.Z_gpu.asarray()
    span0 = (Z[0].max()-Z[0].min())
    span1 = (Z[1].max()-Z[1].min())
    xrange = arange(Z[0].min()-span0/5.,
                        Z[0].max()+span0/5., span0/numvals)
    yrange = arange(Z[1].min()-span1/5.,
                        Z[1].max()+span1/5., span1/numvals)
    for x in xrange:
        for y in yrange:
            dens.append(model.latdensity(hstack((x,y)), *args))
    
    dens = array(dens).reshape(xrange.shape[0], yrange.shape[0]).T
    return xrange, yrange, dens


def contour2dobs(model, numvals):
    """ Return the data for countour-plot of the observable space density.
        Only for 2d-latent spaces. 
    """
    dens = []
    span0 = (model.Z[0].max()-model.Z[0].min())
    span1 = (model.Z[1].max()-model.Z[1].min())
    xrange = arange(model.Z[0].min()-span0/5.,
                        model.Z[0].max()+span0/5., span0/numvals)
    yrange = arange(model.Z[1].min()-span1/5.,
                        model.Z[1].max()+span1/5., span1/numvals)
    for x in xrange:
        for y in yrange:
            dens.append(model.obsdensity(model.forward(hstack((x,y)))))
    
    dens = array(dens).reshape(xrange.shape[0], yrange.shape[0]).T
    return xrange, yrange, dens


def gaussKernelMatrix(Z, h):
    GramZ = dot(Z.T, Z)
    return exp(-(1.0/h)*
             (diag(GramZ)[newaxis,:] + diag(GramZ)[:,newaxis] - 2 * GramZ))


def log_anisotropicKernelMatrix(Y, Ytest=None, h=None):
    if h is None:
        h = bandwidthFromNnDistance(Y)
    simpleK = exp(-(1.0/h)*distmatrix(Y))
    distVecs = Y[:,:,newaxis]-Y[:,newaxis,:]
    reg = eye(Y.shape[0]) * h/5.0
    anisotropicKs = []
    for i in range(Y.shape[1]):
        anisotropicKs.append(sum(distVecs[:,i,:][:,:,newaxis]
                            *distVecs[:,i,:][newaxis,:,:].transpose(0,2,1)
                            *simpleK[:,i][newaxis,:,newaxis],1) 
                          + reg)
    if Ytest is not None:
        distVecsTest = Y[:,:,newaxis]-Ytest[:,newaxis,:]
        dists = zeros((Ytest.shape[1], Y.shape[1]), dtype=float)
    else:
        distVecsTest = distVecs
        dists = zeros((Y.shape[1], Y.shape[1]), dtype=float)
    for i in range(Y.shape[1]):
        dists[:, i] = (dot(distVecsTest[:,i,:].T, pinv(anisotropicKs[i]))
                      * distVecsTest[:,i,:].T).sum(1)
    return -0.5 * dists


def log_anisotropicKernelMatrix_gpu(Y_gpu, Ytest_gpu=None, h=None):
    Y_gpu = gnumpy.garray(Y_gpu)
    if h is None:
        h = bandwidthFromNnDistance(Y.asarray())
    h = float(h)
    simpleK = gnumpy.exp(-(1.0/h)*distmatrix_gpu(Y_gpu))
    distVecs = Y_gpu[:,:,newaxis]-Y_gpu[:,newaxis,:]
    reg = eye(Y_gpu.shape[0]) * h/5.0
    anisotropicKs = []
    for i in range(Y_gpu.shape[1]):
        anisotropicKs.append(sum((distVecs[:,i,:][:,:,newaxis]
                            *distVecs[:,i,:][newaxis,:,:].transpose(0,2,1)
                            *simpleK[:,i][newaxis,:,newaxis]).asarray(),1) 
                          + reg)
    if Ytest_gpu is not None:
        distVecsTest = Y_gpu[:,:,newaxis]-Ytest_gpu[:,newaxis,:]
        dists = gnumpy.garray(zeros((Ytest_gpu.shape[1], Y_gpu.shape[1]), dtype=float))
    else:
        distVecsTest = distVecs
        dists = gnumpy.garray(zeros((Y_gpu.shape[1], Y_gpu.shape[1]), dtype=float))
    for i in range(Y_gpu.shape[1]):
        dists[:, i] = (gnumpy.dot(distVecsTest[:,i,:].T, 
                                    gnumpy.garray(pinv(anisotropicKs[i])))
                      * distVecsTest[:,i,:].T).sum(1)
    return (-(0.5) * dists).asarray()


def classKernelMatrix(Z):
    """Return the Grammian for the 'one-hot'-encoded class-label matrix."""
    return dot(Z.T,Z)


def penfunc_squarednorm(Z):
    return (Z*Z).sum()


def pengrad_squarednorm(Z):
    return 2*Z


def penfunc_l1(Z):
    return sqrt(Z*Z+SMALL).sum()


def pengrad_l1(Z):
    return Z/sqrt(Z*Z+SMALL) 


def penfunc_entropy(Z):
    return -logsumexp(-distmatrix(Z),1).sum()


def pengrad_entropy(Z):
    KZ = exp(-distmatrix(Z))
    sumKZ = sum(KZ,1)
    oneoversumKZ = 1./sumKZ
    gradZ = zeros(Z.shape, dtype=float)
    for l in range(Z.shape[1]):
        gradZ[:, l] = \
            sum(((Z[:,l][:,newaxis] - Z)
                  *KZ[l,:][newaxis,:]
                  *(-oneoversumKZ[l]
                    -oneoversumKZ.T[:,newaxis]
                   ).T),1)
    gradZ *= 2.0
    return -gradZ


def penfunc_entropyLoo(Z):
    return -logsumexp(-(distmatrix(Z)+eye(Z.shape[1])*LARGE),1).sum()


def pengrad_entropyLoo(Z):
    KZ = exp(-distmatrix(Z))
    KZ *= 1.0-eye(Z.shape[1])
    sumKZ = sum(KZ,1)
    oneoversumKZ = 1./(sumKZ+SMALL)
    gradZ = zeros(Z.shape, dtype=float)
    for l in range(Z.shape[1]):
        gradZ[:, l] = \
            sum(((Z[:,l][:,newaxis] - Z)
                  *KZ[l,:][newaxis,:]
                  *(-oneoversumKZ[l]
                    -oneoversumKZ.T[:,newaxis]
                   ).T),1)
    gradZ *= 2.0
    return -gradZ


#class penfunc_entropylooSquarednorm(object):
#    def __init__(self, normpenalty):
#        self.normpenalty = normpenalty
#    def __call__(self, Z):
#       return penfunc_entropyLoo(Z) + self.normpenalty * penfunc_squarednorm(Z)
#    def setnormpenalty(self, p):
#        self.normpenalty = p



class Kie(object):
    """ Kernel information embedding. 
    
    Compute a low-dimensional embedding of data by maximizing an estimate 
    of the mutual information between the embedding and the original data
    distribution. Data is denoted Y and the embedding is denoted Z. 

    The constructor takes the following parameters:
    q       : Dimensionality of the latent space.
    hy      : Observable space kernel bandwidth.  
              The kernel that is used by default has the form:
              K(y_i, y_j) = exp(-(1.0/hy)*d_ij), where d_ij is the Euclidean
              distance between y_i and y_j. 
    Y       : Observed data matrix. The shape of the data matrix needs to 
              be (number of dimensions) x (number of data-points). 
    reg     : The amount of regularization. This is a multiplier that is 
              used to weight the regularization term relative to the kie
              objective function.

    optional parameters: 
    loo    : A boolean that determines if a leave-one-out variant of the 
             objective function or the standard KIE objective funcion is used.
             If False, the standard objective function is used. 
    missing: A boolean matrix that can be used to specify missing data. 
             The shape is the same as that of the data matrix Y. Any True 
             entry means that the corresponding entry of Y is considered 
             missing (the actual entry is thus being ignored).
             (*** The missing value feature is experimental and might not 
             work properly at this point ***.)
    anisotropic: 
             A boolean that (if True) turns on the use of anisotropic kernels, 
             instead of the default spherical Gaussian kernels. These kernels 
             often work better in high-dimensional data spaces, but result in 
             slower learning and inference. 
    penfunc, pengrad:
             A penalty function and its gradient, used for regularization. 
             The default is a squared norm penalty, which can be thought of
             as a Gaussian prior on the latent variables. 

    The methods cost and grad can be used in conjunction with a gradient-based
    optimizer to train the model. Alternatively, the method train can 
    be used to train the model, but it is not very fast as it uses simple 
    gradient descent. 
    """

    def __init__(self, q, hy, Y, reg, loo=False, missing=None,
                                                 anisotropic=False,
                                                 learnbandwidth=False, 
                                                 penfunc=penfunc_squarednorm, 
                                                 pengrad=pengrad_squarednorm):
        self.loo = loo   #use leave-one-out-estimate?
        self.learnbandwidth = learnbandwidth  #optimize bandwidth, too?
        self.q = q
        self.anisotropic = anisotropic
        self.missing = missing
        if self.missing is not None:
            self.Y = Y * (1.0-self.missing)
        else:
            self.Y = Y 
        self.Y_gpu = gnumpy.garray(self.Y)
        self.penfunc = penfunc
        self.pengrad = pengrad
        self.loghy = array(log(hy))
        self.d, self.numcases = Y.shape
        self.lognumcases = log(self.numcases)
        self.reg = reg     #amount of regularization
        #self.Z = randn(self.q,self.numcases) * 0.1
        #self.Z_gpu = gnumpy.garray(randn(self.q,self.numcases) * 0.1)
        #self.GramZ = zeros((self.numcases, self.numcases), 'float')
        #self.kZ = zeros((self.numcases, self.numcases), 'float')
        self.kZ_gpu = gnumpy.garray(zeros((self.numcases, self.numcases), 'float'))
        #some quantities useful for gradient/cost computation:
        if self.anisotropic and self.learnbandwidth:
            raise 'only one of [anisotropic, learnbandwidth] is allowed'
        if self.anisotropic and self.missing:
            raise 'only one of [anisotropic, missing] is allowed'
        if not self.anisotropic:
            self.YY = distmatrix(Y, missing=self.missing)
            self.kY = -(1.0/exp(self.loghy)) * self.YY
        else:
            self.kY = log_anisotropicKernelMatrix_gpu(self.Y, h=exp(self.loghy))
        if self.loo: 
            self.kY -= eye(self.numcases)*LARGE
        self.kY_gpu = gnumpy.garray(self.kY)
        self.KY = exp(self.kY)  #kernel matrix
        self.KY_gpu = gnumpy.garray(self.KY)
        if self.missing is not None:
            self.KY *= \
                dot(self.missing.T.astype(float), self.missing.astype(float))
            self.KY += 1./(pi*exp(self.loghy)*dot(self.missing.T.astype(float), 
                        self.missing.astype(float))) * (1.0-eye(self.numcases))
        if not self.learnbandwidth:
            #self.params = self.Z.reshape(self.q*self.numcases)
            self.params_gpu = gnumpy.garray(randn(self.numcases*self.q)*0.1)
            self.inc_gpu = gnumpy.garray(zeros(self.numcases*self.q))
        else:
            assert 0, 'not implemented'
            #self.params = hstack((self.Z.reshape(self.q*self.numcases), self.loghy))
            self.loghy = self.params[-1:]
        self.Z_gpu = self.params_gpu[:self.q*self.numcases].reshape(self.q, self.numcases)
        self.sumKZ_gpu = gnumpy.empty(self.numcases)
        self.sumKZKY_gpu = gnumpy.empty(self.numcases)
        self.oneoversumKZ_gpu = gnumpy.empty(self.numcases)
        self.oneoversumKZKY_gpu = gnumpy.empty(self.numcases)
        self.KZ_gpu = gnumpy.empty((self.numcases, self.numcases))
        self.ZZ_gpu = gnumpy.empty((self.numcases, self.numcases))
        self.Z2_gpu = gnumpy.empty(self.numcases)
        self.ZZFactor_gpu = gnumpy.empty((self.numcases, self.numcases))
        self.ILarge_gpu = gnumpy.garray(eye(self.numcases, self.numcases))
        self.ILarge_gpu *= LARGE
        self.gradZ_gpu = gnumpy.zeros((self.q, self.numcases))
        self.firstcall = True
        self.momentum = 0.9
        self.stepsize = 0.01

    def logkernelmatrix(self, Ytest, Y=None):
        if Y is None:
            Y = Ytest
        if not self.anisotropic:
            return -(1.0/exp(self.loghy))*distmatrix(Ytest, Y)
        else:
            return log_anisotropicKernelMatrix(Y, Ytest, h=exp(self.loghy))

    def logkernelmatrix_gpu(self, Ytest, Y=None):
        if Y is None:
            Y = Ytest
        if not self.anisotropic:
            return -float(1.0/gnumpy.exp(self.loghy))*distmatrix_gpu(Ytest, Y)
        else:
            assert 0, 'not implemented'
            return log_anisotropicKernelMatrix_gpu(Y, Ytest, h=gnumpy.exp(self.loghy))

    def kernelmatrix(self, Ytest, Y=None):
        if Y is None:
            Y = Ytest
        if not self.anisotropic:
            return exp(-(1.0/exp(self.loghy))*distmatrix(Ytest, Y))
        else:
            return exp(anisotropicKernelMatrix_gpu(Y, Ytest, h=exp(self.loghy)))

    def updateloghy(self,loghy):
        self.loghy = loghy
        if not self.anisotropic:
            self.kY = -(1.0/exp(self.loghy)) * self.YY
        else:
            self.kY = log_anisotropicKernelMatrix_gpu(self.Y, h=exp(self.loghy))
        if self.loo: 
            self.kY -= eye(self.numcases)*LARGE
        self.KY = exp(self.kY) 
        if self.missing is not None:
            self.KY *= \
                dot(self.missing.T.astype(float), self.missing.astype(float))
            self.KY += 1./(pi*exp(self.loghy)*dot(self.missing.T.astype(float),
                       self.missing.astype(float))) * (1.0-eye(self.numcases))

    def cost(self):
        self.Z2_gpu[:] = (self.Z_gpu * self.Z_gpu).sum(0)
        self.kZ_gpu[:,:] = -self.Z2_gpu[newaxis,:]
        self.kZ_gpu -= self.Z2_gpu[:,newaxis]
        self.kZ_gpu += 2*gnumpy.dot(self.Z_gpu.T, self.Z_gpu)

        if self.learnbandwidth:
            assert 0, 'not implemented'
            self.kY = -(1.0/exp(self.loghy)) * self.YY
            if self.loo: 
                self.kY -= eye(self.numcases)*LARGE
            self.KY = exp(self.kY)
        if self.loo:
            #self.kZ.subtract(self.ILarge_gpu)
            self.kZ_gpu -= self.ILarge_gpu
        cost = - logsumexp_gpu(self.kY_gpu+self.kZ_gpu,1).sum()/self.numcases \
               + logsumexp_gpu(self.kZ_gpu,1).sum()/self.numcases
        if self.learnbandwidth:
            assert 0, 'not implemented'
            cost += self.lognumcases + 0.5*self.d*log(pi*exp(self.loghy))
            cost = cost[0]
        #add penalty:
        cost += (self.reg/self.numcases) * self.penfunc(self.Z_gpu.asarray())
        return float(cost)

    def grad(self):
        self.Z2_gpu[:] = (self.Z_gpu * self.Z_gpu).sum(0)
        self.kZ_gpu[:,:] = -self.Z2_gpu[newaxis,:]
        self.kZ_gpu -= self.Z2_gpu[:,newaxis]
        self.kZ_gpu += 2*gnumpy.dot(self.Z_gpu.T, self.Z_gpu)
        self.KZ_gpu[:,:] = gnumpy.exp(self.kZ_gpu)
        if self.learnbandwidth:
            assert 0, 'not implemented'
            self.kY = -(1.0/exp(self.loghy)) * self.YY
            if self.loo: 
                self.kY -= eye(self.numcases)*LARGE
            self.KY = exp(self.kY)
        if self.loo:
            #KZ *= (1.-eye(self.numcases))
            self.kZ_gpu -= self.ILarge_gpu
            self.KZ_gpu[:,:] = gnumpy.exp(self.kZ_gpu)
        self.sumKZ_gpu[:] = self.KZ_gpu.sum(1)
        self.oneoversumKZ_gpu[:] = 1.0/self.sumKZ_gpu
        self.sumKZKY_gpu[:] = (self.KZ_gpu*self.KY_gpu).sum(1)
        self.oneoversumKZKY_gpu[:] = 1.0/self.sumKZKY_gpu
        self.ZZFactor_gpu[:,:] = self.KY_gpu*self.oneoversumKZKY_gpu[:,newaxis]
        self.ZZFactor_gpu += self.KY_gpu*self.oneoversumKZKY_gpu[newaxis,:]
        self.ZZFactor_gpu -= self.oneoversumKZ_gpu[newaxis,:]
        self.ZZFactor_gpu -= self.oneoversumKZ_gpu[:,newaxis]
        self.ZZFactor_gpu *= self.KZ_gpu
        #self.gradZ_gpu[:, :] = self.ZZ_gpu.sum(1)
        for i in range(self.q):
            self.ZZ_gpu[:,:] = 0.0
            self.ZZ_gpu += self.Z_gpu[i, :][:,newaxis]
            self.ZZ_gpu -= self.Z_gpu[i, :][newaxis,:]
            self.ZZ_gpu *= self.ZZFactor_gpu
            self.gradZ_gpu[i, :] = self.ZZ_gpu.sum(1)
        #for l in range(self.numcases):
        #    self.gradZ_gpu[:, l] = \
        #        ((self.Z_gpu[:,l][:,newaxis].tile((1, self.numcases)) - self.Z_gpu)
        #              *self.kZ_gpu[l,:][newaxis,:].tile((self.q, 1))
        #              *( (self.KY_gpu[l,:]/self.sumKZKY_gpu[l]).tile((self.q, 1))
        #                +(self.KY_gpu[l,:]/self.sumKZKY_gpu.T).tile((self.q, 1))
        #                -self.oneoversumKZ_gpu[l]
        #                -self.oneoversumKZ_gpu.T.tile((self.q, 1))
        #               )).sum(1).asarray()
        self.gradZ_gpu *= 2.0 / self.numcases
        #add penalty gradient:
        self.gradZ_gpu += (self.reg/self.numcases) * self.pengrad(self.Z_gpu.asarray())
        if not self.learnbandwidth:
            return self.gradZ_gpu.reshape(-1)
        else:
            assert 0, 'not implemented'
            Dh = -sum(sum(((KZ*self.KY)/sumKZKY[:,newaxis])*self.YY,1),0) \
                 / (self.numcases*exp(self.loghy)) \
                 + (0.5*self.d)
            return hstack((gradZ.flatten(), Dh))

    def f(self, params):
        """Wrapper function around cost function to check grads, etc."""
        paramsold = self.params_gpu.asarray()
        self.updateparams(params)
        result = self.cost() 
        self.updateparams(paramsold)
        return result

    def g(self, params):
        """Wrapper function around gradient to check grads, etc."""
        paramsold = self.params_gpu.asarray()
        self.updateparams(params)
        result = self.grad()
        self.updateparams(paramsold)
        return result.asarray()

    def updateparams(self, newparams):
        #self.params[:] = newparams
        if type(newparams) == type(array([])):
            self.params_gpu[:] *= 0.0
            self.params_gpu[:] += gnumpy.garray(newparams)
        else:
            self.params_gpu[:] = newparams

    def forwardMean(self, Ztest, ydimensions=None):
        """ 
        
        ydimensions is an index (list of integers or boolean array)
        specifying for which dimensions the mapping shall be computed
        """
        Ztest = normalizeshape(Ztest)
        kz = -distmatrix(Ztest, self.Z_gpu.asarray())
        lse = logsumexp(kz, 1)[:,newaxis]
        if ydimensions is None:
            if self.missing is None:
                return sum(exp(kz-lse)[:,:,newaxis]* 
                        self.Y[:,:,newaxis].transpose(2, 1, 0), 1).T 
            else:
                return sum(exp(kz-lse)[:,:,newaxis]* 
                        self.Y[:,:,newaxis].transpose(2, 1, 0)*
            (1./sqrt((pi*exp(self.loghy))**sum(self.missing.astype(float),0)))[newaxis,:,newaxis]
                        ,1).T 
        else:
            if self.missing is None:
                 return sum(exp(kz-lse)[:,:,newaxis]* 
                   self.Y[ydimensions,:][:,:,newaxis].transpose(2, 1, 0), 1).T
            else:
                 return sum(exp(kz-lse)[:,:,newaxis]* 
                   self.Y[ydimensions,:][:,:,newaxis].transpose(2, 1, 0)*
            (1./sqrt((pi*exp(self.loghy))**sum(self.missing.astype(float),0)))[newaxis,:,newaxis]
                   ,1).T

    def backwardMean(self, Ytest, ydimensions=None):
        """ 
        
        ydimensions is an index (list of integers or boolean array)
        specifying for which dimensions the mapping shall be computed
        """
        Ytest = normalizeshape(Ytest)
        if ydimensions is None:
            ky = self.logkernelmatrix(Ytest, self.Y)
        else:
            ky = self.logkernelmatrix(Ytest, self.Y[ydimensions,:])
        lse = logsumexp(ky, 1)[:,newaxis]
        return sum(exp(ky-lse)[:,:,newaxis]* 
                    self.Z[:,:,newaxis].transpose(2, 1, 0), 1).T

    forward = forwardMean

    backward = backwardMean

    def forwardVar(self, Ztest):
        if self.missing is not None:
            assert 0, 'forwardVar for missing data not implemented'
        Ztest = normalizeshape(Ztest)
        assert Ztest.shape[1] == 1, 'can compute cov only one point at a time'
        kz = -distmatrix(Ztest, self.Z_gpu.asarray())
        lse = logsumexp(kz, 1)[:,newaxis]
        Kz = exp(kz-lse)
        Yz = self.Y * Kz 
        result = eye(self.d) * exp(self.loghy)
        result += dot(Yz, self.Y.T)
        for l in range(self.numcases):
            result += (Yz[:,l][newaxis,newaxis,:] * Yz[:,:,newaxis]).sum(1)
        return result

    def backwardVar(self, Ytest):
        if self.missing is not None:
            assert 0, 'backwardVar for missing data not implemented'
        Ytest = normalizeshape(Ytest)
        assert Ytest.shape[1] == 1, 'can compute cov only one point at a time'
        Ytest = normalizeshape(Ytest)
        ky = self.logkernelmatrix(Ytest, self.Y)
        lse = logsumexp(ky, 1)[:,newaxis]
        Ky = exp(ky-lse)
        Zy = self.Z_gpu.asarray() * Ky 
        result = eye(self.q) * 1.0
        result += dot(Zy, self.Z_gpu.asarray().T)
        for l in range(self.numcases):
            result -= (Zy[:,l][newaxis,newaxis,:] * Zy[:,:,newaxis]).sum(1)
        return result

    def backwardSample(self, Ytest, numsamples=1):
        Ytest = normalizeshape(Ytest)
        ky = self.logkernelmatrix(Ytest, self.Y)
        lse = logsumexp(ky, 1)[:,newaxis]
        Ky = exp(ky-lse)
        result = zeros((self.q, Ytest.shape[1], numsamples), dtype=float)
        for i in range(Ytest.shape[1]):
            indexes = drawDiscrete(Ky[i ,:], numsamples)
            result[:,i,:] = (randn(self.q, numsamples) + self.Z_gpu.asarray()[:,indexes])
        return result 

    def project(self, y, ydimensions=None):
        return self.forward(self.backwardMode(y, ydimensions=ydimensions))

    def forwardMode(self, Ztest, Yinit=None):
        """ Returns the conditional mode of y given z. """
        if Yinit is None:
            Yinit = self.Y[:,argmin(distmatrix(Ztest, self.Z),1)]
        from scipy.optimize import fmin_cg
        return fmin_cg(self._forwardMode_cost, Yinit.flatten(), 
                                   self._forwardMode_grad, 
                                   args=(Ztest.flatten(),), 
                                   maxiter=100, 
                                   disp=0).reshape(self.d,-1)

    def _forwardMode_cost(self, Ytest, Ztest):
        Ytest = Ytest.reshape(self.d,-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = -distmatrix_gpu(gnumpy.garray(Ztest), self.Z_gpu)
        kY = self.logkernelmatrix(Ytest, self.Y)
        # -(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y)
        return sum( -logsumexp(kZ+kY,1) + logsumexp(kZ,1) )/float(numtest)

    def _forwardMode_grad(self, Ytest, Ztest):
        Ytest = Ytest.reshape(self.d,-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = exp(-distmatrix(Ztest, self.Z))
        #kY = exp(-(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y))
        kY = self.kernelmatrix(Ytest, self.Y)
        return -((2.0/exp(self.loghy))\
                 *sum((kZ*kY)[:,:,newaxis]*
                      (self.Y[:,:,newaxis].transpose(2,1,0)-
                       Ytest[:,:,newaxis].transpose(1,2,0)),1).T
                                         /sum(kZ*kY,1)[newaxis,:]\
                                                   / float(numtest)).flatten()

    def backwardMode(self, Ytest, Zinit=None, ydimensions=None):
        """ Returns the conditional mode of z given y. """
        from scipy.optimize import fmin_cg
        from minimize import minimize
        if Zinit is None:
            if ydimensions is None:
                Zinit = self.Z_gpu[:,argmin(distmatrix(Ytest, self.Y),1)]
            else:
                Zinit = \
                   self.Z_gpu[:,argmin(distmatrix(Ytest, self.Y[ydimensions,:]),1)]
        else:
            Zinit = gnumpy.garray(normalizeshape(Zinit))
        if ydimensions is None:
            Ytest = Ytest.reshape(self.d,-1)
            #kY = -(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y)
            kY = self.logkernelmatrix_gpu(gnumpy.garray(Ytest), self.Y_gpu)
        else:
            if type(ydimensions[0]) == type(True):
                numdims = sum(ydimensions)
            else:
                numdims = len(ydimensions)
            Ytest = Ytest.reshape(numdims,-1)
            #kY = \
            #  -(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y[ydimensions,:])
            kY = self.logkernelmatrix_gpu(gnumpy.garray(Ytest), 
                                                            self.Y_gpu[ydimensions,:])
        #return fmin_cg(self._backwardMode_cost, Zinit.flatten(), 
        #                           self._backwardMode_grad, 
        #                           args=(kY, ), 
        #                           maxiter=100, 
        #                           disp=0).reshape(self.q,-1)
        #return minimize(Zinit.flatten(), 
        #                           self._backwardMode_cost, 
        #                           self._backwardMode_grad, 
        #                           args=(kY, ), 
        #                           maxnumlinesearch=100)[0].reshape(self.q,-1)

        return self.meanshift(Zinit, kY)
        #return bolddriver(self._backwardMode_cost, 
        #                  self._backwardMode_grad, 
        #                  Zinit.flatten(),
        #                  0.001,
        #                  maxnumsteps=1000,
        #                  args = (kY, KY)
        #                  ).reshape(self.q, -1)

#    def meanshift(self, Z, kY):
#        numtest = Z.shape[1]
#        Zout = zeros((self.q, numtest), 'single')
#        for i in range(numtest):
#            z = Z[:, i]
#            for step in range(10000):
#                z_old = z
#                kz = -distmatrix_gpu(z[:,None], self.Z_gpu)
#                kz += kY[i,:]
#                z = (gnumpy.exp(kz-logsumexp_gpu(kz, 1)) * self.Z_gpu).sum(1)
#                if ((z_old - z)**2).sum()/self.q < 10.0**-12.0:
#                    Zout[:, i] = z.asarray()
#                    break
#        return Zout

    def meanshift(self, Z, kY):
        numtest = Z.shape[1]
        Zout = zeros((self.q, numtest), 'single')
        kz = gnumpy.garray(zeros((1, self.numcases), 'single'))
        z = gnumpy.garray(zeros(self.q, 'single'))
        z_old = gnumpy.garray(zeros(self.q, 'single'))
        for i in range(numtest):
            z[:] = Z[:, i]
            for step in range(100):
                z_old[:] = z
                kz[:,:] = -distmatrix_gpu(z[:,None], self.Z_gpu)
                kz += kY[i,:]
                z[:] = (gnumpy.exp(kz-logsumexp_gpu(kz, 1)) * self.Z_gpu).sum(1)
                if ((z_old-z)*(z_old-z)).sum()/float(self.q) < 10.0**-8.0:
                    Zout[:, i] = z.asarray()
                    break
            gnumpy.free_reuse_cache()
        return Zout

    def _backwardMode_cost(self, Ztest, kY):
        Ztest = Ztest.reshape(self.q,-1)
        numtest = kY.shape[0]
        kZ = -distmatrix_gpu(gnumpy.garray(Ztest), self.Z_gpu)
        return (-logsumexp_gpu(kZ+kY,1) + logsumexp_gpu(kY,1)).sum()/float(numtest)

    def _backwardMode_grad(self, Ztest, kY):
        Ztest = Ztest.reshape(self.q,-1)
        numtest = kY.shape[0]
        kZ = gnumpy.exp(-distmatrix_gpu(gnumpy.garray(Ztest), self.Z_gpu))
        result = gnumpy.zeros((self.q, numtest))
        KY = gnumpy.exp(kY)
        for i in range(self.q):
            result[i,:] = \
                -2.0*(kZ*KY*(self.Z_gpu[i,:][newaxis,:]-Ztest[i,:][:,newaxis])).sum(1)/\
                        (kZ*KY).sum(1)[newaxis,:] / float(numtest)
        return result.asarray().flatten()
        #return -(2.0*sum((kZ*kY).asarray()[:,:,newaxis]*
        #              (self.Z_gpu.asarray()[:,:,newaxis].transpose(2,1,0)-
        #               Ztest[:,:,newaxis].transpose(1,2,0)),1).T
        #                                 /sum((kZ*kY).asarray(),1)[newaxis,:]\
        #                                           / float(numtest)).flatten()

    def latdensity(self, Z, Ytest=None):
        Z = normalizeshape(Z)
        if Ytest is None:
            #return sum(exp(-sum((Z-self.Z)**2,0))) / self.Znormalize
            #kz = -distmatrix(Z,self.Z)
            #lse = logsumexp(kz, 1)[:,newaxis]
            #return sum(exp(kz-lse),1) / self.Znormalize
            return sum(exp(-distmatrix(Z, self.Z_gpu.asarray())),1) / self.Znormalize
        else:
            Ytest = normalizeshape(Ytest)
            if Ytest.shape[1] != 1:
                raise "specify a single Ytest only"
            Kz = exp(-distmatrix(Z, self.Z_gpu.asarray()))
            ky = self.logkernelmatrix(Ytest, self.Y)
            Ky = exp(ky - logsumexp(ky, 1)[:,newaxis])
            return sum(Kz*Ky, 1).T / self.Znormalize

    def latentHessian(self, z, Ytest=None):
        z = normalizeshape(z)
        assert z.shape[1] == 1, 'Hessian computation supported only at single latent positions'
        if Ytest is None:
            weights = ones(self.numcases, 'float')/double(self.numcases)
        else:
            Ytest = normalizeshape(Ytest)
            assert Ytest.shape[1] == 1, 'Hessian computation supported only for a single observation'
            weights = self.logkernelmatrix(Ytest, self.Y)
            weights = exp(weights - logsumexp(weights)[:,newaxis])
        kZ = gnumpy.exp(-distmatrix_gpu(gnumpy.garray(z), self.Z_gpu))
        w = (weights*kZ.asarray()).flatten()
        w /= w.sum()
        A = zeros((self.q, self.q), 'float')
        Z_ = z-self.Z_gpu.asarray()
        for i in range(self.numcases):
            A -= w[i] * outer(Z_[:,i], Z_[:,i])
            for l in range(self.numcases):
                A += (w[i]*w[l]) * outer(Z_[:,i], Z_[:,l])
        result = 2.0 * eye(self.q) + 4.0 * A
        return result

    def obsdensity(self, Y, Ztest=None):
        Y = normalizeshape(Y)
        if Ztest is None:
#            return sum(exp(-sum((Y[:,newaxis]-self.Y)**2,0)/exp(self.loghy))) \
#               / self.Ynormalize
             return self.kernelmatrix(Y, self.Y)/self.Ynormalize
        else:
            Ztest = normalizeshape(Ztest)
            if Ztest.shape[1] != 1:
                raise "specify a single Ytest only"
            raise 'not completely implemented yet'
        Ky = self.kernelmatrix(Y, self.Y)
        kz = -distmatrix(Z, self.Z)
        Kz = exp(kz - logsumexp(kz, 1)[:,newaxis])
        return sum(Kz*Ky, 1).T / self.Ynormalize

#    def train(self, numsteps, verbose=True, stepsize=[0.001]):
#        for s in range(numsteps):
#            if stepsize[0] < 10**-10: 
#                print "stepsize < 10**-10: exiting" 
#                return
#            if s == 0:
#                oldcost = self.cost()
#                if verbose:
#                    print "initial cost: %f " % oldcost
#            g = self.grad()
#            self.updateparams(self.params_gpu - stepsize[0] * g)
#            newcost = self.cost()
#            if newcost <= oldcost:
#                if verbose:
#                    print "cost: %f " % newcost
#                    print "increasing step-size to %f" % stepsize[0]
#                oldcost = newcost
#                stepsize[0] = stepsize[0] * 1.1
#            else:
#                if verbose:
#                    print "cost: %f larger than best cost %f" % \
#                                                            (newcost, oldcost)
#                    print "decreasing step-size to %f" % stepsize[0]
#                self.updateparams(self.params_gpu + stepsize[0] * g)
#                newcost = oldcost
#                stepsize[0] *= 0.5


    def train_bolddriver(self, numsteps, setstepsize=None, verbose=False):
        if setstepsize is not None:
            self.stepsize = setstepsize
        for step in range(numsteps):
            if self.firstcall:
                self.firstcall = False
                self.oldcost = self.f(self.params_gpu)
                #print "initial cost: %f " % self.oldcost
            g = self.g(self.params_gpu)
            self.inc_gpu[:] = self.momentum*self.inc_gpu - self.stepsize * g 
            self.updateparams(self.params_gpu + self.inc_gpu)
            self.newcost = self.f(self.params_gpu)
            if self.newcost <= self.oldcost:
                if verbose:
                    print "cost: %f " % self.newcost
                    print "increasing step-size to %f" % self.stepsize
                self.oldcost = self.newcost
                self.stepsize = self.stepsize * 1.1
            else:
                if verbose:
                    print "cost: %f " % self.newcost
                    print "decreasing step-size to %f" % self.stepsize
                #roll back changes to parameters and increments:
                self.updateparams(self.params_gpu - self.inc_gpu)
                if self.momentum > 0.0:
                    self.inc_gpu[:] = (self.inc_gpu + self.stepsize * g) / self.momentum
                else:
                    self.inc_gpu *= 0.0
                self.newcost = self.oldcost
                self.stepsize = self.stepsize * 0.5
                if self.stepsize < 10.0**-8.0:
                    if verbose:
                        print 'stepssize < ', 10**.0-8.0, ' exiting.'
                    break

    train = train_bolddriver

    def train_cg(self, maxnumlinesearch=100):
        from minimize import minimize
        p, g, numlinesearches = minimize(self.params_gpu.asarray(), self.f, self.g, args=(), maxnumlinesearch=maxnumlinesearch)
        self.updateparams(p)
        return numlinesearches


class Cie(object):
    """ Conditional kernel information embedding.
    
    Compute a conditional embedding of data by factoring out known 
    information. 
    """
    def __init__(self, q, KX=None, X=None, hx=None, 
                          KY=None, Y=None, hy=None, reg=0.1):
        self.q = q
        self.d, self.numcases = Y.shape
        self.reg = reg     #amount of regularization
        self.Z = randn(self.q, self.numcases)*0.1
        self.Y = Y
        #some quantities useful for gradient/cost computation:
        if Y == None:
            self.KY = KY
        else:
            self.hy = hy
            self.KY = gaussKernelMatrix(Y, hy)
        if X == None:
            self.KX = KX
        else:
            self.hx = hx
            self.KX = gaussKernelMatrix(X, hx)
            self.X = X
        self.params = self.Z.reshape(self.q*self.numcases)

    def cost(self):
        #NOTE: H(Z) = -(MINUS!) logsum...
        KZ = gaussKernelMatrix(self.Z, 1.0)
        sumKZKX = sum(KZ*self.KX,1)
        sumKZKXKY = sum(KZ*self.KX*self.KY,1)
        cost = -sum( - log(sumKZKX) + log(sumKZKXKY) ) / self.numcases
        #add penalty:
        cost += (self.reg/self.numcases) * sum(sum(self.Z*self.Z))
        return cost

    def grad(self):
        #----this is redundant and could be avoided if cost/grad were together:
        KZ = gaussKernelMatrix(self.Z, 1.0)
        sumKZKX = sum(KZ*self.KX,1)
        sumKZKXKY = sum(KZ*self.KX*self.KY,1)
        #----------
        grad = zeros((self.q, self.numcases), dtype=float)
        KXoversumKZKX = self.KX/sumKZKX[newaxis,:]
        KXoversumKZKX_ = self.KX/sumKZKX[:,newaxis]
        KYKX = self.KY*self.KX
        KYKXoversumKZKXKY = KYKX/sumKZKXKY[newaxis,:]
        KYKXoversumKZKXKY_ = KYKX/sumKZKXKY[:,newaxis]
        for l in range(self.numcases):
            grad[:, l] += \
                -sum((self.Z[:,l][:,newaxis] - self.Z)*KZ[l, :][newaxis,:]*
                                (KXoversumKZKX_[l,:]
                                 +KXoversumKZKX[l,:]
                                 -KYKXoversumKZKXKY_[l,:]
                                 -KYKXoversumKZKXKY[l,:])[newaxis,:]
                    , 1)
        grad *= (2.0/self.numcases)
        #add penalty-gradient:
        grad = grad + 2 * (self.reg/self.numcases) * self.Z
        return grad.flatten()

    def f(self,params):
        """Wrapper function around cost function to check grads, etc."""
        paramsold = self.params.copy()
        self.updateparams(params.copy().flatten())
        result = self.cost() 
        self.updateparams(paramsold.copy())
        return result

    def g(self,params):
        """Wrapper function around gradient to check grads, etc."""
        paramsold = self.params.copy()
        self.updateparams(params.copy().flatten())
        result = self.grad()
        self.updateparams(paramsold.copy())
        return result

    def updateparams(self,newparams):
        self.params_gpu[:,:] = newparams

    def forward(self, Ztest):
        Ztest = normalizeshape(Ztest)
        #kz = -sum((z[:,newaxis]-self.Z)**2, 0)
        #lse = logsumexp(kz)
        #return sum(exp(kz-lse)[newaxis,:]*self.Y,1)
        kz = -distmatrix(Ztest, self.Z)
        lse = logsumexp(kz, 1)[:,newaxis]
        return sum(exp(kz-lse)[:,:,newaxis]* 
                    self.Y[:,:,newaxis].transpose(2, 1, 0), 1).T

    def backward(self, Ytest):
        Ytest = normalizeshape(Ytest)
        #ky = -(1.0/self.hy)*sum((y[:,newaxis]-self.Y)**2, 0)
        #lse = logsumexp(ky)
        #return sum(exp(ky-lse)[newaxis,:]*self.Z,1)
        ky = -(1.0/exp(self.loghy))*distmatrix(Ytest, self.Y)
        lse = logsumexp(ky, 1)[:,newaxis]
        return sum(exp(ky-lse)[:,:,newaxis]* 
                    self.Z[:,:,newaxis].transpose(2, 1, 0), 1).T

    def forwardxz(self, x, z):
        kz = -sum((z[:,newaxis]-self.Z)**2, 0)
        kx = -sum((x[:,newaxis]-self.X)**2, 0)
        k = kz + kx
        lse = logsumexp(k)[:,newaxis] #???
        return sum(exp(k-lse)[newaxis,:]*self.Y,1)

#    def project(self, y):
#        return self.forward(self.backward(y))

    def train(self, numsteps, verbose=True, stepsize=[0.001]):
        for s in range(numsteps):
            if stepsize[0] < 10**-10: 
                print "stepsize < 10**-10: exiting" 
                return
            if s == 0:
                oldcost = self.cost()
                if verbose:
                    print "initial cost: %f " % oldcost
            g = self.grad()
            self.params -= stepsize[0] * g
            newcost = self.cost()
            if newcost <= oldcost:
                if verbose:
                    print "cost: %f " % newcost
                    print "increasing step-size to %f" % stepsize[0]
                oldcost = newcost
                stepsize[0] = stepsize[0] * 1.1
            else:
                if verbose:
                    print "cost: %f larger than best cost %f" % \
                                                            (newcost, oldcost)
                    print "decreasing step-size to %f" % stepsize[0]
                self.params += stepsize[0] * g
                newcost = oldcost
                stepsize[0] *= 0.5


class SharedKie(object):
    """ Shared kernel information embeddings. 
    
    Compute embeddings for several data sets simultaneously. Corresponding 
    data-points in the different data sets are represented by the same latent 
    space elements. 

    The constructor takes the following parameters:
    q       : Dimensionality of the latent space.
    hs      : A sequence of bandwidths, one for each input data set. 
    datasets: A sequence of observable data matrices. Data matrices need to 
              have the shape (number of dimensions) x (number of data points). 
              The number of data points has to be the same in all matrices, 
              the number of dimensions may vary.
    weights : A sequence of weights that can be used to weight the relative 
              importance of the different data-sets. 
    reg     : Regularization multiplier used to weight the regularization 
              term relative to the kie objective function. 

    optional parameters:
    The meaning of optional parameters is the same as for the corresponding 
    parameters in class Kie. See class Kie for details.

    The methods cost and grad can be used in conjunction with a gradient-based
    optimizer to train the model. Alternatively, the method train can 
    be used to train the model, but it is not very fast as it uses simple 
    gradient descent. 
    """
    def __init__(self, q, hs, datasets, weights, regs, loo=False, 
                           anisotropic=False, penfuncs=[penfunc_squarednorm], 
                                              pengrads=[pengrad_squarednorm]):
        self.loo = loo   #use leave-one-out-estimate?
        self.weights = weights
        self.q = q
        self.penfuncs = penfuncs
        self.pengrads = pengrads
        self.datasets = datasets
        self.loghs = [array(log(h)) for h in hs]
        self.numdatasets = len(self.loghs)
        self.dimss, self.numcasess = zip(*[D.shape for D in datasets])
        self.numcases = self.numcasess[0]
        self.anisotropic = anisotropic
        self.regs = regs     #amount of regularization
        #self.Z = 0.1*randn(self.q, self.numcases)
        #self.GramZ = zeros((self.numcases, self.numcases), 'float')
        #self.kZ = zeros((self.numcases, self.numcases), 'float')
        self.kZ_gpu = gnumpy.garray(zeros((self.numcases, self.numcases), 'float'))
        #some quantities useful for gradient/cost computation:
        if not self.anisotropic:
            Grams = [dot(x.T,x) for x in self.datasets]
            self.Ds = [diag(Gram)[newaxis,:]+diag(Gram)[:,newaxis]-2*Gram
                                                           for Gram in Grams]
            self.kYs_gpu = [gnumpy.garray(-(1.0/exp(logh)) * DD)
                                    for logh, DD in zip(self.loghs, self.Ds)]
        else:
            self.kYs_gpu = [gnumpy.garray(log_anisotropicKernelMatrix(Y, h=exp(logh)))
                              for logh, Y in zip(self.loghs, self.datasets)]
        self.params_gpu = gnumpy.garray(randn(self.numcases*self.q)*0.1)
        self.Z_gpu = self.params_gpu[:self.q*self.numcases].reshape(self.q, self.numcases)
        self.inc_gpu = gnumpy.garray(zeros(self.numcases*self.q))

        self.sumKZ_gpu = gnumpy.empty(self.numcases)
        self.oneoversumKZ_gpu = gnumpy.empty(self.numcases)
        self.KZ_gpu = gnumpy.empty((self.numcases, self.numcases))
        self.ZZ_gpu = gnumpy.empty((self.numcases, self.numcases))
        self.Z2_gpu = gnumpy.empty(self.numcases)
        self.ZZFactor_gpu = gnumpy.empty((self.numcases, self.numcases))
        self.ILarge_gpu = gnumpy.garray(eye(self.numcases, self.numcases))
        self.ILarge_gpu *= LARGE
        self.gradZ_gpu = gnumpy.zeros((self.q, self.numcases))
        self.sumKZKYs = [None]*self.numdatasets
        self.oneoversumKZKYs_gpu = [None]*self.numdatasets
        for d in range(self.numdatasets):
            self.sumKZKYs[d] = gnumpy.empty(self.numcases)
            self.oneoversumKZKYs_gpu[d] = gnumpy.empty(self.numcases)
        if self.loo: 
            for d in range(self.numdatasets):
                self.kYs_gpu[d] -= self.ILarge_gpu
        self.KYs_gpu = [K.exp() for K in self.kYs_gpu]
        self.firstcall = True

        self.momentum = 0.9
        self.stepsize = 0.01

    def logkernelmatrix(self, Ytest, Y=None, h=None):
        if Y is None:
            Y = Ytest
        if not self.anisotropic:
            return -(1.0/h)*distmatrix(Ytest, Y)
        else:
            return log_anisotropicKernelMatrix(Y, Ytest, h=h)

    def kernelmatrix(self, Ytest, Y=None, h=None):
        if Y is None:
            Y = Ytest
        if not self.anisotropic:
            return exp(-(1.0/h)*distmatrix(Ytest, Y))
        else:
            return gnumpy.exp(log_anisotropicKernelMatrix(Y, Ytest, h=h))

    def oldcost(self):
        GramZ = dot(self.Z_gpu.asarray().T, self.Z_gpu.asarray())
        kZ = -(diag(GramZ)[newaxis,:]+diag(GramZ)[:,newaxis]-2*GramZ)
        if self.loo:
            kZ -= eye(self.numcases)*LARGE
        cost = 0.0
        for i in range(self.numdatasets):
            cost -= sum(logsumexp(self.kYs_gpu[i].asarray()+kZ,1)) * self.weights[i]
            cost += sum(logsumexp(kZ,1)) * self.weights[i]
        cost /= float(self.numcases) #* float(self.numdatasets)
        #add penalty:
        for i, penfunc in enumerate(self.penfuncs):
            cost += (self.regs[i]/float(self.numcases)) * penfunc(self.Z_gpu.asarray())
        return cost

    def cost(self):
        #self.GramZ[:,:] = dot(self.Z.T, self.Z)
        #kZ = -(diag(GramZ)[newaxis,:]+diag(GramZ)[:,newaxis]-2*GramZ)
        self.Z2_gpu[:] = (self.Z_gpu * self.Z_gpu).sum(0)
        self.kZ_gpu[:,:] = -self.Z2_gpu[newaxis,:]
        self.kZ_gpu -= self.Z2_gpu[:,newaxis]
        self.kZ_gpu += 2*gnumpy.dot(self.Z_gpu.T, self.Z_gpu)

        if self.loo:
            self.kZ_gpu -= self.ILarge_gpu
        cost = 0.0
        for d in range(self.numdatasets):
            cost -= (logsumexp_gpu(self.kYs_gpu[d]+self.kZ_gpu,1)).sum() * self.weights[d]
            cost += (logsumexp_gpu(self.kZ_gpu,1)).sum() * self.weights[d]
        cost /= float(self.numcases) #* float(self.numdatasets)
        #add penalty:
        for i, penfunc in enumerate(self.penfuncs):
            cost += (self.regs[i]/float(self.numcases)) * penfunc(self.Z_gpu.asarray())
        return float(cost)
        #return cost

    def oldgrad(self):
        GramZ = dot(self.Z_gpu.asarray().T, self.Z_gpu.asarray())
        KZ = exp(-(diag(GramZ)[newaxis,:]+diag(GramZ)[:,newaxis]-2*GramZ))
        if self.loo:
            KZ *= (1.-eye(self.numcases))
        sumKZ = sum(KZ,1)
        oneoversumKZ = 1./sumKZ 
        sumKZKs = [sum(KZ*KY.asarray(),1) for KY in self.KYs_gpu]
        #sumKZKY = sum(KZy*self.KY,1)
        #sumKZKX = sum(KZx*self.KX,1)
        gradZ = zeros((self.q, self.numcases), dtype=float) 
        #gradZx = gradZ[:,:self.numcasesX]
        #gradZy = gradZ[:,-self.numcasesY:]
        for i in range(self.numdatasets):
            for l in range(self.numcases):
                gradZ[:, l] += self.weights[i]*\
                    2*sum(((self.Z_gpu.asarray()[:,l][:,newaxis] - self.Z_gpu.asarray())
                          *KZ[l,:][newaxis,:]
                          *( (self.KYs_gpu[i].asarray()[l,:]/sumKZKs[i][l])[:,newaxis]
                            +(self.KYs_gpu[i].asarray()[l,:]/sumKZKs[i].T)[:,newaxis]
                            -oneoversumKZ[l]
                            -oneoversumKZ.T[:,newaxis]
                           ).T),1)
        gradZ /= float(self.numcases) #* float(self.numdatasets)
        for i, pengrad in enumerate(self.pengrads):
            gradZ += (self.regs[i]/self.numcases) * pengrad(self.Z_gpu.asarray())
        return gradZ.flatten()

    def grad(self):
        self.Z2_gpu[:] = (self.Z_gpu * self.Z_gpu).sum(0)
        self.kZ_gpu[:,:] = -self.Z2_gpu[newaxis,:]
        self.kZ_gpu -= self.Z2_gpu[:,newaxis]
        self.kZ_gpu += 2*gnumpy.dot(self.Z_gpu.T, self.Z_gpu)
        self.KZ_gpu[:,:] = gnumpy.exp(self.kZ_gpu)
        if self.loo:
            self.kZ_gpu -= self.ILarge_gpu
            self.KZ_gpu[:,:] = gnumpy.exp(self.kZ_gpu)
        self.sumKZ_gpu[:] = self.KZ_gpu.sum(1)
        self.oneoversumKZ_gpu[:] = 1.0/(self.sumKZ_gpu+SMALL)
        for d in range(self.numdatasets):
            self.sumKZKYs[d][:] = (self.KZ_gpu*self.KYs_gpu[d]).sum(1) 
            self.oneoversumKZKYs_gpu[d][:] = 1.0/(self.sumKZKYs[d]+SMALL)
        #gradZ = zeros((self.q, self.numcases), dtype=float) 
        for d in range(self.numdatasets):
#            for l in range(self.numcases):
#                gradZ[:, l] += self.weights[d]*\
#                    2*sum(((self.Z[:,l][:,newaxis] - self.Z)
#                          *KZ[l,:][newaxis,:]
#                          *( (self.Ks[d][l,:]/sumKZKs[d][l])[:,newaxis]
#                            +(self.Ks[d][l,:]/sumKZKs[d].T)[:,newaxis]
#                            -oneoversumKZ[l]
#                            -oneoversumKZ.T[:,newaxis]
#                           ).T),1)
            self.ZZFactor_gpu[:,:] = self.KYs_gpu[d]*self.oneoversumKZKYs_gpu[d][:,newaxis]
            self.ZZFactor_gpu += self.KYs_gpu[d]*self.oneoversumKZKYs_gpu[d][newaxis,:]
            self.ZZFactor_gpu -= self.oneoversumKZ_gpu[newaxis,:]
            self.ZZFactor_gpu -= self.oneoversumKZ_gpu[:,newaxis]
            self.ZZFactor_gpu *= self.KZ_gpu
            for i in range(self.q):
                self.ZZ_gpu[:,:] = 0.0
                self.ZZ_gpu += self.Z_gpu[i, :][:,newaxis]
                self.ZZ_gpu -= self.Z_gpu[i, :][newaxis,:]
                self.ZZ_gpu *= self.ZZFactor_gpu
                self.gradZ_gpu[i, :] += 2*self.weights[d]*self.ZZ_gpu.sum(1)
        self.gradZ_gpu /= float(self.numcases) #* float(self.numdatasets)
        for i, pengrad in enumerate(self.pengrads):
            self.gradZ_gpu += (self.regs[i]/self.numcases) * pengrad(self.Z_gpu.asarray())
        #return self.gradZ_gpu.asarray().flatten()
        return self.gradZ_gpu.reshape(-1)

    def f(self, params):
        """Wrapper function around cost function to check grads, etc."""
        paramsold = self.params_gpu.asarray()
        self.updateparams(params)
        result = self.cost() 
        self.updateparams(paramsold)
        return result

    def g(self, params):
        """Wrapper function around gradient to check grads, etc."""
        paramsold = self.params_gpu.asarray()
        self.updateparams(params)
        result = self.grad()
        self.updateparams(paramsold)
        return result.asarray()

    def updateparams(self, newparams):
        #self.params[:] = newparams
        if type(newparams) == type(array([])):
            self.params_gpu[:] *= 0.0
            self.params_gpu[:] += gnumpy.garray(newparams)
        else:
            self.params_gpu[:] = newparams

    def forwardMean(self, Ztest, indexes):
        indexes = listify(indexes)
        Ztest = normalizeshape(Ztest)
        kz = -distmatrix(Ztest, self.Z_gpu.asarray())
        lse = logsumexp(kz, 1)[:,newaxis]
        return [sum(exp(kz-lse)[:,:,newaxis]* 
                    self.datasets[i][:,:,newaxis].transpose(2, 1, 0), 1).T 
                                                              for i in indexes]
    def setYtest(self, Ytests, indexes):
        self.indexes = listify(indexes)
        Ytests = listify(Ytests)
        self.Ytests = [normalizeshape(Ytest) for Ytest in Ytests]
        self.kytest = zeros((self.Ytests[0].shape[1], self.numcases), dtype=float)
        for Ytest,i,logh in zip(self.Ytests, self.indexes, self.loghs):
            self.kytest += self.logkernelmatrix(Ytest, self.datasets[i], h=exp(logh))
        self.kytest_lse = logsumexp(self.kytest, 1)[:,newaxis]

    def backwardMean(self, Ytests=None, indexes=None):
        if Ytests is None:
            ky = self.kytest
            lse = self.kytest_lse
        else:
            indexes = listify(indexes)
            Ytests = listify(Ytests)
            Ytests = [normalizeshape(Ytest) for Ytest in Ytests]
            ky = zeros((Ytests[0].shape[1], self.numcases), dtype=float)
            for Ytest,i,logh in zip(Ytests, indexes, self.loghs):
                #ky += -(1.0/exp(logh))*distmatrix(Ytest, self.datasets[i]) 
                ky += self.logkernelmatrix(Ytest, self.datasets[i], h=exp(logh))
            lse = logsumexp(ky, 1)[:,newaxis]
        return sum(exp(ky-lse)[:,:,newaxis]* 
                                   self.Z_gpu.asarray()[:,:,newaxis].transpose(2, 1, 0), 1).T

    def backwardVar(self, Ytests, indexes):
        if self.missing is not None:
            assert 0, 'backwardVar for missing data not implemented'
        indexes = listify(indexes)
        Ytests = listify(Ytests)
        Ytests = [normalizeshape(Ytest) for Ytest in Ytests]
        assert Ytests[0].shape[1] == 1, 'can compute cov only one point at a time'
        ky = zeros((Ytests[0].shape[1], self.numcases), dtype=float)
        for Ytest,i,logh in zip(Ytests, indexes, self.loghs):
            ky += self.logkernelmatrix(Ytest, self.datasets[i], h=exp(logh))
        lse = logsumexp(ky, 1)[:,newaxis]
        Ky = exp(ky-lse)
        Zy = self.Z_gpu.asarray() * Ky 
        result = eye(self.q) * 1.0
        result += dot(Zy, self.Z_gpu.asarray().T)
        for l in range(self.numcases):
            result -= (Zy[:,l][newaxis,newaxis,:] * Zy[:,:,newaxis]).sum(1)
        return result

    forward = forwardMean

    backward = backwardMean

    def forwardMode(self, Ztest, Yinit=None, index=None):
        """ Returns the conditional mode of y given z. 
        
        Only a single index is allowed. 
        """
        from scipy.optimize import fmin_cg
        if Yinit is None:
            Yinit=self.datasets[index][:,argmin(distmatrix(Ztest, self.Z_gpu.asarray()),1)]
        return fmin_cg(self._forwardMode_cost, Yinit.flatten(), 
                                   self._forwardMode_grad, 
                                   args=(Ztest.flatten(), index), 
                                   maxiter=100, 
                                   disp=0).reshape(self.dimss[index],-1)

    def _forwardMode_cost(self, Ytest, Ztest, index):
        Ytest = Ytest.reshape(self.dimss[index],-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = -distmatrix(Ztest, self.Z_gpu.asarray())
        kY = -(1.0/exp(self.loghs[index])) \
              * distmatrix(Ytest, self.datasets[index])
        return sum( -logsumexp(kZ+kY,1) + logsumexp(kZ,1) )/float(numtest)

    def _forwardMode_grad(self, Ytest, Ztest, index):
        Ytest = Ytest.reshape(self.dimss[index],-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = exp(-distmatrix(Ztest, self.Z_gpu.asarray()))
        kY = exp(-(1.0/exp(self.loghs[index])) 
                  * distmatrix(Ytest, self.datasets[index]))
        return -((2.0/exp(self.loghs[index]))\
                 *sum((kZ*kY)[:,:,newaxis]*
                      (self.datasets[index][:,:,newaxis].transpose(2,1,0)-
                       Ytest[:,:,newaxis].transpose(1,2,0)),1).T
                                         /sum(kZ*kY,1)[newaxis,:]\
                                                   / float(numtest)).flatten()

    def backwardNn(self, Ytests, indexes):
        indexes = listify(indexes)
        Ytests = listify(Ytests)
        numtest = Ytests[0].shape[1]
        dY = mean([distmatrix(Ytest, self.datasets[i]) 
                                         for Ytest, i in zip(Ytests, indexes)], 0)
        return self.Z_gpu.asarray()[:,argmin(dY,1)]
        
    def backwardMode(self, Ytests=None, indexes=None, Zinit=None):
        """ Returns the conditional mode of z given y. """
        from scipy.optimize import fmin_cg
        from minimize import minimize
        if Ytests is None:
            kY = self.kytest
            Ytests = self.Ytests
            indexes = self.indexes
        else:
            indexes = listify(indexes)
            Ytests = listify(Ytests)
            numtest = Ytests[0].shape[1]
            kY = zeros((numtest, self.numcases), dtype=float)
            for Ytest,i in zip(Ytests, indexes):
                kY += self.logkernelmatrix(
                                 Ytest, self.datasets[i], h=exp(self.loghs[i]))
        kY = gnumpy.garray(kY)

        if Zinit is None:
            #if not self.anisotropic:
            #    dY = mean([distmatrix(Ytest, self.datasets[i]) 
            #                             for Ytest, i in zip(Ytests, indexes)], 0)
            #else:
            dY = mean([-kY for Ytest, i in zip(Ytests, indexes)], 0)
            Zinit = self.Z_gpu[:,argmin(dY,1)]
        else:
            Zinit = gnumpy.garray(normalizeshape(Zinit))

        return self.meanshift(Zinit, kY)

    def meanshift(self, Z, kY):
        numtest = Z.shape[1]
        Zout = zeros((self.q, numtest), 'single')
        kz = gnumpy.garray(zeros((1, self.numcases), 'single'))
        z = gnumpy.garray(zeros(self.q, 'single'))
        z_old = gnumpy.garray(zeros(self.q, 'single'))
        for i in range(numtest):
            z[:] = Z[:, i]
            for step in range(100):
                z_old[:] = z
                kz[:,:] = -distmatrix_gpu(z[:,None], self.Z_gpu)
                kz += kY[i,:]
                z[:] = (gnumpy.exp(kz-logsumexp_gpu(kz, 1)) * self.Z_gpu).sum(1)
                if ((z_old-z)*(z_old-z)).sum()/float(self.q) < 10.0**-8.0:
                    Zout[:, i] = z.asarray()
                    break
            gnumpy.free_reuse_cache()
        return Zout

    def _backwardMode_cost_nocuda(self, Ztest, kY, KY_dontneed):
        Ztest = Ztest.reshape(self.q,-1)
        numtest = kY.shape[0]
        kZ = -distmatrix(Ztest, self.Z_gpu.asarray())
        return sum( -logsumexp(kZ+kY,1) + logsumexp(kY,1) )/float(numtest)

    def _backwardMode_grad_nocuda(self, Ztest, kY_dontneed, KY):
        Ztest = Ztest.reshape(self.q,-1)
        numtest = kY.shape[0]
        KZ = exp(-distmatrix(Ztest, self.Z_gpu.asarray()))
        KY = exp(kY)
        return -(2.0*sum((KZ*KY)[:,:,newaxis]*
                      (self.Z_gpu.asarray()[:,:,newaxis].transpose(2,1,0)-
                       Ztest[:,:,newaxis].transpose(1,2,0)),1).T
                                         /sum(KZ*KY,1)[newaxis,:]\
                                                   / float(numtest)).flatten()

    def _backwardMode_cost(self, Ztest, kY, KY_dontneed):
        Ztest = Ztest.reshape(self.q,-1)
        numtest = kY.shape[0]
        kZ = -distmatrix_gpu(gnumpy.garray(Ztest), self.Z_gpu)
        return (-logsumexp_gpu(kZ+kY,1) + logsumexp_gpu(kY,1)).sum()/float(numtest)

    def _backwardMode_grad(self, Ztest, kY_dontneed, KY):
        Ztest = Ztest.reshape(self.q,-1)
        numtest = KY.shape[0]
        KZ = gnumpy.exp(-distmatrix_gpu(gnumpy.garray(Ztest), self.Z_gpu))
        result = gnumpy.zeros((self.q, numtest))
        KZKY = (KZ*KY + SMALL).sum(1)[newaxis,:]
        for i in range(self.q):
            result[i,:] = \
                -2.0*(KZ*KY*(self.Z_gpu[i,:][newaxis,:]-Ztest[i,:][:,newaxis])).sum(1)/\
                        KZKY / float(numtest)
        assert not isnan(result.asarray().sum() ), "nan in _backwardMode_grad"
        return result.asarray().flatten()

    def backwardSample(self, Ytests=None, indexes=None, numsamples=1):
        if Ytests is None:
            kY = self.kytest
            kY_lse = self.kytest_lse
            Ytests = self.Ytests
        else:
            Ytests = listify(Ytests)
            indexes = listify(indexes)
            numtest = Ytests[0].shape[1]
            kY = zeros((numtest, self.numcases), dtype=float)
            for Ytest,i in zip(Ytests, indexes):
                kY += self.logkernelmatrix(
                                 Ytest, self.datasets[i], h=exp(self.loghs[i]))
            kY_lse = logsumexp(kY, 1)[:,newaxis]
        kY = exp(kY - kY_lse)
        #Ytests = listify(Ytests)
        #indexes = listify(indexes)
        #Ytests = [normalizeshape(Ytest) for Ytest in Ytests]
        #kY = zeros((Ytests[0].shape[1], self.numcases), dtype=float)
        #for Ytest,i in zip(Ytests, indexes):
        #    #kY += -(1.0/exp(logh))*distmatrix(Ytest, self.datasets[i]) 
        #    kY += self.logkernelmatrix(
        #                         Ytest, self.datasets[i], h=exp(self.loghs[i]))
        #kY = exp(kY - logsumexp(kY, 1)[:,newaxis])
        result = zeros((self.q, Ytests[0].shape[1], numsamples), dtype=float)
        for i in range(Ytests[0].shape[1]):
            indexes = drawDiscrete(kY[i ,:], numsamples)
            result[:,i,:] = (randn(self.q, numsamples) + self.Z_gpu.asarray()[:,indexes])
        return result 

#    def forwardSample(self, Ztest, numsamples=1):
#        Ztest = normalizeshape(Ztest)
#        kz = -distmatrix(Ztest, self.Zx)
#        kz =  exp(kz - logsumexp(kz, 1)[:,newaxis])
#        result = zeros((self.dimsX, Ztest.shape[1], numsamples), dtype=float)
#        stddev = sqrt(exp(self.loghx))
#        for i in range(Ztest.shape[1]):
#            indexes = drawDiscrete(kz[i ,:], numsamples)
#            result[:,i,:] = (stddev*randn(self.dimsX, numsamples) 
#                                                           + self.X[:,indexes])
#        return result 

    def latdensity(self, Z, Ytests=None, indexes=None):
        Z = normalizeshape(Z)
        if Ytests is None:
            return sum(exp(-sum((Z-self.Z_gpu.asarray())**2,0))) / self.Znormalize
        else:
            indexes = listify(indexes)
            Ytests = listify(Ytests)
            Ytests = [normalizeshape(Ytest) for Ytest in Ytests]
            assert Ytests[0].shape[1] == 1, "specify a single Ytest only"
            Kz = exp(-distmatrix(Z, self.Z_gpu.asarray()))
            Ky = zeros((1, self.numcases), dtype=float)
            for Ytest,i in zip(Ytests, indexes):
                #Ky += -(1.0/exp(logh))*distmatrix(Ytest, self.datasets[i]) 
                Ky += self.logkernelmatrix(
                                 Ytest, self.datasets[i], h=exp(self.loghs[i]))
            Ky = exp(Ky - logsumexp(Ky, 1)[:,newaxis])
            return sum(Kz*Ky, 1).T #/ self.Znormalize

    def latentHessian(self, z, Ytests=None, indexes=None):
        z = normalizeshape(z)
        assert z.shape[1] == 1, 'Hessian computation supported only at single latent positions'
        if Ytests is None:
            weights = ones(self.numcases, 'float')/double(self.numcases)
        else:
            indexes = listify(indexes)
            Ytests = listify(Ytests)
            Ytests = [normalizeshape(Ytest) for Ytest in Ytests]
            assert Ytests[0].shape[1] == 1, 'Hessian computation supported only for a single observation'
            weights = zeros((1, self.numcases), dtype=float)
            for Ytest,i in zip(Ytests, indexes):
                weights += self.logkernelmatrix(
                                 Ytest, self.datasets[i], h=exp(self.loghs[i]))
            weights = exp(weights - logsumexp(weights, 1)[:,newaxis])
        kZ = gnumpy.exp(-distmatrix_gpu(gnumpy.garray(z), self.Z_gpu))
        w = (weights*kZ.asarray()).flatten()
        w /= w.sum()
        A = zeros((self.q, self.q), 'float')
        Z_ = z-self.Z_gpu.asarray()
        for i in range(self.numcases):
            A -= w[i] * outer(Z_[:,i], Z_[:,i])
            for l in range(self.numcases):
                A += (w[i]*w[l]) * outer(Z_[:,i], Z_[:,l])
        result = 2.0 * eye(self.q) + 4.0 * A
        return result

    def obsdensity(self,y):
        return sum(exp(-sum((y[:,newaxis]-self.Y)**2,0) / exp(self.loghy))) \
               / self.Ynormalize

    def rankNeighbors(self, Ytest, indexes):
        """ Returns indexes into the training data, based on conditional 
            densities of latent space representatives.  
        """
        Ytest = normalizeshape(Ytest)
        return argsort(-self.latdensity(self.Z_gpu.asarray(), Ytests=Ytest, indexes=indexes))

    def train_bolddriver(self, numsteps, setstepsize=None, verbose=False):
        if setstepsize is not None:
            self.stepsize = setstepsize
        for step in range(numsteps):
            if self.firstcall:
                self.firstcall = False
                self.oldcost = self.f(self.params_gpu)
                #print "initial cost: %f " % self.oldcost
            g = self.g(self.params_gpu)
            self.inc_gpu[:] = self.momentum*self.inc_gpu - self.stepsize * g 
            self.updateparams(self.params_gpu + self.inc_gpu)
            self.newcost = self.f(self.params_gpu)
            if self.newcost <= self.oldcost:
                if verbose:
                    print "cost: %f " % self.newcost
                    print "increasing step-size to %f" % self.stepsize
                self.oldcost = self.newcost
                self.stepsize = self.stepsize * 1.1
            else:
                if verbose:
                    print "cost: %f " % self.newcost
                    print "decreasing step-size to %f" % self.stepsize
                #roll back changes to parameters and increments:
                self.updateparams(self.params_gpu - self.inc_gpu)
                if self.momentum > 0.0:
                    self.inc_gpu[:] = (self.inc_gpu + self.stepsize * g) / self.momentum
                else:
                    self.inc_gpu *= 0.0
                self.newcost = self.oldcost
                self.stepsize = self.stepsize * 0.5
                if self.stepsize < 10.0**-8.0:
                    if verbose:
                        print 'stepssize < ', 10**.0-8.0, ' exiting.'
                    break

    train = train_bolddriver

    def train_cg(self, maxnumlinesearch=100):
        from minimize import minimize
        p, g, numlinesearches = minimize(self.params_gpu.asarray(), self.f, self.g, args=(), maxnumlinesearch=maxnumlinesearch, verbose=False)
        print 'used ', numlinesearches, 'linesearches'
        self.updateparams(p)
        return numlinesearches



class SharedKie2(object):
    """ Shared kernel information embeddings for exactly two data-sets. 
    
    Compute embeddings for exactly two data sets simultaneously. Corresponding 
    data-points in the different data sets are represented by the same latent 
    space elements. In contrast to SharedKie, SharedKie2 allows you to let only 
    a subset of data points share an embedding. SharedKie2 also allows the two 
    data-sets to contain different numbers of data-points. 

    The parameter numshared is an integer that is used to specify the number 
    of points that share latent representatives. To specify which data points 
    in the two data-sets share latent representatives, the following 
    convention is used: The points contained in the _last_ numshared columns
    of the input data matrix X are assumed to correspond with the _first_
    numshared columns of input data matrix Y. This can be illustrated as 
    follows:

    datamatrix X:    nnnnnnnnnnnnnnnssss
    datamatrix Y:                   ssssnnnnnnnnnnnnnnnnnnnnnnnnnnn
    latent matrix:   llllllllllllllllllllllllllllllllllllllllllllll

    Here 'n' denotes points that do not share their latent representatives, 
    's' denotes points that share their latent representatives and 'l' 
    denote the latent representatives (whose number, consequently, is: 
    total number of points in X + total number of points in Y - numshared). 

    The methods cost and grad can be used in conjunction with a gradient-based
    optimizer to train the model. Alternatively, the method train can 
    be used to train the model, but it is not very fast as it uses simple 
    gradient descent. 

    This class, in contrast to Kie and SharedKie, currently does not provide GPU-support. 

    """
    def __init__(self, q, hx, X, hy, Y, numshared, reg, loo, 
                                                  penfunc=penfunc_squarednorm, 
                                                  pengrad=pengrad_squarednorm):
        self.loo = loo   #use leave-one-out-estimate?
        self.q = q
        self.X = X
        self.Y = Y
        self.penfunc = penfunc
        self.pengrad = pengrad
        self.numshared = numshared
        self.loghy = array(log(hy))
        self.loghx = array(log(hx))
        self.dimsX, self.numcasesX = X.shape
        self.dimsY, self.numcasesY = Y.shape
        self.numcases = self.numcasesX + self.numcasesY - self.numshared
        self.reg = reg     #amount of regularization
        self.Z = 0.1*randn(self.q, self.numcases)
        self.Zjoint = \
                  self.Z[:,self.numcasesX:self.numcasesX+self.numshared]
        self.Zx = self.Z[:,:self.numcasesX]
        self.Zy = self.Z[:,-self.numcasesY:]
        #some quantities useful for gradient/cost computation:
        GramX = dot(X.T,X)
        self.XX = diag(GramX)[newaxis,:] + diag(GramX)[:,newaxis] - 2 * GramX
        self.kX = -(1.0/exp(self.loghx)) * self.XX
        GramY = dot(Y.T,Y)
        self.YY = diag(GramY)[newaxis,:] + diag(GramY)[:,newaxis] - 2 * GramY
        self.kY = -(1.0/exp(self.loghy)) * self.YY
        #if self.loo: 
        #    self.kY -= eye(self.numcases)*LARGE
        self.KY = exp(self.kY)  
        self.KX = exp(self.kX) 
        self.params = self.Z.reshape(-1)

    def cost(self):
        GramZx = dot(self.Zx.T, self.Zx)
        GramZy = dot(self.Zy.T, self.Zy)
        kZx = -(diag(GramZx)[newaxis,:] + diag(GramZx)[:,newaxis] - 2 * GramZx)
        kZy = -(diag(GramZy)[newaxis,:] + diag(GramZy)[:,newaxis] - 2 * GramZy)
        if self.loo:
            kZx -= eye(self.numcasesX)*LARGE
            kZy -= eye(self.numcasesY)*LARGE
        cost = - sum(logsumexp(self.kX+kZx,1))/self.numcasesX \
               + sum(logsumexp(kZx,1))/self.numcasesX \
               - sum(logsumexp(self.kY+kZy,1))/self.numcasesY \
               + sum(logsumexp(kZy,1))/self.numcasesY
        #add penalty:
        cost += (self.reg/self.numcases) * self.penfunc(self.Z_gpu.asarray())
        return cost
    
    def grad(self):
        GramZx = dot(self.Zx.T,self.Zx)
        GramZy = dot(self.Zy.T,self.Zy)
        KZx = exp(-(diag(GramZx)[newaxis,:] + diag(GramZx)[:,newaxis] - 
                                                                  2*GramZx))
        KZy = exp(-(diag(GramZy)[newaxis,:] + diag(GramZy)[:,newaxis] - 
                                                                  2*GramZy))
        if self.loo:
            KZx *= (1.-eye(self.numcasesX))
            KZy *= (1.-eye(self.numcasesY))
        sumKZx = sum(KZx,1)
        sumKZy = sum(KZy,1)
        oneoversumKZx = 1./sumKZx
        oneoversumKZy = 1./sumKZy
        sumKZKY = sum(KZy*self.KY,1)
        sumKZKX = sum(KZx*self.KX,1)
        gradZ = zeros((self.q, self.numcases), dtype=float) 
        gradZx = gradZ[:,:self.numcasesX]
        gradZy = gradZ[:,-self.numcasesY:]
        for l in range(self.numcasesX):
            gradZx[:, l] += \
                2*sum(((self.Zx[:,l][:,newaxis] - self.Zx)
                      *KZx[l,:][newaxis,:]
                      *( (self.KX[l,:]/sumKZKX[l])[:,newaxis]
                        +(self.KX[l,:]/sumKZKX.T)[:,newaxis]
                        -oneoversumKZx[l]
                        -oneoversumKZx.T[:,newaxis]
                       ).T),1)/self.numcasesX
        for l in range(self.numcasesY):
            gradZy[:, l] += \
                2*sum(((self.Zy[:,l][:,newaxis] - self.Zy)
                      *KZy[l,:][newaxis,:]
                      *( (self.KY[l,:]/sumKZKY[l])[:,newaxis]
                        +(self.KY[l,:]/sumKZKY.T)[:,newaxis]
                        -oneoversumKZy[l]
                        -oneoversumKZy.T[:,newaxis]
                       ).T),1)/self.numcasesY

        gradZ += (self.reg/self.numcases) * self.pengrad(self.Z_gpu.asarray())
        return gradZ.flatten()

    def updateloghxy(self, loghx, loghy):
        self.loghx = loghx
        self.loghy = loghy
        self.kY = -(1.0/exp(self.loghy)) * self.YY
        self.kX = -(1.0/exp(self.loghx)) * self.XX
        if self.loo: 
            self.kY -= eye(self.numcasesY)*LARGE
            self.kX -= eye(self.numcasesX)*LARGE
        self.KY = exp(self.kY) 
        self.KX = exp(self.kX) 
 
    def f(self,params):
        """Wrapper function around cost function to check grads, etc."""
        paramsold = self.params.copy()
        self.updateparams(params.copy().flatten())
        result = self.cost() 
        self.updateparams(paramsold.copy())
        return result

    def g(self,params):
        """Wrapper function around gradient to check grads, etc."""
        paramsold = self.params.copy()
        self.updateparams(params.copy().flatten())
        result = self.grad()
        self.updateparams(paramsold.copy())
        return result

    def updateparams(self,newparams):
        self.params *= 0.0
        self.params += newparams.copy()

    def forwardxMean(self, Ztest):
        Ztest = normalizeshape(Ztest)
        kz = -distmatrix(Ztest, self.Zx)
        lse = logsumexp(kz, 1)[:,newaxis]
        return sum(exp(kz-lse)[:,:,newaxis]* 
                    self.X[:,:,newaxis].transpose(2, 1, 0), 1).T

    def forwardyMean(self, Ztest):
        Ztest = normalizeshape(Ztest)
        kz = -distmatrix(Ztest, self.Zy)
        lse = logsumexp(kz, 1)[:,newaxis]
        return sum(exp(kz-lse)[:,:,newaxis]* 
                    self.Y[:,:,newaxis].transpose(2, 1, 0), 1).T

    def backwardxMean(self, Xtest):
        Xtest = normalizeshape(Xtest)
        kx = -(1.0/exp(self.loghx))*distmatrix(Xtest, self.X)
        lse = logsumexp(kx, 1)[:,newaxis]
        return sum(exp(kx-lse)[:,:,newaxis]* 
                    self.Zx[:,:,newaxis].transpose(2, 1, 0), 1).T

    def backwardyMean(self, Ytest):
        Ytest = normalizeshape(Ytest)
        ky = -(1.0/exp(self.loghy))*distmatrix(Ytest, self.Y)
        lse = logsumexp(ky, 1)[:,newaxis]
        return sum(exp(ky-lse)[:,:,newaxis]* 
                    self.Zy[:,:,newaxis].transpose(2, 1, 0), 1).T

    forwardx = forwardxMean

    forwardy = forwardyMean

    backwardx = backwardxMean

    backwardy = backwardyMean

    def forwardyMode(self, Ztest, Yinit=None):
        """ Returns the conditional mode of y given z. """
        from scipy.optimize import fmin_cg
        if Yinit is None:
            Yinit = self.Y[:,argmin(distmatrix(Ztest, self.Zy),1)]
        return fmin_cg(self._forwardMode_cost, Yinit.flatten(), 
                                   self._forwardMode_grad, 
                                   args=(Ztest.flatten(),), 
                                   maxiter=100, 
                                   disp=0).reshape(self.d,-1)

    def _forwardyMode_cost(self, Ytest, Ztest):
        Ytest = Ytest.reshape(self.d,-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = -distmatrix(Ztest, self.Zy)
        kY = -(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y)
        return sum( -logsumexp(kZ+kY,1) + logsumexp(kZ,1) )/float(numtest)

    def _forwardyMode_grad(self, Ytest, Ztest):
        Ytest = Ytest.reshape(self.d,-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = exp(-distmatrix(Ztest, self.Zy))
        kY = exp(-(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y))
        return -((2.0/exp(self.loghy))\
                 *sum((kZ*kY)[:,:,newaxis]*
                      (self.Y[:,:,newaxis].transpose(2,1,0)-
                       Ytest[:,:,newaxis].transpose(1,2,0)),1).T
                                         /sum(kZ*kY,1)[newaxis,:]\
                                                   / float(numtest)).flatten()

    def backwardyMode(self, Ytest, Zinit=None):
        """ Returns the conditional mode of z given y. """
        from scipy.optimize import fmin_cg
        if Zinit is None:
            Zinit = self.Zy[:,argmin(distmatrix(Ytest, self.Y),1)]
        return fmin_cg(self._backwardyMode_cost, Zinit.flatten(), 
                                   self._backwardyMode_grad, 
                                   args=(Ytest.flatten(),), 
                                   maxiter=100, 
                                   disp=0).reshape(self.q,-1)

    def _backwardyMode_cost(self, Ztest, Ytest):
        Ztest = Ztest.reshape(self.q,-1)
        Ytest = Ytest.reshape(self.dimsY,-1)
        numtest = Ytest.shape[1]
        kY = -(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y)
        kZ = -distmatrix(Ztest, self.Zy)
        return sum( -logsumexp(kZ+kY,1) + logsumexp(kY,1) )/float(numtest)

    def _backwardyMode_grad(self, Ztest, Ytest):
        Ztest = Ztest.reshape(self.q,-1)
        Ytest = Ytest.reshape(self.dimsY,-1)
        numtest = Ytest.shape[1]
        kY = exp(-(1.0/exp(self.loghy)) * distmatrix(Ytest, self.Y))
        kZ = exp(-distmatrix(Ztest, self.Zy))
        return -(2.0*sum((kZ*kY)[:,:,newaxis]*
                      (self.Zy[:,:,newaxis].transpose(2,1,0)-
                       Ztest[:,:,newaxis].transpose(1,2,0)),1).T
                                         /sum(kZ*kY,1)[newaxis,:]\
                                                   / float(numtest)).flatten()

    def forwardxMode(self, Ztest, Xinit=None):
        """ Returns the conditional mode of x given z. """
        from scipy.optimize import fmin_cg
        if Xinit is None:
            Xinit = self.X[:,argmin(distmatrix(Ztest, self.Zx),1)]
        return fmin_cg(self._forwardMode_cost, Xinit.flatten(), 
                                   self._forwardMode_grad, 
                                   args=(Ztest.flatten(),), 
                                   maxiter=100, 
                                   disp=0).reshape(self.d,-1)

    def _forwardxMode_cost(self, Xtest, Ztest):
        Xtest = Xtest.reshape(self.d,-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = -distmatrix(Ztest, self.Zx)
        kX = -(1.0/exp(self.loghx)) * distmatrix(Xtest, self.X)
        return sum( -logsumexp(kZ+kX,1) + logsumexp(kZ,1) )/float(numtest)

    def _forwardxMode_grad(self, Xtest, Ztest):
        Xtest = Xtest.reshape(self.d,-1)
        Ztest = Ztest.reshape(self.q,-1)
        numtest = Ztest.shape[1]
        kZ = exp(-distmatrix(Ztest, self.Zx))
        kX = exp(-(1.0/exp(self.loghx)) * distmatrix(Xtest, self.X))
        return -((2.0/exp(self.loghx))\
                 *sum((kZ*kX)[:,:,newaxis]*
                      (self.X[:,:,newaxis].transpose(2,1,0)-
                       Xtest[:,:,newaxis].transpose(1,2,0)),1).T
                                         /sum(kZ*kX,1)[newaxis,:]\
                                                   / float(numtest)).flatten()

    def backwardxMode(self, Xtest, Zinit=None):
        """ Returns the conditional mode of z given x. """
        from scipy.optimize import fmin_cg
        if Zinit is None:
            Zinit = self.Zx[:,argmin(distmatrix(Xtest, self.X),1)]
        return fmin_cg(self._backwardxMode_cost, Zinit.flatten(), 
                                   self._backwardxMode_grad, 
                                   args=(Xtest.flatten(),), 
                                   maxiter=100, 
                                   disp=0).reshape(self.q,-1)

    def _backwardxMode_cost(self, Ztest, Xtest):
        Ztest = Ztest.reshape(self.q,-1)
        Xtest = Xtest.reshape(self.dimsX,-1)
        numtest = Xtest.shape[1]
        kX = -(1.0/exp(self.loghx)) * distmatrix(Xtest, self.X)
        kZ = -distmatrix(Ztest, self.Zx)
        return sum( -logsumexp(kZ+kX,1) + logsumexp(kX,1) )/float(numtest)

    def _backwardxMode_grad(self, Ztest, Xtest):
        Ztest = Ztest.reshape(self.q,-1)
        Xtest = Xtest.reshape(self.dimsX,-1)
        numtest = Xtest.shape[1]
        kX = exp(-(1.0/exp(self.loghx)) * distmatrix(Xtest, self.X))
        kZ = exp(-distmatrix(Ztest, self.Zx))
        return -(2.0*sum((kZ*kX)[:,:,newaxis]*
                      (self.Zx[:,:,newaxis].transpose(2,1,0)-
                       Ztest[:,:,newaxis].transpose(1,2,0)),1).T
                                         /sum(kZ*kX,1)[newaxis,:]\
                                                   / float(numtest)).flatten()

    def projectx(self, x):
        return self.forwardx(self.backwardxMode(x))

    def projecty(self, y):
        return self.forwardy(self.backwardyMode(y))

    def xtoy(self, x):
        return self.forwardy(self.backwardxMode(x))

    def ytox(self, y):
        return self.forwardx(self.backwardyMode(y))

    def samplezgivenx(self, Xtest, numsamples=1):
        Xtest = normalizeshape(Xtest)
        kx = -(1.0/exp(self.loghx))*distmatrix(Xtest, self.X)
        kx =  exp(kx - logsumexp(kx, 1)[:,newaxis])
        result = zeros((self.q, Xtest.shape[1], numsamples), dtype=float)
        for i in range(Xtest.shape[1]):
            indexes = drawDiscrete(kx[i ,:], numsamples)
            result[:,i,:] = (randn(self.q, numsamples) + self.Zx[:,indexes])
        return result 

    def samplexgivenz(self, Ztest, numsamples=1):
        Ztest = normalizeshape(Ztest)
        kz = -distmatrix(Ztest, self.Zx)
        kz =  exp(kz - logsumexp(kz, 1)[:,newaxis])
        result = zeros((self.dimsX, Ztest.shape[1], numsamples), dtype=float)
        stddev = sqrt(exp(self.loghx))
        for i in range(Ztest.shape[1]):
            indexes = drawDiscrete(kz[i ,:], numsamples)
            result[:,i,:] = (stddev*randn(self.dimsX, numsamples) 
                                                           + self.X[:,indexes])
        return result 

    def samplezgiveny(self, Ytest, numsamples=1):
        Ytest = normalizeshape(Ytest)
        ky = -(1.0/exp(self.loghy))*distmatrix(Ytest, self.Y)
        ky =  exp(ky - logsumexp(ky, 1)[:,newaxis])
        result = zeros((self.q, Ytest.shape[1], numsamples), dtype=float)
        for i in range(Ytest.shape[1]):
            indexes = drawDiscrete(ky[i ,:], numsamples)
            result[:,i,:] = (randn(self.q, numsamples) + self.Zy[:,indexes])
        return result 

    def sampleygivenz(self, Ztest, numsamples=1):
        Ztest = normalizeshape(Ztest)
        kz = -distmatrix(Ztest, self.Zy)
        kz =  exp(kz - logsumexp(kz, 1)[:,newaxis])
        result = zeros((self.dimsY, Ztest.shape[1], numsamples), dtype=float)
        stddev = sqrt(exp(self.loghy))
        for i in range(Ztest.shape[1]):
            indexes = drawDiscrete(kz[i ,:], numsamples)
            result[:,i,:] = (stddev*randn(self.dimsY, numsamples) 
                                                           + self.Y[:,indexes])
        return result 

    def latdensity(self,z):
        return sum(exp(-sum((z[:,newaxis]-self.Z)**2,0))) / self.Znormalize

    def obsdensity(self,y):
        return sum(exp(-sum((y[:,newaxis]-self.Y)**2,0) / exp(self.loghy))) \
               / self.Ynormalize

    def ydensityUnnormalized(self, Ytest):
        Ytest = normalizeshape(Ytest)
        return (-(1.0/exp(self.loghy))*distmatrix(Ytest, self.Y)).sum()

    def train(self, numsteps, verbose=True, stepsize=[0.001]):
        for s in range(numsteps):
            if stepsize[0] < 10**-10: 
                print "stepsize < 10**-10: exiting" 
                return
            if s == 0:
                oldcost = self.cost()
                if verbose:
                    print "initial cost: %f " % oldcost
            g = self.grad()
            self.params -= stepsize[0] * g
            newcost = self.cost()
            if newcost <= oldcost:
                if verbose:
                    print "cost: %f " % newcost
                    print "increasing step-size to %f" % stepsize[0]
                oldcost = newcost
                stepsize[0] = stepsize[0] * 1.1
            else:
                if verbose:
                    print "cost: %f larger than best cost %f" % \
                                                            (newcost, oldcost)
                    print "decreasing step-size to %f" % stepsize[0]
                self.params += stepsize[0] * g
                newcost = oldcost
                stepsize[0] *= 0.5


if __name__=='__main__':
    from pylab import *
    import numpy.random
    import time 

    numpy.random.seed(1)

    numcases = 2000; d=2; q=1; hy=0.5; reg=1.0; loo=False; learnbandwidth = False

    def scurve(N, Neval, sigma=0.0):
        t = pi*(1.5*numpy.random.rand(1,N/2)-1)
        Y = zeros((2,N))
        Y[0, :][newaxis, :] = concatenate((cos(t),-cos(t)),1)
        Y[1, :][newaxis, :] = concatenate((sin(t),2-sin(t)),1)
        Y = Y + numpy.random.randn(2, N) * sigma
        t = concatenate((t,t),1)
        teval = pi*(1.5*numpy.random.rand(1,Neval/2)-1)
        Yeval = zeros((2,Neval))
        Yeval[0,:][newaxis,:] = concatenate((cos(teval),-cos(teval)),1)
        Yeval[1,:][newaxis,:] = concatenate((sin(teval),2-sin(teval)),1)
        Yeval = Yeval + numpy.random.randn(2, Neval) * sigma
        teval = concatenate((teval,teval),1)
        return Y, Yeval, t, teval

    Y, Yeval, t, eval = scurve(numcases, numcases, 0.15)
    Y2, Yeval2, t2, eval2 = scurve(numcases, numcases, 0.15)

    #hy, dens = optbandwidth(Y, Y.var()/1000, Y.var(), 1000)
    hy = 0.05

    model = Kie(1, hy, Y, reg=reg, loo=loo, learnbandwidth=learnbandwidth, anisotropic=False)
    #model = SharedKie(1, [hy,hy], [Y,Y], weights=[0.5,0.5], regs=[reg], loo=loo, anisotropic=False)

    Yeval_ = zeros((2,numcases),dtype=float)
    Y_ = zeros((2,numcases),dtype=float)
    regs = []
    errTrain = []
    errEval = []
    Zs = []
    t0 = time.time()
    
    print 'training'
    for i in range(15):
        print i
        #trainer.step()
        model.train_cg(50)
        #model.train_cg(100)
        Zs.append(model.Z_gpu.asarray())
        regs.append(model.reg)
        subplot(2,1,2)
        scatter(model.Z_gpu.asarray()[0], ones(numcases,dtype=float), 30, t[0])
        errTrain.append(sum((Y_-Y)**2)/numcases)
        errEval.append(sum((Yeval_-Yeval)**2/numcases))
        subplot(2,1,1)
        hold(False)
        scatter(Y[0], Y[1], 20)
        print "Iteration %d, reg: %f" % (i, regs[-1])
        model.reg *= 0.7

    print 'done training'
    print 'time used:', time.time()-t0
    print 'computing projection onto manifold'
    for j in range(numcases):
        Y_[:,j] = model.project(Y[:,j]).flatten()
        #Yeval_[:,j] = model.project(Yeval[:,j]).flatten()
    hold(False)
    scatter(Y[0], Y[1], 20)
    hold(True)
    scatter(Y_[0], Y_[1], 20, 'k')


