A Walkthrough of the t-SNE Code

# coding: utf-8
'''
This code follows the reference implementation released by the author,
Laurens van der Maaten. It is written as plain functions rather than a class;
the main change is an optimized, vectorized computation.
'''
import numpy as np


def cal_pairwise_dist(x):
    '''Compute pairwise squared Euclidean distances; x is an n x d matrix.
    Uses the identity (a-b)^2 = a^2 + b^2 - 2*a*b
    '''
    # a neat two-line trick: the identity above turns the n*n pairwise
    # distance matrix into matrix products and broadcast additions
    sum_x = np.sum(np.square(x), 1)
    dist = np.add(np.add(-2 * np.dot(x, x.T), sum_x).T, sum_x)
    return dist
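

# A tiny sanity check added for this walkthrough (not part of the original
# script): the vectorized identity should agree with a naive double loop
# over ||x_i - x_j||^2.
def _check_pairwise_dist(x):
    n = x.shape[0]
    naive = np.array([[np.sum(np.square(x[i] - x[j])) for j in range(n)]
                      for i in range(n)])
    return np.allclose(cal_pairwise_dist(x), naive)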


def cal_perplexity(dist, idx=0, beta=1.0):
    '''Compute the (log) perplexity for one point. dist is that point's
    distance vector, idx is the position of the point's distance to itself
    within dist, and beta is the Gaussian precision parameter.
    Only the entropy is computed here, which is all the search needs.
    '''
    # beta = 1/(2*sigma^2), folded into a single parameter for brevity
    prob = np.exp(-dist * beta)
    # set the probability of the point with itself to 0
    prob[idx] = 0
    sum_prob = np.sum(prob)
    # Does it matter that this uses the natural log (np.log)? No: the target
    # perplexity is compared on the same log scale in search_prob below.
    # Only the entropy is computed; the expression is a rearrangement of the
    # entropy formula -sum_j p_{j|i} log p_{j|i} (verified by the check below)
    perp = np.log(sum_prob) + beta * np.sum(dist * prob) / sum_prob
    # normalize to obtain this row of conditional probabilities p_{j|i}
    prob /= sum_prob
    return perp, prob
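

# A numerical sanity check added for this walkthrough (not part of the
# original script): with p_j = exp(-beta*d_j)/sum_prob, the Shannon entropy
# -sum_j p_j*log(p_j) expands to log(sum_prob) + beta*sum_j(d_j*p_j),
# which is exactly the `perp` returned above.
def _check_entropy_form(dist, idx=0, beta=1.0):
    perp, prob = cal_perplexity(dist, idx, beta)
    entropy = -np.sum(prob[prob > 0] * np.log(prob[prob > 0]))
    return np.allclose(perp, entropy)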


def search_prob(x, tol=1e-5, perplexity=30.0):
    '''Binary-search the beta for each point and compute the pairwise
    conditional probabilities.
    '''

    # initialize parameters
    print("Computing pairwise distances...")
    (n, d) = x.shape
    dist = cal_pairwise_dist(x)
    pair_prob = np.zeros((n, n))
    beta = np.ones((n, 1))
    # take the log of the target perplexity so it can be compared directly
    # with the entropy returned by cal_perplexity (also a natural log)
    base_perp = np.log(perplexity)

    for i in range(n):
        if i % 500 == 0:
            print("Computing pair_prob for point %s of %s ..." %(i,n))

        betamin = -np.inf
        betamax = np.inf
        perp, this_prob = cal_perplexity(dist[i], i, beta[i])

        # binary search for the beta (equivalently, the sigma) whose
        # perplexity matches the target
        perp_diff = perp - base_perp
        tries = 0
        while np.abs(perp_diff) > tol and tries < 50:
            if perp_diff > 0:
                betamin = beta[i].copy()
                if betamax == np.inf or betamax == -np.inf:
                    beta[i] = beta[i] * 2
                else:
                    beta[i] = (beta[i] + betamax) / 2
            else:
                betamax = beta[i].copy()
                if betamin == np.inf or betamin == -np.inf:
                    beta[i] = beta[i] / 2
                else:
                    beta[i] = (beta[i] + betamin) / 2

            # recompute perp and prob with the updated beta
            perp, this_prob = cal_perplexity(dist[i], i, beta[i])
            perp_diff = perp - base_perp
            tries = tries + 1
        # store this row of the conditional probabilities p_{j|i}
        pair_prob[i, :] = this_prob
    print("Mean value of sigma: ", np.mean(np.sqrt(1 / beta)))
    return pair_prob
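

# Another check added for this walkthrough (not in the original script): every
# row of pair_prob is a conditional distribution p_{j|i}, and the exponential
# of its Shannon entropy should be close to the requested perplexity (rows
# where the 50-try binary search did not converge may deviate slightly).
def _check_row_perplexity(pair_prob, perplexity=30.0):
    eps = 1e-12
    H = -np.sum(pair_prob * np.log(np.maximum(pair_prob, eps)), axis=1)
    return np.allclose(np.exp(H), perplexity, rtol=1e-3)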


def pca(x, no_dims=50):
    '''PCA
    Pre-reduce the data with PCA before running t-SNE
    '''
    print("Preprocessing the data using PCA...")
    (n, d) = x.shape
    x = x - np.tile(np.mean(x, 0), (n, 1))
    l, M = np.linalg.eig(np.dot(x.T, x))
    # np.linalg.eig does not guarantee any eigenvalue ordering, so explicitly
    # pick the eigenvectors belonging to the no_dims largest eigenvalues
    idx = np.argsort(-np.real(l))
    y = np.dot(x, M[:, idx[0:no_dims]])
    return y
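

# A small check added for this walkthrough (not in the original script): since
# pca projects onto eigenvectors of x^T x, the total squared norm of y must
# equal the sum of the no_dims largest eigenvalues.
def _check_pca_variance(x, no_dims=2):
    xc = x - np.mean(x, 0)
    l, _ = np.linalg.eig(np.dot(xc.T, xc))
    top = np.sort(np.real(l))[::-1][:no_dims]
    y = pca(x, no_dims).real
    return np.allclose(np.sum(np.square(y)), np.sum(top))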


def tsne(x, no_dims=2, initial_dims=50, perplexity=30.0, max_iter=1000):
    """Runs t-SNE on the dataset in the NxD array x
    to reduce its dimensionality to no_dims dimensions.
    The syntax of the function is Y = tsne.tsne(x, no_dims, perplexity),
    where x is an NxD NumPy array.
    """

    # Check inputs
    if isinstance(no_dims, float):
        print("Error: array x should have type float.")
        return -1
    if round(no_dims) != no_dims:
        print("Error: number of dimensions should be an integer.")
        return -1

    # initialize parameters and variables
    x = pca(x, initial_dims).real
    (n, d) = x.shape
    initial_momentum = 0.5
    final_momentum = 0.8
    eta = 500
    min_gain = 0.01
    y = np.random.randn(n, no_dims)
    dy = np.zeros((n, no_dims))
    iy = np.zeros((n, no_dims))
    gains = np.ones((n, no_dims))

    # symmetrize the conditional probabilities
    P = search_prob(x, 1e-5, perplexity)
    # np.transpose(P) gives p_{i|j}; adding it and normalizing yields the
    # joint distribution p_ij = (p_{j|i} + p_{i|j}) / (2n). Why divide by
    # np.sum(P) rather than 2n? Each row of pair_prob sums to 1, so after the
    # addition np.sum(P) equals exactly 2n.
    P = P + np.transpose(P)
    P = P / np.sum(P)
    # early exaggeration: sklearn exposes the same parameter (default 12
    # there); it multiplies the p_ij matrix directly to inflate the p values
    P = P * 4
    P = np.maximum(P, 1e-12)

    # Run iterations
    for iter in range(max_iter):
        # Compute pairwise affinities
        sum_y = np.sum(np.square(y), 1)
        # Student's t-distribution with one degree of freedom: convert the
        # low-dimensional squared Euclidean distances into an unnormalized
        # density between every pair of points
        num = 1 / (1 + np.add(np.add(-2 * np.dot(y, y.T), sum_y).T, sum_y))
        # define the diagonal (each point with itself) as 0
        num[range(n), range(n)] = 0
        # normalize to get q_ij; np.sum(num) is the denominator
        Q = num / np.sum(num)
        Q = np.maximum(Q, 1e-12)

        # Compute gradient
        PQ = P - Q
        for i in range(n):
            # each gradient term contains the vector y_i - y_j; the factor 4
            # from the paper's formula is dropped (absorbed into the learning
            # rate). A vectorized equivalent is sketched after this function.
            dy[i,:] = np.sum(np.tile(PQ[:,i] * num[:,i], (no_dims, 1)).T * (y[i,:] - y), 0)

        # Perform the update
        # the first 20 iterations use a smaller momentum (0.5), then a larger
        # one (0.8); in the paper the momentum is a function alpha(t) of the
        # iteration number
        if iter < 20:
            momentum = initial_momentum
        else:
            momentum = final_momentum
        # What is iy? The previous update (delta), with the same shape as dy.
        # What is gains? A per-element adaptive factor multiplied into the
        # gradient below: wherever the current gradient and the previous
        # update have the same sign, gains *= 0.8; where the signs differ,
        # gains += 0.2.
        gains = (gains + 0.2) * ((dy > 0) != (iy > 0)) + (gains * 0.8) * ((dy > 0) == (iy > 0))
        # floor the gains at min_gain
        gains[gains < min_gain] = min_gain
        # new update: momentum times the previous update, minus the
        # gain-scaled gradient step; scaling the gradient by gains is the
        # per-parameter adaptive learning-rate heuristic (Jacobs, 1988) used
        # in the reference implementation
        iy = momentum * iy - eta * (gains * dy)
        # iy is the update (delta) applied this iteration
        y = y + iy
        # re-center: subtract the column means of y below. With y in, say, two
        # columns, each column has its mean subtracted. Centering leaves every
        # pairwise distance in the low-dimensional space unchanged (the cost
        # depends only on relative positions); it just keeps the embedding
        # from drifting away from the origin.
        y = y - np.tile(np.mean(y, 0), (n, 1))
        # Compute current value of cost function
        # during the first 100 iterations P is exaggerated (multiplied by 4),
        # so divide it back out when reporting; the cost (KL divergence) is
        # printed every 100 iterations
        if (iter + 1) % 100 == 0:
            if iter > 100:
                C = np.sum(P * np.log(P / Q))
            else:
                C = np.sum( P/4 * np.log( P/4 / Q))
            print("Iteration ", (iter + 1), ": error is ", C)
        # Stop lying about P-values
        if iter == 100:
            P = P / 4
    print("finished training!")
    return y
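

# Optional: a vectorized form of the per-point gradient loop inside tsne (an
# addition of this walkthrough, not part of the original code). With the
# symmetric matrix W = (P - Q) * num, the loop computes
#   dy[i] = sum_j W[i, j] * (y[i] - y[j]),
# which in matrix form is (diag(W.sum(axis=1)) - W) @ y.
def tsne_grad(P, Q, num, y):
    W = (P - Q) * num
    return np.dot(np.diag(np.sum(W, axis=1)) - W, y)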


if __name__ == "__main__":
    # Run Y = tsne.tsne(X, no_dims, perplexity) to perform t-SNE on your dataset.
    X = np.loadtxt("mnist2500_X.txt")
    labels = np.loadtxt("mnist2500_labels.txt")
    Y = tsne(X, 2, 50, 20.0)
    from matplotlib import pyplot as plt
    plt.scatter(Y[:,0], Y[:,1], 20, labels)
    plt.show()
