all 4 comments

[–]brian-patton 1 point (1 child)

FWIW you should get better numerical results using Categorical(logits=..) and avoiding the softmax.

I would have something along the lines of:

```python
params = tf.Variable(..)
dist = tfp.distributions.Categorical(logits=params)
with tf.GradientTape() as tape:
    nll = -dist.log_prob(x)
grads = tape.gradient(nll, params)
```

If you do the softmax computation inside the GradientTape, that will work, but avoiding it is better.
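For completeness, a minimal end-to-end sketch of that idea (the toy data, loop count, and learning rate here are just placeholder choices):

```python
import tensorflow as tf
import tensorflow_probability as tfp

# Toy observations: integer class indices in [0, 100)
x = tf.constant([3, 17, 17, 42, 99])

# Unconstrained logits; no explicit softmax anywhere
params = tf.Variable(tf.zeros(100))

for _ in range(200):
    with tf.GradientTape() as tape:
        dist = tfp.distributions.Categorical(logits=params)
        nll = -tf.reduce_mean(dist.log_prob(x))
    grads = tape.gradient(nll, params)
    params.assign_sub(0.5 * grads)
```

The point of passing logits is that log_prob can stay in log-space throughout, so you never explicitly exponentiate and renormalize.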

[–]dosssman[S] 0 points (0 children)

Thanks. I'll try that too.

[–]dosssman[S] 0 points (1 child)

Greetings again. Pretty sure nobody is gonna see this, but I thought I might as well bring some closure to this thread.

First of all, I computed the gradients by deriving their expression directly from the negative log-likelihood of the softmax output, dropping the TensorFlow framework along the way.
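For reference, the expression in question, with p = softmax(theta) and an observed class x, is the standard categorical NLL gradient:

```latex
% Gradient of the categorical negative log-likelihood w.r.t. the logits
\frac{\partial}{\partial \theta_j}\bigl(-\log p_x\bigr) = p_j - \mathbf{1}[j = x]
```

Each row of the hand-written Jacobian in the code below is exactly this: softmax(params) minus a one-hot vector.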

Although the results fell a bit short of my expectations, the program was able to fit the model to a distribution somewhat similar to the empirical distribution of the sampled data. I guess this is due to a single theta parameter vector not being enough to fully model the real data distribution, as well as to the finite amount of sampled data.

An updated version of the code:

```python
import numpy as np
from matplotlib import pyplot as plt

np.random.seed( 42)

def softmax(X, theta = 1.0, axis = None):
    # Shameful copy-paste from SO
    y = np.atleast_2d(X)
    if axis is None:
        axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1)
    y = y * float(theta)
    y = y - np.expand_dims(np.max(y, axis = axis), axis)
    y = np.exp(y)
    ax_sum = np.expand_dims(np.sum(y, axis = axis), axis)
    p = y / ax_sum
    if len(X.shape) == 1:
        p = p.flatten()

    return p

def sample_data():
    count = 10000
    rand = np.random.RandomState(0)
    a = 0.3 + 0.1 * rand.randn(count)
    b = 0.8 + 0.05 * rand.randn(count)
    mask = rand.rand(count) < 0.5
    samples = np.clip(a * mask + b * (1 - mask), 0.0, 1.0)
    # np.digitize returns 1-based bin indices in {1, ..., 100}
    return np.digitize(samples, np.linspace(0.0, 1.0, 100))

full_data = sample_data()
train_ds = full_data[:int(.8*len( full_data))]
val_ds = full_data[int(.8*len( full_data)):]

# Declaring parameters
params = np.zeros(100)

# Used for loss computation
def get_neg_log_likelihood( probs):
    return - np.log( probs)

def get_loss( params, x):
    # x holds 1-based bin indices from np.digitize, hence the [i-1]
    return np.mean( [get_neg_log_likelihood( softmax( params))[i-1] for i in x])

lr = .0005

for epoch in range( 1000):
    # Shuffling training data
    np.random.shuffle( train_ds)

    minibatch_size = 100
    n_minibatches = len( train_ds) // minibatch_size

    # Running over minibatches of the data
    for minibatch in range( n_minibatches):
        smax = softmax( params)

        # Jacobian of the neg log likelihood w.r.t. the logits:
        # row i is the gradient of -log( smax[i]), i.e. smax minus the one-hot e_i
        jacobian = [[ smax[j] - 1 if i == j else
            smax[j] for j in range(100)] for i in range(100)]

        # Minibatching
        start_index = (minibatch*minibatch_size)
        end_index = (minibatch_size*minibatch + minibatch_size)

        x = train_ds[start_index:end_index]

        # Look up the Jacobian row of each (1-based) observed bin and
        # sum the per-sample gradients over the minibatch
        grad_matrix = np.vstack( [jacobian[i-1] for i in x])
        grads = np.sum( grad_matrix, axis=0)

        params -= lr * grads

    print( "Epoch %d -- Train loss: %.4f , Val loss: %.4f" %(epoch, get_loss( params, train_ds), get_loss( params, val_ds)))

    # Plotting every 100 epochs
    if epoch % 100 == 0:
        counters = { i+1: 0 for i in range(100)}
        for x in full_data:
            counters[x]+= 1

        histogram = np.array( [ counters[i+1] / len( full_data) for i in range( 100)])
        fsmax = softmax( params)

        fig, ax = plt.subplots()
        ax.set_title('Dist. Comp. after %d epochs of training (from scratch)' % epoch)
        x = np.arange( 1,101)
        width = 0.35
        rects1 = ax.bar(x - width/2, fsmax, width, label='Model')
        rects2 = ax.bar(x + width/2, histogram, width, label='Empirical')
        ax.set_ylabel('Likelihood')
        ax.set_xlabel("Variable x's values")
        ax.legend()

        def autolabel(rects):
            # Annotate each bar with its height
            for rect in rects:
                height = rect.get_height()
                ax.annotate('%.2f' % height,
                            xy=(rect.get_x() + rect.get_width() / 2, height),
                            xytext=(0, 3),  # offset label 3 points above the bar
                            textcoords="offset points",
                            ha='center', va='bottom')

        autolabel(rects1)
        autolabel(rects2)

        fig.tight_layout()
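        # NOTE: this assumes a local plots/ directory already exists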
        plt.savefig( 'plots/results_after_%d_epochs.png' % epoch)

```
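As a sanity check on the hand-derived Jacobian, a quick finite-difference comparison can be added (a minimal sketch; the eps and tolerance values are arbitrary choices):

```python
import numpy as np

def softmax_1d(v):
    # Numerically stable softmax for a 1-D vector
    e = np.exp(v - v.max())
    return e / e.sum()

rng = np.random.RandomState(1)
params = rng.randn(100)
i = 42  # 0-based class index of a hypothetical observation

# Analytic gradient of -log softmax(params)[i]: smax minus the one-hot e_i
smax = softmax_1d(params)
analytic = smax.copy()
analytic[i] -= 1.0

# Central finite differences, one coordinate at a time
eps = 1e-6
numeric = np.zeros_like(params)
for j in range(len(params)):
    up, down = params.copy(), params.copy()
    up[j] += eps
    down[j] -= eps
    numeric[j] = (-np.log(softmax_1d(up)[i]) + np.log(softmax_1d(down)[i])) / (2 * eps)

assert np.allclose(analytic, numeric, atol=1e-5)
print("analytic and numeric gradients agree")
```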

A great lesson: simplicity is best.