import constellation
from constellation import util
import torch
from matplotlib import pyplot
from mpl_toolkits.axisartist.axislines import SubplotZero
import warnings

torch.manual_seed(57)

# Number of symbols to learn
order = 16

# Shape of the hidden layers
hidden_layers = (8, 4,)

# Initial value for the learning rate
initial_learning_rate = 0.1

# Number of batches to skip between every loss report
loss_report_batch_skip = 50

# Size of batches
batch_size = 2048

# File in which the trained model is saved
output_file = 'output/constellation-order-{}.pth'.format(order)

###

# Set up a plot for showing training progress
fig = pyplot.figure()
ax = SubplotZero(fig, 111)
fig.add_subplot(ax)
pyplot.show(block=False)

# Create the model that will be trained on random data
model = constellation.ConstellationNet(
    order=order,
    encoder_layers=hidden_layers,
    decoder_layers=hidden_layers[::-1],
)

print('Starting training\n')

# Current batch index
batch = 0

# Accumulated loss for the last batches
running_loss = 0

# List of training examples (not shuffled)
classes_ordered = torch.arange(order).repeat(batch_size)

# Constellation from the previous training batch
prev_constel = model.get_constellation()
total_change = float('inf')

# Optimizer settings
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=initial_learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    verbose=True,
    factor=0.25,
    patience=100,
    cooldown=50,
    threshold=1e-8,
)

# Train until the constellation stops moving between batches
while total_change >= 1e-3:
    # Shuffle the training data and convert it to one-hot encoding
    classes_dataset = classes_ordered[torch.randperm(len(classes_ordered))]
    onehot_dataset = util.messages_to_onehot(classes_dataset, order)

    # Perform a training step on the current batch
    model.train()
    optimizer.zero_grad()
    predictions = model(onehot_dataset)
    loss = criterion(predictions, classes_dataset)
    loss.backward()
    optimizer.step()

    # Update the learning rate scheduler
    scheduler.step(loss)

    # Check for convergence: total distance moved by the constellation
    # points since the previous batch
    model.eval()
    cur_constel = model.get_constellation()
    total_change = (cur_constel - prev_constel).norm(dim=1).sum().item()
    prev_constel = cur_constel

    # Report the accumulated loss every few batches
    running_loss += loss.item()

    if batch % loss_report_batch_skip == loss_report_batch_skip - 1:
        print('Batch #{}'.format(batch + 1))
        print('\tLoss is {}'.format(running_loss / loss_report_batch_skip))
        print('\tChange is {}\n'.format(total_change))

        running_loss = 0

    # Update the figure with the current encoding
    ax.clear()
    util.plot_constellation(
        ax, cur_constel,
        model.channel, model.decoder,
        noise_samples=0,
    )
    fig.canvas.draw()
    fig.canvas.flush_events()

    batch += 1

model.eval()

# Compute the final loss on a fresh shuffled dataset
with torch.no_grad():
    classes_ordered = torch.arange(order).repeat(2048)
    classes_dataset = classes_ordered[torch.randperm(len(classes_ordered))]
    onehot_dataset = util.messages_to_onehot(classes_dataset, order)
    predictions = model(onehot_dataset)
    final_loss = criterion(predictions, classes_dataset)

print('\nFinished training')
print('Final loss is {}'.format(final_loss.item()))
print('Saving model as {}'.format(output_file))

with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    torch.save(model, output_file)