perceiver_music_transformer.py

# -*- coding: utf-8 -*-
"""Perceiver_Music_Transformer.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/13GdzZLAH0OG5GVnrosxT1Y6ftbhAm-12

# Perceiver Music Transformer (ver. 1.0)

***

Powered by tegridy-tools: https://github.com/asigalov61/tegridy-tools

***

WARNING: This complete implementation is a functioning model of the Artificial Intelligence. Please exercise great humility, care, and respect. https://www.nscai.gov/

***

#### Project Los Angeles

#### Tegridy Code 2023

***

# (GPU CHECK)
"""

#@title NVIDIA GPU Check
# Notebook shell escape: prints the output of `nvidia-smi` for this runtime.
# The model is moved to CUDA in the "Load/Reload the model" cell, so a GPU
# must be present before continuing.
!nvidia-smi

"""# (SETUP ENVIRONMENT)"""

#@title Install all dependencies (run only once per session)

# Clone the project repository; later cells read the model archive and the
# seed MIDI from /content/Perceiver-Music-Transformer.
!git clone https://github.com/asigalov61/Perceiver-Music-Transformer
# Python packages imported by the "Import all needed modules" cell
# (torch-summary provides the `torchsummary` module).
!pip install einops
!pip install torch
!pip install torch-summary

!pip install tqdm
!pip install matplotlib

# FluidSynth itself is installed through apt; midi2audio is the Python
# wrapper used to render MIDI files to WAV.
!apt install fluidsynth #Pip does not work for some reason. Only apt works
!pip install midi2audio

#@title Import all needed modules

print('Loading needed modules. Please wait...')
import os
import random
import copy
import math
from collections import OrderedDict

from tqdm.notebook import tqdm

import matplotlib.pyplot as plt

import torch
from torchsummary import summary

print('Loading core modules...')
# Switch into the cloned repository before importing TMIDIX and
# perceiver_ar_pytorch_full -- presumably both modules live in that checkout
# rather than being pip-installed; confirm against the repository layout.
os.chdir('/content/Perceiver-Music-Transformer')

import TMIDIX

from perceiver_ar_pytorch_full import PerceiverAR, AutoregressiveWrapper

from midi2audio import FluidSynth
from IPython.display import Audio, display

# Go back to /content/ for the rest of the session.
os.chdir('/content/')
print('Done!')

"""# (UNZIP MODEL)"""

# Commented out IPython magic to ensure Python compatibility.
#@title Unzip Pre-Trained Perceiver Music Transformer Model
print('=' * 70)
# %cd /content/Perceiver-Music-Transformer/Model

print('=' * 70)
print('Unzipping pre-trained Perceiver Music Transformer model...Please wait...')

!cat /content/Perceiver-Music-Transformer/Model/Perceiver_Music_Transformer_Trained_Model.zip* > /content/Perceiver-Music-Transformer/Model/Perceiver_Music_Transformer_Trained_Model.zip
print('=' * 70)

!unzip -j /content/Perceiver-Music-Transformer/Model/Perceiver_Music_Transformer_Trained_Model.zip
print('=' * 70)

print('Done! Enjoy! :)')
print('=' * 70)
# %cd /content/
print('=' * 70)

"""# (LOAD MODEL)"""

#@title Load/Reload the model

full_path_to_model_checkpoint = "/content/Perceiver-Music-Transformer/Model/Perceiver_Music_Transformer_Trained_Model_16026_steps_0.8256_loss.pth" #@param {type:"string"}

print('Loading the model...')

# Sequence geometry passed to the model constructor below.
SEQ_LEN = 6144         # max_seq_len (6k tokens)
PREFIX_SEQ_LEN = 4096  # cross_attn_seq_len (4k tokens)

# Build the network and wrap it in one expression:
# PerceiverAR -> AutoregressiveWrapper -> DataParallel.
# num_tokens matches the 2145-entry token dictionary produced by the
# seed-MIDI encoding cell.
model = torch.nn.DataParallel(
    AutoregressiveWrapper(
        PerceiverAR(
            num_tokens=2145,
            dim=1024,
            depth=32,
            ff_mult=2,
            cross_attn_dropout=0.25,
            max_seq_len=SEQ_LEN,
            cross_attn_seq_len=PREFIX_SEQ_LEN,
        )
    )
)

model.cuda()

# Restore the pre-trained weights from the checkpoint file.
state_dict = torch.load(full_path_to_model_checkpoint)
model.load_state_dict(state_dict)

# Switch to evaluation mode for generation.
model.eval()

print('Done!')

# Model stats
summary(model)

"""# (GENERATE)"""

#@title Load Seed/Custom MIDI

#@markdown PLEASE NOTE: Custom MIDI must have more than 1024 notes to fill-in Perceiver prefix requirements.

full_path_to_custom_MIDI_file = "/content/Perceiver-Music-Transformer/Seeds/Perceiver-Music-Transformer-Sample-Piano-Seed-MIDI.mid" #@param {type:"string"}

print('Loading custom MIDI file...')

# Read the file through a context manager so the handle is closed right away.
with open(full_path_to_custom_MIDI_file, 'rb') as midi_file:
    score = TMIDIX.midi2ms_score(midi_file.read())

events_matrix = []

#==================================================

# Last patch seen on each of the 16 channels; updated by patch_change events.
patches = [0] * 16

# Row index = channel number used by the model; row contents = the MIDI
# patches that are folded into that channel.
patch_map = [
            [0, 1, 2, 3, 4, 5, 6, 7], # Piano
            [24, 25, 26, 27, 28, 29, 30], # Guitar
            [32, 33, 34, 35, 36, 37, 38, 39], # Bass
            [40, 41], # Violin
            [42, 43], # Cello
            [46], # Harp
            [56, 57, 58, 59, 60], # Trumpet
            [64, 65, 66, 67, 68, 69, 70, 71], # Sax
            [72, 73, 74, 75, 76, 77, 78], # Flute
            [-1], # Drums
            [52, 53], # Choir
            [16, 17, 18, 19, 20] # Organ
            ]

# Collect note and patch_change events from every entry of the score after
# index 0 (index 0 is skipped, as in the original loop that started at 1).
for track in score[1:]:
    for event in track:
        if event[0] in ('note', 'patch_change'):
            events_matrix.append(event)

# Order all events by their time field (stable sort keeps per-track order).
events_matrix.sort(key=lambda x: x[1])

events_matrix1 = []

for event in events_matrix:
    if event[0] == 'patch_change':
        patches[event[2]] = event[3]

    if event[0] == 'note':
        # Append the patch currently active on the note's channel as event[6].
        event.extend([patches[event[3]]])
        once = False

        for group_index, group in enumerate(patch_map):
            if event[6] in group and event[3] != 9: # Except the drums
                event[3] = group_index
                once = True

        if not once and event[3] != 9: # Except the drums
            event[3] = 15 # All other instruments/patches channel
            event[5] = max(80, event[5])

        if event[3] < 12: # We won't write chans 12-16 for now...
            events_matrix1.append(event)

#=======================================================
# PRE-PROCESSING

# Fail with a clear message instead of an undefined-name error further down
# (or, worse, silently reusing `mel_cho` from a previous run of this cell).
if not events_matrix1:
    raise ValueError('No usable notes were found in the custom MIDI file: {}'.format(full_path_to_custom_MIDI_file))

# recalculating timings
for e in events_matrix1:
    e[1] = int(e[1] / 8) # Max 2 seconds for start-times
    e[2] = int(e[2] / 16) # Max 4 seconds for durations

# Sorting by pitch, then by start-time
events_matrix1.sort(key=lambda x: x[4], reverse=True)
events_matrix1.sort(key=lambda x: x[1])

#=======================================================
# FINAL PRE-PROCESSING

melody_chords = []

pe = events_matrix1[0]

for e in events_matrix1:
    if e[1] >= 0 and e[2] >= 0:

        # Clipping all values...
        time = max(0, min(255, e[1]-pe[1])) # delta from the previous kept note
        dur = max(1, min(255, e[2]))
        cha = max(0, min(11, e[3]))
        ptc = max(1, min(127, e[4]))
        vel = max(8, min(127, e[5]))

        # Quantize velocity 8..127 into 1..8
        velocity = round(vel / 15)

        # Writing final note
        melody_chords.append([time, dur, cha, ptc, velocity])

        pe = e

# The continuation cell slices a 4096-token prefix (4 tokens per note) plus
# at least one prime note, so more than 1024 notes are required.
if len(melody_chords) <= 1024:
    raise ValueError('Custom MIDI must have more than 1024 notes; only {} usable notes were found.'.format(len(melody_chords)))

#=======================================================
# MAIN PROCESSING CYCLE
#=======================================================

mel_cho = []

for m in melody_chords:

    # WRITING EACH NOTE HERE
    time = m[0]
    dur = m[1]
    cha_vel = (m[2] * 8) + (m[4]-1)
    cha_ptc = (m[2] * 128) + m[3]

    # Token ranges: time 0..255, duration 256..511,
    # channel/velocity 512..607, channel/pitch 608..2143
    mel_cho.extend([time, dur+256, cha_vel+512, cha_ptc+608])

# TOTAL DICTIONARY SIZE 2144+1 = 2145

#=======================================================
# FINAL PROCESSING
#=======================================================

# Decode the token stream back into notes and write it out as a MIDI file so
# the encoded seed can be inspected and played back.
out1 = mel_cho

song = out1
song_f = []
time = 0
dur = 0
vel = 0
pitch = 0
channel = 0

for ss in song:

    if 0 < ss < 256:
        # Delta-time token: advance the absolute start time
        time += ss * 8

    elif 256 <= ss < 512:
        dur = (ss-256) * 16

    elif 512 <= ss < 608:
        channel = (ss-512) // 8
        vel = (((ss-512) % 8)+1) * 15

    elif 608 <= ss < 608+(12*128):
        # The pitch token closes a note: emit it with the current state
        pitch = (ss-608) % 128

        song_f.append(['note', time, dur, channel, pitch, vel ])

detailed_stats = TMIDIX.Tegridy_SONG_to_MIDI_Converter(song_f,
                                                    output_signature = 'Perceiver Music Transformer',
                                                    output_file_name = '/content/Perceiver-Music-Transformer-Seed-MIDI',
                                                    track_name='Project Los Angeles',
                                                    list_of_MIDI_patches=[0, 24, 32, 40, 42, 46, 56, 71, 73, 0, 53, 19, 0, 0, 0, 0],
                                                    number_of_ticks_per_quarter=500)

print('Done!')

print('Displaying resulting composition...')
fname = '/content/Perceiver-Music-Transformer-Seed-MIDI'

# One scatter colour per channel 0..11
colors = ['red', 'yellow', 'green', 'cyan', 'blue', 'pink', 'orange', 'purple', 'gray', 'white', 'gold', 'silver']

# Scatter-plot data built from the decoded notes in song_f:
# x = start time / 1000, y = pitch, c = colour of the note's channel.
x = [note[1] / 1000 for note in song_f]
y = [note[4] for note in song_f]
c = [colors[note[3]] for note in song_f]

# Render the MIDI file written by the previous step to WAV and show a player.
FluidSynth("/usr/share/sounds/sf2/FluidR3_GM.sf2", 16000).midi_to_audio(fname + '.mid', fname + '.wav')
display(Audio(fname + '.wav', rate=16000))

# Plot the notes on a black background
plt.figure(figsize=(14,5))
ax = plt.axes(title=fname)
ax.set_facecolor('black')

plt.scatter(x, y, c=c)
plt.xlabel("Time")
plt.ylabel("Pitch")
plt.show()

"""# Continuation"""

#@title Single Continuation Block Generator

number_of_prime_notes = 64 #@param {type:"slider", min:1, max:64, step:1}
number_of_tokens_to_generate = 512 #@param {type:"slider", min:64, max:2048, step:32}
number_of_batches_to_generate = 4 #@param {type:"slider", min:1, max:16, step:1}
temperature = 0.8 #@param {type:"slider", min:0.1, max:1, step:0.1}
return_prefix_and_prime_tokens = False #@param {type:"boolean"}

#===================================================================
print('=' * 70)
print('Perceiver Music Transformer Model Continuation Generator')
print('=' * 70)

print('Generation settings:')
print('=' * 70)
print('Number of prime notes:', number_of_prime_notes)
print('Number of tokens to generate:', number_of_tokens_to_generate)
print('Number of batches to generate:', number_of_batches_to_generate)
print('Model temperature:', temperature)

print('=' * 70)

# Input length = the model's prefix length plus the requested prime notes.
# Each note is encoded as 4 tokens (time, duration, channel/velocity,
# channel/pitch). PREFIX_SEQ_LEN is the same constant the model was built
# with, so the two cannot drift apart.
num_toks = PREFIX_SEQ_LEN + (number_of_prime_notes * 4)

# Every batch entry starts from the same seed tokens.
inp = [mel_cho[:num_toks]] * number_of_batches_to_generate

inp = torch.LongTensor(inp).cuda()

# Inference only: make sure no autograd graph is built while sampling.
with torch.no_grad():
    out = model.module.generate(inp,
                                number_of_tokens_to_generate,
                                temperature=temperature,
                                return_prime=return_prefix_and_prime_tokens)

out0 = out.tolist()

print('=' * 70)
print('Done!')
print('=' * 70)

#======================================================================

print('Rendering results...')
print('=' * 70)

# Decode, save, play and plot every generated batch in turn.
# NOTE: all batches are written to the same output file name, so each batch
# overwrites the MIDI/WAV files of the previous one.
for i in range(number_of_batches_to_generate):

  print('=' * 70)
  print('Batch #', i)
  print('=' * 70)

  out1 = out0[i]

  print('Sample INTs', out1[:12])
  print('=' * 70)

  if len(out1) > 0:

      song = out1
      song_f = []

      # Running decoder state, updated token by token
      time = 0
      dur = 0
      vel = 0
      pitch = 0
      channel = 0

      for ss in song:

        if 0 < ss < 256:
            # Delta-time token: advance the absolute start time
            time += ss * 8

        elif 256 <= ss < 512:
            # Duration token
            dur = (ss-256) * 16

        elif 512 <= ss < 608:
            # Combined channel/velocity token
            channel = (ss-512) // 8
            vel = (((ss-512) % 8)+1) * 15

        elif 608 <= ss < 608+(12*128):
            # Pitch token closes the note: emit it with the current state
            pitch = (ss-608) % 128

            song_f.append(['note', time, dur, channel, pitch, vel ])

      detailed_stats = TMIDIX.Tegridy_SONG_to_MIDI_Converter(song_f,
                                                          output_signature = 'Perceiver Music Transformer',
                                                          output_file_name = '/content/Perceiver-Music-Transformer-Composition',
                                                          track_name='Project Los Angeles',
                                                          list_of_MIDI_patches=[0, 24, 32, 40, 42, 46, 56, 71, 73, 0, 53, 19, 0, 0, 0, 0],
                                                          number_of_ticks_per_quarter=500)

      print('Done!')

  print('Displaying resulting composition...')
  fname = '/content/Perceiver-Music-Transformer-Composition'

  # One scatter colour per channel 0..11
  colors = ['red', 'yellow', 'green', 'cyan', 'blue', 'pink', 'orange', 'purple', 'gray', 'white', 'gold', 'silver']

  # Scatter-plot data: x = start time / 1000, y = pitch, c = channel colour
  x = [note[1] / 1000 for note in song_f]
  y = [note[4] for note in song_f]
  c = [colors[note[3]] for note in song_f]

  # Render the MIDI file to WAV and show an audio player
  FluidSynth("/usr/share/sounds/sf2/FluidR3_GM.sf2", 16000).midi_to_audio(fname + '.mid', fname + '.wav')
  display(Audio(fname + '.wav', rate=16000))

  plt.figure(figsize=(14,5))
  ax = plt.axes(title=fname)
  ax.set_facecolor('black')

  plt.scatter(x, y, c=c)
  plt.xlabel("Time")
  plt.ylabel("Pitch")
  plt.show()

"""# Congrats! You did it! :)"""