# FrequentItemsets.py

import numpy as np 
from itertools import combinations
import pandas as pd
import math
import matplotlib.pyplot as plt 
import io
import time
import psutil
from scipy.spatial import Delaunay

class FrequentItemsets:
    """
    Class for generating frequent itemsets and association rules.

    Methods:
    - Ck_generator(k, L, datasetT): Generate candidate itemsets of size k.
    - support_calculator(C, datasetT): Calculate support for candidate itemsets.
    - Lk_generator(C, supp_min): Generate frequent itemsets from candidate itemsets.
    - appriori(min_supp, datasetT): Generate frequent itemsets using the Apriori algorithm.
    - regle_association(L): Generate association rules from frequent itemsets.
    - mesure_calculator(r, methode, L): Calculate association rule measures.
    - regles_frequente(L, conf_min, m): Generate frequent association rules.
    - rules_nbr_plot(transactions_table, supp_lower_bound, supp_upper_bound, conf_lower_bound, conf_upper_bound): Plot the number of frequent rules generated by support and confidence thresholds.
    - freq_items_nbr_plot(transactions_table, supp_lower_bound, supp_upper_bound): Plot the number of frequent itemsets generated by support threshold.
    - time_exec_plot(transactions_table, supp_lower_bound, supp_upper_bound): Plot the execution time of Apriori algorithm by support threshold.
    - memory_alloc_plot(transactions_table, supp_lower_bound, supp_upper_bound, conf_lower_bound, conf_upper_bound): Plot the memory allocation of Apriori algorithm and association rules by support and confidence thresholds.
    """

    def Ck_generator(self, k, L, datasetT):
        if k==1:
            uniItemListe=set()
            for i in range(0,len(datasetT)):
                uniItemListe.update(set(datasetT[i]))
            return [(v,) for v in uniItemListe]
        else:
            C=[]
            if len(L)==0: return C
            listeitemsunique=sorted(set([element for tuple in L.keys() for element in tuple]))
            if len(listeitemsunique)<k: return []
            combinations_list = list(combinations(listeitemsunique, k))
            if k==2: 
                return combinations_list
            #le pruning
            for combi in combinations_list:
                exist=True
                sous_combinations_list = list(combinations(combi, k-1))
                
                for sous_combi in sous_combinations_list:
                    if sous_combi not in list(L.keys()):
                        exist=False
                        break
                if exist==True:
                    C.append(combi)   
            return C
        
    def support_calculator(self, C, datasetT):
        """
        Compute the support of every candidate itemset.

        Parameters:
        - C: iterable of candidate itemsets (tuples), all of the same size.
        - datasetT: sized iterable of transactions, each an iterable of
          hashable items.

        Returns:
        A dict mapping each candidate to the fraction of transactions that
        contain all of its items. An empty C yields an empty dict; an empty
        dataset yields a support of 0.0 for every candidate.
        """
        counts = {candidate: 0 for candidate in C}
        if not counts:
            return counts

        # All candidates share one size; read it from the first one.
        size = len(next(iter(counts)))

        # Index the candidates by content so that matching does not depend on
        # the order in which items are written inside a transaction.
        by_content = {}
        for candidate in counts:
            by_content.setdefault(frozenset(candidate), []).append(candidate)

        for row in datasetT:
            # set(row): an item repeated inside a transaction counts once.
            for combo in combinations(set(row), size):
                for candidate in by_content.get(frozenset(combo), ()):
                    counts[candidate] += 1

        nb_transactions = len(datasetT)
        if nb_transactions == 0:
            return {candidate: 0.0 for candidate in counts}
        return {candidate: count / nb_transactions for candidate, count in counts.items()}

    def Lk_generator(self, C, supp_min):
        """
        Keep the candidate itemsets whose support reaches the threshold.

        Parameters:
        - C: dict mapping candidate itemsets to their support.
        - supp_min: minimum support (inclusive).

        Returns:
        A new dict with the retained itemsets, supports converted to float.
        """
        frequent = {}
        for itemset, support in C.items():
            support = float(support)
            if support >= supp_min:
                frequent[itemset] = support
        return frequent
    
    def appriori(self, min_supp, datasetT):
        """
        Run the level-wise Apriori search over the transactions.

        Parameters:
        - min_supp: minimum support an itemset needs to be kept.
        - datasetT: the transactions, as accepted by Ck_generator and
          support_calculator.

        Returns:
        A list of dicts, one per itemset size that produced at least one
        frequent itemset, each mapping itemsets (tuples) to their support.
        """
        levels = []
        size = 1
        previous_level = None
        # The dataset is only handed to Ck_generator for the first pass;
        # later passes build candidates from the previous frequent level.
        candidate_source = datasetT
        while True:
            candidates = self.Ck_generator(size, previous_level, candidate_source)
            if len(candidates) == 0:
                break

            supports = self.support_calculator(candidates, datasetT)
            frequent = self.Lk_generator(supports, min_supp)
            if len(frequent) > 0:
                levels.append(frequent)

            previous_level = frequent
            candidate_source = None
            size += 1

        return levels
    
    def regle_association(self, L):
        """
        Enumerate the association rules derivable from frequent itemsets.

        Parameters:
        - L: dict mapping frequent itemsets (tuples) to their support.

        Returns:
        A DataFrame with the columns 'antecedant', 'consequent', 'mesure'
        (initialised to 0.0) and 'support' (the support of the whole itemset),
        one row per rule. It is an empty DataFrame when no rule is produced.
        """
        rows = []

        def add_rule(antecedent, consequent, support):
            rows.append({'antecedant': antecedent, 'consequent': consequent,
                         'mesure': 0.0, 'support': support})

        for itemset, support in L.items():
            # The size is read per itemset. The original tried to cache it
            # behind a flag, but the flag was never cleared ("first==False" is
            # a comparison), so this is what actually ran.
            size = len(itemset)
            if size == 2:
                add_rule((itemset[0],), (itemset[1],), support)
                add_rule((itemset[1],), (itemset[0],), support)
                continue

            # Walk the split sizes from size-1 down to about half the itemset:
            # the mirrored rule is emitted alongside each split, and equal
            # halves are reached twice by the enumeration itself.
            for part_size in range(size - 1, size - (size // 2) - 1, -1):
                for part in combinations(itemset, part_size):
                    rest = tuple(sorted(set(itemset).symmetric_difference(part)))
                    add_rule(rest, part, support)
                    if len(rest) != len(part):
                        add_rule(part, rest, support)

        # Build the frame once instead of concatenating one row at a time.
        return pd.DataFrame(rows)
    
    def mesure_calculator(self, r, methode, L):
        """
        Compute one interestingness measure for a rule antecedent -> consequent.

        Parameters:
        - r: rule record indexable by "antecedant", "consequent" and "support"
          (the support of the whole itemset).
        - methode: 0 = confidence, 1 = cosine, 2 = lift, 3 = Jaccard,
          any other value = Kulczynski.
        - L: list of frequent-itemset dicts where L[n-1] maps the itemsets of
          size n to their support.

        Returns:
        The value of the requested measure.
        """
        joint = r["support"]
        antecedent_support = L[len(r["antecedant"]) - 1][r["antecedant"]]
        if methode == 0:
            # Confidence only needs the antecedent.
            return joint / antecedent_support

        consequent_support = L[len(r["consequent"]) - 1][r["consequent"]]
        if methode == 1:
            return joint / math.sqrt(antecedent_support * consequent_support)
        if methode == 2:
            return joint / (antecedent_support * consequent_support)
        if methode == 3:
            return joint / (antecedent_support + consequent_support - joint)
        # Kulczynski: mean of the confidences in both directions.
        return 0.5 * ((joint / antecedent_support) + (joint / consequent_support))

    def regles_frequente(self, L, conf_min, m):
        """
        Build the association rules whose measure reaches a threshold.

        Parameters:
        - L: list of frequent-itemset dicts as returned by appriori
          (L[0] holds the 1-itemsets, which produce no rule).
        - conf_min: minimum value (inclusive) of the measure.
        - m: measure selector forwarded to mesure_calculator.

        Returns:
        A DataFrame of the retained rules ('antecedant', 'consequent',
        'mesure', 'support'), keeping the row labels they had before
        filtering; an empty DataFrame when no rule exists.
        """
        # Collect the per-level rule tables and concatenate them once, rather
        # than growing a frame from an empty DataFrame level by level.
        frames = [self.regle_association(level) for level in L[1:]]
        frames = [frame for frame in frames if len(frame) > 0]
        if not frames:
            return pd.DataFrame()

        regles = pd.concat(frames, ignore_index=True)

        # Fill the measure by column name in a single assignment instead of
        # writing cell by cell through a hard-coded column position.
        regles['mesure'] = [
            self.mesure_calculator(regles.loc[i], m, L) for i in range(len(regles))
        ]

        return regles[regles['mesure'] >= conf_min]
    
    #**************************************** EXPERIMENTATION PLOTS **********************************************
    def rules_nbr_plot(self, transactions_table, supp_lower_bound, supp_upper_bound, conf_lower_bound, conf_upper_bound):
        """
        Plot the number of retained rules over a grid of thresholds.

        Parameters:
        - transactions_table: the transactions, as accepted by appriori.
        - supp_lower_bound, supp_upper_bound: range of minimum supports,
          sampled with a 0.01 step (upper bound excluded).
        - conf_lower_bound, conf_upper_bound: range of minimum confidences,
          sampled with a 0.01 step (upper bound excluded).

        Returns:
        The path of the PNG file that was written.
        """
        results = np.empty((0, 3), float)
        for sup_min in np.arange(supp_lower_bound, supp_upper_bound, 0.01):
            # The frequent itemsets depend only on the support threshold, so
            # they are mined once per sup_min instead of once per grid cell.
            L = self.appriori(sup_min, transactions_table)
            for conf_min in np.arange(conf_lower_bound, conf_upper_bound, 0.01):
                rs = self.regles_frequente(L, conf_min, 0)
                results = np.vstack((results, np.array([sup_min, conf_min, len(rs)])))

        fig, ax = plt.subplots(subplot_kw={'projection': '3d'})

        Z = results[:, 2].reshape(results.shape[0], 1)
        ax.scatter(results[:, 0], results[:, 1], Z, c='r', marker='o')

        # Triangulate the (support, confidence) grid to draw a surface
        # through the sampled points.
        tri = Delaunay(results[:, :2])

        ax.plot_trisurf(results[:, 0], results[:, 1], Z.flatten(), triangles=tri.simplices, cmap="viridis", linewidth=0.9, antialiased=True)

        ax.set_xlabel('sup')
        ax.set_ylabel('conf')
        ax.set_zlabel('nbr de regles frequentes', labelpad=10)

        ax.view_init(elev=10, azim=-40)
        plt.title("Nombre de regles frequentes générées par sup_min et conf_min")
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png')
        plt.close(fig)

        # NOTE(review): the path uses a Windows separator and the "plots"
        # directory is not created here - confirm both with the callers.
        with open("plots\\rules_nbr_plot.png", 'wb') as f:
            f.write(buffer.getvalue())

        return "plots\\rules_nbr_plot.png"
        
    def freq_items_nbr_plot(self, transactions_table, supp_lower_bound, supp_upper_bound):
        """
        Plot how many frequent itemsets are found for each minimum support.

        Parameters:
        - transactions_table: the transactions, as accepted by appriori.
        - supp_lower_bound, supp_upper_bound: the thresholds are scanned from
          the upper bound down towards the lower bound (excluded) with a
          0.001 step.

        The curve is drawn with pyplot on the current figure; nothing is
        saved or returned.
        """
        points = []
        for threshold in np.arange(supp_upper_bound, supp_lower_bound, -0.001):
            levels = self.appriori(threshold, transactions_table)
            # Total number of frequent itemsets across all sizes.
            nb_itemsets = 0
            for level in levels:
                nb_itemsets += len(level)
            points.append([threshold, nb_itemsets])

        # reshape keeps a (0, 2) array when the threshold range is empty.
        curve = np.array(points, dtype=float).reshape(-1, 2)

        plt.plot(curve[:, 0], curve[:, 1])
        plt.title('Evolution du nombre de motifs frequents selon le support min')
        plt.xlabel('supmin')
        plt.ylabel('nbr motifs frequents')

    def time_exec_plot(self, transactions_table, supp_lower_bound, supp_upper_bound):
        """
        Plot the average Apriori running time for each minimum support.

        Parameters:
        - transactions_table: the transactions, as accepted by appriori.
        - supp_lower_bound, supp_upper_bound: the thresholds are scanned from
          the upper bound down towards the lower bound (excluded) with a
          0.002 step.

        Each threshold is timed over 10 runs and the mean is plotted with
        pyplot on the current figure; nothing is saved or returned.
        """
        runs = 10
        timings = []

        for sup_min in np.arange(supp_upper_bound, supp_lower_bound, -0.002):
            elapsed = 0.0
            for _ in range(runs):
                # perf_counter is the clock intended for measuring short
                # durations; time.time() can jump when the system clock is
                # adjusted.
                start = time.perf_counter()
                self.appriori(sup_min, transactions_table)
                elapsed += time.perf_counter() - start
            timings.append([sup_min, elapsed / runs])

        # reshape keeps a (0, 2) array when the threshold range is empty.
        curve = np.array(timings, dtype=float).reshape(-1, 2)

        plt.plot(curve[:, 0], curve[:, 1])
        # The original literal used '' as an escaped apostrophe; in Python
        # that is implicit concatenation and the apostrophes were dropped.
        plt.title("Evolution du temps d'execution d'apriori selon le sup_min")
        plt.xlabel('Supmin')
        plt.ylabel('Temps d execution')

    def memory_alloc_plot(self, transactions_table, supp_lower_bound, supp_upper_bound, conf_lower_bound, conf_upper_bound):
        """
        Plot the memory growth of mining plus rule generation over a grid.

        Parameters:
        - transactions_table: the transactions, as accepted by appriori.
        - supp_lower_bound, supp_upper_bound: range of minimum supports,
          sampled with a 0.06 step (upper bound excluded).
        - conf_lower_bound, conf_upper_bound: range of minimum confidences,
          sampled with a 0.02 step (upper bound excluded).

        For each grid cell the resident set size of the current process is
        read before and after running appriori and regles_frequente, and the
        difference (in MB) is plotted.

        Returns:
        The path of the PNG file that was written.
        """
        bytes_per_mb = 1024 * 1024
        process = psutil.Process()
        resultsMemory = np.empty((0, 3), float)

        for sup_min in np.arange(supp_lower_bound, supp_upper_bound, 0.06):
            for conf_min in np.arange(conf_lower_bound, conf_upper_bound, 0.02):
                # rss is reported in bytes; the original additionally divided
                # by 2024, which made the "MB" values meaningless.
                initial_memory = process.memory_info().rss / bytes_per_mb
                L = self.appriori(sup_min, transactions_table)
                # The results stay referenced so they are still alive when the
                # final measurement is taken.
                rs = self.regles_frequente(L, conf_min, 0)
                final_memory = process.memory_info().rss / bytes_per_mb
                resultsMemory = np.vstack((resultsMemory, np.array([sup_min, conf_min, final_memory - initial_memory])))

        fig, ax = plt.subplots(subplot_kw={'projection': '3d'})

        Z = resultsMemory[:, 2].reshape(resultsMemory.shape[0], 1)
        ax.scatter(resultsMemory[:, 0], resultsMemory[:, 1], Z, c='r', marker='o')

        # Create Delaunay triangulation
        tri = Delaunay(resultsMemory[:, :2])

        # Plot the surface using the triangulation
        ax.plot_trisurf(resultsMemory[:, 0], resultsMemory[:, 1], Z.flatten(), triangles=tri.simplices, cmap="viridis", linewidth=0.9, antialiased=True)

        ax.set_xlabel('sup__min')
        ax.set_ylabel('conf_min')
        ax.set_zlabel('memoire', labelpad=10)
        ax.view_init(elev=10, azim=-30)
        # The doubled apostrophes of the original title were displayed as-is.
        plt.title("Evolution de l'espace aloué à l'algorithme apriori et regles d'association selon le sup_min et conf_min")
        buffer = io.BytesIO()
        plt.savefig(buffer, format='png')
        plt.close(fig)

        # Save the buffer to the specified filename
        # NOTE(review): the path uses a Windows separator and the "plots"
        # directory is not created here - confirm both with the callers.
        with open("plots\\memory_alloc_plot.png", 'wb') as f:
            f.write(buffer.getvalue())

        return "plots\\memory_alloc_plot.png"