Añadir etiquetas a las barras de un gráfico de Seaborn

Estoy tratando de crear dos gráficos de barras agrupadas horizontales, alineados uno junto al otro. Tengo una gran cantidad de datos de varios modelos de Machine Learning y sus correspondientes tiempos de ejecución, y me gustaría mostrar todos estos datos de manera significativa. Mi intento hasta ahora es el siguiente:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Model names. Every DataN_* list below holds one value per model, in this order.
labels = [
    'MLP', 'FCN', 'ResNet', 'ROCKET',
    '1-NN DTW', 'LightGBM', 'XGBoost', 'CatBoost',
]

# Accuracy of each model on each of the 16 datasets (plotted later on a 0-100 % axis).
# NOTE(review): the first five entries are identical in every list -- presumably
# placeholder values for the first five models; confirm before publishing.
Data1_Accuracy = [20, 34, 30, 35, 27, 77.83125, 78.7204167, 78.5354167]
Data2_Accuracy = [20, 34, 30, 35, 27, 75.7979167, 76.2520833, 77.87]
Data3_Accuracy = [20, 34, 30, 35, 27, 80.14625, 81.5033333, 81.4625]
Data4_Accuracy = [20, 34, 30, 35, 27, 78.3841667, 79.34875, 80.5270833]
Data5_Accuracy = [20, 34, 30, 35, 27, 79.2495833, 77.5370833, 79.2666667]
Data6_Accuracy = [20, 34, 30, 35, 27, 77.03125, 77.2429167, 77.9960275]
Data7_Accuracy = [20, 34, 30, 35, 27, 81.3241667, 80.5408333, 84.2083333]
Data8_Accuracy = [20, 34, 30, 35, 27, 78.1470833, 78.1225, 80.2754167]
Data9_Accuracy = [20, 34, 30, 35, 27, 80.7383333, 79.9358333, 79.6916667]
Data10_Accuracy = [20, 34, 30, 35, 27, 74.1095833, 73.0879167, 73.0529167]
Data11_Accuracy = [20, 34, 30, 35, 27, 78.4775, 77.8658333, 78.35]
Data12_Accuracy = [20, 34, 30, 35, 27, 73.0991667, 71.9683333, 72.75625]
Data13_Accuracy = [20, 34, 30, 35, 27, 79.03, 79.575, 80.3870833]
Data14_Accuracy = [20, 34, 30, 35, 27, 81.0241667, 81.455, 80.5516667]
Data15_Accuracy = [20, 34, 30, 35, 27, 79.4829167, 80.01375, 81.68]
Data16_Accuracy = [20, 34, 30, 35, 27, 81.1158333, 80.9795833, 80.6541667]

# Training + evaluation time of each model on each of the 16 datasets.
# The values are divided by 3600 below to obtain hours, i.e. they are in seconds.
Data1_Times = [20, 34, 30, 35, 27, 829.0177925, 58.6558111, 8493.968922]
Data2_Times = [20, 34, 30, 35, 27, 604.5935536, 64.3871907, 6833.585728]
Data3_Times = [20, 34, 30, 35, 27, 1286.01507, 92.4329714, 6821.308612]
Data4_Times = [20, 34, 30, 35, 27, 757.3903304, 78.7253731, 5455.483287]
Data5_Times = [20, 34, 30, 35, 27, 401.3722335, 30.4119882, 5160.041989]
Data6_Times = [20, 34, 30, 35, 27, 321.4673242, 54.1971346, 4465.557807]
Data7_Times = [20, 34, 30, 35, 27, 2598.48826, 193.1256487, 10811.65574]
Data8_Times = [20, 34, 30, 35, 27, 1545.059628, 139.9638344, 7784.332016]
Data9_Times = [20, 34, 30, 35, 27, 663.416329, 615.3660963, 3560.337827]
Data10_Times = [20, 34, 30, 35, 27, 670.1615828, 621.8249994, 3567.653313]
Data11_Times = [20, 34, 30, 35, 27, 619.1959161, 572.3292757, 3493.582855]
Data12_Times = [20, 34, 30, 35, 27, 626.107683, 579.0746278, 3528.605614]
Data13_Times = [20, 34, 30, 35, 27, 2936.5633, 2631.284413, 6465.254111]
Data14_Times = [20, 34, 30, 35, 27, 2967.02757, 2672.068268, 6551.57865]
Data15_Times = [20, 34, 30, 35, 27, 4102.511475, 3711.899848, 7704.401239]
Data16_Times = [20, 34, 30, 35, 27, 4075.485739, 3726.896591, 7737.482708]

# Same measurements expressed in hours, as NumPy arrays.
_SECONDS_PER_HOUR = 3600
Data1_TimesInHours = np.asarray(Data1_Times) / _SECONDS_PER_HOUR
Data2_TimesInHours = np.asarray(Data2_Times) / _SECONDS_PER_HOUR
Data3_TimesInHours = np.asarray(Data3_Times) / _SECONDS_PER_HOUR
Data4_TimesInHours = np.asarray(Data4_Times) / _SECONDS_PER_HOUR
Data5_TimesInHours = np.asarray(Data5_Times) / _SECONDS_PER_HOUR
Data6_TimesInHours = np.asarray(Data6_Times) / _SECONDS_PER_HOUR
Data7_TimesInHours = np.asarray(Data7_Times) / _SECONDS_PER_HOUR
Data8_TimesInHours = np.asarray(Data8_Times) / _SECONDS_PER_HOUR
Data9_TimesInHours = np.asarray(Data9_Times) / _SECONDS_PER_HOUR
Data10_TimesInHours = np.asarray(Data10_Times) / _SECONDS_PER_HOUR
Data11_TimesInHours = np.asarray(Data11_Times) / _SECONDS_PER_HOUR
Data12_TimesInHours = np.asarray(Data12_Times) / _SECONDS_PER_HOUR
Data13_TimesInHours = np.asarray(Data13_Times) / _SECONDS_PER_HOUR
Data14_TimesInHours = np.asarray(Data14_Times) / _SECONDS_PER_HOUR
Data15_TimesInHours = np.asarray(Data15_Times) / _SECONDS_PER_HOUR
Data16_TimesInHours = np.asarray(Data16_Times) / _SECONDS_PER_HOUR

# Wide accuracy table: one row per model ('Index'), one column per dataset.
# The column order is taken from the dict itself so that the keys and the
# `columns` list cannot drift apart.  (The 'Data12_Accuracy' key used to carry
# a stray ')' and therefore did not match the requested column name, which
# left that column without data.)
_accuracy_columns = {
    'Index': labels,
    'Data1_Accuracy': Data1_Accuracy,
    'Data2_Accuracy': Data2_Accuracy,
    'Data3_Accuracy': Data3_Accuracy,
    'Data4_Accuracy': Data4_Accuracy,
    'Data5_Accuracy': Data5_Accuracy,
    'Data6_Accuracy': Data6_Accuracy,
    'Data7_Accuracy': Data7_Accuracy,
    'Data8_Accuracy': Data8_Accuracy,
    'Data9_Accuracy': Data9_Accuracy,
    'Data10_Accuracy': Data10_Accuracy,
    'Data11_Accuracy': Data11_Accuracy,
    'Data12_Accuracy': Data12_Accuracy,
    'Data13_Accuracy': Data13_Accuracy,
    'Data14_Accuracy': Data14_Accuracy,
    'Data15_Accuracy': Data15_Accuracy,
    'Data16_Accuracy': Data16_Accuracy,
}
accuraciesDataFrame = pd.DataFrame(_accuracy_columns, columns=list(_accuracy_columns))
        
# Wide run-time table (hours): one row per model ('Index'), one column per dataset.
_time_column_order = ['Index'] + [
    'Data{}_TimesInHours'.format(dataset_number) for dataset_number in range(1, 17)
]
timesDataFrame = pd.DataFrame(
    {
        'Index': labels,
        'Data1_TimesInHours': Data1_TimesInHours,
        'Data2_TimesInHours': Data2_TimesInHours,
        'Data3_TimesInHours': Data3_TimesInHours,
        'Data4_TimesInHours': Data4_TimesInHours,
        'Data5_TimesInHours': Data5_TimesInHours,
        'Data6_TimesInHours': Data6_TimesInHours,
        'Data7_TimesInHours': Data7_TimesInHours,
        'Data8_TimesInHours': Data8_TimesInHours,
        'Data9_TimesInHours': Data9_TimesInHours,
        'Data10_TimesInHours': Data10_TimesInHours,
        'Data11_TimesInHours': Data11_TimesInHours,
        'Data12_TimesInHours': Data12_TimesInHours,
        'Data13_TimesInHours': Data13_TimesInHours,
        'Data14_TimesInHours': Data14_TimesInHours,
        'Data15_TimesInHours': Data15_TimesInHours,
        'Data16_TimesInHours': Data16_TimesInHours,
    },
    columns=_time_column_order,
)
 
# Reshape both wide tables to long form, keeping 'Index' as the identifier;
# the plotting code reads the resulting 'variable' and 'value' columns.
accuraciesDataFrameMelted = accuraciesDataFrame.melt(id_vars=['Index'])
timesDataFrameMelted = timesDataFrame.melt(id_vars=['Index'])

# Two axes side by side: accuracies on the left, run times on the right.
fig, axs = plt.subplots(nrows=1, ncols=2)
accuracy_ax, time_ax = axs
fig.set_size_inches(30, 10)

# Only the first and last entries are used, as the x limits of the accuracy chart.
xRangeFirstChart = list(range(0, 101))

fig.suptitle(
    'Rounded accuracies (%) and times for training and evaluation (h) for different data types and models',
    fontsize=26,
)

# Left chart: horizontal bars, one group per model, one bar per dataset.
g1 = sns.barplot(
    data=accuraciesDataFrameMelted,
    x='value',
    y='Index',
    hue='variable',
    ax=accuracy_ax,
)
accuracy_ax.set_xlim([xRangeFirstChart[0], xRangeFirstChart[-1]])
accuracy_ax.set_ylabel('Model', fontsize=24)
accuracy_ax.set_xlabel('Rounded Accuracy (%)', fontsize=24)
accuracy_ax.set_title('Rounded accuracies (%) for different data types and models', fontsize=22)

# Right chart: same layout for the run times.
g2 = sns.barplot(
    data=timesDataFrameMelted,
    x='value',
    y='Index',
    hue='variable',
    ax=time_ax,
)

# Drop both hue legends.
for chart_ax in (accuracy_ax, time_ax):
    chart_ax.get_legend().remove()

# Hide the right chart's y axis and label the rest of it.
time_ax.yaxis.set_visible(False)
time_ax.set_xlabel('Training and evaluation time (h)', fontsize=24)
time_ax.set_title('Rounded training and evaluation time (h) for different data types and models', fontsize=22)

plt.savefig('PathToFigure/MyFigure.png', dpi=300, bbox_inches='tight', pad_inches=0)

Lo que me falta es una forma de escribir las etiquetas "Data 1", "Data 2", "Data 3", etc. en cada barra. Por favor, consulta la imagen para ver lo que estoy tratando de lograr. ¡Cualquier ayuda es muy apreciada!

Wanted output

Pregunta hecha hace 3 años, 4 meses, 29 días - Por techinnovator


3 Respuestas:

  • Como hay tantas barras en un solo gráfico, yo usaría sns.catplot para dibujar las diferentes categorías en un FacetGrid; así resulta mucho más fácil añadir etiquetas, lo que puedes hacer con la función personalizada add_labels (ten en cuenta los diferentes parámetros; no dudes en eliminar algunos o añadir otros. La he adaptado de esta solución).

    También puede hacer que el eje x sea más variable si pasa sharex=False al crear los catplots (ver final de esta solución)

    Además, sns.catplot no funciona bien cuando se añade a subplots existentes para guardarlo todo como una sola figura. Por eso uso plt.close(fig) para deshacerme de la figura en blanco que creamos; esto también significa que añadir cualquier formato (como un título) a esa figura sería inútil, ya que nos deshacemos de ella al final. Sin embargo, hay trucos. Uno es guardar las figuras por separado y utilizar una solución como la de aquí para combinarlas en un único .pdf. Creo que sería mejor aprovechar el espacio extra de un gráfico por página o imagen. Otra opción es usar un pequeño truco para meterlo todo en una figura:

    # Placeholder figure with two axes; it is discarded by the plt.close(fig) call further down.
    fig, ax = plt.subplots(nrows=2)
    # Global seaborn/matplotlib styling applied before any grid is created.
    sns.set_context('paper', font_scale=1.4)
    plt.style.use('dark_background')
    
    n_cols=4 # facets per row; reused below as col_wrap and when indexing g2.axes
    # Accuracy grid: one facet per model (col='Index'), one horizontal bar per dataset.
    # NOTE(review): the accompanying text says catplot does not cooperate with
    # existing subplots, so the ax= argument presumably has no effect -- confirm.
    g1 = sns.catplot(data=accuraciesDataFrameMelted, x='value', y='variable', col='Index', kind='bar', 
                     col_wrap=n_cols, ax=ax[0])
    g1.fig.suptitle('Rounded accuracies (%) for different data types and models',fontsize=22)      
    # Adjust the subplot margins right after building the grid (statement order matters here).
    plt.subplots_adjust(top=0.9, bottom=-0.5)        
    
    # Run-time grid, built the same way as the accuracy grid.
    g2 = sns.catplot(data=timesDataFrameMelted, x='value', y='variable', col='Index', kind='bar', 
                     col_wrap=n_cols, ax=ax[1])
    g2.fig.suptitle('Rounded training and evaluation time (h) for different data types and models',fontsize=22)
    plt.subplots_adjust(top=0.9, bottom=-0.5)      
    
    def add_labels(graph, category_size, axis_number, omit_thresh, width_var, num_format):
        """Write each bar's length as text on the bar, for every facet of a grid.

        Args:
            graph: object exposing ``facet_axis(row, col)``; here the grid
                returned by ``sns.catplot``.
            category_size: number of facets to visit; column indices
                ``0 .. category_size - 1`` are requested.
            axis_number: row index forwarded to ``graph.facet_axis``.
            omit_thresh: bars whose width is not strictly greater than this
                value get no label.
            width_var: fraction of the bar width at which the label is
                centred horizontally (0.5 puts it in the middle of the bar).
            num_format: ``str.format`` template applied to the bar width,
                e.g. ``'{:1.2f}'``.

        Returns:
            None. The labels are added to the facet axes in place.
        """
        for facet_index in range(category_size):
            facet = graph.facet_axis(axis_number, facet_index)
            for bar in facet.patches:
                bar_length = bar.get_width()
                # Written as "not >" so that NaN widths are skipped as well.
                if not bar_length > omit_thresh:
                    continue
                label_x = bar_length * width_var
                label_y = bar.get_y() + bar.get_height() / 2  # vertical middle of the bar
                facet.text(label_x, label_y, num_format.format(bar_length),
                           ha='center', va='center')
    
    
    # Number of distinct models, i.e. how many facets add_labels has to visit.
    l1 = len(accuraciesDataFrameMelted['Index'].unique())
    l2 = len(timesDataFrame['Index'].unique())

    # (grid, facet count, row index, omit threshold, number format) per grid.
    label_settings = (
        (g1, l1, 0, 1, '{:1.0f}'),
        (g2, l2, 1, 0.1, '{:1.2f}'),
    )
    for grid, facet_count, row_index, threshold, number_format in label_settings:
        add_labels(graph=grid, category_size=facet_count, axis_number=row_index,
                   omit_thresh=threshold, width_var=0.5, num_format=number_format)

    # Y label on one facet of each grid, then facet titles and shared axis labels.
    g1.axes[0].set_ylabel('Model')
    g2.axes[n_cols].set_ylabel('Model')
    for grid in (g1, g2):
        grid.set_titles("{col_name}", fontsize=12)
    g1.set_axis_labels(x_var="Rounded Accuracy (%)", y_var="Model")
    g2.set_axis_labels(x_var="Training and evaluation time (h)", y_var="Model")

    # Drop the placeholder figure, save each grid to its own PDF, then display.
    plt.close(fig)
    g1.fig.savefig('g1.pdf', dpi=300, bbox_inches="tight")
    g2.fig.savefig('g2.pdf', dpi=300, bbox_inches="tight")
    plt.show()
    

    (Vista ampliada para mostrar el primer gráfico)

    enter image description here


    (Vista alejada para mostrar ambos gráficos)

    enter image description here


    También puedes hacer que el eje x varíe entre facetas si pasas sharex=False al crear el catplot, haciendo los cambios que se muestran a continuación (pasar sharex=False y cambiar uno de los parámetros de mi función a omit_thresh=0):

    # Options common to both grids. Per the accompanying text, sharex=False
    # lets the x axis vary between facets.
    facet_options = dict(x='value', y='variable', col='Index', kind='bar',
                         col_wrap=n_cols, sharex=False)
    g1 = sns.catplot(data=accuraciesDataFrameMelted, ax=ax[0], **facet_options)
    g2 = sns.catplot(data=timesDataFrameMelted, ax=ax[1], **facet_options)

    # omit_thresh=0: every bar with a width above zero gets a label.
    add_labels(graph=g1, category_size=l1, axis_number=0,
               omit_thresh=0, width_var=0.5, num_format='{:1.0f}')
    add_labels(graph=g2, category_size=l2, axis_number=1,
               omit_thresh=0, width_var=0.5, num_format='{:1.3f}')
    

    enter image description here

    Respondida el Dec 16, 2020 a las 19:41 - por bugbusterx

    Votos positivos: 0 | Votos negativos: 0

  • Para añadir etiquetas a las barras de tu gráfico, puedes usar el método text de los ejes de Matplotlib (o, de forma alternativa, annotate). Este método te permite añadir texto en coordenadas específicas del gráfico. En tu caso, puedes iterar sobre cada barra y añadir la etiqueta correspondiente.

    Aquí te muestro cómo modificar tu código para añadir las etiquetas "Data1", "Data2", etc. a las barras de tus gráficos:

    python
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    
    # Model names, one per row of the plotted data.
    labels = ['MLP','FCN','ResNet','ROCKET','1-NN DTW','LightGBM','XGBoost','CatBoost']
    
    

    # Tu código existente...

    # Plotting

    # Figure with two side-by-side axes; only the left (accuracy) chart is drawn here.
    fig, axs = plt.subplots(1, 2)
    fig.set_size_inches(30, 10)
    xRangeFirstChart = list(range(0, 101))
    fig.suptitle('Rounded accuracies (%) and times for training and evaluation (h) for different data types and models', fontsize=26)

    g1 = sns.barplot(x='value', y='Index', hue='variable', data=accuraciesDataFrameMelted, ax=axs[0])

    # Label every bar with its dataset ("Data 1" ... "Data 16").
    # The bars are grouped per hue level (all models of dataset 1, then all
    # models of dataset 2, ...), so the dataset number comes from the bar
    # container's position, not from the bar's running index modulo 16.
    # NOTE(review): relies on seaborn creating one bar container per hue
    # level for this call -- confirm on the seaborn version in use.
    for dataset_number, container in enumerate(axs[0].containers, start=1):
        label = f'Data {dataset_number}'
        for bar in container:
            width = bar.get_width()
            if np.isnan(width):
                continue  # no value for this model/dataset pair, nothing to label
            # ha='left' anchors the text just past the bar end instead of
            # centring it over the end of the bar.
            axs[0].text(width + 1, bar.get_y() + bar.get_height() / 2, label,
                        ha='left', va='center', fontsize=12, color='black', clip_on=True)

    axs[0].set_xlim([xRangeFirstChart[0], xRangeFirstChart[-1]])
    axs[0].set_ylabel('Model', fontsize=24)
    axs[0].set_xlabel('Rounded Accuracy (%)', fontsize=24)
    axs[0].set_title('Rounded accuracies (%) for different data types and models', fontsize=22)

    # Resto de tu código...

    # Write the current figure to disk as a tightly cropped 300-dpi PNG.
    # NOTE(review): 'PathToFigure' is presumably a placeholder directory -- replace with a real path.
    plt.savefig('PathToFigure/MyFigure.png', dpi=300, bbox_inches='tight', pad_inches=0)

    Con el código adicional proporcionado, ahora las etiquetas "Data 1", "Data 2", etc. se mostrarán a la derecha de cada barra en el primer gráfico. Puedes ajustar la posición y el estilo de las etiquetas según tus preferencias.

    Respondida el Dec 17, 2020 a las 19:38 - por Gpt

    Votos positivos: 0 | Votos negativos: 0

  • Puedes añadir las etiquetas a tus barras utilizando el método annotate() de Matplotlib. Aquí tienes un ejemplo modificado de tu código que añade las etiquetas a cada barra:

    ``` import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns

    # Model names; every list below holds one value per model, in this order.
    labels = ['MLP', 'FCN', 'ResNet', 'ROCKET', '1-NN DTW', 'LightGBM', 'XGBoost', 'CatBoost']

    # Accuracy per model for each of the 16 datasets.
    # NOTE(review): these names lack the underscores used in the question's
    # code (e.g. Data1_Accuracy) -- presumably lost in formatting; confirm.
    Data1Accuracy = [20, 34, 30, 35, 27, 77.83125, 78.7204167, 78.5354167]
    Data2Accuracy = [20, 34, 30, 35, 27, 75.7979167, 76.2520833, 77.87]
    Data3Accuracy = [20, 34, 30, 35, 27, 80.14625, 81.5033333, 81.4625]
    Data4Accuracy = [20, 34, 30, 35, 27, 78.3841667, 79.34875, 80.5270833]
    Data5Accuracy = [20, 34, 30, 35, 27, 79.2495833, 77.5370833, 79.2666667]
    Data6Accuracy = [20, 34, 30, 35, 27, 77.03125, 77.2429167, 77.9960275]
    Data7Accuracy = [20, 34, 30, 35, 27, 81.3241667, 80.5408333, 84.2083333]
    Data8Accuracy = [20, 34, 30, 35, 27, 78.1470833, 78.1225, 80.2754167]
    Data9Accuracy = [20, 34, 30, 35, 27, 80.7383333, 79.9358333, 79.6916667]
    Data10Accuracy = [20, 34, 30, 35, 27, 74.1095833, 73.0879167, 73.0529167]
    Data11Accuracy = [20, 34, 30, 35, 27, 78.4775, 77.8658333, 78.35]
    Data12Accuracy = [20, 34, 30, 35, 27, 73.0991667, 71.9683333, 72.75625]
    Data13Accuracy = [20, 34, 30, 35, 27, 79.03, 79.575, 80.3870833]
    Data14Accuracy = [20, 34, 30, 35, 27, 81.0241667, 81.455, 80.5516667]
    Data15Accuracy = [20, 34, 30, 35, 27, 79.4829167, 80.01375, 81.68]
    Data16Accuracy = [20, 34, 30, 35, 27, 81.1158333, 80.9795833, 80.6541667]

    # Run time per model for each of the 16 datasets; divided by 3600 below to get hours.
    Data1Times = [20, 34, 30, 35, 27, 829.0177925, 58.6558111, 8493.968922]
    Data2Times = [20, 34, 30, 35, 27, 604.5935536, 64.3871907, 6833.585728]
    Data3Times = [20, 34, 30, 35, 27, 1286.01507, 92.4329714, 6821.308612]
    Data4Times = [20, 34, 30, 35, 27, 757.3903304, 78.7253731, 5455.483287]
    Data5Times = [20, 34, 30, 35, 27, 401.3722335, 30.4119882, 5160.041989]
    Data6Times = [20, 34, 30, 35, 27, 321.4673242, 54.1971346, 4465.557807]
    Data7Times = [20, 34, 30, 35, 27, 2598.48826, 193.1256487, 10811.65574]
    Data8Times = [20, 34, 30, 35, 27, 1545.059628, 139.9638344, 7784.332016]
    Data9Times = [20, 34, 30, 35, 27, 663.416329, 615.3660963, 3560.337827]
    Data10Times = [20, 34, 30, 35, 27, 670.1615828, 621.8249994, 3567.653313]
    Data11Times = [20, 34, 30, 35, 27, 619.1959161, 572.3292757, 3493.582855]
    Data12Times = [20, 34, 30, 35, 27, 626.107683, 579.0746278, 3528.605614]
    Data13Times = [20, 34, 30, 35, 27, 2936.5633, 2631.284413, 6465.254111]
    Data14Times = [20, 34, 30, 35, 27, 2967.02757, 2672.068268, 6551.57865]
    Data15Times = [20, 34, 30, 35, 27, 4102.511475, 3711.899848, 7704.401239]
    Data16Times = [20, 34, 30, 35, 27, 4075.485739, 3726.896591, 7737.482708]

    # Conversion to hours (the snippet only shows the first three datasets).
    Data1TimesInHours = np.array(Data1Times) / 3600
    Data2TimesInHours = np.array(Data2Times) / 3600
    Data3TimesInHours = np.array(Data3Times) / 3600

    Respondida el Dec 17, 2020 a las 19:48 - por Gemini

    Votos positivos: 0 | Votos negativos: 0