In [1]:
import os
import warnings
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.families import Gaussian
from statsmodels.genmod.families.links import identity
from sklearn.preprocessing import StandardScaler
import plotly.graph_objects as go
import plotly.io as pio

from src import config, process

In [2]:
# Suppress every RuntimeWarning for the rest of the session.
# NOTE(review): presumably aimed at numeric warnings raised during the
# analysis (e.g. log of a zero frequency) - confirm nothing important is hidden.
warnings.filterwarnings("ignore", category=RuntimeWarning)

In [3]:
def get_spearman_corr(df, column, name, conc, plot=False):
    """Print and return the Spearman correlation between concreteness and ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding the score column; it is joined to ``conc`` on the index.
    column : str
        Name of the column in ``df`` to correlate with concreteness.
    name : str
        Human-readable label used in the printed header.
    conc : pandas.DataFrame
        Table with a ``concreteness`` column, sharing its index with ``df``.
    plot : bool, default False
        If True, draw a seaborn joint plot of the two variables and save it
        under ``config.FIGURES_DIR``.

    Returns
    -------
    tuple
        ``(correlation, p_value)`` exactly as produced by
        ``scipy.stats.spearmanr``, so callers can collect the numbers instead
        of parsing the printed text.
    """
    # Inner join on the index keeps only entries present in both tables;
    # rows with a missing value in any merged column are discarded.
    merged = pd.merge(
        left=conc, right=df[[column]], left_index=True, right_index=True, how='inner'
    ).dropna()

    # get spearman correlation
    correlation, p_value = spearmanr(merged['concreteness'], merged[column])

    # print results
    print(f'Concreteness and {name}')
    print(f'Spearman Correlation Coefficient: {correlation:.4f}')
    print(f'P-value: {p_value:.4e}')

    if plot:
        # plot results and save figure; plt.* below act on the figure that
        # jointplot has just made current
        sns.jointplot(x="concreteness", y=column, data=merged,
                      kind="scatter", joint_kws={"s": 20, "alpha": 0.2})
        fig_name = f"joinplot_concreteness_{column}.png"
        plt.subplots_adjust(left=0.2)
        plt.savefig(os.path.join(config.FIGURES_DIR, fig_name), bbox_inches='tight')
        plt.show()

    return correlation, p_value
    

In [4]:
def get_glm(df, column, name, conc, freq, plot=False):
    """Fit a Gaussian GLM (identity link) of ``column`` on concreteness, frequency and their interaction.

    Each of the three input tables is z-scored on its own before merging, the
    tables are inner-joined on their index, and incomplete rows are dropped.
    The model summary is printed and the fitted results object is returned.

    ``name`` and ``plot`` are accepted but not used by this function.
    """
    def _zscore(frame):
        # StandardScaler hands back a bare array, so rebuild the frame to
        # keep the column labels and the index needed for the joins below.
        return pd.DataFrame(
            StandardScaler().fit_transform(frame),
            columns=frame.columns,
            index=frame.index,
        )

    scores, concreteness, frequency = _zscore(df), _zscore(conc), _zscore(freq)

    # create merged dataset: concreteness + target column + frequency
    merged = (
        pd.merge(left=concreteness, right=scores[[column]],
                 left_index=True, right_index=True, how='inner')
        .merge(frequency, left_index=True, right_index=True, how='inner')
        .dropna()
    )

    # R equivalent:
    # glm(polysemy ~ conc*freq, data = data, family = gaussian(link="identity"))
    fitted = smf.glm(
        formula=f"{column} ~ concreteness*frequency",
        data=merged,
        family=Gaussian(link=identity()),
    ).fit()
    print(fitted.summary())
    return fitted

In [5]:
# Function to manually calculate predictions
def manual_prediction(coeffs, data):
    """Evaluate the linear predictor by hand from a set of coefficients.

    ``coeffs`` is indexed by the labels ``'Intercept'``, ``'concreteness'``,
    ``'frequency'`` and ``'concreteness:frequency'``; ``data`` provides the
    ``'concreteness'`` and ``'frequency'`` values. Returns
    intercept + b_c * c + b_f * f + b_cf * c * f.
    """
    concreteness = data['concreteness']
    frequency = data['frequency']

    main_effects = (coeffs['Intercept']
                    + coeffs['concreteness'] * concreteness
                    + coeffs['frequency'] * frequency)
    interaction = coeffs['concreteness:frequency'] * concreteness * frequency
    return main_effects + interaction

def plot_glm(model, name, lower_bound=-3, upper_bound=3):
    """Plot the GLM's predicted surface with its 95% confidence band and save it as HTML.

    The model is evaluated on a 100 x 100 grid spanning
    ``[lower_bound, upper_bound]`` on both the concreteness and the frequency
    axis. Three surfaces are drawn: the predicted mean and the lower and upper
    limits of its confidence interval.

    Parameters
    ----------
    model : fitted statsmodels GLM results
        Must have been fitted through the formula interface with
        ``concreteness`` and ``frequency`` as predictors (as ``get_glm`` does),
        so it can be evaluated directly on a frame holding those two columns.
    name : str
        Figure title; also used as the file name, with spaces replaced by
        underscores.
    lower_bound, upper_bound : float
        Range of the evaluation grid on both axes.

    Returns
    -------
    plotly.graph_objects.Figure
        The figure that was written to ``config.FIGURES_DIR``.
    """
    # Generate grid for concreteness and frequency
    concreteness_range = np.linspace(lower_bound, upper_bound, 100)
    frequency_range = np.linspace(lower_bound, upper_bound, 100)
    concreteness_grid, frequency_grid = np.meshgrid(concreteness_range, frequency_range)
    grid_shape = concreteness_grid.shape

    # Create DataFrame for prediction. The intercept and the interaction term
    # are rebuilt by the model's formula, so only the raw predictors are needed.
    df_pred = pd.DataFrame({
        'concreteness': concreteness_grid.ravel(),
        'frequency': frequency_grid.ravel()
    })

    # Confidence band for the predicted mean. Plugging the lower/upper limits
    # of each coefficient into the linear predictor is not a valid band: it
    # ignores the covariance between coefficients, and wherever a predictor is
    # negative the "lower" coefficients give the larger prediction.
    band = model.get_prediction(df_pred).summary_frame(alpha=0.05)

    # Create the Plotly 3D plot
    fig = go.Figure()

    # (trace name, values, extra Surface options); the bounds are drawn
    # translucent and without their own colour bar.
    surfaces = [
        ('Predicted Score', band['mean'], {}),
        ('Lower Bound', band['mean_ci_lower'], dict(opacity=0.3, showscale=False)),
        ('Upper Bound', band['mean_ci_upper'], dict(opacity=0.3, showscale=False)),
    ]
    for label, values, extra in surfaces:
        fig.add_trace(go.Surface(
            z=np.asarray(values).reshape(grid_shape),
            x=concreteness_grid,
            y=frequency_grid,
            colorscale='Viridis',
            name=label,
            **extra,
        ))

    # Update layout
    fig.update_layout(
        title=name,
        scene=dict(
            xaxis_title='Concreteness',
            yaxis_title='Frequency',
            zaxis_title='Score',
            camera=dict(eye=dict(x=1.87, y=0.88, z=0.64))
        ),
        autosize=False,
        width=800,
        height=700,
        margin=dict(l=65, r=50, b=65, t=90)
    )

    # Save next to the other figures; only the file name has its spaces
    # replaced, the directory is taken verbatim from the project config.
    file_name = f'{name}.html'.replace(' ', '_')
    fig.write_html(os.path.join(config.FIGURES_DIR, file_name), auto_open=False)
    return fig


In [6]:
def run(language):
    """Run the concreteness/polysemy analyses for one language.

    Loads the historical and contemporary polysemy scores, the concreteness
    ratings and the log-transformed word frequencies for ``language``. For
    each of three score columns it then prints the Spearman correlation with
    concreteness, fits and prints the GLM with the frequency interaction, and
    writes the 3D surface plot.

    Parameters
    ----------
    language : str
        Sub-folder / file-name suffix of the dataset, e.g. ``"german"``.

    Returns
    -------
    dict
        Fitted GLM results keyed by score column (``'slope'``,
        ``'contemp_polysemy_score'``, ``'polysemy_score_1990'``), so that
        ``model = run(...)`` in a cell no longer binds ``None``.
    """
    folder = os.path.join(config.PROCESSED_DATA_DIR, language)
    hist = pd.read_csv(os.path.join(folder, f'hist_polysemy_score_{language}.csv'), sep=';', index_col=0)
    contemp = pd.read_csv(os.path.join(folder, f'contemp_polysemy_score_{language}.csv'), sep=';', index_col=0)
    conc = pd.read_csv(os.path.join(folder, f'concreteness_{language}.csv'), sep=';', index_col='Word')
    # NOTE(review): expected to return a frame with a 'frequency' column,
    # indexed like the score tables - confirm against src.process.
    freq = process.get_most_frequent_words(
        input_dir=os.path.join(config.EXTERNAL_DATA_DIR, language),
        input_file=config.FREQUENCY_FILENAMES[language],
        language=language,
        nr_words=20_000,
        vocab_only=False)
    # Work with log-frequency in the regression.
    freq['frequency'] = freq['frequency'].apply(np.log)

    # (score table, column, label for correlation/plot, label passed to the GLM)
    analyses = [
        (hist, 'slope',
         'Polysemy Score Evolution',
         'Polysemy score Evolution with Frequency dependence'),
        (contemp, 'contemp_polysemy_score',
         'Contemporary Polysemy Score',
         'Contemporary Polysemy score with Frequency dependence'),
        (hist, 'polysemy_score_1990',
         'Historic Polysemy Score 1990s',
         'History Polysemy score 1990s with Frequency dependence'),
    ]

    models = {}
    for position, (scores, column, label, glm_label) in enumerate(analyses):
        if position:
            # visual separator between consecutive analyses in the cell output
            print("\n\n\n")
        get_spearman_corr(scores, column, f'{label} {language}', conc)
        model = get_glm(scores, column, f"{glm_label} {language}", conc, freq)
        plot_glm(model, f'{label} {language}')
        models[column] = model
    return models

## German

In [7]:
# Run the correlation and GLM analyses for German; the printed results follow below.
model = run(language="german")

Concreteness and Polysemy Score Evolution german
Spearman Correlation Coefficient: 0.2739
P-value: 1.1263e-26
                 Generalized Linear Model Regression Results                  
Dep. Variable:                  slope   No. Observations:                 1468
Model:                            GLM   Df Residuals:                     1464
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                         0.67734
Method:                          IRLS   Log-Likelihood:                -1795.0
Date:                Wed, 20 Dec 2023   Deviance:                       991.62
Time:                        23:25:21   Pearson chi2:                     992.
No. Iterations:                     3   Pseudo R-squ. (CS):             0.1149
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
---------

## English

In [8]:
# Run the correlation and GLM analyses for English; the printed results follow below.
model = run(language="english")

Concreteness and Polysemy Score Evolution english
Spearman Correlation Coefficient: -0.0542
P-value: 4.0462e-11
                 Generalized Linear Model Regression Results                  
Dep. Variable:                  slope   No. Observations:                14842
Model:                            GLM   Df Residuals:                    14838
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                         0.93594
Method:                          IRLS   Log-Likelihood:                -20567.
Date:                Wed, 20 Dec 2023   Deviance:                       13887.
Time:                        23:25:22   Pearson chi2:                 1.39e+04
No. Iterations:                     3   Pseudo R-squ. (CS):            0.06641
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
-------

## French

In [9]:
# Run the correlation and GLM analyses for French; the printed results follow below.
run(language="french")

Concreteness and Polysemy Score Evolution french
Spearman Correlation Coefficient: -0.3043
P-value: 6.8459e-25
                 Generalized Linear Model Regression Results                  
Dep. Variable:                  slope   No. Observations:                 1454
Model:                            GLM   Df Residuals:                     1450
Model Family:                Gaussian   Df Model:                            3
Link Function:               identity   Scale:                          1.0906
Method:                          IRLS   Log-Likelihood:                -2124.2
Date:                Wed, 20 Dec 2023   Deviance:                       1581.4
Time:                        23:25:24   Pearson chi2:                 1.58e+03
No. Iterations:                     3   Pseudo R-squ. (CS):            0.06503
Covariance Type:            nonrobust                                         
                             coef    std err          z      P>|z|      [0.025      0.975]
--------

## Compare cutoff percentiles 

In [10]:
# load reference dataset: count how often each value of 'Word' occurs in the
# definitions file and index the resulting table by 'Word'
polysemy_reference = (
    pd.read_csv("data/external/english/concreteness_w_definition.csv", usecols=["Word"])
    .value_counts()
    .reset_index()
    .set_index('Word', drop=True)
)

In [11]:
# load other reference: ';'-separated table whose first column becomes the
# index; this is the frame the cutoff-percentile comparison merges against
polysemy_reference_ = pd.read_csv('polysemy_from_diacronic.csv', sep=';', index_col=0)

In [12]:
# Spearman correlation between the contemporary polysemy score and the
# reference measure, once per cutoff percentile used to build the score file.
for cutoff_percentile in (75, 80, 85, 90, 95, 96, 97, 98, 99):
    print(f"Cutoff Percentile: {cutoff_percentile}")
    contemp = pd.read_csv(
        f"data/processed/english/contemp_polysemy_score_english_{cutoff_percentile}.csv",
        sep=';',
        index_col=0,
    )
    merged = pd.merge(contemp, polysemy_reference_, how='inner', left_index=True, right_index=True)
    # change 'word' to 0 for other dataset
    correlation = merged.corr('spearman').loc['contemp_polysemy_score', 'word']
    print(f"Correlation: {correlation}\n")

Cutoff Percentile: 75
Correlation: 0.1856562251272132

Cutoff Percentile: 80
Correlation: 0.188895760230789

Cutoff Percentile: 85
Correlation: 0.18798944245805527

Cutoff Percentile: 90
Correlation: 0.18664202957580978

Cutoff Percentile: 95
Correlation: 0.17560989044088063

Cutoff Percentile: 96
Correlation: 0.1666502282402278

Cutoff Percentile: 97
Correlation: 0.15882297459383482

Cutoff Percentile: 98
Correlation: 0.14201753593960414

Cutoff Percentile: 99
Correlation: 0.1133414986139982

