# import libraries
import pandas as pd
import numpy as np
import seaborn as sns 
from matplotlib import pyplot as plt
import matplotlib as mpl
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# change this to your own data directory
data_dir = "data/"

# read and preprocess data
text_file_name = "osdg-community-data-v2023-01-01.csv"
text_df = pd.read_csv(data_dir + text_file_name, sep="\t", quotechar='"')
col_names = text_df.columns.values[0].split('\t')
text_df[col_names] = text_df[text_df.columns.values[0]].apply(lambda x: pd.Series(str(x).split("\t")))
text_df = text_df.astype({'sdg':int, 'labels_negative': int, 'labels_positive':int, 'agreement': float}, copy=True)
text_df.drop(text_df.columns.values[0], axis=1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2")
text_df.reset_index(inplace=True, drop=True)
docs = text_df.text
categories = text_df.sdg
X_train, X_test, y_train, y_test = \
    train_test_split(docs, categories, test_size=0.33, random_state=7)

14.9. Solutions to Exercises: Sections 5 to 7

14.9.1. Classification of Texts

For Exercises 1 to 3, the code below establishes the TF-IDF features used for all of the subsequent classifications.

X_train_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words = "english", min_df=5)
X_train_tfidf_vectorizer.fit(X_train)
X_train_tfidf_vector = X_train_tfidf_vectorizer.transform(X_train)
X_test_tfidf_vector = X_train_tfidf_vectorizer.transform(X_test)

Exercise 1

tfidf_mlp_clf = MLPClassifier(random_state=1, max_iter=100).fit(X_train_tfidf_vector, y_train)
y_pred = tfidf_mlp_clf.predict(X_test_tfidf_vector)
tfidf_mlp_clf.score(X_test_tfidf_vector, y_test)
print(metrics.classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           1       0.79      0.81      0.80       481
           2       0.82      0.87      0.84       316
           3       0.94      0.93      0.94       674
           4       0.92      0.94      0.93       863
           5       0.92      0.92      0.92       920
           6       0.90      0.91      0.91       465
           7       0.88      0.88      0.88       730
           8       0.68      0.61      0.64       353
           9       0.79      0.80      0.79       328
          10       0.64      0.58      0.61       256
          11       0.85      0.88      0.86       462
          12       0.85      0.78      0.81       217
          13       0.85      0.87      0.86       443
          14       0.93      0.92      0.93       263
          15       0.89      0.85      0.87       313
          16       0.97      0.97      0.97      1057

    accuracy                           0.88      8141
   macro avg       0.85      0.85      0.85      8141
weighted avg       0.88      0.88      0.88      8141

Exercise 2

tfidf_multinomialNB_clf = MultinomialNB().fit(X_train_tfidf_vector, y_train)
y_pred = tfidf_multinomialNB_clf.predict(X_test_tfidf_vector)
print(metrics.classification_report(y_test,y_pred))
              precision    recall  f1-score   support

           1       0.70      0.73      0.71       481
           2       0.90      0.67      0.77       316
           3       0.92      0.90      0.91       674
           4       0.76      0.96      0.85       863
           5       0.62      0.94      0.75       920
           6       0.86      0.81      0.84       465
           7       0.60      0.97      0.74       730
           8       0.85      0.08      0.15       353
           9       0.91      0.32      0.48       328
          10       0.89      0.12      0.21       256
          11       0.86      0.75      0.80       462
          12       0.97      0.30      0.46       217
          13       0.84      0.80      0.82       443
          14       0.96      0.61      0.75       263
          15       0.96      0.64      0.77       313
          16       0.86      0.98      0.91      1057

    accuracy                           0.77      8141
   macro avg       0.84      0.66      0.68      8141
weighted avg       0.81      0.77      0.74      8141

Exercise 3

from sklearn.linear_model import RidgeClassifier

tfidf_ridge_clf = RidgeClassifier(tol=1e-2, solver="sparse_cg")
tfidf_ridge_clf = tfidf_ridge_clf.fit(X_train_tfidf_vector, y_train)
y_pred = tfidf_ridge_clf.predict(X_test_tfidf_vector)
print(metrics.classification_report(y_test,y_pred, digits = 4))
              precision    recall  f1-score   support

           1     0.8174    0.8004    0.8088       481
           2     0.8192    0.8892    0.8528       316
           3     0.9294    0.9377    0.9335       674
           4     0.9109    0.9594    0.9345       863
           5     0.9092    0.9467    0.9276       920
           6     0.9188    0.9247    0.9218       465
           7     0.8714    0.9096    0.8901       730
           8     0.7186    0.6006    0.6543       353
           9     0.8233    0.7957    0.8093       328
          10     0.6990    0.5625    0.6234       256
          11     0.8690    0.8615    0.8652       462
          12     0.8836    0.7696    0.8227       217
          13     0.8521    0.8713    0.8616       443
          14     0.9423    0.9316    0.9369       263
          15     0.9112    0.8850    0.8979       313
          16     0.9513    0.9612    0.9562      1057

    accuracy                         0.8840      8141
   macro avg     0.8642    0.8504    0.8560      8141
weighted avg     0.8815    0.8840    0.8820      8141

Exercise 4

Implementations may vary. One example is given below:

def ClassifyDocs(data, classifier_algorithm, params):
    docs = data.text
    categories = data.sdg
    X_train, X_test, y_train, y_test = train_test_split(docs, categories, test_size=0.33, random_state=7)

    X_train_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words = "english", min_df=5)
    X_train_tfidf_vectorizer.fit(X_train)
    X_train_tfidf_vector = X_train_tfidf_vectorizer.transform(X_train)
    X_test_tfidf_vector = X_train_tfidf_vectorizer.transform(X_test)
    
    if classifier_algorithm == "mlp":
        tfidf_mlp_clf = MLPClassifier(random_state=params[0], max_iter=params[1]).fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_mlp_clf.predict(X_test_tfidf_vector)
        tfidf_mlp_clf.score(X_test_tfidf_vector, y_test)
        print(metrics.classification_report(y_test,y_pred))

    elif classifier_algorithm == "multinomialNB":
        tfidf_multinomialNB_clf = MultinomialNB().fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_multinomialNB_clf.predict(X_test_tfidf_vector)
        print(metrics.classification_report(y_test,y_pred))

    elif classifier_algorithm == "ridge":
        tfidf_ridge_clf = RidgeClassifier(tol=params[0], solver=params[1])
        tfidf_ridge_clf = tfidf_ridge_clf.fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_ridge_clf.predict(X_test_tfidf_vector)
        print(metrics.classification_report(y_test,y_pred, digits = 4))

    else:
        print("Invalid classifier algorithm")

Exercise 5

print("MLP")
ClassifyDocs(text_df, "mlp", [1,100])
print("MultinomialNB")
ClassifyDocs(text_df, "multinomialNB", [])
print("Ridge")
class_params = [1e-2, "sparse_cg"]
ClassifyDocs(text_df, "ridge", class_params)
MLP
              precision    recall  f1-score   support

           1       0.79      0.81      0.80       481
           2       0.82      0.87      0.84       316
           3       0.94      0.93      0.94       674
           4       0.92      0.94      0.93       863
           5       0.92      0.92      0.92       920
           6       0.90      0.91      0.91       465
           7       0.88      0.88      0.88       730
           8       0.68      0.61      0.64       353
           9       0.79      0.80      0.79       328
          10       0.64      0.58      0.61       256
          11       0.85      0.88      0.86       462
          12       0.85      0.78      0.81       217
          13       0.85      0.87      0.86       443
          14       0.93      0.92      0.93       263
          15       0.89      0.85      0.87       313
          16       0.97      0.97      0.97      1057

    accuracy                           0.88      8141
   macro avg       0.85      0.85      0.85      8141
weighted avg       0.88      0.88      0.88      8141

MultinomialNB
              precision    recall  f1-score   support

           1       0.70      0.73      0.71       481
           2       0.90      0.67      0.77       316
           3       0.92      0.90      0.91       674
           4       0.76      0.96      0.85       863
           5       0.62      0.94      0.75       920
           6       0.86      0.81      0.84       465
           7       0.60      0.97      0.74       730
           8       0.85      0.08      0.15       353
           9       0.91      0.32      0.48       328
          10       0.89      0.12      0.21       256
          11       0.86      0.75      0.80       462
          12       0.97      0.30      0.46       217
          13       0.84      0.80      0.82       443
          14       0.96      0.61      0.75       263
          15       0.96      0.64      0.77       313
          16       0.86      0.98      0.91      1057

    accuracy                           0.77      8141
   macro avg       0.84      0.66      0.68      8141
weighted avg       0.81      0.77      0.74      8141

Ridge
              precision    recall  f1-score   support

           1     0.8174    0.8004    0.8088       481
           2     0.8192    0.8892    0.8528       316
           3     0.9294    0.9377    0.9335       674
           4     0.9109    0.9594    0.9345       863
           5     0.9092    0.9467    0.9276       920
           6     0.9188    0.9247    0.9218       465
           7     0.8714    0.9096    0.8901       730
           8     0.7186    0.6006    0.6543       353
           9     0.8233    0.7957    0.8093       328
          10     0.6990    0.5625    0.6234       256
          11     0.8690    0.8615    0.8652       462
          12     0.8836    0.7696    0.8227       217
          13     0.8521    0.8713    0.8616       443
          14     0.9423    0.9316    0.9369       263
          15     0.9112    0.8850    0.8979       313
          16     0.9513    0.9612    0.9562      1057

    accuracy                         0.8840      8141
   macro avg     0.8642    0.8504    0.8560      8141
weighted avg     0.8815    0.8840    0.8820      8141

14.9.2. Evaluating Classification

Exercise 1

First, we do the prep needed before making the ROC curve:

from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import LabelBinarizer

import tensorflow as tf
import tensorflow_hub as hub

# change this to your own embedding directory
embedding_dir = ""

# load the embedding
embed = hub.load(embedding_dir + "universal-sentence-encoder_4")

text_df["embedding"] = list(embed(text_df.text))

file_name = "sdg_names_definitions.csv"
sdg_names = pd.read_csv(data_dir + file_name)

docs = text_df.embedding.tolist()
scaler = preprocessing.MinMaxScaler().fit(docs)
X = scaler.transform(docs)
y = text_df.sdg

label_binarizer = LabelBinarizer().fit(y)
y_onehot = label_binarizer.transform(y)
n_classes = len(label_binarizer.classes_)
class_names = [sdg_names[sdg_names["sdg"] == label_binarizer.classes_[i]].sdg_name.item() \
               for i in range(n_classes)]

Then we can make the curves themselves, first for the Naive Bayes classification:

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=.33, random_state=0)
ovr_nb_clf = OneVsRestClassifier(MultinomialNB()).fit(X_train,y_train)
y_score_nb = ovr_nb_clf.predict_proba(X_test)
# cm.get_cmap is deprecated since Matplotlib 3.7; use the colormap registry instead
colors = list(mpl.colormaps["tab20"].colors)

fpr = dict()
tpr = dict()
roc_auc = dict()
for class_id in range(n_classes):
    fpr[class_id], tpr[class_id], _ = metrics.roc_curve(y_test[:, class_id], y_score_nb[:, class_id]) # roc_curve works on binary
    roc_auc[class_id] = metrics.auc(fpr[class_id], tpr[class_id])

for class_id, color in zip(range(n_classes), colors):
    plt.plot(fpr[class_id], tpr[class_id], color=color, lw=1.5,alpha = 1, 
             label='SDG {0} - {1} (AUC = {2:0.2f})'
             ''.format(class_id+1, class_names[class_id], roc_auc[class_id]))
plt.plot([0, 1], [0, 1], '--', lw=1, color="lightgrey")
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC for SDG multiclass classifier')
plt.legend(loc="lower right", fontsize=6)
plt.show()
[Figure: one-vs-rest ROC curves for all 16 SDG classes, Naive Bayes scores]

And then for the Ridge classification:

ovr_ridge_clf = OneVsRestClassifier(RidgeClassifier(tol=1e-2, solver="sparse_cg")).fit(X_train, y_train)
# predict() returns hard 0/1 labels, so each ROC curve below is a single corner;
# decision_function() would provide continuous scores for a smoother curve
y_score_ridge = ovr_ridge_clf.predict(X_test)

fpr = dict()
tpr = dict()
roc_auc = dict()
for class_id in range(n_classes):
    fpr[class_id], tpr[class_id], _ = metrics.roc_curve(y_test[:, class_id], y_score_ridge[:, class_id]) # roc_curve works on binary
    roc_auc[class_id] = metrics.auc(fpr[class_id], tpr[class_id])

for class_id, color in zip(range(n_classes), colors):
    plt.plot(fpr[class_id], tpr[class_id], color=color, lw=1.5,alpha = 1, 
             label='SDG {0} - {1} (AUC = {2:0.2f})'
             ''.format(class_id+1, class_names[class_id], roc_auc[class_id]))
plt.plot([0, 1], [0, 1], '--', lw=1, color="lightgrey")
plt.xlim([-0.05, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC for SDG multiclass classifier')
plt.legend(loc="lower right", fontsize=6)
plt.show()
[Figure: one-vs-rest ROC curves for all 16 SDG classes, ridge classifier]

When making the plot for the ridge classifier, students will likely notice that ridge classifiers do not have a predict_proba method. Feel free to encourage students to look to other resources for overcoming this problem. Alternatively, ask them why they think this is the case: in its basic form, ridge regression does not calculate probabilities for its results; it assigns a raw regression value to each class and does not convert those values into probabilities by default.
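One possible workaround, sketched below rather than prescribed by the chapter, is to feed those raw values to the ROC directly: roc_curve only needs scores that rank the examples, not calibrated probabilities, and the fitted one-vs-rest model exposes them through decision_function.

# a sketch: rank-based scores from decision_function can stand in for probabilities
y_score_ridge_cont = ovr_ridge_clf.decision_function(X_test)
fpr8, tpr8, _ = metrics.roc_curve(y_test[:, 7], y_score_ridge_cont[:, 7])  # column 7 = SDG 8
print("AUC for SDG 8:", metrics.auc(fpr8, tpr8))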

Exercise 2

Answers may vary depending on the chosen goals but should follow the example given in the chapter.

Exercise 3

First, we perform the classification:

docs = text_df.embedding.tolist()
scaler = preprocessing.MinMaxScaler().fit(docs)
X = scaler.transform(docs)
y = text_df.sdg

label_binarizer = LabelBinarizer().fit(y)
y_onehot = label_binarizer.transform(y)
n_classes = len(label_binarizer.classes_)
class_names = [sdg_names[sdg_names["sdg"] == label_binarizer.classes_[i]].sdg_name.item() \
               for i in range(n_classes)]

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=.33, random_state=0)
ovr_nb_clf = OneVsRestClassifier(MultinomialNB()).fit(X_train,y_train)
y_score_nb = ovr_nb_clf.predict_proba(X_test)

Then we can make the plot, first for our Naive Bayes classification. Note that we used SDG 15 for this plot because the goal we first tried yielded a plotting error for us; students can choose other goals as well.

from sklearn.metrics import RocCurveDisplay

# y_train and y_test already come from y_onehot, so reuse the label_binarizer
# fitted on y above rather than re-fitting one on the one-hot rows
y_onehot_test = y_test

fig, ax = plt.subplots(figsize=(6,6))

class_of_interest = 8 # SDG 8
class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
RocCurveDisplay.from_predictions(
    y_onehot_test[:, class_id],
    y_score_nb[:, class_id],
    name=f"SDG {class_of_interest} vs the rest",
    color="darkorange",
    ax = ax,
)

class_of_interest = 15
class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
RocCurveDisplay.from_predictions(
    y_onehot_test[:, class_id],
    y_score_nb[:, class_id],
    name=f"SDG {class_of_interest} vs the rest",
    color="purple",
    ax = ax,
)

plt.plot([0, 1], [0, 1], "--", label="chance level (AUC = 0.5)", color = "grey")
plt.plot([0.05, 0.05], [0, 1], "--", label="FPR at the 0.05 level", color = "green")

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("One-vs-Rest ROC curves")
plt.legend()
plt.show()
[Figure: one-vs-rest ROC curves for SDG 8 and SDG 15, Naive Bayes scores]

And then the Ridge classifier:

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size=.33, random_state=0)
ovr_ridge_clf = OneVsRestClassifier(RidgeClassifier(tol=1e-2, solver="sparse_cg")).fit(X_train, y_train)
# decision_function supplies the continuous scores that RidgeClassifier's predict() lacks
y_score_ridge = ovr_ridge_clf.decision_function(X_test)

fig, ax = plt.subplots(figsize=(6,6))

class_of_interest = 8 # SDG 8
class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
RocCurveDisplay.from_predictions(
    y_onehot_test[:, class_id],
    y_score_ridge[:, class_id],
    name=f"SDG {class_of_interest} vs the rest",
    color="darkorange",
    ax = ax,
)

class_of_interest = 15
class_id = np.flatnonzero(label_binarizer.classes_ == class_of_interest)[0]
RocCurveDisplay.from_predictions(
    y_onehot_test[:, class_id],
    y_score_ridge[:, class_id],
    name=f"SDG {class_of_interest} vs the rest",
    color="purple",
    ax = ax,
)

plt.plot([0, 1], [0, 1], "--", label="chance level (AUC = 0.5)", color = "grey")
plt.plot([0.05, 0.05], [0, 1], "--", label="FPR at the 0.05 level", color = "green")

plt.axis("square")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("One-vs-Rest ROC curves")
plt.legend()
plt.show()
[Figure: one-vs-rest ROC curves for SDG 8 and SDG 15, ridge classifier scores]

Exercise 4

Implementations may vary. One example is given below:

def ClassifyDocs_Advanced(data, classifier_algorithm, params, roc_plot = False, quick_metrics = False):
    docs = data.text
    categories = data.sdg
    X_train, X_test, y_train, y_test = train_test_split(docs, categories, test_size=0.33, random_state=7)

    X_train_tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1), stop_words = "english", min_df=5)
    X_train_tfidf_vectorizer.fit(X_train)
    X_train_tfidf_vector = X_train_tfidf_vectorizer.transform(X_train)
    X_test_tfidf_vector = X_train_tfidf_vectorizer.transform(X_test)

    metrics_report = ""
    
    if classifier_algorithm == "mlp":
        tfidf_mlp_clf = MLPClassifier(random_state=params[0], max_iter=params[1]).fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_mlp_clf.predict(X_test_tfidf_vector)
        tfidf_mlp_clf.score(X_test_tfidf_vector, y_test)
        metrics_report = metrics.classification_report(y_test,y_pred)

    elif classifier_algorithm == "multinomialNB":
        tfidf_multinomialNB_clf = MultinomialNB().fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_multinomialNB_clf.predict(X_test_tfidf_vector)
        metrics_report = metrics.classification_report(y_test,y_pred)

    elif classifier_algorithm == "ridge":
        tfidf_ridge_clf = RidgeClassifier(tol=params[0], solver=params[1])
        tfidf_ridge_clf = tfidf_ridge_clf.fit(X_train_tfidf_vector, y_train)
        y_pred = tfidf_ridge_clf.predict(X_test_tfidf_vector)
        metrics_report = metrics.classification_report(y_test,y_pred)

    else:
        print("Invalid classifier algorithm")
        return 0

    if quick_metrics:
        # print only the summary lines (accuracy, macro avg, weighted avg)
        print("\n".join(metrics_report.splitlines()[-3:]))
    else:
        print(metrics_report)

    if roc_plot:
        #plot ROC curve
        label_binarizer = LabelBinarizer().fit(y_test)
        y_onehot_test = label_binarizer.transform(y_test)
        y_score = None
        if classifier_algorithm == "mlp":
            y_score = tfidf_mlp_clf.predict_proba(X_test_tfidf_vector)
        elif classifier_algorithm == "multinomialNB":
            y_score = tfidf_multinomialNB_clf.predict_proba(X_test_tfidf_vector)
        elif classifier_algorithm == "ridge":
            y_score = tfidf_ridge_clf.decision_function(X_test_tfidf_vector)
        else:
            print("Invalid classifier algorithm")
            return 0

        fig, ax = plt.subplots(figsize=(6,6))

        colors = list(mpl.colormaps["tab20"].colors)

        n_classes = len(label_binarizer.classes_)
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for class_id in range(n_classes):
            # y_test holds plain labels here, so use the one-hot version,
            # and score with this function's own classifier
            fpr[class_id], tpr[class_id], _ = metrics.roc_curve(y_onehot_test[:, class_id], y_score[:, class_id]) # roc_curve works on binary
            roc_auc[class_id] = metrics.auc(fpr[class_id], tpr[class_id])

        for class_id, color in zip(range(n_classes), colors):
            plt.plot(fpr[class_id], tpr[class_id], color=color, lw=1.5, alpha=1,
                     label='SDG {0} - {1} (AUC = {2:0.2f})'
                     ''.format(class_id+1, class_names[class_id], roc_auc[class_id]))  # class_names from the ROC prep above
        plt.plot([0, 1], [0, 1], '--', lw=1, color="lightgrey")
        plt.xlim([-0.05, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC for SDG multiclass classifier')
        plt.legend(loc="lower right", fontsize=6)
        plt.show()
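No driver code is given for this function; a brief usage sketch (assuming text_df and the globals prepared above, such as class_names, are in scope) might look like:

# example calls (a sketch; the parameter values echo Exercise 5)
ClassifyDocs_Advanced(text_df, "multinomialNB", [], quick_metrics=True)
ClassifyDocs_Advanced(text_df, "ridge", [1e-2, "sparse_cg"], roc_plot=True)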

Exercise 5

A point at (1,1) corresponds to a model that predicts everything as positive, so both the true positive rate and the false positive rate are 100%. A point at (0,0) corresponds to a model that predicts everything as negative, so both rates are 0%. While students do not have to elaborate on this, it demonstrates that a 100% true positive rate, or a 0% false positive rate, is not by itself the mark of a good model.
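A minimal sketch (not part of the exercise) confirming where the two degenerate classifiers land:

# a minimal sketch: the all-positive and all-negative classifiers sit at (1,1) and (0,0)
y_true = np.array([0, 0, 1, 1, 0, 1])
for name, y_pred in [("all positive", np.ones_like(y_true)),
                     ("all negative", np.zeros_like(y_true))]:
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    print(name, "TPR =", tp / (tp + fn), "FPR =", fp / (fp + tn))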

14.9.3. Applications

Exercise 1

Since we are classifying new text data, we do not need to hold out as many data points for testing. In the previous chapters, more testing points were better for model evaluation; here, more training data makes for a more accurate classification of the new texts.
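For instance, the split could simply reserve less data for testing (a sketch reusing the docs and categories prepared at the top of this page; the 0.1 fraction is an illustrative choice, not a value from the chapter):

# a sketch: a smaller test fraction leaves more data to train the deployed model
X_train, X_test, y_train, y_test = train_test_split(
    docs, categories, test_size=0.1, random_state=7)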

Exercise 2

For these exercises, the training process can take a while, but it can be interrupted partway without losing the fitted model, as was done in the solutions given below (note the "Training interrupted by user" warning). The results for the MLP classifier are given as follows:

import requests
from bs4 import BeautifulSoup
url = "https://news.un.org/en/story/2023/07/1138767"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

from nltk import sent_tokenize
page_sentences = []
for tag in soup.find_all('p'):
    if len(tag.text) >= 30:
        sentences = sent_tokenize(tag.text)
        page_sentences = page_sentences + sentences
page_sentence_df = pd.DataFrame({"text": page_sentences})

# change this to your own data directory
data_dir = "data/"

# read and preprocess data
text_file_name = "osdg-community-data-v2023-01-01.csv"
text_df = pd.read_csv(data_dir + text_file_name, sep="\t", quotechar='"')
col_names = text_df.columns.values[0].split('\t')
text_df[col_names] = text_df[text_df.columns.values[0]].apply(lambda x: pd.Series(str(x).split("\t")))
text_df = text_df.astype({'sdg':int, 'labels_negative': int, 'labels_positive':int, 'agreement': float}, copy=True)
text_df.drop(text_df.columns.values[0], axis=1, inplace=True)
text_df = text_df.query("agreement > 0.5 and (labels_positive - labels_negative) > 2")
text_df.reset_index(inplace=True, drop=True)

docs = text_df.text
categories = text_df.sdg
X_train, X_test, y_train, y_test = \
    train_test_split(docs, categories, test_size=0.33, random_state=7)

X_train_count_vectorizer = CountVectorizer(ngram_range=(2,2), stop_words="english")
X_train_count_vectorizer.fit(X_train)
X_train_count_vector = X_train_count_vectorizer.transform(X_train)
X_test_count_vector = X_train_count_vectorizer.transform(X_test)

count_MLP_clf = MLPClassifier(random_state=1, max_iter=100).fit(X_train_count_vector, y_train)
y_pred = count_MLP_clf.predict(X_test_count_vector)

fig, ax = plt.subplots(figsize=(15, 5))
font = {'family': 'sans-serif', 'weight': 'heavy','size': 7,} # set font for displaying confusion matrix
cm = mpl.colormaps["YlGnBu"] # set the color map for displaying confusion matrix
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, text_kw=font, ax=ax, cmap=cm)
print(metrics.classification_report(y_test,y_pred, digits = 4))
/opt/anaconda3/envs/nlp2/lib/python3.9/site-packages/sklearn/neural_network/_multilayer_perceptron.py:693: UserWarning: Training interrupted by user.
  warnings.warn("Training interrupted by user.")
              precision    recall  f1-score   support

           1     0.7833    0.8191    0.8008       481
           2     0.8672    0.7025    0.7762       316
           3     0.9139    0.9139    0.9139       674
           4     0.9034    0.9316    0.9173       863
           5     0.8196    0.9380    0.8748       920
           6     0.8554    0.9290    0.8907       465
           7     0.6689    0.9384    0.7811       730
           8     0.7807    0.4136    0.5407       353
           9     0.7925    0.5823    0.6714       328
          10     0.7616    0.4492    0.5651       256
          11     0.8546    0.7381    0.7921       462
          12     0.9007    0.5853    0.7095       217
          13     0.7106    0.8646    0.7800       443
          14     0.8125    0.8403    0.8262       263
          15     0.9174    0.6741    0.7772       313
          16     0.9545    0.9716    0.9630      1057

    accuracy                         0.8326      8141
   macro avg     0.8311    0.7682    0.7863      8141
weighted avg     0.8392    0.8326    0.8263      8141
[Figure: confusion matrix for the MLP classifier trained on bigram counts]

Similarly, the solution for the ridge classifier works as follows:

count_ridge_clf = RidgeClassifier(tol=1e-2, solver="sparse_cg").fit(X_train_count_vector, y_train)
y_pred = count_ridge_clf.predict(X_test_count_vector)

fig, ax = plt.subplots(figsize=(15, 5))
font = {'family': 'sans-serif', 'weight': 'heavy','size': 7,} # set font for displaying confusion matrix
cm = mpl.colormaps["YlGnBu"] # set the color map for displaying confusion matrix
ConfusionMatrixDisplay.from_predictions(y_test, y_pred, text_kw=font, ax=ax, cmap=cm)
print(metrics.classification_report(y_test,y_pred, digits = 4))
              precision    recall  f1-score   support

           1     0.7700    0.8004    0.7849       481
           2     0.7697    0.7405    0.7548       316
           3     0.8640    0.9139    0.8882       674
           4     0.7321    0.9502    0.8270       863
           5     0.8081    0.9109    0.8564       920
           6     0.8803    0.8538    0.8668       465
           7     0.8033    0.8616    0.8315       730
           8     0.6944    0.4249    0.5272       353
           9     0.7094    0.5732    0.6341       328
          10     0.7093    0.4766    0.5701       256
          11     0.7642    0.7294    0.7464       462
          12     0.7925    0.5806    0.6702       217
          13     0.8049    0.8104    0.8076       443
          14     0.8773    0.7338    0.7992       263
          15     0.9057    0.7061    0.7935       313
          16     0.9439    0.9555    0.9497      1057

    accuracy                         0.8138      8141
   macro avg     0.8018    0.7514    0.7692      8141
weighted avg     0.8132    0.8138    0.8079      8141
[Figure: confusion matrix for the ridge classifier trained on bigram counts]

Answers comparing these models may vary, but the two classifiers produce largely similar results.

Exercise 3

Answers may vary as to how the students classify the website by reading it; however, goal 8 (Decent work and economic growth) or goal 1 (No poverty) should make the most sense.

To classify this document with Naive Bayes, we do the following:

url = "https://www.rockefellerfoundation.org/our-work/economic-equity/"
response = requests.get(url)
soup = BeautifulSoup(response.content, "html.parser")

page_sentences = []
for tag in soup.find_all('p'):
    if len(tag.text) >= 30:
        sentences = sent_tokenize(tag.text)
        page_sentences = page_sentences + sentences
page_sentence_df = pd.DataFrame({"text": page_sentences})

count_NB_clf = MultinomialNB().fit(X_train_count_vector, y_train)
y_pred = count_NB_clf.predict(X_test_count_vector)

page_count_vector = X_train_count_vectorizer.transform(pd.Series(page_sentence_df.text.str.cat()))
page_pred = count_NB_clf.predict(page_count_vector)
page_pred
array([1])

The prediction is SDG 1 (No poverty), one of the two goals suggested above.

Exercise 4

High similarity indicates that the components of the two vectors align with each other, and therefore that the angle between the vectors is small. Since smaller angles have larger cosines, a higher cosine score indicates higher similarity. Students can also be asked to demonstrate this on a mathematical level, but this is optional.
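A minimal sketch (not required by the exercise) of the relationship between the angle and the score:

# a minimal sketch: the cosine score grows as the angle between vectors shrinks
def cosine_similarity(a, b):
    return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

v = np.array([1.0, 0.0])
print(cosine_similarity(v, np.array([1.0, 0.1])))   # small angle, score near 1
print(cosine_similarity(v, np.array([0.0, 1.0])))   # 90 degrees, score 0
print(cosine_similarity(v, np.array([-1.0, 0.0])))  # opposite directions, score -1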

Exercise 5

The process for generating the similarity matrix uses the same code given in the textbook examples, at least for dataframe generation.

import tensorflow as tf
import tensorflow_hub as hub

# change this to your own embedding directory
embedding_dir = ""

# load the embedding
embed = hub.load(embedding_dir + "universal-sentence-encoder_4")

text_df["embedding"] = list(embed(text_df.text))

def get_sentence_df(text_df):
    text_df_sentence = []
    text_df_sdg = []
    text_df_text_id = []
    embedding = []
    for (text, sdg, text_id) in zip(text_df.text, text_df.sdg, text_df.text_id):
        temp_sentence = sent_tokenize(text)
        text_df_sentence = text_df_sentence + temp_sentence
        text_df_sdg = text_df_sdg + [sdg]*len(temp_sentence)
        text_df_text_id = text_df_text_id + [text_id]*len(temp_sentence)
        embedding = embedding + list(embed(temp_sentence))
    sentence_df = pd.DataFrame({"text": text_df_sentence, 
                                "sdg": text_df_sdg, 
                                "text_id": text_df_text_id,
                                "embedding": embedding})
    return sentence_df

sentence_df = get_sentence_df(text_df)

For the first 40 documents, we'll use similar code but call it on a different dataframe:

sns.heatmap(np.array(np.inner(text_df.embedding[:40].tolist(), 
                              text_df.embedding[:40].tolist())))
[Figure: similarity heatmap for the first 40 document embeddings]

Exercise 6

The code for this is given below. Students may have trouble with runtimes due to the number of sentences, in which case they can subset by SDG or by position (a position-based sketch follows the full-matrix code).

sns.heatmap(np.array(np.inner(sentence_df.embedding.tolist(), \
                              sentence_df.embedding.tolist())))
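A position-based subset might look like the following (a sketch; the cap of 500 sentences is an arbitrary choice):

# a sketch: cap the number of sentences so the inner-product matrix stays small
sentence_subset = sentence_df.embedding[:500].tolist()
sns.heatmap(np.array(np.inner(sentence_subset, sentence_subset)))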

Exercise 7

def get_most_similar(text_df, sentence, n = 5):
    # inner products between every document embedding and the query embedding
    sentence_sim = np.inner(list(text_df.embedding), embed([sentence]))
    # the (n+1)-th highest similarity serves as the cutoff for the top n
    val = sorted(sentence_sim, reverse=True)[n]
    return text_df[sentence_sim > val].text

s = "countries are working hard to save the ocean aminals."
get_most_similar(text_df, sentence = s, n=5)
5780     Oceans cover approximately 71% of the Earth's ...
8186     It puts forward a working definition of the oc...
12106    However, many commercial fish stocks are overf...
15024    Ninety-eight per cent of the area occupied by ...
18113    The top three African countries that have the ...
Name: text, dtype: object

Exercise 8

Given the sheer number of sentences present, students can also work with subsets of them.

#get subset of sentence_df with sdg as 1 or 3
sentence_df_subset = sentence_df[sentence_df.sdg.isin([1,3])]

#make a heatmap
sns.heatmap(np.array(np.inner(sentence_df_subset.embedding.tolist(), \
                              sentence_df_subset.embedding.tolist())))
[Figure: similarity heatmap for sentence embeddings from SDGs 1 and 3]

Exercise 9

"""GetMostSimilarToURL(url, text_df, n = 5) takes in a url as a string, parses
it into a single string, and then calls get_most_similar to get the most similar documents."""
def GetMostSimilarToURL(url, text_df, n = 5):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, "html.parser")
    page_sentences = []
    for tag in soup.find_all('p'):
        if (len(tag.text)>=30):
            sentences = sent_tokenize(tag.text)
            page_sentences = page_sentences + sentences
    
    #make all the strings in page_sentences into one string
    page_string = ' '.join(page_sentences)

    #use this to call get_most_similar
    return get_most_similar(text_df, page_string, n)

Exercise 10

GetMostSimilarToURL("http://gianttortoise.org/en/beyond-tracking", text_df, n = 5)
4329     A rather large number of species of birds, inv...
4981     In general, neither mechanical nor chemical re...
5758     Assessments now typically involve an “intense ...
10274    The Conservation Banking scheme in the United ...
14146    Local communities and individual farmers prefe...
Name: text, dtype: object
GetMostSimilarToURL("https://www.dhs.gov/blue-campaign/what-human-trafficking", text_df, n = 5)
6677     ABSTRACTThis interpretive policy analysis of A...
10997    The trafficking of persons around the world is...
11409    Human trafficking is a serious global problem ...
22162    Sex trafficking is recognized as a national pr...
24426    International human rights law protects agains...
Name: text, dtype: object
GetMostSimilarToURL("https://www.dol.gov/agencies/odep/program-areas/individuals/older-workers", text_df, n = 5)
1868     In addition, the government has taken some mea...
2098     Although educational attainment has increased ...
18319    More than 280,000 disabled South Africans aged...
20533    Those who have been brought up in poor homes, ...
24499    Such a shift in viewpoint may account for a la...
Name: text, dtype: object
GetMostSimilarToURL("https://michigantoday.umich.edu/2022/08/26/positively-breaking-the-age-code/", text_df, n = 5)
6072     Purpose – The purpose of this paper is to prov...
9517     Eggleston and Fuchs (2012) show that at the be...
13177    The per-capita expenditure is projected accord...
15112    On the other hand, heavy drinking was shown to...
21460    The RAND Future Elderly Model [9] came the clo...
Name: text, dtype: object

Answers may vary as to whether or not the returned documents are truly the most similar. However, students should read the full texts, with calls such as text_df.text[6072], as much as possible before coming to a conclusion.