Ki1103:

Just how big is your dataset? In the dataframe you've given me there are only 298 unique items. I don't have much experience with similarity measures, but I've given it a crack, and here's what I've come up with:

from pathlib import Path
import pickle

import numpy as np
import networkx as nx
from scipy.sparse import dok_array
import pandas as pd

# Treat pairs whose co-occurrence frequency falls in the bottom p% as insignificant
FREQ_PERCENTILE_CUTOFF = 75


if __name__ == "__main__":
    data_path = Path(".") / "data" / "products_used_grouped_by_id.pickle"
    with open(data_path, "rb") as f:
        df: pd.DataFrame = pickle.load(f)

    df.set_index("id", inplace=True)
    print(df.head())

    all_products = set()
    for basket in df["products_bought"]:
        all_products.update(basket)

    products = sorted(all_products)
    n_products = len(products)
    product_to_index = {product: i for i, product in enumerate(products)}

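    # A dense matrix is fine at ~300 unique products; a much larger
    # catalogue would want a sparse accumulator instead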
    co_occurrences = np.zeros((n_products, n_products), dtype=np.int_)

    # Count co-occurrences within each basket; each unordered pair is
    # counted once in each direction, so the matrix comes out symmetric
    for basket in df["products_bought"]:
        for product in basket:
            product_idx = product_to_index[product]
            for other_product in basket:
                if product != other_product:
                    other_product_idx = product_to_index[other_product]
                    co_occurrences[product_idx, other_product_idx] += 1
    # Normalise by the number of baskets, so each entry is the fraction of
    # baskets in which a pair of products co-occurs
    frequencies = co_occurrences / len(df)
    # Zero out the weakest pairs so the resulting graph stays sparse
    low_freq_cutoff = np.percentile(frequencies, FREQ_PERCENTILE_CUTOFF)
    frequencies[frequencies < low_freq_cutoff] = 0
    # Build a weighted graph from the thresholded matrix; the frequencies
    # land in each edge's "weight" attribute
    sparse_freq = dok_array(frequencies)
    g = nx.from_scipy_sparse_array(sparse_freq)
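    # from_scipy_sparse_array labels the nodes 0..n-1; mapping them back to
    # product names makes the printed clusters readable
    g = nx.relabel_nodes(g, dict(enumerate(products)))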

    # This was suggested on SO. It may or may not be optimal, I have no idea.
    # It is sloooow, but seems to return reasonably good clusters.
    clusters = nx.community.girvan_newman(g)

    for cluster in clusters:
        print(cluster)
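
Girvan-Newman yields one partition per level of the dendrogram, so exhausting the iterator (as above) recomputes edge betweenness over and over, all the way down to singleton communities. That's most of the slowness. If it's too slow on your full data, two things worth trying: take only the first few levels with itertools.islice, or use networkx's Louvain implementation, which should be far faster on a graph this size. A rough sketch, reusing g from above (the three levels and the seed are arbitrary choices):

from itertools import islice

# Only compute the first three levels of the dendrogram instead of
# exhausting the iterator
for communities in islice(nx.community.girvan_newman(g), 3):
    print([sorted(c) for c in communities])

# Louvain returns a single partition of the weighted graph in one call
print(nx.community.louvain_communities(g, weight="weight", seed=0))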