Code

https://github.com/XiaoLing941212/random_projecttion.git

    import random
    import numpy as np
    import math
    import os
    import pandas as pd
    import time
class random_projection:
    """Cluster the rows of a DataFrame by recursive random-projection bisection.

    The rows are repeatedly divided in two: two far-apart "pivot" rows are
    picked, every row is projected onto the line joining them, and the rows
    are cut at the median of that projection.  Division stops once a group
    holds fewer than ``stop_depth`` rows.

    Distances ignore the last element of every row (see ``cal_distance``).

    Attributes:
        data: the input rows as a NumPy array (``data.to_numpy()``).
        stop_depth: groups smaller than this are not divided further.
        res: list of clusters; each cluster is a sequence of rows.
        NumOfCluster: number of clusters in ``res``.
        labels: float array with one cluster index per input row.
    """

    def __init__(self, data, stop_depth):
        """Cluster ``data`` and compute one label per row.

        Args:
            data: pandas DataFrame whose rows are the points to cluster.
            stop_depth: a group with fewer rows than this becomes a cluster.
        """
        self.data = data.to_numpy()
        self.stop_depth = stop_depth
        self.res = self.cluster(self.data, self.stop_depth, [])
        self.NumOfCluster = self.setNumOfCluster()
        # Rows that setLabels cannot match (e.g. rows containing NaN, which
        # never compares equal to itself) keep this initial value of 1.
        self.labels = np.ones(data.shape[0])
        self.setLabels()

    def setLabels(self):
        """Write the index of each row's cluster into ``self.labels``.

        Cluster members are located in ``self.data`` by value.  Each member
        claims exactly one not-yet-claimed matching row, so identical rows
        that ended up in different clusters receive different labels and
        the label counts agree with the cluster sizes.
        """
        claimed = np.zeros(len(self.labels), dtype=bool)
        for label, members in enumerate(self.res):
            for member in members:
                hits = self.getMatchedRowsIndices(member, self.data)
                free = hits[~claimed[hits]]
                if free.size:
                    self.labels[free[0]] = label
                    claimed[free[0]] = True

    def getLabels(self):
        """Return the per-row cluster labels."""
        return self.labels

    def getNumOfCluster(self):
        """Return the number of clusters found."""
        return self.NumOfCluster

    def setNumOfCluster(self):
        """Return the number of clusters currently stored in ``self.res``."""
        return len(self.res)

    @staticmethod
    def getMatchedRowsIndices(target_row, array):
        """Return the indices of the rows of ``array`` equal to ``target_row``.

        Args:
            target_row: 1-D array holding one row.
            array: 2-D array to search.

        Returns:
            1-D integer array of matching row indices.  When ``target_row``
            does not have as many elements as the rows of ``array`` a message
            is printed and an empty index array is returned; an implicit
            ``None`` here would make ``labels[None] = m`` overwrite every
            label.
        """
        if target_row.shape[0] != array.shape[1]:
            print("The target row does not have the same number of elements as the array rows.")
            return np.empty(0, dtype=np.intp)

        # A row matches only when every element compares equal.
        matches = np.all(array == target_row, axis=1)
        return np.where(matches)[0]

    def cluster(self, candidates, enough, res):
        """Recursively bisect ``candidates`` and collect the leaves in ``res``.

        Args:
            candidates: sequence of rows (2-D array or list of 1-D arrays).
            enough: a group with fewer rows than this is not divided.
            res: list the finished clusters are appended to (mutated).

        Returns:
            ``res``, with the clusters found below ``candidates`` appended.
        """
        # A group of fewer than two rows cannot be divided: splitting one row
        # yields an empty half plus the same row again, which used to recurse
        # forever whenever ``enough`` was 1 or less.
        if len(candidates) < enough or len(candidates) < 2:
            res.append(candidates)
            return res

        _, _, east_items, west_items = self.split(candidates)
        res = self.cluster(east_items, enough, res)
        res = self.cluster(west_items, enough, res)
        return res

    def split(self, candidates):
        """Divide ``candidates`` into two halves along a pivot-to-pivot line.

        A random row is drawn, ``east_pivot`` is the row farthest from it and
        ``west_pivot`` the row farthest from ``east_pivot``.  Rows are ordered
        by their projection onto the line between the pivots (measured from
        ``west_pivot``) and cut in the middle.

        Returns:
            ``(east_pivot, west_pivot, east_items, west_items)``.
            ``east_items`` is the first half (smaller projection values) and
            ``west_items`` the second; for an odd count ``west_items`` gets
            the extra row.

        Raises:
            IndexError: if ``candidates`` is empty.
        """
        if len(candidates) == 0:
            raise IndexError("Cannot choose from an empty sequence")

        # Index explicitly instead of calling random.choice: some CPython
        # releases evaluate the truth value of the sequence inside choice(),
        # which is ambiguous for a 2-D NumPy array.  randrange draws the same
        # random index choice() would.
        pivot = candidates[random.randrange(len(candidates))]
        east_pivot = self.find_farest(pivot, candidates)
        west_pivot = self.find_farest(east_pivot, candidates)
        c = self.cal_distance(east_pivot, west_pivot)
        half = len(candidates) // 2

        if c == 0:
            # All rows coincide (in the compared columns): there is no
            # direction to project on, so cut in the given order.
            return east_pivot, west_pivot, candidates[:half], candidates[half:]

        def projection(candidate):
            # Law of cosines: distance of the candidate's foot point from
            # west_pivot along the west-east line.
            a = self.cal_distance(candidate, west_pivot)
            b = self.cal_distance(candidate, east_pivot)
            return (a ** 2 + c ** 2 - b ** 2) / (2 * c)

        # sorted() is stable, so ties keep their original relative order.
        ordered = sorted(candidates, key=projection)
        return east_pivot, west_pivot, ordered[:half], ordered[half:]

    @staticmethod
    def find_farest(pivot, candidates):
        """Return the row of ``candidates`` farthest from ``pivot``.

        The first row with the largest distance wins; ``pivot`` itself is
        returned when no row lies at a distance greater than zero.
        """
        max_d = 0
        most_point = pivot

        for candidate in candidates:
            cur_d = random_projection.cal_distance(pivot, candidate)
            if cur_d > max_d:
                max_d = cur_d
                most_point = candidate

        return most_point

    @staticmethod
    def cal_distance(p1, p2):
        """Return the Euclidean distance between ``p1`` and ``p2``.

        The last element of each point is excluded from the distance.
        NOTE(review): presumably the last column is meant to hold a
        class/objective value rather than a feature -- confirm with callers.
        """
        return math.sqrt(sum((v1 - v2) ** 2 for v1, v2 in zip(p1[:-1], p2[:-1])))
 
def add_labels_to_df(df, labels_data, labels_name):
    """Add ``labels_data`` to ``df`` as an integer column named ``labels_name``.

    Args:
        df: pandas DataFrame to modify in place.
        labels_data: one label per row of ``df`` (e.g. a NumPy array).
        labels_name: name of the column to create or overwrite.

    When the number of labels differs from the number of rows a message is
    printed and ``df`` is left untouched.
    """
    if len(labels_data) != len(df):
        print("The length of the ndarray does not match the number of DataFrame rows.")
        # Stop here: casting below would raise KeyError for a column that was
        # never created, or silently recast an unrelated existing column.
        return

    # Add the new column, then store the labels as integers.
    df[labels_name] = labels_data
    df[labels_name] = df[labels_name].astype(int)

Example of how to use it:

  # Let's assume we have data for sales in a week across three departments
data = dict(
    Electronics=[2350, 3200, 1530, 2050, 2980, 4200, 3700],
    Clothing=[1500, 1700, 900, 1200, 2100, 2600, 1800],
    Groceries=[5400, 6200, 5900, 5100, 6200, 6800, 7100],
)

# Turn the weekly figures into a DataFrame: one row per day, one column per
# department.
df = pd.DataFrame.from_dict(data)

# With 7 rows and stop_depth=7 the rows are divided once, giving two clusters.
# The last column ('Groceries') is not used when distances are computed.
test = random_projection(data=df, stop_depth=7)

# Fetch the per-row cluster labels and attach them to the frame as integers.
df_labels = test.getLabels()
add_labels_to_df(df=df, labels_data=df_labels, labels_name='df_labels')

Reference List

  1. Chen, J., Nair, V., Krishna, R., & Menzies, T. (2018). “Sampling” as a baseline optimizer for search-based software engineering. IEEE Transactions on Software Engineering, 45(6), 597-614.