Code
https://github.com/XiaoLing941212/random_projecttion.git
import random
import numpy as np
import math
import os
import pandas as pd
import time
class random_projection:
    """Recursively bisect the rows of a table and give each row a cluster label.

    The rows are split in half along the line joining two far-apart rows
    (see ``split``), and each half is split again until a group holds fewer
    than ``stop_depth`` rows. Every remaining group becomes one cluster.

    Attributes:
        data: ``data.to_numpy()``, one row per sample.
        stop_depth: groups with fewer rows than this are not split further.
        res: list of clusters; each cluster is a sequence of rows.
        NumOfCluster: ``len(res)``.
        labels: float array with one entry per input row holding the index
            (into ``res``) of the cluster the row was matched to.

    NOTE(review): distances ignore the last column of every row (see
    ``cal_distance``) -- confirm that the last column is meant to be excluded
    for the tables passed in.
    """

    def __init__(self, data, stop_depth):
        """Cluster ``data`` immediately and compute the per-row labels.

        Args:
            data: a pandas DataFrame (must provide ``to_numpy()`` and
                ``shape``).
            stop_depth: minimum group size that is still split.
        """
        self.data = data.to_numpy()
        self.stop_depth = stop_depth
        self.res = self.cluster(self.data, self.stop_depth, [])
        self.NumOfCluster = self.setNumOfCluster()
        # Labels start at 1.0; setLabels overwrites every row it can match.
        self.labels = np.ones(data.shape[0])
        self.setLabels()

    def setLabels(self):
        """Write the cluster index of every clustered row into ``self.labels``.

        Rows are located in ``self.data`` by value, so rows with identical
        values all receive the index of the last cluster that contains one of
        them. A row that never compares equal to itself (for example one
        holding NaN) keeps its initial label of 1.0.
        """
        for cluster_id, members in enumerate(self.res):
            for member in members:
                matched = self.getMatchedRowsIndices(member, self.data)
                self.labels[matched] = cluster_id

    def getLabels(self):
        """Return the per-row label array."""
        return self.labels

    def getNumOfCluster(self):
        """Return the number of clusters that were produced."""
        return self.NumOfCluster

    def setNumOfCluster(self):
        """Return the number of clusters currently stored in ``self.res``."""
        return len(self.res)

    @staticmethod
    def getMatchedRowsIndices(target_row, array):
        """Return the indices of the rows of ``array`` equal to ``target_row``.

        Args:
            target_row: 1-D array.
            array: 2-D array whose rows are compared element-wise.

        Returns:
            1-D integer array of matching row indices. If ``target_row`` does
            not have as many elements as a row of ``array``, a message is
            printed and an empty index array is returned, so callers that use
            the result for indexing simply touch nothing.
        """
        if target_row.shape[0] != array.shape[1]:
            print("The target row does not have the same number of elements as the array rows.")
            return np.empty(0, dtype=np.intp)
        # A row matches only if every element is equal.
        matches = np.all(array == target_row, axis=1)
        return np.where(matches)[0]

    def cluster(self, candidates, enough, res):
        """Recursively halve ``candidates`` and collect the leaf groups.

        Args:
            candidates: sequence of rows to cluster.
            enough: a group with fewer than ``enough`` rows becomes a cluster.
            res: list the leaf groups are appended to.

        Returns:
            ``res`` with the leaf groups of ``candidates`` appended.
        """
        # A group of at most one row cannot be divided into two non-empty
        # halves. Without this guard, enough <= 1 would recurse forever on a
        # single row (split would hand back an empty half plus the same row).
        if len(candidates) < enough or len(candidates) <= 1:
            res.append(candidates)
            return res
        _, _, east_items, west_items = self.split(candidates)
        res = self.cluster(east_items, enough, res)
        res = self.cluster(west_items, enough, res)
        return res

    def split(self, candidates):
        """Divide ``candidates`` into two halves along a line between two pivots.

        A random row is drawn, ``east_pivot`` is the row farthest from it and
        ``west_pivot`` is the row farthest from ``east_pivot``. Every row is
        then ordered by its position along the west-to-east line.

        Returns:
            ``(east_pivot, west_pivot, east_items, west_items)``. The halves
            are slices of ``candidates`` when the two pivots coincide, and
            lists of rows otherwise.
        """
        pivot = random.choice(candidates)
        east_pivot = self.find_farest(pivot, candidates)
        west_pivot = self.find_farest(east_pivot, candidates)
        c = self.cal_distance(east_pivot, west_pivot)
        half = len(candidates) // 2
        if c == 0:
            # All rows are at distance 0 from each other, so there is no
            # direction to project on; just cut the sequence in the middle.
            return east_pivot, west_pivot, candidates[:half], candidates[half:]

        projected = []
        for candidate in candidates:
            a = self.cal_distance(candidate, west_pivot)
            b = self.cal_distance(candidate, east_pivot)
            # Law of cosines: offset of the candidate along the pivot line,
            # measured from west_pivot.
            d = (a ** 2 + c ** 2 - b ** 2) / (2 * c)
            projected.append((d, candidate))
        # Sort on the offset only; the rows themselves are arrays and cannot
        # be used as tie-breakers.
        projected.sort(key=lambda pair: pair[0])
        ordered = [row for _, row in projected]
        # NOTE: the first half holds the rows nearest west_pivot but is
        # returned in the east_items position; kept as is so that the order
        # of the clusters (and therefore the label numbers) does not change.
        return east_pivot, west_pivot, ordered[:half], ordered[half:]

    @staticmethod
    def find_farest(pivot, candidates):
        """Return the row of ``candidates`` farthest from ``pivot``.

        The first row reaching the largest distance wins; if no row is at a
        distance greater than 0, ``pivot`` itself is returned.
        """
        max_d = 0
        most_point = pivot
        for candidate in candidates:
            cur_d = random_projection.cal_distance(pivot, candidate)
            if cur_d > max_d:
                max_d = cur_d
                most_point = candidate
        return most_point

    @staticmethod
    def cal_distance(p1, p2):
        """Return the Euclidean distance between ``p1`` and ``p2``.

        The last element of each point is left out of the computation.
        """
        return math.sqrt(sum((v1 - v2) ** 2 for v1, v2 in zip(p1[:-1], p2[:-1])))
def add_labels_to_df(df, labels_data, labels_name):
    """Add ``labels_data`` to ``df`` in place as an integer column.

    Args:
        df: DataFrame to modify.
        labels_data: one label per row of ``df``.
        labels_name: name of the column to create (or overwrite).

    Returns:
        None. If the number of labels differs from the number of rows, a
        message is printed and ``df`` is left untouched.
    """
    if len(labels_data) != len(df):
        print("The length of the ndarray does not match the number of DataFrame rows.")
        # Nothing was added, so there is no column to convert; stop here
        # instead of failing on a missing column.
        return
    # Adding the new column
    df[labels_name] = labels_data
    # Convert the new column to int
    df[labels_name] = df[labels_name].astype(int)
Example of how to use it:
# Example input: one week of daily sales figures for three departments.
data = dict(
    Electronics=[2350, 3200, 1530, 2050, 2980, 4200, 3700],
    Clothing=[1500, 1700, 900, 1200, 2100, 2600, 1800],
    Groceries=[5400, 6200, 5900, 5100, 6200, 6800, 7100],
)

# Build the table, cluster its rows, then store the labels next to the data.
df = pd.DataFrame(data)
test = random_projection(df, 7)
df_labels = test.getLabels()
add_labels_to_df(df, df_labels, 'df_labels')
Reference List
- Chen, J., Nair, V., Krishna, R., & Menzies, T. (2018). “Sampling” as a baseline optimizer for search-based software engineering. IEEE Transactions on Software Engineering, 45(6), 597-614.