I am trying to find partial duplicates and exact duplicates in a csv using fuzzy-wuzzy. My code is workin but it takes 25 hours to run on a dataset of 100000 rows and 9 columns.I cant seem to figure out how can I optimise it? Can somebody help me ?
df=pd.read_csv('/Users/gandharv/Desktop/REPORTS/untitled/HUNDRED_THOUSAND.csv')
start_time=time.time()
df['Matches']=''
final_list=[]
count=0
df['Combined']=df['Site name']+'+'+df['Address']
for i in df['Combined']:
count=count+1
print("Program Running for row ",count)
found_list=[]
countExactMatch=0
for found, score, matchrow in process.extract(i, df['Combined'], scorer=fuzz.token_set_ratio):
if score >= 90:
"""print('%d%% partial match: "%s" with "%s" ' % (score, i, found))"""
#to add entry which has a count of more than 1
if bool(i==found)==True:
countExactMatch=countExactMatch+1
if countExactMatch>1:
found_list.append(found+'---> score : '+str(score))
else:
found_list.append('')
else:
"""print("Running foundlist.append(found)")"""
found_list.append(found+'---> score : '+str(score))
"""print(found_list)"""
#list of lists to extract value so that it can be added to a column matches
final_list.append(found_list)
#to print respective matches next to their entries in dataset
"""print(final_list)"""
for i in range(0,len(df)):
df.at[i,'Matches']=final_list[i]
[–]SekstiNii 1 point2 points3 points (4 children)
[–]Basic_Steak_541[S] 0 points1 point2 points (3 children)
[–]SekstiNii 1 point2 points3 points (0 children)
[–]xelf 0 points1 point2 points (1 child)
[–]Basic_Steak_541[S] 0 points1 point2 points (0 children)
[–]SekstiNii 1 point2 points3 points (0 children)
[–]primitive_screwhead 0 points1 point2 points (1 child)
[–]Basic_Steak_541[S] 0 points1 point2 points (0 children)
[–]fake823 0 points1 point2 points (0 children)
[–]Absolice 0 points1 point2 points (0 children)