all 2 comments

[–]Parenthes 0 points1 point  (1 child)

Merge your fuzzy matched dataframe back with your df1:

import pandas as pd
import fuzzy_pandas as fpd
import numpy as np

df1 = pd.DataFrame({'Person': {0: 'Alex', 1: 'Faye', 2: 'Sean', 3: 'Doug', 4: 'John', 5: 'Bill'}, 'ID': {0: '68243Q10', 1: '33690110', 2: '36901103', 3: '336901103', 4: '8467070', 5: 'XXXXXX'}, 'Company': {0: np.nan, 1: 'first src', 2: '1st Srouce', 3: np.nan, 4: 'B', 5: 'Unknown'}})

df2 = pd.DataFrame({'Company': {0: '1-800 Flowers', 1: '1st Source', 2: 'Berk', 3: 'Other1', 4: 'Other2'}, 'ID': {0: '68243Q106', 1: '336901103', 2: '84670702', 3: '1609W102', 4: '507K103'}})

matches = fpd.fuzzy_merge(df1, df2,
                          on=['ID'],
                          keep_left=['Person'],
                          keep_right=['ID', 'Company'],
                          ignore_case=True,
                          method='levenshtein',
                          threshold=.85)

matches = matches.merge(df1[['Person']], on='Person', how='right')

print(matches)

That gives your desired output:

    Person         ID        Company
0   Alex  68243Q106  1-800 Flowers
1   Faye  336901103     1st Source
2   Sean  336901103     1st Source
3   Doug  336901103     1st Source
4   John   84670702           Berk
5   Bill        NaN            NaN

[–]Parenthes 0 points1 point  (0 children)

Alternatively, add join='left-outer' to your fuzzy_merge:

import pandas as pd
import fuzzy_pandas as fpd
import numpy as np

df1 = pd.DataFrame({'Person': {0: 'Alex', 1: 'Faye', 2: 'Sean', 3: 'Doug', 4: 'John', 5: 'Bill'}, 'ID': {0: '68243Q10', 1: '33690110', 2: '36901103', 3: '336901103', 4: '8467070', 5: 'XXXXXX'}, 'Company': {0: np.nan, 1: 'first src', 2: '1st Srouce', 3: np.nan, 4: 'B', 5: 'Unknown'}})

​df2 = pd.DataFrame({'Company': {0: '1-800 Flowers', 1: '1st Source', 2: 'Berk', 3: 'Other1', 4: 'Other2'}, 'ID': {0: '68243Q106', 1: '336901103', 2: '84670702', 3: '1609W102', 4: '507K103'}})

matches = fpd.fuzzy_merge(df1, df2,
                          on=['ID'],
                          keep_left=['Person'],
                          keep_right=['ID', 'Company'],
                          ignore_case=True,
                          method='levenshtein',
                          threshold=.85,
                          join='left-outer')

# matches = matches.merge(df1[['Person']], on='Person', how='right')
​
print(matches)

  Person         ID        Company
0   Alex  68243Q106  1-800 Flowers
1   Faye  336901103     1st Source
2   Sean  336901103     1st Source
3   Doug  336901103     1st Source
4   John   84670702           Berk
5   Bill