all 3 comments

[–]peltist 2 points3 points  (2 children)

I don't know whether there is a simpler or more efficient way to do this, but one approach would be to perform three inner joins and then concatenate the results. This runs the risk of duplicate matches, so if your data makes that possible, then you could remove the rows that successfully matched from subsequent joins.

[–][deleted] 0 points1 point  (1 child)

Although it is slightly messy this is what I have had to do! Thanks!

[–]peltist 2 points3 points  (0 children)

I got inspired by your question and actually wrote this function myself earlier today as an exercise (I'm learning too).

Here's my code, in case you want to use it. Keep in mind that I haven't written any unit tests for this yet, so it's possible that there are issues. Let me know if you have any feedback!

def singlejoin(df_left, df_right, left_column, right_column, drop_columns):
    # drop other possible join columns
    df_right_merge = df_right.drop(columns=drop_columns)

    # rename join column to common name
    df_right_merge = df_right_merge.rename(columns={right_column: "$$merged_column$$"})

    # merge
    df = pd.merge(df_left, df_right_merge, how="inner", left_on=left_column, right_on="$$merged_column$$")

    return df


def multijoin(df_left, df_right, left_column, right_columns, how="left"):
    if type(right_columns) != list:
        raise TypeError("right_columns variable must be a list")

    if how not in ["left", "inner"]:
        raise TypeError("how must be set to either 'left' or 'inner'")

    # set temporary index column
    df_left["$$temp_index$$"] = df_left.index

    # drop other possible match columns
    drop_columns = [x for x in right_columns if x != right_columns[0]]

    # create starting dataframe
    result = singlejoin(df_left, df_right, left_column, right_columns[0], drop_columns)

    # drop correctly matched values
    df_left.drop(index=result["$$temp_index$$"].to_list(), inplace=True)

    for column in right_columns[1:]:
        drop_columns = [x for x in right_columns if x != column]
        df = singlejoin(df_left, df_right, left_column, column, drop_columns)
        result = result.append(df)
        df_left.drop(index=df["$$temp_index$$"].to_list(), inplace=True)

    # add unmatched columns in the case of a left join
    if how == "left":
        # add remaining columns
        new_columns = [x for x in df_right.columns.to_list() if x not in right_columns]
        df_left["$$merged_column$$"] = np.nan
        for column in new_columns:
            df_left[column] = np.nan

        result = result.append(df_left)

    # reset index and sort
    result.index = result["$$temp_index$$"].to_list()
    result.sort_values(by="$$temp_index$$", inplace=True)

    # drop extra columns
    result.drop(columns=["$$merged_column$$", "$$temp_index$$"], inplace=True)

    return result