Skip to content

Instantly share code, notes, and snippets.

@baarkerlounger
Created April 19, 2021 12:46
Show Gist options
  • Select an option

  • Save baarkerlounger/7c8b380fbf52288dcfc068ad1ed996c8 to your computer and use it in GitHub Desktop.

Select an option

Save baarkerlounger/7c8b380fbf52288dcfc068ad1ed996c8 to your computer and use it in GitHub Desktop.

Revisions

  1. baarkerlounger revised this gist Apr 19, 2021. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion TF-IDF Matching
    Original file line number Diff line number Diff line change
    @@ -1,6 +1,6 @@
    import pandas as pd
    import numpy
    from fuzzywuzzy import process, fuzz
    import tfidf_matcher as tm

    # Target column names that the input columns are matched against.
    # Each name stays on one physical line: a quoted string literal may not
    # contain a raw line break.
    mhclg_column_list = [
        "Tenancy start date",
        "Type of letting",
        "Who is the landlord",
        "Registration no",
        "LA CORE code",
        "Management group",
        "Scheme code",
        "Tenant code",
        "Starter/Introductory tenancy",
        "Type of tenancy",
        "Tenancy Duration",
        "Age of Person 1",
        "Age of Person 2",
        "Age of Person 3",
        "Age of Person 4",
        "Age of Person 5",
        "Age of Person 6",
        "Age of Person 7",
        "Age of Person 8",
        "Gender of Person 1",
        "Gender of Person 2",
        "Gender of Person 3",
        "Gender of Person 4",
        "Gender of Person 5",
        "Gender of Person 6",
        "Gender of Person 7",
        "Gender of Person 8",
        "Person 2 relationship to Person 1",
        "Person 3 relationship to Person 1",
        "Person 4 relationship to Person 1",
        "Person 5 relationship to Person 1",
        "Person 6 relationship to Person 1",
        "Person 7 relationship to Person 1",
        "Person 8 relationship to Person 1",
        "Economic Status of Person 1",
        "Economic Status of Person 2",
        "Economic Status of Person 3",
        "Economic Status of Person 4",
        "Economic Status of Person 5",
        "Economic Status of Person 6",
        "Economic Status of Person 7",
        "Economic Status of Person 8",
        "Ethnic group of person 1 as defined by applicant",
        "Nationality of person 1",
        "Household member has ever served in the UK Armed Forces",
        "Household member has been seriously injured or ill in the UK Armed Forces",
        "Does the household contain a pregnant person",
        "Which benefits does the tenant receive",
        "How much income comes from these benefits",
        "Tenant's net income",
        "Income refused",
        "Main reason the household left their last settled home",
        "Accessibility requirements",
        "Housing situation",
        "LA in which household lived prior to this letting",
        "Postcode of previous accommodation",
        "How long has the household lived in the LA",
        "How long has the household been on the waiting list",
        "Homeless status prior to this letting",
        "Reason for Housing Priority",
        "Was the letting made under CBL",
        "Was the letting made under CHR",
        "Was the letting made under CAP",
        "Source of referral for this letting",
        "Rent and other charges period",
        "Basic rent",
        "Service charge",
        "Personal Service Charge",
        "Support charge",
        "Care home charge",
        "Exempt from accommodation charges",
        "After benefits, what is the outstanding rent",
        "Void or newbuild/renewal date",
        "Major repairs completion date",
        "Supported scheme",
        "Number of offers since last tenancy",
        "Property Reference",
        "UPRN",
        "Number of bedrooms",
        "Type of unit",
        "Type of building",
        "Wheelchair accessible",
        "For relets, previous basis for rent",
        "Reason for vacancy",
        "ONS LA code",
        "Postcode of property",
    ]

  2. baarkerlounger created this gist Apr 19, 2021.
    26 changes: 26 additions & 0 deletions TF-IDF Matching
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,26 @@
    import pandas as pd
    import numpy
    from fuzzywuzzy import process, fuzz
    import tfidf_matcher as tm

    # Target column names that the input columns are matched against.
    # Each name stays on one physical line: a quoted string literal may not
    # contain a raw line break.
    mhclg_column_list = [
        "Tenancy start date",
        "Type of letting",
        "Who is the landlord",
        "Registration no",
        "LA CORE code",
        "Management group",
        "Scheme code",
        "Tenant code",
        "Starter/Introductory tenancy",
        "Type of tenancy",
        "Tenancy Duration",
        "Age of Person 1",
        "Age of Person 2",
        "Age of Person 3",
        "Age of Person 4",
        "Age of Person 5",
        "Age of Person 6",
        "Age of Person 7",
        "Age of Person 8",
        "Gender of Person 1",
        "Gender of Person 2",
        "Gender of Person 3",
        "Gender of Person 4",
        "Gender of Person 5",
        "Gender of Person 6",
        "Gender of Person 7",
        "Gender of Person 8",
        "Person 2 relationship to Person 1",
        "Person 3 relationship to Person 1",
        "Person 4 relationship to Person 1",
        "Person 5 relationship to Person 1",
        "Person 6 relationship to Person 1",
        "Person 7 relationship to Person 1",
        "Person 8 relationship to Person 1",
        "Economic Status of Person 1",
        "Economic Status of Person 2",
        "Economic Status of Person 3",
        "Economic Status of Person 4",
        "Economic Status of Person 5",
        "Economic Status of Person 6",
        "Economic Status of Person 7",
        "Economic Status of Person 8",
        "Ethnic group of person 1 as defined by applicant",
        "Nationality of person 1",
        "Household member has ever served in the UK Armed Forces",
        "Household member has been seriously injured or ill in the UK Armed Forces",
        "Does the household contain a pregnant person",
        "Which benefits does the tenant receive",
        "How much income comes from these benefits",
        "Tenant's net income",
        "Income refused",
        "Main reason the household left their last settled home",
        "Accessibility requirements",
        "Housing situation",
        "LA in which household lived prior to this letting",
        "Postcode of previous accommodation",
        "How long has the household lived in the LA",
        "How long has the household been on the waiting list",
        "Homeless status prior to this letting",
        "Reason for Housing Priority",
        "Was the letting made under CBL",
        "Was the letting made under CHR",
        "Was the letting made under CAP",
        "Source of referral for this letting",
        "Rent and other charges period",
        "Basic rent",
        "Service charge",
        "Personal Service Charge",
        "Support charge",
        "Care home charge",
        "Exempt from accommodation charges",
        "After benefits, what is the outstanding rent",
        "Void or newbuild/renewal date",
        "Major repairs completion date",
        "Supported scheme",
        "Number of offers since last tenancy",
        "Property Reference",
        "UPRN",
        "Number of bedrooms",
        "Type of unit",
        "Type of building",
        "Wheelchair accessible",
        "For relets, previous basis for rent",
        "Reason for vacancy",
        "ONS LA code",
        "Postcode of property",
    ]

    # Column names as they appear in the incoming data, one per line.
    # The order matters: the script uses this list positionally.
    input_columns_list = [
        "Start date",
        " ",  # single-space entry — presumably a placeholder; TODO confirm
        "Landlord Name",
        "Reg no",
        "Local Authority Code",
        "Mgmnt group",
        "Our Scheme code column",
        "Tenant Cde",
        "Is this a starter or introductory tenancy?",
        "Tenancy type",
        "Duration of tenancy",
        "Person 1 age",
        "Person 2 age",
        "Person 3 age",
        "Person 4 age",
        "Person 5 age",
        "Person 6 age",
        "Person 7 age",
        "Person 8 age",
        "Person 1 gender",
        "Person 2 gender",
        "Person 3 gender",
        "Person 4 gender",
        "Person 5 gender",
        "Person 6 gender",
        "Person 7 gender",
        "Person 8 gender",
        "Person 2 relationship to Person 1",
        "Person 3 relationship to Person 1",
        "Person 4 relationship to Person 1",
        "Person 5 relationship to Person 1",
        "Person 6 relationship to Person 1",
        "Person 7 relationship to Person 1",
        "Person 8 relationship to Person 1",
        "Person 1 Economic status",
        "Person 2 Economic status",
        "Person 3 Economic status",
        "Person 4 Economic status",
        "Person 5 Economic status",
        "Person 6 Economic status",
        "Person 7 Economic status",
        "Person 8 Economic status",
        "Person 1 ethnic group",
        "Person 1 nationality",
        "Armed forces status",
        "Armed forces injury status",
        "Pregnancy status",
        "Tenant benefits",
        "Benefit income",
        "Net income",
        "Income refused",
        "Reason for move",
        "Accessibility",
        "Prior Housing status",
    ]

    # Match every MHCLG column name against the input column names with the
    # TF-IDF matcher, keeping only the single best candidate per name
    # (k_matches=1) and comparing character n-grams of length 2.
    df = tm.matcher(mhclg_column_list, input_columns_list, k_matches=1, ngram_length=2)

    # The expected answer for row i is input_columns_list[i]. Assigning a
    # Series aligns on the index, so rows beyond the end of the shorter input
    # list get NaN and can never count as correct.
    # NOTE(review): this assumes the matcher returns one row per MHCLG column,
    # in order, with a default 0..n-1 index — confirm against tfidf_matcher.
    df['correct_match'] = pd.Series(input_columns_list)
    df['result'] = df['Lookup 1'] == df['correct_match']

    # Fraction of rows whose best match equals the expected column. The mean
    # of the boolean column is (number of True) / (number of rows) and, unlike
    # indexing a grouped count with [True], does not raise KeyError when no
    # row matched.
    accuracy = df['result'].mean()

    print('Matched Data: ')
    print(df)
    print('\n')
    print('\n')

    print(f'TFIDF Accuracy: {accuracy}')