Skip to content

Instantly share code, notes, and snippets.

@vthily
Last active June 9, 2022 03:13
Show Gist options
  • Save vthily/e396dba8dfa0a1ab9cd1e9e66a5d85c5 to your computer and use it in GitHub Desktop.
Save vthily/e396dba8dfa0a1ab9cd1e9e66a5d85c5 to your computer and use it in GitHub Desktop.
Group adjacent single-character tokens, in order, into meaningful acronyms
#!/usr/bin/python3
import functools
def normalizeAcronyms(inputStrInWords):
    """Join word tokens into a sentence, merging adjacent letters into acronyms.

    Runs of consecutive one-character tokens are upper-cased and concatenated
    (``['c', 'd', 'a']`` -> ``'CDA'``). Tokens containing underscores have the
    underscores removed and are upper-cased (``'c_d_a'`` -> ``'CDA'``); they
    are never merged with their neighbours. The resulting words are joined by
    single spaces and the first character of the sentence is upper-cased.

    Args:
        inputStrInWords: Sequence of word strings. It is not modified.

    Returns:
        The normalized sentence, or ``''`` when there is nothing to join.
    """
    if not inputStrInWords:
        return ''

    words = []
    # True while the last entry of ``words`` is an acronym that is still being
    # extended by consecutive single-letter tokens.
    extending_acronym = False
    for token in inputStrInWords:
        if '_' in token:
            # Spelled-out acronym such as 'c_d_a': strip the separators.
            token = token.replace('_', '').upper()
            is_letter = False
        else:
            is_letter = len(token) == 1

        if not token:
            # Nothing left to emit; an empty token still ends a letter run.
            extending_acronym = False
            continue

        if is_letter:
            if extending_acronym:
                words[-1] += token.upper()
            else:
                words.append(token.upper())
            extending_acronym = True
        else:
            words.append(token)
            extending_acronym = False

    sentence = ' '.join(words)
    # Slicing (rather than indexing) keeps this safe if every token was empty.
    return sentence[:1].upper() + sentence[1:]
if __name__ == '__main__':
    # Small demo: the single letters should collapse into "CDA" and "BB".
    inputStrInWords = ['c', 'd', 'a', 'trustee', 'and', 'b', 'b']
    # Pass the list defined above (the original referenced an undefined
    # name) and print the result so the demo produces visible output.
    print(normalizeAcronyms(inputStrInWords))
@vthily
Copy link
Author

vthily commented Jun 9, 2022

Version 2:

#!/usr/bin/python3

# helper function to find the (start, end) indices of the first occurrence of the subset in the superset
def find_sub_idx(superset_lst, subset_lst, start = 0):
    """Locate the first occurrence of ``subset_lst`` inside ``superset_lst``.

    Scanning begins at index ``start``. Returns a ``(begin, end)`` pair such
    that ``superset_lst[begin:end] == subset_lst``, or ``None`` when no
    position from ``start`` onwards matches.
    """
    width = len(subset_lst)
    # Lazily enumerate every window that equals the subset; only the first
    # one is ever consumed.
    matches = (
        (pos, pos + width)
        for pos in range(start, len(superset_lst))
        if superset_lst[pos : pos + width] == subset_lst
    )
    return next(matches, None)


# helper function to replace the subset with the new set
def replace_sub(superset_lst, subset_lst, new_list):
    """Replace every occurrence of ``subset_lst`` in ``superset_lst`` in place.

    Each matching slice is overwritten with ``new_list``. The search then
    resumes just past the inserted items, so replacement content is never
    re-scanned. ``superset_lst`` is mutated; nothing is returned.
    """
    inserted = len(new_list)
    cursor = 0
    while True:
        found = find_sub_idx(superset_lst, subset_lst, cursor)
        if found is None:
            break
        begin, finish = found
        superset_lst[begin : finish] = new_list
        # Skip over what was just written before searching again.
        cursor = begin + inserted

# helper function to do inverse normalization for acronyms
def normalizeAcronyms(inputStrInWords):
    """Collapse spelled-out acronyms in a token list and return the sentence.

    Each known acronym (a run of single-letter tokens such as
    ``['c', 'd', 'a']``) is replaced by one upper-case token (``'CDA'``).
    Acronyms are applied in the order listed below, so an earlier entry
    consumes its letters before a later, overlapping entry can match them.
    Underscores are then removed from every token and the tokens are joined
    with single spaces.

    Args:
        inputStrInWords: Iterable of word strings. It is copied, so the
            caller's list is left untouched.

    Returns:
        The joined sentence, or ``''`` for empty input.
    """
    acronyms = [
            ['k','i','f','a', 's'], # KIFAS
            # 4 characters acronyms
            ['p','s','e','a'], ['c','a','r','g'], ['m','m','p','p'], # PSEA, CARG, MMPP
            ['o','c','d','c'], ['p','o','s','b'],  # OCDC, POSB
            # 3 characters acronyms
            ['m','s','f'], ['c','d','a'], ['h','s','r'],  # MSF, CDA, HSR,
            ['l','p','a'], ['o','p','g'], ['s','d','n'],  # LPA, OPG, SDN
            # 2 characters/subwords acronyms
            ['h','r'],  ['p','a'], ['i','c'],  # HR, PA, IC
            ]

    if not inputStrInWords:
        return ''

    # replace_sub edits its first argument in place; operate on a private
    # copy so the caller's list is not silently rewritten.
    words = list(inputStrInWords)
    for acr in acronyms:
        replace_sub(words, acr, [''.join(acr).upper()])

    return " ".join(word.replace('_', '') for word in words)


if __name__ == '__main__':
    # Demo input mixing plain words, an underscore-joined token and several
    # spelled-out acronyms (OCDC, CARG) from the table in normalizeAcronyms.
    inputStrInWords = ['i', 'i', 'want', 'to', 'talk', 'about', 'a_b_c', 'o', 'c', 'd', 'c', 'd', 'a', 'trustee', 'i', 'c', 'a', 'r', 'g']
    inputSentence = normalizeAcronyms(inputStrInWords)
    print(inputSentence)
  

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment