How to split a string on multiple multi‑character delimiters (and keep them) in Python

4 Answers

0 votes
def split_keep_multi_delims(s, delims):
    """Split *s* into alternating text pieces and delimiter runs.

    Only the first character of each non-empty entry in *delims* is used.
    A run of one repeated delimiter character (e.g. ``"---"``) becomes a
    single piece; two different delimiter characters next to each other
    become separate pieces. Text between delimiters is kept as-is.

    Returns a list of substrings whose concatenation equals *s*; an empty
    *s* yields an empty list.
    """
    markers = {d[0] for d in delims if d}

    pieces = []
    run_start = 0
    run_key = None
    for pos, ch in enumerate(s):
        # Delimiter characters are keyed by themselves so that different
        # delimiters never merge; all ordinary text shares the key None.
        key = ch if ch in markers else None
        if pos and key != run_key:
            pieces.append(s[run_start:pos])
            run_start = pos
        run_key = key

    # Flush the run that was still open when the input ended.
    if s:
        pieces.append(s[run_start:])

    return pieces


# Demo input: text separated by runs of the characters '=', '-' and '+'.
s = "aa==bbb---cccc++++ddddd"
delims = ["=", "-", "+"]

parts = split_keep_multi_delims(s, delims)

# Print every piece in square brackets on a single line; each piece is
# followed by a space instead of a newline.
for p in parts:
    print(f"[{p}]", end=" ")

 
 
'''
run:
 
[aa] [==] [bbb] [---] [cccc] [++++] [ddddd] 
 
'''

 



answered Mar 10 by avibootz
0 votes
import re

def split_and_keep_multi_delimiters(s, delimiters):
    """Split *s* on any of *delimiters* and keep the delimiters in the result.

    Args:
        s: The string to split.
        delimiters: Iterable of literal (not regex) delimiter strings; each
            may be more than one character long. Empty strings are ignored.

    Returns:
        A list of the non-empty pieces of *s*, with every matched delimiter
        included as its own item. If there is no usable delimiter the whole
        string is returned as a single piece (or ``[]`` when *s* is empty).
    """
    # An empty delimiter (or no delimiters at all) would produce a pattern
    # that matches the empty string, which makes re.split() cut the text
    # between every single character.
    # Longest delimiters go first because regex alternation takes the first
    # alternative that matches: otherwise "=" would shadow "==".
    usable = sorted((d for d in delimiters if d), key=len, reverse=True)
    if not usable:
        return [s] if s else []

    # A single capture group around the whole alternation makes re.split()
    # keep the delimiters without emitting None for non-matching groups.
    pattern = "(" + "|".join(re.escape(d) for d in usable) + ")"

    # Adjacent delimiters or a delimiter at either end of the string still
    # produce empty strings; drop them.
    return [item for item in re.split(pattern, s) if item]


# Demo input: text separated by three different multi-character delimiters.
s = "aa==bbb---cccc++++ddddd"
delimiters = ["==", "---", "++++"]

split_list = split_and_keep_multi_delimiters(s, delimiters)

# Show the resulting list of text pieces and delimiters.
print(split_list)

 
 
'''
run:
 
['aa', '==', 'bbb', '---', 'cccc', '++++', 'ddddd']
 
'''

 



answered Mar 10 by avibootz
0 votes
import re

s = "aa==bbb---cccc++++ddddd"
delimiters = ["=", "-", "+"]

# Build a character class from the single-character delimiters:
# 1. re.escape neutralises characters that are special in a regex, such as
#    '-' (a range operator inside [...]), ']' and '\'.
# 2. The escaped characters are concatenated with NO separator: inside [...]
#    a '|' is a literal pipe, not alternation, so joining with '|' would
#    silently turn '|' into a delimiter as well.
# 3. [...]+ matches a run of one or more delimiter characters; a mixed run
#    such as '=-' is returned as a single piece.
# 4. The capturing parentheses make re.split() keep the matched runs.
pattern = f"([{''.join(map(re.escape, delimiters))}]+)"

result = re.split(pattern, s)

print(result)


 
'''
run:
 
['aa', '==', 'bbb', '---', 'cccc', '++++', 'ddddd']
 
'''

 



answered Mar 10 by avibootz
0 votes
def split_and_keep(text, delim_set):
    """Cut *text* into text pieces and delimiter runs, keeping everything.

    A cut is made between two neighbouring characters when they differ and
    at least one of them is in *delim_set*. This separates text from
    delimiters and one kind of delimiter from another, while a run of the
    same delimiter character stays in one piece.

    Returns the pieces in order; an empty *text* gives an empty list.
    """
    if not text:
        return []

    # Index of every position where a new piece begins.
    cut_points = [
        idx
        for idx, (left, right) in enumerate(zip(text, text[1:]), start=1)
        if left != right and (left in delim_set or right in delim_set)
    ]

    # Slice between consecutive cut points, including both ends of the text.
    edges = [0, *cut_points, len(text)]
    return [text[lo:hi] for lo, hi in zip(edges, edges[1:])]
    
# Demo input: text separated by runs of '=', '-' and '+'.
s = "aa==bbb---cccc++++ddddd"
delimiters = {"=", "-", "+"}  # single-character delimiters, passed as a set


# Show the resulting list of text pieces and delimiter runs.
print(split_and_keep(s, delimiters))


 
'''
run:
 
['aa', '==', 'bbb', '---', 'cccc', '++++', 'ddddd']

'''

 



answered Mar 10 by avibootz

Related questions

...