How to get the word before the last word from a string (edge‑case‑safe) in C++

2 Answers

0 votes
#include <iostream>
#include <vector>
#include <string>
#include <regex>

// Does not work with Unicode: only ASCII letters are recognized as words.

// Returns the word that precedes the last word of `text`.
//
// A "word" is a maximal run of ASCII letters (A-Z, a-z). Every other byte
// (spaces, tabs, newlines, digits, punctuation, UTF-8 bytes) separates words.
// Full Unicode word support would require a different library (e.g., ICU, Boost.Regex).
//
// Parameters:
//   text - the string to scan; may be empty.
// Returns:
//   The second-to-last word, or an empty string when `text` holds fewer than
//   two words (this covers empty and whitespace-only input as well).
std::string getWordBeforeLast(const std::string& text) {
    // Compiling a std::regex is expensive, so the pattern is built once and
    // reused by every call instead of being reconstructed each time.
    static const std::regex wordPattern(R"([A-Za-z]+)");

    // Only the two most recent matches matter, so keep two rolling slots
    // rather than collecting every word into a container.
    std::string previous;
    std::string last;

    const std::sregex_iterator end;
    for (std::sregex_iterator it(text.begin(), text.end(), wordPattern); it != end; ++it) {
        previous.swap(last);  // the old "last" becomes the word before last
        last = it->str();
    }

    // Matches are never empty, so `previous` is still empty exactly when
    // fewer than two words were found.
    return previous;
}

// Test driver: runs getWordBeforeLast over a fixed list of sample inputs and
// prints each quoted input, the result ("null" when the result is empty), and
// a 40-dash divider line.
int main() {
    std::cout << "=== Testing: Get Word Before Last ===\n\n";

    // Sample inputs covering spacing, punctuation, and non-ASCII text
    const std::vector<std::string> inputs = {
        "python c++",
        "  many   spaces   here   now  ",
        "OneWord",
        "",
        "   ",
        "Hello, world!",
        "Tabs\tand\nnewlines work too",
        "Unicode 世界、こんにちは",
        "Ends with punctuation.",
        "Multiple words, with punctuation, here!",
        "state-of-the-art program example"
    };

    const std::string divider(40, '-');

    for (auto current = inputs.begin(); current != inputs.end(); ++current) {
        const std::string word = getWordBeforeLast(*current);

        std::cout << "Input: \"" << *current << "\"\n";
        if (word.empty()) {
            std::cout << "Output: " << "null" << "\n";
        } else {
            std::cout << "Output: " << word << "\n";
        }
        std::cout << divider << "\n";
    }
}


/*
OUTPUT:

=== Testing: Get Word Before Last ===

Input: "python c++"
Output: python
----------------------------------------
Input: "  many   spaces   here   now  "
Output: here
----------------------------------------
Input: "OneWord"
Output: null
----------------------------------------
Input: ""
Output: null
----------------------------------------
Input: "   "
Output: null
----------------------------------------
Input: "Hello, world!"
Output: Hello
----------------------------------------
Input: "Tabs	and
newlines work too"
Output: work
----------------------------------------
Input: "Unicode 世界、こんにちは"
Output: null
----------------------------------------
Input: "Ends with punctuation."
Output: with
----------------------------------------
Input: "Multiple words, with punctuation, here!"
Output: punctuation
----------------------------------------
Input: "state-of-the-art program example"
Output: program
----------------------------------------

*/

 



answered Mar 28 by avibootz
edited Mar 28 by avibootz
0 votes
#include <iostream>
#include <vector>
#include <string>
#include <cctype>

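// Works with UTF-8 input: multi-byte characters are kept inside words, and the
// ideographic comma '、' is treated as a separator.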

// Reports whether `c` separates words: either a whitespace character
// (as classified by std::isspace) or one of these ASCII punctuation marks:
//   , . ! ? ; : " ' ( ) [ ] { } - / and the backslash.
bool isAsciiSeparator(char c) {
    static const std::string punctuation = ",.!?;:\"'()[]{}-/\\";

    // std::isspace must be given a value representable as unsigned char.
    const bool whitespace = std::isspace(static_cast<unsigned char>(c)) != 0;

    return whitespace || punctuation.find(c) != std::string::npos;
}

// Reports whether `c` is a UTF-8 continuation byte, i.e. a byte of the form
// 10xxxxxx (0x80-0xBF). Such bytes follow the lead byte of a multi-byte
// sequence; they never start one.
bool isUtf8Continuation(unsigned char c) {
    const unsigned topTwoBits = c >> 6;
    return topTwoBits == 0x2;
}

// Reports whether the three bytes starting at `pos` encode one of the common
// CJK separators that share the UTF-8 prefix E3 80:
//   U+3000 ideographic space     (E3 80 80)
//   U+3001 ideographic comma     (E3 80 81)
//   U+3002 ideographic full stop (E3 80 82)
static bool isCjkSeparatorAt(const std::string& text, std::string::size_type pos) {
    if (pos + 2 >= text.size()) {
        return false;  // fewer than three bytes remain
    }
    const unsigned char lead   = static_cast<unsigned char>(text[pos]);
    const unsigned char second = static_cast<unsigned char>(text[pos + 1]);
    const unsigned char third  = static_cast<unsigned char>(text[pos + 2]);
    return lead == 0xE3 && second == 0x80 && third >= 0x80 && third <= 0x82;
}

// Returns the word that precedes the last word of `text`.
//
// `text` is scanned byte by byte. Words are split on the ASCII separators
// accepted by isAsciiSeparator and on the CJK separators U+3000, U+3001 and
// U+3002. All other bytes, including the bytes of multi-byte UTF-8
// characters, are kept as part of the current word.
//
// Parameters:
//   text - the string to scan; may be empty.
// Returns:
//   The second-to-last word, or an empty string when fewer than two words
//   are present.
std::string getWordBeforeLast(const std::string& text) {
    // Only the two most recently completed words are needed, so they are kept
    // in two rolling slots instead of storing every word.
    std::string previous;
    std::string last;
    std::string current;

    // Closes the word being built (if any) and shifts it into the slots.
    auto finishWord = [&]() {
        if (current.empty()) {
            return;
        }
        previous.swap(last);   // old "last" becomes the word before last
        last.swap(current);    // the finished word becomes "last"
        current.clear();
    };

    std::string::size_type i = 0;
    while (i < text.size()) {
        if (isAsciiSeparator(text[i])) {
            finishWord();
            ++i;
        } else if (isCjkSeparatorAt(text, i)) {
            finishWord();
            i += 3;  // skip the whole 3-byte separator
        } else {
            current.push_back(text[i]);
            ++i;
        }
    }
    finishWord();

    // Completed words are never empty, so `previous` is still empty exactly
    // when fewer than two words were found.
    return previous;
}

// Test driver: runs getWordBeforeLast over a fixed list of sample inputs and
// prints each input, the result ("null" when the result is empty), and a
// divider line.
int main() {
    std::cout << "=== Testing: Get Word Before Last ===\n\n";

    // Sample inputs covering spacing, punctuation, and non-ASCII text
    const std::vector<std::string> inputs = {
        "python c++",
        "  many   spaces   here   now  ",
        "OneWord",
        "",
        "   ",
        "Hello, world!",
        "Tabs\tand\nnewlines work too",
        "Unicode 世界、こんにちは",
        "Ends with punctuation.",
        "Multiple words, with punctuation, here!",
        "state-of-the-art program example"
    };

    for (auto current = inputs.begin(); current != inputs.end(); ++current) {
        const std::string word = getWordBeforeLast(*current);

        std::cout << "Input:  " << *current << "\n";
        if (word.empty()) {
            std::cout << "Output: " << "null" << "\n";
        } else {
            std::cout << "Output: " << word << "\n";
        }
        std::cout << "----------------------------------------\n";
    }
}


/*
OUTPUT:

=== Testing: Get Word Before Last ===

Input:  python c++
Output: python
----------------------------------------
Input:    many   spaces   here   now  
Output: here
----------------------------------------
Input:  OneWord
Output: null
----------------------------------------
Input:  
Output: null
----------------------------------------
Input:     
Output: null
----------------------------------------
Input:  Hello, world!
Output: Hello
----------------------------------------
Input:  Tabs	and
newlines work too
Output: work
----------------------------------------
Input:  Unicode 世界、こんにちは
Output: 世界
----------------------------------------
Input:  Ends with punctuation.
Output: with
----------------------------------------
Input:  Multiple words, with punctuation, here!
Output: punctuation
----------------------------------------
Input:  state-of-the-art program example
Output: program
----------------------------------------

*/

 



answered Mar 28 by avibootz

Related questions

...