Tokenizing Strings

Download: Reese.zip

The process of finding groups of characters from one set that are separated by characters from another set is called tokenizing. The groups of interest, or "words", are known as tokens. The characters separating them are called delimiters or separators. C provides the function strtok, which accepts text and any set of delimiters and tokenizes the text. C++ strings do not have an equivalent of strtok. However, this example demonstrates a function (tokenize) that does the same thing and is easier to use.

Reference

Greg Reese, C++ Standard Library Practical Tips, Charles River Media, 2006. ISBN 1-58450-400-5. pp. 386-390.

`string_tokenize.cpp`

// string_tokenize.cpp

#include <algorithm>
#include <iostream>
#include <iterator>
#include <string>
#include <vector>

using namespace std;

// Splits text into the tokens that lie between delimiter characters and
// returns them in order of appearance. Any character in delimiters acts
// as a separator; when delimiters is omitted (null), a single space is used.
vector<string> tokenize( const string& text, const char* delimiters=0 );

// Writes the tokens to standard output, one per line, each preceded by
// its 1-based position and a space (e.g. "1 How").
//
// The vector is only read, so it is taken by const reference; this also
// lets callers pass const vectors and temporaries.
void print(const vector<string> &tokens)
{
	// Index with the container's own size type to avoid narrowing
	// size() into an int and comparing signed against unsigned.
	for (vector<string>::size_type i = 0; i < tokens.size(); ++i) {
		cout << i+1 << " " << tokens[i] << endl;
	}
}

int main( )
{
	vector<string> tokens;
   const char file_delimiters[] = ":\\.";

   string phrase = "How much    wood would a woodchuck chuck?";

   // find words
   tokens = tokenize(phrase);
   cout << "Tokens in " << phrase << endl;
   print(tokens);
   
   
   string file( "c:\\reese\\book\\code\\string_tokenize.cpp" );

   // find the parts of the file name
   tokens = tokenize( file, file_delimiters );

   // display the tokens
   cout << "TOKENS IN " << file << endl;
   print( tokens );

   // try a file with no delimiters
   file = "data";
   tokens = tokenize( file, file_delimiters );
   cout << "\nTOKENS IN " << file << endl;
   print( tokens );

   // try a file that's all delimiters
   file = "..";
   tokens = tokenize( file, file_delimiters );
   cout << "\nTOKENS IN " << file << endl;
   print( tokens );

   // try different delimiters by finding the numbers
   // in a Social Security number
   string social_security( "431-02-9495" );
   tokens = tokenize( social_security, "-" );
   cout << "\nTOKENS IN " << social_security << endl;
   print( tokens );
}

// Breaks text into tokens and returns them in the order they occur.
//
// text        the string to split
// delimiters  null-terminated set of separator characters; a null
//             pointer selects a single space
//
// Runs of consecutive separators are skipped as a unit, so no empty
// tokens are produced. Text consisting only of separators (or empty
// text) yields an empty vector.
vector<string> tokenize( const string& text, const char* delimiters)
{
   // The string search members cannot be handed a null pointer, so fall
   // back to a space when the caller supplied none.
   const char* const separators = ( delimiters != 0 ) ? delimiters : " ";

   vector<string> result;

   // Position of the first character of the current token, or npos once
   // only separators (or nothing) remain.
   string::size_type token_begin = text.find_first_not_of( separators, 0 );

   while( token_begin != string::npos )
   {
      // One past the last character of the current token, or npos when
      // the token runs to the end of the text.
      const string::size_type token_end =
         text.find_first_of( separators, token_begin );

      if( token_end == string::npos )
      {
         // Last token: take everything that is left and stop.
         result.push_back( text.substr( token_begin ) );
         break;
      }

      result.push_back( text.substr( token_begin, token_end - token_begin ) );

      // Resume at the separator itself; the search below skips over it
      // and any separators that follow.
      token_begin = text.find_first_not_of( separators, token_end );
   }

   return result;
}

Results

Tokens in How much    wood would a woodchuck chuck?
1 How
2 much
3 wood
4 would
5 a
6 woodchuck
7 chuck?
TOKENS IN c:\reese\book\code\string_tokenize.cpp
1 c
2 reese
3 book
4 code
5 string_tokenize
6 cpp

TOKENS IN data
1 data

TOKENS IN ..

TOKENS IN 431-02-9495
1 431
2 02
3 9495

Maintained by John Loomis, updated Wed Feb 14 23:13:55 2007