标准C++以及MFC6.0字符串的tokenize和split函数

标准C++字符串string以及MFC6.0字符串CString的tokenize和split函数。

1、标准串的：

/********************************************

  the tokenize function for std::string

*********************************************/
#include <string>
#include <vector>
#include <iostream>
using namespace std;

// Shorthand for the unsigned type std::string uses for positions and lengths.
typedef std::string::size_type S_T;
// "Not found" sentinel returned by the std::string search functions
// (the largest value representable in S_T).
static const S_T npos = std::string::npos;

// Splits `src` at every character that occurs in `tok`.
//
// Each delimiter character terminates one field, so consecutive delimiters
// (and a delimiter at the very start or end of `src`) produce empty fields.
//   trim == true  : empty fields are dropped.
//   trim == false : empty fields are kept and reported as `null_subst`
//                   (this is the default).
//
// Returns the fields in the order they appear in `src`.
// Throws a `const char*` message when `src` or `tok` is empty.
std::vector<std::string> tokenize(const std::string& src, const std::string& tok, bool trim=false, const std::string& null_subst="")
{
 if( src.empty() || tok.empty() ) throw "tokenize: empty string/0";

 std::vector<std::string> fields;
 std::string::size_type field_start = 0;
 for(;;)
 {
  const std::string::size_type delim_pos = src.find_first_of(tok, field_start);
  // The end of the input closes the last field exactly like a delimiter does.
  const bool at_end = (delim_pos == std::string::npos);
  const std::string::size_type field_len = (at_end ? src.size() : delim_pos) - field_start;

  if( field_len != 0 )
   fields.push_back(src.substr(field_start, field_len));
  else if( !trim )
   fields.push_back(null_subst);

  if( at_end ) break;
  field_start = delim_pos + 1;
 }
 return fields;
}

// Splits `src` on every occurrence of the complete string `delimit`
// (not on its individual characters). There is no trim option: every piece
// is reported, and an empty piece is reported as `null_subst`.
//
// Returns the pieces in the order they appear in `src`.
// Throws a `const char*` message when `src` or `delimit` is empty.
std::vector<std::string> split(const std::string& src, const std::string& delimit, const std::string& null_subst="")
{
 if( src.empty() || delimit.empty() ) throw "split: empty string/0";

 std::vector<std::string> pieces;
 const std::string::size_type delimit_len = delimit.size();
 // Offsets stay in size_type: storing find() results in a signed `long`
 // truncates them wherever long is narrower than size_t.
 std::string::size_type piece_start = 0;
 for(;;)
 {
  const std::string::size_type hit = src.find(delimit, piece_start);
  // The end of the input closes the last piece exactly like a delimiter does.
  const bool at_end = (hit == std::string::npos);
  const std::string::size_type piece_len = (at_end ? src.size() : hit) - piece_start;

  pieces.push_back( piece_len != 0 ? src.substr(piece_start, piece_len) : null_subst );

  if( at_end ) break;
  piece_start = hit + delimit_len;
 }
 return pieces;
}

// test

// Prints `title` on its own line, then each token on its own line.
static void print_tokens(const char* title, const std::vector<std::string>& tokens)
{
 std::cout << title << '\n';
 for( std::vector<std::string>::size_type i = 0; i < tokens.size(); ++i )
  std::cout << tokens[i] << '\n';
}

// Demo driver: tokenizes a sample string with and without trimming, then
// calls split() with an empty delimiter to show the error path.
int main(void)
{
 const std::string src = ",ab,cde;,,fg,," ;
 const std::string tok = ",;" ;

 const std::vector<std::string> v1 = tokenize(src, tok ,true);
 const std::vector<std::string> v2 = tokenize(src, tok ,false, "<null>");

 print_tokens("-------------v1:", v1);
 print_tokens("-------------v2:", v2);

 try
 {
  const std::string s = "######123#4###56########789###";
  // An empty delimiter makes split() throw, which exercises the handler
  // below; substitute "###" to see an actual split.
  const std::string del = "";//"###";
  const std::vector<std::string> v3 = split(s, del, "<null>");
  print_tokens("-------------v3:", v3);
 }
 // The functions throw string literals, whose type is `const char*`;
 // a `char*` handler would not match and the program would terminate.
 catch (const char* message)
 {
  std::cout << message << '\n';
 }

 return 0;
}


2、CString版的：

#include <stdio.h>
#include <afx.h>

/*
 * Splits s at every character contained in delimits and appends the
 * resulting pieces to the caller-supplied list pList.
 * If trim is TRUE, empty pieces (empty strings, not whitespace) are
 * dropped. For example:
 *   Tokenize( "a,bc;,d,", ",;", &out_list, TRUE)
 * yields 3 strings: a, bc, d.
 * If trim is FALSE, nullSubst is stored in place of each empty piece.
 * For example:
 *   Tokenize( "a,bc;,d;", ",;", &out_list, FALSE,"[null]" )
 * yields 5 strings: a, bc, [null], d, [null].
 * trim defaults to FALSE and nullSubst defaults to the empty string.
 */
void Tokenize(CString s, CString delimits, CStringList* pList, BOOL trim=FALSE, CString nullSubst="")
{
 ASSERT( !s.IsEmpty() && !delimits.IsEmpty() );

 // Append one delimiter as a sentinel so the final piece is closed by the
 // same loop that handles every other piece.
 s += delimits[0];

 long cut = s.FindOneOf((LPCTSTR)delimits);
 while( cut != -1 )
 {
  if( cut == 0 )
  {
   // Delimiter at the front: the piece before it is empty.
   if( !trim ) pList->AddTail(nullSubst);
  }
  else
  {
   pList->AddTail( s.Left(cut) );
  }

  // Drop the piece just handled together with its delimiter.
  const int remaining = s.GetLength()-cut-1;
  s = s.Right(remaining);
  cut = s.FindOneOf((LPCTSTR)delimits);
 }
}


/* 
 * 类似java字符串的split()方法。
 * 使用一个完整的串delimit（而不是其中的某个字符）来分割src串,没有trim选项，
 * 即严格分割。num用来确定最多分割为多少个串,如果是0（默认），则按照delimit
 * 分割，若为1，则返回源串。
 */
void Split(const CString& src, CString delimit, CStringList* pOutList, int num=0, CString nullSubst="")
{
 ASSERT( !src.IsEmpty() && !delimit.IsEmpty() );
 if(num==1) 
 {
  pOutList->AddTail(src);
  return;
 }

 int deliLen = delimit.GetLength();
 long index = -1, lastSearchPosition = 0, cnt = 0;

 while( (index=src.Find(delimit, lastSearchPosition))!=-1 )
 {
  if(index==lastSearchPosition)
   pOutList->AddTail(nullSubst);
  else
   pOutList->AddTail(src.Mid(lastSearchPosition, index-lastSearchPosition));
  lastSearchPosition = index + deliLen;

  if(num)
  {
   ++cnt;
   if(cnt+1==num) break;
  }
 }
 CString lastOne = src.Mid(lastSearchPosition);
 pOutList->AddTail( lastOne.IsEmpty()? nullSubst:lastOne);
}



// test
int main(void)
{
 CString s = ",ab;cde,f,,;gh,,";
 CString sub = ",;";
 CStringList list1,list2;

 
 Tokenize(s,sub,&list1,TRUE,"no use"); // <-----
 printf("-------[Tokenize_trim]-------/n");
 POSITION pos1 = list1.GetHeadPosition();
 while( pos1!= NULL )
 {
  printf( list1.GetNext(pos1) );
  printf("/n");
 }
 Tokenize(s,sub,&list2,FALSE,"[null]"); // <-----
 printf("------[Tokenize_no_trim]-----/n");
 POSITION pos2 = list2.GetHeadPosition();
 while( pos2!= NULL )
 {
  printf( list2.GetNext(pos2) );
  printf("/n");
 }
 
 CStringList list3;
 s = "###0123###567######89###1000###";
 sub = "###";
 Split(s,sub,&list3, 3, "<null>"); // <-----
 printf("------[Split]-----/n");
 POSITION pos3 = list3.GetHeadPosition();
 while( pos3!= NULL )
 {
  printf( list3.GetNext(pos3) );
  printf("/n");
 }
 return 0; 
}