漫谈C++ string（1）：std::string实现

从实际使用上来说，std::string跟应该叫做std::string_buffer，因为它的特性更像std::vector而不是std::array，在很多情况下都容易引起数据拷贝。在实际项目中，拷贝std::vector并不是一个好的行为，他们更想要的是copy-on-write string、不可变string、基于引用计数的string等。在实际使用的过程中，std::string的最大问题就是它不适用于作为参数传递，因为它非常容易导致不必要的数据拷贝。此外，由于各个标注库实现的方式不同，会给跨平台一直带来风险，所以现在不少大型项目都会自己实现相关的string类。在实际使用过程中一般有以下替代品：

boost::string_ref（chrome:StringPiece）
std::string_view(c++17后才可用)
不直接传递一个string，而是传递它的迭代器

本文主要来分析GNU STL中std::string的实现，string_ref的实现可参考string_ref的实现，folly::FBString的实现可参考FBString的实现

定义

以gcc-stl 4.4.0为例std::string的实现方式，gnu-stl中string的定义如下：


template<typename _CharT, typename _Traits = char_traits<_CharT>,
         typename _Alloc = allocator<_CharT> >
  class basic_string;
typedef basic_string<char>    string;

可以看到，std::string是basic_string关于char的实现，下面来看basic_string的实现，如下为关于basic_string的部分说明：

*  A string looks like this:
*
*  @code
*                                        [_Rep]
*                                        _M_length //长度
*   [basic_string<char_type>]            _M_capacity //容量
*   _M_dataplus                          _M_refcount //引用计数，使用了cow
*   _M_p ---------------->               unnamed array of char_type //指向实际的数据
*  @endcode

构造

首先来看basic_string的构造函数，以最常见的std::string(“xxx”)这种构造方式为例来看它对应的构造函数：

template<typename _CharT, typename _Traits, typename _Alloc>
   basic_string<_CharT, _Traits, _Alloc>::
   basic_string(const _CharT* __s, size_type __n, const _Alloc& __a)
   : _M_dataplus(_S_construct(__s, __s + __n, __a), __a)
   { }

它调用_S_construct这个函数对_M_dataplus这个成员变量来进行初始化，_M_dataplus的定义为：


     // Use empty-base optimization: http://www.cantrip.org/emptyopt.html
     struct _Alloc_hider : _Alloc
     {
_Alloc_hider(_CharT* __dat, const _Alloc& __a)
: _Alloc(__a), _M_p(__dat) { }

_CharT* _M_p; // The actual data.
     };

    private:
     // Data Members (private):
     mutable _Alloc_hider	_M_dataplus;

由此可见，_M_dataplus中包含指向实际数据的指针，前文也说到，basic_string中应该还包含有大小、容量等数据成员的，这部分数据成员是存在_Rep中的，basic_string只存储指向它的指针，从而将减少内存占用和避免额外的内存拷贝，它的定义如下所示：


     struct _Rep_base
     {
size_type		_M_length;
size_type		_M_capacity;
_Atomic_word		_M_refcount;
     };

     struct _Rep : _Rep_base
     {
       _CharT*
         _M_refdata() throw()
         { return reinterpret_cast<_CharT*>(this + 1); }//获取指向的实际数据
     }

_S_construct根据输入参数初始化_M_dataplus,定义如下：

  template<typename _CharT, typename _Traits, typename _Alloc>
    template<typename _InIterator>
      _CharT*
      basic_string<_CharT, _Traits, _Alloc>::
      _S_construct(_InIterator __beg, _InIterator __end, const _Alloc& __a,
		   input_iterator_tag)
      {
#ifndef _GLIBCXX_FULLY_DYNAMIC_STRING //是否开启动态字符串，默认是开启的，即使用引用计数
	if (__beg == __end && __a == _Alloc())
	  return _S_empty_rep()._M_refdata();
#endif
	// Avoid reallocation for common case.
     //当字符串较短时，直接将数据存在栈中，避免去申请动态内存空间
	_CharT __buf[128];
	size_type __len = 0;
	while (__beg != __end && __len < sizeof(__buf) / sizeof(_CharT))
	  {
	    __buf[__len++] = *__beg;
	    ++__beg;
	  }
     //根据__len分配空间，初始化__r
	_Rep* __r = _Rep::_S_create(__len, size_type(0), __a);
     //将数据拷贝纸__r的指向位置
	_M_copy(__r->_M_refdata(), __buf, __len);
	__try
	  {
	    while (__beg != __end)
	      {
		if (__len == __r->_M_capacity)
		  {
		    // Allocate more space.
            //分配更多内存，销毁老的指针
		    _Rep* __another = _Rep::_S_create(__len + 1, __len, __a);
		    _M_copy(__another->_M_refdata(), __r->_M_refdata(), __len);
		    __r->_M_destroy(__a);
		    __r = __another;
		  }
		__r->_M_refdata()[__len++] = *__beg;
		++__beg;
	      }
	  }
	__catch(...)
	  {
	    __r->_M_destroy(__a);
	    __throw_exception_again;
	  }
    // 返回指向的数据，及最终M_dataplus指向实际存储的数据
	__r->_M_set_length_and_sharable(__len);
	return __r->_M_refdata();
   }

其中，_Rep::_S_create的定义如下：

 template<typename _CharT, typename _Traits, typename _Alloc>
   typename basic_string<_CharT, _Traits, _Alloc>::_Rep*
   basic_string<_CharT, _Traits, _Alloc>::_Rep::
   _S_create(size_type __capacity, size_type __old_capacity,
      const _Alloc& __alloc)
   {
    if (__capacity > _S_max_size)
__throw_length_error(__N("basic_string::_S_create"));

     //对齐页面大小及头部大小，其中头部带下是malloc分配时内存页面对齐时所需要使用的
     const size_type __pagesize = 4096;
     const size_type __malloc_header_size = 4 * sizeof(void*);
    
     //一次扩大至少两倍容量，避免频繁的内存分配
     if (__capacity > __old_capacity && __capacity < 2 * __old_capacity)
__capacity = 2 * __old_capacity;

     //多申请一个字节用以存储'\0'
    size_type __size = (__capacity + 1) * sizeof(_CharT) + sizeof(_Rep);

     //进行内存对齐的实际长度，需要加上头部长度
    const size_type __adj_size = __size + __malloc_header_size;
     if (__adj_size > __pagesize && __capacity > __old_capacity)
{
     //进行页面对齐
  const size_type __extra = __pagesize - __adj_size % __pagesize;
  __capacity += __extra / sizeof(_CharT);

  if (__capacity > _S_max_size)
    __capacity = _S_max_size;
  __size = (__capacity + 1) * sizeof(_CharT) + sizeof(_Rep);
}

     //开辟内存
     void* __place = _Raw_bytes_alloc(__alloc).allocate(__size);
     _Rep *__p = new (__place) _Rep;
     __p->_M_capacity = __capacity;
   
     //启用引用计数
     __p->_M_set_sharable();
     return __p;
   }

到现在可以明白的是，_Rep中存储了前文说到的字符串长度、容量、引用计数等信息，同时会申请__capacity + 1大小的内存用以存储实际字符串内容。至此，整个basic_string的初始化机制就比较明朗了。

内存布局

一个std::string的内存布局如下：

拷贝

在gcc4.*版本中，basic_string的拷贝采用了COW机制，通过如下代码可以来验证：

#include <iostream>
using namespace std;

int main() {
  string s1 = "s1";
 
  string s2 = s1;
  cout << ((void*)(s1.data()) == (void*)(s2.data())) << endl;

  s2[0] = 'x';
  cout << ((void*)(s1.data()) == (void*)(s2.data())) << endl;

  return 0;
}

在GCC 4.8.5上编译后的运行结果如下所示：

1
2

1
0

由此可验证字符串拷贝默认是采用了COW机制，下面来看字符串拷贝的具体实现，它的拷贝构造函数为:

basic_string(const basic_string& __str)
    : _M_dataplus(__str._M_rep()->_M_grab(_Alloc(__str.get_allocator()),
					  __str.get_allocator()),
		  __str.get_allocator())
      
 _CharT* _M_grab(const _Alloc& __alloc1, const _Alloc& __alloc2)
{
    return (! _M_is_leaked() && __alloc1 == __alloc2)
        ? _M_refcopy() : _M_clone (__alloc1);
}

_CharT*_M_refcopy() throw ()
{
#ifndef _GLIBCXX_FULLY_DYNAMIC_STRING
    if ( __builtin_expect(this != &_S_empty_rep(), false))
#endif
        __gnu_cxx::__atomic_add_dispatch (&this-> _M_refcount, 1);
    return _M_refdata();
}

可以看到，最终调用的_M_refcopy这个函数，将引用计数加1，然后将指针指向_Rep，下面来看实际的修改的行为。

reference operator []( size_type __pos )
{
    _M_leak();
    return _M_data ()[__pos ];//其中,_M_data()用以获取_M_p
}

void _M_leak ()  // for use in begin() & non-const op[]
{
    //拷贝构造时_M_is_leaked都是为false，即是通过COW生成的，所以会调用_M_leak_hard
    if (! _M_rep ()->_M_is_leaked ())
        _M_leak_hard ();
}

void _M_leak_hard ()
{
    if ( _M_rep ()->_M_is_shared ())
        _M_mutate (0, 0, 0);
    _M_rep()-> _M_set_leaked ();
}

void _M_mutate ( size_type __pos , size_type __len1, size_type __len2 )
{
    const size_type __old_size = this-> size ();
    const size_type __new_size = __old_size + __len2 - __len1 ;
    const size_type __how_much = __old_size - __pos - __len1 ;

	//重新生成_Rep,拷贝数据，引用数减1
    if ( __new_size > this -> capacity() || _M_rep ()->_M_is_shared ())
    {
      
        const allocator_type __a = get_allocator ();
        _Rep * __r = _Rep:: _S_create (__new_size , this-> capacity (), __a );

        if (__pos )
            _M_copy (__r -> _M_refdata(), _M_data (), __pos );
        if (__how_much )
            _M_copy (__r -> _M_refdata() + __pos + __len2 ,
            _M_data () + __pos + __len1, __how_much );

        //引用计数减1
        _M_rep ()->_M_dispose ( __a);

        //指向新的_Rep
        _M_data (__r -> _M_refdata());
    }
    else if (__how_much && __len1 != __len2 )
    {
        // Work in-place.
        _M_move (_M_data () + __pos + __len2 ,
            _M_data () + __pos + __len1, __how_much );
    }

    //设置自身的引用计数
    _M_rep()-> _M_set_length_and_sharable (__new_size );
}

所以数据拷贝的发生在数据实际修改的时候。

总结

basic_string仅仅包含一个成员_M_dataplus，它会指向一个_Rep的结构，_Rep中才会实际存放字符串的内容和长度等信息。初始化过程中，对于短字符串，会先存放在栈中，待生成_Rep指针后才会将数据拷贝至_Rep中，这样可以避免初始化短字符串的时候去申请动态内存空间
_Rep中记录了basic_string的引用计数，在GCC4.*版本中，引用计数是默认开启的，所以对象的拷贝构造很快
因为basic_string中仅包含指向_Rep的指针这一个数据成员，所以sizeof(std::string)等于8，但是由于它还要为它分配_Rep，一个空_Rep的大小为25，所以在GCC4.*中一个空string的大小也为33个字节
COW能有效减少内存拷贝，但是其实现较为复杂，也存在一些其他问题，具体可参考[std::string的Copy-on-Write：不如想象中美好)，加上新的标准库中std::move的引入导致COW的这种优化不再有不要，所以从GCC5开始已经废弃了COW机制

定义

构造

内存布局

拷贝

总结

Reference