TFS Master and Data Node Read/Write Flow: A Source Code Walkthrough

1. Management Classes of TFS Nodes
1.1 Data Node
   The data node performs the actual storage, reading, and writing of data; its management class is DataService.

   Dispatching of the data-flow requests is handled by the handlePacketQueue function, shown below:

    // The data node service class
    class DataService
    {
      OpManager op_manager_;                          // tracks in-flight write/unlink operations
      LeaseManager *lease_manager_;                   // lease handling with the nameserver
      DataHelper data_helper_;                        // helpers for talking to other dataservers (e.g. degraded read)
      TaskManager task_manager_;                      // executes replicate/compact/marshalling tasks
      BlockManager *block_manager_;                   // logic/physical block management (see section 2.3)
      TrafficControl traffic_control_;                // read/write traffic statistics
      ClientRequestServer client_request_server_;     // serves client read/write/stat/unlink requests
      WritableBlockManager writable_block_manager_;   // blocks this dataserver may currently write to
      CheckManager check_manager_;                    // block check tasks
      SyncManager*  sync_manager_;                    // data synchronization
      MigrateManager* migrate_manager_;               // data migration
      TimeoutThreadHelperPtr  timeout_thread_;        // background timeout handling
      RunTaskThreadHelperPtr  task_thread_;           // background task execution
      RunCheckThreadHelperPtr check_thread_;          // background block checking
    };
    
    bool DataService::handlePacketQueue(tbnet::Packet* packet, void* args)
    {
      bool bret = BaseService::handlePacketQueue(packet, args);
      if (bret)
      {
        int32_t pcode = packet->getPCode();
        int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : TFS_SUCCESS;
        if (TFS_SUCCESS == ret)
        {
          switch (pcode)
          {
            case LIST_BLOCK_MESSAGE:
              ret = list_blocks(dynamic_cast<ListBlockMessage*>(packet));
              break;
            case REPLICATE_BLOCK_MESSAGE:
            case COMPACT_BLOCK_MESSAGE:
            case DS_COMPACT_BLOCK_MESSAGE:
            case DS_REPLICATE_BLOCK_MESSAGE:
            case RESP_DS_COMPACT_BLOCK_MESSAGE:
            case RESP_DS_REPLICATE_BLOCK_MESSAGE:
            case REQ_EC_MARSHALLING_MESSAGE:
            case REQ_EC_REINSTATE_MESSAGE:
            case REQ_EC_DISSOLVE_MESSAGE:
            case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
              ret = task_manager_.handle(dynamic_cast<BaseTaskMessage*>(packet));
              break;
            case GET_BLOCK_INFO_MESSAGE_V2:
              ret = get_block_info(dynamic_cast<GetBlockInfoMessageV2*>(packet));
              break;
            case GET_SERVER_STATUS_MESSAGE:
              ret = get_server_status(dynamic_cast<GetServerStatusMessage*>(packet));
              break;
            case STATUS_MESSAGE:
              ret = get_ping_status(dynamic_cast<StatusMessage*>(packet));
              break;
            case CLIENT_CMD_MESSAGE:
              ret = client_command(dynamic_cast<ClientCmdMessage*>(packet));
              break;
            case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
            case STAT_FILE_MESSAGE_V2:
            case READ_FILE_MESSAGE_V2:
            case WRITE_FILE_MESSAGE_V2:
            case CLOSE_FILE_MESSAGE_V2:
            case UNLINK_FILE_MESSAGE_V2:
            case NEW_BLOCK_MESSAGE_V2:
            case REMOVE_BLOCK_MESSAGE_V2:
            case READ_RAWDATA_MESSAGE_V2:
            case WRITE_RAWDATA_MESSAGE_V2:
            case READ_INDEX_MESSAGE_V2:
            case WRITE_INDEX_MESSAGE_V2:
            case QUERY_EC_META_MESSAGE:
            case COMMIT_EC_META_MESSAGE:
            case GET_ALL_BLOCKS_HEADER_MESSAGE:
              ret = client_request_server_.handle(packet);
              break;
            case REQ_CHECK_BLOCK_MESSAGE:
            case REPORT_CHECK_BLOCK_MESSAGE:
              ret = check_manager_.handle(packet);
              break;
            default:
              TBSYS_LOG(ERROR, "process packet pcode: %d", pcode);
              ret = TFS_ERROR;
              break;
          }
          if (common::TFS_SUCCESS != ret)
          {
            common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
            msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed");
          }
        }
      }
      return bret;
    }

    The main work here is carried out by task_manager_ and client_request_server_:

  a. task_manager_ executes dataserver tasks such as block replication, compaction, migration, and cleanup, which serve the cluster's needs for rebalancing, replication-based fault tolerance, and node expansion;
  b. client_request_server_ handles file reads, writes, stats, and deletes on the dataserver, responding directly to client requests. The corresponding code is as follows:

    int TaskManager::handle(BaseTaskMessage* packet)
    {
      int pcode = packet->getPCode();
      int ret = TFS_SUCCESS;
      switch(pcode)
      {
        case REPLICATE_BLOCK_MESSAGE:
          ret = add_replicate_task(dynamic_cast<ReplicateBlockMessage*>(packet));
          break;
        case COMPACT_BLOCK_MESSAGE:
          ret = add_compact_task(dynamic_cast<NsRequestCompactBlockMessage*>(packet));
          break;
        case DS_REPLICATE_BLOCK_MESSAGE:
          ret = add_ds_replicate_task(dynamic_cast<DsReplicateBlockMessage*>(packet));
          break;
        case DS_COMPACT_BLOCK_MESSAGE:
          ret = add_ds_compact_task(dynamic_cast<DsCompactBlockMessage*>(packet));
          break;
        case REQ_EC_MARSHALLING_MESSAGE:
          ret = add_marshalling_task(dynamic_cast<ECMarshallingMessage*>(packet));
          break;
        case REQ_EC_REINSTATE_MESSAGE:
          ret = add_reinstate_task(dynamic_cast<ECReinstateMessage*>(packet));
          break;
        case REQ_EC_DISSOLVE_MESSAGE:
          ret = add_dissolve_task(dynamic_cast<ECDissolveMessage*>(packet));
          break;
        case NS_REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
          ret = add_resolve_conflict_task(dynamic_cast<NsReqResolveBlockVersionConflictMessage*>(packet));
          return ret;
        case RESP_DS_REPLICATE_BLOCK_MESSAGE:
        case RESP_DS_COMPACT_BLOCK_MESSAGE:
          ret = handle_complete(packet);
          break;
        default:
          ret = TFS_ERROR;
          TBSYS_LOG(WARN, "unknown pcode : %d",  pcode);
          break;
      }

      if (TFS_SUCCESS == ret)
      {
        packet->reply(new StatusMessage(STATUS_MESSAGE_OK));
      }

      return ret;
    }

    int ClientRequestServer::handle(tbnet::Packet* packet)
    {
      int ret = (NULL == packet) ? EXIT_POINTER_NULL : TFS_SUCCESS;
      if (TFS_SUCCESS == ret)
      {
        int32_t pcode = packet->getPCode();
        switch (pcode)
        {
          case REQ_CALL_DS_REPORT_BLOCK_MESSAGE:
            ret = report_block(dynamic_cast<CallDsReportBlockRequestMessage*>(packet));
            break;
          case STAT_FILE_MESSAGE_V2:
            ret = stat_file(dynamic_cast<StatFileMessageV2*>(packet));
            break;
          case READ_FILE_MESSAGE_V2:
            ret = read_file(dynamic_cast<ReadFileMessageV2*>(packet));
            break;
          case WRITE_FILE_MESSAGE_V2:
            ret = write_file(dynamic_cast<WriteFileMessageV2*>(packet));
            break;
          case CLOSE_FILE_MESSAGE_V2:
            ret = close_file(dynamic_cast<CloseFileMessageV2*>(packet));
            break;
          case UNLINK_FILE_MESSAGE_V2:
            ret = unlink_file(dynamic_cast<UnlinkFileMessageV2*>(packet));
            break;
          case NEW_BLOCK_MESSAGE_V2:
            ret = new_block(dynamic_cast<NewBlockMessageV2*>(packet));
            break;
          case REMOVE_BLOCK_MESSAGE_V2:
            ret = remove_block(dynamic_cast<RemoveBlockMessageV2*>(packet));
            break;
          case READ_RAWDATA_MESSAGE_V2:
            ret = read_raw_data(dynamic_cast<ReadRawdataMessageV2*>(packet));
            break;
          case WRITE_RAWDATA_MESSAGE_V2:
            ret = write_raw_data(dynamic_cast<WriteRawdataMessageV2*>(packet));
            break;
          case READ_INDEX_MESSAGE_V2:
            ret = read_index(dynamic_cast<ReadIndexMessageV2*>(packet));
            break;
          case WRITE_INDEX_MESSAGE_V2:
            ret = write_index(dynamic_cast<WriteIndexMessageV2*>(packet));
            break;
          case QUERY_EC_META_MESSAGE:
            ret = query_ec_meta(dynamic_cast<QueryEcMetaMessage*>(packet));
            break;
          case COMMIT_EC_META_MESSAGE:
            ret = commit_ec_meta(dynamic_cast<CommitEcMetaMessage*>(packet));
            break;
          case GET_ALL_BLOCKS_HEADER_MESSAGE:
            ret = get_all_blocks_header(dynamic_cast<GetAllBlocksHeaderMessage*>(packet));
            break;
          default:
            TBSYS_LOG(WARN, "process packet pcode: %d", pcode);
            ret = EXIT_UNKNOWN_MSGTYPE;
            break;
        }
      }

      return ret;
    }

 1.2 Master Node

   The master node has three main responsibilities:
    a. managing and dispatching control-flow tasks, such as block creation, deletion, replication, balancing, and compaction;
    b. checking the health of every data node;
    c. managing the metadata and its mapping to the blocks on each data node.
    Its management class is NameServer, and the related requests are likewise dispatched by a handlePacketQueue function, shown below:

    class NameServer
    {
      LayoutManager layout_manager_;                        // core metadata: blocks, servers, tasks
      NameServerHeartManager master_slave_heart_manager_;   // heartbeat between master and slave nameservers
      HeartManagement heart_manager_;                       // heartbeats reported by dataservers
    };


    class LayoutManager
    {
        BlockManager block_manager_;                   // block metadata
        ServerManager server_manager_;                 // dataserver metadata
        TaskManager task_manager_;                     // replicate/compact/marshalling task scheduling
        OpLogSyncManager oplog_sync_mgr_;              // operation log sync to the slave nameserver
        FamilyManager  family_manager_;                // erasure-code family management
        ClientRequestServer client_request_server_;    // handles requests dispatched by NameServer
        common::GCObjectManager<LayoutManager, common::BaseObject> gc_manager_;  // delayed object reclamation
    };
    
    bool NameServer::handlePacketQueue(tbnet::Packet *packet, void *args)
    {
      bool bret = BaseService::handlePacketQueue(packet, args);
      if (bret)
      {
        int32_t pcode = packet->getPCode();
        int32_t ret = LOCAL_PACKET == pcode ? TFS_ERROR : common::TFS_SUCCESS;
        if (TFS_SUCCESS == ret)
        {
          //TBSYS_LOG(DEBUG, "PCODE: %d", pcode);
          common::BasePacket* msg = dynamic_cast<common::BasePacket*>(packet);
          switch (pcode)
          {
            case GET_BLOCK_INFO_MESSAGE_V2:
              ret = open(msg);
              break;
            case BATCH_GET_BLOCK_INFO_MESSAGE_V2:
              ret = batch_open(msg);
              break;
            case REPLICATE_BLOCK_MESSAGE:
            case BLOCK_COMPACT_COMPLETE_MESSAGE:
            case REQ_EC_MARSHALLING_COMMIT_MESSAGE:
            case REQ_EC_REINSTATE_COMMIT_MESSAGE:
            case REQ_EC_DISSOLVE_COMMIT_MESSAGE:
              ret = layout_manager_.get_client_request_server().handle(msg);
              break;
            case SHOW_SERVER_INFORMATION_MESSAGE:
              ret = show_server_information(msg);
              break;
            case STATUS_MESSAGE:
              ret = ping(msg);
              break;
            case DUMP_PLAN_MESSAGE:
              ret = dump_plan(msg);
              break;
            case CLIENT_NS_KEEPALIVE_MESSAGE:
              ret = client_keepalive(msg);
              break;
            case CLIENT_CMD_MESSAGE:
              ret = client_control_cmd(msg);
              break;
            case REQ_RESOLVE_BLOCK_VERSION_CONFLICT_MESSAGE:
              {
              BaseTaskMessage* packet = dynamic_cast<BaseTaskMessage*>(msg);
              if (0 == packet->get_seqno())
                ret = resolve_block_version_conflict(msg);
              else
                ret = layout_manager_.get_client_request_server().handle(msg);
              }
              break;
            case REQ_GET_FAMILY_INFO_MESSAGE:
              ret = get_family_info(msg);
              break;
            case REPAIR_BLOCK_MESSAGE_V2:
              ret = repair(msg);
              break;
            case DS_APPLY_BLOCK_MESSAGE:
              ret = apply_block(msg);
              break;
            case DS_APPLY_BLOCK_FOR_UPDATE_MESSAGE:
              ret = apply_block_for_update(msg);
              break;
            case DS_GIVEUP_BLOCK_MESSAGE:
              ret = giveup_block(msg);
              break;
            default:
              ret = EXIT_UNKNOWN_MSGTYPE;
              TBSYS_LOG(WARN, "unknown msg type: %d", pcode);
              break;
          }
          if (common::TFS_SUCCESS != ret)
          {
            msg->reply_error_packet(TBSYS_LOG_LEVEL(ERROR), ret, "execute message failed, pcode: %d", pcode);
          }
        }
      }
      return bret;
    }

 2. Physical and Logical File Structure Design

  2.1 File Storage Design
   The storage layout follows the design published in the official TFS documentation; the layered view below summarizes it.


     In TFS, as I understand it, the data storage design has three conceptual layers:
         a. Application layer: the business data files users care about (images, documents, ...), identified by a file_id.
         b. Logical layer: what developers care about, represented by LogicBlock.
         c. Physical layer: what operators care about, represented by BasePhysicalBlock.
      A Block is a logical concept: a block contains multiple buckets, each bucket contains multiple slots, and each slot corresponds one-to-one with a file_id, which is unique within the block. A BasePhysicalBlock is a concept at the level of the physical disk: one logical block is backed by one or more BasePhysicalBlocks, depending on how fragmented the disk is. The sketch below illustrates the bucket/slot lookup.
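
      To make the bucket/slot idea concrete, here is a minimal sketch of how a file_id could be located inside a block's index, assuming hash buckets with slot chaining through a next pointer. The layout, the reserved slot 0, and the helper name are my own illustration; the real index lives in IndexHeaderV2 and FileInfoV2, shown in section 2.3.

    // Illustrative sketch only: locate a file_id in a block index by hashing
    // into a bucket and following a slot chain. Slot 0 is reserved so that 0
    // can mean "no slot"; these conventions are assumptions for this sketch.
    #include <stdint.h>
    #include <vector>

    struct SlotSketch
    {
      uint64_t id_;    // file id stored in this slot (0 == empty)
      uint16_t next_;  // next slot in the same bucket chain (0 == end of chain)
    };

    // slots[1 .. bucket_size] are the bucket heads; overflow slots follow them
    // and are linked through next_.
    static int find_slot(const std::vector<SlotSketch>& slots,
                         const uint16_t bucket_size, const uint64_t file_id)
    {
      uint16_t slot = static_cast<uint16_t>(1 + file_id % bucket_size);
      while (slot != 0 && slots[slot].id_ != file_id)
        slot = slots[slot].next_;
      return slot;  // 0 means the file_id is not in this block
    }
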
 2.2 About Block
      Each block_id identifies a Block; logically, TFS manages file content in units of blocks. block_id_ is globally unique across the cluster and is generated and managed solely by the nameserver, while file_id_ is unique only within its block, so the tuple <block_id_, file_id_> locates any piece of user data in the cluster.
      In addition, the block size is essentially fixed by configuration, 64 MB by default. Unlike HDFS, TFS is aimed mainly at storing small files, so data files tend to be small; when a large file does need to be stored, it is split into multiple FileSegments. The retrieval chain when reading file data is therefore: FileSegment --> LogicBlock --> FileInfoV2 --> BasePhysicalBlock.
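
      The chain above can be pictured with a small self-contained model. This is a hypothetical sketch only: the real entry points are BlockManager::read, LogicBlock::read and DataHandle::pread in section 3.2, and the struct names here only loosely mirror FileInfoV2 and BasePhysicalBlock.

    // Hypothetical model of the read-path resolution: <block_id, file_id, offset>
    // is mapped to a physical block id plus an offset inside that block file.
    #include <stdint.h>
    #include <map>
    #include <vector>

    struct FileEntrySketch   { int32_t offset_; int32_t size_; };  // ~ FileInfoV2
    struct PhysSegmentSketch { int32_t physical_block_id_; int32_t start_; int32_t end_; };  // ~ BasePhysicalBlock

    struct LogicBlockSketch
    {
      std::map<uint64_t, FileEntrySketch> index_;       // file_id -> location inside the logic block
      std::vector<PhysSegmentSketch> physical_blocks_;  // concatenated physical segments
    };

    static bool resolve(const std::map<uint64_t, LogicBlockSketch>& blocks,
                        const uint64_t block_id, const uint64_t file_id, const int32_t read_offset,
                        int32_t& physical_block_id, int32_t& physical_offset)
    {
      std::map<uint64_t, LogicBlockSketch>::const_iterator bit = blocks.find(block_id);
      if (blocks.end() == bit)
        return false;  // ~ EXIT_NO_LOGICBLOCK_ERROR
      std::map<uint64_t, FileEntrySketch>::const_iterator fit = bit->second.index_.find(file_id);
      if (bit->second.index_.end() == fit)
        return false;  // file not found in this block

      // offset of the requested byte inside the logic block's data area
      int32_t block_offset = fit->second.offset_ + read_offset;
      for (size_t i = 0; i < bit->second.physical_blocks_.size(); ++i)
      {
        const PhysSegmentSketch& seg = bit->second.physical_blocks_[i];
        const int32_t seg_length = seg.end_ - seg.start_;
        if (block_offset < seg_length)
        {
          physical_block_id = seg.physical_block_id_;
          physical_offset = seg.start_ + block_offset;  // ~ choose_physic_block
          return true;
        }
        block_offset -= seg_length;  // move on to the next physical segment
      }
      return false;
    }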

 2.3 Structure Definitions
    The corresponding class and struct definitions are as follows:

    // Information carried in a user request: block, file, and byte range
    struct FileSegment
    {
      uint64_t block_id_;
      uint64_t file_id_;
      int32_t offset_;
      int32_t length_;
    };
    
    struct BlockIndex
    {
      uint64_t logic_block_id_;
      int32_t  physical_block_id_:20;//<=1048575
      int32_t  physical_file_name_id_:20;//<=1048575 number 1~1048575
      int32_t  next_index_:20;//<=1048575
      int32_t  prev_index_:20;//<=1048575
      int8_t   index_:7;// 0 ~36(0: main block, 1~36: ext block)
      int8_t   status_:2;//0: uncomplete, 1: complete
      int8_t   split_flag_:2;//0: unsplit, 1:split
      int8_t   split_status_:2;//0: split uncomplete, 1: split complete
      int8_t   reserve_:3;//reserve
    };
    
    struct FileInfoV2 //30
    {
      uint64_t id_; //file id
      int32_t  offset_; //offset in block file
      int32_t  size_:28;// file size
      int8_t   status_:4;//delete flag
      uint32_t crc_; // checksum
      int32_t  modify_time_;//modify time
      int32_t create_time_; // create time
      uint16_t next_;      //next index
    };
 
    struct IndexHeaderV2
    {
      common::BlockInfoV2 info_;//56
      ThroughputV2 throughput_;//72
      int32_t used_offset_;//12 * 4 = 48
      int32_t avail_offset_;
      int32_t marshalling_offset_;
      uint32_t seq_no_;
      union
      {
        uint16_t file_info_bucket_size_;
        uint16_t index_num_;
      };
      uint16_t used_file_info_bucket_size_;
      int8_t  max_index_num_;
      int8_t  reserve_[27];
    };
    
    class BasePhysicalBlock : public GCObject
    {
      public:
        int32_t physical_block_id_;
        int32_t start_;//the data start offset of this block file
        int32_t end_;//the data end offset of this block file
        FileOperation file_op_;  // file I/O handle bound to this block file's path
    };
    
    class LogicBlock
    {
        BaseIndexHandle* index_handle_;
        DataHandle data_handle_;
        std::vector<PhysicalBlock*> physical_block_list_; //the physical block list of this logic block
    };
    
    class LogicBlockManager
    {
        common::TfsSortedVector<BaseLogicBlock*, LogicBlockIdCompare> logic_blocks_;
    };
    
    class PhysicalBlockManager
    {
        common::TfsSortedVector<BasePhysicalBlock*,PhysicalBlockIdCompare> physical_blocks_;
        common::TfsSortedVector<BasePhysicalBlock*,PhysicalBlockIdCompare> alloc_physical_blocks_; 
    };
    
    class BlockManager
    {    
        LogicBlockManager logic_block_manager_;
        PhysicalBlockManager physical_block_manager_;
        GCObjectManager   gc_manager_;
        mutable common::RWLock mutex_;
    };

3. Read Flow

   3.1 Client Side

    int64_t TfsClientImplV2::read(const int fd, void* buf, const int64_t count)
    {
      int64_t ret = TFS_SUCCESS;
      if ((fd < 0) || (NULL == buf) || (count < 0))
      {
        ret = EXIT_PARAMETER_ERROR;
      }
      else
      {
        TfsFile* tfs_file = get_file(fd);
        if (NULL == tfs_file)
        {
          ret = EXIT_INVALIDFD_ERROR;
        }
        else
        {
          ret = tfs_file->read(buf, count);
          tfs_file->get_session()->update_stat(ST_READ, ret > 0);
        }
      }
      return ret;
    }

    
    uint64_t File::get_read_ds() const
    {
      uint64_t server_id = 0;
      if (ds_.size() > 0)
      {
        server_id = ds_[read_index_%ds_.size()];
      }
      return server_id;
    }
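
    get_read_ds picks a replica with read_index_ modulo the replica count, which spreads reads across copies and lets the client move on to the next replica after a failure. Below is a minimal sketch of that retry idea, assuming the caller owns the index and a read_once callback; the loop is my illustration, not a quote of the client code.

    // Sketch: round-robin replica selection with failover. read_once(server_id)
    // is assumed to return bytes read (>= 0) or a negative error code.
    #include <stdint.h>
    #include <vector>

    template <typename ReadOnce>
    int64_t read_with_failover(const std::vector<uint64_t>& ds, int32_t& read_index,
                               ReadOnce read_once)
    {
      int64_t ret = -1;
      for (size_t retry = 0; retry < ds.size() && ret < 0; ++retry)
      {
        const uint64_t server_id = ds[read_index % ds.size()];  // same choice as get_read_ds
        ret = read_once(server_id);
        if (ret < 0)
          ++read_index;  // advance to the next replica and retry
      }
      return ret;
    }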

 3.2 Server Side

    // a. Read data: the dataserver entry point for a client read
    int ClientRequestServer::read_file(ReadFileMessageV2* message)
    {
      TIMER_START();
      uint64_t block_id = message->get_block_id();
      uint64_t attach_block_id = message->get_attach_block_id();
      uint64_t file_id = message->get_file_id();
      int32_t length = message->get_length();
      int32_t offset = message->get_offset();
      int8_t flag = message->get_flag();
      uint64_t peer_id = message->get_connection()->getPeerId();
      const FamilyInfoExt& family_info = message->get_family_info();

      int ret = ((INVALID_BLOCK_ID == block_id) || (INVALID_FILE_ID == file_id) ||
          (offset < 0) || (length <= 0)) ? EXIT_PARAMETER_ERROR : TFS_SUCCESS;

      if (TFS_SUCCESS == ret)
      {
        FileInfoV2 file_info;
        file_info.id_ = file_id;
        if (INVALID_FAMILY_ID == family_info.family_id_)
        {
          ret = get_block_manager().stat(file_info,
              FORCE_STAT, block_id, message->get_attach_block_id());
        }
        else
        {
          ret = get_data_helper().stat_file_degrade(block_id,
              file_id, FORCE_STAT, family_info, file_info);
        }

        if (TFS_SUCCESS == ret)
        {
          // truncate read length
          if (offset + length > file_info.size_)
          {
            length = file_info.size_ - offset;
          }

          ret = (length < 0) ? EXIT_PARAMETER_ERROR: TFS_SUCCESS;
          if (TFS_SUCCESS == ret)
          {
            ReadFileRespMessageV2* resp_msg = new (std::nothrow) ReadFileRespMessageV2();
            assert(NULL != resp_msg);

            // if length is truncated to 0
            // reply a packet with length 0 to tell client that it already reach to the end of file
            if (0 == length)
            {
              resp_msg->set_length(0);
            }
            else
            {
              char* buffer = resp_msg->alloc_data(length);
              assert(NULL != buffer);
              if (INVALID_FAMILY_ID == family_info.family_id_)
              {
                TimeStat timer;
                timer.start();
                ret = get_block_manager().read(buffer,
                    length, offset, file_id, flag, block_id, attach_block_id);
                timer.end();
                ret = (ret < 0) ? ret: TFS_SUCCESS;

                // log slow read request
                if (TFS_SUCCESS == ret && timer.duration() > SYSPARAM_DATASERVER.max_io_warn_time_)
                {
                  TBSYS_LOG(WARN, "slow read request. blockid: %"PRI64_PREFIX"u, "
                      "attach_blockid: %"PRI64_PREFIX"u, fileid: %"PRI64_PREFIX"u, cost: %"PRI64_PREFIX"d",
                      block_id, attach_block_id, file_id, timer.duration());
                }
              }
              else
              {
                ret = get_data_helper().read_file_degrade(block_id,
                    file_info, buffer, length, offset, flag, family_info);
              }
            }

            if (TFS_SUCCESS != ret)
            {
              // upper layer will reply error packet to client
              tbsys::gDelete(resp_msg);
            }
            else
            {
              // readv2 support
              if (flag & READ_DATA_OPTION_WITH_FINFO)
              {
                resp_msg->set_file_info(file_info);
              }
              ret = message->reply(resp_msg);
              if (TFS_SUCCESS == ret)
              {
                get_traffic_control().rw_traffic_stat(false, length);
              }
            }
          }
        }
      }

      TIMER_END();
      return ret;
    }
    
    // b. BlockManager: read into buf from the logic block identified by logic_block_id
    int BlockManager::read(char* buf, int32_t& nbytes, const int32_t offset,
        const uint64_t fileid, const int8_t flag, const uint64_t logic_block_id, const uint64_t attach_logic_block_id)
    {
      int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0 && INVALID_FILE_ID != fileid && flag >= 0
          && INVALID_BLOCK_ID != logic_block_id && INVALID_BLOCK_ID != attach_logic_block_id) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
      if (TFS_SUCCESS == ret)
      {
        BaseLogicBlock* logic_block = get(logic_block_id);
        ret = (NULL != logic_block) ? TFS_SUCCESS  : EXIT_NO_LOGICBLOCK_ERROR;
        if (TFS_SUCCESS == ret)
        {
          ret = logic_block->read(buf, nbytes, offset, fileid, flag, attach_logic_block_id);
        }
      }
      return ret;
    }
    
    // c. Read the actual data from the physical blocks; one logic block may span multiple physical blocks
    int DataHandle::pread(char* buf, const int32_t nbytes, const int32_t offset)
    {
      int32_t ret = (NULL != buf && nbytes > 0 && offset >= 0) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
      if (TFS_SUCCESS == ret)
      {
        PhysicalBlock* physical_block = NULL;
        int32_t inner_offset = 0, length = nbytes, current_offset = offset;
        int32_t inner_length = 0, mem_offset = 0,  total_read_length = 0;
        while ((TFS_SUCCESS == ret) && (current_offset < (offset + nbytes)))
        {
          inner_length  = length;
          ret = logic_block_.choose_physic_block(physical_block, inner_length, inner_offset, current_offset);
          if (TFS_SUCCESS == ret)
          {
            length = std::min(length, inner_length);
            ret = physical_block->pread((buf + mem_offset), length, inner_offset);
            ret = ret >= 0 ? TFS_SUCCESS : ret;
            if (TFS_SUCCESS == ret)
            {
              current_offset += length;
              mem_offset     += length;
              total_read_length += length;
              length         =  nbytes - total_read_length;
            }
          }
        }
      }
      return TFS_SUCCESS == ret ? nbytes : ret;
    }

 4. Write Flow

   4.1 Client Side

    // Client-side write
    int TfsFile::write_ex(const char* buf, int64_t count)
    {
      int ret = TFS_SUCCESS;
      uint64_t server = 0;
      tbnet::Packet* resp_msg = NULL;
      NewClient* client = NewClientManager::get_instance().create_client();
      if (NULL == client)
      {
        ret = EXIT_CLIENT_MANAGER_CREATE_CLIENT_ERROR;
        TBSYS_LOG(WARN, "create new client fail.");
      }
      else
      {
        WriteFileMessageV2 msg;
        msg.set_block_id(fsname_.get_block_id());
        msg.set_attach_block_id(fsname_.get_block_id());
        msg.set_file_id(fsname_.get_file_id());
        msg.set_offset(file_.offset_);
        msg.set_length(count);
        msg.set_lease_id(file_.lease_id_);
        msg.set_master_id(file_.get_write_ds());
        msg.set_version(file_.version_);
        msg.set_flag(file_.opt_flag_);
        msg.set_ds(file_.ds_);
        msg.set_data(buf);
        if (file_.has_family())
        {
          msg.set_family_info(file_.family_info_);
        }
        server = file_.get_write_ds();
        ret = send_msg_to_server(server, client, &msg, resp_msg, ClientConfig::wait_timeout_);
        // handling of resp_msg and the function's return value are omitted in this excerpt
      }
    }
    
    uint64_t File::get_write_ds() const
    {
      uint64_t server_id = 0;
      if (ds_.size() > 0)
      {
        server_id = ds_[0];
      }
      return server_id;
    }

  4.2 Server Side

   a. NameServer logic

	BlockCollect* LayoutManager::add_new_block_helper_create_by_system_(uint64_t block_id, ServerCollect* server, const time_t now)
	{
      BlockCollect* block = NULL;
      int32_t ret =  (0 == block_id) ? TFS_SUCCESS : EXIT_PARAMETER_ERROR;
      if (TFS_SUCCESS == ret)
      {
        uint64_t result[MAX_REPLICATION_NUM];
        ArrayHelper<uint64_t> helper(MAX_REPLICATION_NUM, result);
        uint64_t news[MAX_REPLICATION_NUM];
        ArrayHelper<uint64_t> news_helper(MAX_REPLICATION_NUM, news);
        if (NULL != server)
          helper.push_back(server->id());

        block_id = get_alive_block_id_(false);
        ret = (INVALID_BLOCK_ID == block_id) ? EXIT_BLOCK_ID_INVALID_ERROR : TFS_SUCCESS;
        if (TFS_SUCCESS == ret)
        {
          //add block collect object
          block = get_block_manager().insert(block_id, now, true);
          ret = (NULL != block) ? TFS_SUCCESS : EXIT_NO_BLOCK;
        }

        if (TFS_SUCCESS == ret)
        {
          int32_t count = SYSPARAM_NAMESERVER.max_replication_ - helper.get_array_index();
          if (count > 0)
          {
            get_server_manager().choose_create_block_target_server(helper, news_helper, count);
          }
          BlockCollect* pobject = NULL;
          ret = !helper.empty() ? TFS_SUCCESS : EXIT_CHOOSE_CREATE_BLOCK_TARGET_SERVER_ERROR;
          if (TFS_SUCCESS == ret)//add block collect object successful
          {
            ret = add_new_block_helper_send_msg_(block_id, helper);
            if (TFS_SUCCESS == ret)
            {
              //build relation
              ret = add_new_block_helper_build_relation_(block, helper, now);
              if (TFS_SUCCESS == ret)
              {
                add_new_block_helper_write_log_(block_id, helper, now);
              }
            }//end send message to dataserver successful
            else
            {
              get_block_manager().remove(pobject, block_id);//rollback
            }
          }
          else
          {
            get_block_manager().remove(pobject, block_id);//rollback
          }
          get_gc_manager().insert(pobject, now);
        }
      }//end if (TFS_SUCCESS == ret) check parameter
      return TFS_SUCCESS == ret ? block : NULL;
    }
	
	// a. Generate a block_id (on the nameserver)
	uint64_t BlockIdFactory::generation(const bool verify)
    {
      mutex_.lock();
      ++count_;
      uint64_t id = ++global_id_;
      assert(id <= MAX_BLOCK_ID);
      bool flush_flag = false;
      if (count_ >= SKIP_BLOCK_NUMBER)
      {
        flush_flag = true;
        count_ = 0;
      }
      mutex_.unlock();
      int32_t ret = common::TFS_SUCCESS;
      if (flush_flag)
      {
        ret = flush_(id);
        if (common::TFS_SUCCESS != ret)
        {
          TBSYS_LOG(WARN, "update global block id failed, id: %"PRI64_PREFIX"u, ret: %d", id, ret);
        }
      }
      if (common::TFS_SUCCESS == ret)
      {
        if (verify)
          id |= 0x8000000000000000;
      }
      return id;
    }
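
    When verify is true, generation sets the most significant bit of the id, so check/verify block ids can be told apart from ordinary data block ids by that bit alone. Below is a small sketch of testing and stripping the flag; only the 0x8000000000000000 constant comes from the code above, the helper names are mine.

    // Helpers around the flag bit set by BlockIdFactory::generation(verify == true).
    #include <stdint.h>

    static const uint64_t BLOCK_ID_VERIFY_FLAG = 0x8000000000000000ULL;

    static bool is_verify_block_id(const uint64_t block_id)
    {
      return 0 != (block_id & BLOCK_ID_VERIFY_FLAG);
    }

    static uint64_t raw_block_id(const uint64_t block_id)
    {
      return block_id & ~BLOCK_ID_VERIFY_FLAG;  // the underlying sequence number
    }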
	
	// b. Choose target dataservers for the new block
	int ServerManager::choose_create_block_target_server(common::ArrayHelper<uint64_t>& result,
        common::ArrayHelper<uint64_t>& news, const int32_t count) const
    {
      news.clear();
      std::set<uint32_t> lans;
      get_lans_(lans, result);
      ServerCollect* pserver = NULL;
      int32_t index = count;
      while (index-- > 0)
      {
        pserver = NULL;
        if (TFS_SUCCESS == choose_replciate_random_choose_server_base_lock_(pserver, result, lans))
        {
          assert(NULL != pserver);
          news.push_back(pserver->id());
          result.push_back(pserver->id());
          uint32_t lan =  Func::get_lan(pserver->id(), SYSPARAM_NAMESERVER.group_mask_);
          lans.insert(lan);
        }
      }
      return count - news.get_array_index();
    }
	
	int ServerManager::choose_replciate_random_choose_server_base_lock_(ServerCollect*& result,
        const common::ArrayHelper<uint64_t>& except, const std::set<uint32_t>& lans) const
    {
      result = NULL;
      RWLock::Lock lock(rwmutex_, READ_LOCKER);
      int64_t size = std::min(servers_.size(), SYSPARAM_NAMESERVER.choose_target_server_random_max_nums_);
      int64_t index = size, random_index = 0;
      while (index-- > 0 && NULL == result)
      {
        random_index = random() % servers_.size();
        ServerCollect* pserver = servers_.at(random_index);
        assert(NULL != pserver);
        bool valid  = ((!pserver->is_full()) && (!except.exist(pserver->id()))
                      && (DATASERVER_DISK_TYPE_FULL == pserver->get_disk_type()));
        if (valid && !lans.empty())
        {
          uint32_t lan =  Func::get_lan(pserver->id(), SYSPARAM_NAMESERVER.group_mask_);
          valid = lans.find(lan) == lans.end();
        }

        if (valid)
        {
          result = pserver;
        }
      }
      return (NULL != result) ? TFS_SUCCESS : EXIT_NO_DATASERVER;
	}

   b. DataServer logic

     The dataserver handles a write in three steps:

  1. prepare_op: validate the lease_id and prepare the block space;
  2. forward_op: asynchronously forward the data to the slave replicas (data_slaves) while writing it to the local master replica;
  3. write_file_callback: verify that every replica wrote successfully, check version consistency across the replicas, and reply with the result.

    int ClientRequestServer::write_file(WriteFileMessageV2* message)
    {
      TIMER_START();
      uint64_t block_id = message->get_block_id();
      uint64_t attach_block_id = message->get_attach_block_id();
      uint64_t file_id = message->get_file_id();
      uint64_t lease_id = message->get_lease_id();
      int32_t offset = message->get_offset();
      int32_t length = message->get_length();
      VUINT64 servers = message->get_ds(); // will copy vector
      const char* data = message->get_data();
      uint64_t master_id = message->get_master_id();
      uint64_t peer_id = message->get_connection()->getPeerId();
      int32_t flag = message->get_flag();
      const FamilyInfoExt& family_info = message->get_family_info();
      int64_t family_id = family_info.family_id_;
      DsRuntimeGlobalInformation& ds_info = DsRuntimeGlobalInformation::instance();
      bool is_master = (master_id == ds_info.information_.id_);
      int32_t version = is_master ? -1 : message->get_version();  // master won't check version

      bool prepare_ok = false;
      int ret = TFS_SUCCESS;
      if ((NULL == data) || (offset < 0) || (length <= 0))
      {
        ret = EXIT_PARAMETER_ERROR;
      }

      // tbnet already receive this packet from network
      get_traffic_control().rw_traffic_stat(true, length);

      if (TFS_SUCCESS == ret)
      {
        if (is_master && INVALID_LEASE_ID == lease_id)
        {
          // first write to master
          ret = get_op_manager().prepare_op(attach_block_id,
              file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
          if (TFS_SUCCESS == ret)
          {
            // callback & slave will use
            if (INVALID_BLOCK_ID == block_id) // data block
            {
              block_id = attach_block_id;
            }
            message->set_block_id(block_id);
            message->set_attach_block_id(attach_block_id);
            message->set_file_id(file_id);
            message->set_lease_id(lease_id);
            message->set_flag(TFS_FILE_FIRST_WRITE_TO_SLAVE);
          }
        }
        else if (!is_master && (flag & TFS_FILE_FIRST_WRITE_TO_SLAVE))
        {
          // first write to slave
          ret = get_op_manager().prepare_op(attach_block_id,
              file_id, lease_id, OP_TYPE_WRITE, is_master, family_info, servers);
        }
        else
        {
          // not the first write, just reset operation
          ret = get_op_manager().reset_op(attach_block_id, file_id, lease_id, servers);
        }

        // async op prepare work finished
        prepare_ok = (TFS_SUCCESS == ret);
      }

      // post request to slaves
      if ((TFS_SUCCESS == ret) && (servers.size() > 1U) && is_master)
      {
        ret = get_op_manager().forward_op(message, attach_block_id, family_id, servers);
      }

      // local write file
      BlockInfoV2 local;
      if (TFS_SUCCESS == ret)
      {
        ret = get_op_manager().write_file(block_id, attach_block_id,
            file_id, lease_id, data, length, offset, version, local);
        get_op_manager().update_op(attach_block_id, file_id, lease_id, ret, local);
      }

      // master check if all successful
      // slave response to master
      if (is_master)
      {
        if (prepare_ok)
        {
          write_file_callback(message);
        }
        else
        {
          message->reply(new StatusMessage(ret, "master prepare op fail"));
        }
      }
      else
      {
        SlaveDsRespMessage* resp_msg = new (std::nothrow) SlaveDsRespMessage();
        assert(NULL != resp_msg);
        resp_msg->set_server_id(ds_info.information_.id_);
        resp_msg->set_block_info(local);
        resp_msg->set_status(ret);
        message->reply(resp_msg);

        // slave write fail, release op
        if (TFS_SUCCESS != ret)
        {
          get_op_manager().release_op(attach_block_id, file_id, lease_id);
        }
      }

      TIMER_END();

      TBSYS_LOG_DW(ret, "write file %s, blockid: %"PRI64_PREFIX"u, attach_blockid: %"PRI64_PREFIX"u, "
          "fileid: %"PRI64_PREFIX"u, leaseid: %"PRI64_PREFIX"u, length: %d, offset: %d, "
          "version: %d, role: %s, peer ip: %s, cost: %"PRI64_PREFIX"d, ret: %d",
          (TFS_SUCCESS == ret) ? "success" : "fail", block_id, attach_block_id, file_id, lease_id,
          length, offset, version, is_master ? "master": "slave",
          tbsys::CNetUtil::addrToString(peer_id).c_str(), TIMER_DURATION(), ret);

      return TFS_SUCCESS;
    }
    
    int ClientRequestServer::write_file_callback(WriteFileMessageV2* message)
    {
      uint64_t attach_block_id = message->get_attach_block_id();
      uint64_t file_id = message->get_file_id();
      uint64_t lease_id = message->get_lease_id();
      uint32_t length = message->get_length();
      uint32_t offset = message->get_offset();
      uint64_t peer_id = message->get_connection()->getPeerId();
      OpStat op_stat;

      int ret = TFS_SUCCESS;
      bool all_finish = get_op_manager().check_op(attach_block_id, file_id, lease_id, op_stat);
      if (all_finish)
      {
        ret = op_stat.status_;
        if (TFS_SUCCESS != ret)
        {
          // req ns resolve version conflict
          if (EXIT_BLOCK_VERSION_CONFLICT_ERROR == ret)
          {
            get_op_manager().resolve_block_version_conflict(attach_block_id, file_id, lease_id);
          }
          message->reply_error_packet(TBSYS_LOG_LEVEL(WARN), ret, op_stat.error_.str().c_str());

          // if fail, close will never happen, release op, expire writable block
          get_op_manager().release_op(attach_block_id, file_id, lease_id, ret);
        }
        else
        {
          WriteFileRespMessageV2* resp_msg = new (std::nothrow) WriteFileRespMessageV2();
          assert(NULL != resp_msg);
          resp_msg->set_block_id(attach_block_id);
          resp_msg->set_file_id(file_id);
          resp_msg->set_lease_id(lease_id);
          message->reply(resp_msg);
        }

        if (TFS_SUCCESS != ret)
        {
          get_traffic_control().rw_stat(RW_STAT_TYPE_WRITE, ret, 0 == offset, length);
        }

        TBSYS_LOG_IW(ret, "WRITE file %s, ret: %d. blockid: %"PRI64_PREFIX"u, "
            "fileid: %"PRI64_PREFIX"u, leaseid: %"PRI64_PREFIX"u, "
            "length: %d, offset: %d, peer ip: %s, cost: %"PRI64_PREFIX"d",
            (TFS_SUCCESS == ret) ? "success": "fail", ret, attach_block_id,
            file_id, lease_id, length, offset,
            tbsys::CNetUtil::addrToString(peer_id).c_str(), op_stat.cost_);
      }

      return TFS_SUCCESS;
    }
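
    OpManager::check_op, used above, is not quoted in this article; roughly, it decides whether every replica (master plus slaves) has reported for the <block, file, lease> operation and what the combined status is. The following is a rough sketch of that aggregation idea, with invented names, not the real OpManager code.

    // Rough sketch of the aggregation behind check_op (names are invented;
    // the real OpManager tracks this state per <block_id, file_id, lease_id>).
    #include <stdint.h>
    #include <vector>

    struct ReplicaStatusSketch
    {
      uint64_t server_id_;  // which dataserver replied
      int32_t  status_;     // 0 == success, otherwise an error code
    };

    // Returns true only once all expected replicas have reported; the overall
    // status is success only if every replica succeeded.
    static bool all_replicas_finished(const std::vector<ReplicaStatusSketch>& replies,
                                      const size_t expected, int32_t& overall_status)
    {
      if (replies.size() < expected)
        return false;
      overall_status = 0;
      for (size_t i = 0; i < replies.size(); ++i)
      {
        if (0 != replies[i].status_)
        {
          overall_status = replies[i].status_;  // first failure wins
          break;
        }
      }
      return true;
    }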

References

     http://code.taobao.org/p/tfs/src/

Original article: https://www.cnblogs.com/gisorange/p/4948749.html