The graph executor sets up all device memory in the GraphExecutor::SetupStorage method; the allocated buffers are stored in storage_pool_:
// Inside the loop over pool_entry in GraphExecutor::SetupStorage:
if (pit.linked_param.defined()) {
  // Parameters linked into the compiled module reuse their existing buffer.
  storage_pool_.push_back(pit.linked_param);
} else {
  std::vector<int64_t> shape = pit.shape;
  if (shape.size() == 1) {
    // A flat (1-D) pool entry is sized in bytes; pack into 4-byte elements.
    shape[0] = (shape[0] + 3) / 4;
  }
  Optional<String> mem_scope;
  if (!pit.scope.empty()) {
    mem_scope = String(pit.scope);
  }
  // Allocate a fresh buffer for this pool entry on its device.
  storage_pool_.push_back(NDArray::Empty(shape, pit.dtype, dev, mem_scope));
}
// Assign the pooled entries. A unified memory pool is used to simplify
// memory assignment for each node entry. The allocated memory on each device
// is mapped to this pool.
data_entry_.resize(num_node_entries());
data_alignment_.resize(num_node_entries());
for (size_t i = 0; i < data_entry_.size(); ++i) {
  int storage_id = attrs_.storage_id[i];
  ICHECK_LT(static_cast<size_t>(storage_id), storage_pool_.size());
  // Each node entry is a view into its pooled buffer, so entries sharing a
  // storage_id alias the same memory.
  data_entry_[i] = storage_pool_[storage_id].CreateView(attrs_.shape[i], vtype[i]);
  const DLTensor* tmp = data_entry_[i].operator->();
  data_alignment_[i] = details::GetDataAlignment(*tmp);
}
In my opinion, share_params is meant to save memory across multiple instances of the same model.
} else if (name == "load_params") {
  return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
    this->LoadParams(args[0].operator std::string());
  });
} else if (name == "share_params") {
  return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
    const auto& module = args[0].operator Module();
    ICHECK_EQ(module.operator->()->type_key(), std::string("GraphExecutor"));
    const auto& param_blob = args[1].operator std::string();
    dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
    this->ShareParams(dynamic_cast<const GraphExecutor&>(*module.operator->()), &strm);
  });
} else if (name == "get_input_index") {
  return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
    CHECK(String::CanConvertFrom(args[0])) << "Input key is not a string";
    *rv = this->GetInputIndex(args[0].operator String());
  });
} else if (name == "get_input_info") {
  return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
    auto [shape_info, dtype_info] = this->GetInputInfo();
    Map<String, ObjectRef> input_info;
    input_info.Set("shape", shape_info);
    input_info.Set("dtype", dtype_info);
    *rv = input_info;
  });
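For context, here is a minimal sketch of how share_params can be invoked from C++ (the names ShareWeights, mod_a, mod_b, and param_bytes are hypothetical; it assumes two GraphExecutor modules built from the same compiled model):

#include <string>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

// mod_a owns the parameters; mod_b re-binds its parameter entries to alias
// mod_a's NDArrays. param_bytes is the serialized parameter blob.
void ShareWeights(tvm::runtime::Module mod_a, tvm::runtime::Module mod_b,
                  const std::string& param_bytes) {
  tvm::runtime::PackedFunc share = mod_b.GetFunction("share_params");
  share(mod_a, param_bytes);
}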
Why not also remove the corresponding entry from storage_pool_? Then the use count of the original NDArray would drop to zero and the duplicated allocation would be freed.
@masahi
I used this method to try to run multiple instances of the same model, but memory usage did not decrease.
// Excerpt from GraphExecutor::ShareParams (the magic header and the `names`
// vector are read from the stream just above this excerpt):
uint64_t sz;
strm->Read(&sz);
size_t size = static_cast<size_t>(sz);
ICHECK(size == names.size()) << "Invalid parameters file format";
for (size_t i = 0; i < size; ++i) {
  int in_idx = GetInputIndex(names[i]);
  if (in_idx < 0) continue;
  uint32_t eid = this->entry_id(input_nodes_[in_idx], 0);
  ICHECK_LT(eid, data_entry_.size());
  ICHECK_EQ(data_entry_[eid].use_count(), 1);
  // Drop the local view and alias the other executor's parameter. Note that
  // the backing buffer in this executor's storage_pool_ is still referenced.
  data_entry_[eid] = other.GetInput(GetInputIndex(names[i]));
  ICHECK_GT(data_entry_[eid].use_count(), 1);
  const DLTensor* tmp = data_entry_[eid].operator->();
  data_alignment_[eid] = details::GetDataAlignment(*tmp);
}
this->SetupOpExecs();
}
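Along the lines suggested above, a minimal sketch of a possible tweak (not the upstream implementation; it assumes the pooled slot is used only by this parameter entry):

// Inside the ShareParams loop, right after re-binding data_entry_[eid]:
int sid = attrs_.storage_id[eid];
// Release this executor's reference to the pooled weight buffer; once no
// view references it either, its use count reaches zero and it is freed.
storage_pool_[sid] = NDArray();

The caveat is that graph memory planning may map several entries to one storage_id, so a real change would need to release a slot only after every entry mapped to it has been re-bound.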
void GraphExecutor::LinkedNDArrayDeleter(Object* container) {
  // container is the NDArray::Container which needs to get deleted.
  // The data member points to global const memory, so it does not need deleting.
  delete static_cast<NDArray::Container*>(container);
}
There might be some small changes needed in graph_executor's share_params method to also free the weight memory of multiple instances that use different batch sizes or seq_len, and with those changes it works.
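For what it's worth, a sketch of the multi-instance setup this describes (the executors vector and param_bytes are hypothetical): several executors built from the same weights but with different input shapes, all sharing the first instance's parameters:

#include <string>
#include <vector>
#include <tvm/runtime/module.h>
#include <tvm/runtime/packed_func.h>

// executors[0] owns the weights; the others alias them via share_params.
void ShareAcrossInstances(std::vector<tvm::runtime::Module>& executors,
                          const std::string& param_bytes) {
  for (size_t i = 1; i < executors.size(); ++i) {
    tvm::runtime::PackedFunc share = executors[i].GetFunction("share_params");
    // Combined with the storage_pool_ release sketched above, this would
    // also free the sharing executor's duplicated weight buffers.
    share(executors[0], param_bytes);
  }
}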