From 2802086a76a9b0c1572ec6c3414bb369b7c33f6b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 12 Feb 2025 13:19:22 +0800 Subject: [PATCH] update dsv_io --- example/text_io_ex.cpp | 24 +-- lib/io/dsv_io.cpp | 359 ++++++++++++++----------------------- lib/io/dsv_io.h | 54 +++--- out.txt | 53 ------ tool/dsviewer/dsviewer.cpp | 20 +-- 5 files changed, 182 insertions(+), 328 deletions(-) delete mode 100644 out.txt diff --git a/example/text_io_ex.cpp b/example/text_io_ex.cpp index 930579e..767abdd 100644 --- a/example/text_io_ex.cpp +++ b/example/text_io_ex.cpp @@ -32,27 +32,19 @@ using namespace gctl; int main(int argc, char const *argv[]) try { - dsv_io tc, tout; + dsv_io tc; tc.delimeter('|'); tc.head_number(1); tc.load_text("tmp/world_data", ".txt", ColHead|RowHead); - tc.info(AttInfo|HeadInfo|TagInfo|ColInfo|RowInfo); + tc.info(AttInfo|HeadInfo|TagInfo|ColInfo); - //tc.set_column_type(Int, "IndepYear_n"); - //tc.filt_column("IndepYear_n < 0", {"IndepYear_n"}, {"Name_s", "Population_n", "GNP_n"}, tout); + tc.filter("America", "Continent_s", ColHead); + tc.save_text("out"); - tc.filt_column("America", "Continent_s", {"Name_s", "Population_n", "GNP_n"}, tout); - //tc.match_column("America", "Continent_s", {}, tout); - - //tout.add_column("GNP_n2", "Population_n"); - //array GNP_n2(tout.row_number(), 1000.0); - //tout.fill_column(GNP_n2, "GNP_n2"); - - int lr_id = tout.add_row(); - tout.fill_row(array{"Asia", "China", "14000000", "1949"}, lr_id); - - tout.delimeter('|'); - tout.save_text("out"); + dsv_io tc2 = tc.export_table(); + tc2.head_records(tc.head_records()); + tc2.delimeter('|'); + tc2.save_text("out2"); /* geodsv_io tc; diff --git a/lib/io/dsv_io.cpp b/lib/io/dsv_io.cpp index e335c88..743c28a 100644 --- a/lib/io/dsv_io.cpp +++ b/lib/io/dsv_io.cpp @@ -342,28 +342,23 @@ void gctl::dsv_io::save_text(std::string filename, std::string file_exten) outfile << "# " << annotates_[i] << std::endl; } - // 探测是否有行头 - bool col_st = 1; for (int i = 0; i <= row_num_; i++) { - if (table_[i][0].out_ok_ && table_[i][0].str_ != "") + for (size_t j = 0; j <= col_num_; j++) { - col_st = 0; - break; + if (table_[i][j].out_ok_ && table_[i][j].str_!= "") + { + outfile << table_[i][j].str_; + for (size_t k = j + 1; k <= col_num_; k++) + { + if (table_[i][k].out_ok_) outfile << deli_sym_ << table_[i][k].str_; + } + outfile << std::endl; + break; + } } } - for (int i = 0; i <= row_num_; i++) - { - // 单独处理第一列 即行头 - outfile << table_[i][col_st].str_; - for (int j = col_st + 1; j <= col_num_; j++) - { - if (table_[i][j].out_ok_) outfile << deli_sym_ << table_[i][j].str_; - } - outfile << std::endl; - } - outfile.close(); return; } @@ -391,6 +386,47 @@ void gctl::dsv_io::init_table(int row, int col) return; } +gctl::dsv_io gctl::dsv_io::export_table(bool ignore_disabled) +{ + std::vector str_line, row_names, col_names; + std::vector > str_table; + + std::string cor_name = table_[0][0].str_; + for (size_t j = 1; j <= col_num_; j++) + { + if (table_[0][j].out_ok_ || !ignore_disabled) + col_names.push_back(table_[0][j].str_); + } + + for (size_t i = 1; i <= row_num_; i++) + { + if (table_[i][0].out_ok_ || !ignore_disabled) + { + str_line.clear(); + + for (size_t j = 1; j <= col_num_; j++) + { + if (table_[i][j].out_ok_ || !ignore_disabled) + str_line.push_back(table_[i][j].str_); + } + + str_table.push_back(str_line); + row_names.push_back(table_[i][0].str_); + } + } + + dsv_io out_table; + out_table.init_table(str_table); + out_table.row_names(row_names, {}, cor_name); + out_table.column_names(col_names); + + destroy_vector(row_names); + destroy_vector(col_names); + destroy_vector(str_line); + destroy_vector(str_table); + return out_table; +} + void gctl::dsv_io::info(int t) { if (t & HeadInfo) @@ -521,6 +557,19 @@ int gctl::dsv_io::name_index(std::string name, bool iter_row) } } +void gctl::dsv_io::table_output(switch_type_e s) +{ + for (size_t i = 0; i <= row_num_; i++) + { + for (size_t j = 0; j <= col_num_; j++) + { + if (s == Enable) table_[i][j].out_ok_ = true; + else table_[i][j].out_ok_ = false; + } + } + return; +} + void gctl::dsv_io::column_output(int idx, switch_type_e s) { if (idx > col_num_ || idx <= 0) @@ -627,162 +676,63 @@ int gctl::dsv_io::add_row(std::string name, std::string id_name) return add_row(name, name_index(id_name, true)); } -void gctl::dsv_io::filt_column(std::string cnd_str, std::string cnd_col, - const std::vector &out_col, dsv_io &out_table) +void gctl::dsv_io::filter(std::string cnd_str, std::string cnd_tar, table_headtype_e thead) { - int idx = name_index(cnd_col); - if (idx < 0) throw std::runtime_error("[gctl::dsv_io::] Invalid column index or name."); + int idx; + if (thead == RowHead) idx = name_index(cnd_tar, true); + else if (thead == ColHead) idx = name_index(cnd_tar); + else throw std::runtime_error("[gctl::dsv_io::filter] Invalid table head type."); - array odx; - bool out_row = false; - if (out_col.empty()) out_row = true; - else - { - odx.resize(out_col.size()); - for (size_t i = 0; i < out_col.size(); i++) - { - odx[i] = name_index(out_col[i]); - if (odx[i] < 0) throw std::runtime_error("[gctl::dsv_io::] Invalid column index or name."); - } - } + if (idx < 0) throw std::runtime_error("[gctl::dsv_io::filter] Invalid row/column index or name."); std::smatch ret; std::regex pat(cnd_str); - std::vector str_line, row_names; - std::vector > str_table; - - for (size_t i = 1; i <= row_num_; i++) + if (thead == RowHead) // cnd_tar是行头 此时为按列过滤 { - if (regex_search(table_[i][idx].str_, ret, pat)) + for (size_t i = 1; i <= col_num_; i++) { - if (out_row) + if (!regex_search(table_[idx][i].str_, ret, pat)) { - str_line.clear(); - - for (size_t j = 1; j <= col_num_; j++) - { - str_line.push_back(table_[i][j].str_); - } - - str_table.push_back(str_line); + column_output(i, Disable); } - else - { - str_line.clear(); - str_line.push_back(table_[i][idx].str_); - - for (size_t j = 0; j < odx.size(); j++) - { - str_line.push_back(table_[i][odx[j]].str_); - } - - str_table.push_back(str_line); - } - - row_names.push_back(table_[i][0].str_); } } - - out_table.init_table(str_table); - - std::vector io_col; - if (out_row) + else // cnd_tar是列头 此时为按行过滤 { - column_names(io_col); - out_table.cell(table_[0][0].str_, 0, 0); - } - else - { - io_col.push_back(cnd_col); - - for (size_t j = 0; j < odx.size(); j++) + for (size_t i = 1; i <= row_num_; i++) { - io_col.push_back(out_col[j]); + if (!regex_search(table_[i][idx].str_, ret, pat)) + { + row_output(i, Disable); + } } } - - out_table.column_names(io_col); - out_table.row_names(row_names, {}, table_[0][0].str_); - - destroy_vector(row_names); - destroy_vector(io_col); - destroy_vector(str_line); - destroy_vector(str_table); return; } -void gctl::dsv_io::filt_column(rowbool_func_t func, const std::vector &out_col, dsv_io &out_table) +void gctl::dsv_io::filter(linebool_func_t func, table_headtype_e thead) { - array odx; - bool out_row = false; - if (out_col.empty()) out_row = true; - else + if (thead == RowHead) { - odx.resize(out_col.size()); - for (size_t i = 0; i < out_col.size(); i++) + for (size_t i = 1; i <= row_num_; i++) { - odx[i] = name_index(out_col[i]); - if (odx[i] < 0) throw std::runtime_error("[gctl::dsv_io::] Invalid column index or name."); + if (!func(table_[i])) row_output(i, Disable); } } - - std::vector str_line, row_names; - std::vector > str_table; - - for (size_t i = 1; i <= row_num_; i++) + else if (thead == ColHead) { - if (func(table_[i])) + std::vector col_cell(row_num_); + for (size_t i = 1; i <= col_num_; i++) { - if (out_row) + for (size_t j = 1; j < row_num_; j++) { - str_line.clear(); - - for (size_t j = 1; j <= col_num_; j++) - { - str_line.push_back(table_[i][j].str_); - } - - str_table.push_back(str_line); + col_cell[j] = table_[j][i]; } - else - { - str_line.clear(); - - for (size_t j = 0; j < odx.size(); j++) - { - str_line.push_back(table_[i][odx[j]].str_); - } - - str_table.push_back(str_line); - } - - row_names.push_back(table_[i][0].str_); + + if (!func(col_cell)) column_output(i, Disable); } } - - out_table.init_table(str_table); - - std::vector io_col; - if (out_row) - { - column_names(io_col); - out_table.cell(table_[0][0].str_, 0, 0); - } - else - { - for (size_t j = 0; j < odx.size(); j++) - { - io_col.push_back(out_col[j]); - } - } - - out_table.column_names(io_col); - out_table.row_names(row_names, {}, table_[0][0].str_); - - destroy_vector(row_names); - destroy_vector(io_col); - destroy_vector(str_line); - destroy_vector(str_table); + else throw std::runtime_error("[gctl::dsv_io::filter] Invalid table head type."); return; } @@ -834,41 +784,45 @@ void gctl::dsv_io::cal_column(std::string expr_str, const std::vector &cnd_col, - const std::vector &out_col, dsv_io& out_table) +void gctl::dsv_io::filter(std::string cnd_str, const std::vector &cnd_tars, table_headtype_e thead) { - array idx(cnd_col.size()); - for (size_t i = 0; i < cnd_col.size(); i++) + array idx(cnd_tars.size()); + if (thead == RowHead) { - idx[i] = name_index(cnd_col[i]); - - if (idx[i] < 0) throw std::runtime_error("[gctl::dsv_io::] Invalid column index or name."); - - if (table_[0][idx[i]].type_ != Int && table_[0][idx[i]].type_ != Float) + for (size_t i = 0; i < cnd_tars.size(); i++) { - throw std::runtime_error("[gctl::dsv_io] Invalid column type for numerical calculating."); + idx[i] = name_index(cnd_tars[i], true); + + if (idx[i] <= 0 || idx[i] > row_num_) throw std::runtime_error("[gctl::dsv_io::filter] Invalid row index or name."); + + if (table_[idx[i]][0].type_ != Int && table_[idx[i]][0].type_ != Float) + { + throw std::runtime_error("[gctl::dsv_io::filter] Invalid row type for numerical calculating."); + } } } - - array odx; - bool out_row = false; - if (out_col.empty()) out_row = true; - else + else if (thead == ColHead) { - odx.resize(out_col.size()); - for (size_t i = 0; i < out_col.size(); i++) + for (size_t i = 0; i < cnd_tars.size(); i++) { - odx[i] = name_index(out_col[i]); - if (odx[i] < 0) throw std::runtime_error("[gctl::dsv_io::] Invalid column index or name."); + idx[i] = name_index(cnd_tars[i]); + + if (idx[i] <= 0 || idx[i] > col_num_) throw std::runtime_error("[gctl::dsv_io::filter] Invalid column index or name."); + + if (table_[0][idx[i]].type_ != Int && table_[0][idx[i]].type_ != Float) + { + throw std::runtime_error("[gctl::dsv_io::filter] Invalid column type for numerical calculating."); + } } } - + else throw std::runtime_error("[gctl::dsv_io::filter] Invalid table head type."); + exprtk::symbol_table symbol_table; - array var(cnd_col.size()); + array var(cnd_tars.size()); for (size_t i = 0; i < var.size(); i++) { - symbol_table.add_variable(cnd_col[i], var[i]); + symbol_table.add_variable(cnd_tars[i], var[i]); } exprtk::expression expression; @@ -878,79 +832,34 @@ void gctl::dsv_io::filt_column(std::string cnd_str, const std::vector str_line, row_names; - std::vector > str_table; - - for (size_t i = 1; i <= row_num_; i++) + if (thead == RowHead) // cnd_tars是行头 此时为按列过滤 { - for (size_t j = 0; j < var.size(); j++) + for (size_t i = 1; i <= col_num_; i++) { - var[j] = table_[i][idx[j]].value(); - } - - if (expression.value() > 0.5) // return 1 if matched or 0 if dismatched - { - if (out_row) + for (size_t j = 0; j < var.size(); j++) { - str_line.clear(); - - for (size_t j = 1; j <= col_num_; j++) - { - str_line.push_back(table_[i][j].str_); - } - - str_table.push_back(str_line); - } - else - { - str_line.clear(); - for (size_t j = 0; j < idx.size(); j++) - { - str_line.push_back(table_[i][idx[j]].str_); - } - - for (size_t j = 0; j < odx.size(); j++) - { - str_line.push_back(table_[i][odx[j]].str_); - } - - str_table.push_back(str_line); + var[j] = table_[idx[j]][i].value(); } - row_names.push_back(table_[i][0].str_); + // return 1 if matched or 0 if dismatched + if (expression.value() < 0.5) column_output(i, Disable); } } - - out_table.init_table(str_table); - - std::vector io_col; - if (out_row) + else // cnd_tars是列头 此时为按行过滤 { - column_names(io_col); - out_table.cell(table_[0][0].str_, 0, 0); - } - else - { - for (size_t j = 0; j < idx.size(); j++) + for (size_t i = 1; i <= row_num_; i++) { - io_col.push_back(cnd_col[j]); - } + for (size_t j = 0; j < var.size(); j++) + { + var[j] = table_[i][idx[j]].value(); + } - for (size_t j = 0; j < odx.size(); j++) - { - io_col.push_back(out_col[j]); + // return 1 if matched or 0 if dismatched + if (expression.value() < 0.5) row_output(i, Disable); } } - - out_table.column_names(io_col); - out_table.row_names(row_names, {}, table_[0][0].str_); - - destroy_vector(row_names); - destroy_vector(io_col); - destroy_vector(str_line); - destroy_vector(str_table); return; } diff --git a/lib/io/dsv_io.h b/lib/io/dsv_io.h index c5fe61b..c02de32 100644 --- a/lib/io/dsv_io.h +++ b/lib/io/dsv_io.h @@ -251,14 +251,14 @@ namespace gctl * * @param att 注释 */ - void annotoations(const std::vector &att){annotates_ = att;} + void annotations(const std::vector &att){annotates_ = att;} /** * @brief 返回注释行 * * @return 注释行 */ - const std::vector &annotoations(){return annotates_;} + const std::vector &annotations(){return annotates_;} /** * @brief 设置标记行符号 @@ -421,6 +421,14 @@ namespace gctl * @param col 数据列数 */ void init_table(int row, int col); + + /** + * @brief 导出表格(默认不会导出失效的行和列) + * + * @param ignore_disabled 忽略失效的行和列 + * @return 导出的表格 + */ + dsv_io export_table(bool ignore_disabled = true); /** * @brief 返回表格信息 @@ -438,6 +446,13 @@ namespace gctl */ int name_index(std::string name, bool iter_row = false); + /** + * @brief 设置表格输出(作用于整个表格) + * + * @param s 设置输出类型 + */ + void table_output(switch_type_e s); + /** * @brief 设置列输出。你仍然可以使用这些数据,它们只是不会被输出 * @@ -511,32 +526,27 @@ namespace gctl int add_row(std::string name, std::string id_name); /** - * @brief 按行过滤并返回符合条件的列数据 - * - * @note 过滤后的表格第一列尾用于匹配正则表达式的列,剩余列尾为筛选后符合条件的列数据。 + * @brief 按行或列过滤数据(不符合条件的行与列的输出属性将设置为假) * * @param cnd_str 正则表达式 - * @param cnd_col 用于匹配正则表达式的列名称 - * @param out_col 输出的列索引列表(列表为空时则会输出所有列),正则表达式为真时即筛选这些行与列上对应的数据 - * @param out_table 输出的表格 + * @param cnd_tar 用于匹配正则表达式的行或列名称 + * @param thead 用于匹配正则表达式的行或列类型 RowHead时表示按列过滤,ColHead时表示按行过滤 */ - void filt_column(std::string cnd_str, std::string cnd_col, - const std::vector &out_col, dsv_io &out_table); + void filter(std::string cnd_str, std::string cnd_tar, table_headtype_e thead = RowHead); /** - * @brief row operate function pointer + * @brief table line operate function pointer * */ - typedef bool (*rowbool_func_t)(const std::vector &table_row); + typedef bool (*linebool_func_t)(const std::vector &table_line); /** - * @brief 按行过滤并返回符合条件的列数据 + * @brief 逐行或列过滤数据(不符合条件的行与列的输出属性将设置为假) * - * @param func 处理行类容的布尔函数 - * @param out_col 输出的列索引列表(列表为空时则会输出所有列),正则表达式为真时即筛选这些行与列上对应的数据 - * @param out_table 输出的表格 + * @param func 处理行或列类容的布尔函数 + * @param thead RowHead时表示按行过滤,ColHead时表示按列过滤 */ - void filt_column(rowbool_func_t func, const std::vector &out_col, dsv_io &out_table); + void filter(linebool_func_t func, table_headtype_e thead = RowHead); #ifdef GCTL_EXPRTK @@ -553,18 +563,16 @@ namespace gctl void cal_column(std::string expr_str, const std::vector &col_list, int p = 6); /** - * @brief 按行过滤并返回符合条件的列数据 + * @brief 按行或列过滤数据(不符合条件的行与列的输出属性将设置为假) * * @note 只有单元格类型为float和Int类型的列数据才能用于计算。计算由exprtk库完成,支持的表达式见其说明文档。 * 因为没有使用strtk库的相关内容,所以并不支持对字符串与数字类型的混合条件判断。基于字符串的内容提取请使用其他函数。 * * @param cnd_str 条件表达式 - * @param cnd_col 用于条件表达式的列索引列表 - * @param out_col 输出的列索引列表(列表为空时则会输出所有列),即条件判断为真时即筛选这些行与列上对应的数据 - * @param out_table 输出的表格 + * @param cnd_tars 用于匹配正则表达式的行或列名称 + * @param thead 用于匹配正则表达式的行或列类型 RowHead时表示按列过滤,ColHead时表示按行过滤 */ - void filt_column(std::string cnd_str, const std::vector &cnd_col, - const std::vector &out_col, dsv_io &out_table); + void filter(std::string cnd_str, const std::vector &cnd_tars, table_headtype_e thead = RowHead); #endif // GCTL_EXPRTK diff --git a/out.txt b/out.txt deleted file mode 100644 index 9b49ad3..0000000 --- a/out.txt +++ /dev/null @@ -1,53 +0,0 @@ -Code_s|Continent_s|Name_s|Population_n|GNP_n -ANT|North America|Netherlands Antilles|217000|1941 -AIA|North America|Anguilla|8000|63.2 -ATG|North America|Antigua and Barbuda|68000|612 -ARG|South America|Argentina|37032000|340238 -ABW|North America|Aruba|103000|828 -BHS|North America|Bahamas|307000|3527 -BRB|North America|Barbados|270000|2223 -BLZ|North America|Belize|241000|630 -BMU|North America|Bermuda|65000|2328 -BOL|South America|Bolivia|8329000|8571 -BRA|South America|Brazil|170115000|776739 -VGB|North America|Virgin Islands, British|21000|612 -CYM|North America|Cayman Islands|38000|1263 -CHL|South America|Chile|15211000|72949 -CRI|North America|Costa Rica|4023000|10226 -DMA|North America|Dominica|71000|256 -DOM|North America|Dominican Republic|8495000|15846 -ECU|South America|Ecuador|12646000|19770 -SLV|North America|El Salvador|6276000|11863 -FLK|South America|Falkland Islands|2000|0 -GRD|North America|Grenada|94000|318 -GRL|North America|Greenland|56000|0 -GLP|North America|Guadeloupe|456000|3501 -GTM|North America|Guatemala|11385000|19008 -GUY|South America|Guyana|861000|722 -HTI|North America|Haiti|8222000|3459 -HND|North America|Honduras|6485000|5333 -JAM|North America|Jamaica|2583000|6871 -CAN|North America|Canada|31147000|598862 -COL|South America|Colombia|42321000|102896 -CUB|North America|Cuba|11201000|17843 -MTQ|North America|Martinique|395000|2731 -MEX|North America|Mexico|98881000|414972 -MSR|North America|Montserrat|11000|109 -NIC|North America|Nicaragua|5074000|1988 -PAN|North America|Panama|2856000|9131 -PRY|South America|Paraguay|5496000|8444 -PER|South America|Peru|25662000|64140 -PRI|North America|Puerto Rico|3869000|34100 -GUF|South America|French Guiana|181000|681 -KNA|North America|Saint Kitts and Nevis|38000|299 -LCA|North America|Saint Lucia|154000|571 -VCT|North America|Saint Vincent and the Grenadines|114000|285 -SPM|North America|Saint Pierre and Miquelon|7000|0 -SUR|South America|Suriname|417000|870 -TTO|North America|Trinidad and Tobago|1295000|6232 -TCA|North America|Turks and Caicos Islands|17000|96 -URY|South America|Uruguay|3337000|20831 -VEN|South America|Venezuela|24170000|95023 -USA|North America|United States|278357000|8510700 -VIR|North America|Virgin Islands, U.S.|93000|0 -|Asia|China|14000000|1949 diff --git a/tool/dsviewer/dsviewer.cpp b/tool/dsviewer/dsviewer.cpp index f961c3c..7410e1b 100644 --- a/tool/dsviewer/dsviewer.cpp +++ b/tool/dsviewer/dsviewer.cpp @@ -460,20 +460,18 @@ void rand_data(const std::vector &cmd_units) void filt_data(const std::vector &cmd_units) { - // filter ,,... ,,... - if (cmd_units.size() < 5) throw std::runtime_error("filter: insufficient parameters."); - std::vector tar_names, out_names; + // filter row|col ,,... + if (cmd_units.size() < 4) throw std::runtime_error("filter: insufficient parameters."); + std::vector tar_names; parse_string_to_vector(cmd_units[3], ',', tar_names); - parse_string_to_vector(cmd_units[4], ',', out_names); - dsv_io out_table; - if (tar_names.size() == 1) tc.filt_column(cmd_units[2], tar_names[0], out_names, out_table); - else tc.filt_column(cmd_units[2], tar_names, out_names, out_table); + table_headtype_e thead; + if (cmd_units[1] == "row") thead = RowHead; + else if (cmd_units[1] == "col") thead = ColHead; + else throw std::runtime_error("filter: invalid parameters."); - std::string naked_name, exten_name; - parse_filename(cmd_units[1], naked_name, exten_name); - if (exten_name == ".csv") out_table.save_csv(cmd_units[1]); - else out_table.save_text(naked_name, exten_name); + if (tar_names.size() == 1) tc.filter(cmd_units[2], tar_names[0], thead); + else tc.filter(cmd_units[2], tar_names, thead); return; }