Skip to content

2-初始化配置文件

  1. 首先,执行以下命令生成配置文件:
shell
deno task init:config

生成的配置文件将保存为 $HOME/.libian/crawler/config/dc_v1.json,并在 data_cleaner_ci_generated/config.json 处创建指向该文件的符号链接。

  2. 接下来,请更新配置文件,并设置您自己的数据仓库,比如 PostgreSQL 连接参数。

以下是一个配置文件模板。

json
{
  "repositories": [
    {
      "typ": "postgres",
      "param": {
        "dbname": "SET_TO_YOUR",
        "user": "SET_TO_YOUR",
        "password": "SET_TO_YOUR",
        "host": "SET_TO_YOUR",
        "port": 5432,
        "ssl": true
      },
      "dataset_tables": [
        {
          "dataset_typename": "SET_TO_YOUR_LibianCrawlerGarbage",
          "schema": "libian_crawler",
          "tablename": "garbage",
          "group_by_jsonata": "g_type & '__' & g_content.crawler_tag",
          "batch_size": {
            "api": 200,
            "code_gen": 500
          },
          "cache_by_id": true,
          "with_jsonata_template": ["parse_html_tree"]
        }
      ]
    }
  ],
  "libian_crawler": {
    "data_storage": {
      "connect_param": {
        "dbname": "SET_TO_YOUR",
        "user": "SET_TO_YOUR",
        "password": "SET_TO_YOUR",
        "host": "SET_TO_YOUR",
        "port": 5432,
        "ssl": true
      },
      "migration": {
        "schema": "libian_crawler_cleaned_migration",
        "table": "migration",
        "lock_table": "migration_lock"
      },
      "insert_batch_size": 100
    }
  }
}
了解 初始化配置文件 具体做了什么
ts
// ...

/**
 * Set up the local configuration and user-code scaffolding.
 *
 * Steps, executed strictly in order (later steps rely on earlier ones):
 * 1. Write the template config (from `template_config()`) to
 *    `<home>/.libian/crawler/config/dc_v1.json` via `write_file`.
 * 2. Create a symlink at `<data_cleaner_ci_generated>/config.json` whose
 *    `old` side is that config file.
 * 3. Create `<home>/.libian/crawler/data_cleaner_ci/user_code`
 *    (recursive, mode 0o700).
 * 4. Create a symlink at the relative path `user_code` whose `old` side is
 *    that directory.
 * 5. Write `readme.md` and a `LibianCrawlerGarbage.ts` stub through the link.
 *
 * Whether `write_file` overwrites existing files is decided by `write_file`
 * itself and is not visible here.
 *
 * @returns A promise that resolves once every step has completed; it rejects
 *   with whatever error `write_file` or `Deno.mkdir` raises.
 */
export async function init_config() {
  const home_dir = os.homedir();
  // Create the config file inside the home directory.
  const config_file_path = path.join(
    home_dir,
    ".libian",
    "crawler",
    "config",
    "dc_v1.json"
  );
  await write_file({
    file_path: config_file_path,
    creator: {
      mode: "text",
      // An async arrow (like the other `content` callbacks below) instead of a
      // hand-built Promise; a throw from template_config() still surfaces as a
      // rejection.
      // deno-lint-ignore require-await
      content: async () => JSON.stringify(template_config(), null, 2),
    },
    log_tag: {
      alia_name: "config file",
    },
  });
  // Symlink the home-directory config file into data_cleaner_ci_generated.
  await write_file({
    file_path: path.join(data_cleaner_ci_generated, "config.json"),
    creator: {
      mode: "symlink",
      old: config_file_path,
      allow_old_not_found: false,
    },
    log_tag: {
      alia_name: "config file symlink",
    },
  });
  // Create the user_code directory inside the home directory.
  const user_code_dir = path.join(
    home_dir,
    ".libian",
    "crawler",
    "data_cleaner_ci",
    "user_code"
  );
  await Deno.mkdir(user_code_dir, {
    recursive: true,
    mode: 0o700,
  });
  console.log("Mkdir user code dir at :", user_code_dir);
  // Symlink the home-directory user_code directory into the project.
  // NOTE(review): the original comment said the link is created under
  // data_cleaner_ci_generated, but the path below is the bare relative
  // "user_code" (presumably resolved against the working directory) —
  // confirm which location is intended.
  const user_code_dir_link = path.join("user_code");
  await write_file({
    file_path: user_code_dir_link,
    creator: {
      mode: "symlink",
      old: user_code_dir,
      allow_old_not_found: false,
    },
    log_tag: {
      alia_name: "user code dir link",
    },
  });
  // The two files below are written through the link path.
  await write_file({
    file_path: path.join(user_code_dir_link, "readme.md"),
    creator: {
      mode: "text",
      // deno-lint-ignore require-await
      content: async () => `# 用户代码目录

此目录是符号链接,指向 ${user_code_dir}。

并且此目录被父目录的 .gitignore 忽略。

这么做是为了避免程序员个人数仓的代码泄漏到主仓库分支中。

如果有保存个人代码的需要,可将 个人仓库 中的 user_code 目录被软链接 ${user_code_dir} 所指向。
`,
    },
    log_tag: {
      alia_name: "user code readme file",
    },
  });
  await write_file({
    file_path: path.join(user_code_dir_link, "LibianCrawlerGarbage.ts"),
    creator: {
      mode: "text",
      // Placeholder stub: the user is expected to replace the type and the
      // reader with ones matching their own data warehouse.
      // deno-lint-ignore require-await
      content: async () =>
        `
export type LibianCrawlerGarbage = {} // 自己改成自己数仓的类型;
export const read_LibianCrawlerGarbage = ()=>{} // 自己改成自己数仓的 api;
      `,
    },
    log_tag: {
      // Distinct tag: this previously reused the readme's tag, which made the
      // two writes indistinguishable in the logs.
      alia_name: "user code LibianCrawlerGarbage file",
    },
  });
}

// ...