2-初始化配置文件
- 首先,执行以下命令生成配置文件:
shell
deno task init:config
生成的配置文件将保存在 `$HOME/.libian/crawler/config` 目录中,并创建符号链接到 `data_cleaner_ci_generated/config.json`。
- 接下来,请更新配置文件,填入您自己的数据仓库(例如 PostgreSQL)的连接参数。
以下是一个配置文件模板:
json
{
"repositories": [
{
"typ": "postgres",
"param": {
"dbname": "SET_TO_YOUR",
"user": "SET_TO_YOUR",
"password": "SET_TO_YOUR",
"host": "SET_TO_YOUR",
"port": 5432,
"ssl": true
},
"dataset_tables": [
{
"dataset_typename": "SET_TO_YOUR_LibianCrawlerGarbage",
"schema": "libian_crawler",
"tablename": "garbage",
"group_by_jsonata": "g_type & '__' & g_content.crawler_tag",
"batch_size": {
"api": 200,
"code_gen": 500
},
"cache_by_id": true,
"with_jsonata_template": ["parse_html_tree"]
}
]
}
],
"libian_crawler": {
"data_storage": {
"connect_param": {
"dbname": "SET_TO_YOUR",
"user": "SET_TO_YOUR",
"password": "SET_TO_YOUR",
"host": "SET_TO_YOUR",
"port": 5432,
"ssl": true
},
"migration": {
"schema": "libian_crawler_cleaned_migration",
"table": "migration",
"lock_table": "migration_lock"
},
"insert_batch_size": 100
}
}
}
了解 初始化配置文件 具体做了什么
ts
// ...
/**
 * Scaffolds the local data-cleaner configuration and user-code directory.
 *
 * Steps, in order:
 * 1. Writes the JSON produced by `template_config()` (2-space indented) to
 *    `<home>/.libian/crawler/config/dc_v1.json`.
 * 2. Creates a symlink `config.json` under `data_cleaner_ci_generated` that
 *    points at that file.
 * 3. Creates `<home>/.libian/crawler/data_cleaner_ci/user_code` recursively
 *    with mode 0o700.
 * 4. Creates a symlink at the relative path `user_code` that points at the
 *    directory from step 3.
 * 5. Writes `readme.md` and a `LibianCrawlerGarbage.ts` stub through that
 *    symlink.
 *
 * NOTE(review): whether already-existing files/links are overwritten or kept
 * is decided inside `write_file`, which is not visible here — confirm there.
 *
 * @returns {Promise<void>} Resolves once every step above has completed;
 *   rejects with whatever `write_file` or `Deno.mkdir` rejects with.
 */
export async function init_config() {
  const home_dir = os.homedir();
  const libian_crawler_home = path.join(home_dir, ".libian", "crawler");

  // Real config file lives in the home directory.
  const config_file_path = path.join(
    libian_crawler_home,
    "config",
    "dc_v1.json"
  );
  await write_file({
    file_path: config_file_path,
    creator: {
      mode: "text",
      // deno-lint-ignore require-await
      content: async () => JSON.stringify(template_config(), null, 2),
    },
    log_tag: {
      alia_name: "config file",
    },
  });

  // Expose the home-directory config inside data_cleaner_ci_generated via a symlink.
  await write_file({
    file_path: path.join(data_cleaner_ci_generated, "config.json"),
    creator: {
      mode: "symlink",
      old: config_file_path,
      allow_old_not_found: false,
    },
    log_tag: {
      alia_name: "config file symlink",
    },
  });

  // Real user_code directory also lives in the home directory.
  const user_code_dir = path.join(
    libian_crawler_home,
    "data_cleaner_ci",
    "user_code"
  );
  await Deno.mkdir(user_code_dir, {
    recursive: true,
    mode: 0o700,
  });
  console.log("Mkdir user code dir at :", user_code_dir);

  // Symlink pointing at the home-directory user_code directory.
  // NOTE(review): the previous comment said this link is created inside
  // data_cleaner_ci_generated, but the path is the bare relative "user_code"
  // — confirm which location is intended.
  const user_code_dir_link = "user_code";
  await write_file({
    file_path: user_code_dir_link,
    creator: {
      mode: "symlink",
      old: user_code_dir,
      allow_old_not_found: false,
    },
    log_tag: {
      alia_name: "user code dir link",
    },
  });

  // Template literals below are emitted verbatim, so their lines must stay
  // flush-left regardless of the surrounding indentation.
  await write_file({
    file_path: path.join(user_code_dir_link, "readme.md"),
    creator: {
      mode: "text",
      // deno-lint-ignore require-await
      content: async () => `# 用户代码目录
此目录是符号链接,指向 ${user_code_dir}。
并且此目录被父目录的 .gitignore 忽略。
这么做是为了避免程序员个人数仓的代码泄漏到主仓库分支中。
如果有保存个人代码的需要,可将 个人仓库 中的 user_code 目录被软链接 ${user_code_dir} 所指向。
`,
    },
    log_tag: {
      alia_name: "user code readme file",
    },
  });

  // Placeholder module the user is expected to replace with their own
  // warehouse type and reader.
  await write_file({
    file_path: path.join(user_code_dir_link, "LibianCrawlerGarbage.ts"),
    creator: {
      mode: "text",
      // deno-lint-ignore require-await
      content: async () =>
        `
export type LibianCrawlerGarbage = {} // 自己改成自己数仓的类型;
export const read_LibianCrawlerGarbage = ()=>{} // 自己改成自己数仓的 api;
`,
    },
    log_tag: {
      // Distinct tag: this previously reused the readme's tag by copy-paste.
      alia_name: "user code LibianCrawlerGarbage file",
    },
  });
}
// ...