Syncing Scrapbox with Hugo

I've set up my Scrapbox to sync with my personal blog (Hugo).
As I mentioned in "scrapboxとクローラ" (Scrapbox and crawlers), it's not good for searches to hit empty links, so I started syncing.
Scrapbox itself has such a good writing feel and UX that I don't want to give it up, so I'm syncing a subset of pages and testing whether they show up correctly in search results.

The code is shown below.
The conversion from Scrapbox notation to Markdown uses a modified version of the gist referenced at the top of sb2md.js.
JavaScript's replace is surprisingly versatile.
Any environment where shell scripts, Python, and Node.js can run is enough.
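As a quick illustration of that versatility, here is a hypothetical snippet (not part of the pipeline) showing how a couple of chained replace calls with capture groups already handle typical Scrapbox notation; the real rule set lives in sb2md.js below.

// Hypothetical demo; the actual conversion rules are in sb2md.js.
const line = '[Scrapbox https://scrapbox.io/] and [[bold]] text';
const md = line
  .replace(/\[(\S.*)\s+(https?:\/\/\S+)\]/g, '[$1]($2)') // anchor link
  .replace(/\[\[([^\[\]]+)\]\]/g, '**$1**');             // bold
console.log(md); // => [Scrapbox](https://scrapbox.io/) and **bold** text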

main.sh
#!/bin/bash
# Fetch the page list (titles and "updated" timestamps) from the Scrapbox API.
url='https://scrapbox.io/api/pages/yuwd?limit=500'
# url='https://scrapbox.io/api/pages/yuwd?limit=10'
dates=(`curl $url | jq ".pages[].updated" | xargs`)
echo "COUNT=${#dates[@]}"
i=0
IFS=$'\n'
for title in $(curl $url | jq ".pages[].title" | sed -e "s/^\"//g" | sed -e "s/\"$//g")
do
    # File name: spaces to underscores, strip characters that break paths.
    md_title=`echo $title | tr ' ' '_' | sed s/\?//g | sed s/!//g | sed s/://g`
    # URL-encode the title; [:-3] drops the encoded trailing newline ("%0A").
    etitle=$(echo $title | python -c 'import sys;from urllib.parse import quote; [print(quote(l)[:-3],end="") for l in sys.stdin]')
    if [ ! -e $md_title.md ]; then
        echo $title
        url="https://scrapbox.io/api/pages/yuwd/${etitle}/text"
        echo $url
        # Raw Scrapbox text -> Markdown, then post-process for Hugo.
        curl $url | node sb2md.js > $md_title.md
        python scrapbox.py $md_title.md "${dates[$i]}"
    fi
    i=`expr $i + 1`
done
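main.sh only touches two Scrapbox API endpoints: the page list, which supplies .pages[].title and .pages[].updated, and the raw text of a single page. For reference, a minimal Node.js (18+, for built-in fetch) sketch of the same two calls; treat it as a hypothetical equivalent, not part of the pipeline:

// sketch.mjs — hypothetical fetch equivalent of the curl calls in main.sh
const base = 'https://scrapbox.io/api/pages/yuwd';
const { pages } = await (await fetch(`${base}?limit=500`)).json();
for (const page of pages) {
  // same fields main.sh extracts with jq: .pages[].title / .pages[].updated
  const text = await (await fetch(`${base}/${encodeURIComponent(page.title)}/text`)).text();
  console.log(page.title, page.updated, text.length);
}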


sb2md.js
// https://gist.github.com/yuntan/bb82cdf336ec76a15c66b910754f5f33
// Monkey-patch: lets any value be piped through a function with .then(f),
// so the conversion below can be written as one chain.
if (!Object.prototype.then) {
  Object.prototype.then = function (f) { return f.call(null, this); }
}

process.stdin.resume();
process.stdin.setEncoding('utf8');

let input_string = '';
process.stdin.on('data', chunk => { input_string += chunk; });
process.stdin.on('end', () => {
  const text = input_string;
  console.log(sb2md(text));
});

function sb2md(text) {
  // code block: collapse "code:" blocks onto one line so the line-wise
  // rules below cannot touch their contents
  const escapeCodeBlocks = s => s.replace(
    /^code:(.+)$((\n^[ \t].*$)+)/mg,
    (_, p1, p2) => '```' + p1
      + p2.replace(/^[ \t]/mg, '').replace(/\r|\n|\r\n/g, '+++')
      + '+++```'
  );
  const unescapeCodeBlocks = s => s.replace(/\+{3}/g, '\n');

  const replaceLine = line => /^`{3}/.test(line) ? line :
    // level 2 heading
    line.replace(/^\[\[([^\[\]]+)\]\]$/, '## $1')
        .replace(/^\[\*\s+(\S[^\[\]]*)\]$/, '## $1')
        // anchor link
        .replace(/\[(\S.*)\s+(https?:\/\/\S+)\]/g, '[$1]($2)')
        .replace(/\[(https?:\/\/\S+)\s+(\S.*)\]/g, '[$2]($1)')
        // image block
        .replace(/^\[(https?:\/\/\S+\.(png|gif|jpe?g))\]$/, '![]($1)')
        .replace(/^\[(https:\/\/gyazo.com\/\S+)\]$/, '![]($1.png)')
        // unordered list
        .replace(/^\s{6}(\S.*)$/, '          - $1')
        .replace(/^\s{5}(\S.*)$/, '        - $1')
        .replace(/^\s{4}(\S.*)$/, '      - $1')
        .replace(/^\s{3}(\S.*)$/, '    - $1')
        .replace(/^\s{2}(\S.*)$/, '  - $1')
        .replace(/^\s(\S.*)$/, '- $1')
        // bold text
        .replace(/\[\[([^\[\]]+)\]\]/g, '**$1**')
        .replace(/\[\*\s+([^\[\]]+)\]/g, '**$1**')
        // italic text
        .replace(/\[\/\s+([^\[\]]+)\]/g, '*$1*');

  return text
    .then(escapeCodeBlocks)
    .split(/\r|\n|\r\n/)
    // first line is level 1 heading
    .then(lines => [lines[0].replace(/^(.+)$/, '$1')].concat(lines.slice(1)))
    .map(replaceLine)
    .join('\n')
    .then(unescapeCodeBlocks);
}
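The one non-obvious part above is the code-block handling: a Scrapbox code: block is collapsed onto a single line (newlines become +++) so that replaceLine cannot mangle its contents, then expanded back at the very end. A hypothetical trace of just that step:

// Hypothetical trace of escapeCodeBlocks / unescapeCodeBlocks from sb2md.js.
const sample = 'code:hello.py\n print("hi")\n print("bye")';
const escaped = sample.replace(
  /^code:(.+)$((\n^[ \t].*$)+)/mg,
  (_, p1, p2) => '```' + p1
    + p2.replace(/^[ \t]/mg, '').replace(/\r|\n|\r\n/g, '+++')
    + '+++```'
);
console.log(escaped);                         // ```hello.py+++print("hi")+++print("bye")+++```
console.log(escaped.replace(/\+{3}/g, '\n')); // back to a proper fenced block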

scrapbox.py
import os
import re
import sys
import datetime
import requests
from urllib.parse import quote

# Patterns for the bits of Scrapbox notation that sb2md.js leaves untouched.
head_pattern = re.compile(r"\[\*# ([^\$\*][^\[\]]+)\][^\(]")
math_pattern = re.compile(r"\[\$\s+([^\]]+)\]")
link_pattern = re.compile(r"\[([^\$\*][^\[\]]+)\][^\(]")
url_link_pattern = re.compile(r"\[([^\$\*][^\[\]]+) (https?://[^\]]+)\]")
tag_pattern = re.compile(r'#([^\s]+)')
img_pattern = re.compile(r'!\[\]\((https?://.+)\)')
large_math_pattern = re.compile(r'\s*-\s+(\$[^\$]+\$)\s*$')

if len(sys.argv) < 3:
    print('Usage: scrapbox.py <file> <date>')
    sys.exit(1)
print(sys.argv)

lines = []
tags = []
in_snippets = False
with open(sys.argv[1], 'r') as f:
    lines = f.readlines()

for i, line in enumerate(lines):
    if len(line.replace(" ", "")) <= 1:
        continue
    # Skip the contents of fenced code blocks.
    if line.startswith("```"):
        in_snippets = not in_snippets
    if in_snippets:
        continue
    # [* [...]] through [****** [...]] -> image syntax ![](...)
    for j in range(6):
        hash = "".join(['*'] * (6 - j))
        t = f"[{hash} ["
        has_t = t in lines[i]
        lines[i] = lines[i].replace(t, "![](")
        if has_t:
            lines[i] = lines[i][:-3] + ')\n'
    line = lines[i]
    head_match = head_pattern.finditer(line)
    math_match = math_pattern.finditer(line)
    links_match = link_pattern.finditer(line)
    tags_match = tag_pattern.finditer(line)
    url_links_match = url_link_pattern.finditer(line)
    # [*# heading] -> ### heading
    if head_match:
        targets = set()
        for _link in head_match:
            targets.add(_link.group(1))
        for link in targets:
            lines[i] = lines[i].replace(f"[*# {link}]", f"### {link}")
    # [$ formula] -> $formula$ (KaTeX inline math)
    if math_match:
        targets = set()
        for _link in math_match:
            targets.add(_link.group(1))
        for link in targets:
            update_link = link.replace("_", r"\_")
            lines[i] = lines[i].replace(f"{link}]", f"${update_link}$").replace("[$", "")
    # [title url] -> [title](url)
    if url_links_match:
        targets = set()
        for url_link in url_links_match:
            targets.add((url_link.group(1), url_link.group(2)))
        for title, url in targets:
            lines[i] = lines[i].replace(f"[{title} {url}]", f"[{title}]({url})")
    # [page] -> internal ref if the page is synced too, else a scrapbox.io link
    if links_match:
        targets = set()
        for _link in links_match:
            targets.add(_link.group(1))
        for link in targets:
            md_path = link.replace(' ', '_').replace('?', '').replace('!', '').replace(':', '') + ".md"
            md = '{{< ref "' + md_path + '" >}}'
            if os.path.exists(md_path):
                lines[i] = lines[i].replace(f"[{link}]", f"[{link}]({md}/)")
            else:
                lines[i] = lines[i].replace(f"[{link}]", f"[{link}](https://scrapbox.io/yuwd/{quote(link)})")
    # Tags on the second line become front-matter tags.
    if tags_match and i == 1:
        for _tag in tags_match:
            tag = _tag.group(1)
            tags.append(tag)
            lines[i] = lines[i].replace(f"#{tag}", "")
    # Images: probe Gyazo for the real extension, then emit the img shortcode.
    img_match = img_pattern.finditer(lines[i])
    if img_match:
        for img in img_match:
            url = img.group(1)
            img_url = url
            exts = ['png', 'gif', 'jpeg', 'jpg', 'webp']
            if "https://gyazo.com/" in url:
                for e in exts:
                    if url.endswith(e):
                        img_url = url
                        break
                    img_url = f"{url}.{e}"
                    res = requests.get(img_url)
                    if res.status_code == 200:
                        break
            lines[i] = lines[i].replace(f"![]({url})", '{{<' + f'img src="{img_url}" position="center"' + '>}}<br>')
    # A list item that is just "- $...$" becomes display math ($$...$$).
    large_math_match = large_math_pattern.finditer(lines[i])
    if large_math_match:
        for _math in large_math_match:
            math = _math.group(1)
            lines[i] = f"${math}$\n"

meta = \
"""
---
title: "TITLE"
date: DATE
description:
draft: false
hideToc: false
enableToc: true
enableTocContent: true
tocPosition: inner
tags:
TAGS
series:
-
# image: images/feature3/code-file.png
libraries:
- katex
- mermaid
- msc
---
"""

title = lines[0].replace('\n', '')
if "paper" in tags:
    title = f"【論文メモ】{title}"
else:
    tags.append("post")
print("TAG", tags)
date = datetime.datetime.fromtimestamp(int(sys.argv[2])).strftime('%Y-%m-%dT%H:%M:%S+09:00')
meta = meta.replace("TITLE", title)
meta = meta.replace("DATE", date)
meta = meta.replace("TAGS", "\n".join(map(lambda x: f"- {x}", tags)))
# Prepend the front matter; drop the original title and tag lines.
lines = list(map(lambda x: x + '\n', meta.split('\n'))) + lines[2:]
with open(sys.argv[1], 'w') as f:
    f.writelines(lines)
# for line in lines:
#     print(line, end='')