/gosyujin/Gyazoの画像情報をScrapbox import用のjsonに整形する

generated at 2/16/2025, 4:48:53 AM
Gyazoの画像情報をScrapbox import用のjsonに整形する

make.ts
import { parse } from "https://deno.land/std@0.97.0/flags/mod.ts";
import { sleep } from "https://deno.land/x/sleep@v1.2.1/mod.ts";

/**
 * A single line of a page in the import document.
 * Only `text` is required; the remaining fields are optional metadata
 * (their exact meaning is defined by the import format, not by this file).
 */
type ImportLine = {
  text: string;
  created?: number;
  updated?: number;
  id?: string;
};

/** A page of the import document: a title plus its ordered lines. */
type ImportPage = {
  title: string;
  created?: number;
  updated?: number;
  id?: string;
  lines: ImportLine[];
};

/** Root object that gets serialized to the JSON files written by this script. */
type ImportJson = {
  pages: ImportPage[];
};

// Path of the input text file, taken from the first command-line argument.
const inputFileName = Deno.args[0];
if (inputFileName === undefined) {
  // Fail with an actionable message instead of an opaque error from readTextFile.
  throw new Error('usage: make.ts <path/to/input.txt>');
}
const text = await Deno.readTextFile(inputFileName);

// Base name of the input file (last '/'-separated segment).
const path = inputFileName.split('/');
const name = path[path.length - 1];

// Strip a trailing ".txt" extension. The dot is escaped so that only a real
// extension is removed ("footxt" stays "footxt" instead of becoming "fo").
// normalize() applies Unicode NFC normalization (the default form), so the
// generated titles and file names use one canonical form.
const fileName = name.replace(/\.txt$/, '').normalize();
const fileNameIndex = `${fileName}_index`.normalize();
const fileNamePages = `${fileName}_pages`.normalize();
// Title of the shared index page that every book index links to.
const commonIndex = 'gosyujin_books_index'.normalize();

/**
 * Fetches the OCR text Gyazo reports for one image.
 *
 * @param gyazoId - Image id; it is appended to the images endpoint URL as-is.
 * @returns The OCR description when the response contains one; a fixed
 *   Japanese placeholder message when the response has no usable OCR text;
 *   `undefined` when the request or the JSON decoding fails, so that the
 *   caller can decide to retry.
 */
async function getGyazoImage(gyazoId: string): Promise<string | undefined> {
  // Placeholder: replace with a real access token before running.
  const token = 'YOUR_ACCESS_TOKEN';
  const endpoint = 'https://api.gyazo.com/api/images/';
  const url = `${endpoint}${gyazoId}?access_token=${token}`;

  try {
    const response = await fetch(url);
    // A non-200 status is only logged; the body is still inspected below and
    // normally ends up as the placeholder because it carries no OCR data.
    if (response.status !== 200) console.error(response.status);

    const body = await response.json();
    // Accept only a string description: a missing or malformed value would
    // otherwise make the caller retry forever or crash on `.split`.
    const description: unknown = body?.ocr?.description;
    if (typeof description !== 'string') {
      return "OCR情報がありません(処理中、自分が所有していない、Gyazoじゃない)";
    }
    return description;
  } catch (error) {
    // Network or JSON failure: report it and signal "no result" to the caller.
    console.error(error);
    return undefined;
  }
}

/**
 * Returns the path of `url` with its first '/' removed
 * (for `https://host/abc` that is `abc`).
 * Throws if `url` cannot be parsed as a URL.
 */
function getPermaLink(url: string){
  const { pathname } = new URL(url);
  return pathname.replace('/', '');
}

/**
 * Builds the single-page index document for the book.
 *
 * The page is titled `fileNameIndex` and contains, in order: the title, a
 * link to the common index page, the first entry of `pageList` in brackets
 * (omitted when the list is empty), and one bracketed link per page whose
 * title is the 1-based page number, zero-padded to at least 4 digits,
 * followed by `: ${fileName}`.
 *
 * @param pageList - One entry per page; only the first entry's value and the
 *   list length are used here.
 * @returns The index document.
 */
function makeBookIndex(pageList: string[]) {
  const [firstPage] = pageList;

  // padStart keeps numbers above 9999 intact; slicing the last 4 characters
  // would turn page 10000 into "0000".
  const pageLinks = pageList.map((_, i) => ({
    text: `[${String(i + 1).padStart(4, '0')}: ${fileName}]`,
  }));

  const index: ImportJson = {
    pages: [{
      title: fileNameIndex,
      lines: [
        { text: fileNameIndex },
        { text: `[${commonIndex}]` },
        // Avoid emitting the literal "[undefined]" for an empty list.
        ...(firstPage === undefined ? [] : [{ text: `[${firstPage}]` }]),
        ...pageLinks,
      ],
    }],
  };
  return index;
}

/**
 * Builds one page per entry of `pageList`, each carrying the image, links to
 * the neighbouring pages, and the image's OCR text in two forms.
 *
 * Line layout of every page:
 *   1. the page title (`NNNN: ${fileName}`, 1-based, zero-padded to 4+ digits)
 *   2. `Prev: [..] | Next: [..]` links (empty brackets at either end)
 *   3. the page's own entry wrapped in double brackets
 *   4. the entries of the two previous and two next pages, each in brackets
 *   5. a `code:` block with the OCR text, one line per OCR line
 *   6. a `code:` block with the OCR text joined into a single line
 *
 * All pages are processed concurrently; the result keeps `pageList` order.
 *
 * @param pageList - One image URL per page.
 * @returns The document containing every generated page.
 */
async function makeBookPages(pageList: string[]): Promise<ImportJson> {
  // Title of the page at a 0-based position. padStart keeps numbers above
  // 9999 intact instead of truncating them to their last four digits.
  const titleAt = (position: number): string =>
    `${String(position + 1).padStart(4, '0')}: ${fileName}`;

  // Link title and image of the page at `position`, or empty strings when
  // the position falls outside the list.
  const neighbourAt = (position: number): { link: string; image: string } => {
    const image = pageList[position];
    return image === undefined
      ? { link: '', image: '' }
      : { link: titleAt(position), image };
  };

  const buildPage = async (
    imageUrl: string,
    position: number,
  ): Promise<ImportJson['pages'][number]> => {
    const permaLink = getPermaLink(imageUrl);
    const title = titleAt(position);

    const prev2 = neighbourAt(position - 2);
    const prev = neighbourAt(position - 1);
    const next = neighbourAt(position + 1);
    const next2 = neighbourAt(position + 2);

    // getGyazoImage yields undefined on a failed request; keep retrying until
    // it produces text. The pause happens only between attempts, so a
    // successful fetch is not delayed. NOTE(review): retries are unbounded,
    // and the unit of sleep()'s argument comes from the imported sleep
    // module (presumably seconds; confirm).
    let ocr = await getGyazoImage(permaLink);
    while (ocr === undefined) {
      await sleep(5);
      ocr = await getGyazoImage(permaLink);
    }
    const ocrLines: string[] = ocr.split('\n');

    return {
      title,
      lines: [
        { text: title },
        { text: `Prev: [${prev.link}] | Next: [${next.link}]` },
        { text: `[[${imageUrl}]]` },
        { text: `[${prev2.image}][${prev.image}][${next.image}][${next2.image}]` },
        { text: `code:${permaLink}` },
        // Leading space keeps each OCR line inside the code block.
        ...ocrLines.map((line) => ({ text: ` ${line}` })),
        { text: `code:${permaLink}_concat` },
        { text: ` ${ocrLines.join('')}` },
      ],
    };
  };

  return { pages: await Promise.all(pageList.map(buildPage)) };
}

// Parse the input once: one image URL per line. Accept both LF and CRLF line
// endings and drop blank lines, so a trailing newline at the end of the file
// does not create a bogus page (an empty entry would make URL parsing throw
// while building the pages).
const pageList = text
  .split(/\r?\n/)
  .map((line) => line.trim())
  .filter((line) => line !== '');

// Write the index document next to the current working directory.
const index = makeBookIndex(pageList);
await Deno.writeTextFile(`./${fileNameIndex}.json`, JSON.stringify(index, null, 2));
console.log(`write: ${fileNameIndex}.json`);

// Write the per-page document; this step fetches the OCR text of every page.
const pages = await makeBookPages(pageList);
await Deno.writeTextFile(`./${fileNamePages}.json`, JSON.stringify(pages, null, 2));
console.log(`write: ${fileNamePages}.json`);

ログ
全体的にページ内容のフォーマットを変更(2022/03/01)
pagesからindexへのリンクを削除し、リンクを一方通行にした
pagesのOCR内容を2種類載せるようにした
1. GyazoでOCRされたままのテキスト
例:
ここで、スクリー
ンショットを取得
します。
2. OCRされたテキストの改行を除去して結合したテキスト
例:
ここで、スクリーンショットを取得します。
検索の利便性をとって2. の方式を使っていたが、行が長すぎるとどこでヒットしているかわかりづらい
ヒットさせてからブラウザの検索機能を使えばどこにあるかは見つけられるが…
両方あれば「スクリーンショット」で検索したら2. 経由でページが見つかる
その後1. を見ればどの辺にあるのかわかる
画像が横に並べられるようになったので前のページ2枚、次のページ2枚も貼っておくようにした