pdfをscrapboxに変換する
Claudeさん
当初は `h6` も見出しとして対象に含めていたが、普通のテキストまで見出しとして拾ってしまったので `h6` は対象から外した
js(() => {
main();
/**
 * Entry point: convert the current document into an import JSON and download it.
 *
 * The whole `document.body` is scanned (no site-specific container is assumed).
 * For every top-level section the section's own page is placed first, followed
 * by the pages of its descendants, and a "Table of Contents" page is prepended
 * to the final page list.
 */
function main() {
  const sections = makeHierarchy(document.body);

  const pages = sections.flatMap((section) => {
    const generated = Array.from(toPages(section));
    if (generated.length === 0) return [];
    // toPages yields the section's own page last; move it to the front.
    const own = generated.pop();
    return [own, ...generated];
  });

  const index = generateIndex(pages);
  const exported = pages.map(({ id, lines }) => ({ title: id, lines: [id, ...lines] }));
  downloadJSON({ pages: [index, ...exported] });
}
/**
 * Build the "Table of Contents" page.
 *
 * @param {Array<{id: string, level: number}>} pages - Pages in output order.
 * @returns {{title: string, lines: string[]}} A page whose first line is the
 *   title, followed by one bracketed link per page, indented by `level - 1`
 *   spaces.
 */
function generateIndex(pages) {
  const heading = "Table of Contents";
  const entries = [];
  for (const { id, level } of pages) {
    const indent = " ".repeat(level - 1);
    entries.push(`${indent}[${id}]`);
  }
  return { title: heading, lines: [heading, ...entries] };
}
/**
 * Turn one section (and, recursively, its sub-sections) into pages.
 *
 * A section is an array whose first element is the section metadata and whose
 * remaining elements are either nested sections (arrays) or content nodes.
 * Each nested section is replaced in the parent's lines by a bracketed link to
 * the nested section's own page; content nodes are converted with
 * `convertNode`. Every entry is followed by an empty line.
 *
 * Yield order: for each nested section, its own page and then its descendants'
 * pages; the page for `section` itself is yielded last.
 *
 * @param {Array} section - `[metadata, ...children]`.
 * @yields {object} The metadata fields plus a `lines` array.
 */
function* toPages(section) {
  const [data, ...children] = section;
  const lines = [];

  for (const child of children) {
    if (!Array.isArray(child)) {
      lines.push(...convertNode(child), "");
      continue;
    }

    const nested = Array.from(toPages(child));
    if (nested.length === 0) continue;

    // The nested section's own page is always the last one produced.
    const own = nested.pop();
    lines.push(`[${own.id}]`, "");
    yield own;
    yield* nested;
  }

  yield { ...data, lines };
}
/**
 * Convert a block-level node into output lines, dispatching on `nodeName`.
 *
 * - P: inline conversion via `convertP`, split into lines.
 * - TABLE / PRE: a `table:table` / `code:txt` header line followed by the
 *   node's `innerText` lines, each prefixed with one space.
 * - UL / OL: `convertList`.
 * - FIGURE: `convertFigure`.
 * - anything else: the node's `innerText` split into lines.
 *
 * @param {{nodeName: string, innerText: string}} node
 * @returns {string[]}
 */
function convertNode(node) {
  const indentedText = () => node.innerText.split("\n").map((text) => ` ${text}`);
  const { nodeName } = node;

  if (nodeName === "P") return convertP(node).split("\n");
  if (nodeName === "TABLE") return ["table:table", ...indentedText()];
  if (nodeName === "UL" || nodeName === "OL") return convertList(node);
  if (nodeName === "PRE") return ["code:txt", ...indentedText()];
  if (nodeName === "FIGURE") return convertFigure(node);
  return node.innerText.split("\n");
}
/**
 * Convert the child nodes of an element into a single inline-formatted string.
 *
 * STRONG/B, EM/I and CODE children are wrapped line by line in `[* …]`,
 * `[_ …]` and backticks respectively; A goes through `convertLink`; nested
 * UL/OL go through `convertList` (joined with newlines); every other child
 * contributes its `textContent` unchanged. The pieces are concatenated with no
 * separator.
 *
 * @param {{childNodes: Iterable<object>}} p
 * @returns {string}
 */
function convertP(p) {
  // Apply `wrap` to every line of the node's text, keeping the line breaks.
  const perLine = (node, wrap) =>
    node.textContent.split("\n").map((text) => wrap(text)).join("\n");

  const pieces = [];
  for (const child of Array.from(p.childNodes)) {
    const tag = child.nodeName;
    if (tag === "STRONG" || tag === "B") {
      pieces.push(perLine(child, (text) => `[* ${text}]`));
    } else if (tag === "EM" || tag === "I") {
      pieces.push(perLine(child, (text) => `[_ ${text}]`));
    } else if (tag === "CODE") {
      pieces.push(perLine(child, (text) => `\`${text}\``));
    } else if (tag === "A") {
      pieces.push(convertLink(child));
    } else if (tag === "UL" || tag === "OL") {
      pieces.push(convertList(child).join("\n"));
    } else {
      pieces.push(child.textContent);
    }
  }
  return pieces.join("");
}
/**
 * Convert an anchor node into link notation.
 *
 * - An anchor with a truthy `id` is treated as a footnote marker and becomes
 *   `([Footnote N])`, where N is the trimmed anchor text.
 * - An anchor whose trimmed `href` equals its trimmed text becomes the bare
 *   URL padded with one space on each side.
 * - Any other anchor with a non-empty `href` becomes `[url text]`.
 * - An anchor without a usable `href` (missing, empty, or not a string, e.g. a
 *   destination-only `<a name="…">`) is returned as its trimmed text, so no
 *   malformed `[ text]` link is emitted.
 *
 * @param {{id?: string, href?: string, textContent: string}} node
 * @returns {string}
 */
function convertLink(node) {
  const text = node.textContent.trim();
  if (node.id) {
    return `([Footnote ${text}])`;
  }

  // Only a string href can be trimmed; anything else counts as "no URL".
  const url = typeof node.href === "string" ? node.href.trim() : "";
  if (url === "") {
    return text;
  }
  return url === text ? ` ${url} ` : `[${url} ${text}]`;
}
/**
 * Convert a list element into output lines.
 *
 * Only direct children whose `tagName` is "LI" are considered. Each item is
 * converted with `convertP`, split into lines, and every resulting line is
 * prefixed with a single space.
 *
 * @param {{children: Iterable<{tagName: string}>}} ul
 * @returns {string[]}
 */
function convertList(ul) {
  const lines = [];
  for (const item of Array.from(ul.children)) {
    if (item.tagName !== "LI") continue;
    for (const text of convertP(item).split("\n")) {
      lines.push(` ${text}`);
    }
  }
  return lines;
}
/**
 * Convert a figure into two lines: a caption and an indented image link.
 *
 * The first `img` found inside the figure is used. The caption is the image's
 * trimmed `alt` text, or "Image" when that is empty. A figure without an
 * `img` yields no lines.
 *
 * @param {{querySelector: (selector: string) => ({src: string, alt: string} | null)}} figure
 * @returns {string[]}
 */
function convertFigure(figure) {
  const image = figure.querySelector("img");
  if (image) {
    const url = image.src.trim();
    const altText = image.alt.trim();
    const caption = altText ? altText : "Image";
    return [caption, ` [${url}]`];
  }
  return [];
}
/**
 * Build a nested section tree from the h1-h5 headings found under `body`.
 *
 * Each section is an array: element 0 is a metadata object
 * `{ id, text, level, parent }`, and the remaining elements are either nested
 * sections (arrays, for deeper headings) or the element siblings that follow
 * the heading up to the next h1-h5 sibling.
 *
 * Only headings returned by `querySelectorAll` start sections, and only the
 * heading's following element siblings are collected as content, so elements
 * that appear before the first heading are not included in any section.
 *
 * @param {Element} body - Root element to scan.
 * @returns {Array} Top-level sections (those with no parent section).
 */
function makeHierarchy(body) {
const headings = body.querySelectorAll("h1, h2, h3, h4, h5");
const sections = [];
let currentSection = null;
let currentLevel = 0;
headings.forEach((heading) => {
// Heading level is the digit of the tag name (H1 -> 1, ..., H5 -> 5).
const level = parseInt(heading.tagName.slice(1));
// Same or shallower heading: climb up until the current section is
// strictly shallower than the new heading (or there is no section left).
if (level <= currentLevel) {
while (currentSection && currentSection[0].level >= level) {
currentSection = currentSection[0].parent;
}
}
// NOTE(review): the id regex below has no `g` flag, so only the first run of
// whitespace in the heading text is removed — confirm this is intended.
const newSection = [{
// id: heading.id || heading.textContent.trim().replace(/\s+/g, '-').toLowerCase(),
ここ
-
に置き換えるのはやりすぎ
js id: heading.id || heading.textContent.trim().replace(/\s+/, ''),
text: heading.textContent,
level: level,
parent: currentSection
}];
// Attach to the enclosing section, or to the top-level list if there is none.
if (currentSection) {
currentSection.push(newSection);
} else {
sections.push(newSection);
}
currentSection = newSection;
currentLevel = level;
// Collect the elements following this heading until the next h1-h5 sibling.
let sibling = heading.nextElementSibling;
while (sibling && !sibling.matches("h1, h2, h3, h4, h5")) {
currentSection.push(sibling);
sibling = sibling.nextElementSibling;
}
});
return sections;
}
/**
 * Serialize `json` and trigger a browser download of it.
 *
 * The data is stringified into an `application/json` Blob, exposed through an
 * object URL, and downloaded as `extracted_content.json` by clicking a
 * temporary anchor element. The object URL is revoked right after the click.
 *
 * @param {*} json - Any JSON-serializable value.
 */
function downloadJSON(json) {
  const payload = JSON.stringify(json);
  const objectUrl = URL.createObjectURL(new Blob([payload], { type: 'application/json' }));

  const anchor = Object.assign(document.createElement('a'), {
    href: objectUrl,
    download: 'extracted_content.json',
  });
  anchor.click();

  URL.revokeObjectURL(objectUrl);
}
})();