/marshmallow-rm/このマシュマロの質問解答を集めたScrapboxはどうやって作ったのですか？

generated at 2/18/2025, 7:52:56 AM
このマシュマロの質問解答を集めたScrapboxはどうやって作ったのですか？
質問
このマシュマロの質問解答を集めたScrapboxはどうやって作ったのですか？

解答
恥ずかしながら、マシュマロの質問に対する解答を、まったくローカルに残していなかったので、データを拾い集めるところからはじめました。

まず質問の方は、マシュマロの確認済みのページに残っています。
かなりの数だったので手作業コピペはあきらめ、Python＋Seleniumで自動巡回で吸い出してもらいました。

get_questions_from_marshmallow.py
from selenium import webdriver
browser = webdriver.Chrome() # Browser driven by Selenium; Chrome is specified here

def get_questions_from_marshmallow():
    """Scrape the questions listed on Marshmallow's "answered" pages.

    Drives the module-level ``browser`` through the paginated list of
    answered messages, starting at page 1, and stops at the first page that
    shows the "no messages to display" notice (or when loading the next page
    fails).

    Returns:
        list: The link text of every question found, in page order.
    """
    # Imported locally so this function carries its own dependency.
    # find_element(By, value) / find_elements(By, value) replace the
    # find_element(s)_by_* helpers, which were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    root_url = 'https://marshmallow-qa.com/messages/answered?page=%s'  # answered messages
    root_now_xpath = '/html/body/main/div/ul/li[%s]/div/div/div[2]/a'
    counter = 1
    questions = []

    # Open the first "answered" page.
    browser.get(root_url % counter)

    while True:
        # An empty page carries this notice; it marks the end of the list.
        if '表示できるメッセージがないようです' in browser.find_element(By.CSS_SELECTOR, 'body > main > div').text:
            break

        # Probe list positions 2..32; positions without a matching link
        # return an empty list and are skipped.
        for i in range(2, 33):
            now_item = browser.find_elements(By.XPATH, root_now_xpath % i)
            if now_item:
                questions.append(now_item[0].text)

        # Move on to the next page.
        counter += 1
        try:
            browser.get(root_url % counter)
        except Exception as exc:
            # Best effort: stop paging and return what was collected so far.
            # Report the exception that actually occurred rather than a
            # hard-coded name, and let KeyboardInterrupt/SystemExit through.
            print("%s: %s" % (type(exc).__name__, exc))
            break

    return questions


解答の方はTwitter上に流されているので、まずユーザー情報のページに有る「全ツイートの履歴をリクエスト」機能をつかって全ツイートを含むcsvファイルを取得しました。
全ツイートを含むcsvファイルから、マシュマロへの解答ツイートだけを抽出しました。

マシュマロへの解答は１つのツイートだけのものもあれば、複数のツイートでできたスレッドによるものもあります。
スレッドになっている解答をあつめるために、解答ツイートのtweet_idをin_reply_to_status_idに含むツイートを集め、さらにそのtweet_idをin_reply_to_status_idに含むツイートを集め…を繰り返して、スレッド解答をまとめました。

get_answers_from_tweet_csv.py
import pandas as pd

#スレッドをたどる
def get_text(now_id):
    """Return the text of the tweet whose ``tweet_id`` equals ``now_id``.

    Looks the tweet up in the global ``no_retweets`` DataFrame and returns
    the ``text`` value of the first matching row.
    """
    id_matches = no_retweets['tweet_id'] == now_id
    matching_rows = no_retweets[id_matches]
    return matching_rows.text.values[0]

def thread(now_id):
    """Return the text of a whole reply thread, starting at ``now_id``.

    Begins with the text of the tweet ``now_id``, then repeatedly finds the
    row in the global ``no_retweets`` DataFrame whose
    ``in_reply_to_status_id`` equals the current tweet id, appends its text
    on a new line, and continues from that row. When several rows match,
    only the first one is followed.
    """
    collected = get_text(now_id)
    current_id = now_id
    while True:
        # Rows that reply to the tweet currently at the end of the chain.
        replies = no_retweets[no_retweets['in_reply_to_status_id'] == current_id]
        if len(replies) == 0:
            break
        current_id = replies.tweet_id.values[0]
        collected = collected + '\n' + get_text(current_id)
    return collected

def get_answer_from_tweet_csv():
    """Collect the Marshmallow answers from the exported tweet archive.

    Reads ``tweets.csv`` from the current directory, drops retweets, selects
    the tweets that open a Marshmallow answer, and expands each one into its
    full reply thread with ``thread``.

    Returns:
        list: ``[tweet_id, thread_text]`` pairs, one per answer.
    """
    # get_text() and thread() look up ``no_retweets`` as a module-level name.
    # Assigning it as a plain local here would leave them failing with
    # NameError, so the filtered frame is published as a global.
    global no_retweets

    # Load the CSV containing every tweet.
    tweets_df = pd.read_csv('tweets.csv')

    # Leave out retweets.
    no_retweets = tweets_df[tweets_df['retweeted_status_id'].isnull()]

    # Keep only the rows that have expanded_urls.
    no_retweets_with_expanded_urls = no_retweets[no_retweets['expanded_urls'].notnull()]

    # Rows whose `source` contains https://marshmallow-qa.com are taken as the
    # first tweet of a Marshmallow answer.
    # NOTE(review): the original comment described this as a match on
    # expanded_urls, while the query filters on `source` -- confirm which
    # column is intended.
    marshmallow_tops = no_retweets_with_expanded_urls.query('source.str.contains("https://marshmallow-qa.com")', engine='python')

    # Gather each answer together with its thread text.
    answers = [[now_tweet_id, thread(now_tweet_id)] for now_tweet_id in marshmallow_tops['tweet_id']]

    return answers

こうしてできた質問データと解答データを突き合わせて（手作業が少し必要だったのでExcel上でやりました）、
最後にjsonで書き出したものをScrapboxにインポートしました。

marshmallow_to_scrapbox.py
import pandas as pd
import json
import re 


#１つの問答をjson用のdictに変換する補助関数
def qa2dict(now_qa):
    """Convert one question/answer pair into a page dict for the JSON export.

    Args:
        now_qa: Sequence whose first item is the question text and whose
            second item is the answer text.

    Returns:
        dict: ``{'title': ..., 'lines': [...]}``. The title is the first 30
        characters of the question's first line. ``lines`` holds a question
        section, an answer section and, when the answer mentions any, a
        section of book tags.
    """
    now_question = now_qa[0]
    now_answer = now_qa[1]

    # Anything wrapped in 『』 is treated as a book title and turned into a
    # bracketed tag. A book cited several times is listed only once, in
    # first-mention order.
    book_list = []
    for book in re.findall(r'『[^』]+』', now_answer):
        tag = '[' + book + ']'
        if tag not in book_list:
            book_list.append(tag)

    now_question_list = now_question.split('\n')
    now_answer_list = now_answer.split('\n')

    qa_dict = {}
    qa_dict['title'] = now_question_list[0][:30]  # first line of the question becomes the title
    qa_dict['lines'] = ['', '[**** 質問]'] + now_question_list
    qa_dict['lines'] += ['', '[**** 解答]'] + now_answer_list
    if book_list:
        qa_dict['lines'] += ['', '[**** 文献]'] + book_list
    return qa_dict

# Load the Excel workbook that holds every Marshmallow question and answer.
mash_df = pd.read_excel('マシュマロ全質問回答.xlsx')
# Convert it to a list of rows.
mash_list = mash_df.values.tolist()  # [[question, answer], [question, answer], ...]

# Convert each question/answer pair into a dict for the JSON output.
dict_list = [qa2dict(mash_qa) for mash_qa in mash_list]

# Build the top-level dict for the JSON output.
pages_dict = dict(pages=dict_list)
# Write the JSON file. ensure_ascii=False emits the Japanese text as-is, so
# the encoding is pinned to UTF-8 instead of relying on the platform default;
# the with-block guarantees the file is flushed and closed.
with open('mash.json', 'w', encoding='utf-8') as f2:
    json.dump(pages_dict, f2, ensure_ascii=False, indent=4, sort_keys=False)

以上となります。