ぬの部屋（仮）
nu-no-he-ya

月火水木金土日

2345678

9101112131415

16171819202122

23242526272829

1234

567891011

12131415161718

19202122232425

262728293031

123456

78910111213

14151617181920

21222324252627

282930

3456789

10111213141516

17181920212223

24252627282930

3456789

10111213141516

17181920212223

2425262728

12345

6789101112

13141516171819

20212223242526

2728293031

2345678

9101112131415

16171819202122

23242526272829

3031

123

45678910

11121314151617

18192021222324

252627282930

123456

78910111213

14151617181920

21222324252627

28293031

2345678

9101112131415

16171819202122

23242526272829

1234

567891011

12131415161718

19202122232425

262728293031

1234567

891011121314

15161718192021

22232425262728

293031

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

1234567

891011121314

15161718192021

22232425262728

2930

123

45678910

11121314151617

18192021222324

25262728293031

1234

567891011

12131415161718

19202122232425

26272829

1234567

891011121314

15161718192021

22232425262728

293031

123

45678910

11121314151617

18192021222324

25262728293031

12345

6789101112

13141516171819

20212223242526

27282930

2345678

9101112131415

16171819202122

23242526272829

3031

123

45678910

11121314151617

18192021222324

252627282930

123456

78910111213

14151617181920

21222324252627

28293031

3456789

10111213141516

17181920212223

24252627282930

1234

567891011

12131415161718

19202122232425

2627282930

1234567

891011121314

15161718192021

22232425262728

293031

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

12345

6789101112

13141516171819

20212223242526

2728

2345678

9101112131415

16171819202122

23242526272829

3031

1234

567891011

12131415161718

19202122232425

262728293031

123456

78910111213

14151617181920

21222324252627

282930

3456789

10111213141516

17181920212223

24252627282930

1234

567891011

12131415161718

19202122232425

2627282930

1234567

891011121314

15161718192021

22232425262728

293031

123

45678910

11121314151617

18192021222324

25262728293031

12345

6789101112

13141516171819

20212223242526

27282930

2345678

9101112131415

16171819202122

23242526272829

3031

123

45678910

11121314151617

18192021222324

252627282930

123456

78910111213

14151617181920

21222324252627

28293031

123456

78910111213

14151617181920

21222324252627

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

1234567

891011121314

15161718192021

22232425262728

2930

123

45678910

11121314151617

18192021222324

25262728293031

12345

6789101112

13141516171819

20212223242526

27282930

2345678

9101112131415

16171819202122

23242526272829

3031

1234

567891011

12131415161718

19202122232425

262728293031

123456

78910111213

14151617181920

21222324252627

282930

3456789

10111213141516

17181920212223

24252627282930

1234

567891011

12131415161718

19202122232425

2627282930

1234567

891011121314

15161718192021

22232425262728

293031

1234567

891011121314

15161718192021

22232425262728

123

45678910

11121314151617

18192021222324

25262728293031

123456

78910111213

14151617181920

21222324252627

28293031

2345678

9101112131415

16171819202122

23242526272829

1234

567891011

12131415161718

19202122232425

262728293031

123456

78910111213

14151617181920

21222324252627

282930

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

1234567

891011121314

15161718192021

22232425262728

2930

123

45678910

11121314151617

18192021222324

25262728293031

12345

6789101112

13141516171819

20212223242526

27282930

2345678

9101112131415

16171819202122

23242526272829

3031

3456789

10111213141516

17181920212223

242526272829

12345

6789101112

13141516171819

20212223242526

2728293031

2345678

9101112131415

16171819202122

23242526272829

3031

123

45678910

11121314151617

18192021222324

252627282930

123456

78910111213

14151617181920

21222324252627

28293031

2345678

9101112131415

16171819202122

23242526272829

1234

567891011

12131415161718

19202122232425

262728293031

1234567

891011121314

15161718192021

22232425262728

293031

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

1234567

891011121314

15161718192021

22232425262728

2930

123

45678910

11121314151617

18192021222324

25262728293031

123

45678910

11121314151617

18192021222324

25262728

123456

78910111213

14151617181920

21222324252627

28293031

3456789

10111213141516

17181920212223

24252627282930

1234

567891011

12131415161718

19202122232425

2627282930

1234567

891011121314

15161718192021

22232425262728

293031

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

2345678

9101112131415

16171819202122

23242526272829

3031

123

45678910

11121314151617

18192021222324

252627282930

123456

78910111213

14151617181920

21222324252627

28293031

2345678

9101112131415

16171819202122

23242526272829

1234

567891011

12131415161718

19202122232425

262728293031

1234567

891011121314

15161718192021

22232425262728

293031

123

45678910

11121314151617

18192021222324

25262728293031

12345

6789101112

13141516171819

20212223242526

27282930

123

45678910

11121314151617

18192021222324

252627282930

123456

78910111213

14151617181920

21222324252627

28293031

1234

567891011

12131415161718

19202122232425

2627282930

1234567

891011121314

15161718192021

22232425262728

293031

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

12345

6789101112

13141516171819

20212223242526

2728

2345678

9101112131415

16171819202122

23242526272829

3031

123456

78910111213

14151617181920

21222324252627

282930

3456789

10111213141516

17181920212223

24252627282930

1234567

891011121314

15161718192021

22232425262728

293031

123

45678910

11121314151617

18192021222324

252627282930

123456

78910111213

14151617181920

21222324252627

28293031

123456

78910111213

14151617181920

21222324252627

28293031

1234

567891011

12131415161718

19202122232425

262728293031

123456

78910111213

14151617181920

21222324252627

282930

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

1234567

891011121314

15161718192021

22232425262728

2930

123

45678910

11121314151617

18192021222324

25262728293031

12345

6789101112

13141516171819

20212223242526

27282930

2345678

9101112131415

16171819202122

23242526272829

3031

2345678

9101112131415

16171819202122

232425262728

1234

567891011

12131415161718

19202122232425

262728293031

1234567

891011121314

15161718192021

22232425262728

293031

3456789

10111213141516

17181920212223

24252627282930

12345

6789101112

13141516171819

20212223242526

2728293031

1234567

891011121314

15161718192021

22232425262728

2930

123

45678910

11121314151617

18192021222324

25262728293031

gumbo-parserでHTMLをパースしてみる

HTMLを扱う方法を探している。gumbo-parserはGoogleが公開した、Apache-2.0ライセンスのHTMLパーサー。

ソースコードをGitHubからダウンロード・展開する。

https://github.com/google/gumbo-parser

使い方

CMake不要。.cファイルをコピーしてプロジェクトに加える。

注意点として、Windowsにはstrings.hが存在しない。そのため、代替のstrings.hが入っているvisualc/includeもインクルードディレクトリに追加する。

追加のインクルードディレクトリ：

visualc/includeにはstrings.hが入っている。

・gumbo-parser-master/src

・gumbo-parser-master/visualc/include

プロジェクトへ追加

src/*.c ファイルをプロジェクトへ追加する。

attribute.c
char_ref.c
error.c
parser.c
string_buffer.c
string_piece.c
tag.c
tokenizer.c
utf8.c
util.c
vector.c

サンプルコード

・gumbo_parse関数でパースを行う。

・GumboVectorは汎用の配列（void*の配列）で、childrenの場合は各要素がGumboNode*となっている（使用時にキャストする）。

・node->v.element.children.data から子ノードにアクセスできる。

#include <fstream>
#include <iostream>
#include <memory>
#include <sstream>
#include <string>

#include <gumbo.h>

std::unique_ptr<std::string> getTitleCore(const GumboNode* node) {

  // ノードがHTML要素の場合だけ処理
  if (node->type == GUMBO_NODE_ELEMENT) {

    // titleタグの場合
    if (node->v.element.tag == GUMBO_TAG_TITLE) {

      // 子要素一覧へアクセス
      const GumboVector* children = &node->v.element.children;

      // 子要素がある場合
      if (children->length > 0) {

        // 子要素の先頭を取得
        const GumboNode* child = static_cast<GumboNode*>(children->data[0]);
        if (child->type == GUMBO_NODE_TEXT) {

          return std::make_unique<std::string>(child->v.text.text);

        }
      }

    }

    // titleタグ以外の場合、このタグの子要素を全て調査。それを再帰的に行う
    else {

      const GumboVector* children = &node->v.element.children;
      for (unsigned int i = 0; i < children->length; ++i) {

        std::unique_ptr<std::string> result = getTitleCore(static_cast<GumboNode*>(children->data[i]));
        if (result != nullptr) {
          return result;
        }

      }

    }

  }

  return nullptr;

}

std::unique_ptr<std::string>  getTitle(std::string html) {

  // Gumboでパース
  GumboOutput* output = gumbo_parse(html.c_str());

  // タイトルを取得する自作関数の本体呼び出し
  std::unique_ptr<std::string> result = getTitleCore(output->root);

  // GumboOutputの解放
  gumbo_destroy_output(&kGumboDefaultOptions, output);

  return result;
}


// Demo entry point: extracts the title from a small inline HTML document
// and prints it, or prints a placeholder message when no title is found.
int main()
{
  const std::string html = R"(
<html>
  <head>
    <title>The Title</title>
  </head>
  <body>
    <h1>Test</h1>
  </body>
</html>
)";

  const std::unique_ptr<std::string> mytitle = getTitle(html);

  // Handle the "nothing found" case first so the happy path reads straight through.
  if (!mytitle) {
    std::cout << "** No title **" << std::endl;
    return 0;
  }

  std::cout << *mytitle << std::endl;
  return 0;
}