From 77cc3fa602db79bcc8433049e9c1ed60ae50a088 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Wed, 29 Nov 2023 23:30:12 -0800 Subject: [PATCH 1/4] [html] display title/aria-label/caption/summary --- visidata/loaders/html.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/visidata/loaders/html.py b/visidata/loaders/html.py index ef87e2b54..d958d22cd 100644 --- a/visidata/loaders/html.py +++ b/visidata/loaders/html.py @@ -33,6 +33,10 @@ class HtmlTablesSheet(IndexSheet): Column('tag', width=0, getter=lambda col,row: row.html.tag), Column('id', getter=lambda col,row: row.html.attrib.get('id')), Column('classes', getter=lambda col,row: row.html.attrib.get('class')), + Column('title', getter=lambda col,row: row.html.attrib.get('title')), + Column('aria_label', getter=lambda col,row: row.html.attrib.get('aria-label')), + Column('caption', getter=lambda col,row: row.html.xpath('normalize-space(./caption)') if row.html.xpath('./caption') else None), + Column('summary', getter=lambda col,row: row.html.attrib.get('summary')), ] def iterload(self): lxml = vd.importExternal('lxml') From 891f7aa66b8e7f598379f7af21fdc5c27252b50f Mon Sep 17 00:00:00 2001 From: anjakefala Date: Thu, 30 Nov 2023 07:50:46 -0800 Subject: [PATCH 2/4] [tests] update html test with new columns --- tests/golden/pull2140.tsv | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/golden/pull2140.tsv b/tests/golden/pull2140.tsv index 4c5641188..5ea08efdc 100644 --- a/tests/golden/pull2140.tsv +++ b/tests/golden/pull2140.tsv @@ -1,3 +1,3 @@ -name rows cols id classes -table_0 0 0 test_empty -links 0 5 +name rows cols id classes title aria_label caption summary +table_0 0 0 test_empty +links 0 5 From 6ab0def9dfa1458a829696028236ae7039f05a5a Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Fri, 1 Dec 2023 14:00:17 -0800 Subject: [PATCH 3/4] [html] cache the caption column --- visidata/loaders/html.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/visidata/loaders/html.py b/visidata/loaders/html.py index d958d22cd..4293547ec 100644 --- a/visidata/loaders/html.py +++ b/visidata/loaders/html.py @@ -35,7 +35,7 @@ class HtmlTablesSheet(IndexSheet): Column('classes', getter=lambda col,row: row.html.attrib.get('class')), Column('title', getter=lambda col,row: row.html.attrib.get('title')), Column('aria_label', getter=lambda col,row: row.html.attrib.get('aria-label')), - Column('caption', getter=lambda col,row: row.html.xpath('normalize-space(./caption)') if row.html.xpath('./caption') else None), + Column('caption', getter=lambda col,row: row.html.xpath('normalize-space(./caption)') if row.html.xpath('./caption') else None, cache=True), Column('summary', getter=lambda col,row: row.html.attrib.get('summary')), ] def iterload(self): From 243db376932a61701f943aa39c46f42a33f91f89 Mon Sep 17 00:00:00 2001 From: midichef <67946319+midichef@users.noreply.github.com> Date: Thu, 7 Dec 2023 03:04:39 -0800 Subject: [PATCH 4/4] [html] show table's sibling h1-h6 tags --- tests/golden/pull2140.tsv | 6 +++--- visidata/loaders/html.py | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/golden/pull2140.tsv b/tests/golden/pull2140.tsv index 5ea08efdc..502e7998f 100644 --- a/tests/golden/pull2140.tsv +++ b/tests/golden/pull2140.tsv @@ -1,3 +1,3 @@ -name rows cols id classes title aria_label caption summary -table_0 0 0 test_empty -links 0 5 +name rows cols id classes title aria_label caption summary heading +table_0 0 0 test_empty +links 0 5 diff --git a/visidata/loaders/html.py b/visidata/loaders/html.py index 4293547ec..b1523e0b9 100644 --- a/visidata/loaders/html.py +++ b/visidata/loaders/html.py @@ -37,6 +37,7 @@ class HtmlTablesSheet(IndexSheet): Column('aria_label', getter=lambda col,row: row.html.attrib.get('aria-label')), Column('caption', getter=lambda col,row: row.html.xpath('normalize-space(./caption)') if row.html.xpath('./caption') else None, cache=True), Column('summary', getter=lambda col,row: row.html.attrib.get('summary')), + Column('heading', getter=lambda col,row: row.html.xpath('normalize-space(./preceding-sibling::*[self::h1 or self::h2 or self::h3 or self::h4 or self::h5 or self::h6][1])') or None, cache=True), ] def iterload(self): lxml = vd.importExternal('lxml')