Skip to content

Commit

Permalink
Be resilient to non-JSON content in namespaces.json.gz
Browse files Browse the repository at this point in the history
The production dump for `alswiktionary-20180101-siteinfo-namespaces.json.gz`
is in HTML format, even though the filename indicates JSON.

Fixes #41
  • Loading branch information
brawer committed May 29, 2024
1 parent 8c35d6f commit 97c64ac
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 1 deletion.
Binary file not shown.
6 changes: 5 additions & 1 deletion cmd/qrank-builder/wikisites.go
Original file line number Diff line number Diff line change
Expand Up @@ -299,8 +299,12 @@ func readNamespaces(site *WikiSite, dumps string) error {
}
var si siteinfo
if err := json.Unmarshal(data, &si); err != nil {
// Intentionally logging an error without failing, because some
// deprecated wiki projects such as alswiktionary contain HTML
// instead of JSON in their `siteinfo-namespaces.json.gz` file.
// https://github.com/brawer/wikidata-qrank/issues/41
logger.Printf("malformed json: %s", path)
return err
return nil
}

for key, ns := range si.Query.Namespaces {
Expand Down
27 changes: 27 additions & 0 deletions cmd/qrank-builder/wikisites_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ import (
"bytes"
"fmt"
"io"
"log"
"net/http"
"os"
"path/filepath"
"reflect"
"strings"
"testing"
"time"
)
Expand Down Expand Up @@ -118,3 +120,28 @@ func (f *FakeWikiSite) RoundTrip(req *http.Request) (*http.Response, error) {

return nil, fmt.Errorf("unexpected request: %s", req.URL.String())
}

// TestReadNamespaces_Bug41 is a regression test for malformed
// siteinfo-namespaces dumps. Using the fixture under testdata/bug_41,
// it checks that readNamespaces returns no error, leaves the site's
// Namespaces map empty, and writes a log line that names the affected
// wiki.
//
// https://github.com/brawer/wikidata-qrank/issues/41
func TestReadNamespaces_Bug41(t *testing.T) {
	// Redirect the package-level logger into a buffer so the log output
	// can be inspected, and restore it afterwards so that other tests
	// are not affected by this one.
	var buf bytes.Buffer
	prevLogger := logger
	logger = log.New(&buf, "", log.Lshortfile)
	t.Cleanup(func() { logger = prevLogger })

	dumped, err := time.Parse(time.DateOnly, "2018-01-01")
	if err != nil {
		t.Fatal(err)
	}
	site := &WikiSite{
		Key:        "alswiktionary",
		Domain:     "als.wiktionary.org",
		LastDumped: dumped,
		Namespaces: make(map[string]*Namespace, 20),
	}

	dumps := filepath.Join("testdata", "bug_41")
	if err := readNamespaces(site, dumps); err != nil {
		t.Fatal(err)
	}
	if len(site.Namespaces) != 0 {
		t.Errorf("got %v, want empty map", site.Namespaces)
	}

	// Report through t.Errorf: a bare fmt.Errorf only builds an error
	// value and would never mark the test as failed.
	if gotLog := buf.String(); !strings.Contains(gotLog, "alswiktionary") {
		t.Errorf("log should contain name of malformed Wiki dump, log=%q", gotLog)
	}
}

0 comments on commit 97c64ac

Please sign in to comment.