diff --git a/app/assets/build/config.yml b/app/assets/build/config.yml index 3f390b3..a17b412 100644 --- a/app/assets/build/config.yml +++ b/app/assets/build/config.yml @@ -34,7 +34,14 @@ xml_file_location: # The path used for OCR files on disk. Solr will read ocr files from this directory if "lazy" indexing is used. # (Use Windows file path for Windows.) "/var/ocr_files" - +input_image_resolution: + # ALTO files aren't required to use pixel units. If you have ALTO files that were created with units other than + # pixels you are advised to update your files before submitting them to be indexed. However, this service + # will attempt to convert 'inch1200' and 'mm10' units to pixels. When the unit is 'inch1200' + # the conversion is in part based on the image resolution (dpi). The service will look for the resolution + # in the ALTO processing elements. If it is not found, the default resolution below is used. You can change + # the default resolution if needed. + 300 log_dir: # The location of your log directory. (Use Windows file path for Windows.) "/var/log/ocr_processor" diff --git a/app/build_distros.sh b/app/build_distros.sh index e308642..c742bc1 100644 --- a/app/build_distros.sh +++ b/app/build_distros.sh @@ -1,8 +1,11 @@ + #!/bin/bash PLATFORMS="darwin/amd64" PLATFORMS="$PLATFORMS windows/amd64" +PLATFORMS="$PLATFORMS windows/386" PLATFORMS="$PLATFORMS linux/amd64" +PLATFORMS="$PLATFORMS linux/386" PLATFORMS="$PLATFORMS freebsd/amd64" PLATFORMS="$PLATFORMS netbsd/amd64" PLATFORMS="$PLATFORMS openbsd/amd64" diff --git a/app/config.yml b/app/config.yml index 0730c76..e47a385 100644 --- a/app/config.yml +++ b/app/config.yml @@ -33,7 +33,14 @@ escape_utf8: xml_file_location: # The path used for OCR files on disk. Solr will read ocr files from this directory if "lazy" indexing is used. "/var/ocr_files" - +input_image_resolution: + # ALTO files aren't required to use pixel units. If you have ALTO files that were created with units other than + # pixels you are advised to update your files before submitting them to be indexed. However, this service + # will attempt to convert 'inch1200' and 'mm10' units to pixels. When the unit is 'inch1200' + # the conversion is in part based on the image resolution (dpi). The service will look for the resolution + # in the ALTO processing elements. If it is not found, the default resolution below is used. You can change + # the default resolution if needed. + 300 log_dir: # The location of your log directory. "/var/log/ocr_processor" diff --git a/app/model/config.go b/app/model/config.go index eb30512..bae5420 100644 --- a/app/model/config.go +++ b/app/model/config.go @@ -1,15 +1,16 @@ package model type Configuration struct { - DSpaceHost string - Collections []string - SolrUrl string - SolrCore string - ConvertToMiniOcr bool - IndexType string - EscapeUtf8 bool - XmlFileLocation string - HttpPort string - IpWhitelist []string - LogDir string + DSpaceHost string + Collections []string + SolrUrl string + SolrCore string + ConvertToMiniOcr bool + IndexType string + EscapeUtf8 bool + XmlFileLocation string + HttpPort string + IpWhitelist []string + InputImageResolution int + LogDir string } diff --git a/app/process/alto.go b/app/process/alto.go index b82ffef..aa594f8 100644 --- a/app/process/alto.go +++ b/app/process/alto.go @@ -7,6 +7,7 @@ import ( "github.com/mspalti/ocrprocessor/model" "io" "log" + "regexp" "strconv" "strings" ) @@ -52,7 +53,19 @@ func updateAlto(alto *[]byte, position int, settings model.Configuration) (*stri decoder := xml.NewDecoder(reader) encoder := xml.NewEncoder(&buffer) + var dpiMatcher = regexp.MustCompile(`xdpi:(\d+)`) + + // These control conversion from inch1200 to pixel units + // This will be attempted for every ALTO file in which the + // MeasurementUnit is set to 'inch1200' + checkUnit := false + convertInchToPixel := false + convertMM10ToPixel := false + lookForDpi := false + dpiValue := -1 + for { + token, err := decoder.RawToken() if err == io.EOF { break @@ -63,26 +76,90 @@ func updateAlto(alto *[]byte, position int, settings model.Configuration) (*stri } switch t := token.(type) { + case xml.CharData: + + str := string(t) + if checkUnit { + if str == "inch1200" { + convertInchToPixel = true + t = []byte("pixel") + } + if str == "mm10" { + convertMM10ToPixel = true + t = []byte("pixel") + } + checkUnit = false + } + if lookForDpi { + dpi := dpiMatcher.FindSubmatch([]byte(str)) + dpiValue, err = strconv.Atoi(string(dpi[1])) + if err != nil { + return nil, err + } + lookForDpi = false + } + case xml.StartElement: + if t.Name.Local == "MeasurementUnit" { + checkUnit = true + } + if t.Name.Local == "processingStepSettings" { + lookForDpi = true + } if t.Name.Local == "Page" { id := "Page." + strconv.Itoa(position) - pos := getPosition(t, "ID") - t.Attr[pos].Value = id + idPos := getPosition(t, "ID") + t.Attr[idPos].Value = id + if convertInchToPixel { + err := inchToPixel(&t, dpiValue, settings) + if err != nil { + return nil, err + } + } + if convertMM10ToPixel { + err := mmToPixel(&t) + if err != nil { + return nil, err + } + } if err := encoder.EncodeToken(t); err != nil { return nil, err } continue } - if t.Name.Local == "String" && settings.EscapeUtf8 && settings.IndexType == "lazy" { - pos := getPosition(t, "CONTENT") - t.Attr[pos].Value = ToXmlCodePoint(t.Attr[pos].Value) - if err := encoder.EncodeToken(t); err != nil { - return nil, err + if t.Name.Local == "String" { + modified := false + if settings.EscapeUtf8 && settings.IndexType == "lazy" { + pos := getPosition(t, "CONTENT") + t.Attr[pos].Value = ToXmlCodePoint(t.Attr[pos].Value) + modified = true + } + if convertMM10ToPixel || convertInchToPixel { + if convertInchToPixel { + err := inchToPixel(&t, dpiValue, settings) + if err != nil { + return nil, err + } + } + if convertMM10ToPixel { + err := mmToPixel(&t) + if err != nil { + return nil, err + } + } + modified = true + } + // If String token values were modified then encode now and continue. + if modified { + if err := encoder.EncodeToken(t); err != nil { + return nil, err + } + continue } - continue } } + if err := encoder.EncodeToken(xml.CopyToken(token)); err != nil { return nil, err } @@ -99,6 +176,98 @@ func updateAlto(alto *[]byte, position int, settings model.Configuration) (*stri } +func inchToPixel(t *xml.StartElement, dpiValue int, settings model.Configuration) error { + + h := getPosition(*t, "HEIGHT") + w := getPosition(*t, "WIDTH") + harr := strings.Split(t.Attr[h].Value, ".") + warr := strings.Split(t.Attr[w].Value, ".") + height, err := strconv.Atoi(harr[0]) + if err != nil { + return err + } + width, err := strconv.Atoi(warr[0]) + if err != nil { + return err + } + t.Attr[h].Value = convertInchDimToPixel(height, dpiValue, settings) + t.Attr[w].Value = convertInchDimToPixel(width, dpiValue, settings) + + if t.Name.Local == "String" { + hp := getPosition(*t, "HPOS") + vp := getPosition(*t, "VPOS") + hposarr := strings.Split(t.Attr[hp].Value, ".") + vposarr := strings.Split(t.Attr[vp].Value, ".") + hpos, err := strconv.Atoi(hposarr[0]) + if err != nil { + return err + } + vpos, err := strconv.Atoi(vposarr[0]) + if err != nil { + return err + } + t.Attr[hp].Value = convertInchDimToPixel(hpos, dpiValue, settings) + t.Attr[vp].Value = convertInchDimToPixel(vpos, dpiValue, settings) + } + + return nil +} + +func convertInchDimToPixel(input int, dpi int, settings model.Configuration) string { + if dpi == -1 { + dpi = settings.InputImageResolution + } + dim := (input * dpi) / 1200 + return strconv.Itoa(dim) +} + +// mmToPixel updates mm10 unit values to pixels +func mmToPixel(t *xml.StartElement) error { + h := getPosition(*t, "HEIGHT") + w := getPosition(*t, "WIDTH") + hp := getPosition(*t, "HPOS") + vp := getPosition(*t, "VPOS") + htmm, err := strconv.Atoi(t.Attr[h].Value) + if err != nil { + return err + } + wdmm, err := strconv.Atoi(t.Attr[w].Value) + if err != nil { + return err + } + var hposmm *int + var vposmm *int + if hp >= 0 { + v, err := strconv.Atoi(t.Attr[hp].Value) + hposmm = &v + if err != nil { + return err + } + } + if vp >= 0 { + v, err := strconv.Atoi(t.Attr[vp].Value) + vposmm = &v + if err != nil { + return err + } + } + height := 3.7795275591 * float64(htmm) + width := 3.7795275591 * float64(wdmm) + t.Attr[h].Value = strconv.Itoa(int(height)) + t.Attr[w].Value = strconv.Itoa(int(width)) + + if hposmm != nil { + hpos := 3.7795275591 * float64(*hposmm) + t.Attr[hp].Value = strconv.Itoa(int(hpos)) + } + if vposmm != nil { + vpos := 3.7795275591 * float64(*vposmm) + t.Attr[vp].Value = strconv.Itoa(int(vpos)) + } + + return nil +} + // convertToMiniOcr creates miniOcr output from the ALTO input. func convertToMiniOcr(original *string, position int, settings model.Configuration) (*string, error) { reader := strings.NewReader(*original) @@ -127,8 +296,10 @@ func convertToMiniOcr(original *string, position int, settings model.Configurati case xml.StartElement: if t.Name.Local == "Page" { - height := t.Attr[2].Value - width := t.Attr[3].Value + h := getPosition(t, "HEIGHT") + w := getPosition(t, "WIDTH") + height := t.Attr[h].Value + width := t.Attr[w].Value dims := width + " " + height textBlockElements = nil pageId := "Page." + strconv.Itoa(position) @@ -160,11 +331,18 @@ func convertToMiniOcr(original *string, position int, settings model.Configurati continue } if t.Name.Local == "String" { - content := t.Attr[0] - height := t.Attr[1] - width := t.Attr[2] - vpos := t.Attr[3] - hpos := t.Attr[4] + c := getPosition(t, "CONTENT") + h := getPosition(t, "HEIGHT") + w := getPosition(t, "WIDTH") + hp := getPosition(t, "HPOS") + vp := getPosition(t, "VPOS") + + content := t.Attr[c] + height := t.Attr[h] + width := t.Attr[w] + hpos := t.Attr[hp] + vpos := t.Attr[vp] + var str = "" if escape { str = ToXmlCodePoint(content.Value)