Merge pull request #390 from ajwheeler/exomol_linelists

parsing exomol linelists
ajwheeler · Jan 2, 2025 · 38990bd · 38990bd
2 parents b9d035f + 54bab06
commit 38990bd
Show file tree

Hide file tree

Showing 11 changed files with 918 additions and 15 deletions.
diff --git a/Project.toml b/Project.toml
@@ -7,6 +7,7 @@ version = "0.41.1"
 CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
 Compat = "34da2185-b29b-5c13-b0c7-acf172513d20"
 DSP = "717857b8-e6f2-59f4-9121-6e50c889abd2"
+DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
 DiffResults = "163ba53b-c6d8-5494-b064-1a9d43ac40c5"
 FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
@@ -23,12 +24,14 @@ SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf"
 SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
+TableOperations = "ab02a1b2-a7df-11e8-156e-fb1833f50b87"
 Trapz = "592b5752-818d-11e9-1e9a-2b8ca4a44cd1"
 
 [compat]
 CSV = "0.8, 0.9, 0.10"
 Compat = "4.10 - 4"
 DSP = "0.7, 0.8"
+DataFrames = "1.7.0"
 DiffResults = "1"
 FastGaussQuadrature = "0.4, 0.5, 1"
 ForwardDiff = "0.10"
@@ -45,5 +48,6 @@ SparseArrays = "1.7 - 1"
 SpecialFunctions = "1.4, 2"
 StaticArrays = "1"
 Statistics = "1"
+TableOperations = "1.2.0"
 Trapz = "2"
 julia = "1.7 - 1"
diff --git a/docs/src/API.md b/docs/src/API.md
@@ -1,15 +1,16 @@
-This page documents the Korg API for users of the package. Low-level functions that only people 
-digging into the internals of Korg will be interested in can be found in the 
+This page documents the Korg API for users of the package. Low-level functions that only people
+digging into the internals of Korg will be interested in can be found in the
 [Developer documentation](@ref).
 
 # [Top-level functions](@id API)
-If you are synthesize a spectrum Korg, these are the functions you will call.  
+If you are synthesize a spectrum Korg, these are the functions you will call.
 These functions are exported, so if you do `using Korg`, you can call them unqualified (i.e.
-`synthesize` instead of `Korg.synthesize`).  
+`synthesize` instead of `Korg.synthesize`).
 
 ```@docs
 synthesize
 read_linelist
+load_ExoMol_linelist
 read_model_atmosphere
 interpolate_marcs
 format_A_X
@@ -30,7 +31,7 @@ Korg.Fit.ews_to_stellar_parameters
 ```
 
 # Secondary functions
-You don't need use these to synthesize spectra, but they might be relevant depending on what you are 
+You don't need use these to synthesize spectra, but they might be relevant depending on what you are
 doing.
 
 ## Postprocessing

diff --git a/src/Korg.jl b/src/Korg.jl
@@ -29,6 +29,6 @@ include("qfactors.jl")                 # formalism to compute theoretical RV pre
 @compat public get_APOGEE_DR17_linelist, get_GALAH_DR3_linelist, get_GES_linelist, save_linelist,
                Fit, apply_LSF, compute_LSF_matrix, air_to_vacuum, vacuum_to_air, line_profile,
                blackbody, prune_linelist, merge_close_lines, Line
-export synthesize, read_linelist, read_model_atmosphere, interpolate_marcs,
+export synthesize, read_linelist, load_ExoMol_linelist, read_model_atmosphere, interpolate_marcs,
        format_A_X
 end # module
diff --git a/src/constants.jl b/src/constants.jl
@@ -15,5 +15,5 @@ const eV_to_cgs = 1.602e-12 # ergs per eV
 
 const kboltz_eV = 8.617333262145e-5 #eV/K
 const hplanck_eV = 4.135667696e-15 #eV*s
-const RydbergH_eV = 13.598287264 #eV 
+const RydbergH_eV = 13.598287264 #eV
 const Rydberg_eV = 13.605693122994 #eV 2018 CODATA via wikipedia
diff --git a/src/linelist.jl b/src/linelist.jl
@@ -1,4 +1,5 @@
-using CSV, HDF5, LazyArtifacts
+using CSV, HDF5, LazyArtifacts, TableOperations
+using DataFrames: DataFrame, leftjoin, leftjoin!, rename!
 using Pkg.Artifacts: @artifact_str
 
 #This type represents an individual line.
@@ -165,6 +166,101 @@ function approximate_gammas(wl, species, E_lower; ionization_energies=ionization
     γstark, log_γvdW
 end
 
+"""
+    load_ExoMol_linelist(specs, states_file, transitions_file, upper_wavelength, lower_wavelength)
+
+Load a linelist from ExoMol. Returns a vector of [`Line`](@ref)s, the same as [`read_linelist`](@ref).
+
+# Arguments
+
+  - `spec`: the species, i.e. the molecule that the linelist is for
+  - `states_file`: the path to the ExoMol states file
+  - `transitions_file`: the path to the ExoMol, transitions file
+  - `upper_wavelength`: the upper limit of the wavelength range to load (Å)
+  - `lower_wavelength`: the lower limit of the wavelength range to load (Å)
+
+# Keyword Arguments
+
+  - `line_strength_cutoff`: the cutoff for the line strength (default: -15) used to filter the
+    linelist. See [`approximate_line_strength`](@ref) for more information.
+  - `T_line_strength`: the temperature (K) at which to evaluate the line strength (default: 3500.0)
+
+!!! warning
+
+    This functionality is in beta.
+"""
+function load_ExoMol_linelist(spec, states_file, transitions_file, ll, ul;
+                              line_strength_cutoff=-15, T_line_strength=3500.0)
+    if spec isa AbstractString
+        spec = Species(spec)
+    end
+    @info "Loading ExoMol linelist from $states_file and $transitions_file. This functionality is experimental. Please report any issues."
+
+    if !occursin("states", states_file) || !occursin("trans", transitions_file)
+        @info "The states and transitions files, $states_file and $transitions_file, don't contain the string 'states' or 'trans', respectively. You may have mixed them up."
+    end
+
+    # These contortions allow us to read only the first N columns, without parsing the rest.
+    # This is (probably) faster, and more memory efficient. But also the ExoMol files sometimes
+    # have various extra columns. Hopefully we can cound on the first three being consistent.
+    raw_transitions = CSV.File(transitions_file; delim=" ", ignorerepeated=true, header=false) |>
+                      TableOperations.select(:Column1, :Column2, :Column3) |>
+                      DataFrame
+    rename!(raw_transitions, :Column1 => :id_upper, :Column2 => :id_lower, :Column3 => :A)
+    states = CSV.File(states_file; delim=" ", ignorerepeated=true, header=false) |>
+             TableOperations.select(:Column1, :Column2, :Column3) |>
+             DataFrame
+    rename!(states, :Column1 => :id, :Column2 => :E_wavenumber, :Column3 => :g)
+
+    # join the transitions and states tables on the id column to get the upper and lower level info
+    transitions = leftjoin(raw_transitions, states; on=:id_upper => :id)
+    rename!(transitions, :E_wavenumber => :wavenumber_upper, :g => :g_upper)
+    leftjoin!(transitions, states; on=:id_lower => :id)
+    rename!(transitions, :E_wavenumber => :wavenumber_lower, :g => :g_lower)
+
+    # if the column is missing, the join didn't find a matching state
+    if any(ismissing.(transitions.g_lower)) || any(ismissing.(transitions.g_upper))
+        throw(ArgumentError("Some of the transitions in $transitions_file can't be mapped to states in $states_file"))
+    end
+
+    # Gray 4th ed, eq 11.12 (page 214) but with an extra factor of 1/4π for stradian vs unit sphere
+    # difference of 1e16 is due to Å vs cm
+    prefactor = (Korg.electron_mass_cgs * Korg.c_cgs) / (8π^2 * Korg.electron_charge_cgs^2)
+
+    transitions.wavenumber = transitions.wavenumber_upper .- transitions.wavenumber_lower
+    transitions.f = @. transitions.A * prefactor * transitions.g_upper /
+                       (transitions.g_lower * transitions.wavenumber^2)
+    transitions.log_gf = log10.(transitions.g_lower .* transitions.f)
+
+    isotopic_correction = log10(prod(maximum(last.(collect(values(Korg.isotopic_abundances[Z]))))
+                                     for Z in get_atoms(spec.formula)))
+    @info "Applying isotopic correction of $isotopic_correction to all log gf values. (Assuming most abundant isotope for all atoms.)"
+    transitions.log_gf .+= isotopic_correction
+
+    transitions.E_lower = @. Korg.hplanck_eV * transitions.wavenumber_lower * Korg.c_cgs
+    transitions.wavelength = 1.0 ./ transitions.wavenumber
+
+    region = transitions[ll.<transitions.wavelength.*1e8.<ul, :]
+
+    lines = map(eachrow(region)) do row
+        Korg.Line(row.wavelength, row.log_gf, spec, row.E_lower)
+    end |> reverse
+    lines = lines[approximate_line_strength.(lines, T_line_strength).>line_strength_cutoff]
+    sort!(lines; by=l -> l.wl)
+    lines
+end
+
+"""
+    approximate_line_strength(line::Line, T)
+
+Approximate the line strength (`log10(gfλ) - θχ`, in arbitrary units) of a line at temperature
+`T` (K).  This can be used to very quickly filter a linelist, and is used to filter very large
+molecular linelists from ExoMol (see [`load_ExoMol_linelist`](@ref)).
+"""
+function approximate_line_strength(line::Line, T)
+    line.log_gf + log10(line.wl) - log10(ℯ) * line.E_lower / (Korg.kboltz_eV * T)
+end
+
 """
     read_linelist(filename; format="vald", isotopic_abundances=Korg.isotopic_abundances)
 
@@ -205,7 +301,7 @@ a dict mapping atomic number to a dict mapping from atomic weight to abundance.
 Be warned that for linelists which are pre-scaled for isotopic abundance, the estimation of
 radiative broadening from log(gf) is not accurate.
 
-See also [`save_linelist`](@ref).
+See also: [`load_ExoMol_linelist`](@ref), [`save_linelist`](@ref).
 """
 function read_linelist(fname::String;
                        format=endswith(fname, ".h5") ? "korg" : "vald",

diff --git a/src/species.jl b/src/species.jl
@@ -6,7 +6,7 @@ const MAX_ATOMS_PER_MOLECULE = 6
 Represents an atom or molecule, irrespective of its charge.
 """
 struct Formula
-    # Support molecules with up to MAX_ATOMS_PER_MOLECULE atoms. 
+    # Support molecules with up to MAX_ATOMS_PER_MOLECULE atoms.
     # Unlike tuples, SVectors support sorting, which is why we use them here.
     atoms::SVector{6,UInt8}
 
@@ -30,7 +30,7 @@ struct Formula
     """
         Formula(code::String)
 
-    Construct a Formula from an encoded string form.  This can be a MOOG-style numeric code, 
+    Construct a Formula from an encoded string form.  This can be a MOOG-style numeric code,
     i.e. "0801" for OH, or an atomic or molecular symbol, i.e. "FeH", "Li", or "C2".
     """
     function Formula(code::AbstractString)
@@ -117,7 +117,33 @@ function n_atoms(f::Formula)
 end
 
 # it's important that this produces something parsable by the constructor
-Base.show(io::IO, f::Formula) = print(io, *([atomic_symbols[i] for i in f.atoms if i != 0]...))
+function Base.show(io::IO, f::Formula)
+    i = findfirst(f.atoms .!= 0)
+    @assert !isnothing(i) # formula with no atoms should be allowed by the constructor
+
+    a = f.atoms[i]
+    n_of_this_atom = 1
+    i += 1
+    while i <= length(f.atoms)
+        if f.atoms[i] == a
+            n_of_this_atom += 1
+        else
+            formula_show_helper(io, a, n_of_this_atom)
+            a = f.atoms[i]
+            n_of_this_atom = 1
+        end
+        i += 1
+    end
+    formula_show_helper(io, a, n_of_this_atom)
+end
+
+function formula_show_helper(io::IO, atom, n_of_this_atom)
+    if n_of_this_atom == 1
+        print(io, atomic_symbols[atom])
+    else
+        print(io, atomic_symbols[atom], n_of_this_atom)
+    end
+end
 
 """
     ismolecule(f::Formula)
@@ -183,19 +209,19 @@ representing the species. `code` can be either a string or a float.
 function Species(code::AbstractString)
     code = strip(code, ['0', ' ']) # leading 0s are safe to remove
 
-    # if the species ends in "+" or "-", convert it to a numerical charge. Remember, the ionization 
+    # if the species ends in "+" or "-", convert it to a numerical charge. Remember, the ionization
     # number is the charge+1, so for us "H 0" is H⁻ and "H 2" in H⁺.
     if code[end] == '+'
         code = code[1:end-1] * " 2"
     elseif code[end] == '-'
         code = code[1:end-1] * " 0"
     end
 
-    # these are the valid separators between the atomic number part of a species code and the 
+    # these are the valid separators between the atomic number part of a species code and the
     # charge-containing part.  For example, "01.01" parses the same as "01 01", but "01,01" fails.
     # Or, "C 2" parses the same at "C.2".  "-"s can't be separators as they can be minus signs.
     toks = split(code, [' ', '.', '_'])
-    # this allows for leading, trailing, and repeat separators.  "01.01" parses the same as 
+    # this allows for leading, trailing, and repeat separators.  "01.01" parses the same as
     # ".01..01.".
     filter!(!=(""), toks)