Finding years in text


It's interesting to parse text and find a time slice. Let's look at an example:
In 305 BC, Ptolemy took the title of Pharaoh. As Ptolemy I Soter (“Saviour”), he founded the Ptolemaic dynasty that was to rule Egypt for nearly 300 years.
From the text above we can extract [-305].
An invasion of Macedonia and Greece by Goths, who had been displaced from their lands on the Black Sea, was defeated by emperor Claudius II Gothicus at the Battle of Naissus in 268 or 269 years.
Here there are two years: [268] and [269].
But there could be a range:
Ptolemy successfully defended Egypt against an invasion by Perdiccas in 321 B.C. and consolidated his position in Egypt and the surrounding areas during the Wars of the Diadochi (322–301 BC).
The text above contains the year [-321] and the range [-322, -301].
Here are Roman numerals:
At the end of the II century BC, the Pharnavazid king Pharnajom was dethroned by his own subjects and the crown given to the Armenian prince Arshak who ascended the Iberian throne in 93 BC, establishing the Arshakids dynasty.
with two dates [-200] and [-93]

Notices

So we can see that we need to parse numbers and detect a multiplier. BC in general means a negative sign. AD is almost never encountered in a text, but it means a positive sign. This view of the era boundary will be enough for us.

Dictionary of tokens

I looked at text and collected tokens like this
“BC”, “B.C.”, “century BC”, “in {number}”, “in {number} or {number}”, “by late {number}”

Code

It is interesting to try parsing years in text from the end, simply because year tokens are located after the digits. So if the program encounters “BC” or “B.C.”, a number is expected next to it; it may be Arabic or Roman. If the program finds BC, it means we need to apply a negative sign and convert 234 BC to just -234, so a sign multiplier is needed. If the program encounters something like “years ago”, it needs to apply a correction of 2000 years (-2k). E.g. 5000 years ago actually means -(5000 - 2000) = -3000. So we can write something stupid like:
package years

import (
    "errors"
    "sort"
    "strconv"
    "strings"
    "unicode"
)

// tokens lists the markers searched for in the text. Each entry gives the
// marker string, the sign applied to the number found before it, and the
// correction added to that number afterwards. The order is significant: the
// lookup processes the markers in this sequence.
var tokens = []token{
    {"years ago", -1, 2000},
    {"BC", -1, 0},
    {"bc", -1, 0},
    {"B.C.", -1, 0},
    {"b.c.", -1, 0},
}

// multipliers lists the unit words that scale a parsed number, each with the
// factor the number is multiplied by.
var multipliers = []multiplier{
    {"century", 100},
}

// rangeTokens holds the dash characters that mark a year range:
// hyphen-minus, em dash and en dash.
var rangeTokens = []rune{'-', '—', '–'}

// romanNumbers gives the value of each Roman numeral symbol. Only the
// upper-case letters are listed.
var romanNumbers = map[rune]int{
    'I': 1, 'V': 5,
    'X': 10, 'L': 50,
    'C': 100, 'D': 500,
    'M': 1000,
}

// token describes a marker that follows a year in the text.
//
// str is the marker text that is searched for, sign is the factor the parsed
// number is multiplied by, and correction is added to the number afterwards.
type token struct {
    str              string
    sign, correction float64
}

// multiplier pairs a unit word with the factor applied to a parsed number
// when that word is found in the text following the number.
type multiplier struct {
    str        string  // unit word looked for in the text, e.g. "century"
    multiplier float64 // factor the parsed number is multiplied by
}

// YearRow is one lookup result: the parsed years together with the text they
// were found in and the place they are associated with.
type YearRow struct {
    Years   []int  // parsed years; negative values come from markers with a negative sign
    Context string // text the years were extracted from
    Place   string // place supplied by the caller
}

// String formats the row as "<years joined by commas>|<Place>|<Context>"
// followed by a newline.
func (y YearRow) String() string {
    parts := make([]string, len(y.Years))
    for i, year := range y.Years {
        parts[i] = strconv.Itoa(year)
    }
    return strings.Join(parts, ",") + "|" + y.Place + "|" + y.Context + "\n"
}

// firstLeftRange extracts the year, or year range, closest to the end of s.
//
// The right-most number in s is located with leftNumber. If one of the
// multipliers' words occurs between that number and the end of s, the number
// is scaled by the corresponding factor; correction is then added. The scan
// continues to the left: once a range token has been seen, the next number
// found is taken as the other end of a range and gets the same factor and
// correction.
//
// sign is handed to leftNumber, which applies it to every parsed number.
//
// It returns the years in ascending order (an empty slice when no number was
// found) and the rune index just past the right-most number, or -1 when no
// number was found.
func firstLeftRange(s string, sign float64, correction float64) ([]int, int) {
    years := make([]int, 0)
    preceding := []rune(s)

    year, j, ri := leftNumber(preceding, sign)
    if j < 0 {
        return years, ri
    }

    // leftNumber reports rune indexes, so the tail must be cut from the rune
    // slice. Slicing s directly would treat j as a byte offset and start too
    // early whenever multi-byte characters precede the number.
    tail := string(preceding[j:])
    factor := 1.0
    for _, m := range multipliers {
        if strings.Contains(tail, m.str) {
            factor = m.multiplier
            break
        }
    }
    years = append(years, int(year*factor+correction))

    isNoticedRange := false
    for i := j - 1; i >= 0; i-- {
        r := preceding[i]
        if containsRune(rangeTokens, r) {
            isNoticedRange = true
        }
        if !isNoticedRange || !isDigitOrRoman(r) {
            continue
        }
        other, start, _ := leftNumber(preceding[:i+1], sign)
        if start >= 0 {
            // Second end of the range found; a range never has more than two ends.
            years = append(years, int(other*factor+correction))
            break
        }
    }

    // years[0] is the right end and years[1] the left end as written in the text.
    // NOTE(review): a positive range whose right end is smaller than its left
    // end is negated; presumably such a descending range is read as BC.
    // TODO confirm the intent.
    if len(years) == 2 && years[0] < years[1] && years[0] > 0 {
        years[0] = -years[0]
        years[1] = -years[1]
    }
    sort.Ints(years)
    return years, ri
}

// leftNumber finds the right-most number in runes, scanning from the end.
//
// A number is a run of characters accepted by isDigitOrRoman, optionally with
// ',' or '.' inside it. The run is parsed as a decimal float first and, if
// that fails, as a Roman numeral.
//
// It returns the parsed value multiplied by sign, the rune index of the first
// character of the number, and the rune index just past its last character.
// When no number is found or it cannot be parsed, it returns -1, -1, -1.
func leftNumber(runes []rune, sign float64) (float64, int, int) {
    leftDigitind := -1
    rightDigitInd := -1

    for i := len(runes) - 1; i >= 0; i-- {
        r := runes[i]
        if rightDigitInd < 0 {
            // Still looking for the last character of the number.
            if isDigitOrRoman(r) {
                rightDigitInd = i + 1
            }
            continue
        }
        if !isDigitOrRoman(r) && r != ',' && r != '.' {
            leftDigitind = i + 1
            break
        }
    }
    if rightDigitInd < 0 {
        return -1, -1, -1
    }
    if leftDigitind < 0 {
        // The scan reached the beginning of the input, so the number starts at
        // index 0. This also covers a one-character number at index 0, which
        // would otherwise be missed.
        leftDigitind = 0
    }

    s := string(runes[leftDigitind:rightDigitInd])
    // A comma with no period in the same number is read as a decimal comma.
    // NOTE(review): this also turns a thousands separator ("1,500") into 1.5.
    // Confirm this is intended.
    if strings.Contains(s, ",") && !strings.Contains(s, ".") {
        s = strings.Replace(s, ",", ".", -1)
    }
    num, err := strconv.ParseFloat(s, 64)
    if err != nil {
        num, err = parseRoman(s)
    }
    if err != nil {
        return -1, -1, -1
    }
    return sign * num, leftDigitind, rightDigitInd
}

// isDigitOrRoman reports whether r is a Unicode decimal digit or one of the
// symbols listed in romanNumbers.
func isDigitOrRoman(r rune) bool {
    _, isRoman := romanNumbers[r]
    return isRoman || unicode.IsDigit(r)
}

// parseRoman converts a Roman numeral string to its value.
//
// Symbols are summed left to right. When a symbol is larger than the one
// before it, the previous symbol is subtracted instead of added (as in "IV").
// Characters missing from romanNumbers count as 0. An error is returned when
// the total is 0.
func parseRoman(roman string) (float64, error) {
    // prev starts at 1000 so the first symbol never triggers the subtraction.
    total, prev := 0, 1000
    for _, symbol := range roman {
        value := romanNumbers[symbol]
        if value > prev {
            // prev was already added once; take it back and subtract it.
            total -= 2 * prev
        }
        total += value
        prev = value
    }
    if total == 0 {
        return -1, errors.New("Cant parse roman digits")
    }
    return float64(total), nil
}

// TODO: notify if parsing is unsuccessful - encoding?

// LookupYearRow finds the years mentioned in str.
//
// For every entry of tokens, in order, it walks the occurrences of the marker
// from right to left and extracts the year or year range written before each
// occurrence. Every result becomes a YearRow whose Context is str and whose
// Place is place. A result is skipped when it ends at the same position as
// the previously examined one, or when no number was found before the marker.
func LookupYearRow(str, place string) []YearRow {
    yrs := make([]YearRow, 0)
    lastRightInd := -1
    for _, t := range tokens {
        // A marker at index 0 has no text before it, so the walk stops there.
        for i := strings.LastIndex(str, t.str); i > 0; i = strings.LastIndex(str[:i], t.str) {
            leftRange, rightInd := firstLeftRange(str[:i], t.sign, t.correction)
            if rightInd == lastRightInd {
                continue
            }
            lastRightInd = rightInd
            // A marker with no number before it yields no years; do not emit
            // an empty row for it.
            if len(leftRange) == 0 {
                continue
            }
            yrs = append(yrs, YearRow{Years: leftRange, Context: str, Place: place})
        }
    }
    return yrs
}

// containsRune reports whether rn occurs in runes.
func containsRune(runes []rune, rn rune) bool {
    for i := 0; i < len(runes); i++ {
        if runes[i] == rn {
            return true
        }
    }
    return false
}

Tests

package years_test

import (
    "testing"

    "github.com/quewelcy/apostaxi/years"
)

// TestYears runs the year lookup over sample sentences and checks the first
// parsed row of each against the expected years.
func TestYears(t *testing.T) {
    cases := []struct {
        text string
        want []int
    }{
        {"In 305 BC, Ptolemy took the title of Pharaoh. That was to rule Egypt for nearly 300 years.", []int{-305}},
        {"Ptolemy successfully defended Egypt against an invasion by Perdiccas in 321 B.C.", []int{-321}},
        {"during the Wars of the Diadochi (322–301 BC).", []int{-322, -301}},
        {"At the end of the II century BC, the Pharnavazid king Pharnajom was dethroned", []int{-200}},
        {"Arshak ascended the Iberian throne in 93 BC.", []int{-93}},
        {"3450 years ago: Mycenean Greece, first deciphered writing in Europe", []int{-1450}},
    }
    for _, c := range cases {
        testYear(t, c.text, c.want)
    }
}

// testYear parses r with years.LookupYearRow and compares the result with
// expYears: the i-th expected set must equal the Years of the i-th parsed row.
// It reports an error, without stopping the calling test, when a row is
// missing, a row holds a different number of years, a year differs, or more
// rows were parsed than expected.
func testYear(t *testing.T, r string, expYears ...[]int) {
    t.Helper()
    // Named rows rather than years so the imported package is not shadowed.
    rows := years.LookupYearRow(r, "")
    for i, expYearSet := range expYears {
        if i >= len(rows) {
            t.Error("Not all expected years parsed. Expected", expYears, "received", rows)
            return
        }
        got := rows[i].Years
        // Compare lengths first so a short row is reported instead of causing
        // an index-out-of-range panic, and extra years do not pass silently.
        if len(got) != len(expYearSet) {
            t.Error("Expected years", expYearSet, "got", got)
            return
        }
        for j, expYear := range expYearSet {
            if expYear != got[j] {
                t.Error("Expected", expYear, "got", got[j])
                return
            }
        }
    }
    if len(rows) > len(expYears) {
        t.Error("More years parsed than expected. Expected", expYears, "received", rows)
    }
}

Popular Posts