2024-03-10 01:51:01 +00:00
|
|
|
package extractor
|
|
|
|
|
|
|
|
import (
|
|
|
|
"errors"
|
|
|
|
"github.com/advancedlogic/GoOse"
|
2024-03-12 19:06:40 +00:00
|
|
|
"log/slog"
|
2024-03-10 01:51:01 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// ErrExtractFailed is the sentinel error reported when an article could not
// be extracted.
var ErrExtractFailed = errors.New("extraction failed")
|
|
|
|
|
|
|
|
type Extractor struct {
|
|
|
|
goose *goose.Goose
|
|
|
|
}
|
|
|
|
|
|
|
|
func NewExtractor() *Extractor {
|
|
|
|
gooseExtractor := goose.New()
|
|
|
|
|
|
|
|
return &Extractor{
|
|
|
|
goose: &gooseExtractor,
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Article is the result of a successful extraction.
type Article struct {
	// Title is the title reported by the extractor.
	Title string
	// Text is the cleaned text reported by the extractor.
	Text string
	// Url is the final URL reported by the extractor.
	Url string
}
|
|
|
|
|
|
|
|
func (e *Extractor) GetArticleFromUrl(url string) (Article, error) {
|
2024-03-12 19:06:40 +00:00
|
|
|
slog.Info("extractor: requested extraction from URL ", "url", url)
|
|
|
|
|
2024-03-10 01:51:01 +00:00
|
|
|
article, err := e.goose.ExtractFromURL(url)
|
|
|
|
|
|
|
|
if err != nil {
|
2024-03-12 19:06:40 +00:00
|
|
|
slog.Error("extractor: failed extracting from URL", "url", url)
|
|
|
|
|
2024-03-10 01:51:01 +00:00
|
|
|
return Article{}, ErrExtractFailed
|
|
|
|
}
|
|
|
|
|
2024-03-12 19:06:40 +00:00
|
|
|
slog.Debug("extractor: article extracted", "article", article)
|
|
|
|
|
2024-03-10 01:51:01 +00:00
|
|
|
return Article{
|
|
|
|
Title: article.Title,
|
|
|
|
Text: article.CleanedText,
|
|
|
|
Url: article.FinalURL,
|
|
|
|
}, nil
|
|
|
|
}
|