Aggiornamento massivo: aggiunto backend PostgreSQL per statistiche aste con fallback SQLite, nuovi modelli e servizi, UI moderna con grafici interattivi, refactoring stato applicazione (ApplicationStateService), documentazione completa per deploy Docker/Unraid/Gitea, nuovi CSS e script JS per UX avanzata, template Unraid, test database, e workflow CI/CD estesi. Pronto per produzione e analisi avanzate.
370 lines
17 KiB
C#
370 lines
17 KiB
C#
using System;
|
|
using System.Collections.Generic;
|
|
using System.Globalization;
|
|
using System.IO;
|
|
using System.Linq;
|
|
using System.Net;
|
|
using System.Net.Http;
|
|
using System.Text;
|
|
using System.Text.RegularExpressions;
|
|
using System.Threading.Tasks;
|
|
using AutoBidder.Models;
|
|
|
|
namespace AutoBidder.Services
|
|
{
|
|
/// <summary>
|
|
/// Semplice scraper che scarica la pagina delle "closed auctions" di Bidoo,
|
|
/// estrae i link alle singole aste, visita ciascuna pagina e prova ad estrarre
|
|
/// informazioni utili (nome prodotto, prezzo finale, vincitore, puntate usate).
|
|
/// Risultato salvato in CSV per analisi statistiche esterne.
|
|
///
|
|
/// Nota: il parsing è basato su euristiche (regex) per resistere a vari formati HTML.
|
|
/// </summary>
|
|
public class ClosedAuctionsScraper
|
|
{
|
|
private readonly HttpClient _http;
|
|
private readonly StatsService? _statsService;
|
|
private readonly Action<string>? _log;
|
|
|
|
/// <summary>
/// Creates a scraper that owns an <see cref="HttpClient"/> with a 10-second timeout
/// and browser-like default request headers.
/// </summary>
/// <param name="handler">Optional message handler for the HTTP client; when null, an
/// <see cref="HttpClientHandler"/> with GZip, Deflate and Brotli decompression is created.</param>
/// <param name="statsService">Optional stats service; stored in a field.</param>
/// <param name="log">Optional callback that receives the scraper's log messages.</param>
public ClosedAuctionsScraper(HttpMessageHandler? handler = null, StatsService? statsService = null, Action<string>? log = null)
{
    _statsService = statsService;
    _log = log;

    HttpMessageHandler? messageHandler = handler;
    if (messageHandler is null)
    {
        var defaultHandler = new HttpClientHandler();
        defaultHandler.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate | DecompressionMethods.Brotli;
        messageHandler = defaultHandler;
    }

    _http = new HttpClient(messageHandler);
    _http.Timeout = TimeSpan.FromSeconds(10);

    // Headers that make requests look like they come from a regular desktop browser.
    var browserHeaders = new (string Name, string Value)[]
    {
        ("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36"),
        ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
        ("Accept-Language", "it-IT,it;q=0.9,en-US;q=0.8,en;q=0.7"),
    };
    foreach (var (name, value) in browserHeaders)
    {
        _http.DefaultRequestHeaders.TryAddWithoutValidation(name, value);
    }
}
|
|
|
|
/// <summary>
/// Scrapes every auction linked from the closed-auctions page and writes the
/// resulting records to a CSV file.
/// </summary>
/// <param name="closedAuctionsUrl">URL of the closed-auctions listing page.</param>
/// <param name="outputCsvPath">Path of the CSV file to write.</param>
public async Task ScrapeAndSaveCsvAsync(string closedAuctionsUrl, string outputCsvPath)
{
    List<ClosedAuctionRecord> scraped = await ScrapeAsync(closedAuctionsUrl);

    SaveCsv(scraped, outputCsvPath);
}
|
|
|
|
/// <summary>
/// Scrapes every auction linked from the closed-auctions page and returns all
/// records in a list; nothing is written to disk.
/// </summary>
/// <param name="closedAuctionsUrl">URL of the closed-auctions listing page.</param>
/// <returns>The records produced by <see cref="ScrapeYieldAsync"/>, in the order they were yielded.</returns>
public async Task<List<ClosedAuctionRecord>> ScrapeAsync(string closedAuctionsUrl)
{
    var collected = new List<ClosedAuctionRecord>();

    // Drain the async stream by hand; the enumerator is disposed when the method exits.
    await using var records = ScrapeYieldAsync(closedAuctionsUrl).GetAsyncEnumerator();
    while (await records.MoveNextAsync())
    {
        collected.Add(records.Current);
    }

    return collected;
}
|
|
|
|
/// <summary>
/// Downloads the closed-auctions page and yields one record per auction link found
/// on it, so callers can update the UI incrementally.
/// </summary>
/// <param name="closedAuctionsUrl">URL of the closed-auctions listing page.</param>
/// <returns>
/// An async stream of records. When processing a single auction throws, a record named
/// "(parse error)" is yielded instead, with the exception message in its Notes.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="closedAuctionsUrl"/> is null, empty or whitespace.</exception>
/// <exception cref="InvalidOperationException">The listing page could not be downloaded.</exception>
public async IAsyncEnumerable<ClosedAuctionRecord> ScrapeYieldAsync(string closedAuctionsUrl)
{
    if (string.IsNullOrWhiteSpace(closedAuctionsUrl))
    {
        throw new ArgumentNullException(nameof(closedAuctionsUrl));
    }

    var siteRoot = new Uri("https://it.bidoo.com/");
    _log?.Invoke($"[scraper] Downloading closed auctions page: {closedAuctionsUrl}");

    var listingHtml = await GetStringAsync(closedAuctionsUrl);
    if (listingHtml == null)
    {
        _log?.Invoke("[scraper] ERROR: unable to download closed auctions page");
        throw new InvalidOperationException("Impossibile scaricare la pagina delle aste chiuse.");
    }

    var links = ExtractAuctionLinks(listingHtml, siteRoot).Distinct().ToList();
    _log?.Invoke($"[scraper] Found {links.Count} auction links on closed auctions page.");

    foreach (var link in links)
    {
        // The record is built inside try/catch and yielded afterwards, because
        // C# does not allow "yield return" inside a try block that has a catch.
        ClosedAuctionRecord current;
        try
        {
            _log?.Invoke($"[scraper] Fetching auction page: {link}");

            var summary = ExtractSummaryInfoForUrl(listingHtml, link);
            var pageHtml = await GetStringAsync(link);
            if (pageHtml == null)
            {
                _log?.Invoke($"[scraper] WARNING: failed to download auction page: {link}");
                throw new InvalidOperationException("Download auction page failed");
            }

            // Values found on the listing page win; the auction page is the fallback.
            var name = summary?.ProductName;
            if (name == null)
            {
                name = ExtractProductNameFromAuctionHtml(pageHtml);
            }

            var price = summary?.FinalPrice;
            if (price == null)
            {
                price = ExtractFinalPriceFromAuctionHtml(pageHtml);
            }

            var winnerName = summary?.Winner;
            if (winnerName == null)
            {
                winnerName = ExtractWinnerFromAuctionHtml(pageHtml);
            }

            var bids = ExtractBidsUsedFromAuctionHtml(pageHtml);

            if (_log != null)
            {
                var priceText = price.HasValue ? price.Value.ToString("F2", CultureInfo.InvariantCulture) : "null";
                var bidsText = bids.HasValue ? bids.Value.ToString() : "null";
                _log($"[scraper] Parsed: Name='{name}', FinalPrice={priceText}, Winner='{winnerName}', BidsUsed={bidsText}");
            }

            // Text values were already HTML-decoded by the extraction helpers.
            // Stats are not recorded here: the former RecordClosedAuctionAsync call
            // was removed (its replacement is RecordAuctionCompletedAsync).
            current = new ClosedAuctionRecord
            {
                AuctionUrl = link,
                ProductName = name,
                FinalPrice = price,
                Winner = winnerName,
                BidsUsed = bids,
                ScrapedAt = DateTime.UtcNow,
                Notes = string.Empty
            };
        }
        catch (Exception failure)
        {
            _log?.Invoke($"[scraper] ERROR parsing auction {link}: {failure.Message}");
            current = new ClosedAuctionRecord
            {
                AuctionUrl = link,
                ProductName = "(parse error)",
                FinalPrice = null,
                Winner = null,
                BidsUsed = null,
                ScrapedAt = DateTime.UtcNow,
                Notes = failure.Message
            };
        }

        yield return current;
    }
}
|
|
|
|
/// <summary>
/// Performs an HTTP GET and returns the response body as text.
/// </summary>
/// <param name="url">Absolute URL, or a URL relative to https://it.bidoo.com.</param>
/// <returns>
/// The response body, or null when the URL is invalid, the request fails or the
/// server answers with a non-success status code (the error is logged).
/// </returns>
private async Task<string?> GetStringAsync(string url)
{
    try
    {
        var uri = new Uri(url, UriKind.RelativeOrAbsolute);
        if (!uri.IsAbsoluteUri)
        {
            uri = new Uri(new Uri("https://it.bidoo.com"), url);
        }

        // Request and response are disposed on exit so the response content
        // (and the connection behind it) is released after every call.
        using var request = new HttpRequestMessage(HttpMethod.Get, uri);
        request.Headers.TryAddWithoutValidation("Referer", "https://it.bidoo.com/");

        using var response = await _http.SendAsync(request);
        response.EnsureSuccessStatusCode();

        var body = await response.Content.ReadAsStringAsync();
        _log?.Invoke($"[scraper] HTTP {response.StatusCode} {uri}");
        return body;
    }
    catch (Exception ex)
    {
        // Best effort: callers treat null as "download failed".
        _log?.Invoke($"[scraper] HTTP ERROR fetching {url}: {ex.Message}");
        return null;
    }
}
|
|
|
|
/// <summary>
/// Collects auction links from the closed-auctions page HTML.
/// </summary>
/// <param name="closedHtml">HTML of the closed-auctions listing page.</param>
/// <param name="baseUri">Site root used to turn relative links into absolute URLs.</param>
/// <returns>
/// Absolute URLs taken from every double-quoted <c>data-href</c> attribute, followed by
/// every double-quoted <c>href</c> containing <c>auction.php?a=</c>. Duplicates are not removed.
/// </returns>
private IEnumerable<string> ExtractAuctionLinks(string closedHtml, Uri baseUri)
{
    var urls = new List<string>();

    // Primary source: data-href attributes.
    var dataHrefMatches = Regex.Matches(closedHtml, "data-href\\s*=\\s*\\\"(?<u>[^\\\"]+)\\\"", RegexOptions.IgnoreCase);
    foreach (Match m in dataHrefMatches)
    {
        var u = m.Groups["u"].Value.Trim();
        if (!string.IsNullOrEmpty(u)) urls.Add(ToAbsolute(u, baseUri));
    }

    // Fallback: plain links to auction.php?a=. The dot is escaped so that only a
    // literal "auction.php" matches (an unescaped '.' matches any character).
    var hrefMatches = Regex.Matches(closedHtml, "href\\s*=\\s*\\\"(?<u>[^\\\"]*auction\\.php\\?a=[^\\\"]+)\\\"", RegexOptions.IgnoreCase);
    foreach (Match m in hrefMatches)
    {
        var u = m.Groups["u"].Value.Trim();
        urls.Add(ToAbsolute(u, baseUri));
    }

    return urls.Where(u => !string.IsNullOrWhiteSpace(u));
}
|
|
|
|
/// <summary>
/// Converts a link found in the page into an absolute URL string.
/// </summary>
/// <param name="url">Absolute (http/https), protocol-relative ("//host/..."), rooted ("/...") or bare relative link.</param>
/// <param name="baseUri">Site root that rooted and bare relative links are resolved against.</param>
/// <returns>The absolute URL, or <paramref name="url"/> unchanged when it cannot be resolved.</returns>
private string ToAbsolute(string url, Uri baseUri)
{
    try
    {
        if (url.StartsWith("http://", StringComparison.OrdinalIgnoreCase) || url.StartsWith("https://", StringComparison.OrdinalIgnoreCase))
        {
            return url;
        }

        // URL prefixes are not linguistic text, so compare ordinally like the scheme check above.
        if (url.StartsWith("//", StringComparison.Ordinal))
        {
            return "https:" + url;
        }

        // Bare relative links are resolved from the site root, same as rooted ones.
        var rooted = url.StartsWith("/", StringComparison.Ordinal) ? url : "/" + url;
        return new Uri(baseUri, rooted).ToString();
    }
    catch (Exception)
    {
        // Best effort: hand back the raw value rather than failing the whole scrape.
        return url;
    }
}
|
|
|
|
/// <summary>
/// Looks for the auction URL inside the closed-auctions page and tries to read the
/// product name, final price and winner from the HTML surrounding it.
/// </summary>
/// <param name="closedHtml">HTML of the closed-auctions listing page.</param>
/// <param name="auctionUrl">Auction URL to search for (case-insensitive).</param>
/// <returns>
/// A tuple whose members are null when the corresponding value was not found, or
/// null when the URL does not occur in the page or an exception is thrown.
/// </returns>
private (string? ProductName, double? FinalPrice, string? Winner)? ExtractSummaryInfoForUrl(string closedHtml, string auctionUrl)
{
    try
    {
        var hit = closedHtml.IndexOf(auctionUrl, StringComparison.OrdinalIgnoreCase);
        if (hit < 0)
        {
            return null;
        }

        // Search window: starts up to 800 characters before the URL and is at most 2500 characters long.
        var windowStart = hit > 800 ? hit - 800 : 0;
        var available = closedHtml.Length - windowStart;
        var window = closedHtml.Substring(windowStart, available < 2500 ? available : 2500);

        // Product name: <b class="media-heading"> first, then <span class="media-heading ...">.
        var heading = Regex.Match(window, "<b[^>]*class=\\\"media-heading\\\"[^>]*>\\s*<a[^>]*>(?<name>[^<]+)</a>", RegexOptions.IgnoreCase);
        if (!heading.Success)
        {
            heading = Regex.Match(window, "<span[^>]*class=\\\"media-heading[^\\\"]*\\\"[^>]*>\\s*<a[^>]*>(?<name>[^<]+)</a>", RegexOptions.IgnoreCase);
        }

        string? productName = null;
        if (heading.Success)
        {
            productName = WebUtility.HtmlDecode(heading.Groups["name"].Value).Trim();
        }

        // Final price: number followed by the euro sign inside <span class="price">.
        double? finalPrice = null;
        var priceTag = Regex.Match(window, "<span[^>]*class=\\\"price\\\"[^>]*>(?<p>[0-9.,]+)\\s*€", RegexOptions.IgnoreCase);
        if (priceTag.Success)
        {
            finalPrice = ParseEuro(priceTag.Groups["p"].Value);
        }

        // Winner: "offer" span nested after a "username" span, then the mobile variant.
        var offerer = Regex.Match(window, "<span[^>]*class=\\\"username\\\"[^>]*>.*?<span[^>]*class=\\\"offer\\\"[^>]*>(?<w>[^<]+)</span>", RegexOptions.IgnoreCase | RegexOptions.Singleline);
        if (!offerer.Success)
        {
            offerer = Regex.Match(window, "<span[^>]*class=\\\"mobile_offerer offer\\\"[^>]*>(?<w>[^<]+)</span>", RegexOptions.IgnoreCase);
        }

        string? winnerName = null;
        if (offerer.Success)
        {
            winnerName = WebUtility.HtmlDecode(offerer.Groups["w"].Value).Trim();
        }

        return (productName, finalPrice, winnerName);
    }
    catch
    {
        // Summary info is optional; callers fall back to the auction page.
        return null;
    }
}
|
|
|
|
/// <summary>
/// Reads the product name from an auction page.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>
/// The tag-stripped, HTML-decoded content of the first &lt;h1&gt;, else of &lt;title&gt;,
/// else the link text inside <c>&lt;b class="media-heading"&gt;</c>; null when none is found.
/// </returns>
private string? ExtractProductNameFromAuctionHtml(string? auctionHtml)
{
    if (auctionHtml is null || auctionHtml.Length == 0)
    {
        return null;
    }

    const RegexOptions acrossLines = RegexOptions.IgnoreCase | RegexOptions.Singleline;

    var heading = Regex.Match(auctionHtml, "<h1[^>]*>(?<n>.*?)</h1>", acrossLines);
    if (heading.Success)
    {
        return WebUtility.HtmlDecode(StripTags(heading.Groups["n"].Value)).Trim();
    }

    var title = Regex.Match(auctionHtml, "<title[^>]*>(?<t>.*?)</title>", acrossLines);
    if (title.Success)
    {
        return WebUtility.HtmlDecode(StripTags(title.Groups["t"].Value)).Trim();
    }

    var mediaHeading = Regex.Match(auctionHtml, "<b[^>]*class=\\\"media-heading\\\"[^>]*>\\s*<a[^>]*>(?<name>[^<]+)</a>", RegexOptions.IgnoreCase);
    return mediaHeading.Success
        ? WebUtility.HtmlDecode(mediaHeading.Groups["name"].Value).Trim()
        : null;
}
|
|
|
|
/// <summary>
/// Reads the final price from an auction page by trying three patterns in order.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>
/// The result of <c>ParseEuro</c> for the first pattern that matches (which may itself
/// be null), or null when no pattern matches.
/// </returns>
private double? ExtractFinalPriceFromAuctionHtml(string? auctionHtml)
{
    if (auctionHtml is null || auctionHtml.Length == 0)
    {
        return null;
    }

    // Each entry: the pattern and the capture group holding the amount
    // ("1" addresses the single unnamed group of the last pattern).
    var attempts = new (string Pattern, string Group)[]
    {
        ("<span[^>]*class=\\\"price\\\"[^>]*>(?<p>[0-9.,]+)\\s*€", "p"),
        ("prez[zo]?[\\\"\\']?[^0-9]{0,30}(?<p>[0-9.,]+)\\s*€", "p"),
        ("([0-9]{1,3}(?:[.,][0-9]{2}))\\s*€", "1"),
    };

    foreach (var (pattern, group) in attempts)
    {
        var hit = Regex.Match(auctionHtml, pattern, RegexOptions.IgnoreCase);
        if (hit.Success)
        {
            // The first match decides the result, even when the amount fails to parse.
            return ParseEuro(hit.Groups[group].Value);
        }
    }

    return null;
}
|
|
|
|
/// <summary>
/// Reads the winner's user name from an auction page by trying three patterns in order.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>The HTML-decoded, trimmed winner name from the first matching pattern, or null.</returns>
private string? ExtractWinnerFromAuctionHtml(string? auctionHtml)
{
    if (auctionHtml is null || auctionHtml.Length == 0)
    {
        return null;
    }

    var attempts = new (string Pattern, RegexOptions Options)[]
    {
        // Element that directly follows the label "Vincitore".
        ("Vincitore[:\\s\\\"]+<[^>]*>(?<w>[^<]+)</", RegexOptions.IgnoreCase),
        // "offer" span nested after a "username" span (may span several lines).
        ("<span[^>]*class=\\\"username\\\"[^>]*>.*?<span[^>]*class=\\\"offer\\\"[^>]*>(?<w>[^<]+)</span>", RegexOptions.IgnoreCase | RegexOptions.Singleline),
        // Mobile layout variant.
        ("mobile_offerer offer\\\"[^>]*>(?<w>[^<]+)<", RegexOptions.IgnoreCase),
    };

    foreach (var (pattern, options) in attempts)
    {
        var hit = Regex.Match(auctionHtml, pattern, options);
        if (hit.Success)
        {
            return WebUtility.HtmlDecode(hit.Groups["w"].Value).Trim();
        }
    }

    return null;
}
|
|
|
|
/// <summary>
/// Reads the number of bids used from an auction page by trying several patterns in order.
/// </summary>
/// <param name="auctionHtml">HTML of the auction page; may be null or empty.</param>
/// <returns>
/// The number captured by the first pattern that both matches and parses as an int,
/// or null when none does.
/// </returns>
private int? ExtractBidsUsedFromAuctionHtml(string? auctionHtml)
{
    if (auctionHtml is null || auctionHtml.Length == 0)
    {
        return null;
    }

    var attempts = new (string Pattern, RegexOptions Options)[]
    {
        // 1) Explicit markup: class="bids-used" ... <span>628</span>
        ("class=\\\"bids-used\\\"[^>]*>[^<]*<span[^>]*>(?<n>[0-9]{1,7})</span>", RegexOptions.IgnoreCase | RegexOptions.Singleline),
        // 2) Number followed by "Puntate utilizzate" or a similar wording
        ("(?<n>[0-9]{1,7})\\s*(?:Puntate utilizzate|Puntate usate|puntate utilizzate|puntate usate|puntate)\\b", RegexOptions.IgnoreCase),
        // 3) Older fallbacks, kept in their original order
        ("(?<n>[0-9]+)\\s*(?:puntate|Puntate|puntate usate|puntate_usate|pt\\.?|pts)\\b", RegexOptions.IgnoreCase),
        ("usato[sx]?\\s*(?<n>[0-9]{1,6})\\s*(?:puntate|pts|pt)\\b", RegexOptions.IgnoreCase),
        ("(Puntate\\s*(?:usate|vinte)?)[^0-9]{0,10}(?<n>[0-9]{1,6})", RegexOptions.IgnoreCase),
        ("<[^>]*>\\s*(?<n>[0-9]{1,6})\\s*(puntate)\\s*<", RegexOptions.IgnoreCase),
    };

    foreach (var (pattern, options) in attempts)
    {
        var hit = Regex.Match(auctionHtml, pattern, options);
        // A match whose number does not parse falls through to the next pattern.
        if (hit.Success && int.TryParse(hit.Groups["n"].Value, out var count))
        {
            return count;
        }
    }

    return null;
}
|
|
|
|
/// <summary>
/// Parses an amount written in Italian notation ("1.234,56") into a double.
/// </summary>
/// <param name="s">Amount text: digits with '.' as thousands separator and ',' as decimal separator.</param>
/// <returns>The parsed value, or null when the text is blank or not a number.</returns>
/// <remarks>
/// A lone dot followed by one or two trailing digits ("12.34") is read as a decimal
/// point: a thousands group always has three digits, and the fallback price pattern
/// in this class accepts both "12,34" and "12.34" as decimal amounts.
/// </remarks>
private double? ParseEuro(string s)
{
    if (string.IsNullOrWhiteSpace(s)) return null;
    s = s.Trim();

    var firstDot = s.IndexOf('.');
    var digitsAfterDot = s.Length - firstDot - 1;
    var dotIsDecimalPoint = firstDot >= 0
        && firstDot == s.LastIndexOf('.')
        && s.IndexOf(',') < 0
        && (digitsAfterDot == 1 || digitsAfterDot == 2);

    // Otherwise: drop thousands dots and turn the decimal comma into a dot.
    var normalized = dotIsDecimalPoint ? s : s.Replace(".", "").Replace(',', '.');

    if (double.TryParse(normalized, NumberStyles.Any, CultureInfo.InvariantCulture, out var d)) return d;
    return null;
}
|
|
|
|
/// <summary>
/// Removes everything that looks like an HTML tag from the input.
/// </summary>
/// <param name="input">Text that may contain tags; null is treated as empty.</param>
/// <returns>The input without any <c>&lt;...&gt;</c> sequences.</returns>
private string StripTags(string input)
{
    // [^>]* instead of .*? so that tags containing line breaks are removed too
    // ('.' does not match '\n' unless RegexOptions.Singleline is set).
    return Regex.Replace(input ?? string.Empty, "<[^>]*>", string.Empty);
}
|
|
|
|
/// <summary>
/// Writes the records to a UTF-8 CSV file with a header row, overwriting any existing file.
/// </summary>
/// <param name="data">Records to write, one line each.</param>
/// <param name="filePath">Destination file path.</param>
/// <remarks>
/// Text columns are quoted with embedded quotes doubled; numbers are written with the
/// invariant culture and the timestamp in round-trip ("O") format, so the file reads
/// the same regardless of the machine's regional settings.
/// </remarks>
private void SaveCsv(IEnumerable<Models.ClosedAuctionRecord> data, string filePath)
{
    // Doubles embedded quotes so the value can sit inside a quoted CSV field.
    static string Escape(string? v) => (v ?? string.Empty).Replace("\"", "\"\"");

    var sb = new StringBuilder();
    sb.AppendLine("AuctionUrl,ProductName,FinalPrice,Winner,BidsUsed,ScrapedAt,Notes");

    foreach (var r in data)
    {
        var finalPrice = r.FinalPrice.HasValue ? r.FinalPrice.Value.ToString("F2", CultureInfo.InvariantCulture) : string.Empty;
        var bidsUsed = r.BidsUsed.HasValue ? r.BidsUsed.Value.ToString(CultureInfo.InvariantCulture) : string.Empty;

        var line = $"\"{Escape(r.AuctionUrl)}\",\"{Escape(r.ProductName)}\",{finalPrice},\"{Escape(r.Winner)}\",{bidsUsed},\"{r.ScrapedAt:O}\",\"{Escape(r.Notes)}\"";
        sb.AppendLine(line);
    }

    File.WriteAllText(filePath, sb.ToString(), Encoding.UTF8);
}
|
|
}
|
|
}
|