diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 68ca551..b03e770 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -20,7 +20,7 @@ jobs: submodules: true - uses: actions/setup-dotnet@v4 with: - dotnet-version: '6.x' + dotnet-version: '8.x' - name: Restore dependencies run: dotnet restore - name: Build @@ -32,4 +32,4 @@ jobs: with: name: RobloxUltimateScraper (${{ matrix.configuration }}, ${{ matrix.platform }}) path: | - ./RobloxUltimateScraper/bin/${{ matrix.configuration }}/net6.0/${{ matrix.platform }}/publish/* \ No newline at end of file + ./RobloxUltimateScraper/bin/${{ matrix.configuration }}/net8.0/${{ matrix.platform }}/publish/* \ No newline at end of file diff --git a/RobloxUltimateScraper/CommandLineConfig.cs b/RobloxUltimateScraper/CommandLineConfig.cs new file mode 100644 index 0000000..c2748e2 --- /dev/null +++ b/RobloxUltimateScraper/CommandLineConfig.cs @@ -0,0 +1,249 @@ +using CommandLine; +using RobloxUltimateScraper.Enums; + +namespace RobloxUltimateScraper +{ + /// + /// Scraper configuration + /// + internal class CommandLineConfig + { + /// + /// Selected scraper type. + /// + public ScraperType Scraper { get; set; } = ScraperType.None; + + /// + /// Asset to scrape. + /// Should be used with scraper types . + /// + public ulong ScraperAssetId { get; set; } = 0; + + /// + /// Asset list to scrape. + /// Should be used with scraper types and . + /// + public string ScraperListPath { get; set; } = string.Empty; + + /// + /// Asset scrape start range. + /// Should be used with scraper types . + /// + public ulong ScraperStartRange { get; set; } = 0; + + /// + /// Asset scrape end range. + /// Should be used with scraper types . + /// + public ulong ScraperEndRange { get; set; } = 0; + + /// + /// Use the asset scraper. + /// COMMAND LINE USE ONLY! + /// + [Option('a', "asset", Required = false, HelpText = "Use the asset scraper. Parameter takes in an ID.")] + public ulong UseAssetScraper + { + set + { + Scraper = ScraperType.Asset; + ScraperAssetId = value; + } + } + + /// + /// Use the asset list scraper. + /// COMMAND LINE USE ONLY! + /// + [Option('l', "list", Required = false, HelpText = "Use the asset list scraper. Parameter takes in a list path. WIP!")] + public string UseListScraper + { + set + { + Scraper = ScraperType.List; + ScraperListPath = value; + } + } + + /// + /// Use the asset list versions scraper. + /// COMMAND LINE USE ONLY! + /// + [Option("listversions", Required = false, HelpText = "Use the asset list version scraper. Parameter takes in a list path. WIP!")] + public string UseListVersionsScraper + { + set + { + Scraper = ScraperType.ListVersions; + ScraperListPath = value; + } + } + + /// + /// Use the asset range scraper. + /// COMMAND LINE USE ONLY! + /// + [Option('r', "range", Required = false, HelpText = "Use the asset range scraper. Parameter takes in [Start ID]-[End ID].")] + public string UseRangeScraper + { + set + { + Scraper = ScraperType.Range; + + // parse input + string[] segments = value.Split('-'); + + if (segments.Length != 2) + throw new ArgumentException("Parameter is not in valid format."); + + if (!ulong.TryParse(segments[0], out ulong startRange)) + throw new ArgumentException("Start range is not an integer."); + + if (!ulong.TryParse(segments[1], out ulong endRange)) + throw new ArgumentException("End range is not an integer."); + + ScraperStartRange = startRange; + ScraperEndRange = endRange; + } + } + + /// + /// Assets output type. + /// + [Option('o', "output", Required = false, Default = OutputType.Both, HelpText = "Assets output type. (Files, Index, Console, Both)")] + public OutputType OutputType { get; set; } = OutputType.Both; + + /// + /// Index type. + /// + [Option('i', "index", Required = false, Default = IndexType.All, HelpText = "Index type. (Text, Json, All)")] + public IndexType IndexType { get; set; } = IndexType.All; + + /// + /// Asset compression type. + /// + [Option('c', "compression", Required = false, Default = CompressionType.None, HelpText = "Compression type. (None, GZip, Bzip2, Zstd)")] + public CompressionType CompressionType { get; set; } = CompressionType.None; + + [Option("compressionlevel", Required = false, Default = 9, HelpText = "Compression level for the compression. Only works for BZip2 (1-9) and Zstd (1-22). Other name: --cl.")] + public int CompressionLevelArg { get; set; } = 9; // 9 is good for both BZip2 and Zstd + + // this sucks but commandlineparser has no way to set multiple names for an argument + // and short arguments are only allowed to be a single character + [Option("cl", Required = false, Hidden = true)] + public int? CompressionLevelArgOtherName { get; set; } + + /// + /// Assets output directory. + /// + [Option('d', "directory", Required = false, HelpText = "Assets output directory.")] + public string OutputDirectory { get; set; } = ""; + + /// + /// Assets output extension. + /// + [Option('e', "extension", Required = false, Default = "Auto", HelpText = "Assets output extension. A value of 'Auto' will determine the extension based on the asset type.")] + public string OutputExtension { get; set; } = "Auto"; + + /// + /// Number of scrape workers. + /// + [Option('w', "workers", Required = false, Default = 1, HelpText = "Number of scrape workers.")] + public int Workers { get; set; } = 1; + + /// + /// Roblox authentication cookie (ROBLOSECURITY). + /// For copylocked game scraping. + /// + [Option("cookies", Required = false, HelpText = "Roblox authentication cookie (.ROBLOSECURITY). This argument is prioritised over the environment variable 'ROBLOXULTIMATESCRAPER_COOKIE'.")] + public string? AuthCookie { get; set; } + + /// + /// Http timeout in seconds. + /// + [Option('t', "timeout", Required = false, Default = 180, HelpText = "Http timeout in seconds.")] + public int HttpTimeout { get; set; } = 180; + + private string _baseUrl = "roblox.com"; + + /// + /// Roblox environment to download from. + /// + [Option("baseurl", Required = false, Default = "www.roblox.com", HelpText = "Roblox environment to download from.")] + public string BaseUrl + { + get => _baseUrl; + + set + { + if (value.StartsWith("http://")) + value = value[7..]; + else if (value.StartsWith("https://")) + value = value[8..]; + + if (value.StartsWith("www.") || value.StartsWith("web.")) + value = value[4..]; + + int idx = value.IndexOf('/'); + if (idx != -1) + value = value[..idx]; + + _baseUrl = value; + } + } + + /// + /// Decides where CDN url parameters should be trimmed. + /// + [Option("trim", Required = false, Default = TrimCdnUrlType.All, HelpText = "Decides where CDN url parameters should be trimmed. (Off, Console, Output, All)")] + public TrimCdnUrlType TrimCdnUrl { get; set; } = TrimCdnUrlType.All; + + /// + /// Disables checks responsible for checking if the current run is authenticated. + /// + [Option("disablerobloxauthchecks", Required = false, Default = false, HelpText = "Disables checks responsible for checking if the current run is authenticated.")] + public bool DisableRobloxAuthChecks { get; set; } = false; + + /// + /// Should fail if the current run is unauthenticated? + /// + [Option("failifunauthenticated", Required = false, Default = false, HelpText = "Should fail if the current run is unauthenticated?")] + public bool FailIfUnauthenticated { get; set; } = false; + + /// + /// Should only use a single and shared HTTP client for all scraper threads. + /// + [Option("singlehttpclient", Required = false, Default = false, HelpText = "Should only use a single and shared HTTP client for all scraper threads. This was the behaviour present in 0.1.2.0 and before. Other name: --shc.")] + public bool SingleHttpClient { get; set; } = false; + + // hack... again!!! + [Option("shc", Required = false, Hidden = true)] + public bool? SingleHttpClientOtherName { get; set; } + + /// + /// Should images gathered using the range scraper be compressed? + /// + [Option("compressimages", Required = false, Default = false, HelpText = "Should images gathered using the range scraper be compressed?")] + public bool CompressImages { get; set; } = false; + + /// + /// The file structure to use for range & list scrapes. This will not apply to single asset downloads. + /// + [Option("multipledownloadstructure", Required = false, Default = MultipleDownloadStructureType.Default, HelpText = "The file structure to use for range & list scrapes. This will not apply to single asset downloads. Other name: --mds.")] + public MultipleDownloadStructureType MultipleDownloadStructure { get; set; } = MultipleDownloadStructureType.Default; + + // hack... again!!! + [Option("mds", Required = false, Hidden = true)] + public MultipleDownloadStructureType? MultipleDownloadStructureOtherName { get; set; } + + /// + /// Only download assets with the given asset type. + /// + [Option("expectedassettype", Required = false, Default = null, HelpText = "Only download assets with the given asset type. Other name: --eat")] + public AssetType? ExpectedAssetType { get; set; } + + // hack... again!!! + [Option("eat", Required = false, Hidden = true)] + public AssetType? ExpectedAssetTypeOtherName { get; set; } + } +} diff --git a/RobloxUltimateScraper/Config.cs b/RobloxUltimateScraper/Config.cs index 08bab82..8dba540 100644 --- a/RobloxUltimateScraper/Config.cs +++ b/RobloxUltimateScraper/Config.cs @@ -1,317 +1,78 @@ -using CommandLine; -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; +using RobloxUltimateScraper.Enums; namespace RobloxUltimateScraper { - /// - /// Output type - /// - internal enum OutputType + internal class Config { - /// - /// Asset files - /// - Files = 0, + public static Config Default { get; private set; } = null!; - [Obsolete] - FilesOnly = 0, + private readonly CommandLineConfig _clConfig; - /// - /// Asset index - /// - Index = 1, + public ulong ScraperAssetId => _clConfig.ScraperAssetId; - [Obsolete] - IndexOnly = 1, + public string ScraperListPath => _clConfig.ScraperListPath; - /// - /// Console output - /// - Console = 2, + public ulong ScraperStartRange => _clConfig.ScraperStartRange; + public ulong ScraperEndRange => _clConfig.ScraperEndRange; - /// - /// Asset files and index - /// - Both = 3 - } + public ScraperType Scraper => _clConfig.Scraper; + public OutputType OutputType => _clConfig.OutputType; + public IndexType IndexType => _clConfig.IndexType; - /// - /// Compression type on asset files - /// - internal enum CompressionType - { - /// - /// No compression - /// - None, + public CompressionType CompressionType => _clConfig.CompressionType; + public int CompressionLevel { get; } - /// - /// GZip compression - /// - GZip, + public string OutputDirectory => _clConfig.OutputDirectory; + public string OutputExtension => _clConfig.OutputExtension; - /// - /// BZip2 compression - /// - BZip2, + public int Workers => _clConfig.Workers; - /// - /// Zstd compression - /// - Zstd - } + public string? AuthCookie => _clConfig.AuthCookie; - /// - /// Index type - /// - internal enum IndexType - { - /// - /// Text index - /// - Text, + public int HttpTimeout => _clConfig.HttpTimeout; - /// - /// Json index - /// - Json, + public string BaseUrl => _clConfig.BaseUrl; - /// - /// Text and json indexes - /// - All - } + /// + public TrimCdnUrlType TrimCdnUrl => _clConfig.TrimCdnUrl; - /// - /// Scraper type - /// - internal enum ScraperType - { - /// - /// Asset version scraper - /// - Asset, - - /// - /// Asset list scraper - /// - List, + /// + public bool DisableRobloxAuthChecks => _clConfig.DisableRobloxAuthChecks; - /// - /// Asset list scraper, with versions - /// - ListVersions, + /// + public bool FailIfUnauthenticated => _clConfig.FailIfUnauthenticated; - /// - /// Asset range scraper - /// - Range - } + /// + public bool SingleHttpClient { get; } - /// - /// Scraper configuration - /// - internal class Config - { - /// - /// singleton. - /// - public static Config Default { get; set; } = default!; + /// + public bool CompressImages => _clConfig.CompressImages; - /// - /// Selected scraper type. - /// - public ScraperType? Scraper { get; set; } - - /// - /// Asset to scrape. - /// Should be used with scraper types . - /// - public long ScraperId { get; set; } = 0; - - /// - /// Asset list to scrape. - /// Should be used with scraper types and . - /// - public string ScraperListPath { get; set; } = string.Empty; - - /// - /// Asset scrape start range. - /// Should be used with scraper types . - /// - public long ScraperStartRange { get; set; } = 0; - - /// - /// Asset scrape end range. - /// Should be used with scraper types . - /// - public long ScraperEndRange { get; set; } = 0; - - /// - /// Use the asset scraper. - /// COMMAND LINE USE ONLY! - /// - [Option('a', "asset", Required = false, HelpText = "Use the asset scraper. Parameter takes in an ID.")] - public long UseAssetScraper - { - set - { - Scraper = ScraperType.Asset; - ScraperId = value; - } - } - - /// - /// Use the asset list scraper. - /// COMMAND LINE USE ONLY! - /// - [Option('l', "list", Required = false, HelpText = "Use the asset list scraper. Parameter takes in a list path. WIP!")] - public string UseListScraper - { - set - { - Scraper = ScraperType.List; - ScraperListPath = value; - } - } + /// + public MultipleDownloadStructureType MultipleDownloadStructure { get; } - /// - /// Use the asset list versions scraper. - /// COMMAND LINE USE ONLY! - /// - [Option("listversions", Required = false, HelpText = "Use the asset list version scraper. Parameter takes in a list path. WIP!")] - public string UseListVersionsScraper - { - set - { - Scraper = ScraperType.ListVersions; - ScraperListPath = value; - } - } + /// + public AssetType? ExpectedAssetType { get; } - /// - /// Use the asset range scraper. - /// COMMAND LINE USE ONLY! - /// - [Option('r', "range", Required = false, HelpText = "Use the asset range scraper. Parameter takes in [Start ID]-[End ID]. WIP!")] - public string UseRangeScraper + public Config(CommandLineConfig config) { - set - { - Scraper = ScraperType.Range; - - // parse input - string[] segments = value.Split('-'); + _clConfig = config; - if (segments.Length != 2) - throw new ArgumentException("Parameter is not in valid format."); + CompressionLevel = _clConfig.CompressionLevelArgOtherName != null ? (int)_clConfig.CompressionLevelArgOtherName : _clConfig.CompressionLevelArg; + SingleHttpClient = _clConfig.SingleHttpClientOtherName ?? _clConfig.SingleHttpClient; + MultipleDownloadStructure = _clConfig.MultipleDownloadStructureOtherName ?? _clConfig.MultipleDownloadStructure; + ExpectedAssetType = _clConfig.ExpectedAssetTypeOtherName ?? _clConfig.ExpectedAssetType; - if (!long.TryParse(segments[0], out long startRange)) - throw new ArgumentException("Start range is not an integer."); - - if (!long.TryParse(segments[1], out long endRange)) - throw new ArgumentException("End range is not an integer."); - - ScraperStartRange = startRange; - ScraperEndRange = endRange; - } + if (ExpectedAssetType == AssetType.Unknown) + throw new ApplicationException("Invalid value for ExpectedAssetType"); } - /// - /// Assets output type. - /// - [Option('o', "output", Required = false, Default = OutputType.Both, HelpText = "Assets output type. (Files, Index, Console, Both)")] - public OutputType OutputType { get; set; } = OutputType.Both; - - /// - /// Index type. - /// - [Option('i', "index", Required = false, Default = IndexType.All, HelpText = "Index type. (Text, Json, All)")] - public IndexType IndexType { get; set; } = IndexType.All; - - /// - /// Asset compression type. - /// - [Option('c', "compression", Required = false, Default = CompressionType.None, HelpText = "Compression type. (None, GZip, Bzip2, Zstd)")] - public CompressionType CompressionType { get; set; } = CompressionType.None; - - [Option("compressionlevel", Required = false, Default = 9, HelpText = "Compression level for the compression. Only works for BZip2 (1-9) and Zstd (1-22). Other name: --cl.")] - public int CompressionLevelArg { get; set; } = 9; // 9 is good for both BZip2 and Zstd - - // this sucks but commandlineparser has no way to set multiple names for an argument - // and short arguments are only allowed to be a single character - [Option("cl", Required = false, Hidden = true)] - public int? CompressionLevelArgOtherName { get; set; } - - /// - /// Asset compression level. - /// - public int CompressionLevel { get => CompressionLevelArgOtherName != null ? (int)CompressionLevelArgOtherName : CompressionLevelArg; } - - /// - /// Assets output directory. - /// - [Option('d', "directory", Required = false, HelpText = "Assets output directory.")] - public string OutputDirectory { get; set; } = ""; - - /// - /// Assets output extension. - /// - [Option('e', "extension", Required = false, Default = "Auto", HelpText = "Assets output extension. A value of 'Auto' will determine the extension based on the asset type.")] - public string OutputExtension { get; set; } = "Auto"; - - /// - /// Number of scrape workers. - /// - [Option('w', "workers", Required = false, Default = 1, HelpText = "Number of scrape workers.")] - public int Workers { get; set; } = 1; - - /// - /// Roblox authentication cookie (ROBLOSECURITY). - /// For copylocked game scraping. - /// - [Option("cookies", Required = false, HelpText = "Roblox authentication cookie (.ROBLOSECURITY). This argument is prioritised over the environment variable 'ROBLOXULTIMATESCRAPER_COOKIE'.")] - public string? AuthCookie { get; set; } - - /// - /// Http timeout in seconds. - /// - [Option('t', "timeout", Required = false, Default = 180, HelpText = "Http timeout in seconds.")] - public int HttpTimeout { get; set; } = 180; - - private string _baseUrl = "roblox.com"; - - /// - /// Roblox environment to download from. - /// - [Option("baseurl", Required = false, Default = "www.roblox.com", HelpText = "Roblox environment to download from.")] - public string BaseUrl + public static void Initialise(CommandLineConfig commandLineConfig) { - get => _baseUrl; - - set - { - if (value.StartsWith("http://")) - value = value[7..]; - else if (value.StartsWith("https://")) - value = value[8..]; - - if (value.StartsWith("www.") || value.StartsWith("web.")) - value = value[4..]; + if (Default != null) + throw new Exception("Can not initialise Config twice."); - int idx = value.IndexOf('/'); - if (idx != -1) - value = value[..idx]; - - _baseUrl = value; - } + Default = new Config(commandLineConfig); } - - [Option("trimcdnurlinconsole", Required = false, Default = null, HelpText = "Should the CDN url in console be trimmed.")] - public bool? TrimCdnUrlInConsole { get; set; } } } diff --git a/RobloxUltimateScraper/Enums/AssetType.cs b/RobloxUltimateScraper/Enums/AssetType.cs index d3006fb..7a0f1a6 100644 --- a/RobloxUltimateScraper/Enums/AssetType.cs +++ b/RobloxUltimateScraper/Enums/AssetType.cs @@ -1,11 +1,8 @@ namespace RobloxUltimateScraper.Enums { - /// - /// Automatically generated, do not modify. - /// internal enum AssetType { - Product = 0, + Unknown = 0, Image = 1, TShirt = 2, Audio = 3, @@ -81,14 +78,18 @@ internal enum AssetType CodeSnippet = 80, AdsVideo = 81, OtaUpdate = 82, - Screenshot = 83 + Screenshot = 83, + RuntimePropertySet = 84, + StorePreviewVideo = 85, + GamePreviewVideo = 86, + CreatorExperienceConfig = 87 } internal static class AssetTypeEx { private static readonly Dictionary _extensionMap = new Dictionary() { - [AssetType.Product] = null, + [AssetType.Unknown] = null, [AssetType.Image] = "png", // TODO: auto detect what type of image it is [AssetType.TShirt] = "rbxm", [AssetType.Audio] = "ogg", @@ -119,7 +120,7 @@ internal static class AssetTypeEx [AssetType.Package] = "txt", [AssetType.YouTubeVideo] = null, [AssetType.GamePass] = null, - [AssetType.App] = "rbxm", + [AssetType.App] = "rbxl", [AssetType.Code] = null, [AssetType.Plugin] = "rbxm", [AssetType.SolidModel] = "rbxm", @@ -140,11 +141,11 @@ internal static class AssetTypeEx [AssetType.SwimAnimation] = "rbxm", [AssetType.WalkAnimation] = "rbxm", [AssetType.PoseAnimation] = "rbxm", - [AssetType.LocalizationTableManifest] = null, - [AssetType.LocalizationTableTranslation] = null, + [AssetType.LocalizationTableManifest] = "json", + [AssetType.LocalizationTableTranslation] = "json", [AssetType.EmoteAnimation] = "rbxm", [AssetType.Video] = null, - [AssetType.TexturePack] = null, + [AssetType.TexturePack] = "xml", [AssetType.TShirtAccessory] = "rbxm", [AssetType.ShirtAccessory] = "rbxm", [AssetType.PantsAccessory] = "rbxm", @@ -154,17 +155,21 @@ internal static class AssetTypeEx [AssetType.LeftShoeAccessory] = "rbxm", [AssetType.RightShoeAccessory] = "rbxm", [AssetType.DressSkirtAccessory] = "rbxm", - [AssetType.FontFamily] = null, - [AssetType.FontFace] = null, - [AssetType.MeshHiddenSurfaceRemoval] = null, + [AssetType.FontFamily] = "json", + [AssetType.FontFace] = "ttf", + [AssetType.MeshHiddenSurfaceRemoval] = "rbxm", [AssetType.EyebrowAccessory] = "rbxm", [AssetType.EyelashAccessory] = "rbxm", [AssetType.MoodAnimation] = "rbxm", [AssetType.DynamicHead] = "rbxm", [AssetType.CodeSnippet] = null, [AssetType.AdsVideo] = null, - [AssetType.OtaUpdate] = null, - [AssetType.Screenshot] = null + [AssetType.OtaUpdate] = "rbxm", + [AssetType.Screenshot] = null, + [AssetType.RuntimePropertySet] = null, + [AssetType.StorePreviewVideo] = null, + [AssetType.GamePreviewVideo] = null, + [AssetType.CreatorExperienceConfig] = null }; public static string? GetExtension(this AssetType type) diff --git a/RobloxUltimateScraper/Enums/CompressionType.cs b/RobloxUltimateScraper/Enums/CompressionType.cs new file mode 100644 index 0000000..8d4e9ef --- /dev/null +++ b/RobloxUltimateScraper/Enums/CompressionType.cs @@ -0,0 +1,28 @@ +namespace RobloxUltimateScraper.Enums +{ + /// + /// Compression type on asset files + /// + internal enum CompressionType + { + /// + /// No compression + /// + None, + + /// + /// GZip compression + /// + GZip, + + /// + /// BZip2 compression + /// + BZip2, + + /// + /// Zstd compression + /// + Zstd + } +} diff --git a/RobloxUltimateScraper/Enums/IndexType.cs b/RobloxUltimateScraper/Enums/IndexType.cs new file mode 100644 index 0000000..7641eb5 --- /dev/null +++ b/RobloxUltimateScraper/Enums/IndexType.cs @@ -0,0 +1,23 @@ +namespace RobloxUltimateScraper.Enums +{ + /// + /// Index type + /// + internal enum IndexType + { + /// + /// Text index + /// + Text, + + /// + /// Json index + /// + Json, + + /// + /// Text and json indexes + /// + All + } +} diff --git a/RobloxUltimateScraper/Enums/MultipleDownloadStructureType.cs b/RobloxUltimateScraper/Enums/MultipleDownloadStructureType.cs new file mode 100644 index 0000000..7282f99 --- /dev/null +++ b/RobloxUltimateScraper/Enums/MultipleDownloadStructureType.cs @@ -0,0 +1,20 @@ +namespace RobloxUltimateScraper.Enums +{ + internal enum MultipleDownloadStructureType + { + /// + /// All asset downloads are put in their own directories + /// + Default, + + /// + /// All asset download files and index files are in the same directories + /// + Combined, + + /// + /// Asset download files and index files are put into their respective directories + /// + Separated + } +} diff --git a/RobloxUltimateScraper/Enums/OutputType.cs b/RobloxUltimateScraper/Enums/OutputType.cs new file mode 100644 index 0000000..3cad91a --- /dev/null +++ b/RobloxUltimateScraper/Enums/OutputType.cs @@ -0,0 +1,40 @@ +namespace RobloxUltimateScraper.Enums +{ + /// + /// Asset output type + /// + internal enum OutputType + { + /// + /// Asset files + /// + Files = 0, + + [Obsolete] + FilesOnly = 0, + + /// + /// Asset index + /// + Index = 1, + + [Obsolete] + IndexOnly = 1, + + /// + /// Console output only + /// + Console = 2, + + /// + /// Asset files and index + /// + Both = 3 + } + + internal static class OutputTypeEx + { + public static bool IsFileSavingEnabled(this OutputType type) => type == OutputType.Files || type == OutputType.Both; + public static bool IsIndexEnabled(this OutputType type) => type == OutputType.Index || type == OutputType.Both; + } +} diff --git a/RobloxUltimateScraper/Enums/RobloxAuthStatus.cs b/RobloxUltimateScraper/Enums/RobloxAuthStatus.cs new file mode 100644 index 0000000..363505b --- /dev/null +++ b/RobloxUltimateScraper/Enums/RobloxAuthStatus.cs @@ -0,0 +1,25 @@ +namespace RobloxUltimateScraper.Enums +{ + internal enum RobloxAuthStatus + { + /// + /// The auth cookie being used is valid + /// + Authenticated, + + /// + /// No authentication cookie is set + /// + Unauthenticated, + + /// + /// The auth cookie being used is invalid + /// + InvalidAuth, + + /// + /// An error occured while validating + /// + Error + } +} diff --git a/RobloxUltimateScraper/Enums/ScraperType.cs b/RobloxUltimateScraper/Enums/ScraperType.cs new file mode 100644 index 0000000..586a2ba --- /dev/null +++ b/RobloxUltimateScraper/Enums/ScraperType.cs @@ -0,0 +1,33 @@ +namespace RobloxUltimateScraper.Enums +{ + /// + /// Scraper type + /// + internal enum ScraperType + { + /// + /// No scraper selected + /// + None, + + /// + /// Asset version scraper + /// + Asset, + + /// + /// Asset list scraper + /// + List, + + /// + /// Asset list scraper, with versions + /// + ListVersions, + + /// + /// Asset range scraper + /// + Range + } +} diff --git a/RobloxUltimateScraper/Enums/TrimCdnUrlType.cs b/RobloxUltimateScraper/Enums/TrimCdnUrlType.cs new file mode 100644 index 0000000..2930a89 --- /dev/null +++ b/RobloxUltimateScraper/Enums/TrimCdnUrlType.cs @@ -0,0 +1,51 @@ +using System.Diagnostics; + +namespace RobloxUltimateScraper.Enums +{ + internal enum TrimCdnUrlType + { + /// + /// Turn off CDN url trimming entirely + /// + Off, + + /// + /// Should trim the CDN url for Console only + /// + Console, + + /// + /// Should trim the CDN url for Output only + /// + Output, + + /// + /// Should trim the CDN url for both Console and Output + /// + All + } + + internal static class TrimCdnUrlTypeEx + { + public static bool ShouldTrim(this TrimCdnUrlType type, OutputType outputType) + { + if (type == TrimCdnUrlType.Off) + return false; + if (type == TrimCdnUrlType.All) + return true; + + switch (outputType) + { + case OutputType.Console: + return type == TrimCdnUrlType.Console; + + case OutputType.Index: + return type == TrimCdnUrlType.Output; + + default: + Debug.Assert(false); + return false; + } + } + } +} diff --git a/RobloxUltimateScraper/FileWriter.cs b/RobloxUltimateScraper/FileWriter.cs index 4a2882d..dbb0e20 100644 --- a/RobloxUltimateScraper/FileWriter.cs +++ b/RobloxUltimateScraper/FileWriter.cs @@ -1,9 +1,4 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Reflection.Emit; -using System.Text; -using System.Threading.Tasks; +using RobloxUltimateScraper.Enums; namespace RobloxUltimateScraper { @@ -28,11 +23,11 @@ public static string BuildOutputFileName(string fileName, string? fileExtension) /// File path /// Stream /// Last modified - public static void Save(string filePath, Stream stream, int compressionLevel, DateTime? lastModified = null) + public static void Save(string filePath, Stream stream, CompressionType compressionType, int compressionLevel, DateTime? lastModified = null) { using (MemoryStream ms = new MemoryStream()) { - switch (Config.Default.CompressionType) + switch (compressionType) { case CompressionType.GZip: ICSharpCode.SharpZipLib.GZip.GZip.Compress(stream, ms, false); diff --git a/RobloxUltimateScraper/Http.cs b/RobloxUltimateScraper/Http.cs new file mode 100644 index 0000000..d5e709c --- /dev/null +++ b/RobloxUltimateScraper/Http.cs @@ -0,0 +1,89 @@ +using System.Diagnostics; +using System.Net; + +namespace RobloxUltimateScraper +{ + internal static class Http + { + private static CookieContainer _cookieContainer = null!; + + /// + /// A singleton that can be shared across all threads + /// + public static HttpClient Client { get; private set; } = null!; + + /// + /// Boolean that indicates if a .ROBLOSECURITY cookie has been set for this session + /// + public static bool HasRobloxAuth { get; private set; } = false; + + /// + /// Creates a new with the appropriate settings for this scraper + /// + /// New HttpClient instance + public static HttpClient CreateClient() + { + Debug.Assert(_cookieContainer != null); + + HttpClientHandler httpClientHandler = new HttpClientHandler + { + AutomaticDecompression = DecompressionMethods.All, + AllowAutoRedirect = false, // we are using v1 because v2 is bad + CookieContainer = _cookieContainer, + UseCookies = true + }; + + HttpClient client = new HttpClient(httpClientHandler) + { + Timeout = TimeSpan.FromSeconds(Config.Default.HttpTimeout) + }; + client.DefaultRequestHeaders.Add("User-Agent", "Roblox/WinInet"); + + return client; + } + + private static string? GetRobloxAuthCookie() + { + if (!string.IsNullOrEmpty(Config.Default.AuthCookie)) + { + Console.WriteLine("Using auth cookie from arguments."); + return Config.Default.AuthCookie; + } + + string? envValue; + envValue = Environment.GetEnvironmentVariable("ROBLOXULTIMATESCRAPER_COOKIE"); + if (!string.IsNullOrEmpty(envValue)) + { + Console.WriteLine("Using auth cookie from environment variables (ROBLOXULTIMATESCRAPER_COOKIE)."); + return envValue; + } + + envValue = Environment.GetEnvironmentVariable("ROBLOXULTIMATESCRAPER_COOKIE_PATH"); + if (!string.IsNullOrEmpty(envValue)) + { + Console.WriteLine("Using auth cookie from environment variables (ROBLOXULTIMATESCRAPER_COOKIE_PATH)."); + + if (!File.Exists(envValue)) + throw new ApplicationException($"Can not read the auth cookie: File {envValue} does not exist."); + + return File.ReadAllText(envValue); + } + + return null; + } + + public static void Init() + { + _cookieContainer = new CookieContainer(); + + string? cookie = GetRobloxAuthCookie(); + if (cookie != null) + { + _cookieContainer.Add(new Cookie(".ROBLOSECURITY", cookie, "/", $".{Config.Default.BaseUrl}")); + HasRobloxAuth = true; + } + + Client = CreateClient(); + } + } +} diff --git a/RobloxUltimateScraper/Models/AssetInput.cs b/RobloxUltimateScraper/Models/AssetInput.cs deleted file mode 100644 index 9afd2fe..0000000 --- a/RobloxUltimateScraper/Models/AssetInput.cs +++ /dev/null @@ -1,14 +0,0 @@ -using System; -using System.Collections.Generic; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace RobloxUltimateScraper.Models -{ - internal class AssetInput - { - public long Id { get; set; } - public int Version { get; set; } - } -} diff --git a/RobloxUltimateScraper/Models/AssetOutput.cs b/RobloxUltimateScraper/Models/AssetOutput.cs index 1ad4ce4..c555d4f 100644 --- a/RobloxUltimateScraper/Models/AssetOutput.cs +++ b/RobloxUltimateScraper/Models/AssetOutput.cs @@ -1,22 +1,10 @@ -using System; -using System.Collections.Generic; -using System.Diagnostics; -using System.Linq; -using System.Text; -using System.Threading.Tasks; - -namespace RobloxUltimateScraper.Models +namespace RobloxUltimateScraper.Models { /// /// Asset information for index /// internal class AssetOutput : IComparable { - /// - /// Asset version - /// - public long Id { get; set; } - /// /// Asset version /// @@ -67,7 +55,7 @@ internal class AssetOutput : IComparable // 1818 | v1 | Error: failed to download public string ToString(bool trimCdnUrl) { - string output = $"{Id} | v{Version}"; + string output = $"v{Version}"; if (Error != null) { @@ -96,10 +84,6 @@ public int CompareTo(AssetOutput? other) { if (other == null) return 1; - // compare asset ids - if (Id > other.Id) return 1; - if (Id < other.Id) return -1; - // compare versions if (Version > other.Version) return 1; if (Version < other.Version) return -1; diff --git a/RobloxUltimateScraper/Program.cs b/RobloxUltimateScraper/Program.cs index c9e9d12..e5eb149 100644 --- a/RobloxUltimateScraper/Program.cs +++ b/RobloxUltimateScraper/Program.cs @@ -1,22 +1,22 @@ using CommandLine; using CommandLine.Text; using RobloxUltimateScraper.Enums; -using RobloxUltimateScraper.Models; +using System.Net; using System.Reflection; namespace RobloxUltimateScraper { internal class Program { - static async Task Main(string[] args) + static void Main(string[] args) { if (args.Length == 0) // make it display the help menu args = new string[] { "--help" }; Parser cmdLineParser = new Parser(settings => settings.CaseInsensitiveEnumValues = true); - ParserResult configParser = cmdLineParser.ParseArguments(args); + ParserResult configParser = cmdLineParser.ParseArguments(args); configParser.WithNotParsed(errors => Error(configParser, errors)); - await configParser.WithParsedAsync(async config => await Run(config)); + configParser.WithParsed(config => Run(config)); } /// @@ -24,25 +24,38 @@ static async Task Main(string[] args) /// /// Configuration /// - static async Task Run(Config config) + static void Run(CommandLineConfig config) { Console.WriteLine($"RobloxUltimateScraper v{Assembly.GetExecutingAssembly().GetName().Version}"); - Config.Default = config; + Config.Initialise(config); + + try + { + Http.Init(); + } + catch (Exception ex) + { + Console.WriteLine($"[ERROR]: Failed to initialise HTTP ({ex})"); + Environment.Exit(1); + return; + } + + if (!Config.Default.DisableRobloxAuthChecks) + CheckRobloxAuthStatus(); // TODO: add functionality for // list // list versions - // range switch (Config.Default.Scraper) { - case null: - Console.WriteLine("Please define which scraper you wish to use!"); + case ScraperType.None: + Console.WriteLine("No scraper selected."); Console.WriteLine("Run the scraper with the --help argument for all commands."); break; case ScraperType.Asset: - await RunAssetScraper(); + RunAssetScraper(); break; case ScraperType.List: @@ -54,7 +67,7 @@ static async Task Run(Config config) break; case ScraperType.Range: - Console.WriteLine("Range scraper has not been implemented yet."); + RunRangeScraper(); break; default: @@ -67,12 +80,77 @@ static async Task Run(Config config) /// Handles command line parsing failure /// /// Errors from command line parser - static void Error(ParserResult config, IEnumerable errors) + static void Error(ParserResult config, IEnumerable errors) { HelpText text = HelpText.AutoBuild(config); Console.WriteLine(text); } + static void CheckRobloxAuthStatus() + { + RobloxAuthStatus status = GetRobloxAuthStatus(out int httpCode, out Exception? exception); + + switch (status) + { + case RobloxAuthStatus.Unauthenticated: + Console.WriteLine("[WARNING]: No authentication is set for this run. You may face problems downloading assets not authored by Roblox."); + break; + + case RobloxAuthStatus.InvalidAuth: + Console.WriteLine("[ERROR]: Provided authentication cookie is invalid."); + Environment.Exit(1); + return; + + case RobloxAuthStatus.Error: + if (exception != null) + { + Console.WriteLine($"[WARNING]: Failed to check if authentication is valid ({exception.Message})."); + Console.WriteLine(exception); + } + else + { + Console.WriteLine($"[WARNING]: Failed to check if authentication is valid (got unexpected HTTP code {httpCode})."); + } + break; + + } + + if (status != RobloxAuthStatus.Authenticated && Config.Default.FailIfUnauthenticated) + { + Console.WriteLine("[ERROR]: Fail if unauthenticated flag is enabled. Stopping execution."); + Environment.Exit(1); + return; + } + } + + static RobloxAuthStatus GetRobloxAuthStatus(out int httpCode, out Exception? exception) + { + httpCode = 0; + exception = null; + + if (!Http.HasRobloxAuth) + return RobloxAuthStatus.Unauthenticated; + + try + { + HttpResponseMessage message = Http.Client.GetAsync("https://users.roblox.com/v1/users/authenticated").Result; + + httpCode = (int)message.StatusCode; + + return message.StatusCode switch + { + HttpStatusCode.OK => RobloxAuthStatus.Authenticated, + HttpStatusCode.Unauthorized => RobloxAuthStatus.InvalidAuth, + _ => RobloxAuthStatus.Error + }; + } + catch (Exception ex) + { + exception = ex; + return RobloxAuthStatus.Error; + } + } + // TODO: move asset scraper to a separate file /// @@ -82,63 +160,141 @@ static void Error(ParserResult config, IEnumerable errors) /// Downloads /// Errors /// Versions - static void SetAssetScraperTitle(long id, int downloaded, int errors, int total) + static void SetAssetScraperTitle(ulong id, int downloaded, int errors, int total) { - Console.Title = $"{nameof(RobloxUltimateScraper)} | Asset {id} | {downloaded}/{total} | {errors} Errors"; + Console.Title = $"RobloxUltimateScraper | Asset {id} | {downloaded}/{total} | {errors} Errors"; } /// /// Starts the asset scraper /// /// - static async Task RunAssetScraper() + static void RunAssetScraper() { - long assetId = Config.Default.ScraperId; + ulong assetId = Config.Default.ScraperAssetId; - if (string.IsNullOrEmpty(Config.Default.OutputDirectory) && !Scraper.ConsoleOnly) - Config.Default.OutputDirectory = $"Asset_{assetId}"; + string outputDirectory; - Scraper.ShouldTrimCdnUrlInConsole = Config.Default.TrimCdnUrlInConsole ?? !Scraper.ConsoleOnly; + if (string.IsNullOrEmpty(Config.Default.OutputDirectory)) + outputDirectory = $"Asset_{assetId}"; + else + outputDirectory = Config.Default.OutputDirectory; - // get all place versions - var assetDeliveryInfo = await Scraper.GetAssetDeliveryInformation(assetId); + Scraper scraper = new Scraper(assetId, outputDirectory); + var setupResult = scraper.Setup(Http.Client).Result; + if (!setupResult.Success) + { + Console.WriteLine(setupResult.Message); + return; + } - if (!assetDeliveryInfo.Success) + if (Config.Default.ExpectedAssetType.HasValue && scraper.AssetType != Config.Default.ExpectedAssetType.Value) { - Console.WriteLine($"Failed to fetch versions for asset {assetId}: {assetDeliveryInfo.Error}"); - Environment.Exit(1); + Console.WriteLine($"Asset {assetId}'s type is not whitelisted (expected {Config.Default.ExpectedAssetType})"); + return; } - Console.WriteLine($"Asset {assetId} has {assetDeliveryInfo.TotalVersions} versions!"); + scraper.CreateOutputDirectories(); + + Console.WriteLine($"Asset {assetId} has {scraper.TotalVersions} versions!"); + + // set up titles + SetAssetScraperTitle(assetId, 0, 0, scraper.TotalVersions); + scraper.OnDownloadFinished += (_) => SetAssetScraperTitle(assetId, scraper.SuccessfulDownloads, scraper.FailedDownloads, scraper.TotalVersions); + + // start workers + List workers = new List(); + + for (int i = 1; i <= Config.Default.Workers; i++) + workers.Add(Task.Run(scraper.StartWorker)); + + Task.WaitAll(workers.ToArray()); + + // finalise + scraper.PrintDownloadStatistics(); + scraper.WriteIndexFile(); + } + + class RangeScraperData + { + public ulong StartRange = 0; + public ulong EndRange = 0; + + public ulong TotalIds = 0; + public int DownloadedIds = 0; + public int ErrorIds = 0; - Scraper.FileExtension = Config.Default.OutputExtension == "Auto" ? assetDeliveryInfo.AssetType.GetExtension() : Config.Default.OutputExtension; - Scraper.CompressionLevel = Config.Default.CompressionLevel; // BZip2 and Zstd libraries automatically clamp the compression level + public int TotalVersions = 0; + public int DownloadedVersions = 0; + public int ErrorVersions = 0; + } - // add to queue - for (int i = 1; i <= assetDeliveryInfo.TotalVersions; i++) + static async Task RangeScraperTitleLogic(RangeScraperData data, CancellationToken token) + { + while (!token.IsCancellationRequested) { - Scraper.Assets.Enqueue(new AssetInput + Console.Title = $"RobloxUltimateScraper | Range {data.StartRange}-{data.EndRange} | " + + $"{data.DownloadedIds}/{data.TotalIds} IDs ({data.ErrorIds} errors) | " + + $"{data.DownloadedVersions}/{data.TotalVersions} Versions ({data.ErrorVersions} errors)"; + + try { - Id = assetId, - Version = i - }); - } + await Task.Delay(1000, token); + } + catch (TaskCanceledException) + { + return; + } + } + } + + static void RunRangeScraper() + { + string outputDirectory; + + if (string.IsNullOrEmpty(Config.Default.OutputDirectory)) + outputDirectory = $"Range_{DateTimeOffset.UtcNow.ToUnixTimeSeconds()}"; + else + outputDirectory = Config.Default.OutputDirectory; + + if (Config.Default.OutputType != OutputType.Console) + Directory.CreateDirectory(outputDirectory); + + RangeScraperData data = new RangeScraperData() + { + StartRange = Config.Default.ScraperStartRange, + EndRange = Config.Default.ScraperEndRange, + TotalIds = Config.Default.ScraperEndRange - Config.Default.ScraperStartRange + }; + + RangeScraper scraper = new RangeScraper(Config.Default.ScraperStartRange, Config.Default.ScraperEndRange, outputDirectory); // set up titles - SetAssetScraperTitle(assetId, 0, 0, assetDeliveryInfo.TotalVersions); - Scraper.OnDownloadFinished += () => SetAssetScraperTitle(assetId, Scraper.SuccessfulDownloads, Scraper.FailedDownloads, assetDeliveryInfo.TotalVersions); + scraper.OnAssetDownloadFinished += (bool success) => { Interlocked.Increment(ref data.DownloadedVersions); if (!success) { Interlocked.Increment(ref data.ErrorVersions); } }; + scraper.OnAssetVersionsDiscovered += (int versions) => { Interlocked.Add(ref data.TotalVersions, versions); }; + + scraper.OnAssetFinished += () => Interlocked.Increment(ref data.DownloadedIds); + scraper.OnAssetError += () => { Interlocked.Increment(ref data.DownloadedIds); Interlocked.Increment(ref data.ErrorIds); }; + + CancellationTokenSource cts = new CancellationTokenSource(); + Task titleUpdateTask = Task.Run(() => RangeScraperTitleLogic(data, cts.Token)); // start workers List workers = new List(); for (int i = 1; i <= Config.Default.Workers; i++) - workers.Add(Task.Run(Scraper.StartWorker)); + workers.Add(Task.Run(scraper.StartWorker)); Task.WaitAll(workers.ToArray()); + cts.Cancel(); - // finalise - Scraper.PrintDownloadStatistics(); - Scraper.WriteIndexFile($"{assetId} asset versions on {DateTime.Now.ToString("R")} ({assetDeliveryInfo.TotalVersions} versions)"); + Console.WriteLine($"ID Range: {data.StartRange}-{data.EndRange}"); + Console.WriteLine($"Total IDs: {data.TotalIds}"); + Console.WriteLine($"Successful ID Downloads: {data.DownloadedIds}"); + Console.WriteLine($"Failed ID Downloads: {data.ErrorIds}"); + Console.WriteLine($"Total Versions: {data.TotalVersions}"); + Console.WriteLine($"Successful Version Downloads: {data.DownloadedVersions}"); + Console.WriteLine($"Failed Version Downloads: {data.ErrorVersions}"); } } } \ No newline at end of file diff --git a/RobloxUltimateScraper/Properties/launchSettings.json b/RobloxUltimateScraper/Properties/launchSettings.json index 178e2e5..ec5a221 100644 --- a/RobloxUltimateScraper/Properties/launchSettings.json +++ b/RobloxUltimateScraper/Properties/launchSettings.json @@ -5,7 +5,11 @@ }, "RobloxUltimateScraper - Asset Scraper": { "commandName": "Project", - "commandLineArgs": "-w 30\r\n-c Zstd\r\n--cl 15\r\n-d Crossroads\r\n-o Both\r\n-a 1818\r\n-i All" + "commandLineArgs": "-w 5\r\n-c Zstd\r\n--cl 15\r\n-d Crossroads\r\n-o Both\r\n-a 1818\r\n-i All" + }, + "RobloxUltimateScraper - Range Scraper": { + "commandName": "Project", + "commandLineArgs": "-w 5\r\n-c Zstd\r\n--cl 9\r\n-o Both\r\n-r 1000000-1000100\r\n-i All\r\n--mds Separated\r\n--eat Image" } } } \ No newline at end of file diff --git a/RobloxUltimateScraper/RangeScraper.cs b/RobloxUltimateScraper/RangeScraper.cs new file mode 100644 index 0000000..6d86ca9 --- /dev/null +++ b/RobloxUltimateScraper/RangeScraper.cs @@ -0,0 +1,112 @@ +using RobloxUltimateScraper.Enums; + +namespace RobloxUltimateScraper +{ + /// + /// Range scraper + /// + internal class RangeScraper + { + public delegate void AssetDownloadFinished(bool success); // Once a version is finished downloading + public delegate void AssetVersionsDiscovered(int versions); // Once asset scraper returns total asset versions + public delegate void AssetFinished(); // Once an ID is finished downloading + public delegate void AssetError(); // Once asset scraper setup errors + + public event AssetDownloadFinished? OnAssetDownloadFinished; + public event AssetVersionsDiscovered? OnAssetVersionsDiscovered; + public event AssetFinished? OnAssetFinished; + public event AssetError? OnAssetError; + + public ulong StartRange { get; } + public ulong EndRange { get; } + + public ulong CurrentId { get; private set; } + + public string OutputDirectory { get; } + + private object _lock = new object(); + + public RangeScraper(ulong startRange, ulong endRange, string outputDirectory) + { + StartRange = startRange; + EndRange = endRange; + + CurrentId = startRange - 1; + + OutputDirectory = outputDirectory; + } + + public async Task StartWorker(HttpClient httpClient) + { + while (EndRange > CurrentId) + { + ulong id; + lock (_lock) + { + if (EndRange <= CurrentId) + continue; + CurrentId++; + id = CurrentId; + } + + Scraper scraper = new Scraper(id, OutputDirectory); + scraper.SetupMultipleDownloadStructure(Config.Default.MultipleDownloadStructure, $"Asset_{id}"); + + if (File.Exists(scraper.GetIndexFilePath())) // index.txt is an indication that the download was finished. does not work for non-index runs. + { + Console.WriteLine($"Skipping {id} - already done. Delete the file/directory to redo the download."); + OnAssetFinished?.Invoke(); + continue; + } + + var result = await scraper.Setup(httpClient); + if (!result.Success) + { + Console.WriteLine($"Failed to download {id} ({result.Message})"); + OnAssetError?.Invoke(); + continue; + } + + if (Config.Default.ExpectedAssetType.HasValue && scraper.AssetType != Config.Default.ExpectedAssetType.Value) + { + Console.WriteLine($"Asset {id}'s type is not whitelisted (expected {Config.Default.ExpectedAssetType})"); + OnAssetFinished?.Invoke(); + continue; + } + + scraper.CreateOutputDirectories(); + OnAssetVersionsDiscovered?.Invoke(scraper.TotalVersions); + + scraper.OnDownloadFinished += (bool success) => OnAssetDownloadFinished?.Invoke(success); + + // do not bother compressing images gathered using this scraper (unless specified otherwise by the config) + if (scraper.AssetType == AssetType.Image && !Config.Default.CompressImages) + scraper.ShouldCompress = false; + + await scraper.StartWorker(httpClient); + + scraper.PrintDownloadStatistics(); + scraper.WriteIndexFile(); + + Console.WriteLine($"{id} has been completed."); + OnAssetFinished?.Invoke(); + } + } + + public async Task StartWorker() + { + HttpClient httpClient = Config.Default.SingleHttpClient ? Http.Client : Http.CreateClient(); + + try + { + await StartWorker(httpClient); + } + finally + { + // only dispose if it isnt the global http client + if (httpClient != Http.Client) + httpClient.Dispose(); + } + } + } +} diff --git a/RobloxUltimateScraper/RobloxUltimateScraper.csproj b/RobloxUltimateScraper/RobloxUltimateScraper.csproj index 7195b27..24f9b5c 100644 --- a/RobloxUltimateScraper/RobloxUltimateScraper.csproj +++ b/RobloxUltimateScraper/RobloxUltimateScraper.csproj @@ -1,8 +1,8 @@ - + Exe - net6.0 + net8.0 enable enable 0.1.3.0 @@ -12,7 +12,7 @@ - + diff --git a/RobloxUltimateScraper/Scraper.cs b/RobloxUltimateScraper/Scraper.cs index a32b18b..8aa269d 100644 --- a/RobloxUltimateScraper/Scraper.cs +++ b/RobloxUltimateScraper/Scraper.cs @@ -1,150 +1,175 @@ using RobloxUltimateScraper.Enums; using RobloxUltimateScraper.Models; -using System; -using System.Collections.Generic; using System.Diagnostics; -using System.Linq; using System.Net; using System.Text; using System.Text.Json; -using System.Threading.Tasks; namespace RobloxUltimateScraper { /// - /// The core + /// Asset ID scraper /// - internal static class Scraper + internal class Scraper { /// - /// Assets to download + /// Successful or failed download event. /// - public static Queue Assets { get; } + public delegate void DownloadFinished(bool success); + + private readonly object _lock = new object(); /// - /// File extension to be used for saving + /// Asset ID being downloaded /// - public static string? FileExtension { get; set; } = null; + public ulong AssetId { get; } /// - /// Should the CDN url be trimmed in the console output + /// Asset type of the asset /// - public static bool ShouldTrimCdnUrlInConsole { get; set; } = true; + public AssetType AssetType { get; private set; } = AssetType.Unknown; /// - /// Compression level + /// Total versions present in this asset /// - public static int CompressionLevel { get; set; } = 0; + public int TotalVersions { get; private set; } /// - /// Is index enabled + /// Most recent version being downloaded /// - public static bool IndexEnabled { get { return Config.Default.OutputType == OutputType.Index || Config.Default.OutputType == OutputType.Both; } } + public int CurrentVersion { get; private set; } /// - /// Are files enabled + /// Directory where to output the files to /// - public static bool FilesEnabled { get { return Config.Default.OutputType == OutputType.Files || Config.Default.OutputType == OutputType.Both; } } + public string OutputDirectory { get; set; } /// - /// Is console only + /// Directory to output index files to. This value is prioritised over /// - public static bool ConsoleOnly { get { return Config.Default.OutputType == OutputType.Console; } } + public string? IndexOutputDirectory { get; set; } /// - /// Versions that successfully downloaded + /// Directory to output asset files to. This value is prioritised over /// - public static int SuccessfulDownloads { get; private set; } + public string? FilesOutputDirectory { get; set; } /// - /// Versions that failed to download + /// File extension to be used for saving /// - public static int FailedDownloads { get; private set; } + public string? FileExtension { get; set; } = null; + private int _successfulDownloads; /// - /// Successful or failed download event. + /// Versions that successfully downloaded /// - public delegate void DownloadFinished(); + public int SuccessfulDownloads => _successfulDownloads; + private int _failedDownloads; /// - /// Event that fires upon a successful or failed download. + /// Versions that failed to download /// - public static event DownloadFinished? OnDownloadFinished; + public int FailedDownloads => _failedDownloads; /// - /// singleton. + /// Event that fires upon a successful or failed download. /// - private static HttpClient _HttpClient { get; } + public event DownloadFinished? OnDownloadFinished; /// - /// Http client cookies. + /// Should compress the files downloaded? /// - private static CookieContainer _CookieContainer { get; } + public bool ShouldCompress { get; set; } = true; /// /// Index entries /// - private static List _Index { get; } + private List _index = new List(); + + /// + /// Name of the output index files + /// + private string _indexName = "index"; /// /// Initialises values used by /// - static Scraper() + public Scraper(ulong assetId, string outputDirectory) { - Assets = new Queue(); - - SuccessfulDownloads = 0; - FailedDownloads = 0; - - _CookieContainer = new CookieContainer(); + AssetId = assetId; + OutputDirectory = outputDirectory; + } - string? cookie = null; + public struct SetupResult + { + public bool Success; + public string Message; + } - if (!string.IsNullOrEmpty(Config.Default.AuthCookie)) - { - Console.WriteLine("Using cookies from arguments."); - cookie = Config.Default.AuthCookie; - } - else + /// + /// Sets up the directory paths further for multiple download scrapers. + /// + /// Name of the child directory if needed + /// Directory structure type + public void SetupMultipleDownloadStructure(MultipleDownloadStructureType type, string childDirectoryName) + { + switch (type) { - string? envValue = Environment.GetEnvironmentVariable("ROBLOXULTIMATESCRAPER_COOKIE"); - if (!string.IsNullOrEmpty(envValue)) - { - Console.WriteLine("Using cookies from environment variables."); - cookie = envValue; - } + case MultipleDownloadStructureType.Default: + OutputDirectory = Path.Combine(OutputDirectory, childDirectoryName); + break; + + case MultipleDownloadStructureType.Combined: + _indexName = $"{AssetId}_index"; + break; + + case MultipleDownloadStructureType.Separated: + _indexName = $"{AssetId}_index"; + IndexOutputDirectory = Path.Combine(OutputDirectory, "Indexes"); + FilesOutputDirectory = Path.Combine(OutputDirectory, "Files"); + break; } - - if (cookie != null) - _CookieContainer.Add(new Cookie(".ROBLOSECURITY", cookie, "/", $".{Config.Default.BaseUrl}")); + } - HttpClientHandler httpClientHandler = new HttpClientHandler - { - AutomaticDecompression = DecompressionMethods.All, - AllowAutoRedirect = false, // we are using v1 because v2 is bad - CookieContainer = _CookieContainer, - UseCookies = true - }; + public async Task Setup(HttpClient httpClient) + { + var assetDeliveryInfo = await GetAssetDeliveryInformation(httpClient); + if (!assetDeliveryInfo.Success) + return new SetupResult { Success = false, Message = $"Failed to fetch versions for asset {AssetId}: {assetDeliveryInfo.Error}" }; - _HttpClient = new HttpClient(httpClientHandler) - { - Timeout = TimeSpan.FromSeconds(Config.Default.HttpTimeout) - }; - //_HttpClient.DefaultRequestHeaders.Add("User-Agent", "Roblox/WinINet"); + AssetType = assetDeliveryInfo.AssetType; + TotalVersions = assetDeliveryInfo.TotalVersions; + + FileExtension = Config.Default.OutputExtension == "Auto" ? assetDeliveryInfo.AssetType.GetExtension() : Config.Default.OutputExtension; - _Index = new List(); + return new SetupResult { Success = true }; + } + + /// + /// Creates all the necessary output directories + /// + public void CreateOutputDirectories() + { + // create all the directories we need + Directory.CreateDirectory(OutputDirectory); + + if (!string.IsNullOrEmpty(IndexOutputDirectory)) + Directory.CreateDirectory(IndexOutputDirectory); + + if (!string.IsNullOrEmpty(FilesOutputDirectory)) + Directory.CreateDirectory(FilesOutputDirectory); } /// /// Creates a request to https://assetdelivery.roblox.com/v1/asset/ /// - /// Asset Id + /// Http client /// Asset Version (0 for latest) /// Http response - public static Task AssetRequest(long id, int version = 0) + public Task AssetRequest(HttpClient httpClient, int version = 0) { - string url = $"https://assetdelivery.{Config.Default.BaseUrl}/v1/asset/?id={id}&version={version}"; - return _HttpClient.GetAsync(url); + string url = $"https://assetdelivery.{Config.Default.BaseUrl}/v1/asset/?id={AssetId}&version={version}"; + return httpClient.GetAsync(url); } /// @@ -168,6 +193,29 @@ public static bool IsSuccessStatusCode(HttpStatusCode code, bool allowForbidden } } + /// + /// Gets the status code from an assetdelivery response. + /// This function will use the field and the to deduce the correct status code. + /// + /// Response message from assetdelivery + /// Status code + private static HttpStatusCode GetAssetDeliveryStatusCode(HttpResponseMessage responseMessage) + { + switch (responseMessage.StatusCode) + { + case HttpStatusCode.Forbidden: + // roblox updated assetdelivery to return 403 for copylocked assets + // originally, it returned a 409 + // this breaks a bunch of existing logic, so we have to check the body aswell + string responseContent = responseMessage.Content.ReadAsStringAsync().Result; + bool isConflict = responseContent.Contains("User is not authorized to access Asset."); // this should suffice for now + return isConflict ? HttpStatusCode.Conflict : HttpStatusCode.Forbidden; + + default: + return responseMessage.StatusCode; + } + } + public struct AssetDeliveryInformation { public bool Success; @@ -180,21 +228,21 @@ public struct AssetDeliveryInformation /// /// Retrieves information from asset delivery /// - /// Asset Id /// Asset delivery information - public static async Task GetAssetDeliveryInformation(long id) + public async Task GetAssetDeliveryInformation(HttpClient httpClient) { - HttpResponseMessage response = await AssetRequest(id); + HttpResponseMessage response = await AssetRequest(httpClient); + HttpStatusCode statusCode = GetAssetDeliveryStatusCode(response); - if (response.StatusCode == HttpStatusCode.Conflict) + if (statusCode == HttpStatusCode.Conflict) return new AssetDeliveryInformation { Success = false, Error = "Insufficient permissions to download asset" }; - if (!IsSuccessStatusCode(response.StatusCode, allowForbidden: true)) // 403 means that the latest version is deleted but can still download - return new AssetDeliveryInformation { Success = false, Error = $"Unhandled status code ({(int)response.StatusCode})" }; + if (!IsSuccessStatusCode(statusCode, allowForbidden: true)) // 403 means that the latest version is deleted but can still download + return new AssetDeliveryInformation { Success = false, Error = $"Unhandled status code ({(int)statusCode})" }; IEnumerable? values; - int versions = 0; - AssetType assetType = 0; + int versions; + AssetType assetType; { if (!response.Headers.TryGetValues("roblox-assetversionnumber", out values)) @@ -215,7 +263,7 @@ public static async Task GetAssetDeliveryInformation(l if (!Enum.TryParse(versionsStr, out assetType)) { Debug.Assert(false); - assetType = AssetType.Product; + assetType = AssetType.Unknown; } } @@ -225,36 +273,86 @@ public static async Task GetAssetDeliveryInformation(l /// /// Retrieves the CDN url from an asset id /// - /// Asset Id + /// Http client /// Version (0 for latest) /// Success, Error string, CDN url - public static async Task<(bool, string, string)> GetCDNUrl(long id, int version = 0) + public async Task<(bool, string, string)> GetCdnUrl(HttpClient httpClient, int version = 0) { - HttpResponseMessage response = await AssetRequest(id, version); + try + { + HttpResponseMessage response = await AssetRequest(httpClient, version); + HttpStatusCode statusCode = GetAssetDeliveryStatusCode(response); + + switch (statusCode) + { + case HttpStatusCode.Conflict: + return (false, "Insufficient permissions to download asset", ""); - if (response.StatusCode == HttpStatusCode.Conflict) - return (false, "Insufficient permissions to download asset", ""); + case HttpStatusCode.Forbidden: + return (false, "Asset version has been deleted", ""); - if (!IsSuccessStatusCode(response.StatusCode, allowForbidden: true)) // 403 means that the latest version is deleted but can still download - return (false, $"Unhandled status code ({(int)response.StatusCode}) ({await response.Content.ReadAsStringAsync()})", ""); + case HttpStatusCode.TooManyRequests: + return (false, "Too many requests", ""); + } + + if (!IsSuccessStatusCode(statusCode)) + return (false, $"Unhandled status code ({(int)statusCode}) ({await response.Content.ReadAsStringAsync()})", ""); - if (!response.Headers.TryGetValues("Location", out IEnumerable? values)) - return (false, "Location header is missing", ""); // this should never happen, but handle anyways + if (!response.Headers.TryGetValues("Location", out IEnumerable? values)) + return (false, "Location header is missing", ""); // this should never happen, but handle anyways - string location = values.First(); + string location = values.First(); - return (true, "Success", location); + return (true, "Success", location); + } + catch (Exception ex) + { + return (false, ex.ToString(), ""); + } + } + + /// + /// Gets content from the CDN using a specified URL + /// + /// Http client + /// CDN Url + /// Success, Error Message, HttpResponseMessage + public static async Task<(bool, string, HttpResponseMessage?)> GetCdnContent(HttpClient httpClient, string url) + { + try + { + HttpResponseMessage response = await httpClient.GetAsync(url); + + switch (response.StatusCode) + { + case HttpStatusCode.Forbidden: + return (false, "Asset not found on CDN", null); + + case HttpStatusCode.TooManyRequests: + return (false, "Too many requests", null); + + default: + if (!IsSuccessStatusCode(response.StatusCode)) + return (false, $"Unknown status code ({(int)response.StatusCode})", null); + break; + } + + return (true, "Success", response); + } + catch (Exception ex) + { + return (false, ex.ToString(), null); + } } /// /// Constructs the asset output path /// - /// Id /// Version /// Asset output path - public static string BuildAssetOutputFileName(long id, int version) + public string BuildAssetOutputFileName(int version) { - string fileName = id.ToString(); + string fileName = AssetId.ToString(); if (version != 0) fileName += $"-v{version}"; @@ -264,13 +362,12 @@ public static string BuildAssetOutputFileName(long id, int version) /// /// Logs an asset to index /// - /// Id /// Version /// CDN url /// File size in Mb /// Last modified /// Error message - private static void LogAsset(long id, + private void LogAsset( int version, string? cdnUrl = null, double? fileSizeInMb = null, @@ -279,7 +376,6 @@ private static void LogAsset(long id, { AssetOutput output = new AssetOutput { - Id = id, Version = version, CDNUrl = cdnUrl, FileSizeInMb = fileSizeInMb, @@ -287,49 +383,48 @@ private static void LogAsset(long id, Error = error }; - Console.WriteLine(output.ToString(trimCdnUrl: ShouldTrimCdnUrlInConsole)); + Console.WriteLine(output.ToString(trimCdnUrl: Config.Default.TrimCdnUrl.ShouldTrim(OutputType.Console))); - _Index.Add(output); + _index.Add(output); } /// /// Logs an asset to index and saves it /// /// Http response messsage - /// Id /// Version /// CDN url - private static async Task LogAssetFromCDNHttpMessageResponse(HttpResponseMessage response, - long id, + private async Task LogAssetFromCdnHttpMessageResponse(HttpResponseMessage response, int version, string cdnUrl) { - if (!ConsoleOnly) - Directory.CreateDirectory(Config.Default.OutputDirectory); + string outputDir = FilesOutputDirectory ?? OutputDirectory; // get last modified - string? lastModified = response.Content.Headers.GetValues("last-modified").FirstOrDefault(); + string? lastModified = null; + if (response.Content.Headers.TryGetValues("last-modified", out IEnumerable? lastModifiedValues)) + lastModified = lastModifiedValues.First(); double? fileSize = null; using (Stream stream = await response.Content.ReadAsStreamAsync()) { - fileSize = Math.Round(stream.Length / 1024f / 1024f, 6); + fileSize = Math.Round(stream.Length / 1024.0 / 1024.0, 6); - if (FilesEnabled) + if (Config.Default.OutputType.IsFileSavingEnabled()) { - string outputName = BuildAssetOutputFileName(id, version); - string path = Path.Combine(Config.Default.OutputDirectory, outputName); + string outputName = BuildAssetOutputFileName(version); + string path = Path.Combine(outputDir, outputName); string outputPath = FileWriter.BuildOutputFileName(path, FileExtension); DateTime? lastModifiedDT = lastModified != null ? DateTime.Parse(lastModified) : null; - FileWriter.Save(outputPath, stream, CompressionLevel, lastModifiedDT); + CompressionType compressionType = ShouldCompress ? Config.Default.CompressionType : CompressionType.None; + FileWriter.Save(outputPath, stream, compressionType, Config.Default.CompressionLevel, lastModifiedDT); } } LogAsset( - id: id, version: version, cdnUrl: cdnUrl, fileSizeInMb: fileSize, @@ -340,19 +435,19 @@ private static async Task LogAssetFromCDNHttpMessageResponse(HttpResponseMessage /// /// Increments and invokes . /// - private static void FireAssetSuccess() + private void FireAssetSuccess() { - SuccessfulDownloads++; - OnDownloadFinished?.Invoke(); + Interlocked.Increment(ref _successfulDownloads); + OnDownloadFinished?.Invoke(true); } /// /// Increments and invokes . /// - private static void FireAssetFailed() + private void FireAssetFailed() { - SuccessfulDownloads++; - OnDownloadFinished?.Invoke(); + Interlocked.Increment(ref _failedDownloads); + OnDownloadFinished?.Invoke(false); } /// @@ -360,74 +455,87 @@ private static void FireAssetFailed() /// /// Worker // TODO: add try catch blocks. give 3 retries w/ exceptions - public static async Task StartWorker() + public async Task StartWorker(HttpClient httpClient) { - while (Assets.Count > 0) + while (TotalVersions > CurrentVersion) { - AssetInput asset; - lock (Assets) + int version; + lock (_lock) { - if (Assets.Count == 0) + if (TotalVersions <= CurrentVersion) continue; - asset = Assets.Dequeue(); + CurrentVersion++; + version = CurrentVersion; } // get the url - (bool cdnGetSuccess, string cdnGetMessage, string cdnUrl) = await GetCDNUrl(asset.Id, asset.Version); + (bool cdnGetSuccess, string cdnGetMessage, string cdnUrl) = await GetCdnUrl(httpClient, version); if (!cdnGetSuccess) { - LogAsset(error: $"Failed to fetch {asset.Id} v{asset.Version}: {cdnGetMessage}", id: asset.Id, version: asset.Version); + LogAsset(error: $"Failed to fetch {AssetId} v{version}: {cdnGetMessage}", version: version); FireAssetFailed(); continue; } // download the asset - HttpResponseMessage cdnResponse = await _HttpClient.GetAsync(cdnUrl); + (bool cdnDownloadSuccess, string cdnDownloadMessage, HttpResponseMessage? cdnDownloadResponse) = await GetCdnContent(httpClient, cdnUrl); - if (cdnResponse.StatusCode == HttpStatusCode.Forbidden) + if (!cdnDownloadSuccess) { - LogAsset(error: $"Failed to fetch {asset.Id} v{asset.Version} ({cdnUrl}): Asset not found on CDN", id: asset.Id, version: asset.Version); + LogAsset(error: $"Failed to fetch {AssetId} v{version} ({cdnUrl}): {cdnDownloadMessage}", version: version); FireAssetFailed(); continue; } - if (!IsSuccessStatusCode(cdnResponse.StatusCode)) - { - LogAsset(error: $"Failed to fetch {asset.Id} v{asset.Version} ({cdnUrl}): Unknown status code ({(int)cdnResponse.StatusCode})", id: asset.Id, version: asset.Version); - FireAssetFailed(); - continue; - } + Debug.Assert(cdnDownloadResponse != null); // save! - await LogAssetFromCDNHttpMessageResponse(cdnResponse, asset.Id, asset.Version, cdnUrl); + await LogAssetFromCdnHttpMessageResponse(cdnDownloadResponse, version, cdnUrl); FireAssetSuccess(); } } + public async Task StartWorker() + { + HttpClient httpClient = Config.Default.SingleHttpClient ? Http.Client : Http.CreateClient(); + + try + { + await StartWorker(httpClient); + } + finally + { + // only dispose if it isnt the global http client + if (httpClient != Http.Client) + httpClient.Dispose(); + } + } + /// /// Prints download statistics /// - public static void PrintDownloadStatistics() + public void PrintDownloadStatistics() { - Console.WriteLine($"Successful Downloads: {SuccessfulDownloads}"); - Console.WriteLine($"Failed Downloads: {FailedDownloads}"); - Console.WriteLine($"Total Downloads: {SuccessfulDownloads + FailedDownloads}"); + Console.WriteLine($"{AssetId} | Successful Downloads: {SuccessfulDownloads}"); + Console.WriteLine($"{AssetId} | Failed Downloads: {FailedDownloads}"); + Console.WriteLine($"{AssetId} | Total Downloads: {SuccessfulDownloads + FailedDownloads}"); } /// /// Writes the index file /// - /// Index header - public static void WriteIndexFile(string header) + public void WriteIndexFile() { - if (!IndexEnabled) + if (!Config.Default.OutputType.IsIndexEnabled()) return; - Directory.CreateDirectory(Config.Default.OutputDirectory); + string outputDir = IndexOutputDirectory ?? OutputDirectory; + + Directory.CreateDirectory(outputDir); // sort index values - _Index.Sort(); + _index.Sort(); List indexPaths = new List(); @@ -436,14 +544,14 @@ public static void WriteIndexFile(string header) // create index file contents StringBuilder builder = new StringBuilder(); - builder.AppendLine(header); + builder.AppendLine($"{AssetId} asset versions on {DateTime.Now.ToString("R")} ({TotalVersions} versions)"); - foreach (AssetOutput asset in _Index) - builder.AppendLine(asset.ToString()); + foreach (AssetOutput asset in _index) + builder.AppendLine(asset.ToString(trimCdnUrl: Config.Default.TrimCdnUrl.ShouldTrim(OutputType.Index))); string contents = builder.ToString(); - string path = Path.Combine(Config.Default.OutputDirectory, "index.txt"); + string path = Path.Combine(outputDir, $"{_indexName}.txt"); indexPaths.Add(path); File.WriteAllText(path, contents); @@ -451,9 +559,9 @@ public static void WriteIndexFile(string header) if (Config.Default.IndexType == IndexType.Json || Config.Default.IndexType == IndexType.All) { - string contents = JsonSerializer.Serialize(_Index); + string contents = JsonSerializer.Serialize(_index); - string path = Path.Combine(Config.Default.OutputDirectory, "index.json"); + string path = Path.Combine(outputDir, $"{_indexName}.json"); indexPaths.Add(path); File.WriteAllText(path, contents); @@ -462,5 +570,12 @@ public static void WriteIndexFile(string header) // write information about index Console.WriteLine($"Index file(s) can be found at {string.Join(", ", indexPaths)}"); } + + /// + /// Gets the path of the index file + /// + /// Get the path of the JSON variation of the index + /// Index file path + public string GetIndexFilePath(bool json = false) => Path.Combine(IndexOutputDirectory ?? OutputDirectory, $"{_indexName}.{(json ? "json" : "txt")}"); } }