From 19f0b1b015535464bc9d201816496c4e4670c054 Mon Sep 17 00:00:00 2001 From: unknown <58430028+owbryd@users.noreply.github.com> Date: Fri, 15 May 2026 11:20:16 -0300 Subject: [PATCH] fix bugs in HTML file --- app/dl/dl.go | 47 +++++ app/dl/htmlexport.go | 386 +++++++++++++++++++++++++++++++++++++++ cmd/dl.go | 11 +- pkg/consts/flag.go | 5 + pkg/tmessage/chat.go | 177 ++++++++++++++++++ pkg/tmessage/tmessage.go | 29 ++- 6 files changed, 651 insertions(+), 4 deletions(-) create mode 100644 app/dl/htmlexport.go create mode 100644 pkg/tmessage/chat.go diff --git a/app/dl/dl.go b/app/dl/dl.go index e603250ea1..054369424d 100644 --- a/app/dl/dl.go +++ b/app/dl/dl.go @@ -39,12 +39,21 @@ type Options struct { Takeout bool Group bool // auto detect grouped message + // chat-based download + Chat string + Topic int + MsgStart int + MsgEnd int + // resume opts Continue, Restart bool // serve Serve bool Port int + + // html export for text messages + SaveHTML bool } type parser struct { @@ -62,6 +71,14 @@ func Run(ctx context.Context, c *telegram.Client, kvd storage.Storage, opts Opti {Data: opts.URLs, Parser: tmessage.FromURL(ctx, pool, kvd, opts.URLs)}, {Data: opts.Files, Parser: tmessage.FromFile(ctx, pool, kvd, opts.Files, true)}, } + + // Add chat-based parser if --chat is specified + if opts.Chat != "" { + parsers = append(parsers, parser{ + Data: []string{opts.Chat}, + Parser: tmessage.FromChat(ctx, pool, kvd, opts.Chat, opts.Topic, opts.MsgStart, opts.MsgEnd), + }) + } dialogs, err := collectDialogs(parsers) if err != nil { return err @@ -69,6 +86,36 @@ func Run(ctx context.Context, c *telegram.Client, kvd storage.Storage, opts Opti logctx.From(ctx).Debug("Collect dialogs", zap.Any("dialogs", dialogs)) + // Export text messages as HTML if --save-html is enabled and --chat was used + if opts.SaveHTML && opts.Chat != "" { + for _, group := range dialogs { + for _, d := range group { + if len(d.TextMessages) > 0 { + chatName := opts.Chat + chatID := tmessage.GetDialogPeerID(d.Peer) + + // Convert tmessage.TextMsg to dl.TextMessage + textMsgs := make([]TextMessage, len(d.TextMessages)) + for i, tm := range d.TextMessages { + textMsgs[i] = TextMessage{ + ID: tm.ID, + Date: tm.Date, + Text: tm.Text, + Entities: tm.Entities, + ReplyToMsgID: tm.ReplyToMsgID, + FromName: tm.FromName, + } + } + + if err := exportTextMessagesHTML(opts.Dir, chatName, chatID, textMsgs); err != nil { + logctx.From(ctx).Warn("Failed to export text messages as HTML", + zap.Error(err)) + } + } + } + } + } + if opts.Serve { return serve(ctx, kvd, pool, dialogs, opts.Port, opts.Takeout) } diff --git a/app/dl/htmlexport.go b/app/dl/htmlexport.go new file mode 100644 index 0000000000..7728c50b4e --- /dev/null +++ b/app/dl/htmlexport.go @@ -0,0 +1,386 @@ +package dl + +import ( + "fmt" + "html" + "os" + "path/filepath" + "sort" + "strings" + "time" + + "github.com/fatih/color" + "github.com/gotd/td/tg" +) + +// HTML tag constants to avoid goconst lint warnings. +const ( + htmlCloseB = "" + htmlCloseA = "" +) + +// TextMessage holds the data for a single text-only message to be exported as HTML. +type TextMessage struct { + ID int + Date int + Text string + Entities []tg.MessageEntityClass + // ReplyToMsgID is the ID of the message this is replying to (0 if not a reply) + ReplyToMsgID int + // FromName is the sender name (if available) + FromName string +} + +// exportTextMessagesHTML writes a self-contained HTML file with all collected text messages. +// The file is saved to dir/chatName_text_messages.html +func exportTextMessagesHTML(dir string, chatName string, chatID int64, messages []TextMessage) error { + if len(messages) == 0 { + return nil + } + + // Sort messages by ID (ascending = chronological) + sort.Slice(messages, func(i, j int) bool { + return messages[i].ID < messages[j].ID + }) + + // Sanitize chat name for filename + safeName := sanitizeFilename(chatName) + filename := fmt.Sprintf("%s_%d_text_messages.html", safeName, chatID) + path := filepath.Join(dir, filename) + + if err := os.MkdirAll(dir, 0o755); err != nil { + return fmt.Errorf("create dir for HTML export: %w", err) + } + + f, err := os.Create(path) + if err != nil { + return fmt.Errorf("create HTML file: %w", err) + } + defer f.Close() + + // Write HTML header + fmt.Fprintf(f, ` + +
+ + +", ""
+ case *tg.MessageEntityPre:
+ offset, length = e.Offset, e.Length
+ lang := ""
+ if e.Language != "" {
+ lang = fmt.Sprintf(" data-lang=\"%s\"", html.EscapeString(e.Language))
+ }
+ openTag = fmt.Sprintf("", lang) + closeTag = "" + case *tg.MessageEntityTextURL: + offset, length = e.Offset, e.Length + openTag = fmt.Sprintf("", html.EscapeString(e.URL)) + closeTag = htmlCloseA + case *tg.MessageEntityURL: + offset, length = e.Offset, e.Length + urlText := string(runes[offset : offset+length]) + openTag = fmt.Sprintf("", html.EscapeString(urlText)) + closeTag = htmlCloseA + case *tg.MessageEntityMentionName: + offset, length = e.Offset, e.Length + openTag = fmt.Sprintf("", e.UserID) + closeTag = htmlCloseB + case *tg.MessageEntityMention: + offset, length = e.Offset, e.Length + mention := string(runes[offset : offset+length]) + openTag = fmt.Sprintf("", + html.EscapeString(strings.TrimPrefix(mention, "@"))) + closeTag = htmlCloseA + case *tg.MessageEntityHashtag: + offset, length = e.Offset, e.Length + openTag = "" + closeTag = htmlCloseB + case *tg.MessageEntitySpoiler: + offset, length = e.Offset, e.Length + openTag = "" + closeTag = "" + case *tg.MessageEntityBlockquote: + offset, length = e.Offset, e.Length + openTag = "
" + closeTag = "" + default: + continue + } + + tags = append(tags, tag{pos: offset, close: false, order: 0, html: openTag}) + tags = append(tags, tag{pos: offset + length, close: true, order: 1, html: closeTag}) + } + + // Sort tags: by position, then closes before opens at same position + sort.SliceStable(tags, func(i, j int) bool { + if tags[i].pos != tags[j].pos { + return tags[i].pos < tags[j].pos + } + return tags[i].order > tags[j].order // closes first + }) + + // Build output + var b strings.Builder + tagIdx := 0 + for i, r := range runes { + // Insert any tags at this position + for tagIdx < len(tags) && tags[tagIdx].pos == i { + b.WriteString(tags[tagIdx].html) + tagIdx++ + } + b.WriteString(html.EscapeString(string(r))) + } + // Flush remaining tags at end position + for tagIdx < len(tags) { + b.WriteString(tags[tagIdx].html) + tagIdx++ + } + + return b.String() +} + +// sanitizeFilename removes or replaces characters that are not safe for filenames. +func sanitizeFilename(name string) string { + replacer := strings.NewReplacer( + "/", "_", "\\", "_", ":", "_", "*", "_", + "?", "_", "\"", "_", "<", "_", ">", "_", "|", "_", + " ", "_", + ) + result := replacer.Replace(name) + if len(result) > 60 { + result = result[:60] + } + return result +} diff --git a/cmd/dl.go b/cmd/dl.go index 6540ee8e97..fe150f8f79 100644 --- a/cmd/dl.go +++ b/cmd/dl.go @@ -23,8 +23,8 @@ func NewDownload() *cobra.Command { Short: "Download anything from Telegram (protected) chat", GroupID: groupTools.ID, RunE: func(cmd *cobra.Command, args []string) error { - if len(opts.URLs) == 0 && len(opts.Files) == 0 { - return fmt.Errorf("no urls or files provided") + if len(opts.URLs) == 0 && len(opts.Files) == 0 && opts.Chat == "" { + return fmt.Errorf("no urls, files, or chat provided") } opts.Template = viper.GetString(consts.FlagDlTemplate) @@ -61,6 +61,13 @@ func NewDownload() *cobra.Command { cmd.Flags().BoolVar(&opts.Takeout, "takeout", false, "takeout sessions let you export data from your account with lower flood wait limits.") cmd.Flags().BoolVar(&opts.Group, "group", false, "auto detect grouped message and download all of them") + // chat-based download flags + cmd.Flags().StringVarP(&opts.Chat, "chat", "c", "", "chat id or username to download all media from. Supports -100XXXXXXXXXX format") + cmd.Flags().IntVar(&opts.Topic, "topic", 0, "topic id for forum groups (downloads only messages within the topic)") + cmd.Flags().IntVar(&opts.MsgStart, "msg-start", 0, "optional: minimum message id to download (default: from the beginning)") + cmd.Flags().IntVar(&opts.MsgEnd, "msg-end", 0, "optional: maximum message id to download (default: to the latest)") + cmd.Flags().BoolVar(&opts.SaveHTML, "save-html", true, "save text messages (non-media) as an HTML file when using --chat (default: true)") + // resume flags, if both false then ask user cmd.Flags().BoolVar(&opts.Continue, _continue, false, "continue the last download directly") cmd.Flags().BoolVar(&opts.Restart, restart, false, "restart the last download directly") diff --git a/pkg/consts/flag.go b/pkg/consts/flag.go index 1df7b7dd79..aac451b3db 100644 --- a/pkg/consts/flag.go +++ b/pkg/consts/flag.go @@ -13,4 +13,9 @@ const ( FlagNTP = "ntp" FlagReconnectTimeout = "reconnect-timeout" FlagDlTemplate = "template" + FlagChat = "chat" + FlagTopic = "topic" + FlagMsgStart = "msg-start" + FlagMsgEnd = "msg-end" + FlagSaveHTML = "save-html" ) diff --git a/pkg/tmessage/chat.go b/pkg/tmessage/chat.go new file mode 100644 index 0000000000..6bcc620a76 --- /dev/null +++ b/pkg/tmessage/chat.go @@ -0,0 +1,177 @@ +package tmessage + +import ( + "context" + "fmt" + "strings" + + "github.com/fatih/color" + "github.com/go-faster/errors" + "github.com/gotd/td/telegram/peers" + "github.com/gotd/td/telegram/query" + "github.com/gotd/td/telegram/query/messages" + "github.com/gotd/td/tg" + "go.uber.org/zap" + + "github.com/iyear/tdl/core/dcpool" + "github.com/iyear/tdl/core/logctx" + "github.com/iyear/tdl/core/storage" + "github.com/iyear/tdl/core/util/tutil" +) + +// FromChat creates a ParseSource that collects all media message IDs from a chat +// by iterating the chat history via the Telegram API. This avoids the need to +// pass individual message links or export JSON files. +// +// chat: chat ID (numeric) or username +// topic: topic root message ID (0 = no topic, iterate full history) +// msgStart: minimum message ID to include (0 = no lower bound) +// msgEnd: maximum message ID to include (0 = no upper bound) +func FromChat(ctx context.Context, pool dcpool.Pool, kvd storage.Storage, chat string, topic int, msgStart int, msgEnd int) ParseSource { + return func() ([]*Dialog, error) { + manager := peers.Options{Storage: storage.NewPeers(kvd)}. + Build(pool.Default(ctx)) + + // Normalize chat ID: handle -100XXXXXXXXXX format + normalizedChat := normalizeChatID(chat) + + peer, err := tutil.GetInputPeer(ctx, manager, normalizedChat) + if err != nil { + return nil, errors.Wrapf(err, "resolve chat '%s'", chat) + } + + logctx.From(ctx).Info("Resolved chat for download", + zap.Int64("peer_id", peer.ID()), + zap.String("peer_name", peer.VisibleName()), + zap.Int("topic", topic), + zap.Int("msg_start", msgStart), + zap.Int("msg_end", msgEnd)) + + color.Cyan("Collecting messages from '%s' (ID: %d)...", peer.VisibleName(), peer.ID()) + + // Build the appropriate query (topic or full history) + var q messages.Query + if topic > 0 { + q = query.NewQuery(pool.Default(ctx)).Messages().GetReplies(peer.InputPeer()).MsgID(topic) + } else { + q = query.NewQuery(pool.Default(ctx)).Messages().GetHistory(peer.InputPeer()) + } + + iter := messages.NewIterator(q, 100) + + // If msgEnd is set, start iterating from that point + if msgEnd > 0 { + iter = iter.OffsetID(msgEnd + 1) + } + + msgIDs := make([]int, 0) + textMsgs := make([]TextMsg, 0) + for iter.Next(ctx) { + msg := iter.Value() + + m, ok := msg.Msg.(*tg.Message) + if !ok { + continue + } + + // Stop if we've gone past the start boundary + if msgStart > 0 && m.ID < msgStart { + break + } + + // Collect media messages for download + if hasMedia(m) { + msgIDs = append(msgIDs, m.ID) + } + + // Collect text messages for HTML export (including captions on media messages) + if m.Message != "" { + tm := TextMsg{ + ID: m.ID, + Date: m.Date, + Text: m.Message, + Entities: m.Entities, + } + + // Extract reply info + if replyTo, ok := m.GetReplyTo(); ok { + if rh, ok := replyTo.(*tg.MessageReplyHeader); ok { + tm.ReplyToMsgID = rh.ReplyToMsgID + } + } + + // Try to extract sender name from the message's FromID + if fromID, ok := m.GetFromID(); ok { + tm.FromName = extractSenderName(ctx, manager, fromID) + } + + textMsgs = append(textMsgs, tm) + } + } + + if err := iter.Err(); err != nil { + return nil, errors.Wrap(err, "iterate chat messages") + } + + if len(msgIDs) == 0 && len(textMsgs) == 0 { + return nil, fmt.Errorf("no messages found in chat '%s' (ID: %d)", peer.VisibleName(), peer.ID()) + } + + if len(msgIDs) > 0 { + color.Green("Found %d media messages to download", len(msgIDs)) + } + if len(textMsgs) > 0 { + color.Green("Found %d text messages for HTML export", len(textMsgs)) + } + + return []*Dialog{{ + Peer: peer.InputPeer(), + Messages: msgIDs, + TextMessages: textMsgs, + }}, nil + } +} + +// extractSenderName tries to resolve a PeerClass into a display name. +func extractSenderName(ctx context.Context, manager *peers.Manager, fromID tg.PeerClass) string { + switch f := fromID.(type) { + case *tg.PeerUser: + if p, err := manager.ResolveUserID(ctx, f.UserID); err == nil { + return p.VisibleName() + } + case *tg.PeerChannel: + if p, err := manager.ResolveChannelID(ctx, f.ChannelID); err == nil { + return p.VisibleName() + } + case *tg.PeerChat: + if p, err := manager.ResolveChatID(ctx, f.ChatID); err == nil { + return p.VisibleName() + } + } + return "" +} + +// hasMedia checks if a message contains downloadable media (document or photo). +func hasMedia(m *tg.Message) bool { + md, ok := m.GetMedia() + if !ok { + return false + } + + switch md.(type) { + case *tg.MessageMediaDocument, *tg.MessageMediaPhoto: + return true + default: + return false + } +} + +// normalizeChatID handles the common Telegram marked channel ID format. +// Users often encounter IDs like -1001931890116 (from bot APIs), but the +// internal Telegram channel ID is 1931890116. This strips the -100 prefix. +func normalizeChatID(chat string) string { + if strings.HasPrefix(chat, "-100") && len(chat) > 4 { + return chat[4:] + } + return chat +} diff --git a/pkg/tmessage/tmessage.go b/pkg/tmessage/tmessage.go index 7f37661631..b8f41904d3 100644 --- a/pkg/tmessage/tmessage.go +++ b/pkg/tmessage/tmessage.go @@ -4,9 +4,21 @@ import ( "github.com/gotd/td/tg" ) +// TextMsg holds the essential data of a text-only message (no downloadable media) +// for later export as HTML. +type TextMsg struct { + ID int + Date int + Text string + Entities []tg.MessageEntityClass + ReplyToMsgID int + FromName string +} + type Dialog struct { - Peer tg.InputPeerClass - Messages []int + Peer tg.InputPeerClass + Messages []int + TextMessages []TextMsg // text-only messages (for HTML export) } type ParseSource func() ([]*Dialog, error) @@ -14,3 +26,16 @@ type ParseSource func() ([]*Dialog, error) func Parse(src ParseSource) ([]*Dialog, error) { return src() } + +// GetDialogPeerID extracts the numeric ID from an InputPeerClass. +func GetDialogPeerID(peer tg.InputPeerClass) int64 { + switch p := peer.(type) { + case *tg.InputPeerUser: + return p.UserID + case *tg.InputPeerChat: + return p.ChatID + case *tg.InputPeerChannel: + return p.ChannelID + } + return 0 +}