4343import java .util .concurrent .ScheduledFuture ;
4444import java .util .concurrent .TimeUnit ;
4545import java .util .concurrent .TimeoutException ;
46- import java .util .concurrent .atomic .AtomicInteger ;
46+ import java .util .concurrent .atomic .AtomicLong ;
4747import lombok .extern .slf4j .Slf4j ;
4848import org .apache .commons .lang3 .StringUtils ;
4949import org .apache .commons .lang3 .mutable .MutableInt ;
@@ -101,7 +101,7 @@ public class ServiceUnitStateChannelImpl implements ServiceUnitStateChannel {
101101 private long totalCleanupCnt = 0 ;
102102 private long totalBrokerCleanupTombstoneCnt = 0 ;
103103 private long totalServiceUnitCleanupTombstoneCnt = 0 ;
104- private long totalServiceUnitCleanupErrorCnt = 0 ;
104+ private AtomicLong totalCleanupErrorCnt = new AtomicLong () ;
105105 private long totalCleanupScheduledCnt = 0 ;
106106 private long totalCleanupIgnoredCnt = 0 ;
107107 private long totalCleanupCancelledCnt = 0 ;
@@ -175,10 +175,11 @@ public synchronized void start() throws PulsarServerException {
175175 }
176176 tableview = pulsar .getClient ().newTableViewBuilder (schema )
177177 .topic (TOPIC )
178- // TODO: enable CompactionStrategy
178+ .loadConf (Map .of (
179+ "topicCompactionStrategyClassName" ,
180+ ServiceUnitStateCompactionStrategy .class .getName ()))
179181 .create ();
180- // TODO: schedule listen instead of foreachAndListen
181- tableview .forEachAndListen ((key , value ) -> handle (key , value ));
182+ tableview .listen ((key , value ) -> handle (key , value ));
182183 log .debug ("Successfully started the channel tableview." );
183184
184185 pulsar .getLocalMetadataStore ().registerSessionListener (this ::handleMetadataSessionEvent );
@@ -332,8 +333,6 @@ private void handle(String serviceUnit, ServiceUnitStateData data) {
332333 }
333334
334335 ServiceUnitState state = data == null ? Free : data .state ();
335-
336- // TODO : Add state validation in tableview by the compaction strategy
337336 switch (state ) {
338337 case Owned -> handleOwnEvent (serviceUnit , data );
339338 case Assigned -> handleAssignEvent (serviceUnit , data );
@@ -599,7 +598,16 @@ private void scheduleCleanup(String broker, long delayInSecs) {
599598 .delayedExecutor (delayInSecs , TimeUnit .SECONDS , pulsar .getLoadManagerExecutor ());
600599 totalCleanupScheduledCnt ++;
601600 return CompletableFuture
602- .runAsync (() -> doCleanup (broker ), delayed );
601+ .runAsync (() -> {
602+ try {
603+ doCleanup (broker );
604+ } catch (Throwable e ) {
605+ log .error ("Failed to run the cleanup job for the broker {}, "
606+ + "totalCleanupErrorCnt:{}." ,
607+ broker , totalCleanupErrorCnt .incrementAndGet (), e );
608+ }
609+ }
610+ , delayed );
603611 });
604612
605613 log .info ("Scheduled ownership cleanup for broker:{} with delay:{} secs. Pending clean jobs:{}." ,
@@ -610,23 +618,23 @@ private void scheduleCleanup(String broker, long delayInSecs) {
610618 private void doCleanup (String broker ) {
611619 long startTime = System .nanoTime ();
612620 log .info ("Started ownership cleanup for the inactive broker:{}" , broker );
613- AtomicInteger serviceUnitTombstoneCnt = new AtomicInteger () ;
614- AtomicInteger serviceUnitTombstoneErrorCnt = new AtomicInteger ();
621+ int serviceUnitTombstoneCnt = 0 ;
622+ long totalCleanupErrorCntStart = totalCleanupErrorCnt . get ();
615623 for (Map .Entry <String , ServiceUnitStateData > etr : tableview .entrySet ()) {
616624 ServiceUnitStateData stateData = etr .getValue ();
617625 String serviceUnit = etr .getKey ();
618626 if (StringUtils .equals (broker , stateData .broker ())
619627 || StringUtils .equals (broker , stateData .sourceBroker ())) {
620628 log .info ("Cleaning ownership serviceUnit:{}, stateData:{}." , serviceUnit , stateData );
621629 tombstoneAsync (serviceUnit ).whenComplete ((__ , e ) -> {
622- if (e == null ) {
623- serviceUnitTombstoneCnt .incrementAndGet ();
624- } else {
625- log .error ("Failed cleaning the ownership serviceUnit:{}, stateData:{}." ,
626- serviceUnit , stateData );
627- serviceUnitTombstoneErrorCnt .incrementAndGet ();
630+ if (e != null ) {
631+ log .error ("Failed cleaning the ownership serviceUnit:{}, stateData:{}, "
632+ + "cleanupErrorCnt:{}." ,
633+ serviceUnit , stateData ,
634+ totalCleanupErrorCnt .incrementAndGet () - totalCleanupErrorCntStart );
628635 }
629636 });
637+ serviceUnitTombstoneCnt ++;
630638 }
631639 }
632640
@@ -636,26 +644,22 @@ private void doCleanup(String broker) {
636644 log .error ("Failed to flush the in-flight messages." , e );
637645 }
638646
639- if (serviceUnitTombstoneCnt . get () > 0 ) {
647+ if (serviceUnitTombstoneCnt > 0 ) {
640648 this .totalCleanupCnt ++;
641- this .totalServiceUnitCleanupTombstoneCnt += serviceUnitTombstoneCnt . get () ;
649+ this .totalServiceUnitCleanupTombstoneCnt += serviceUnitTombstoneCnt ;
642650 this .totalBrokerCleanupTombstoneCnt ++;
643651 }
644652
645- if (serviceUnitTombstoneErrorCnt .get () > 0 ) {
646- this .totalServiceUnitCleanupErrorCnt += serviceUnitTombstoneErrorCnt .get ();
647- }
648-
649653 double cleanupTime = TimeUnit .NANOSECONDS
650654 .toMillis ((System .nanoTime () - startTime ));
651655 // TODO: clean load data stores
652656 log .info ("Completed a cleanup for the inactive broker:{} in {} ms. "
653657 + "Published tombstone for orphan service units: serviceUnitTombstoneCnt:{}, "
654- + "serviceUnitTombstoneErrorCnt :{}, metrics:{} " ,
658+ + "approximate cleanupErrorCnt :{}, metrics:{} " ,
655659 broker ,
656660 cleanupTime ,
657661 serviceUnitTombstoneCnt ,
658- serviceUnitTombstoneErrorCnt ,
662+ totalCleanupErrorCnt . get () - totalCleanupErrorCntStart ,
659663 printCleanupMetrics ());
660664 cleanupJobs .remove (broker );
661665 }
@@ -675,8 +679,8 @@ private void monitorOwnerships(List<String> brokers) {
675679 long startTime = System .nanoTime ();
676680 Set <String > inactiveBrokers = new HashSet <>();
677681 Set <String > activeBrokers = new HashSet <>(brokers );
678- AtomicInteger serviceUnitTombstoneCnt = new AtomicInteger () ;
679- AtomicInteger serviceUnitTombstoneErrorCnt = new AtomicInteger ();
682+ int serviceUnitTombstoneCnt = 0 ;
683+ long totalCleanupErrorCntStart = totalCleanupErrorCnt . get ();
680684 long now = System .currentTimeMillis ();
681685 for (Map .Entry <String , ServiceUnitStateData > etr : tableview .entrySet ()) {
682686 String serviceUnit = etr .getKey ();
@@ -690,14 +694,14 @@ private void monitorOwnerships(List<String> brokers) {
690694 serviceUnit , stateData );
691695
692696 tombstoneAsync (serviceUnit ).whenComplete ((__ , e ) -> {
693- if (e == null ) {
694- serviceUnitTombstoneCnt .incrementAndGet ();
695- } else {
696- log .error ("Failed cleaning the ownership serviceUnit:{}, stateData:{}." ,
697- serviceUnit , stateData );
698- serviceUnitTombstoneErrorCnt .incrementAndGet ();
697+ if (e != null ) {
698+ log .error ("Failed cleaning the ownership serviceUnit:{}, stateData:{}, "
699+ + "cleanupErrorCnt:{}." ,
700+ serviceUnit , stateData ,
701+ totalCleanupErrorCnt .incrementAndGet () - totalCleanupErrorCntStart );
699702 }
700703 });
704+ serviceUnitTombstoneCnt ++;
701705 }
702706 }
703707
@@ -711,36 +715,35 @@ private void monitorOwnerships(List<String> brokers) {
711715 log .error ("Failed to flush the in-flight messages." , e );
712716 }
713717
714- if (serviceUnitTombstoneCnt . get () > 0 ) {
715- this .totalServiceUnitCleanupTombstoneCnt += serviceUnitTombstoneCnt . get () ;
718+ if (serviceUnitTombstoneCnt > 0 ) {
719+ this .totalServiceUnitCleanupTombstoneCnt += serviceUnitTombstoneCnt ;
716720 }
717- this .totalServiceUnitCleanupErrorCnt += serviceUnitTombstoneErrorCnt .get ();
718721
719722 double monitorTime = TimeUnit .NANOSECONDS
720723 .toMillis ((System .nanoTime () - startTime ));
721724 log .info ("Completed the ownership monitor run in {} ms. "
722725 + "Scheduled cleanups for inactiveBrokers:{}. inactiveBrokerCount:{}. "
723726 + "Published tombstone for orphan service units: serviceUnitTombstoneCnt:{}, "
724- + "serviceUnitTombstoneErrorCnt :{}, metrics:{} " ,
727+ + "approximate cleanupErrorCnt :{}, metrics:{} " ,
725728 monitorTime ,
726729 inactiveBrokers ,
727730 inactiveBrokers .size (),
728731 serviceUnitTombstoneCnt ,
729- serviceUnitTombstoneErrorCnt ,
732+ totalCleanupErrorCnt . get () - totalCleanupErrorCntStart ,
730733 printCleanupMetrics ());
731734
732735 }
733736
734737 private String printCleanupMetrics () {
735738 return String .format (
736739 "{totalCleanupCnt:%d, totalBrokerCleanupTombstoneCnt:%d, "
737- + "totalServiceUnitCleanupTombstoneCnt:%d, totalServiceUnitCleanupErrorCnt :%d, "
740+ + "totalServiceUnitCleanupTombstoneCnt:%d, totalCleanupErrorCnt :%d, "
738741 + "totalCleanupScheduledCnt:%d, totalCleanupIgnoredCnt:%d, totalCleanupCancelledCnt:%d, "
739742 + " activeCleanupJobs:%d}" ,
740743 totalCleanupCnt ,
741744 totalBrokerCleanupTombstoneCnt ,
742745 totalServiceUnitCleanupTombstoneCnt ,
743- totalServiceUnitCleanupErrorCnt ,
746+ totalCleanupErrorCnt . get () ,
744747 totalCleanupScheduledCnt ,
745748 totalCleanupIgnoredCnt ,
746749 totalCleanupCancelledCnt ,
0 commit comments