2015-10-14 01:11:29 +00:00
|
|
|
package satan
|
|
|
|
|
|
|
|
import (
|
2015-10-18 00:22:07 +00:00
|
|
|
"fmt"
|
2015-10-14 01:11:29 +00:00
|
|
|
"log"
|
2015-10-24 16:25:16 +00:00
|
|
|
"os"
|
2015-10-18 00:22:07 +00:00
|
|
|
"runtime/debug"
|
2015-10-14 01:11:29 +00:00
|
|
|
"sync"
|
2015-10-23 22:04:47 +00:00
|
|
|
"sync/atomic"
|
2015-10-14 01:11:29 +00:00
|
|
|
"time"
|
2015-10-26 23:54:00 +00:00
|
|
|
|
|
|
|
"github.com/localhots/satan/stats"
|
2015-10-14 01:11:29 +00:00
|
|
|
)
|
|
|
|
|
|
|
|
// Satan is the master daemon.
|
|
|
|
type Satan struct {
|
2015-10-17 00:41:21 +00:00
|
|
|
SubscribeFunc SubscribeFunc
|
|
|
|
Publisher Publisher
|
2015-10-26 23:54:00 +00:00
|
|
|
DaemonStats stats.Publisher
|
2015-10-24 16:25:16 +00:00
|
|
|
Logger *log.Logger
|
2015-10-17 00:41:21 +00:00
|
|
|
|
2015-10-27 00:42:00 +00:00
|
|
|
MinNumWorkers uint32
|
|
|
|
MaxNumWorkers uint32
|
|
|
|
numWorkers int64
|
|
|
|
ScalePlan *ScalePlan
|
2015-10-27 00:09:19 +00:00
|
|
|
|
2015-10-26 23:54:00 +00:00
|
|
|
daemons []Daemon
|
|
|
|
queue chan *task
|
|
|
|
runtimeStats stats.Manager
|
2015-10-18 00:22:07 +00:00
|
|
|
|
|
|
|
wgWorkers sync.WaitGroup
|
|
|
|
wgSystem sync.WaitGroup
|
|
|
|
shutdownWorkers chan struct{}
|
|
|
|
shutdownSystem chan struct{}
|
2015-10-14 01:11:29 +00:00
|
|
|
}
|
|
|
|
|
2015-10-15 23:07:04 +00:00
|
|
|
// Actor is a unit of work: a function with no arguments and no results that
// daemon workers can execute.
type Actor func()
|
|
|
|
|
2015-10-17 00:41:21 +00:00
|
|
|
// SubscribeFunc is a function that is used by daemons to subscribe to messages.
|
|
|
|
type SubscribeFunc func(consumer, topic string) Streamer
|
|
|
|
|
|
|
|
// Streamer is the interface that wraps message consumers. Error handling
// should be provided by the implementation. Feel free to panic.
type Streamer interface {
	// Messages returns the receive-only channel on which messages are
	// delivered as byte slices.
	Messages() <-chan []byte
	// Close shuts the consumer down; details are up to the implementation.
	Close()
}
|
|
|
|
|
|
|
|
// Publisher is the interface that wraps message publishers. Error handling
// should be provided by the implementation. Feel free to panic.
type Publisher interface {
	// Publish sends a single message given as a byte slice.
	Publish(msg []byte)
	// Close shuts the publisher down; details are up to the implementation.
	Close()
}
|
|
|
|
|
2015-10-27 00:42:00 +00:00
|
|
|
type ScalePlan struct {
|
2015-10-27 00:33:04 +00:00
|
|
|
Interval time.Duration
|
|
|
|
MinProcessedTasks uint32
|
|
|
|
LatencyThreshold time.Duration
|
|
|
|
TaskWaitThreshold time.Duration
|
|
|
|
AdjustmentStep uint32
|
|
|
|
}
|
|
|
|
|
2015-10-18 00:22:07 +00:00
|
|
|
type task struct {
|
|
|
|
daemon Daemon
|
|
|
|
actor Actor
|
|
|
|
createdAt time.Time
|
|
|
|
system bool
|
|
|
|
name string
|
|
|
|
}
|
|
|
|
|
2015-10-23 22:04:47 +00:00
|
|
|
// workerIndex is a package-wide counter used to number workers in log
// messages; it is incremented atomically each time a worker starts.
var workerIndex uint64
|
|
|
|
|
2015-10-14 01:11:29 +00:00
|
|
|
// Summon creates a new instance of Satan.
|
|
|
|
func Summon() *Satan {
|
|
|
|
return &Satan{
|
2015-10-27 00:42:00 +00:00
|
|
|
Logger: log.New(os.Stdout, "[daemons] ", log.LstdFlags),
|
|
|
|
MinNumWorkers: 10,
|
|
|
|
MaxNumWorkers: 1000,
|
|
|
|
queue: make(chan *task),
|
|
|
|
runtimeStats: stats.NewBasicStats(),
|
|
|
|
shutdownWorkers: make(chan struct{}),
|
|
|
|
shutdownSystem: make(chan struct{}),
|
2015-10-14 01:11:29 +00:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// AddDaemon adds a new daemon.
|
|
|
|
func (s *Satan) AddDaemon(d Daemon) {
|
2015-10-17 00:41:21 +00:00
|
|
|
base := d.base()
|
|
|
|
base.self = d
|
|
|
|
base.subscribeFunc = s.SubscribeFunc
|
|
|
|
base.publisher = s.Publisher
|
|
|
|
base.queue = s.queue
|
2015-10-24 16:25:16 +00:00
|
|
|
base.logger = s.Logger
|
2015-10-18 00:22:07 +00:00
|
|
|
base.shutdown = s.shutdownSystem
|
2015-10-14 01:11:29 +00:00
|
|
|
|
2015-10-17 00:41:21 +00:00
|
|
|
go d.Startup()
|
2015-10-14 01:11:29 +00:00
|
|
|
s.daemons = append(s.daemons, d)
|
|
|
|
}
|
|
|
|
|
|
|
|
// StartDaemons starts all registered daemons.
|
|
|
|
func (s *Satan) StartDaemons() {
|
2015-10-27 00:42:00 +00:00
|
|
|
s.addWorkers(s.MinNumWorkers)
|
2015-10-27 00:33:04 +00:00
|
|
|
|
2015-10-27 00:42:00 +00:00
|
|
|
if s.ScalePlan != nil {
|
2015-10-27 00:33:04 +00:00
|
|
|
go s.autoScale()
|
|
|
|
}
|
2015-10-14 01:11:29 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
// StopDaemons stops all running daemons.
|
|
|
|
func (s *Satan) StopDaemons() {
|
2015-10-18 00:22:07 +00:00
|
|
|
close(s.shutdownSystem)
|
2015-10-14 01:11:29 +00:00
|
|
|
for _, d := range s.daemons {
|
|
|
|
d.Shutdown()
|
|
|
|
}
|
2015-10-17 02:14:09 +00:00
|
|
|
|
2015-10-18 00:22:07 +00:00
|
|
|
s.wgSystem.Wait()
|
|
|
|
close(s.shutdownWorkers)
|
|
|
|
s.wgWorkers.Wait()
|
2015-10-15 23:07:04 +00:00
|
|
|
close(s.queue)
|
2015-10-26 23:54:00 +00:00
|
|
|
|
|
|
|
fmt.Println(s.runtimeStats.Fetch(stats.Latency))
|
|
|
|
fmt.Println(s.runtimeStats.Fetch(stats.TaskWait))
|
2015-10-14 01:11:29 +00:00
|
|
|
}
|
|
|
|
|
2015-10-27 00:33:04 +00:00
|
|
|
func (s *Satan) addWorkers(num uint32) {
|
|
|
|
for i := uint32(0); i < num; i++ {
|
2015-10-23 22:04:47 +00:00
|
|
|
go s.runWorker()
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-27 00:33:04 +00:00
|
|
|
func (s *Satan) stopWorkers(num uint32) {
|
|
|
|
for i := uint32(0); i < num; i++ {
|
2015-10-23 22:04:47 +00:00
|
|
|
s.shutdownWorkers <- struct{}{}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *Satan) runWorker() {
|
2015-10-18 00:22:07 +00:00
|
|
|
s.wgWorkers.Add(1)
|
|
|
|
defer s.wgWorkers.Done()
|
2015-10-23 22:04:47 +00:00
|
|
|
|
2015-10-27 00:42:00 +00:00
|
|
|
atomic.AddInt64(&s.numWorkers, 1)
|
|
|
|
defer atomic.AddInt64(&s.numWorkers, -1)
|
|
|
|
|
2015-10-23 22:04:47 +00:00
|
|
|
i := atomic.AddUint64(&workerIndex, 1)
|
2015-10-24 16:25:16 +00:00
|
|
|
s.Logger.Printf("Starting worker #%d", i)
|
2015-10-24 15:49:38 +00:00
|
|
|
|
|
|
|
defer func() {
|
|
|
|
if err := recover(); err != nil {
|
2015-10-24 16:25:16 +00:00
|
|
|
s.Logger.Printf("Worker #%d crashed. Error: %v\n", i, err)
|
2015-10-24 15:49:38 +00:00
|
|
|
debug.PrintStack()
|
|
|
|
go s.runWorker() // Restarting worker
|
|
|
|
}
|
|
|
|
}()
|
2015-10-14 01:11:29 +00:00
|
|
|
|
2015-10-15 23:28:40 +00:00
|
|
|
for {
|
2015-10-24 15:08:12 +00:00
|
|
|
start := time.Now()
|
2015-10-15 23:28:40 +00:00
|
|
|
select {
|
|
|
|
case t := <-s.queue:
|
2015-10-24 15:08:12 +00:00
|
|
|
dur := time.Now().UnixNano() - start.UnixNano()
|
2015-10-26 23:54:00 +00:00
|
|
|
s.runtimeStats.Add(stats.TaskWait, time.Duration(dur))
|
2015-10-18 00:22:07 +00:00
|
|
|
s.processTask(t)
|
2015-10-18 00:34:10 +00:00
|
|
|
case <-s.shutdownWorkers:
|
2015-10-24 16:25:16 +00:00
|
|
|
s.Logger.Printf("Worker #%d has stopped", i)
|
2015-10-18 00:34:10 +00:00
|
|
|
return
|
2015-10-15 23:28:40 +00:00
|
|
|
}
|
2015-10-14 01:11:29 +00:00
|
|
|
}
|
|
|
|
}
|
2015-10-18 00:22:07 +00:00
|
|
|
|
|
|
|
func (s *Satan) processTask(t *task) {
|
|
|
|
dur := time.Now().UnixNano() - t.createdAt.UnixNano()
|
2015-10-26 23:54:00 +00:00
|
|
|
s.runtimeStats.Add(stats.Latency, time.Duration(dur))
|
2015-10-18 00:22:07 +00:00
|
|
|
|
|
|
|
if t.system {
|
|
|
|
s.processSystemTask(t)
|
|
|
|
} else {
|
|
|
|
s.processGeneralTask(t)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *Satan) processSystemTask(t *task) {
|
|
|
|
s.wgSystem.Add(1)
|
|
|
|
defer s.wgSystem.Done()
|
|
|
|
defer func() {
|
|
|
|
if err := recover(); err != nil {
|
2015-10-24 16:25:16 +00:00
|
|
|
s.Logger.Printf("System task %s recovered from a panic\nError: %v\n", t, err)
|
2015-10-18 00:22:07 +00:00
|
|
|
debug.PrintStack()
|
2015-10-18 00:49:33 +00:00
|
|
|
|
|
|
|
t.createdAt = time.Now()
|
2015-10-18 00:22:07 +00:00
|
|
|
s.queue <- t // Restarting task
|
|
|
|
} else {
|
2015-10-24 16:25:16 +00:00
|
|
|
s.Logger.Printf("System task %s has stopped\n", t)
|
2015-10-18 00:22:07 +00:00
|
|
|
}
|
|
|
|
}()
|
|
|
|
|
2015-10-24 16:25:16 +00:00
|
|
|
s.Logger.Printf("Starting system task %s\n", t)
|
2015-10-24 00:07:47 +00:00
|
|
|
t.actor() // <--- ACTION STARTS HERE
|
2015-10-18 00:22:07 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
func (s *Satan) processGeneralTask(t *task) {
|
2015-10-23 23:40:20 +00:00
|
|
|
defer func() {
|
|
|
|
if err := recover(); err != nil {
|
2015-10-26 23:54:00 +00:00
|
|
|
if s.DaemonStats != nil {
|
|
|
|
s.DaemonStats.Error(t.daemon.base().String())
|
2015-10-23 23:41:19 +00:00
|
|
|
}
|
2015-10-23 23:40:20 +00:00
|
|
|
t.daemon.base().handlePanic(err)
|
2015-10-24 16:25:16 +00:00
|
|
|
s.Logger.Printf("Daemon %s recovered from a panic\nError: %v\n", t.daemon.base(), err)
|
2015-10-23 23:40:20 +00:00
|
|
|
debug.PrintStack()
|
|
|
|
}
|
|
|
|
}()
|
2015-10-26 23:54:00 +00:00
|
|
|
if s.DaemonStats != nil {
|
2015-10-23 23:41:19 +00:00
|
|
|
defer func(start time.Time) {
|
|
|
|
dur := time.Now().UnixNano() - start.UnixNano()
|
2015-10-26 23:54:00 +00:00
|
|
|
s.DaemonStats.Add(t.daemon.base().String(), time.Duration(dur))
|
2015-10-23 23:41:19 +00:00
|
|
|
}(time.Now())
|
|
|
|
}
|
2015-10-18 00:22:07 +00:00
|
|
|
|
2015-10-24 00:07:47 +00:00
|
|
|
t.actor() // <--- ACTION STARTS HERE
|
2015-10-18 00:22:07 +00:00
|
|
|
}
|
|
|
|
|
2015-10-27 00:33:04 +00:00
|
|
|
func (s *Satan) autoScale() {
|
2015-10-27 00:42:00 +00:00
|
|
|
t := time.NewTicker(s.ScalePlan.Interval)
|
2015-10-27 00:33:04 +00:00
|
|
|
defer t.Stop()
|
|
|
|
|
|
|
|
for {
|
|
|
|
select {
|
|
|
|
case <-t.C:
|
|
|
|
s.adjustNumWorkers()
|
|
|
|
case <-s.shutdownSystem:
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
func (s *Satan) adjustNumWorkers() {
|
|
|
|
lat := s.runtimeStats.Fetch(stats.Latency)
|
|
|
|
tw := s.runtimeStats.Fetch(stats.TaskWait)
|
2015-10-27 00:42:00 +00:00
|
|
|
if lat.Processed() < int64(s.ScalePlan.MinProcessedTasks) {
|
2015-10-27 00:33:04 +00:00
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2015-10-27 00:42:00 +00:00
|
|
|
if uint32(s.numWorkers)+s.ScalePlan.AdjustmentStep > s.MaxNumWorkers {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if lat.P95() > float64(s.ScalePlan.LatencyThreshold) {
|
|
|
|
s.addWorkers(s.ScalePlan.AdjustmentStep)
|
2015-10-27 00:33:04 +00:00
|
|
|
s.runtimeStats.Reset()
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2015-10-27 00:42:00 +00:00
|
|
|
if uint32(s.numWorkers)-s.ScalePlan.AdjustmentStep < s.MinNumWorkers {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
if tw.P95() > float64(s.ScalePlan.TaskWaitThreshold) {
|
|
|
|
s.stopWorkers(s.ScalePlan.AdjustmentStep)
|
2015-10-27 00:33:04 +00:00
|
|
|
s.runtimeStats.Reset()
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-10-18 00:22:07 +00:00
|
|
|
func (t *task) String() string {
|
|
|
|
if t.name == "" {
|
2015-10-23 23:42:58 +00:00
|
|
|
return fmt.Sprintf("[unnamed %s process]", t.daemon)
|
2015-10-18 00:22:07 +00:00
|
|
|
}
|
|
|
|
|
2015-10-23 23:42:58 +00:00
|
|
|
return fmt.Sprintf("%s[%s]", t.daemon, t.name)
|
2015-10-18 00:22:07 +00:00
|
|
|
}
|