diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 6265871a4af2..7335c3f3ce48 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -183,9 +183,65 @@ static struct attribute_group crash_note_cpu_attr_group = {
 };
 #endif
 
+#ifdef CONFIG_SCHED_WALT
+static ssize_t sched_load_boost_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	ssize_t rc;
+	int boost;
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	int cpuid = cpu->dev.id;
+
+	boost = per_cpu(sched_load_boost, cpuid);
+	rc = scnprintf(buf, PAGE_SIZE-2, "%d\n", boost);
+
+	return rc;
+}
+
+static ssize_t __ref sched_load_boost_store(struct device *dev,
+			struct device_attribute *attr,
+			const char *buf, size_t count)
+{
+	int err;
+	int boost;
+	struct cpu *cpu = container_of(dev, struct cpu, dev);
+	int cpuid = cpu->dev.id;
+
+	err = kstrtoint(strstrip((char *)buf), 0, &boost);
+	if (err)
+		return err;
+
+	/*
+	 * -100 is low enough to cancel out the CPU's load and make it near zero.
+	 * 1000 is close to the maximum value that cpu_util_freq_{walt,pelt}
+	 * can take without overflow.
+	 */
+	if (boost < -100 || boost > 1000)
+		return -EINVAL;
+
+	per_cpu(sched_load_boost, cpuid) = boost;
+
+	return count;
+}
+
+static DEVICE_ATTR_RW(sched_load_boost);
+
+static struct attribute *sched_cpu_attrs[] = {
+	&dev_attr_sched_load_boost.attr,
+	NULL
+};
+
+static struct attribute_group sched_cpu_attr_group = {
+	.attrs = sched_cpu_attrs,
+};
+#endif
+
 static const struct attribute_group *common_cpu_attr_groups[] = {
 #ifdef CONFIG_KEXEC
 	&crash_note_cpu_attr_group,
+#endif
+#ifdef CONFIG_SCHED_WALT
+	&sched_cpu_attr_group,
 #endif
 	NULL
 };
@@ -193,6 +249,9 @@ static const struct attribute_group *common_cpu_attr_groups[] = {
 static const struct attribute_group *hotplugable_cpu_attr_groups[] = {
 #ifdef CONFIG_KEXEC
 	&crash_note_cpu_attr_group,
+#endif
+#ifdef CONFIG_SCHED_WALT
+	&sched_cpu_attr_group,
 #endif
 	NULL
 };
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 7d13501910ee..ed38e70f475a 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -1459,6 +1459,56 @@ static const struct file_operations proc_pid_sched_operations = {
 
 #endif
 
+/*
+ * Print out various scheduling related per-task fields:
+ */
+
+#ifdef CONFIG_SCHED_WALT
+extern int __weak sched_wake_up_idle_show(struct seq_file *m, void *v);
+extern ssize_t __weak sched_wake_up_idle_write(struct file *file,
+		const char __user *buf, size_t count, loff_t *offset);
+extern int __weak sched_wake_up_idle_open(struct inode *inode,
+		struct file *filp);
+
+static const struct file_operations proc_pid_sched_wake_up_idle_operations = {
+	.open = sched_wake_up_idle_open,
+	.read = seq_read,
+	.write = sched_wake_up_idle_write,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+extern int __weak sched_init_task_load_show(struct seq_file *m, void *v);
+extern ssize_t __weak
+sched_init_task_load_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *offset);
+extern int __weak
+sched_init_task_load_open(struct inode *inode, struct file *filp);
+
+static const struct file_operations proc_pid_sched_init_task_load_operations = {
+	.open = sched_init_task_load_open,
+	.read = seq_read,
+	.write = sched_init_task_load_write,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
+extern int __weak sched_group_id_show(struct seq_file *m, void *v);
+extern ssize_t __weak
+sched_group_id_write(struct file *file, const char __user *buf,
+		size_t count, loff_t *offset);
+extern int __weak sched_group_id_open(struct inode *inode, struct file *filp);
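For reference, a minimal user-space sketch of how the new per-CPU attribute from the drivers/base/cpu.c hunk above might be exercised. This is illustrative only and not part of the patch; it assumes the attribute appears in the usual per-CPU sysfs device directory, and the accepted range [-100, 1000] mirrors the bounds checked in sched_load_boost_store().

	/* illustrative sketch, not patch code */
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	static int set_sched_load_boost(int cpu, int boost)
	{
		char path[64], val[16];
		int fd, len, ret = 0;

		snprintf(path, sizeof(path),
			 "/sys/devices/system/cpu/cpu%d/sched_load_boost", cpu);
		fd = open(path, O_WRONLY);
		if (fd < 0)
			return -1;

		/* values outside -100..1000 are rejected with -EINVAL */
		len = snprintf(val, sizeof(val), "%d\n", boost);
		if (write(fd, val, len) != len)
			ret = -1;
		close(fd);
		return ret;
	}

	int main(void)
	{
		/* e.g. report CPU0's load as 25% higher to the governor */
		return set_sched_load_boost(0, 25) ? 1 : 0;
	}

The patch continues in fs/proc/base.c below, wiring the weakly-declared show/open/write helpers into per-task proc files.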
+ +static const struct file_operations proc_pid_sched_group_id_operations = { + .open = sched_group_id_open, + .read = seq_read, + .write = sched_group_id_write, + .llseek = seq_lseek, + .release = single_release, +}; + +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -3011,6 +3061,13 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), +#ifdef CONFIG_SCHED_WALT + REG("sched_wake_up_idle", 00644, + proc_pid_sched_wake_up_idle_operations), + REG("sched_init_task_load", 00644, + proc_pid_sched_init_task_load_operations), + REG("sched_group_id", 00666, proc_pid_sched_group_id_operations), +#endif #ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), #endif diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 068793a619ca..f4c5aae47659 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -69,6 +69,9 @@ enum cpuhp_state { CPUHP_SLAB_PREPARE, CPUHP_MD_RAID5_PREPARE, CPUHP_RCUTREE_PREP, +#ifdef CONFIG_SCHED_WALT + CPUHP_CORE_CTL_ISOLATION_DEAD, +#endif CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWERPC_PMAC_PREPARE, CPUHP_POWERPC_MMU_CTX_PREPARE, diff --git a/include/linux/sched.h b/include/linux/sched.h index 2f35afc13de3..27d14012d30b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -117,6 +117,18 @@ struct task_group; (task->flags & PF_FROZEN) == 0 && \ (task->state & TASK_NOLOAD) == 0) +/* + * Enum for display driver to provide varying refresh rates + */ +enum fps { + FPS0 = 0, + FPS30 = 30, + FPS48 = 48, + FPS60 = 60, + FPS90 = 90, + FPS120 = 120, +}; + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP /* @@ -212,6 +224,21 @@ struct task_group; /* Task command name length: */ #define TASK_COMM_LEN 16 +enum task_event { + PUT_PREV_TASK = 0, + PICK_NEXT_TASK = 1, + TASK_WAKE = 2, + TASK_MIGRATE = 3, + TASK_UPDATE = 4, + IRQ_UPDATE = 5, +}; + +/* Note: this need to be in sync with migrate_type_names array */ +enum migrate_types { + GROUP_TO_RQ, + RQ_TO_GROUP, +}; + extern void scheduler_tick(void); #define MAX_SCHEDULE_TIMEOUT LONG_MAX @@ -478,6 +505,89 @@ struct sched_entity { #endif }; +struct cpu_cycle_counter_cb { + u64 (*get_cpu_cycle_counter)(int cpu); +}; + +DECLARE_PER_CPU_READ_MOSTLY(int, sched_load_boost); + +#ifdef CONFIG_SCHED_WALT +extern void __weak sched_exit(struct task_struct *p); +extern int __weak +register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb); +extern void __weak +sched_update_cpu_freq_min_max(const cpumask_t *cpus, u32 fmin, u32 fmax); +extern void __weak free_task_load_ptrs(struct task_struct *p); +extern void __weak sched_set_refresh_rate(enum fps fps); + +#define RAVG_HIST_SIZE_MAX 5 +#define NUM_BUSY_BUCKETS 10 + +/* ravg represents frequency scaled cpu-demand of tasks */ +struct ravg { + /* + * 'mark_start' marks the beginning of an event (task waking up, task + * starting to execute, task being preempted) within a window + * + * 'sum' represents how runnable a task has been within current + * window. It incorporates both running time and wait time and is + * frequency scaled. + * + * 'sum_history' keeps track of history of 'sum' seen over previous + * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are + * ignored. + * + * 'demand' represents maximum sum seen over previous + * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency + * demand for tasks. 
+ * + * 'curr_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the current window + * + * 'prev_window_cpu' represents task's contribution to cpu busy time on + * various CPUs in the previous window + * + * 'curr_window' represents the sum of all entries in curr_window_cpu + * + * 'prev_window' represents the sum of all entries in prev_window_cpu + * + * 'pred_demand' represents task's current predicted cpu busy time + * + * 'busy_buckets' groups historical busy time into different buckets + * used for prediction + * + * 'demand_scaled' represents task's demand scaled to 1024 + */ + u64 mark_start; + u32 sum, demand; + u32 coloc_demand; + u32 sum_history[RAVG_HIST_SIZE_MAX]; + u32 *curr_window_cpu, *prev_window_cpu; + u32 curr_window, prev_window; + u32 pred_demand; + u8 busy_buckets[NUM_BUSY_BUCKETS]; + u16 demand_scaled; + u16 pred_demand_scaled; + u64 active_time; + u64 last_win_size; +}; +#else +static inline void sched_exit(struct task_struct *p) { } + +static inline int +register_cpu_cycle_counter_cb(struct cpu_cycle_counter_cb *cb) +{ + return 0; +} + +static inline void free_task_load_ptrs(struct task_struct *p) { } + +static inline void sched_update_cpu_freq_min_max(const cpumask_t *cpus, + u32 fmin, u32 fmax) { } + +static inline void sched_set_refresh_rate(enum fps fps) { } +#endif /* CONFIG_SCHED_WALT */ + struct sched_rt_entity { struct list_head run_list; unsigned long timeout; @@ -675,6 +785,20 @@ struct task_struct { const struct sched_class *sched_class; struct sched_entity se; struct sched_rt_entity rt; +#ifdef CONFIG_SCHED_WALT + u64 last_sleep_ts; + bool wake_up_idle; + struct ravg ravg; + u32 init_load_pct; + u64 last_wake_ts; + u64 last_enqueued_ts; + struct related_thread_group *grp; + struct list_head grp_list; + u64 cpu_cycles; + bool misfit; + u8 unfilter; +#endif + #ifdef CONFIG_CGROUP_SCHED struct task_group *sched_task_group; #endif @@ -2000,4 +2124,37 @@ int sched_trace_rq_cpu(struct rq *rq); const struct cpumask *sched_trace_rd_span(struct root_domain *rd); +#ifdef CONFIG_SCHED_WALT +#define PF_WAKE_UP_IDLE 1 +static inline u32 sched_get_wake_up_idle(struct task_struct *p) +{ + return p->wake_up_idle; +} + +static inline int sched_set_wake_up_idle(struct task_struct *p, + int wake_up_idle) +{ + p->wake_up_idle = !!wake_up_idle; + return 0; +} + +static inline void set_wake_up_idle(bool enabled) +{ + current->wake_up_idle = enabled; +} +#else +static inline u32 sched_get_wake_up_idle(struct task_struct *p) +{ + return 0; +} + +static inline int sched_set_wake_up_idle(struct task_struct *p, + int wake_up_idle) +{ + return 0; +} + +static inline void set_wake_up_idle(bool enabled) {} +#endif + #endif diff --git a/include/linux/sched/core_ctl.h b/include/linux/sched/core_ctl.h new file mode 100644 index 000000000000..20ba464c1170 --- /dev/null +++ b/include/linux/sched/core_ctl.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 2016, 2019, The Linux Foundation. All rights reserved. 
+ */ + +#ifndef __CORE_CTL_H +#define __CORE_CTL_H + +#define MAX_CPUS_PER_CLUSTER 6 +#define MAX_CLUSTERS 3 + +struct core_ctl_notif_data { + unsigned int nr_big; + unsigned int coloc_load_pct; + unsigned int ta_util_pct[MAX_CLUSTERS]; + unsigned int cur_cap_pct[MAX_CLUSTERS]; +}; + +#ifdef CONFIG_SCHED_WALT +extern int __weak core_ctl_set_boost(bool boost); +extern void __weak core_ctl_notifier_register(struct notifier_block *n); +extern void __weak core_ctl_notifier_unregister(struct notifier_block *n); +#else +static inline int core_ctl_set_boost(bool boost) +{ + return 0; +} +static inline void core_ctl_notifier_register(struct notifier_block *n) {} +static inline void core_ctl_notifier_unregister(struct notifier_block *n) {} +#endif +#endif diff --git a/include/linux/sched/cpufreq.h b/include/linux/sched/cpufreq.h index afa940cd50dc..145c4bbb2ff8 100644 --- a/include/linux/sched/cpufreq.h +++ b/include/linux/sched/cpufreq.h @@ -10,6 +10,11 @@ #define SCHED_CPUFREQ_IOWAIT (1U << 0) #define SCHED_CPUFREQ_MIGRATION (1U << 1) +#define SCHED_CPUFREQ_INTERCLUSTER_MIG (1U << 3) +#define SCHED_CPUFREQ_WALT (1U << 4) +#define SCHED_CPUFREQ_PL (1U << 5) +#define SCHED_CPUFREQ_EARLY_DET (1U << 6) +#define SCHED_CPUFREQ_CONTINUE (1U << 8) #ifdef CONFIG_CPU_FREQ struct update_util_data { diff --git a/include/linux/sched/stat.h b/include/linux/sched/stat.h index 568286411b43..1eed4a2526a5 100644 --- a/include/linux/sched/stat.h +++ b/include/linux/sched/stat.h @@ -21,6 +21,28 @@ extern bool single_task_running(void); extern unsigned long nr_iowait(void); extern unsigned long nr_iowait_cpu(int cpu); +#ifdef CONFIG_SCHED_WALT +extern void __weak sched_update_nr_prod(int cpu, long delta, bool inc); +extern unsigned int __weak sched_get_cpu_util(int cpu); +extern void __weak sched_update_hyst_times(void); +extern u64 __weak sched_lpm_disallowed_time(int cpu); +#else +static inline void sched_update_nr_prod(int cpu, long delta, bool inc) {} +static inline unsigned int sched_get_cpu_util(int cpu) +{ + return 0; +} +static inline u64 sched_get_cpu_last_busy_time(int cpu) +{ + return 0; +} +static inline void sched_update_hyst_times(void) {} +static inline u64 sched_lpm_disallowed_time(int cpu) +{ + return 0; +} +#endif + static inline int sched_info_on(void) { #ifdef CONFIG_SCHEDSTATS diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index f9fb44edf4c7..05bc4afc548d 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -24,6 +24,42 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +#ifdef CONFIG_SCHED_WALT +extern unsigned int __weak sysctl_sched_user_hint; +extern const int __weak sched_user_hint_max; +extern unsigned int __weak sysctl_sched_cpu_high_irqload; +extern unsigned int __weak sysctl_sched_boost; +extern unsigned int __weak sysctl_sched_group_upmigrate_pct; +extern unsigned int __weak sysctl_sched_group_downmigrate_pct; +extern unsigned int __weak sysctl_sched_conservative_pl; +extern unsigned int __weak sysctl_sched_walt_rotate_big_tasks; +extern unsigned int __weak sysctl_sched_min_task_util_for_boost; +extern unsigned int __weak sysctl_sched_min_task_util_for_colocation; +extern unsigned int __weak sysctl_sched_asym_cap_sibling_freq_match_pct; +extern unsigned int __weak sysctl_sched_coloc_downmigrate_ns; +extern unsigned int __weak sysctl_sched_task_unfilter_nr_windows; +extern unsigned int 
__weak sysctl_sched_busy_hyst_enable_cpus; +extern unsigned int __weak sysctl_sched_busy_hyst; +extern unsigned int __weak sysctl_sched_coloc_busy_hyst_enable_cpus; +extern unsigned int __weak sysctl_sched_coloc_busy_hyst; +extern unsigned int __weak sysctl_sched_coloc_busy_hyst_max_ms; +extern unsigned int __weak sysctl_sched_window_stats_policy; +extern unsigned int __weak sysctl_sched_ravg_window_nr_ticks; + +extern int __weak +walt_proc_group_thresholds_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +extern int __weak +walt_proc_user_hint_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); + +extern int __weak +sched_ravg_window_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); +#endif enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, @@ -47,6 +83,10 @@ int sched_proc_update_handler(struct ctl_table *table, int write, loff_t *ppos); #endif +#ifdef CONFIG_SCHED_WALT +extern int __weak sched_boost_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif /* * control realtime throttling: * diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 6df477329b76..39d10afbdce6 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -73,6 +73,9 @@ extern int proc_do_large_bitmap(struct ctl_table *, int, extern int proc_do_static_key(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +extern int proc_douintvec_ravg_window(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos); /* * Register a set of sysctl names by calling register_sysctl_table diff --git a/init/Kconfig b/init/Kconfig index e381021cc2a5..ec2127b69683 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -470,6 +470,15 @@ config HAVE_SCHED_AVG_IRQ depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING depends on SMP +config SCHED_WALT + bool "Support window based load tracking" + depends on SMP + help + This feature will allow the scheduler to maintain a tunable window + based set of metrics for tasks and runqueues. These metrics can be + used to guide task placement as well as task frequency requirements + for cpufreq governors. + config BSD_PROCESS_ACCT bool "BSD Process Accounting" depends on MULTIUSER diff --git a/init/init_task.c b/init/init_task.c index 9e5cbe5eab7b..af4dc7dcf246 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -91,6 +91,9 @@ struct task_struct init_task #endif #ifdef CONFIG_CGROUP_SCHED .sched_task_group = &root_task_group, +#endif +#ifdef CONFIG_SCHED_WALT + .wake_up_idle = false, #endif .ptraced = LIST_HEAD_INIT(init_task.ptraced), .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), diff --git a/kernel/exit.c b/kernel/exit.c index a46a50d67002..b26826cdd597 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -761,6 +761,7 @@ void __noreturn do_exit(long code) } exit_signals(tsk); /* sets PF_EXITING */ + sched_exit(tsk); /* * Ensure that all new tsk->pi_lock acquisitions must observe * PF_EXITING. Serializes against futex.c:attach_to_pi_owner(). 
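To make the SCHED_WALT Kconfig help and the struct ravg comments above more concrete, here is a toy sketch of window-based demand tracking: at every window rollover, the busy time of the window that just completed is pushed into a short history, and the maximum over that history is published as the task's demand (the ravg comment describes exactly this "maximum over recent windows" notion; sysctl_sched_window_stats_policy suggests the policy is tunable). This is an illustration only, not the WALT implementation; all toy_* names are invented here.

	/* illustrative sketch, not patch code */
	#include <stdint.h>

	#define TOY_HIST_SIZE 5	/* mirrors RAVG_HIST_SIZE_MAX in the patch */

	struct toy_ravg {
		uint32_t sum_history[TOY_HIST_SIZE];
		uint32_t demand;
	};

	static void toy_window_rollover(struct toy_ravg *r, uint32_t completed_sum)
	{
		uint32_t hi = 0;
		int i;

		/* shift the history and insert the just-completed window's busy time */
		for (i = TOY_HIST_SIZE - 1; i > 0; i--)
			r->sum_history[i] = r->sum_history[i - 1];
		r->sum_history[0] = completed_sum;

		/* "maximum" policy: demand is the largest busy time in the history */
		for (i = 0; i < TOY_HIST_SIZE; i++)
			if (r->sum_history[i] > hi)
				hi = r->sum_history[i];
		r->demand = hi;
	}

Frequency guidance then follows the windowed demand rather than instantaneous utilization, which is what the Kconfig text means by "task frequency requirements for cpufreq governors".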
diff --git a/kernel/fork.c b/kernel/fork.c index 1a38712da4cd..e549047d0990 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2283,6 +2283,7 @@ bad_fork_cleanup_perf: perf_event_free_task(p); bad_fork_cleanup_policy: lockdep_free_task(p); + free_task_load_ptrs(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d2b880f2ff09..7a64fe10e73f 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -19,6 +19,7 @@ #include "../smpboot.h" #include "pelt.h" +#include "walt.h" #define CREATE_TRACE_POINTS #include @@ -1298,6 +1299,7 @@ static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) uclamp_rq_inc(rq, p); p->sched_class->enqueue_task(rq, p, flags); + walt_update_last_enqueue(p); } static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1312,6 +1314,10 @@ static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags) uclamp_rq_dec(rq, p); p->sched_class->dequeue_task(rq, p, flags); +#ifdef CONFIG_SCHED_WALT + if (p == rq->ed_task) + early_detection_notify(rq, sched_ktime_clock()); +#endif } void activate_task(struct rq *rq, struct task_struct *p, int flags) @@ -1331,6 +1337,11 @@ void deactivate_task(struct rq *rq, struct task_struct *p, int flags) if (task_contributes_to_load(p)) rq->nr_uninterruptible++; +#ifdef CONFIG_SCHED_WALT + if (flags & DEQUEUE_SLEEP) + clear_ed_task(p, rq); +#endif + dequeue_task(rq, p, flags); } @@ -1492,8 +1503,11 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); dequeue_task(rq, p, DEQUEUE_NOCLOCK); + double_lock_balance(rq, cpu_rq(new_cpu)); + if (!(rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(rq); set_task_cpu(p, new_cpu); - rq_unlock(rq, rf); + double_rq_unlock(cpu_rq(new_cpu), rq); rq = cpu_rq(new_cpu); @@ -1750,12 +1764,13 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.nr_migrations++; rseq_migrate(p); perf_event_task_migrate(p); + + fixup_busy_time(p, new_cpu); } __set_task_cpu(p, new_cpu); } -#ifdef CONFIG_NUMA_BALANCING static void __migrate_swap_task(struct task_struct *p, int cpu) { if (task_on_rq_queued(p)) { @@ -1870,7 +1885,6 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, out: return ret; } -#endif /* CONFIG_NUMA_BALANCING */ /* * wait_task_inactive - wait for a thread to unschedule. 
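The move_queued_task() hunk above replaces the plain rq_unlock() sequence with double_lock_balance()/double_rq_unlock() around set_task_cpu(), presumably so that fixup_busy_time() can transfer the task's WALT window contributions from the source to the destination runqueue while both are locked. Below is a minimal sketch of the lock-ordering idea those helpers rely on: always take the two runqueue locks in one global order so concurrent migrations in opposite directions cannot ABBA-deadlock. The toy_* name is invented here; the real helpers are double_rq_lock()/double_lock_balance() in kernel/sched/.

	/* illustrative sketch, not patch code */
	#include <linux/spinlock.h>
	#include <linux/lockdep.h>

	static void toy_double_lock(raw_spinlock_t *a, raw_spinlock_t *b)
	{
		if (a == b) {
			raw_spin_lock(a);
			return;
		}
		/* impose a global order (here: by address) so the lower lock is always taken first */
		if (a < b) {
			raw_spin_lock(a);
			raw_spin_lock_nested(b, SINGLE_DEPTH_NESTING);
		} else {
			raw_spin_lock(b);
			raw_spin_lock_nested(a, SINGLE_DEPTH_NESTING);
		}
	}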
@@ -2616,6 +2630,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ smp_cond_load_acquire(&p->on_cpu, !VAL); + walt_try_to_wake_up(p); + p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -2644,6 +2660,17 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unlock: raw_spin_unlock_irqrestore(&p->pi_lock, flags); out: +#ifdef CONFIG_SCHED_WALT + if (success && sched_predl) { + raw_spin_lock_irqsave(&cpu_rq(cpu)->lock, flags); + if (do_pl_notif(cpu_rq(cpu))) + cpufreq_update_util(cpu_rq(cpu), + SCHED_CPUFREQ_WALT | + SCHED_CPUFREQ_PL); + raw_spin_unlock_irqrestore(&cpu_rq(cpu)->lock, flags); + } +#endif + if (success) ttwu_stat(p, cpu, wake_flags); preempt_enable(); @@ -2689,6 +2716,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.prev_sum_exec_runtime = 0; p->se.nr_migrations = 0; p->se.vruntime = 0; +#ifdef CONFIG_SCHED_WALT + p->last_sleep_ts = 0; + p->wake_up_idle = false; +#endif INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -2840,6 +2871,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) { unsigned long flags; + init_new_task_load(p); __sched_fork(clone_flags, p); /* * We mark the process as NEW here. This guarantees that @@ -2945,7 +2977,9 @@ void wake_up_new_task(struct task_struct *p) struct rq_flags rf; struct rq *rq; + add_new_task_to_grp(p); raw_spin_lock_irqsave(&p->pi_lock, rf.flags); + p->state = TASK_RUNNING; #ifdef CONFIG_SMP /* @@ -2963,6 +2997,7 @@ void wake_up_new_task(struct task_struct *p) update_rq_clock(rq); post_init_entity_util_avg(p); + mark_task_starting(p); activate_task(rq, p, ENQUEUE_NOCLOCK); trace_sched_wakeup_new(p); check_preempt_curr(rq, p, WF_FORK); @@ -3497,6 +3532,9 @@ void sched_exec(void) unsigned long flags; int dest_cpu; + if (sched_energy_enabled()) + return; + raw_spin_lock_irqsave(&p->pi_lock, flags); dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0); if (dest_cpu == smp_processor_id()) @@ -3592,16 +3630,30 @@ void scheduler_tick(void) struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; struct rq_flags rf; + u64 wallclock; + bool early_notif; + u32 old_load; + struct related_thread_group *grp; + unsigned int flag = 0; sched_clock_tick(); rq_lock(rq, &rf); + old_load = task_load(curr); + set_window_start(rq); + wallclock = sched_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); update_rq_clock(rq); curr->sched_class->task_tick(rq, curr, 0); calc_global_load_tick(rq); psi_task_tick(rq); + early_notif = early_detection_notify(rq, wallclock); + if (early_notif) + flag = SCHED_CPUFREQ_WALT | SCHED_CPUFREQ_EARLY_DET; + + cpufreq_update_util(rq, flag); rq_unlock(rq, &rf); perf_event_task_tick(); @@ -3610,6 +3662,15 @@ void scheduler_tick(void) rq->idle_balance = idle_cpu(cpu); trigger_load_balance(rq); #endif + + rcu_read_lock(); + grp = task_related_thread_group(curr); + if (update_preferred_cluster(grp, curr, old_load, true)) + set_preferred_cluster(grp); + rcu_read_unlock(); + + if (curr->sched_class == &fair_sched_class) + check_for_migration(rq, curr); } #ifdef CONFIG_NO_HZ_FULL @@ -4005,6 +4066,7 @@ static void __sched notrace __schedule(bool preempt) struct rq_flags rf; struct rq *rq; int cpu; + u64 wallclock; cpu = smp_processor_id(); rq = cpu_rq(cpu); @@ -4052,7 +4114,15 @@ static void __sched notrace __schedule(bool preempt) clear_tsk_need_resched(prev); clear_preempt_need_resched(); + wallclock = 
sched_ktime_clock(); if (likely(prev != next)) { +#ifdef CONFIG_SCHED_WALT + if (!prev->on_rq) + prev->last_sleep_ts = wallclock; +#endif + + walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0); + walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0); rq->nr_switches++; /* * RCU users of rcu_dereference(rq->curr) may not see @@ -4080,6 +4150,7 @@ static void __sched notrace __schedule(bool preempt) /* Also unlocks the rq: */ rq = context_switch(rq, prev, next, &rf); } else { + walt_update_task_ravg(prev, rq, TASK_UPDATE, wallclock, 0); rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); rq_unlock_irq(rq, &rf); } @@ -4669,7 +4740,7 @@ struct task_struct *idle_task(int cpu) * * The task of @pid, if found. %NULL otherwise. */ -static struct task_struct *find_process_by_pid(pid_t pid) +struct task_struct *find_process_by_pid(pid_t pid) { return pid ? find_task_by_vpid(pid) : current; } @@ -6253,7 +6324,7 @@ void idle_task_exit(void) * * Also see the comment "Global load-average calculations". */ -static void calc_load_migrate(struct rq *rq) +void calc_load_migrate(struct rq *rq) { long delta = calc_load_fold_active(rq, 1); if (delta) @@ -6285,7 +6356,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) * there's no concurrency possible, we hold the required locks anyway * because of lock validation efforts. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) { struct rq *rq = dead_rq; struct task_struct *next, *stop = rq->stop; @@ -6512,6 +6583,11 @@ int sched_cpu_deactivate(unsigned int cpu) static void sched_rq_cpu_starting(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + raw_spin_lock_irqsave(&rq->lock, flags); + set_window_start(rq); + raw_spin_unlock_irqrestore(&rq->lock, flags); rq->calc_load_update = calc_load_update; update_max_interval(); @@ -6521,6 +6597,7 @@ int sched_cpu_starting(unsigned int cpu) { sched_rq_cpu_starting(cpu); sched_tick_start(cpu); + clear_walt_request(cpu); return 0; } @@ -6535,6 +6612,7 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); + if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); set_rq_offline(rq); @@ -6543,6 +6621,8 @@ int sched_cpu_dying(unsigned int cpu) BUG_ON(rq->nr_running != 1); rq_unlock_irqrestore(rq, &rf); + clear_walt_request(cpu); + calc_load_migrate(rq); update_max_interval(); nohz_balance_exit_idle(rq); @@ -6564,6 +6644,8 @@ void __init sched_init_smp(void) sched_init_domains(cpu_active_mask); mutex_unlock(&sched_domains_mutex); + update_cluster_topology(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) BUG(); @@ -6618,6 +6700,8 @@ void __init sched_init(void) wait_bit_init(); + init_clusters(); + #ifdef CONFIG_FAIR_GROUP_SCHED ptr += 2 * nr_cpu_ids * sizeof(void **); #endif @@ -6729,6 +6813,7 @@ void __init sched_init(void) rq->idle_stamp = 0; rq->avg_idle = 2*sysctl_sched_migration_cost; rq->max_idle_balance_cost = sysctl_sched_migration_cost; + walt_sched_init_rq(rq); INIT_LIST_HEAD(&rq->cfs_tasks); @@ -6743,6 +6828,8 @@ void __init sched_init(void) atomic_set(&rq->nr_iowait, 0); } + BUG_ON(alloc_related_thread_groups()); + set_load_weight(&init_task, false); /* @@ -6758,6 +6845,7 @@ void __init sched_init(void) * when this runqueue becomes "idle". 
*/ init_idle(current, smp_processor_id()); + init_new_task_load(current); calc_load_update = jiffies + LOAD_FREQ; @@ -6972,6 +7060,97 @@ void ia64_set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); +#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_UCLAMP_TASK_GROUP) +static inline void walt_init_sched_boost(struct task_group *tg) +{ + tg->sched_boost_no_override = false; + tg->sched_boost_enabled = true; + tg->colocate = false; + tg->colocate_update_disabled = false; +} + +void update_cgroup_boost_settings(void) +{ + struct task_group *tg; + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) { + if (tg->sched_boost_no_override) + continue; + + tg->sched_boost_enabled = false; + } + rcu_read_unlock(); +} + +void restore_cgroup_boost_settings(void) +{ + struct task_group *tg; + + rcu_read_lock(); + list_for_each_entry_rcu(tg, &task_groups, list) + tg->sched_boost_enabled = true; + rcu_read_unlock(); +} + +static void walt_schedgp_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *css; + bool colocate; + + cgroup_taskset_first(tset, &css); + tg = css_tg(css); + + colocate = tg->colocate; + + cgroup_taskset_for_each(task, css, tset) + sync_cgroup_colocation(task, colocate); +} + +static u64 +sched_boost_override_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return (u64) tg->sched_boost_no_override; +} + +static int sched_boost_override_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 override) +{ + struct task_group *tg = css_tg(css); + + tg->sched_boost_no_override = !!override; + return 0; +} + +static u64 sched_colocate_read(struct cgroup_subsys_state *css, + struct cftype *cft) +{ + struct task_group *tg = css_tg(css); + + return (u64) tg->colocate; +} + +static int sched_colocate_write(struct cgroup_subsys_state *css, + struct cftype *cft, u64 colocate) +{ + struct task_group *tg = css_tg(css); + + if (tg->colocate_update_disabled) + return -EPERM; + + tg->colocate = !!colocate; + tg->colocate_update_disabled = true; + return 0; +} +#else +static inline void walt_init_sched_boost(struct task_group *tg) { } +static void walt_schedgp_attach(struct cgroup_taskset *tset) { } +#endif /* CONFIG_SCHED_WALT */ + static inline void alloc_uclamp_sched_group(struct task_group *tg, struct task_group *parent) { @@ -7139,6 +7318,7 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + walt_init_sched_boost(tg); return &tg->css; } @@ -7225,6 +7405,8 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) cgroup_taskset_for_each(task, css, tset) sched_move_task(task); + + walt_schedgp_attach(tset); } #ifdef CONFIG_UCLAMP_TASK_GROUP @@ -7784,7 +7966,21 @@ static struct cftype cpu_legacy_files[] = { .read_u64 = cpu_uclamp_ls_read_u64, .write_u64 = cpu_uclamp_ls_write_u64, }, -#endif +#ifdef CONFIG_SCHED_WALT + { + .name = "uclamp.sched_boost_no_override", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = sched_boost_override_read, + .write_u64 = sched_boost_override_write, + }, + { + .name = "uclamp.colocate", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif /* CONFIG_SCHED_WALT */ +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* Terminate */ }; @@ -7971,7 +8167,21 @@ static struct cftype cpu_files[] = { .read_u64 = cpu_uclamp_ls_read_u64, .write_u64 
= cpu_uclamp_ls_write_u64, }, -#endif +#ifdef CONFIG_SCHED_WALT + { + .name = "uclamp.sched_boost_no_override", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = sched_boost_override_read, + .write_u64 = sched_boost_override_write, + }, + { + .name = "uclamp.colocate", + .flags = CFTYPE_NOT_ON_ROOT, + .read_u64 = sched_colocate_read, + .write_u64 = sched_colocate_write, + }, +#endif /* CONFIG_SCHED_WALT */ +#endif /* CONFIG_UCLAMP_TASK_GROUP */ { } /* terminate */ }; @@ -8040,3 +8250,57 @@ const u32 sched_prio_to_wmult[40] = { }; #undef CREATE_TRACE_POINTS + +__read_mostly bool sched_predl = 1; + +void enqueue_task_core(struct rq *rq, struct task_struct *p, int flags) +{ + enqueue_task(rq, p, 0); +} + +void dequeue_task_core(struct rq *rq, struct task_struct *p, int flags) +{ + dequeue_task(rq, p, 0); +} + +#ifdef CONFIG_SCHED_WALT +void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags, nr_windows; + u64 cur_jiffies_ts; + + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * cputime (wallclock) uses sched_clock so use the same here for + * consistency. + */ + delta += sched_clock() - wallclock; + cur_jiffies_ts = get_jiffies_64(); + + if (is_idle_task(curr)) + walt_update_task_ravg(curr, rq, IRQ_UPDATE, sched_ktime_clock(), + delta); + + nr_windows = cur_jiffies_ts - rq->irqload_ts; + + if (nr_windows) { + if (nr_windows < 10) { + /* Decay CPU's irqload by 3/4 for each window. */ + rq->avg_irqload *= (3 * nr_windows); + rq->avg_irqload = div64_u64(rq->avg_irqload, + 4 * nr_windows); + } else { + rq->avg_irqload = 0; + } + rq->avg_irqload += rq->cur_irqload; + rq->cur_irqload = 0; + } + + rq->cur_irqload += delta; + rq->irqload_ts = cur_jiffies_ts; + raw_spin_unlock_irqrestore(&rq->lock, flags); +} +#endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index fc49c1e169d9..c42e77415e4f 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -4,6 +4,7 @@ */ #include #include "sched.h" +#include "walt.h" #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -53,11 +54,18 @@ void irqtime_account_irq(struct task_struct *curr) struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); s64 delta; int cpu; +#ifdef CONFIG_SCHED_WALT + u64 wallclock; + bool account = true; +#endif if (!sched_clock_irqtime) return; cpu = smp_processor_id(); +#ifdef CONFIG_SCHED_WALT + wallclock = sched_clock_cpu(cpu); +#endif delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; @@ -71,6 +79,15 @@ void irqtime_account_irq(struct task_struct *curr) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); +#ifdef CONFIG_SCHED_WALT + else + account = false; + + if (account) + sched_account_irqtime(cpu, curr, delta, wallclock); + else if (curr != this_cpu_ksoftirqd()) + sched_account_irqstart(cpu, curr, wallclock); +#endif } EXPORT_SYMBOL_GPL(irqtime_account_irq); diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index a8a08030a8f7..0947361dc7d0 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -17,6 +17,7 @@ */ #include "sched.h" #include "pelt.h" +#include "walt.h" struct dl_bandwidth def_dl_bandwidth; @@ -1380,6 +1381,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), 
dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -1394,6 +1396,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); @@ -2101,7 +2104,9 @@ retry: } deactivate_task(rq, next_task, 0); + next_task->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(next_task, later_rq->cpu); + next_task->on_rq = TASK_ON_RQ_QUEUED; /* * Update the later_rq clock here, because the clock is used @@ -2195,7 +2200,9 @@ static void pull_dl_task(struct rq *this_rq) resched = true; deactivate_task(src_rq, p, 0); + p->on_rq = TASK_ON_RQ_MIGRATING; set_task_cpu(p, this_cpu); + p->on_rq = TASK_ON_RQ_QUEUED; activate_task(this_rq, p, 0); dmin = p->dl.deadline; @@ -2458,6 +2465,9 @@ const struct sched_class dl_sched_class = { .switched_to = switched_to_dl, .update_curr = update_curr_dl, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; int sched_dl_global_validate(void) diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index f7e4579e746c..d7959f96820f 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -646,6 +646,19 @@ do { \ SEQ_printf(m, " .%-30s: %ld\n", "curr->pid", (long)(task_pid_nr(rq->curr))); PN(clock); PN(clock_task); +#ifdef CONFIG_SMP + P(cpu_capacity); +#endif +#ifdef CONFIG_SCHED_WALT + P(cluster->max_possible_capacity); + P(cluster->efficiency); + P(cluster->cur_freq); + P(cluster->max_freq); + P(cluster->exec_scale_factor); + P(walt_stats.nr_big_tasks); + SEQ_printf(m, " .%-30s: %llu\n", "walt_stats.cumulative_runnable_avg", + rq->walt_stats.cumulative_runnable_avg_scaled); +#endif #undef P #undef PN @@ -724,6 +737,11 @@ static void sched_debug_header(struct seq_file *m) PN(sysctl_sched_wakeup_granularity); P(sysctl_sched_child_runs_first); P(sysctl_sched_features); +#ifdef CONFIG_SCHED_WALT + P(sched_init_task_load_windows); + P(sched_ravg_window); + P(sched_load_granule); +#endif #undef PN #undef P @@ -915,6 +933,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, P_SCHEDSTAT(se.statistics.nr_wakeups_passive); P_SCHEDSTAT(se.statistics.nr_wakeups_idle); +#ifdef CONFIG_SCHED_WALT + P(ravg.demand); +#endif avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index dc342f8094c2..8201f71bf301 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -24,6 +24,12 @@ #include +#include "walt.h" + +#ifdef CONFIG_SMP +static inline bool task_fits_max(struct task_struct *p, int cpu); +#endif /* CONFIG_SMP */ + /* * Targeted preemption latency for CPU-bound tasks: * @@ -85,6 +91,7 @@ unsigned int sysctl_sched_wakeup_granularity = 1000000UL; static unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; +DEFINE_PER_CPU_READ_MOSTLY(int, sched_load_boost); #ifdef CONFIG_SMP /* @@ -118,6 +125,8 @@ int __weak arch_asym_cpu_priority(int cpu) unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; #endif +unsigned int sched_small_task_threshold = 102; + static inline void update_load_add(struct load_weight *lw, unsigned long inc) { lw->weight += inc; @@ -3689,11 +3698,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return 
cfs_rq->avg.load_avg; } -static inline unsigned long task_util(struct task_struct *p) -{ - return READ_ONCE(p->se.avg.util_avg); -} - static inline unsigned long _task_util_est(struct task_struct *p) { struct util_est ue = READ_ONCE(p->se.avg.util_est); @@ -3703,6 +3707,9 @@ static inline unsigned long _task_util_est(struct task_struct *p) static inline unsigned long task_util_est(struct task_struct *p) { +#ifdef CONFIG_SCHED_WALT + return p->ravg.demand_scaled; +#endif return max(task_util(p), _task_util_est(p)); } @@ -4514,13 +4521,16 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq) dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; + walt_dec_throttled_cfs_rq_stats(&qcfs_rq->walt_stats, cfs_rq); if (qcfs_rq->load.weight) dequeue = 0; } - if (!se) + if (!se) { sub_nr_running(rq, task_delta); + walt_dec_throttled_cfs_rq_stats(&rq->walt_stats, cfs_rq); + } cfs_rq->throttled = 1; cfs_rq->throttled_clock = rq_clock(rq); @@ -4554,6 +4564,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) struct sched_entity *se; int enqueue = 1; long task_delta, idle_task_delta; + struct cfs_rq *tcfs_rq __maybe_unused = cfs_rq; se = cfs_rq->tg->se[cpu_of(rq)]; @@ -4583,6 +4594,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP); cfs_rq->h_nr_running += task_delta; cfs_rq->idle_h_nr_running += idle_task_delta; + walt_inc_throttled_cfs_rq_stats(&cfs_rq->walt_stats, tcfs_rq); if (cfs_rq_throttled(cfs_rq)) break; @@ -4590,8 +4602,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) assert_list_leaf_cfs_rq(rq); - if (!se) + if (!se) { add_nr_running(rq, task_delta); + walt_inc_throttled_cfs_rq_stats(&rq->walt_stats, tcfs_rq); + } /* Determine whether we need to wake up potentially idle CPU: */ if (rq->curr == rq->idle && rq->cfs.nr_running) @@ -4982,6 +4996,7 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) { cfs_rq->runtime_enabled = 0; INIT_LIST_HEAD(&cfs_rq->throttled_list); + walt_init_cfs_rq_stats(cfs_rq); } void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) @@ -5161,8 +5176,6 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP -static inline unsigned long cpu_util(int cpu); - static inline bool cpu_overutilized(int cpu) { return !fits_capacity(cpu_util(cpu), capacity_of(cpu)); @@ -5223,6 +5236,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); flags = ENQUEUE_WAKEUP; } @@ -5231,6 +5245,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); cfs_rq->h_nr_running++; cfs_rq->idle_h_nr_running += idle_h_nr_running; + walt_inc_cfs_rq_stats(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -5241,6 +5256,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (!se) { add_nr_running(rq, 1); + inc_rq_walt_stats(rq, p); /* * Since new tasks are assigned an initial util_avg equal to * half of the spare capacity of their CPU, tiny tasks have the @@ -5308,6 +5324,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) break; cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); /* Don't dequeue parent if it has other entities besides us */ if (cfs_rq->load.weight) { @@ -5328,6 +5345,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) cfs_rq = cfs_rq_of(se); 
cfs_rq->h_nr_running--; cfs_rq->idle_h_nr_running -= idle_h_nr_running; + walt_dec_cfs_rq_stats(cfs_rq, p); if (cfs_rq_throttled(cfs_rq)) break; @@ -5336,8 +5354,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_cfs_group(se); } - if (!se) + if (!se) { sub_nr_running(rq, 1); + dec_rq_walt_stats(rq, p); + } util_est_dequeue(&rq->cfs, p, task_sleep); hrtick_update(rq); @@ -5375,11 +5395,6 @@ static unsigned long cpu_runnable_load(struct rq *rq) return cfs_rq_runnable_load_avg(&rq->cfs); } -static unsigned long capacity_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity; -} - static unsigned long cpu_avg_load_per_task(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -6047,58 +6062,6 @@ static unsigned int uclamp_task_util(struct task_struct *p) #endif } -/** - * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks - * @cpu: the CPU to get the utilization of - * - * The unit of the return value must be the one of capacity so we can compare - * the utilization with the capacity of the CPU that is available for CFS task - * (ie cpu_capacity). - * - * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the - * recent utilization of currently non-runnable tasks on a CPU. It represents - * the amount of utilization of a CPU in the range [0..capacity_orig] where - * capacity_orig is the cpu_capacity available at the highest frequency - * (arch_scale_freq_capacity()). - * The utilization of a CPU converges towards a sum equal to or less than the - * current capacity (capacity_curr <= capacity_orig) of the CPU because it is - * the running time on this CPU scaled by capacity_curr. - * - * The estimated utilization of a CPU is defined to be the maximum between its - * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks - * currently RUNNABLE on that CPU. - * This allows to properly represent the expected utilization of a CPU which - * has just got a big task running since a long sleep period. At the same time - * however it preserves the benefits of the "blocked utilization" in - * describing the potential for other tasks waking up on the same CPU. - * - * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even - * higher than capacity_orig because of unfortunate rounding in - * cfs.avg.util_avg or just after migrating tasks and new task wakeups until - * the average stabilizes with the new running time. We need to check that the - * utilization stays within the range of [0..capacity_orig] and cap it if - * necessary. Without utilization capping, a group could be seen as overloaded - * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of - * available capacity. We allow utilization to overshoot capacity_curr (but not - * capacity_orig) as it useful for predicting the capacity required after task - * migrations (scheduler-driven DVFS). 
- * - * Return: the (estimated) utilization for the specified CPU - */ -static inline unsigned long cpu_util(int cpu) -{ - struct cfs_rq *cfs_rq; - unsigned int util; - - cfs_rq = &cpu_rq(cpu)->cfs; - util = READ_ONCE(cfs_rq->avg.util_avg); - - if (sched_feat(UTIL_EST)) - util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); - - return min_t(unsigned long, util, capacity_orig_of(cpu)); -} - /* * cpu_util_without: compute cpu utilization without any contributions from *p * @cpu: the CPU which utilization is requested @@ -6114,13 +6077,29 @@ static inline unsigned long cpu_util(int cpu) */ static unsigned long cpu_util_without(int cpu, struct task_struct *p) { +#ifndef CONFIG_SCHED_WALT struct cfs_rq *cfs_rq; +#endif unsigned int util; +#ifdef CONFIG_SCHED_WALT + /* + * WALT does not decay idle tasks in the same manner + * as PELT, so it makes little sense to subtract task + * utilization from cpu utilization. Instead just use + * cpu_util for this case. + */ + if (likely(p->state == TASK_WAKING)) + return cpu_util(cpu); +#endif + /* Task has no contribution or is new */ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time)) return cpu_util(cpu); +#ifdef CONFIG_SCHED_WALT + util = max_t(long, cpu_util(cpu) - task_util(p), 0); +#else cfs_rq = &cpu_rq(cpu)->cfs; util = READ_ONCE(cfs_rq->avg.util_avg); @@ -6179,6 +6158,7 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) util = max(util, estimated); } +#endif /* * Utilization (estimated) can exceed the CPU capacity, thus let's @@ -6188,6 +6168,18 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) return min_t(unsigned long, util, capacity_orig_of(cpu)); } +/* + * Returns the current capacity of cpu after applying both + * cpu and freq scaling. + */ +unsigned long capacity_curr_of(int cpu) +{ + unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig; + unsigned long scale_freq = arch_scale_freq_capacity(cpu); + + return cap_scale(max_cap, scale_freq); +} + /* * Disable WAKE_AFFINE in the case where task @p doesn't fit in the * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu. @@ -6344,7 +6336,7 @@ compute_energy(struct task_struct *p, int dst_cpu, struct perf_domain *pd) * other use-cases too. So, until someone finds a better way to solve this, * let's keep things simple by re-using the existing slow path. 
*/ -static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync) +int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, int sync) { unsigned long prev_delta = ULONG_MAX, best_delta = ULONG_MAX; struct root_domain *rd = cpu_rq(smp_processor_id())->rd; @@ -7363,7 +7355,13 @@ static void detach_task(struct task_struct *p, struct lb_env *env) lockdep_assert_held(&env->src_rq->lock); deactivate_task(env->src_rq, p, DEQUEUE_NOCLOCK); + lockdep_off(); + double_lock_balance(env->src_rq, env->dst_rq); + if (!(env->src_rq->clock_update_flags & RQCF_UPDATED)) + update_rq_clock(env->src_rq); set_task_cpu(p, env->dst_cpu); + double_unlock_balance(env->src_rq, env->dst_rq); + lockdep_on(); } /* @@ -8891,8 +8889,6 @@ static int need_active_balance(struct lb_env *env) return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); } -static int active_load_balance_cpu_stop(void *data); - static int should_we_balance(struct lb_env *env) { struct sched_group *sg = env->sd->groups; @@ -9244,7 +9240,7 @@ update_next_balance(struct sched_domain *sd, unsigned long *next_balance) * least 1 task to be running on each physical CPU where possible, and * avoids physical / logical imbalances. */ -static int active_load_balance_cpu_stop(void *data) +int active_load_balance_cpu_stop(void *data) { struct rq *busiest_rq = data; int busiest_cpu = cpu_of(busiest_rq); @@ -10615,6 +10611,10 @@ const struct sched_class fair_sched_class = { #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif + +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = walt_fixup_sched_stats_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index ee68dea4b11a..27a5c1e62c72 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -61,7 +61,8 @@ static noinline int __cpuidle cpu_idle_poll(void) stop_critical_timings(); while (!tif_need_resched() && - (cpu_idle_force_poll || tick_check_broadcast_expired())) + (cpu_idle_force_poll || tick_check_broadcast_expired() || + is_reserved(smp_processor_id()))) cpu_relax(); start_critical_timings(); trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); @@ -257,7 +258,8 @@ static void do_idle(void) * broadcast device expired for us, we don't want to go deep * idle as we know that the IPI is going to arrive right away. 
*/ - if (cpu_idle_force_poll || tick_check_broadcast_expired()) { + if (cpu_idle_force_poll || tick_check_broadcast_expired() || + is_reserved(smp_processor_id())) { tick_nohz_idle_restart_tick(); cpu_idle_poll(); } else { diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 9b8adc01be3d..14012a8d2eb9 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -6,6 +6,7 @@ #include "sched.h" #include "pelt.h" +#include "walt.h" int sched_rr_timeslice = RR_TIMESLICE; int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE; @@ -2388,6 +2389,10 @@ const struct sched_class rt_sched_class = { #ifdef CONFIG_UCLAMP_TASK .uclamp_enabled = 1, #endif + +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; #ifdef CONFIG_RT_GROUP_SCHED diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index e83bdf6bce23..7a9d3ba64110 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -84,6 +84,73 @@ struct rq; struct cpuidle_state; +extern __read_mostly bool sched_predl; + +struct sched_walt_cpu_load { + unsigned long prev_window_util; + unsigned long nl; + unsigned long pl; + bool rtgb_active; + u64 ws; +}; + +#ifdef CONFIG_SCHED_WALT +#define DECLARE_BITMAP_ARRAY(name, nr, bits) \ + unsigned long name[nr][BITS_TO_LONGS(bits)] + +extern unsigned int __weak sched_ravg_window; + +struct walt_sched_stats { + int nr_big_tasks; + u64 cumulative_runnable_avg_scaled; + u64 pred_demands_sum_scaled; +}; + +struct cpu_cycle { + u64 cycles; + u64 time; +}; + +struct group_cpu_time { + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; +}; + +struct load_subtractions { + u64 window_start; + u64 subs; + u64 new_subs; +}; + +#define NUM_TRACKED_WINDOWS 2 +#define NUM_LOAD_INDICES 1000 + +struct sched_cluster { + raw_spinlock_t load_lock; + struct list_head list; + struct cpumask cpus; + int id; + int max_power_cost; + int min_power_cost; + int max_possible_capacity; + int efficiency; /* Differentiate cpus with different IPC capability */ + unsigned int exec_scale_factor; + /* + * max_freq = user maximum + * max_mitigated_freq = thermal defined maximum + * max_possible_freq = maximum supported by hardware + */ + unsigned int cur_freq, max_freq, max_mitigated_freq, min_freq; + unsigned int max_possible_freq; + bool freq_init_done; + u64 aggr_grp_load; +}; + +extern __weak cpumask_t asym_cap_sibling_cpus; +#endif /* CONFIG_SCHED_WALT */ + /* task_struct::on_rq states: */ #define TASK_ON_RQ_QUEUED 1 #define TASK_ON_RQ_MIGRATING 2 @@ -401,7 +468,24 @@ struct task_group { struct uclamp_se uclamp[UCLAMP_CNT]; /* Latency-sensitive flag used for a task group */ unsigned int latency_sensitive; -#endif +#ifdef CONFIG_SCHED_WALT + /* Toggle ability to override sched boost enabled */ + bool sched_boost_no_override; + /* + * Controls whether a cgroup is eligible for sched boost or not. This + * can temporariliy be disabled by the kernel based on the no_override + * flag above. + */ + bool sched_boost_enabled; + /* + * Controls whether tasks of this cgroup should be colocated with each + * other and tasks of other cgroups that have the same flag turned on. 
+ */ + bool colocate; + /* Controls whether further updates are allowed to the colocate flag */ + bool colocate_update_disabled; +#endif /* CONFIG_SCHED_WALT */ +#endif /* CONFIG_UCLAMP_TASK_GROUP */ }; @@ -565,6 +649,10 @@ struct cfs_rq { struct list_head leaf_cfs_rq_list; struct task_group *tg; /* group that "owns" this runqueue */ +#ifdef CONFIG_SCHED_WALT + struct walt_sched_stats walt_stats; +#endif + #ifdef CONFIG_CFS_BANDWIDTH int runtime_enabled; s64 runtime_remaining; @@ -961,6 +1049,41 @@ struct rq { u64 max_idle_balance_cost; #endif +#ifdef CONFIG_SCHED_WALT + struct task_struct *push_task; + struct sched_cluster *cluster; + struct cpumask freq_domain_cpumask; + struct walt_sched_stats walt_stats; + + u64 window_start; + u32 prev_window_size; + unsigned long walt_flags; + + u64 cur_irqload; + u64 avg_irqload; + u64 irqload_ts; + struct task_struct *ed_task; + struct cpu_cycle cc; + u64 old_busy_time, old_busy_time_group; + u64 old_estimated_time; + u64 curr_runnable_sum; + u64 prev_runnable_sum; + u64 nt_curr_runnable_sum; + u64 nt_prev_runnable_sum; + u64 cum_window_demand_scaled; + struct group_cpu_time grp_time; + struct load_subtractions load_subs[NUM_TRACKED_WINDOWS]; + DECLARE_BITMAP_ARRAY(top_tasks_bitmap, + NUM_TRACKED_WINDOWS, NUM_LOAD_INDICES); + u8 *top_tasks[NUM_TRACKED_WINDOWS]; + u8 curr_table; + int prev_top; + int curr_top; + bool notif_pending; + u64 last_cc_update; + u64 cycles; +#endif /* CONFIG_SCHED_WALT */ + #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; #endif @@ -1306,8 +1429,6 @@ enum numa_faults_stats { }; extern void sched_setnuma(struct task_struct *p, int node); extern int migrate_task_to(struct task_struct *p, int cpu); -extern int migrate_swap(struct task_struct *p, struct task_struct *t, - int cpu, int scpu); extern void init_numa_balancing(unsigned long clone_flags, struct task_struct *p); #else static inline void @@ -1316,6 +1437,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) } #endif /* CONFIG_NUMA_BALANCING */ +extern int migrate_swap(struct task_struct *p, struct task_struct *t, + int cpu, int scpu); + #ifdef CONFIG_SMP static inline void @@ -1782,8 +1906,15 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif + +#ifdef CONFIG_SCHED_WALT + void (*fixup_walt_sched_stats)(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled, + u16 updated_pred_demand_scaled); +#endif }; + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { WARN_ON_ONCE(rq->curr != prev); @@ -1960,6 +2091,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; + sched_update_nr_prod(cpu_of(rq), count, true); rq->nr_running = prev_nr + count; #ifdef CONFIG_SMP @@ -1974,6 +2106,7 @@ static inline void add_nr_running(struct rq *rq, unsigned count) static inline void sub_nr_running(struct rq *rq, unsigned count) { + sched_update_nr_prod(cpu_of(rq), count, false); rq->nr_running -= count; /* Check if we still need preemption */ sched_update_tick_dependency(rq); @@ -2014,6 +2147,18 @@ static inline int hrtick_enabled(struct rq *rq) #endif /* CONFIG_SCHED_HRTICK */ +#ifdef CONFIG_SCHED_WALT +u64 __weak sched_ktime_clock(void); +unsigned long __weak +cpu_util_freq_walt(int cpu, struct sched_walt_cpu_load *walt_load); +#else +#define sched_ravg_window TICK_NSEC +static inline u64 sched_ktime_clock(void) +{ + return 0; +} +#endif + #ifndef arch_scale_freq_capacity static __always_inline unsigned long 
arch_scale_freq_capacity(int cpu) @@ -2031,8 +2176,127 @@ unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu) } #endif +unsigned long capacity_curr_of(int cpu); + #ifdef CONFIG_SMP -#ifdef CONFIG_PREEMPTION +static inline unsigned long capacity_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity; +} + +static inline unsigned long capacity_orig_of(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} + +static inline unsigned long task_util(struct task_struct *p) +{ +#ifdef CONFIG_SCHED_WALT + return p->ravg.demand_scaled; +#endif + return READ_ONCE(p->se.avg.util_avg); +} + +/** + * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks + * @cpu: the CPU to get the utilization of + * + * The unit of the return value must be the one of capacity so we can compare + * the utilization with the capacity of the CPU that is available for CFS task + * (ie cpu_capacity). + * + * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the + * recent utilization of currently non-runnable tasks on a CPU. It represents + * the amount of utilization of a CPU in the range [0..capacity_orig] where + * capacity_orig is the cpu_capacity available at the highest frequency + * (arch_scale_freq_capacity()). + * The utilization of a CPU converges towards a sum equal to or less than the + * current capacity (capacity_curr <= capacity_orig) of the CPU because it is + * the running time on this CPU scaled by capacity_curr. + * + * The estimated utilization of a CPU is defined to be the maximum between its + * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks + * currently RUNNABLE on that CPU. + * This allows to properly represent the expected utilization of a CPU which + * has just got a big task running since a long sleep period. At the same time + * however it preserves the benefits of the "blocked utilization" in + * describing the potential for other tasks waking up on the same CPU. + * + * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even + * higher than capacity_orig because of unfortunate rounding in + * cfs.avg.util_avg or just after migrating tasks and new task wakeups until + * the average stabilizes with the new running time. We need to check that the + * utilization stays within the range of [0..capacity_orig] and cap it if + * necessary. Without utilization capping, a group could be seen as overloaded + * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of + * available capacity. We allow utilization to overshoot capacity_curr (but not + * capacity_orig) as it useful for predicting the capacity required after task + * migrations (scheduler-driven DVFS). 
+ * + * Return: the (estimated) utilization for the specified CPU + */ +static inline unsigned long cpu_util(int cpu) +{ + struct cfs_rq *cfs_rq; + unsigned int util; + +#ifdef CONFIG_SCHED_WALT + u64 walt_cpu_util = + cpu_rq(cpu)->walt_stats.cumulative_runnable_avg_scaled; + + return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu)); +#endif + + cfs_rq = &cpu_rq(cpu)->cfs; + util = READ_ONCE(cfs_rq->avg.util_avg); + + if (sched_feat(UTIL_EST)) + util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued)); + + return min_t(unsigned long, util, capacity_orig_of(cpu)); +} + +static inline unsigned long cpu_util_cum(int cpu, int delta) +{ + u64 util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + util = cpu_rq(cpu)->cum_window_demand_scaled; +#endif + delta += util; + if (delta < 0) + return 0; + + return (delta >= capacity) ? capacity : delta; +} + +static inline unsigned long +cpu_util_freq(int cpu, struct sched_walt_cpu_load *walt_load) +{ +#ifdef CONFIG_SCHED_WALT + return cpu_util_freq_walt(cpu, walt_load); +#else + return cpu_util(cpu); +#endif +} + +extern unsigned int capacity_margin_freq; + +static inline unsigned long +add_capacity_margin(unsigned long cpu_capacity, int cpu) +{ + cpu_capacity = cpu_capacity * capacity_margin_freq * + (100 + per_cpu(sched_load_boost, cpu)); + cpu_capacity /= 100; + cpu_capacity /= SCHED_CAPACITY_SCALE; + return cpu_capacity; +} + +#endif /* CONFIG_SMP */ + +#ifdef CONFIG_SMP +#ifdef CONFIG_PREEMPT static inline void double_rq_lock(struct rq *rq1, struct rq *rq2); @@ -2345,6 +2609,11 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) { struct update_util_data *data; +#ifdef CONFIG_SCHED_WALT + if (!(flags & SCHED_CPUFREQ_WALT)) + return; +#endif + data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, cpu_of(rq))); if (data) @@ -2432,13 +2701,6 @@ static inline bool uclamp_latency_sensitive(struct task_struct *p) # define arch_scale_freq_invariant() false #endif -#ifdef CONFIG_SMP -static inline unsigned long capacity_orig_of(int cpu) -{ - return cpu_rq(cpu)->cpu_capacity_orig; -} -#endif - /** * enum schedutil_type - CPU utilization type * @FREQUENCY_UTIL: Utilization used to select frequency @@ -2570,3 +2832,499 @@ static inline void membarrier_switch_mm(struct rq *rq, { } #endif + +enum sched_boost_policy { + SCHED_BOOST_NONE, + SCHED_BOOST_ON_BIG, + SCHED_BOOST_ON_ALL, +}; + +#ifdef CONFIG_SCHED_WALT + +static inline int cluster_first_cpu(struct sched_cluster *cluster) +{ + return cpumask_first(&cluster->cpus); +} + +struct related_thread_group { + int id; + raw_spinlock_t lock; + struct list_head tasks; + struct list_head list; + bool skip_min; + struct rcu_head rcu; + u64 last_update; + u64 downmigrate_ts; + u64 start_ts; +}; + +extern struct sched_cluster *sched_cluster[NR_CPUS]; + +extern unsigned int __weak sched_disable_window_stats; +extern unsigned int max_possible_freq; +extern unsigned int min_max_freq; +extern unsigned int max_possible_efficiency; +extern unsigned int min_possible_efficiency; +extern unsigned int max_possible_capacity; +extern unsigned int __weak min_max_possible_capacity; +extern unsigned int max_power_cost; +extern unsigned int __read_mostly __weak sched_init_task_load_windows; +extern unsigned int __read_mostly __weak sched_load_granule; + +extern int __weak update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load, bool from_tick); +extern void __weak 
set_preferred_cluster(struct related_thread_group *grp); +extern void __weak add_new_task_to_grp(struct task_struct *new); + +#define NO_BOOST 0 +#define FULL_THROTTLE_BOOST 1 +#define CONSERVATIVE_BOOST 2 +#define RESTRAINED_BOOST 3 +#define FULL_THROTTLE_BOOST_DISABLE -1 +#define CONSERVATIVE_BOOST_DISABLE -2 +#define RESTRAINED_BOOST_DISABLE -3 +#define MAX_NUM_BOOST_TYPE (RESTRAINED_BOOST+1) + +static inline int asym_cap_siblings(int cpu1, int cpu2) +{ + return (cpumask_test_cpu(cpu1, &asym_cap_sibling_cpus) && + cpumask_test_cpu(cpu2, &asym_cap_sibling_cpus)); +} + +static inline int cpu_max_possible_capacity(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_capacity; +} + +static inline unsigned int cluster_max_freq(struct sched_cluster *cluster) +{ + /* + * Governor and thermal driver don't know the other party's mitigation + * voting. So struct cluster saves both and return min() for current + * cluster fmax. + */ + return min(cluster->max_mitigated_freq, cluster->max_freq); +} + +static inline unsigned int cpu_max_freq(int cpu) +{ + return cluster_max_freq(cpu_rq(cpu)->cluster); +} + +static inline unsigned int cpu_max_possible_freq(int cpu) +{ + return cpu_rq(cpu)->cluster->max_possible_freq; +} + +static inline bool hmp_capable(void) +{ + return max_possible_capacity != min_max_possible_capacity; +} + +static inline bool is_max_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == max_possible_capacity; +} + +static inline bool is_min_capacity_cpu(int cpu) +{ + return cpu_max_possible_capacity(cpu) == min_max_possible_capacity; +} + +static inline unsigned int task_load(struct task_struct *p) +{ + return p->ravg.demand; +} + +static inline unsigned int task_pl(struct task_struct *p) +{ + return p->ravg.pred_demand; +} + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return !!(rcu_access_pointer(p->grp) != NULL); +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return rcu_dereference(p->grp); +} + +/* Is frequency of two cpus synchronized with each other? */ +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + struct rq *rq = cpu_rq(src_cpu); + + if (src_cpu == dst_cpu) + return 1; + + if (asym_cap_siblings(src_cpu, dst_cpu)) + return 1; + + return cpumask_test_cpu(dst_cpu, &rq->freq_domain_cpumask); +} + +#define CPU_RESERVED 1 + +extern enum sched_boost_policy __weak boost_policy; +extern unsigned int __weak sched_task_filter_util; +static inline enum sched_boost_policy sched_boost_policy(void) +{ + return boost_policy; +} + +extern unsigned int __weak sched_boost_type; +static inline int sched_boost(void) +{ + return sched_boost_type; +} + +static inline bool rt_boost_on_big(void) +{ + return sched_boost() == FULL_THROTTLE_BOOST ? 
+ (sched_boost_policy() == SCHED_BOOST_ON_BIG) : false; +} + +static inline bool is_full_throttle_boost(void) +{ + return sched_boost() == FULL_THROTTLE_BOOST; +} + +extern int __weak preferred_cluster(struct sched_cluster *cluster, + struct task_struct *p); +extern struct sched_cluster *rq_cluster(struct rq *rq); + +#ifdef CONFIG_UCLAMP_TASK_GROUP +static inline bool task_sched_boost(struct task_struct *p) +{ + struct cgroup_subsys_state *css = task_css(p, cpu_cgrp_id); + struct task_group *tg; + + if (!css) + return false; + tg = container_of(css, struct task_group, css); + + return tg->sched_boost_enabled; +} + +extern int __weak sync_cgroup_colocation(struct task_struct *p, bool insert); +extern void update_cgroup_boost_settings(void); +extern void restore_cgroup_boost_settings(void); +#else +static inline bool +same_schedtg(struct task_struct *tsk1, struct task_struct *tsk2) +{ + return true; +} + +static inline bool task_sched_boost(struct task_struct *p) +{ + return true; +} + +static inline void update_cgroup_boost_settings(void) { } +static inline void restore_cgroup_boost_settings(void) { } +#endif + +extern int __weak alloc_related_thread_groups(void); + +extern void __weak check_for_migration(struct rq *rq, struct task_struct *p); + +static inline int is_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline int mark_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + return test_and_set_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline void clear_reserved(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + + clear_bit(CPU_RESERVED, &rq->walt_flags); +} + +static inline bool +task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return cpu_of(rq) == task_cpu(p) && (p->on_rq || p->last_sleep_ts >= + rq->window_start); +} + +static inline void walt_fixup_cum_window_demand(struct rq *rq, s64 scaled_delta) +{ + rq->cum_window_demand_scaled += scaled_delta; + if (unlikely((s64)rq->cum_window_demand_scaled < 0)) + rq->cum_window_demand_scaled = 0; +} + +extern unsigned long __weak thermal_cap(int cpu); + +extern void __weak clear_walt_request(int cpu); + +extern enum sched_boost_policy sched_boost_policy(void); +extern void sched_boost_parse_dt(void); +extern void __weak clear_ed_task(struct task_struct *p, struct rq *rq); +extern bool __weak early_detection_notify(struct rq *rq, u64 wallclock); + +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return cpu_max_possible_capacity(cpu); +} + +void __weak note_task_waking(struct task_struct *p, u64 wallclock); + +static inline bool task_placement_boost_enabled(struct task_struct *p) +{ + if (task_sched_boost(p)) + return sched_boost_policy() != SCHED_BOOST_NONE; + + return false; +} + +static inline enum sched_boost_policy task_boost_policy(struct task_struct *p) +{ + enum sched_boost_policy policy = task_sched_boost(p) ? + sched_boost_policy() : + SCHED_BOOST_NONE; + if (policy == SCHED_BOOST_ON_BIG) { + /* + * Filter out tasks less than min task util threshold + * under conservative boost. 
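+ * In other words, with the check below a task whose task_util() is at or
+ * below sched_task_filter_util falls back to SCHED_BOOST_NONE under
+ * CONSERVATIVE_BOOST and may still be placed on a little CPU, while larger
+ * tasks keep the SCHED_BOOST_ON_BIG placement hint.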
+ */ + if (sched_boost() == CONSERVATIVE_BOOST && + task_util(p) <= sched_task_filter_util) + policy = SCHED_BOOST_NONE; + } + + return policy; +} + +static inline bool is_min_capacity_cluster(struct sched_cluster *cluster) +{ + return is_min_capacity_cpu(cluster_first_cpu(cluster)); +} + +extern void __weak walt_fixup_sched_stats_fair(struct rq *rq, + struct task_struct *p, + u16 updated_demand_scaled, + u16 updated_pred_demand_scaled); +extern void __weak walt_fixup_nr_big_tasks(struct rq *rq, struct task_struct *p, + int delta, bool inc); +#else /* CONFIG_SCHED_WALT */ + +struct walt_sched_stats; +struct related_thread_group; +struct sched_cluster; + +static inline bool task_sched_boost(struct task_struct *p) +{ + return false; +} + +static inline bool task_placement_boost_enabled(struct task_struct *p) +{ + return false; +} + +static inline void check_for_migration(struct rq *rq, struct task_struct *p) { } + +static inline int sched_boost(void) +{ + return 0; +} + +static inline bool rt_boost_on_big(void) +{ + return false; +} + +static inline bool is_full_throttle_boost(void) +{ + return false; +} + +static inline enum sched_boost_policy task_boost_policy(struct task_struct *p) +{ + return SCHED_BOOST_NONE; +} + +static inline bool +task_in_cum_window_demand(struct rq *rq, struct task_struct *p) +{ + return false; +} + +static inline bool hmp_capable(void) { return false; } +static inline bool is_max_capacity_cpu(int cpu) { return true; } +static inline bool is_min_capacity_cpu(int cpu) { return true; } + +static inline int +preferred_cluster(struct sched_cluster *cluster, struct task_struct *p) +{ + return -1; +} + +static inline struct sched_cluster *rq_cluster(struct rq *rq) +{ + return NULL; +} + +static inline int asym_cap_siblings(int cpu1, int cpu2) { return 0; } + +static inline void set_preferred_cluster(struct related_thread_group *grp) { } + +static inline bool task_in_related_thread_group(struct task_struct *p) +{ + return false; +} + +static inline +struct related_thread_group *task_related_thread_group(struct task_struct *p) +{ + return NULL; +} + +static inline u32 task_load(struct task_struct *p) { return 0; } +static inline u32 task_pl(struct task_struct *p) { return 0; } + +static inline int update_preferred_cluster(struct related_thread_group *grp, + struct task_struct *p, u32 old_load, bool from_tick) +{ + return 0; +} + +static inline void add_new_task_to_grp(struct task_struct *new) {} + +static inline int same_freq_domain(int src_cpu, int dst_cpu) +{ + return 1; +} + +static inline int mark_reserved(int cpu) +{ + return 0; +} + +static inline void clear_reserved(int cpu) { } +static inline int alloc_related_thread_groups(void) { return 0; } + +static inline void walt_fixup_cum_window_demand(struct rq *rq, + s64 scaled_delta) { } + +#ifdef CONFIG_SMP +static inline unsigned long thermal_cap(int cpu) +{ + return cpu_rq(cpu)->cpu_capacity_orig; +} +#endif + +static inline void clear_walt_request(int cpu) { } + +static inline int is_reserved(int cpu) +{ + return 0; +} + +static inline enum sched_boost_policy sched_boost_policy(void) +{ + return SCHED_BOOST_NONE; +} + +static inline void sched_boost_parse_dt(void) { } + +static inline void clear_ed_task(struct task_struct *p, struct rq *rq) { } + +static inline bool early_detection_notify(struct rq *rq, u64 wallclock) +{ + return 0; +} + +#ifdef CONFIG_SMP +static inline unsigned int power_cost(int cpu, u64 demand) +{ + return SCHED_CAPACITY_SCALE; +} +#endif + +static inline void note_task_waking(struct task_struct 
*p, u64 wallclock) { } +#endif /* CONFIG_SCHED_WALT */ + +struct sched_avg_stats { + int nr; + int nr_misfit; + int nr_max; + int nr_scaled; +}; +extern void sched_get_nr_running_avg(struct sched_avg_stats *stats); + +#if defined(CONFIG_SCHED_WALT) && defined(CONFIG_CFS_BANDWIDTH) + +extern void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq); +extern void walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p); +extern void walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p); +extern void walt_inc_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +extern void walt_dec_throttled_cfs_rq_stats(struct walt_sched_stats *stats, + struct cfs_rq *cfs_rq); +#else +static inline void walt_init_cfs_rq_stats(struct cfs_rq *cfs_rq) {} +static inline void +walt_inc_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} +static inline void +walt_dec_cfs_rq_stats(struct cfs_rq *cfs_rq, struct task_struct *p) {} + +#define walt_inc_throttled_cfs_rq_stats(...) +#define walt_dec_throttled_cfs_rq_stats(...) + +#endif + + +#ifdef CONFIG_SMP +#ifdef CONFIG_SCHED_WALT +extern int __weak group_balance_cpu_not_isolated(struct sched_group *sg); +#else +static inline int group_balance_cpu_not_isolated(struct sched_group *sg) +{ + return group_balance_cpu(sg); +} +#endif /* CONFIG_SCHED_WALT */ +#endif /* CONFIG_SMP */ + +extern int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu, + int sync); +extern int active_load_balance_cpu_stop(void *data); + +#ifdef CONFIG_HOTPLUG_CPU +extern void set_rq_online(struct rq *rq); +extern void set_rq_offline(struct rq *rq); +extern void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf); +extern void calc_load_migrate(struct rq *rq); +#ifdef CONFIG_SCHED_WALT +extern void __weak +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks); +extern void __weak attach_tasks_core(struct list_head *tasks, struct rq *rq); +#else +static inline void +detach_one_task_core(struct task_struct *p, struct rq *rq, + struct list_head *tasks) +{ +} +static inline void attach_tasks_core(struct list_head *tasks, struct rq *rq) {} +#endif +#endif + +extern struct task_struct *find_process_by_pid(pid_t pid); + +extern void enqueue_task_core(struct rq *rq, struct task_struct *p, int flags); +extern void dequeue_task_core(struct rq *rq, struct task_struct *p, int flags); diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index c0640739e05e..e58457b4377a 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -8,6 +8,7 @@ * See kernel/stop_machine.c */ #include "sched.h" +#include "walt.h" #ifdef CONFIG_SMP static int @@ -50,12 +51,14 @@ static void enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) { add_nr_running(rq, 1); + walt_inc_cumulative_runnable_avg(rq, p); } static void dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) { sub_nr_running(rq, 1); + walt_dec_cumulative_runnable_avg(rq, p); } static void yield_task_stop(struct rq *rq) @@ -144,4 +147,7 @@ const struct sched_class stop_sched_class = { .prio_changed = prio_changed_stop, .switched_to = switched_to_stop, .update_curr = update_curr_stop, +#ifdef CONFIG_SCHED_WALT + .fixup_walt_sched_stats = fixup_walt_sched_stats_common, +#endif }; diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h new file mode 100644 index 000000000000..22544c970bff --- /dev/null +++ b/kernel/sched/walt.h @@ -0,0 +1,265 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (c) 
2016-2019, The Linux Foundation. All rights reserved. + */ + +#ifndef __WALT_H +#define __WALT_H + +#ifdef CONFIG_SCHED_WALT + +#include +#include + +#define EXITING_TASK_MARKER 0xdeaddead + +extern void __weak +walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event, + u64 wallclock, u64 irqtime); + +static inline void +fixup_cumulative_runnable_avg(struct walt_sched_stats *stats, + s64 demand_scaled_delta, + s64 pred_demand_scaled_delta) +{ + if (sched_disable_window_stats) + return; + + stats->cumulative_runnable_avg_scaled += demand_scaled_delta; + BUG_ON((s64)stats->cumulative_runnable_avg_scaled < 0); + + stats->pred_demands_sum_scaled += pred_demand_scaled_delta; + BUG_ON((s64)stats->pred_demands_sum_scaled < 0); +} + +static inline void +walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, p->ravg.demand_scaled, + p->ravg.pred_demand_scaled); + + /* + * Add a task's contribution to the cumulative window demand when + * + * (1) task is enqueued with on_rq = 1 i.e migration, + * prio/cgroup/class change. + * (2) task is waking for the first time in this window. + */ + if (p->on_rq || (p->last_sleep_ts < rq->window_start)) + walt_fixup_cum_window_demand(rq, p->ravg.demand_scaled); +} + +static inline void +walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) +{ + if (sched_disable_window_stats) + return; + + fixup_cumulative_runnable_avg(&rq->walt_stats, + -(s64)p->ravg.demand_scaled, + -(s64)p->ravg.pred_demand_scaled); + + /* + * on_rq will be 1 for sleeping tasks. So check if the task + * is migrating or dequeuing in RUNNING state to change the + * prio/cgroup/class. + */ + if (task_on_rq_migrating(p) || p->state == TASK_RUNNING) + walt_fixup_cum_window_demand(rq, -(s64)p->ravg.demand_scaled); +} + +extern void __weak +fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled, + u16 updated_pred_demand_scaled); +extern void __weak inc_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void __weak dec_rq_walt_stats(struct rq *rq, struct task_struct *p); +extern void __weak fixup_busy_time(struct task_struct *p, int new_cpu); +extern void __weak init_new_task_load(struct task_struct *p); +extern void __weak mark_task_starting(struct task_struct *p); +extern void __weak set_window_start(struct rq *rq); +extern bool __weak do_pl_notif(struct rq *rq); + +#define SCHED_HIGH_IRQ_TIMEOUT 3 +static inline u64 sched_irqload(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + s64 delta; + + delta = get_jiffies_64() - rq->irqload_ts; + /* + * Current context can be preempted by irq and rq->irqload_ts can be + * updated by irq context so that delta can be negative. + * But this is okay and we can safely return as this means there + * was recent irq occurrence. 
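+ * For illustration only (assuming HZ=250, i.e. 4ms jiffies): if the last
+ * irqload update happened 2 jiffies (~8ms) ago, delta is still below
+ * SCHED_HIGH_IRQ_TIMEOUT and avg_irqload is reported; once 3 or more jiffies
+ * pass without an update, the CPU is treated as having no irqload.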
+ */ + + if (delta < SCHED_HIGH_IRQ_TIMEOUT) + return rq->avg_irqload; + else + return 0; +} + +static inline int sched_cpu_high_irqload(int cpu) +{ + return sched_irqload(cpu) >= sysctl_sched_cpu_high_irqload; +} + +static inline int exiting_task(struct task_struct *p) +{ + return (p->ravg.sum_history[0] == EXITING_TASK_MARKER); +} + +static inline u64 +scale_load_to_freq(u64 load, unsigned int src_freq, unsigned int dst_freq) +{ + return div64_u64(load * (u64)src_freq, (u64)dst_freq); +} + +extern void __weak sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock); + +static inline unsigned int max_task_load(void) +{ + return sched_ravg_window; +} + +extern void __weak update_cluster_topology(void); + +extern void __weak init_clusters(void); + +extern void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock); + +static inline int same_cluster(int src_cpu, int dst_cpu) +{ + return cpu_rq(src_cpu)->cluster == cpu_rq(dst_cpu)->cluster; +} + +void __weak walt_sched_init_rq(struct rq *rq); + +static inline void walt_update_last_enqueue(struct task_struct *p) +{ + p->last_enqueued_ts = sched_ktime_clock(); +} + +static inline bool is_suh_max(void) +{ + return sysctl_sched_user_hint == sched_user_hint_max; +} + +#define DEFAULT_CGROUP_COLOC_ID 1 +static inline bool walt_should_kick_upmigrate(struct task_struct *p, int cpu) +{ + struct related_thread_group *rtg = p->grp; + + if (is_suh_max() && rtg && rtg->id == DEFAULT_CGROUP_COLOC_ID && + rtg->skip_min && p->unfilter) + return is_min_capacity_cpu(cpu); + + return false; +} + +extern bool is_rtgb_active(void); +extern u64 get_rtgb_active_time(void); + +/* utility function to update walt signals at wakeup */ +static inline void walt_try_to_wake_up(struct task_struct *p) +{ + struct rq *rq = cpu_rq(task_cpu(p)); + struct rq_flags rf; + u64 wallclock; + unsigned int old_load; + struct related_thread_group *grp = NULL; + + rq_lock_irqsave(rq, &rf); + old_load = task_load(p); + wallclock = sched_ktime_clock(); + walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0); + walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0); + note_task_waking(p, wallclock); + rq_unlock_irqrestore(rq, &rf); + + rcu_read_lock(); + grp = task_related_thread_group(p); + if (update_preferred_cluster(grp, p, old_load, false)) + set_preferred_cluster(grp); + rcu_read_unlock(); +} + +#else /* CONFIG_SCHED_WALT */ + +static inline void walt_sched_init_rq(struct rq *rq) { } +static inline void walt_update_last_enqueue(struct task_struct *p) { } +static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq, + int event, u64 wallclock, u64 irqtime) { } +static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ +} + +static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, + struct task_struct *p) +{ +} + +static inline void fixup_busy_time(struct task_struct *p, int new_cpu) { } +static inline void init_new_task_load(struct task_struct *p) +{ +} + +static inline void mark_task_starting(struct task_struct *p) { } +static inline void set_window_start(struct rq *rq) { } +static inline int sched_cpu_high_irqload(int cpu) { return 0; } + +static inline void sched_account_irqstart(int cpu, struct task_struct *curr, + u64 wallclock) +{ +} + +static inline void update_cluster_topology(void) { } +static inline void init_clusters(void) {} +static inline void sched_account_irqtime(int cpu, struct task_struct *curr, + u64 delta, u64 wallclock) +{ +} + +static inline int 
same_cluster(int src_cpu, int dst_cpu) { return 1; } +static inline bool do_pl_notif(struct rq *rq) { return false; } + +static inline void +inc_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void +dec_rq_walt_stats(struct rq *rq, struct task_struct *p) { } + +static inline void +fixup_walt_sched_stats_common(struct rq *rq, struct task_struct *p, + u16 updated_demand_scaled, + u16 updated_pred_demand_scaled) +{ +} + +static inline u64 sched_irqload(int cpu) +{ + return 0; +} + +static inline bool walt_should_kick_upmigrate(struct task_struct *p, int cpu) +{ + return false; +} + +static inline u64 get_rtgb_active_time(void) +{ + return 0; +} + +#define walt_try_to_wake_up(a) {} + +#endif /* CONFIG_SCHED_WALT */ + +#endif diff --git a/kernel/sysctl.c b/kernel/sysctl.c index f14c9f30c394..e28fb2d87ca4 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -126,6 +127,7 @@ static int sixty = 60; #endif static int __maybe_unused neg_one = -1; + static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long zero_ul; @@ -140,7 +142,12 @@ static int ten_thousand = 10000; static int six_hundred_forty_kb = 640 * 1024; #endif #ifdef CONFIG_SCHED_WALT +static int neg_three = -3; +static int three = 3; static int two_hundred_fifty_five = 255; +const int sched_user_hint_max = 1000; +static unsigned int ns_per_sec = NSEC_PER_SEC; +static unsigned int one_hundred_thousand = 100000; #endif /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ @@ -231,6 +238,10 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, #endif static int proc_dopipe_max_size(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); +#ifdef CONFIG_SCHED_WALT +static int proc_douintvec_minmax_schedhyst(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); +#endif #ifdef CONFIG_MAGIC_SYSRQ /* Note: sysrq code uses its own private copy */ @@ -328,6 +339,172 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHED_WALT + { + .procname = "sched_user_hint", + .data = &sysctl_sched_user_hint, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = walt_proc_user_hint_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&sched_user_hint_max, + }, + { + .procname = "sched_window_stats_policy", + .data = &sysctl_sched_window_stats_policy, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &four, + }, + { + .procname = "sched_cpu_high_irqload", + .data = &sysctl_sched_cpu_high_irqload, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, + { + .procname = "sched_group_upmigrate", + .data = &sysctl_sched_group_upmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = walt_proc_group_thresholds_handler, + .extra1 = &sysctl_sched_group_downmigrate_pct, + }, + { + .procname = "sched_group_downmigrate", + .data = &sysctl_sched_group_downmigrate_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = walt_proc_group_thresholds_handler, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_sched_group_upmigrate_pct, + }, + { + .procname = "sched_boost", + .data = &sysctl_sched_boost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_boost_handler, + .extra1 = &neg_three, + .extra2 = 
&three, + }, + { + .procname = "sched_conservative_pl", + .data = &sysctl_sched_conservative_pl, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_walt_rotate_big_tasks", + .data = &sysctl_sched_walt_rotate_big_tasks, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "sched_min_task_util_for_boost", + .data = &sysctl_sched_min_task_util_for_boost, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "sched_min_task_util_for_colocation", + .data = &sysctl_sched_min_task_util_for_colocation, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_thousand, + }, + { + .procname = "sched_asym_cap_sibling_freq_match_pct", + .data = &sysctl_sched_asym_cap_sibling_freq_match_pct, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &one_hundred, + }, + { + .procname = "sched_coloc_downmigrate_ns", + .data = &sysctl_sched_coloc_downmigrate_ns, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + }, + { + .procname = "sched_task_unfilter_nr_windows", + .data = &sysctl_sched_task_unfilter_nr_windows, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = &two_hundred_fifty_five, + }, + { + .procname = "sched_busy_hysteresis_enable_cpus", + .data = &sysctl_sched_busy_hyst_enable_cpus, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax_schedhyst, + .extra1 = SYSCTL_ZERO, + .extra2 = &two_hundred_fifty_five, + }, + { + .procname = "sched_busy_hyst_ns", + .data = &sysctl_sched_busy_hyst, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax_schedhyst, + .extra1 = SYSCTL_ZERO, + .extra2 = &ns_per_sec, + }, + { + .procname = "sched_coloc_busy_hysteresis_enable_cpus", + .data = &sysctl_sched_coloc_busy_hyst_enable_cpus, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax_schedhyst, + .extra1 = SYSCTL_ZERO, + .extra2 = &two_hundred_fifty_five, + }, + { + .procname = "sched_coloc_busy_hyst_ns", + .data = &sysctl_sched_coloc_busy_hyst, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax_schedhyst, + .extra1 = SYSCTL_ZERO, + .extra2 = &ns_per_sec, + }, + { + .procname = "sched_coloc_busy_hyst_max_ms", + .data = &sysctl_sched_coloc_busy_hyst_max_ms, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax_schedhyst, + .extra1 = SYSCTL_ZERO, + .extra2 = &one_hundred_thousand, + }, + { + .procname = "sched_ravg_window_nr_ticks", + .data = &sysctl_sched_ravg_window_nr_ticks, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sched_ravg_window_handler, + }, +#endif #ifdef CONFIG_SCHED_DEBUG { .procname = "sched_min_granularity_ns", @@ -2874,6 +3051,19 @@ static int proc_dostring_coredump(struct ctl_table *table, int write, } #endif +#ifdef CONFIG_SCHED_WALT +static int proc_douintvec_minmax_schedhyst(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int ret = 
proc_douintvec_minmax(table, write, buffer, lenp, ppos); + + if (!ret && write) + sched_update_hyst_times(); + + return ret; +} +#endif + static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos, @@ -3341,6 +3531,29 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, return err; } +static int do_proc_douintvec_rwin(bool *negp, unsigned long *lvalp, + int *valp, int write, void *data) +{ + if (write) { + if (*lvalp == 0 || *lvalp == 2 || *lvalp == 5) + *valp = *lvalp; + else + return -EINVAL; + } else { + *negp = false; + *lvalp = *valp; + } + + return 0; +} + +int proc_douintvec_ravg_window(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return do_proc_dointvec(table, write, buffer, lenp, ppos, + do_proc_douintvec_rwin, NULL); +} + #else /* CONFIG_PROC_SYSCTL */ int proc_dostring(struct ctl_table *table, int write, @@ -3410,6 +3623,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, return -ENOSYS; } +int proc_douintvec_ravg_window(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + return -ENOSYS; +} + #endif /* CONFIG_PROC_SYSCTL */ #if defined(CONFIG_SYSCTL)