lmkd: Introduce kill strategy based on zone watermarks, swap and thrashing
am: 561cfd9478
Change-Id: Ief53fac72c50bef2735382757c53466cf9222e62
This commit is contained in:
commit
ad233c87b0
2 changed files with 353 additions and 8 deletions
|
|
@ -60,6 +60,23 @@ properties:
|
|||
any eligible task (fast decision). Default = false
|
||||
|
||||
ro.lmk.kill_timeout_ms: duration in ms after a kill when no additional
|
||||
kill will be done, Default = 0 (disabled)
|
||||
kill will be done. Default = 0 (disabled)
|
||||
|
||||
ro.lmk.debug: enable lmkd debug logs, Default = false
|
||||
|
||||
ro.lmk.swap_free_low_percentage: level of free swap as a percentage of the
|
||||
total swap space used as a threshold to consider
|
||||
the system as swap space starved. Default for
|
||||
low-RAM devices = 10, for high-end devices = 20
|
||||
|
||||
ro.lmk.thrashing_limit: number of workingset refaults as a percentage of
|
||||
the file-backed pagecache size used as a threshold
|
||||
to consider system thrashing its pagecache.
|
||||
Default for low-RAM devices = 30, for high-end
|
||||
devices = 100
|
||||
|
||||
ro.lmk.thrashing_limit_decay: thrashing threshold decay expressed as a
|
||||
percentage of the original threshold used to lower
|
||||
the threshold when system does not recover even
|
||||
after a kill. Default for low-RAM devices = 50,
|
||||
for high-end devices = 10
|
||||
|
|
|
|||
342
lmkd/lmkd.c
342
lmkd/lmkd.c
|
|
@ -79,6 +79,7 @@
|
|||
#define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
|
||||
#define ZONEINFO_PATH "/proc/zoneinfo"
|
||||
#define MEMINFO_PATH "/proc/meminfo"
|
||||
#define VMSTAT_PATH "/proc/vmstat"
|
||||
#define PROC_STATUS_TGID_FIELD "Tgid:"
|
||||
#define LINE_MAX 128
|
||||
|
||||
|
|
@ -110,13 +111,29 @@
|
|||
* PSI_WINDOW_SIZE_MS after the event happens.
|
||||
*/
|
||||
#define PSI_WINDOW_SIZE_MS 1000
|
||||
/* Polling period after initial PSI signal */
|
||||
#define PSI_POLL_PERIOD_MS 10
|
||||
/* Polling period after PSI signal when pressure is high */
|
||||
#define PSI_POLL_PERIOD_SHORT_MS 10
|
||||
/* Polling period after PSI signal when pressure is low */
|
||||
#define PSI_POLL_PERIOD_LONG_MS 100
|
||||
|
||||
#define min(a, b) (((a) < (b)) ? (a) : (b))
|
||||
#define max(a, b) (((a) > (b)) ? (a) : (b))
|
||||
|
||||
#define FAIL_REPORT_RLIMIT_MS 1000
|
||||
|
||||
/*
|
||||
* System property defaults
|
||||
*/
|
||||
/* ro.lmk.swap_free_low_percentage property defaults */
|
||||
#define DEF_LOW_SWAP_LOWRAM 10
|
||||
#define DEF_LOW_SWAP 20
|
||||
/* ro.lmk.thrashing_limit property defaults */
|
||||
#define DEF_THRASHING_LOWRAM 30
|
||||
#define DEF_THRASHING 100
|
||||
/* ro.lmk.thrashing_limit_decay property defaults */
|
||||
#define DEF_THRASHING_DECAY_LOWRAM 50
|
||||
#define DEF_THRASHING_DECAY 10
|
||||
|
||||
/* default to old in-kernel interface if no memory pressure events */
|
||||
static bool use_inkernel_interface = true;
|
||||
static bool has_inkernel_module;
|
||||
|
|
@ -157,6 +174,8 @@ static unsigned long kill_timeout_ms;
|
|||
static bool use_minfree_levels;
|
||||
static bool per_app_memcg;
|
||||
static int swap_free_low_percentage;
|
||||
static int thrashing_limit_pct;
|
||||
static int thrashing_limit_decay_pct;
|
||||
static bool use_psi_monitors = false;
|
||||
static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
|
||||
{ PSI_SOME, 70 }, /* 70ms out of 1sec for partial stall */
|
||||
|
|
@ -390,6 +409,41 @@ union meminfo {
|
|||
int64_t arr[MI_FIELD_COUNT];
|
||||
};
|
||||
|
||||
/*
 * Fields to parse in /proc/vmstat.
 *
 * enum vmstat_field, vmstat_field_names[] and union vmstat describe the same
 * set of counters and must be kept in the same order: a value matched against
 * vmstat_field_names[i] is stored into union vmstat's arr[i], which aliases
 * the i-th member of the 'field' struct.
 */
enum vmstat_field {
    VS_FREE_PAGES,
    VS_INACTIVE_FILE,
    VS_ACTIVE_FILE,
    VS_WORKINGSET_REFAULT,
    VS_PGSCAN_KSWAPD,
    VS_PGSCAN_DIRECT,
    VS_PGSCAN_DIRECT_THROTTLE,
    VS_FIELD_COUNT
};

/*
 * Counter names as spelled in /proc/vmstat, indexed by enum vmstat_field.
 * The array is sized by VS_FIELD_COUNT (not the meminfo field count) and uses
 * designated initializers so each name stays attached to its enum value.
 */
static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
    [VS_FREE_PAGES] = "nr_free_pages",
    [VS_INACTIVE_FILE] = "nr_inactive_file",
    [VS_ACTIVE_FILE] = "nr_active_file",
    [VS_WORKINGSET_REFAULT] = "workingset_refault",
    [VS_PGSCAN_KSWAPD] = "pgscan_kswapd",
    [VS_PGSCAN_DIRECT] = "pgscan_direct",
    [VS_PGSCAN_DIRECT_THROTTLE] = "pgscan_direct_throttle",
};

/*
 * Parsed /proc/vmstat counters, accessible either by name (field.*) or by
 * enum vmstat_field index (arr[]).
 */
union vmstat {
    struct {
        int64_t nr_free_pages;
        int64_t nr_inactive_file;
        int64_t nr_active_file;
        int64_t workingset_refault;
        int64_t pgscan_kswapd;
        int64_t pgscan_direct;
        int64_t pgscan_direct_throttle;
    } field;
    int64_t arr[VS_FIELD_COUNT];
};
|
||||
|
||||
enum field_match_result {
|
||||
NO_MATCH,
|
||||
PARSE_FAIL,
|
||||
|
|
@ -445,6 +499,10 @@ static long page_k;
|
|||
static char* proc_get_name(int pid);
|
||||
static void poll_kernel();
|
||||
|
||||
/*
 * Limit 'value' to the range [low, high].
 * The upper bound is applied first and the lower bound last, so 'low' wins
 * if the caller passes low > high.
 */
static int clamp(int low, int high, int value) {
    int capped = min(value, high);

    return max(capped, low);
}
|
||||
|
||||
static bool parse_int64(const char* str, int64_t* ret) {
|
||||
char* endptr;
|
||||
long long val = strtoll(str, &endptr, 10);
|
||||
|
|
@ -1248,7 +1306,7 @@ static int memory_stat_from_procfs(struct memory_stat* mem_st, int pid) {
|
|||
#endif
|
||||
|
||||
/*
|
||||
* /prop/zoneinfo parsing routines
|
||||
* /proc/zoneinfo parsing routines
|
||||
* Expected file format is:
|
||||
*
|
||||
* Node <node_id>, zone <zone_name>
|
||||
|
|
@ -1442,7 +1500,7 @@ static int zoneinfo_parse(struct zoneinfo *zi) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* /prop/meminfo parsing routines */
|
||||
/* /proc/meminfo parsing routines */
|
||||
static bool meminfo_parse_line(char *line, union meminfo *mi) {
|
||||
char *cp = line;
|
||||
char *ap;
|
||||
|
|
@ -1497,6 +1555,59 @@ static int meminfo_parse(union meminfo *mi) {
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* /proc/vmstat parsing routines */
|
||||
static bool vmstat_parse_line(char *line, union vmstat *vs) {
|
||||
char *cp;
|
||||
char *ap;
|
||||
char *save_ptr;
|
||||
int64_t val;
|
||||
int field_idx;
|
||||
enum field_match_result match_res;
|
||||
|
||||
cp = strtok_r(line, " ", &save_ptr);
|
||||
if (!cp) {
|
||||
return false;
|
||||
}
|
||||
|
||||
ap = strtok_r(NULL, " ", &save_ptr);
|
||||
if (!ap) {
|
||||
return false;
|
||||
}
|
||||
|
||||
match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
|
||||
&val, &field_idx);
|
||||
if (match_res == PARSE_SUCCESS) {
|
||||
vs->arr[field_idx] = val;
|
||||
}
|
||||
return (match_res != PARSE_FAIL);
|
||||
}
|
||||
|
||||
static int vmstat_parse(union vmstat *vs) {
|
||||
static struct reread_data file_data = {
|
||||
.filename = VMSTAT_PATH,
|
||||
.fd = -1,
|
||||
};
|
||||
char *buf;
|
||||
char *save_ptr;
|
||||
char *line;
|
||||
|
||||
memset(vs, 0, sizeof(union vmstat));
|
||||
|
||||
if ((buf = reread_file(&file_data)) == NULL) {
|
||||
return -1;
|
||||
}
|
||||
|
||||
for (line = strtok_r(buf, "\n", &save_ptr); line;
|
||||
line = strtok_r(NULL, "\n", &save_ptr)) {
|
||||
if (!vmstat_parse_line(line, vs)) {
|
||||
ALOGE("%s parse error", file_data.filename);
|
||||
return -1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void meminfo_log(union meminfo *mi) {
|
||||
for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
|
||||
android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
|
||||
|
|
@ -1833,6 +1944,219 @@ static bool is_kill_pending(void) {
|
|||
return false;
|
||||
}
|
||||
|
||||
/*
 * Zone watermark levels ordered from most to least severe, so a numerically
 * smaller value means a lower (more critical) watermark was breached.
 * WMARK_NONE means no watermark is breached.
 */
enum zone_watermark {
    WMARK_MIN = 0,
    WMARK_LOW = 1,
    WMARK_HIGH = 2,
    WMARK_NONE = 3
};
|
||||
|
||||
/*
|
||||
* Returns lowest breached watermark or WMARK_NONE.
|
||||
*/
|
||||
static enum zone_watermark get_lowest_watermark(struct zoneinfo *zi)
|
||||
{
|
||||
enum zone_watermark wmark = WMARK_NONE;
|
||||
|
||||
for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
|
||||
struct zoneinfo_node *node = &zi->nodes[node_idx];
|
||||
|
||||
for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
|
||||
struct zoneinfo_zone *zone = &node->zones[zone_idx];
|
||||
int zone_free_mem;
|
||||
|
||||
if (!zone->fields.field.present) {
|
||||
continue;
|
||||
}
|
||||
|
||||
zone_free_mem = zone->fields.field.nr_free_pages - zone->fields.field.nr_free_cma;
|
||||
if (zone_free_mem > zone->max_protection + zone->fields.field.high) {
|
||||
continue;
|
||||
}
|
||||
if (zone_free_mem > zone->max_protection + zone->fields.field.low) {
|
||||
if (wmark > WMARK_HIGH) wmark = WMARK_HIGH;
|
||||
continue;
|
||||
}
|
||||
if (zone_free_mem > zone->max_protection + zone->fields.field.min) {
|
||||
if (wmark > WMARK_LOW) wmark = WMARK_LOW;
|
||||
continue;
|
||||
}
|
||||
wmark = WMARK_MIN;
|
||||
}
|
||||
}
|
||||
|
||||
return wmark;
|
||||
}
|
||||
|
||||
static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
|
||||
enum kill_reasons {
|
||||
NONE = -1, /* To denote no kill condition */
|
||||
PRESSURE_AFTER_KILL = 0,
|
||||
NOT_RESPONDING,
|
||||
LOW_SWAP_AND_THRASHING,
|
||||
LOW_MEM_AND_SWAP,
|
||||
LOW_MEM_AND_THRASHING,
|
||||
DIRECT_RECL_AND_THRASHING,
|
||||
KILL_REASON_COUNT
|
||||
};
|
||||
enum reclaim_state {
|
||||
NO_RECLAIM = 0,
|
||||
KSWAPD_RECLAIM,
|
||||
DIRECT_RECLAIM,
|
||||
};
|
||||
static int64_t init_ws_refault;
|
||||
static int64_t base_file_lru;
|
||||
static int64_t init_pgscan_kswapd;
|
||||
static int64_t init_pgscan_direct;
|
||||
static int64_t swap_low_threshold;
|
||||
static bool killing;
|
||||
static int thrashing_limit;
|
||||
static bool in_reclaim;
|
||||
|
||||
union meminfo mi;
|
||||
union vmstat vs;
|
||||
struct zoneinfo zi;
|
||||
struct timespec curr_tm;
|
||||
int64_t thrashing = 0;
|
||||
bool swap_is_low = false;
|
||||
enum vmpressure_level level = (enum vmpressure_level)data;
|
||||
enum kill_reasons kill_reason = NONE;
|
||||
bool cycle_after_kill = false;
|
||||
enum reclaim_state reclaim = NO_RECLAIM;
|
||||
enum zone_watermark wmark = WMARK_NONE;
|
||||
|
||||
/* Skip while still killing a process */
|
||||
if (is_kill_pending()) {
|
||||
/* TODO: replace this quick polling with pidfd polling if kernel supports */
|
||||
goto no_kill;
|
||||
}
|
||||
|
||||
if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
|
||||
ALOGE("Failed to get current time");
|
||||
return;
|
||||
}
|
||||
|
||||
if (vmstat_parse(&vs) < 0) {
|
||||
ALOGE("Failed to parse vmstat!");
|
||||
return;
|
||||
}
|
||||
|
||||
if (meminfo_parse(&mi) < 0) {
|
||||
ALOGE("Failed to parse meminfo!");
|
||||
return;
|
||||
}
|
||||
|
||||
/* Reset states after process got killed */
|
||||
if (killing) {
|
||||
killing = false;
|
||||
cycle_after_kill = true;
|
||||
/* Reset file-backed pagecache size and refault amounts after a kill */
|
||||
base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
|
||||
init_ws_refault = vs.field.workingset_refault;
|
||||
}
|
||||
|
||||
/* Check free swap levels */
|
||||
if (swap_free_low_percentage) {
|
||||
if (!swap_low_threshold) {
|
||||
swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
|
||||
}
|
||||
swap_is_low = mi.field.free_swap < swap_low_threshold;
|
||||
}
|
||||
|
||||
/* Identify reclaim state */
|
||||
if (vs.field.pgscan_direct > init_pgscan_direct) {
|
||||
init_pgscan_direct = vs.field.pgscan_direct;
|
||||
init_pgscan_kswapd = vs.field.pgscan_kswapd;
|
||||
reclaim = DIRECT_RECLAIM;
|
||||
} else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
|
||||
init_pgscan_kswapd = vs.field.pgscan_kswapd;
|
||||
reclaim = KSWAPD_RECLAIM;
|
||||
} else {
|
||||
in_reclaim = false;
|
||||
/* Skip if system is not reclaiming */
|
||||
goto no_kill;
|
||||
}
|
||||
|
||||
if (!in_reclaim) {
|
||||
/* Record file-backed pagecache size when entering reclaim cycle */
|
||||
base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
|
||||
init_ws_refault = vs.field.workingset_refault;
|
||||
thrashing_limit = thrashing_limit_pct;
|
||||
} else {
|
||||
/* Calculate what % of the file-backed pagecache refaulted so far */
|
||||
thrashing = (vs.field.workingset_refault - init_ws_refault) * 100 / base_file_lru;
|
||||
}
|
||||
in_reclaim = true;
|
||||
|
||||
/* Find out which watermark is breached if any */
|
||||
if (zoneinfo_parse(&zi) < 0) {
|
||||
ALOGE("Failed to parse zoneinfo!");
|
||||
return;
|
||||
}
|
||||
wmark = get_lowest_watermark(&zi);
|
||||
|
||||
/*
|
||||
* TODO: move this logic into a separate function
|
||||
* Decide if killing a process is necessary and record the reason
|
||||
*/
|
||||
if (cycle_after_kill && wmark < WMARK_LOW) {
|
||||
/*
|
||||
* Prevent kills not freeing enough memory which might lead to OOM kill.
|
||||
* This might happen when a process is consuming memory faster than reclaim can
|
||||
* free even after a kill. Mostly happens when running memory stress tests.
|
||||
*/
|
||||
kill_reason = PRESSURE_AFTER_KILL;
|
||||
} else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
|
||||
/*
|
||||
* Device is too busy reclaiming memory which might lead to ANR.
|
||||
* Critical level is triggered when PSI complete stall (all tasks are blocked because
|
||||
* of the memory congestion) breaches the configured threshold.
|
||||
*/
|
||||
kill_reason = NOT_RESPONDING;
|
||||
} else if (swap_is_low && thrashing > thrashing_limit_pct) {
|
||||
/* Page cache is thrashing while swap is low */
|
||||
kill_reason = LOW_SWAP_AND_THRASHING;
|
||||
} else if (swap_is_low && wmark < WMARK_HIGH) {
|
||||
/* Both free memory and swap are low */
|
||||
kill_reason = LOW_MEM_AND_SWAP;
|
||||
} else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
|
||||
/* Page cache is thrashing while memory is low */
|
||||
thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
|
||||
kill_reason = LOW_MEM_AND_THRASHING;
|
||||
} else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
|
||||
/* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
|
||||
thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
|
||||
kill_reason = DIRECT_RECL_AND_THRASHING;
|
||||
}
|
||||
|
||||
/* Kill a process if necessary */
|
||||
if (kill_reason != NONE) {
|
||||
int pages_freed = find_and_kill_process(0);
|
||||
killing = (pages_freed > 0);
|
||||
meminfo_log(&mi);
|
||||
}
|
||||
|
||||
no_kill:
|
||||
/*
|
||||
* Start polling after initial PSI event;
|
||||
* extend polling while device is in direct reclaim or process is being killed;
|
||||
* do not extend when kswapd reclaims because that might go on for a long time
|
||||
* without causing memory pressure
|
||||
*/
|
||||
if (events || killing || reclaim == DIRECT_RECLAIM) {
|
||||
poll_params->update = POLLING_START;
|
||||
}
|
||||
|
||||
/* Decide the polling interval */
|
||||
if (swap_is_low || killing) {
|
||||
/* Fast polling during and after a kill or when swap is low */
|
||||
poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
|
||||
} else {
|
||||
/* By default use long intervals */
|
||||
poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
|
||||
}
|
||||
}
|
||||
|
||||
static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
|
||||
int ret;
|
||||
unsigned long long evcount;
|
||||
|
|
@ -1881,7 +2205,7 @@ static void mp_event_common(int data, uint32_t events, struct polling_params *po
|
|||
if (use_psi_monitors && events) {
|
||||
/* Override polling params only if current event is more critical */
|
||||
if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
|
||||
poll_params->polling_interval_ms = PSI_POLL_PERIOD_MS;
|
||||
poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
|
||||
poll_params->update = POLLING_START;
|
||||
}
|
||||
}
|
||||
|
|
@ -2483,8 +2807,12 @@ int main(int argc __unused, char **argv __unused) {
|
|||
property_get_bool("ro.lmk.use_minfree_levels", false);
|
||||
per_app_memcg =
|
||||
property_get_bool("ro.config.per_app_memcg", low_ram_device);
|
||||
swap_free_low_percentage =
|
||||
property_get_int32("ro.lmk.swap_free_low_percentage", 10);
|
||||
swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
|
||||
low_ram_device ? DEF_LOW_SWAP_LOWRAM : DEF_LOW_SWAP));
|
||||
thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
|
||||
low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
|
||||
thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
|
||||
low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
|
||||
|
||||
ctx = create_android_logger(MEMINFO_LOG_TAG);
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue