diff --git a/lmkd/README.md b/lmkd/README.md
index 656a6ea0a..a735955ae 100644
--- a/lmkd/README.md
+++ b/lmkd/README.md
@@ -60,6 +60,23 @@ properties:
                             any eligible task (fast decision). Default = false
 
   ro.lmk.kill_timeout_ms: duration in ms after a kill when no additional
-                          kill will be done, Default = 0 (disabled)
+                          kill will be done. Default = 0 (disabled)
 
   ro.lmk.debug: enable lmkd debug logs, Default = false
+
+  ro.lmk.swap_free_low_percentage: level of free swap as a percentage of the
+                                   total swap space, used as a threshold to consider
+                                   the system swap-space starved. Default for
+                                   low-RAM devices = 10, for high-end devices = 20
+
+  ro.lmk.thrashing_limit: number of workingset refaults as a percentage of the
+                          file-backed pagecache size, used as a threshold to
+                          consider the system to be thrashing its pagecache.
+                          Default for low-RAM devices = 30, for high-end
+                          devices = 100
+
+  ro.lmk.thrashing_limit_decay: thrashing threshold decay expressed as a
+                                percentage of the original threshold, used to lower
+                                the threshold when the system does not recover even
+                                after a kill. Default for low-RAM devices = 50,
+                                for high-end devices = 10
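Illustration (not part of the patch): a minimal sketch of how the three new properties turn into the thresholds lmkd works with, assuming a hypothetical device with 2 GB of swap and the high-end defaults documented above. The variable names mirror the patch; the program itself is illustrative only.

    #include <inttypes.h>
    #include <stdio.h>

    int main(void) {
        /* Hypothetical device: 2 GB of swap (in 4 KB pages), high-end defaults. */
        int64_t total_swap_pages = (2LL * 1024 * 1024 * 1024) / 4096;  /* 524288 pages */
        int swap_free_low_percentage = 20;   /* ro.lmk.swap_free_low_percentage */
        int thrashing_limit_pct = 100;       /* ro.lmk.thrashing_limit */
        int thrashing_limit_decay_pct = 10;  /* ro.lmk.thrashing_limit_decay */

        /* Swap is considered low once fewer than this many pages are free (~104857 here). */
        int64_t swap_low_threshold = total_swap_pages * swap_free_low_percentage / 100;

        /* Each kill while thrashing persists decays the limit: 100% -> 90% -> 81% -> ... */
        int thrashing_limit = thrashing_limit_pct * (100 - thrashing_limit_decay_pct) / 100;

        printf("swap_low_threshold=%" PRId64 " pages, decayed thrashing_limit=%d%%\n",
               swap_low_threshold, thrashing_limit);
        return 0;
    }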
diff --git a/lmkd/lmkd.c b/lmkd/lmkd.c
index 04662feda..221fbc736 100644
--- a/lmkd/lmkd.c
+++ b/lmkd/lmkd.c
@@ -79,6 +79,7 @@
 #define MEMCG_MEMORYSW_USAGE "/dev/memcg/memory.memsw.usage_in_bytes"
 #define ZONEINFO_PATH "/proc/zoneinfo"
 #define MEMINFO_PATH "/proc/meminfo"
+#define VMSTAT_PATH "/proc/vmstat"
 #define PROC_STATUS_TGID_FIELD "Tgid:"
 
 #define LINE_MAX 128
@@ -110,13 +111,29 @@
  * PSI_WINDOW_SIZE_MS after the event happens.
  */
 #define PSI_WINDOW_SIZE_MS 1000
-/* Polling period after initial PSI signal */
-#define PSI_POLL_PERIOD_MS 10
+/* Polling period after PSI signal when pressure is high */
+#define PSI_POLL_PERIOD_SHORT_MS 10
+/* Polling period after PSI signal when pressure is low */
+#define PSI_POLL_PERIOD_LONG_MS 100
 
 #define min(a, b) (((a) < (b)) ? (a) : (b))
+#define max(a, b) (((a) > (b)) ? (a) : (b))
 
 #define FAIL_REPORT_RLIMIT_MS 1000
 
+/*
+ * System property defaults
+ */
+/* ro.lmk.swap_free_low_percentage property defaults */
+#define DEF_LOW_SWAP_LOWRAM 10
+#define DEF_LOW_SWAP 20
+/* ro.lmk.thrashing_limit property defaults */
+#define DEF_THRASHING_LOWRAM 30
+#define DEF_THRASHING 100
+/* ro.lmk.thrashing_limit_decay property defaults */
+#define DEF_THRASHING_DECAY_LOWRAM 50
+#define DEF_THRASHING_DECAY 10
+
 /* default to old in-kernel interface if no memory pressure events */
 static bool use_inkernel_interface = true;
 static bool has_inkernel_module;
@@ -157,6 +174,8 @@ static unsigned long kill_timeout_ms;
 static bool use_minfree_levels;
 static bool per_app_memcg;
 static int swap_free_low_percentage;
+static int thrashing_limit_pct;
+static int thrashing_limit_decay_pct;
 static bool use_psi_monitors = false;
 static struct psi_threshold psi_thresholds[VMPRESS_LEVEL_COUNT] = {
     { PSI_SOME, 70 },    /* 70ms out of 1sec for partial stall */
@@ -390,6 +409,41 @@ union meminfo {
     int64_t arr[MI_FIELD_COUNT];
 };
 
+/* Fields to parse in /proc/vmstat */
+enum vmstat_field {
+    VS_FREE_PAGES,
+    VS_INACTIVE_FILE,
+    VS_ACTIVE_FILE,
+    VS_WORKINGSET_REFAULT,
+    VS_PGSCAN_KSWAPD,
+    VS_PGSCAN_DIRECT,
+    VS_PGSCAN_DIRECT_THROTTLE,
+    VS_FIELD_COUNT
+};
+
+static const char* const vmstat_field_names[VS_FIELD_COUNT] = {
+    "nr_free_pages",
+    "nr_inactive_file",
+    "nr_active_file",
+    "workingset_refault",
+    "pgscan_kswapd",
+    "pgscan_direct",
+    "pgscan_direct_throttle",
+};
+
+union vmstat {
+    struct {
+        int64_t nr_free_pages;
+        int64_t nr_inactive_file;
+        int64_t nr_active_file;
+        int64_t workingset_refault;
+        int64_t pgscan_kswapd;
+        int64_t pgscan_direct;
+        int64_t pgscan_direct_throttle;
+    } field;
+    int64_t arr[VS_FIELD_COUNT];
+};
+
 enum field_match_result {
     NO_MATCH,
     PARSE_FAIL,
@@ -445,6 +499,10 @@ static long page_k;
 static char* proc_get_name(int pid);
 static void poll_kernel();
 
+static int clamp(int low, int high, int value) {
+    return max(min(value, high), low);
+}
+
 static bool parse_int64(const char* str, int64_t* ret) {
     char* endptr;
     long long val = strtoll(str, &endptr, 10);
@@ -1248,7 +1306,7 @@ static int memory_stat_from_procfs(struct memory_stat* mem_st, int pid) {
 #endif
 
 /*
- * /prop/zoneinfo parsing routines
+ * /proc/zoneinfo parsing routines
  * Expected file format is:
  *
  *   Node <node_id>, zone   <zone_name>
@@ -1442,7 +1500,7 @@ static int zoneinfo_parse(struct zoneinfo *zi) {
     return 0;
 }
 
-/* /prop/meminfo parsing routines */
+/* /proc/meminfo parsing routines */
 static bool meminfo_parse_line(char *line, union meminfo *mi) {
     char *cp = line;
     char *ap;
@@ -1497,6 +1555,59 @@ static int meminfo_parse(union meminfo *mi) {
     return 0;
 }
 
+/* /proc/vmstat parsing routines */
+static bool vmstat_parse_line(char *line, union vmstat *vs) {
+    char *cp;
+    char *ap;
+    char *save_ptr;
+    int64_t val;
+    int field_idx;
+    enum field_match_result match_res;
+
+    cp = strtok_r(line, " ", &save_ptr);
+    if (!cp) {
+        return false;
+    }
+
+    ap = strtok_r(NULL, " ", &save_ptr);
+    if (!ap) {
+        return false;
+    }
+
+    match_res = match_field(cp, ap, vmstat_field_names, VS_FIELD_COUNT,
+                            &val, &field_idx);
+    if (match_res == PARSE_SUCCESS) {
+        vs->arr[field_idx] = val;
+    }
+    return (match_res != PARSE_FAIL);
+}
+
+static int vmstat_parse(union vmstat *vs) {
+    static struct reread_data file_data = {
+        .filename = VMSTAT_PATH,
+        .fd = -1,
+    };
+    char *buf;
+    char *save_ptr;
+    char *line;
+
+    memset(vs, 0, sizeof(union vmstat));
+
+    if ((buf = reread_file(&file_data)) == NULL) {
+        return -1;
+    }
+
+    for (line = strtok_r(buf, "\n", &save_ptr); line;
+         line = strtok_r(NULL, "\n", &save_ptr)) {
+        if (!vmstat_parse_line(line, vs)) {
+            ALOGE("%s parse error", file_data.filename);
+            return -1;
+        }
+    }
+
+    return 0;
+}
+
 static void meminfo_log(union meminfo *mi) {
     for (int field_idx = 0; field_idx < MI_FIELD_COUNT; field_idx++) {
         android_log_write_int32(ctx, (int32_t)min(mi->arr[field_idx] * page_k, INT32_MAX));
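Illustration (not part of the patch): a standalone sketch of the pgscan-delta idea that mp_event_psi() below uses to classify reclaim activity. read_vmstat_counter() is a hypothetical helper, not the patch's vmstat_parse(); it assumes a kernel that exports the aggregated pgscan_kswapd/pgscan_direct counters in /proc/vmstat.

    #include <stdio.h>
    #include <string.h>
    #include <stdint.h>
    #include <unistd.h>

    /* Hypothetical helper: read one counter from /proc/vmstat. */
    static int64_t read_vmstat_counter(const char *name) {
        char key[64];
        long long value;
        int64_t result = -1;
        FILE *f = fopen("/proc/vmstat", "r");

        if (!f) return -1;
        while (fscanf(f, "%63s %lld", key, &value) == 2) {
            if (strcmp(key, name) == 0) {
                result = value;
                break;
            }
        }
        fclose(f);
        return result;
    }

    int main(void) {
        /* Two samples one second apart, mirroring lmkd's polling cycle. */
        int64_t kswapd0 = read_vmstat_counter("pgscan_kswapd");
        int64_t direct0 = read_vmstat_counter("pgscan_direct");
        sleep(1);
        int64_t kswapd1 = read_vmstat_counter("pgscan_kswapd");
        int64_t direct1 = read_vmstat_counter("pgscan_direct");

        /* Direct reclaim takes precedence: it means allocations are stalling right now. */
        if (direct1 > direct0)
            printf("direct reclaim in progress\n");
        else if (kswapd1 > kswapd0)
            printf("kswapd reclaim in progress\n");
        else
            printf("no reclaim between samples\n");
        return 0;
    }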
@@ -1833,6 +1944,219 @@ static bool is_kill_pending(void) {
     return false;
 }
 
+enum zone_watermark {
+    WMARK_MIN = 0,
+    WMARK_LOW,
+    WMARK_HIGH,
+    WMARK_NONE
+};
+
+/*
+ * Returns lowest breached watermark or WMARK_NONE.
+ */
+static enum zone_watermark get_lowest_watermark(struct zoneinfo *zi)
+{
+    enum zone_watermark wmark = WMARK_NONE;
+
+    for (int node_idx = 0; node_idx < zi->node_count; node_idx++) {
+        struct zoneinfo_node *node = &zi->nodes[node_idx];
+
+        for (int zone_idx = 0; zone_idx < node->zone_count; zone_idx++) {
+            struct zoneinfo_zone *zone = &node->zones[zone_idx];
+            int zone_free_mem;
+
+            if (!zone->fields.field.present) {
+                continue;
+            }
+
+            zone_free_mem = zone->fields.field.nr_free_pages - zone->fields.field.nr_free_cma;
+            if (zone_free_mem > zone->max_protection + zone->fields.field.high) {
+                continue;
+            }
+            if (zone_free_mem > zone->max_protection + zone->fields.field.low) {
+                if (wmark > WMARK_HIGH) wmark = WMARK_HIGH;
+                continue;
+            }
+            if (zone_free_mem > zone->max_protection + zone->fields.field.min) {
+                if (wmark > WMARK_LOW) wmark = WMARK_LOW;
+                continue;
+            }
+            wmark = WMARK_MIN;
+        }
+    }
+
+    return wmark;
+}
+
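Illustration (not part of the patch): a worked example of the per-zone classification above, with made-up zoneinfo numbers (all in pages). The comparison order — high, then low, then min, each raised by the zone's lowmem protection and with CMA pages excluded — follows get_lowest_watermark().

    #include <stdio.h>

    int main(void) {
        /* Hypothetical zone: watermarks and protection in pages. */
        int min = 2000, low = 2500, high = 3000, max_protection = 500;
        int nr_free_pages = 3200, nr_free_cma = 300;

        int zone_free_mem = nr_free_pages - nr_free_cma;  /* 2900 usable pages */

        if (zone_free_mem > max_protection + high)        /* 2900 > 3500 ? no  */
            printf("no watermark breached\n");
        else if (zone_free_mem > max_protection + low)    /* 2900 > 3000 ? no  */
            printf("high watermark breached\n");
        else if (zone_free_mem > max_protection + min)    /* 2900 > 2500 ? yes */
            printf("low watermark breached\n");
        else
            printf("min watermark breached\n");
        return 0;
    }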
+static void mp_event_psi(int data, uint32_t events, struct polling_params *poll_params) {
+    enum kill_reasons {
+        NONE = -1, /* To denote no kill condition */
+        PRESSURE_AFTER_KILL = 0,
+        NOT_RESPONDING,
+        LOW_SWAP_AND_THRASHING,
+        LOW_MEM_AND_SWAP,
+        LOW_MEM_AND_THRASHING,
+        DIRECT_RECL_AND_THRASHING,
+        KILL_REASON_COUNT
+    };
+    enum reclaim_state {
+        NO_RECLAIM = 0,
+        KSWAPD_RECLAIM,
+        DIRECT_RECLAIM,
+    };
+    static int64_t init_ws_refault;
+    static int64_t base_file_lru;
+    static int64_t init_pgscan_kswapd;
+    static int64_t init_pgscan_direct;
+    static int64_t swap_low_threshold;
+    static bool killing;
+    static int thrashing_limit;
+    static bool in_reclaim;
+
+    union meminfo mi;
+    union vmstat vs;
+    struct zoneinfo zi;
+    struct timespec curr_tm;
+    int64_t thrashing = 0;
+    bool swap_is_low = false;
+    enum vmpressure_level level = (enum vmpressure_level)data;
+    enum kill_reasons kill_reason = NONE;
+    bool cycle_after_kill = false;
+    enum reclaim_state reclaim = NO_RECLAIM;
+    enum zone_watermark wmark = WMARK_NONE;
+
+    /* Skip while still killing a process */
+    if (is_kill_pending()) {
+        /* TODO: replace this quick polling with pidfd polling if kernel supports */
+        goto no_kill;
+    }
+
+    if (clock_gettime(CLOCK_MONOTONIC_COARSE, &curr_tm) != 0) {
+        ALOGE("Failed to get current time");
+        return;
+    }
+
+    if (vmstat_parse(&vs) < 0) {
+        ALOGE("Failed to parse vmstat!");
+        return;
+    }
+
+    if (meminfo_parse(&mi) < 0) {
+        ALOGE("Failed to parse meminfo!");
+        return;
+    }
+
+    /* Reset states after process got killed */
+    if (killing) {
+        killing = false;
+        cycle_after_kill = true;
+        /* Reset file-backed pagecache size and refault amounts after a kill */
+        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
+        init_ws_refault = vs.field.workingset_refault;
+    }
+
+    /* Check free swap levels */
+    if (swap_free_low_percentage) {
+        if (!swap_low_threshold) {
+            swap_low_threshold = mi.field.total_swap * swap_free_low_percentage / 100;
+        }
+        swap_is_low = mi.field.free_swap < swap_low_threshold;
+    }
+
+    /* Identify reclaim state */
+    if (vs.field.pgscan_direct > init_pgscan_direct) {
+        init_pgscan_direct = vs.field.pgscan_direct;
+        init_pgscan_kswapd = vs.field.pgscan_kswapd;
+        reclaim = DIRECT_RECLAIM;
+    } else if (vs.field.pgscan_kswapd > init_pgscan_kswapd) {
+        init_pgscan_kswapd = vs.field.pgscan_kswapd;
+        reclaim = KSWAPD_RECLAIM;
+    } else {
+        in_reclaim = false;
+        /* Skip if system is not reclaiming */
+        goto no_kill;
+    }
+
+    if (!in_reclaim) {
+        /* Record file-backed pagecache size when entering reclaim cycle */
+        base_file_lru = vs.field.nr_inactive_file + vs.field.nr_active_file;
+        init_ws_refault = vs.field.workingset_refault;
+        thrashing_limit = thrashing_limit_pct;
+    } else {
+        /* Calculate what % of the file-backed pagecache refaulted so far */
+        thrashing = (vs.field.workingset_refault - init_ws_refault) * 100 / base_file_lru;
+    }
+    in_reclaim = true;
+
+    /* Find out which watermark is breached if any */
+    if (zoneinfo_parse(&zi) < 0) {
+        ALOGE("Failed to parse zoneinfo!");
+        return;
+    }
+    wmark = get_lowest_watermark(&zi);
+
+    /*
+     * TODO: move this logic into a separate function
+     * Decide if killing a process is necessary and record the reason
+     */
+    if (cycle_after_kill && wmark < WMARK_LOW) {
+        /*
+         * Prevent kills not freeing enough memory which might lead to OOM kill.
+         * This might happen when a process is consuming memory faster than reclaim can
+         * free even after a kill. Mostly happens when running memory stress tests.
+         */
+        kill_reason = PRESSURE_AFTER_KILL;
+    } else if (level == VMPRESS_LEVEL_CRITICAL && events != 0) {
+        /*
+         * Device is too busy reclaiming memory which might lead to ANR.
+         * Critical level is triggered when PSI complete stall (all tasks are blocked because
+         * of the memory congestion) breaches the configured threshold.
+         */
+        kill_reason = NOT_RESPONDING;
+    } else if (swap_is_low && thrashing > thrashing_limit_pct) {
+        /* Page cache is thrashing while swap is low */
+        kill_reason = LOW_SWAP_AND_THRASHING;
+    } else if (swap_is_low && wmark < WMARK_HIGH) {
+        /* Both free memory and swap are low */
+        kill_reason = LOW_MEM_AND_SWAP;
+    } else if (wmark < WMARK_HIGH && thrashing > thrashing_limit) {
+        /* Page cache is thrashing while memory is low */
+        thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
+        kill_reason = LOW_MEM_AND_THRASHING;
+    } else if (reclaim == DIRECT_RECLAIM && thrashing > thrashing_limit) {
+        /* Page cache is thrashing while in direct reclaim (mostly happens on lowram devices) */
+        thrashing_limit = (thrashing_limit * (100 - thrashing_limit_decay_pct)) / 100;
+        kill_reason = DIRECT_RECL_AND_THRASHING;
+    }
+
+    /* Kill a process if necessary */
+    if (kill_reason != NONE) {
+        int pages_freed = find_and_kill_process(0);
+        killing = (pages_freed > 0);
+        meminfo_log(&mi);
+    }
+
+no_kill:
+    /*
+     * Start polling after initial PSI event;
+     * extend polling while device is in direct reclaim or process is being killed;
+     * do not extend when kswapd reclaims because that might go on for a long time
+     * without causing memory pressure
+     */
+    if (events || killing || reclaim == DIRECT_RECLAIM) {
+        poll_params->update = POLLING_START;
+    }
+
+    /* Decide the polling interval */
+    if (swap_is_low || killing) {
+        /* Fast polling during and after a kill or when swap is low */
+        poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
+    } else {
+        /* By default use long intervals */
+        poll_params->polling_interval_ms = PSI_POLL_PERIOD_LONG_MS;
+    }
+}
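Illustration (not part of the patch): a worked example of the thrashing percentage computed in mp_event_psi() above, using made-up vmstat samples. A result of 60% would cross the low-RAM default limit of 30 but not the high-end default of 100.

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        /* Hypothetical vmstat samples, in pages. */
        int64_t base_file_lru = 50000;     /* nr_inactive_file + nr_active_file at reclaim start */
        int64_t init_ws_refault = 120000;  /* workingset_refault at reclaim start */
        int64_t cur_ws_refault = 150000;   /* workingset_refault now */

        /* Same formula as mp_event_psi(): refaults since reclaim start as % of the base file LRU. */
        int64_t thrashing = (cur_ws_refault - init_ws_refault) * 100 / base_file_lru;

        printf("thrashing = %lld%%\n", (long long)thrashing);  /* 60% */
        return 0;
    }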
+
 static void mp_event_common(int data, uint32_t events, struct polling_params *poll_params) {
     int ret;
     unsigned long long evcount;
@@ -1881,7 +2205,7 @@ static void mp_event_common(int data, uint32_t events, struct polling_params *po
     if (use_psi_monitors && events) {
         /* Override polling params only if current event is more critical */
         if (!poll_params->poll_handler || data > poll_params->poll_handler->data) {
-            poll_params->polling_interval_ms = PSI_POLL_PERIOD_MS;
+            poll_params->polling_interval_ms = PSI_POLL_PERIOD_SHORT_MS;
             poll_params->update = POLLING_START;
         }
     }
@@ -2483,8 +2807,12 @@ int main(int argc __unused, char **argv __unused) {
         property_get_bool("ro.lmk.use_minfree_levels", false);
     per_app_memcg =
         property_get_bool("ro.config.per_app_memcg", low_ram_device);
-    swap_free_low_percentage =
-        property_get_int32("ro.lmk.swap_free_low_percentage", 10);
+    swap_free_low_percentage = clamp(0, 100, property_get_int32("ro.lmk.swap_free_low_percentage",
+        low_ram_device ? DEF_LOW_SWAP_LOWRAM : DEF_LOW_SWAP));
+    thrashing_limit_pct = max(0, property_get_int32("ro.lmk.thrashing_limit",
+        low_ram_device ? DEF_THRASHING_LOWRAM : DEF_THRASHING));
+    thrashing_limit_decay_pct = clamp(0, 100, property_get_int32("ro.lmk.thrashing_limit_decay",
+        low_ram_device ? DEF_THRASHING_DECAY_LOWRAM : DEF_THRASHING_DECAY));
 
     ctx = create_android_logger(MEMINFO_LOG_TAG);
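Illustration (not part of the patch): the kill-reason precedence from mp_event_psi() restated as a standalone helper, to make the ordering of the checks easier to reason about. The enum values and pick_kill_reason() are simplified stand-ins; in particular the patch checks LOW_SWAP_AND_THRASHING against the undecayed thrashing_limit_pct while the later thrashing checks use the decayed thrashing_limit, which this sketch collapses into a single limit.

    #include <stdbool.h>
    #include <stdint.h>

    enum wmark { WMARK_MIN, WMARK_LOW, WMARK_HIGH, WMARK_NONE };
    enum reason { NONE, PRESSURE_AFTER_KILL, NOT_RESPONDING, LOW_SWAP_AND_THRASHING,
                  LOW_MEM_AND_SWAP, LOW_MEM_AND_THRASHING, DIRECT_RECL_AND_THRASHING };

    /* Simplified restatement of the kill-reason precedence from mp_event_psi(). */
    static enum reason pick_kill_reason(bool cycle_after_kill, bool critical_event,
                                        bool swap_is_low, bool direct_reclaim,
                                        enum wmark wmark, int64_t thrashing, int limit) {
        if (cycle_after_kill && wmark < WMARK_LOW)
            return PRESSURE_AFTER_KILL;        /* last kill did not relieve pressure */
        if (critical_event)
            return NOT_RESPONDING;             /* PSI full-stall threshold breached */
        if (swap_is_low && thrashing > limit)
            return LOW_SWAP_AND_THRASHING;
        if (swap_is_low && wmark < WMARK_HIGH)
            return LOW_MEM_AND_SWAP;
        if (wmark < WMARK_HIGH && thrashing > limit)
            return LOW_MEM_AND_THRASHING;
        if (direct_reclaim && thrashing > limit)
            return DIRECT_RECL_AND_THRASHING;
        return NONE;
    }

    int main(void) {
        /* Example: memory below the high watermark, 60% thrashing against a 30% limit. */
        return pick_kill_reason(false, false, false, false,
                                WMARK_LOW, 60, 30) == LOW_MEM_AND_THRASHING ? 0 : 1;
    }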