author     Linus Torvalds <torvalds@linux-foundation.org>  2024-01-18 14:35:29 -0800
committer  Linus Torvalds <torvalds@linux-foundation.org>  2024-01-18 14:35:29 -0800
commit     a2ded784cd7fd83e567829637068cd86aeffbeaf (patch)
tree       9924ca8854858fc5558e59b136d1f32695bc8cc4
parent     5b890ad456b16a5b45e7f86d9708d7248e73d0f8 (diff)
parent     25742aeb135c4a44e92fb347e037adaa145b9484 (diff)
Merge tag 'trace-v6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace
Pull tracing updates from Steven Rostedt:

 - Allow kernel trace instance creation to specify what events are
   created

   Inside the kernel, a subsystem may create a tracing instance that it
   can use to send events to user space. This subsystem may not care
   about the thousands of events that exist in eventfs. Allow the
   subsystem to specify which event systems it cares about, so that
   only those events are exposed to this instance.

 - Allow the ring buffer to be broken up into bigger sub-buffers than
   just the architecture page size

   A new tracefs file called "buffer_subbuf_size_kb" is created. The
   user can now specify the minimum size of a sub-buffer in kilobytes.
   Note that the implementation currently makes the sub-buffer size a
   power of 2 pages (1, 2, 4, 8, 16, ...); the user only writes a size
   in kilobytes, and the sub-buffer is rounded up to the next size that
   can accommodate it. If the user writes in 10, the size becomes
   4 pages on x86 (16K), as that is the next available size that can
   hold 10K. (A short rounding sketch follows the commit list below.)

 - Update the debug output when a corrupt time is detected in the ring
   buffer. If the ring buffer detects inconsistent timestamps, there is
   a debug config option that will dump the contents of the meta data
   of the sub-buffer that is used for debugging. Add some more
   information to this dump that helps with debugging.

 - Add more timestamp debugging checks (only triggered when the config
   is enabled)

 - Increase the trace_seq iterator to 2 page sizes

 - Allow strings written into trace_marker to be larger: up to just
   under 2 page sizes (based on what trace_seq can hold)

 - Increase the trace_marker_raw write to be as big as a sub-buffer can
   hold

 - Remove the 32-bit time stamp logic, now that rb_time_cmpxchg() has
   been removed

 - More selftests were added

 - Some code clean ups as well

* tag 'trace-v6.8' of git://git.kernel.org/pub/scm/linux/kernel/git/trace/linux-trace: (29 commits)
  ring-buffer: Remove stale comment from ring_buffer_size()
  tracing histograms: Simplify parse_actions() function
  tracing/selftests: Remove exec permissions from trace_marker.tc test
  ring-buffer: Use subbuf_order for buffer page masking
  tracing: Update subbuffer with kilobytes not page order
  ringbuffer/selftest: Add basic selftest to test changing subbuf order
  ring-buffer: Add documentation on the buffer_subbuf_order file
  ring-buffer: Just update the subbuffers when changing their allocation order
  ring-buffer: Keep the same size when updating the order
  tracing: Stop the tracing while changing the ring buffer subbuf size
  tracing: Update snapshot order along with main buffer order
  ring-buffer: Make sure the spare sub buffer used for reads has same size
  ring-buffer: Do no swap cpu buffers if order is different
  ring-buffer: Clear pages on error in ring_buffer_subbuf_order_set() failure
  ring-buffer: Read and write to ring buffers with custom sub buffer size
  ring-buffer: Set new size of the ring buffer sub page
  ring-buffer: Add interface for configuring trace sub buffer size
  ring-buffer: Page size per ring buffer
  ring-buffer: Have ring_buffer_print_page_header() be able to access ring_buffer_iter
  ring-buffer: Check if absolute timestamp goes backwards
  ...
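
For reference, the kilobyte-to-order rounding described in the sub-buffer item above can be sketched in a few lines of userspace C. This is illustrative only: it mirrors the logic of buffer_subbuf_size_write() in the trace.c hunk further down, assumes a 4K page size, and kb_to_order() is not a kernel function.

#include <stdio.h>

#define PAGE_SIZE 4096UL

static unsigned int kb_to_order(unsigned long kb)
{
	unsigned long pages = (kb * 1024 + PAGE_SIZE - 1) / PAGE_SIZE;
	unsigned int order = 0;

	while ((1UL << order) < pages)	/* round up to a power of two */
		order++;

	return order;
}

int main(void)
{
	unsigned int order = kb_to_order(10);

	/* Writing 10 (KB) yields order 2: 4 pages, i.e. 16K on x86. */
	printf("order=%u size=%lukB\n", order, (PAGE_SIZE << order) / 1024);
	return 0;
}
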
-rw-r--r--Documentation/trace/ftrace.rst21
-rw-r--r--drivers/scsi/qla2xxx/qla_os.c2
-rw-r--r--include/linux/ring_buffer.h18
-rw-r--r--include/linux/trace.h4
-rw-r--r--include/linux/trace_seq.h9
-rw-r--r--kernel/trace/ring_buffer.c733
-rw-r--r--kernel/trace/ring_buffer_benchmark.c10
-rw-r--r--kernel/trace/trace.c234
-rw-r--r--kernel/trace/trace.h2
-rw-r--r--kernel/trace/trace_boot.c2
-rw-r--r--kernel/trace/trace_events.c107
-rw-r--r--kernel/trace/trace_events_hist.c49
-rw-r--r--kernel/trace/trace_seq.c3
-rw-r--r--samples/ftrace/sample-trace-array.c2
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc95
-rw-r--r--tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc82
16 files changed, 999 insertions, 374 deletions
diff --git a/Documentation/trace/ftrace.rst b/Documentation/trace/ftrace.rst
index 16122a8895ba..7e7b8ec17934 100644
--- a/Documentation/trace/ftrace.rst
+++ b/Documentation/trace/ftrace.rst
@@ -218,6 +218,27 @@ of ftrace. Here is a list of some of the key files:
This displays the total combined size of all the trace buffers.
+ buffer_subbuf_size_kb:
+
+ This sets or displays the sub buffer size. The ring buffer is broken up
+ into several same size "sub buffers". An event can not be bigger than
+ the size of the sub buffer. Normally, the sub buffer is the size of the
+ architecture's page (4K on x86). The sub buffer also contains meta data
+ at the start which also limits the size of an event. That means when
+ the sub buffer is a page size, no event can be larger than the page
+ size minus the sub buffer meta data.
+
+ Note, the buffer_subbuf_size_kb is a way for the user to specify the
+ minimum size of the subbuffer. The kernel may make it bigger due to the
+ implementation details, or simply fail the operation if the kernel can
+ not handle the request.
+
+ Changing the sub buffer size allows for events to be larger than the
+ page size.
+
+ Note: When changing the sub-buffer size, tracing is stopped and any
+ data in the ring buffer and the snapshot buffer will be discarded.
+
free_buffer:
If a process is performing tracing, and the ring buffer should be
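
A minimal userspace sketch of driving the file documented above, not part of the patch: it assumes tracefs is mounted at /sys/kernel/tracing and that the kernel carries this series. It requests a 10K sub-buffer and reads back the size the kernel actually chose.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *path = "/sys/kernel/tracing/buffer_subbuf_size_kb";
	char buf[32];
	ssize_t n;
	int fd;

	fd = open(path, O_RDWR);
	if (fd < 0) {
		perror("open");
		return 1;
	}

	/* Ask for at least 10K; the kernel may round up (e.g. to 16K). */
	if (write(fd, "10", 2) != 2)
		perror("write");

	lseek(fd, 0, SEEK_SET);
	n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		buf[strcspn(buf, "\n")] = '\0';
		printf("sub-buffer size is now %s kB\n", buf);
	}

	close(fd);
	return 0;
}
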
diff --git a/drivers/scsi/qla2xxx/qla_os.c b/drivers/scsi/qla2xxx/qla_os.c
index 03348f605c2e..dd674378f2f3 100644
--- a/drivers/scsi/qla2xxx/qla_os.c
+++ b/drivers/scsi/qla2xxx/qla_os.c
@@ -2889,7 +2889,7 @@ static void qla2x00_iocb_work_fn(struct work_struct *work)
static void
qla_trace_init(void)
{
- qla_trc_array = trace_array_get_by_name("qla2xxx");
+ qla_trc_array = trace_array_get_by_name("qla2xxx", NULL);
if (!qla_trc_array) {
ql_log(ql_log_fatal, NULL, 0x0001,
"Unable to create qla2xxx trace instance, instance logging will be disabled.\n");
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 782e14f62201..fa802db216f9 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -141,6 +141,7 @@ int ring_buffer_iter_empty(struct ring_buffer_iter *iter);
bool ring_buffer_iter_dropped(struct ring_buffer_iter *iter);
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu);
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer);
void ring_buffer_reset_cpu(struct trace_buffer *buffer, int cpu);
void ring_buffer_reset_online_cpus(struct trace_buffer *buffer);
@@ -191,15 +192,24 @@ bool ring_buffer_time_stamp_abs(struct trace_buffer *buffer);
size_t ring_buffer_nr_pages(struct trace_buffer *buffer, int cpu);
size_t ring_buffer_nr_dirty_pages(struct trace_buffer *buffer, int cpu);
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data);
-int ring_buffer_read_page(struct trace_buffer *buffer, void **data_page,
+struct buffer_data_read_page;
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu);
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *page);
+int ring_buffer_read_page(struct trace_buffer *buffer,
+ struct buffer_data_read_page *data_page,
size_t len, int cpu, int full);
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page);
struct trace_seq;
int ring_buffer_print_entry_header(struct trace_seq *s);
-int ring_buffer_print_page_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s);
+
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer);
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order);
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer);
enum ring_buffer_flags {
RB_FL_OVERWRITE = 1 << 0,
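
A hedged sketch of how an in-kernel user of the declarations added above might grow its sub-buffers; grow_subbufs() is a made-up helper, error handling is abbreviated, and the buffer is assumed to already exist.

#include <linux/printk.h>
#include <linux/ring_buffer.h>

static int grow_subbufs(struct trace_buffer *buffer)
{
	int ret;

	if (ring_buffer_subbuf_order_get(buffer) >= 2)
		return 0;	/* already at least 4 pages per sub-buffer */

	/* Order 2 means 4 system pages (16K on x86) per sub-buffer */
	ret = ring_buffer_subbuf_order_set(buffer, 2);
	if (ret < 0)
		return ret;

	pr_info("sub-buffer is now %d bytes, max event %lu bytes\n",
		ring_buffer_subbuf_size_get(buffer),
		ring_buffer_max_event_size(buffer));

	return 0;
}
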
diff --git a/include/linux/trace.h b/include/linux/trace.h
index 2a70a447184c..fdcd76b7be83 100644
--- a/include/linux/trace.h
+++ b/include/linux/trace.h
@@ -51,7 +51,7 @@ int trace_array_printk(struct trace_array *tr, unsigned long ip,
const char *fmt, ...);
int trace_array_init_printk(struct trace_array *tr);
void trace_array_put(struct trace_array *tr);
-struct trace_array *trace_array_get_by_name(const char *name);
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems);
int trace_array_destroy(struct trace_array *tr);
/* For osnoise tracer */
@@ -84,7 +84,7 @@ static inline int trace_array_init_printk(struct trace_array *tr)
static inline void trace_array_put(struct trace_array *tr)
{
}
-static inline struct trace_array *trace_array_get_by_name(const char *name)
+static inline struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
return NULL;
}
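
The qla2xxx hunk earlier passes NULL for the new argument to keep every event system. A subsystem that only wants a subset of events in its instance might use it as in this hedged sketch; "my_drv", the "sched" system choice, and the function names are made up for illustration, not taken from the patch.

#include <linux/errno.h>
#include <linux/trace.h>

static struct trace_array *my_tr;

static int my_trace_init(void)
{
	/* NULL for the second argument would expose all event systems,
	 * as the qla2xxx call above does. */
	my_tr = trace_array_get_by_name("my_drv", "sched");
	if (!my_tr)
		return -ENOMEM;

	return trace_array_init_printk(my_tr);
}

static void my_trace_exit(void)
{
	trace_array_put(my_tr);
	trace_array_destroy(my_tr);
}
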
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index 3691e0e76a1a..9ec229dfddaa 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -8,11 +8,14 @@
/*
* Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE).
+ * to create a string of data to use.
*/
+#define TRACE_SEQ_BUFFER_SIZE (PAGE_SIZE * 2 - \
+ (sizeof(struct seq_buf) + sizeof(size_t) + sizeof(int)))
+
struct trace_seq {
- char buffer[PAGE_SIZE];
+ char buffer[TRACE_SEQ_BUFFER_SIZE];
struct seq_buf seq;
size_t readpos;
int full;
@@ -21,7 +24,7 @@ struct trace_seq {
static inline void
trace_seq_init(struct trace_seq *s)
{
- seq_buf_init(&s->seq, s->buffer, PAGE_SIZE);
+ seq_buf_init(&s->seq, s->buffer, TRACE_SEQ_BUFFER_SIZE);
s->full = 0;
s->readpos = 0;
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9286f88fcd32..13aaf5e85b81 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -27,6 +27,7 @@
#include <linux/cpu.h>
#include <linux/oom.h>
+#include <asm/local64.h>
#include <asm/local.h>
/*
@@ -317,6 +318,11 @@ struct buffer_data_page {
unsigned char data[] RB_ALIGN_DATA; /* data of buffer page */
};
+struct buffer_data_read_page {
+ unsigned order; /* order of the page */
+ struct buffer_data_page *data; /* actual data, stored in this page */
+};
+
/*
* Note, the buffer_page list must be first. The buffer pages
* are allocated in cache lines, which means that each buffer
@@ -331,6 +337,7 @@ struct buffer_page {
unsigned read; /* index for next read */
local_t entries; /* entries on this page */
unsigned long real_end; /* real end of data */
+ unsigned order; /* order of the page */
struct buffer_data_page *page; /* Actual data page */
};
@@ -361,7 +368,7 @@ static __always_inline unsigned int rb_page_commit(struct buffer_page *bpage)
static void free_buffer_page(struct buffer_page *bpage)
{
- free_page((unsigned long)bpage->page);
+ free_pages((unsigned long)bpage->page, bpage->order);
kfree(bpage);
}
@@ -373,41 +380,6 @@ static inline bool test_time_stamp(u64 delta)
return !!(delta & TS_DELTA_TEST);
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
-
-/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
-#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
-
-int ring_buffer_print_page_header(struct trace_seq *s)
-{
- struct buffer_data_page field;
-
- trace_seq_printf(s, "\tfield: u64 timestamp;\t"
- "offset:0;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)sizeof(field.time_stamp),
- (unsigned int)is_signed_type(u64));
-
- trace_seq_printf(s, "\tfield: local_t commit;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- (unsigned int)sizeof(field.commit),
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: int overwrite;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), commit),
- 1,
- (unsigned int)is_signed_type(long));
-
- trace_seq_printf(s, "\tfield: char data;\t"
- "offset:%u;\tsize:%u;\tsigned:%u;\n",
- (unsigned int)offsetof(typeof(field), data),
- (unsigned int)BUF_PAGE_SIZE,
- (unsigned int)is_signed_type(char));
-
- return !trace_seq_has_overflowed(s);
-}
-
struct rb_irq_work {
struct irq_work work;
wait_queue_head_t waiters;
@@ -463,27 +435,9 @@ enum {
RB_CTX_MAX
};
-#if BITS_PER_LONG == 32
-#define RB_TIME_32
-#endif
-
-/* To test on 64 bit machines */
-//#define RB_TIME_32
-
-#ifdef RB_TIME_32
-
-struct rb_time_struct {
- local_t cnt;
- local_t top;
- local_t bottom;
- local_t msb;
-};
-#else
-#include <asm/local64.h>
struct rb_time_struct {
local64_t time;
};
-#endif
typedef struct rb_time_struct rb_time_t;
#define MAX_NEST 5
@@ -557,6 +511,10 @@ struct trace_buffer {
struct rb_irq_work irq_work;
bool time_stamp_abs;
+
+ unsigned int subbuf_size;
+ unsigned int subbuf_order;
+ unsigned int max_data_size;
};
struct ring_buffer_iter {
@@ -570,150 +528,48 @@ struct ring_buffer_iter {
u64 read_stamp;
u64 page_stamp;
struct ring_buffer_event *event;
+ size_t event_size;
int missed_events;
};
-#ifdef RB_TIME_32
-
-/*
- * On 32 bit machines, local64_t is very expensive. As the ring
- * buffer doesn't need all the features of a true 64 bit atomic,
- * on 32 bit, it uses these functions (64 still uses local64_t).
- *
- * For the ring buffer, 64 bit required operations for the time is
- * the following:
- *
- * - Reads may fail if it interrupted a modification of the time stamp.
- * It will succeed if it did not interrupt another write even if
- * the read itself is interrupted by a write.
- * It returns whether it was successful or not.
- *
- * - Writes always succeed and will overwrite other writes and writes
- * that were done by events interrupting the current write.
- *
- * - A write followed by a read of the same time stamp will always succeed,
- * but may not contain the same value.
- *
- * - A cmpxchg will fail if it interrupted another write or cmpxchg.
- * Other than that, it acts like a normal cmpxchg.
- *
- * The 60 bit time stamp is broken up by 30 bits in a top and bottom half
- * (bottom being the least significant 30 bits of the 60 bit time stamp).
- *
- * The two most significant bits of each half holds a 2 bit counter (0-3).
- * Each update will increment this counter by one.
- * When reading the top and bottom, if the two counter bits match then the
- * top and bottom together make a valid 60 bit number.
- */
-#define RB_TIME_SHIFT 30
-#define RB_TIME_VAL_MASK ((1 << RB_TIME_SHIFT) - 1)
-#define RB_TIME_MSB_SHIFT 60
-
-static inline int rb_time_cnt(unsigned long val)
+int ring_buffer_print_page_header(struct trace_buffer *buffer, struct trace_seq *s)
{
- return (val >> RB_TIME_SHIFT) & 3;
-}
-
-static inline u64 rb_time_val(unsigned long top, unsigned long bottom)
-{
- u64 val;
-
- val = top & RB_TIME_VAL_MASK;
- val <<= RB_TIME_SHIFT;
- val |= bottom & RB_TIME_VAL_MASK;
-
- return val;
-}
-
-static inline bool __rb_time_read(rb_time_t *t, u64 *ret, unsigned long *cnt)
-{
- unsigned long top, bottom, msb;
- unsigned long c;
-
- /*
- * If the read is interrupted by a write, then the cnt will
- * be different. Loop until both top and bottom have been read
- * without interruption.
- */
- do {
- c = local_read(&t->cnt);
- top = local_read(&t->top);
- bottom = local_read(&t->bottom);
- msb = local_read(&t->msb);
- } while (c != local_read(&t->cnt));
-
- *cnt = rb_time_cnt(top);
-
- /* If top, msb or bottom counts don't match, this interrupted a write */
- if (*cnt != rb_time_cnt(msb) || *cnt != rb_time_cnt(bottom))
- return false;
-
- /* The shift to msb will lose its cnt bits */
- *ret = rb_time_val(top, bottom) | ((u64)msb << RB_TIME_MSB_SHIFT);
- return true;
-}
-
-static bool rb_time_read(rb_time_t *t, u64 *ret)
-{
- unsigned long cnt;
-
- return __rb_time_read(t, ret, &cnt);
-}
-
-static inline unsigned long rb_time_val_cnt(unsigned long val, unsigned long cnt)
-{
- return (val & RB_TIME_VAL_MASK) | ((cnt & 3) << RB_TIME_SHIFT);
-}
-
-static inline void rb_time_split(u64 val, unsigned long *top, unsigned long *bottom,
- unsigned long *msb)
-{
- *top = (unsigned long)((val >> RB_TIME_SHIFT) & RB_TIME_VAL_MASK);
- *bottom = (unsigned long)(val & RB_TIME_VAL_MASK);
- *msb = (unsigned long)(val >> RB_TIME_MSB_SHIFT);
-}
+ struct buffer_data_page field;
-static inline void rb_time_val_set(local_t *t, unsigned long val, unsigned long cnt)
-{
- val = rb_time_val_cnt(val, cnt);
- local_set(t, val);
-}
+ trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+ "offset:0;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)sizeof(field.time_stamp),
+ (unsigned int)is_signed_type(u64));
-static void rb_time_set(rb_time_t *t, u64 val)
-{
- unsigned long cnt, top, bottom, msb;
+ trace_seq_printf(s, "\tfield: local_t commit;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ (unsigned int)sizeof(field.commit),
+ (unsigned int)is_signed_type(long));
- rb_time_split(val, &top, &bottom, &msb);
+ trace_seq_printf(s, "\tfield: int overwrite;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), commit),
+ 1,
+ (unsigned int)is_signed_type(long));
- /* Writes always succeed with a valid number even if it gets interrupted. */
- do {
- cnt = local_inc_return(&t->cnt);
- rb_time_val_set(&t->top, top, cnt);
- rb_time_val_set(&t->bottom, bottom, cnt);
- rb_time_val_set(&t->msb, val >> RB_TIME_MSB_SHIFT, cnt);
- } while (cnt != local_read(&t->cnt));
-}
+ trace_seq_printf(s, "\tfield: char data;\t"
+ "offset:%u;\tsize:%u;\tsigned:%u;\n",
+ (unsigned int)offsetof(typeof(field), data),
+ (unsigned int)buffer->subbuf_size,
+ (unsigned int)is_signed_type(char));
-static inline bool
-rb_time_read_cmpxchg(local_t *l, unsigned long expect, unsigned long set)
-{
- return local_try_cmpxchg(l, &expect, set);
+ return !trace_seq_has_overflowed(s);
}
-#else /* 64 bits */
-
-/* local64_t always succeeds */
-
-static inline bool rb_time_read(rb_time_t *t, u64 *ret)
+static inline void rb_time_read(rb_time_t *t, u64 *ret)
{
*ret = local64_read(&t->time);
- return true;
}
static void rb_time_set(rb_time_t *t, u64 val)
{
local64_set(&t->time, val);
}
-#endif
/*
* Enable this to make sure that the event passed to
@@ -820,10 +676,7 @@ u64 ring_buffer_event_time_stamp(struct trace_buffer *buffer,
WARN_ONCE(1, "nest (%d) greater than max", nest);
fail:
- /* Can only fail on 32 bit */
- if (!rb_time_read(&cpu_buffer->write_stamp, &ts))
- /* Screw it, just read the current time */
- ts = rb_time_stamp(cpu_buffer->buffer);
+ rb_time_read(&cpu_buffer->write_stamp, &ts);
return ts;
}
@@ -1619,10 +1472,12 @@ static int __rb_allocate_pages(struct ring_buffer_per_cpu *cpu_buffer,
list_add(&bpage->list, pages);
- page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags, 0);
+ page = alloc_pages_node(cpu_to_node(cpu_buffer->cpu), mflags,
+ cpu_buffer->buffer->subbuf_order);
if (!page)
goto free_pages;
bpage->page = page_address(page);
+ bpage->order = cpu_buffer->buffer->subbuf_order;
rb_init_page(bpage->page);
if (user_thread && fatal_signal_pending(current))
@@ -1701,7 +1556,8 @@ rb_allocate_cpu_buffer(struct trace_buffer *buffer, long nr_pages, int cpu)
rb_check_bpage(cpu_buffer, bpage);
cpu_buffer->reader_page = bpage;
- page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, 0);
+
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, cpu_buffer->buffer->subbuf_order);
if (!page)
goto fail_free_reader;
bpage->page = page_address(page);
@@ -1784,7 +1640,14 @@ struct trace_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
if (!zalloc_cpumask_var(&buffer->cpumask, GFP_KERNEL))
goto fail_free_buffer;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ /* Default buffer page size - one system page */
+ buffer->subbuf_order = 0;
+ buffer->subbuf_size = PAGE_SIZE - BUF_PAGE_HDR_SIZE;
+
+ /* Max payload is buffer page size - header (8bytes) */
+ buffer->max_data_size = buffer->subbuf_size - (sizeof(u32) * 2);
+
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
buffer->flags = flags;
buffer->clock = trace_clock_local;
buffer->reader_lock_key = key;
@@ -2103,7 +1966,7 @@ static void update_pages_handler(struct work_struct *work)
* @size: the new size.
* @cpu_id: the cpu buffer to resize
*
- * Minimum size is 2 * BUF_PAGE_SIZE.
+ * Minimum size is 2 * buffer->subbuf_size.
*
* Returns 0 on success and < 0 on failure.
*/
@@ -2125,7 +1988,7 @@ int ring_buffer_resize(struct trace_buffer *buffer, unsigned long size,
!cpumask_test_cpu(cpu_id, buffer->cpumask))
return 0;
- nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
+ nr_pages = DIV_ROUND_UP(size, buffer->subbuf_size);
/* we need a minimum of two pages */
if (nr_pages < 2)
@@ -2372,7 +2235,7 @@ rb_iter_head_event(struct ring_buffer_iter *iter)
*/
barrier();
- if ((iter->head + length) > commit || length > BUF_PAGE_SIZE)
+ if ((iter->head + length) > commit || length > iter->event_size)
/* Writer corrupted the read? */
goto reset;
@@ -2412,11 +2275,13 @@ rb_commit_index(struct ring_buffer_per_cpu *cpu_buffer)
}
static __always_inline unsigned
-rb_event_index(struct ring_buffer_event *event)
+rb_event_index(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event)
{
unsigned long addr = (unsigned long)event;
- return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE;
+ addr &= (PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1;
+
+ return addr - BUF_PAGE_HDR_SIZE;
}
static void rb_inc_iter(struct ring_buffer_iter *iter)
@@ -2605,6 +2470,7 @@ static inline void
rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
unsigned long tail, struct rb_event_info *info)
{
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
struct buffer_page *tail_page = info->tail_page;
struct ring_buffer_event *event;
unsigned long length = info->length;
@@ -2613,13 +2479,13 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* Only the event that crossed the page boundary
* must fill the old tail_page with padding.
*/
- if (tail >= BUF_PAGE_SIZE) {
+ if (tail >= bsize) {
/*
* If the page was filled, then we still need
* to update the real_end. Reset it to zero
* and the reader will ignore it.
*/
- if (tail == BUF_PAGE_SIZE)
+ if (tail == bsize)
tail_page->real_end = 0;
local_sub(length, &tail_page->write);
@@ -2647,7 +2513,7 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
* If we are less than the minimum size, we don't need to
* worry about it.
*/
- if (tail > (BUF_PAGE_SIZE - RB_EVNT_MIN_SIZE)) {
+ if (tail > (bsize - RB_EVNT_MIN_SIZE)) {
/* No room for any events */
/* Mark the rest of the page with padding */
@@ -2662,19 +2528,19 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer,
}
/* Put in a discarded event */
- event->array[0] = (BUF_PAGE_SIZE - tail) - RB_EVNT_HDR_SIZE;
+ event->array[0] = (bsize - tail) - RB_EVNT_HDR_SIZE;
event->type_len = RINGBUF_TYPE_PADDING;
/* time delta must be non zero */
event->time_delta = 1;
/* account for padding bytes */
- local_add(BUF_PAGE_SIZE - tail, &cpu_buffer->entries_bytes);
+ local_add(bsize - tail, &cpu_buffer->entries_bytes);
/* Make sure the padding is visible before the tail_page->write update */
smp_wmb();
/* Set write to end of buffer */
- length = (tail + length) - BUF_PAGE_SIZE;
+ length = (tail + length) - bsize;
local_sub(length, &tail_page->write);
}
@@ -2788,7 +2654,8 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
/* Slow path */
static struct ring_buffer_event *
-rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
+rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
+ struct ring_buffer_event *event, u64 delta, bool abs)
{
if (abs)
event->type_len = RINGBUF_TYPE_TIME_STAMP;
@@ -2796,7 +2663,7 @@ rb_add_time_stamp(struct ring_buffer_event *event, u64 delta, bool abs)
event->type_len = RINGBUF_TYPE_TIME_EXTEND;
/* Not the first event on the page, or not delta? */
- if (abs || rb_event_index(event)) {
+ if (abs || rb_event_index(cpu_buffer, event)) {
event->time_delta = delta & TS_MASK;
event->array[0] = delta >> TS_SHIFT;
} else {
@@ -2826,7 +2693,7 @@ rb_check_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
(unsigned long long)info->ts,
(unsigned long long)info->before,
(unsigned long long)info->after,
- (unsigned long long)(rb_time_read(&cpu_buffer->write_stamp, &write_stamp) ? write_stamp : 0),
+ (unsigned long long)({rb_time_read(&cpu_buffer->write_stamp, &write_stamp); write_stamp;}),
sched_clock_stable() ? "" :
"If you just came from a suspend/resume,\n"
"please switch to the trace global clock:\n"
@@ -2870,7 +2737,7 @@ static void rb_add_timestamp(struct ring_buffer_per_cpu *cpu_buffer,
if (!abs)
info->delta = 0;
}
- *event = rb_add_time_stamp(*event, info->delta, abs);
+ *event = rb_add_time_stamp(cpu_buffer, *event, info->delta, abs);
*length -= RB_LEN_TIME_EXTEND;
*delta = 0;
}
@@ -2954,10 +2821,10 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage;
unsigned long addr;
- new_index = rb_event_index(event);
+ new_index = rb_event_index(cpu_buffer, event);
old_index = new_index + rb_event_ts_length(event);
addr = (unsigned long)event;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
bpage = READ_ONCE(cpu_buffer->tail_page);
@@ -3344,6 +3211,76 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
#define CHECK_FULL_PAGE 1L
#ifdef CONFIG_RING_BUFFER_VALIDATE_TIME_DELTAS
+
+static const char *show_irq_str(int bits)
+{
+ const char *type[] = {
+ ".", // 0
+ "s", // 1
+ "h", // 2
+ "Hs", // 3
+ "n", // 4
+ "Ns", // 5
+ "Nh", // 6
+ "NHs", // 7
+ };
+
+ return type[bits];
+}
+
+/* Assume this is an trace event */
+static const char *show_flags(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+ int bits = 0;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "X";
+
+ entry = ring_buffer_event_data(event);
+
+ if (entry->flags & TRACE_FLAG_SOFTIRQ)
+ bits |= 1;
+
+ if (entry->flags & TRACE_FLAG_HARDIRQ)
+ bits |= 2;
+
+ if (entry->flags & TRACE_FLAG_NMI)
+ bits |= 4;
+
+ return show_irq_str(bits);
+}
+
+static const char *show_irq(struct ring_buffer_event *event)
+{
+ struct trace_entry *entry;
+
+ if (rb_event_data_length(event) - RB_EVNT_HDR_SIZE < sizeof(*entry))
+ return "";
+
+ entry = ring_buffer_event_data(event);
+ if (entry->flags & TRACE_FLAG_IRQS_OFF)
+ return "d";
+ return "";
+}
+
+static const char *show_interrupt_level(void)
+{
+ unsigned long pc = preempt_count();
+ unsigned char level = 0;
+
+ if (pc & SOFTIRQ_OFFSET)
+ level |= 1;
+
+ if (pc & HARDIRQ_MASK)
+ level |= 2;
+
+ if (pc & NMI_MASK)
+ level |= 4;
+
+ return show_irq_str(level);
+}
+
static void dump_buffer_page(struct buffer_data_page *bpage,
struct rb_event_info *info,
unsigned long tail)
@@ -3364,34 +3301,57 @@ static void dump_buffer_page(struct buffer_data_page *bpage,
case RINGBUF_TYPE_TIME_EXTEND:
delta = rb_event_time_stamp(event);
ts += delta;
- pr_warn(" [%lld] delta:%lld TIME EXTEND\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] delta:%lld TIME EXTEND\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
ts = rb_fix_abs_ts(delta, ts);
- pr_warn(" [%lld] absolute:%lld TIME STAMP\n", ts, delta);
+ pr_warn(" 0x%x: [%lld] absolute:%lld TIME STAMP\n",
+ e, ts, delta);
break;
case RINGBUF_TYPE_PADDING:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d PADDING\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d PADDING\n",
+ e, ts, event->time_delta);
break;
case RINGBUF_TYPE_DATA:
ts += event->time_delta;
- pr_warn(" [%lld] delta:%d\n", ts, event->time_delta);
+ pr_warn(" 0x%x: [%lld] delta:%d %s%s\n",
+ e, ts, event->time_delta,
+ show_flags(event), show_irq(event));
break;
default:
break;
}
}
+ pr_warn("expected end:0x%lx last event actually ended at:0x%x\n", tail, e);
}
static DEFINE_PER_CPU(atomic_t, checking);
static atomic_t ts_dump;
+#define buffer_warn_return(fmt, ...) \
+ do { \
+ /* If another report is happening, ignore this one */ \
+ if (atomic_inc_return(&ts_dump) != 1) { \
+ atomic_dec(&ts_dump); \
+ goto out; \
+ } \
+ atomic_inc(&cpu_buffer->record_disabled); \
+ pr_warn(fmt, ##__VA_ARGS__); \
+ dump_buffer_page(bpage, info, tail); \
+ atomic_dec(&ts_dump); \
+ /* There's some cases in boot up that this can happen */ \
+ if (WARN_ON_ONCE(system_state != SYSTEM_BOOTING)) \
+ /* Do not re-enable checking */ \
+ return; \
+ } while (0)
+
/*
* Check if the current event time stamp matches the deltas on
* the buffer page.
@@ -3445,7 +3405,12 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
case RINGBUF_TYPE_TIME_STAMP:
delta = rb_event_time_stamp(event);
- ts = rb_fix_abs_ts(delta, ts);
+ delta = rb_fix_abs_ts(delta, ts);
+ if (delta < ts) {
+ buffer_warn_return("[CPU: %d]ABSOLUTE TIME WENT BACKWARDS: last ts: %lld absolute ts: %lld\n",
+ cpu_buffer->cpu, ts, delta);
+ }
+ ts = delta;
break;
case RINGBUF_TYPE_PADDING:
@@ -3462,23 +3427,11 @@ static void check_buffer(struct ring_buffer_per_cpu *cpu_buffer,
}
if ((full && ts > info->ts) ||
(!full && ts + info->delta != info->ts)) {
- /* If another report is happening, ignore this one */
- if (atomic_inc_return(&ts_dump) != 1) {
- atomic_dec(&ts_dump);
- goto out;
- }
- atomic_inc(&cpu_buffer->record_disabled);
- /* There's some cases in boot up that this can happen */
- WARN_ON_ONCE(system_state != SYSTEM_BOOTING);
- pr_warn("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s\n",
- cpu_buffer->cpu,
- ts + info->delta, info->ts, info->delta,
- info->before, info->after,
- full ? " (full)" : "");
- dump_buffer_page(bpage, info, tail);
- atomic_dec(&ts_dump);
- /* Do not re-enable checking */
- return;
+ buffer_warn_return("[CPU: %d]TIME DOES NOT MATCH expected:%lld actual:%lld delta:%lld before:%lld after:%lld%s context:%s\n",
+ cpu_buffer->cpu,
+ ts + info->delta, info->ts, info->delta,
+ info->before, info->after,
+ full ? " (full)" : "", show_interrupt_level());
}
out:
atomic_dec(this_cpu_ptr(&checking));
@@ -3498,16 +3451,14 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
struct ring_buffer_event *event;
struct buffer_page *tail_page;
unsigned long tail, write, w;
- bool a_ok;
- bool b_ok;
/* Don't let the compiler play games with cpu_buffer->tail_page */
tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page);
/*A*/ w = local_read(&tail_page->write) & RB_WRITE_MASK;
barrier();
- b_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
+ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
info->ts = rb_time_stamp(cpu_buffer->buffer);
@@ -3522,7 +3473,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
if (!w) {
/* Use the sub-buffer timestamp */
info->delta = 0;
- } else if (unlikely(!a_ok || !b_ok || info->before != info->after)) {
+ } else if (unlikely(info->before != info->after)) {
info->add_timestamp |= RB_ADD_STAMP_FORCE | RB_ADD_STAMP_EXTEND;
info->length += RB_LEN_TIME_EXTEND;
} else {
@@ -3544,7 +3495,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
tail = write - info->length;
/* See if we shot pass the end of this buffer page */
- if (unlikely(write > BUF_PAGE_SIZE)) {
+ if (unlikely(write > cpu_buffer->buffer->subbuf_size)) {
check_buffer(cpu_buffer, info, CHECK_FULL_PAGE);
return rb_move_tail(cpu_buffer, tail, info);
}
@@ -3571,8 +3522,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
/* SLOW PATH - Interrupted between A and C */
/* Save the old before_stamp */
- a_ok = rb_time_read(&cpu_buffer->before_stamp, &info->before);
- RB_WARN_ON(cpu_buffer, !a_ok);
+ rb_time_read(&cpu_buffer->before_stamp, &info->before);
/*
* Read a new timestamp and update the before_stamp to make
@@ -3584,9 +3534,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
rb_time_set(&cpu_buffer->before_stamp, ts);
barrier();
- /*E*/ a_ok = rb_time_read(&cpu_buffer->write_stamp, &info->after);
- /* Was interrupted before here, write_stamp must be valid */
- RB_WARN_ON(cpu_buffer, !a_ok);
+ /*E*/ rb_time_read(&cpu_buffer->write_stamp, &info->after);
barrier();
/*F*/ if (write == (local_read(&tail_page->write) & RB_WRITE_MASK) &&
info->after == info->before && info->after < ts) {
@@ -3678,7 +3626,7 @@ rb_reserve_next_event(struct trace_buffer *buffer,
if (ring_buffer_time_stamp_abs(cpu_buffer->buffer)) {
add_ts_default = RB_ADD_STAMP_ABSOLUTE;
info.length += RB_LEN_TIME_EXTEND;
- if (info.length > BUF_MAX_DATA_SIZE)
+ if (info.length > cpu_buffer->buffer->max_data_size)
goto out_fail;
} else {
add_ts_default = RB_ADD_STAMP_NONE;
@@ -3753,7 +3701,7 @@ ring_buffer_lock_reserve(struct trace_buffer *buffer, unsigned long length)
if (unlikely(atomic_read(&cpu_buffer->record_disabled)))
goto out;
- if (unlikely(length > BUF_MAX_DATA_SIZE))
+ if (unlikely(length > buffer->max_data_size))
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -3787,7 +3735,7 @@ rb_decrement_entry(struct ring_buffer_per_cpu *cpu_buffer,
struct buffer_page *bpage = cpu_buffer->commit_page;
struct buffer_page *start;
- addr &= PAGE_MASK;
+ addr &= ~((PAGE_SIZE << cpu_buffer->buffer->subbuf_order) - 1);
/* Do the likely case first */
if (likely(bpage->page == (void *)addr)) {
@@ -3903,7 +3851,7 @@ int ring_buffer_write(struct trace_buffer *buffer,
if (atomic_read(&cpu_buffer->record_disabled))
goto out;
- if (length > BUF_MAX_DATA_SIZE)
+ if (length > buffer->max_data_size)
goto out;
if (unlikely(trace_recursive_lock(cpu_buffer)))
@@ -4483,6 +4431,7 @@ static struct buffer_page *
rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
{
struct buffer_page *reader = NULL;
+ unsigned long bsize = READ_ONCE(cpu_buffer->buffer->subbuf_size);
unsigned long overwrite;
unsigned long flags;
int nr_loops = 0;
@@ -4618,7 +4567,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
#define USECS_WAIT 1000000
for (nr_loops = 0; nr_loops < USECS_WAIT; nr_loops++) {
/* If the write is past the end of page, a writer is still updating it */
- if (likely(!reader || rb_page_write(reader) <= BUF_PAGE_SIZE))
+ if (likely(!reader || rb_page_write(reader) <= bsize))
break;
udelay(1);
@@ -5062,7 +5011,8 @@ ring_buffer_read_prepare(struct trace_buffer *buffer, int cpu, gfp_t flags)
return NULL;
/* Holds the entire event: data and meta data */
- iter->event = kmalloc(BUF_PAGE_SIZE, flags);
+ iter->event_size = buffer->subbuf_size;
+ iter->event = kmalloc(iter->event_size, flags);
if (!iter->event) {
kfree(iter);
return NULL;
@@ -5178,19 +5128,28 @@ EXPORT_SYMBOL_GPL(ring_buffer_iter_advance);
*/
unsigned long ring_buffer_size(struct trace_buffer *buffer, int cpu)
{
- /*
- * Earlier, this method returned
- * BUF_PAGE_SIZE * buffer->nr_pages
- * Since the nr_pages field is now removed, we have converted this to
- * return the per cpu buffer value.
- */
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return 0;
- return BUF_PAGE_SIZE * buffer->buffers[cpu]->nr_pages;
+ return buffer->subbuf_size * buffer->buffers[cpu]->nr_pages;
}
EXPORT_SYMBOL_GPL(ring_buffer_size);
+/**
+ * ring_buffer_max_event_size - return the max data size of an event
+ * @buffer: The ring buffer.
+ *
+ * Returns the maximum size an event can be.
+ */
+unsigned long ring_buffer_max_event_size(struct trace_buffer *buffer)
+{
+ /* If abs timestamp is requested, events have a timestamp too */
+ if (ring_buffer_time_stamp_abs(buffer))
+ return buffer->max_data_size - RB_LEN_TIME_EXTEND;
+ return buffer->max_data_size;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_max_event_size);
+
static void rb_clear_buffer_page(struct buffer_page *page)
{
local_set(&page->write, 0);
@@ -5461,6 +5420,9 @@ int ring_buffer_swap_cpu(struct trace_buffer *buffer_a,
if (cpu_buffer_a->nr_pages != cpu_buffer_b->nr_pages)
goto out;
+ if (buffer_a->subbuf_order != buffer_b->subbuf_order)
+ goto out;
+
ret = -EAGAIN;
if (atomic_read(&buffer_a->record_disabled))
@@ -5532,40 +5494,48 @@ EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
* Returns:
* The page allocated, or ERR_PTR
*/
-void *ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
+struct buffer_data_read_page *
+ring_buffer_alloc_read_page(struct trace_buffer *buffer, int cpu)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = NULL;
+ struct buffer_data_read_page *bpage = NULL;
unsigned long flags;
struct page *page;
if (!cpumask_test_cpu(cpu, buffer->cpumask))
return ERR_PTR(-ENODEV);
+ bpage = kzalloc(sizeof(*bpage), GFP_KERNEL);
+ if (!bpage)
+ return ERR_PTR(-ENOMEM);
+
+ bpage->order = buffer->subbuf_order;
cpu_buffer = buffer->buffers[cpu];
local_irq_save(flags);
arch_spin_lock(&cpu_buffer->lock);
if (cpu_buffer->free_page) {
- bpage = cpu_buffer->free_page;
+ bpage->data = cpu_buffer->free_page;
cpu_buffer->free_page = NULL;
}
arch_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
- if (bpage)
+ if (bpage->data)
goto out;
- page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
- if (!page)
+ page = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL | __GFP_NORETRY,
+ cpu_buffer->buffer->subbuf_order);
+ if (!page) {
+ kfree(bpage);
return ERR_PTR(-ENOMEM);
+ }
- bpage = page_address(page);
+ bpage->data = page_address(page);
out:
- rb_init_page(bpage);
+ rb_init_page(bpage->data);
return bpage;
}
@@ -5575,14 +5545,15 @@ EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
* ring_buffer_free_read_page - free an allocated read page
* @buffer: the buffer the page was allocate for
* @cpu: the cpu buffer the page came from
- * @data: the page to free
+ * @data_page: the page to free
*
* Free a page allocated from ring_buffer_alloc_read_page.
*/
-void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data)
+void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu,
+ struct buffer_data_read_page *data_page)
{
struct ring_buffer_per_cpu *cpu_buffer;
- struct buffer_data_page *bpage = data;
+ struct buffer_data_page *bpage = data_page->data;
struct page *page = virt_to_page(bpage);
unsigned long flags;
@@ -5591,8 +5562,12 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
cpu_buffer = buffer->buffers[cpu];
- /* If the page is still in use someplace else, we can't reuse it */
- if (page_ref_count(page) > 1)
+ /*
+ * If the page is still in use someplace else, or order of the page
+ * is different from the subbuffer order of the buffer -
+ * we can't reuse it
+ */
+ if (page_ref_count(page) > 1 || data_page->order != buffer->subbuf_order)
goto out;
local_irq_save(flags);
@@ -5607,7 +5582,8 @@ void ring_buffer_free_read_page(struct trace_buffer *buffer, int cpu, void *data
local_irq_restore(flags);
out:
- free_page((unsigned long)bpage);
+ free_pages((unsigned long)bpage, data_page->order);
+ kfree(data_page);
}
EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
@@ -5628,9 +5604,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* rpage = ring_buffer_alloc_read_page(buffer, cpu);
* if (IS_ERR(rpage))
* return PTR_ERR(rpage);
- * ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
+ * ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
* if (ret >= 0)
- * process_page(rpage, ret);
+ * process_page(ring_buffer_read_page_data(rpage), ret);
+ * ring_buffer_free_read_page(buffer, cpu, rpage);
*
* When @full is set, the function will not return true unless
* the writer is off the reader page.
@@ -5645,7 +5622,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
* <0 if no data has been transferred.
*/
int ring_buffer_read_page(struct trace_buffer *buffer,
- void **data_page, size_t len, int cpu, int full)
+ struct buffer_data_read_page *data_page,
+ size_t len, int cpu, int full)
{
struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
struct ring_buffer_event *event;
@@ -5670,10 +5648,12 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
len -= BUF_PAGE_HDR_SIZE;
- if (!data_page)
+ if (!data_page || !data_page->data)
+ goto out;
+ if (data_page->order != buffer->subbuf_order)
goto out;
- bpage = *data_page;
+ bpage = data_page->data;
if (!bpage)
goto out;
@@ -5767,11 +5747,11 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* swap the pages */
rb_init_page(bpage);
bpage = reader->page;
- reader->page = *data_page;
+ reader->page = data_page->data;
local_set(&reader->write, 0);
local_set(&reader->entries, 0);
reader->read = 0;
- *data_page = bpage;
+ data_page->data = bpage;
/*
* Use the real_end for the data size,
@@ -5793,7 +5773,7 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/* If there is room at the end of the page to save the
* missed events, then record it there.
*/
- if (BUF_PAGE_SIZE - commit >= sizeof(missed_events)) {
+ if (buffer->subbuf_size - commit >= sizeof(missed_events)) {
memcpy(&bpage->data[commit], &missed_events,
sizeof(missed_events));
local_add(RB_MISSED_STORED, &bpage->commit);
@@ -5805,8 +5785,8 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
/*
* This page may be off to user land. Zero it out here.
*/
- if (commit < BUF_PAGE_SIZE)
- memset(&bpage->data[commit], 0, BUF_PAGE_SIZE - commit);
+ if (commit < buffer->subbuf_size)
+ memset(&bpage->data[commit], 0, buffer->subbuf_size - commit);
out_unlock:
raw_spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -5816,6 +5796,209 @@ int ring_buffer_read_page(struct trace_buffer *buffer,
}
EXPORT_SYMBOL_GPL(ring_buffer_read_page);
+/**
+ * ring_buffer_read_page_data - get pointer to the data in the page.
+ * @page: the page to get the data from
+ *
+ * Returns pointer to the actual data in this page.
+ */
+void *ring_buffer_read_page_data(struct buffer_data_read_page *page)
+{
+ return page->data;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_read_page_data);
+
+/**
+ * ring_buffer_subbuf_size_get - get size of the sub buffer.
+ * @buffer: the buffer to get the sub buffer size from
+ *
+ * Returns size of the sub buffer, in bytes.
+ */
+int ring_buffer_subbuf_size_get(struct trace_buffer *buffer)
+{
+ return buffer->subbuf_size + BUF_PAGE_HDR_SIZE;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_size_get);
+
+/**
+ * ring_buffer_subbuf_order_get - get order of system sub pages in one buffer page.
+ * @buffer: The ring_buffer to get the system sub page order from
+ *
+ * By default, one ring buffer sub page equals to one system page. This parameter
+ * is configurable, per ring buffer. The size of the ring buffer sub page can be
+ * extended, but must be an order of system page size.
+ *
+ * Returns the order of buffer sub page size, in system pages:
+ * 0 means the sub buffer size is 1 system page and so forth.
+ * In case of an error < 0 is returned.
+ */
+int ring_buffer_subbuf_order_get(struct trace_buffer *buffer)
+{
+ if (!buffer)
+ return -EINVAL;
+
+ return buffer->subbuf_order;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_get);
+
+/**
+ * ring_buffer_subbuf_order_set - set the size of ring buffer sub page.
+ * @buffer: The ring_buffer to set the new page size.
+ * @order: Order of the system pages in one sub buffer page
+ *
+ * By default, one ring buffer pages equals to one system page. This API can be
+ * used to set new size of the ring buffer page. The size must be order of
+ * system page size, that's why the input parameter @order is the order of
+ * system pages that are allocated for one ring buffer page:
+ * 0 - 1 system page
+ * 1 - 2 system pages
+ * 3 - 4 system pages
+ * ...
+ *
+ * Returns 0 on success or < 0 in case of an error.
+ */
+int ring_buffer_subbuf_order_set(struct trace_buffer *buffer, int order)
+{
+ struct ring_buffer_per_cpu *cpu_buffer;
+ struct buffer_page *bpage, *tmp;
+ int old_order, old_size;
+ int nr_pages;
+ int psize;
+ int err;
+ int cpu;
+
+ if (!buffer || order < 0)
+ return -EINVAL;
+
+ if (buffer->subbuf_order == order)
+ return 0;
+
+ psize = (1 << order) * PAGE_SIZE;
+ if (psize <= BUF_PAGE_HDR_SIZE)
+ return -EINVAL;
+
+ old_order = buffer->subbuf_order;
+ old_size = buffer->subbuf_size;
+
+ /* prevent another thread from changing buffer sizes */
+ mutex_lock(&buffer->mutex);
+ atomic_inc(&buffer->record_disabled);
+
+ /* Make sure all commits have finished */
+ synchronize_rcu();
+
+ buffer->subbuf_order = order;
+ buffer->subbuf_size = psize - BUF_PAGE_HDR_SIZE;
+
+ /* Make sure all new buffers are allocated, before deleting the old ones */
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Update the number of pages to match the new size */
+ nr_pages = old_size * buffer->buffers[cpu]->nr_pages;
+ nr_pages = DIV_ROUND_UP(nr_pages, buffer->subbuf_size);
+
+ /* we need a minimum of two pages */
+ if (nr_pages < 2)
+ nr_pages = 2;
+
+ cpu_buffer->nr_pages_to_update = nr_pages;
+
+ /* Include the reader page */
+ nr_pages++;
+
+ /* Allocate the new size buffer */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+ if (__rb_allocate_pages(cpu_buffer, nr_pages,
+ &cpu_buffer->new_pages)) {
+ /* not enough memory for new pages */
+ err = -ENOMEM;
+ goto error;
+ }
+ }
+
+ for_each_buffer_cpu(buffer, cpu) {
+
+ if (!cpumask_test_cpu(cpu, buffer->cpumask))
+ continue;
+
+ cpu_buffer = buffer->buffers[cpu];
+
+ /* Clear the head bit to make the link list normal to read */
+ rb_head_page_deactivate(cpu_buffer);
+
+ /* Now walk the list and free all the old sub buffers */
+ list_for_each_entry_safe(bpage, tmp, cpu_buffer->pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ /* The above loop stopped an the last page needing to be freed */
+ bpage = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ free_buffer_page(bpage);
+
+ /* Free the current reader page */
+ free_buffer_page(cpu_buffer->reader_page);
+
+ /* One page was allocated for the reader page */
+ cpu_buffer->reader_page = list_entry(cpu_buffer->new_pages.next,
+ struct buffer_page, list);
+ list_del_init(&cpu_buffer->reader_page->list);
+
+ /* The cpu_buffer pages are a link list with no head */
+ cpu_buffer->pages = cpu_buffer->new_pages.next;
+ cpu_buffer->new_pages.next->prev = cpu_buffer->new_pages.prev;
+ cpu_buffer->new_pages.prev->next = cpu_buffer->new_pages.next;
+
+ /* Clear the new_pages list */
+ INIT_LIST_HEAD(&cpu_buffer->new_pages);
+
+ cpu_buffer->head_page
+ = list_entry(cpu_buffer->pages, struct buffer_page, list);
+ cpu_buffer->tail_page = cpu_buffer->commit_page = cpu_buffer->head_page;
+
+ cpu_buffer->nr_pages = cpu_buffer->nr_pages_to_update;
+ cpu_buffer->nr_pages_to_update = 0;
+
+ free_pages((unsigned long)cpu_buffer->free_page, old_order);
+ cpu_buffer->free_page = NULL;
+
+ rb_head_page_activate(cpu_buffer);
+
+ rb_check_pages(cpu_buffer);
+ }
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ return 0;
+
+error:
+ buffer->subbuf_order = old_order;
+ buffer->subbuf_size = old_size;
+
+ atomic_dec(&buffer->record_disabled);
+ mutex_unlock(&buffer->mutex);
+
+ for_each_buffer_cpu(buffer, cpu) {
+ cpu_buffer = buffer->buffers[cpu];
+
+ if (!cpu_buffer->nr_pages_to_update)
+ continue;
+
+ list_for_each_entry_safe(bpage, tmp, &cpu_buffer->new_pages, list) {
+ list_del_init(&bpage->list);
+ free_buffer_page(bpage);
+ }
+ }
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_subbuf_order_set);
+
/*
* We only allocate new buffers, never free them if the CPU goes down.
* If we were to free the buffer, then the user would lose any trace that was in
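
The kernel-doc usage example updated earlier in this file shows the reworked read-page flow; here is a hedged, self-contained sketch of that flow. drain_one_page() is a made-up helper, error handling is abbreviated, and parsing of the copied events is left out.

#include <linux/err.h>
#include <linux/ring_buffer.h>

static int drain_one_page(struct trace_buffer *buffer, int cpu)
{
	struct buffer_data_read_page *rpage;
	size_t len = ring_buffer_subbuf_size_get(buffer);
	int ret;

	rpage = ring_buffer_alloc_read_page(buffer, cpu);
	if (IS_ERR(rpage))
		return PTR_ERR(rpage);

	ret = ring_buffer_read_page(buffer, rpage, len, cpu, 0);
	if (ret >= 0) {
		/* The data pointer is now reached through an accessor */
		void *data = ring_buffer_read_page_data(rpage);

		/* ... parse the events found in 'data' here ... */
		(void)data;
	}

	ring_buffer_free_read_page(buffer, cpu, rpage);

	return ret < 0 ? ret : 0;
}
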
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
index aef34673d79d..008187ebd7fe 100644
--- a/kernel/trace/ring_buffer_benchmark.c
+++ b/kernel/trace/ring_buffer_benchmark.c
@@ -104,10 +104,11 @@ static enum event_status read_event(int cpu)
static enum event_status read_page(int cpu)
{
+ struct buffer_data_read_page *bpage;
struct ring_buffer_event *event;
struct rb_page *rpage;
unsigned long commit;
- void *bpage;
+ int page_size;
int *entry;
int ret;
int inc;
@@ -117,14 +118,15 @@ static enum event_status read_page(int cpu)
if (IS_ERR(bpage))
return EVENT_DROPPED;
- ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+ page_size = ring_buffer_subbuf_size_get(buffer);
+ ret = ring_buffer_read_page(buffer, bpage, page_size, cpu, 1);
if (ret >= 0) {
- rpage = bpage;
+ rpage = ring_buffer_read_page_data(bpage);
/* The commit may have missed event flags set, clear them */
commit = local_read(&rpage->commit) & 0xfffff;
for (i = 0; i < commit && !test_error ; i += inc) {
- if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+ if (i >= (page_size - offsetof(struct rb_page, data))) {
TEST_ERROR();
break;
}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0defe156b57..2a7c6fd934e9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1263,10 +1263,17 @@ static void set_buffer_entries(struct array_buffer *buf, unsigned long val);
int tracing_alloc_snapshot_instance(struct trace_array *tr)
{
+ int order;
int ret;
if (!tr->allocated_snapshot) {
+ /* Make the snapshot buffer have the same order as main buffer */
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret < 0)
+ return ret;
+
/* allocate spare buffer */
ret = resize_buffer_duplicate_size(&tr->max_buffer,
&tr->array_buffer, RING_BUFFER_ALL_CPUS);
@@ -1286,6 +1293,7 @@ static void free_snapshot(struct trace_array *tr)
* The max_tr ring buffer has some state (e.g. ring->clock) and
* we want preserve it.
*/
+ ring_buffer_subbuf_order_set(tr->max_buffer.buffer, 0);
ring_buffer_resize(tr->max_buffer.buffer, 1, RING_BUFFER_ALL_CPUS);
set_buffer_entries(&tr->max_buffer, 1);
tracing_reset_online_cpus(&tr->max_buffer);
@@ -3767,7 +3775,7 @@ static bool trace_safe_str(struct trace_iterator *iter, const char *str,
/* OK if part of the temp seq buffer */
if ((addr >= (unsigned long)iter->tmp_seq.buffer) &&
- (addr < (unsigned long)iter->tmp_seq.buffer + PAGE_SIZE))
+ (addr < (unsigned long)iter->tmp_seq.buffer + TRACE_SEQ_BUFFER_SIZE))
return true;
/* Core rodata can not be freed */
@@ -5032,7 +5040,7 @@ static int tracing_release(struct inode *inode, struct file *file)
return 0;
}
-static int tracing_release_generic_tr(struct inode *inode, struct file *file)
+int tracing_release_generic_tr(struct inode *inode, struct file *file)
{
struct trace_array *tr = inode->i_private;
@@ -6946,8 +6954,8 @@ waitagain:
goto out;
}
- if (cnt >= PAGE_SIZE)
- cnt = PAGE_SIZE - 1;
+ if (cnt >= TRACE_SEQ_BUFFER_SIZE)
+ cnt = TRACE_SEQ_BUFFER_SIZE - 1;
/* reset all but tr, trace, and overruns */
trace_iterator_reset(iter);
@@ -7292,8 +7300,9 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
enum event_trigger_type tt = ETT_NONE;
struct trace_buffer *buffer;
struct print_entry *entry;
+ int meta_size;
ssize_t written;
- int size;
+ size_t size;
int len;
/* Used in tracing_mark_raw_write() as well */
@@ -7306,23 +7315,44 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
if (!(tr->trace_flags & TRACE_ITER_MARKERS))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
-
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
+ if ((ssize_t)cnt < 0)
+ return -EINVAL;
- size = sizeof(*entry) + cnt + 2; /* add '\0' and possible '\n' */
+ meta_size = sizeof(*entry) + 2; /* add '\0' and possible '\n' */
+ again:
+ size = cnt + meta_size;
/* If less than "<faulted>", then make sure we can still add that */
if (cnt < FAULTED_SIZE)
size += FAULTED_SIZE - cnt;
+ if (size > TRACE_SEQ_BUFFER_SIZE) {
+ cnt -= size - TRACE_SEQ_BUFFER_SIZE;
+ goto again;
+ }
+
buffer = tr->array_buffer.buffer;
event = __trace_buffer_lock_reserve(buffer, TRACE_PRINT, size,
tracing_gen_ctx());
- if (unlikely(!event))
+ if (unlikely(!event)) {
+ /*
+ * If the size was greater than what was allowed, then
+ * make it smaller and try again.
+ */
+ if (size > ring_buffer_max_event_size(buffer)) {
+ /* cnt < FAULTED size should never be bigger than max */
+ if (WARN_ON_ONCE(cnt < FAULTED_SIZE))
+ return -EBADF;
+ cnt = ring_buffer_max_event_size(buffer) - meta_size;
+ /* The above should only happen once */
+ if (WARN_ON_ONCE(cnt + meta_size == size))
+ return -EBADF;
+ goto again;
+ }
+
/* Ring buffer disabled, return as if not open for write */
return -EBADF;
+ }
entry = ring_buffer_event_data(event);
entry->ip = _THIS_IP_;
@@ -7357,9 +7387,6 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
return written;
}
-/* Limit it for now to 3K (including tag) */
-#define RAW_DATA_MAX_SIZE (1024*3)
-
static ssize_t
tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
size_t cnt, loff_t *fpos)
@@ -7381,19 +7408,18 @@ tracing_mark_raw_write(struct file *filp, const char __user *ubuf,
return -EINVAL;
/* The marker must at least have a tag id */
- if (cnt < sizeof(unsigned int) || cnt > RAW_DATA_MAX_SIZE)
+ if (cnt < sizeof(unsigned int))
return -EINVAL;
- if (cnt > TRACE_BUF_SIZE)
- cnt = TRACE_BUF_SIZE;
-
- BUILD_BUG_ON(TRACE_BUF_SIZE >= PAGE_SIZE);
-
size = sizeof(*entry) + cnt;
if (cnt < FAULT_SIZE_ID)
size += FAULT_SIZE_ID - cnt;
buffer = tr->array_buffer.buffer;
+
+ if (size > ring_buffer_max_event_size(buffer))
+ return -EINVAL;
+
event = __trace_buffer_lock_reserve(buffer, TRACE_RAW_DATA, size,
tracing_gen_ctx());
if (!event)
@@ -7578,6 +7604,7 @@ struct ftrace_buffer_info {
struct trace_iterator iter;
void *spare;
unsigned int spare_cpu;
+ unsigned int spare_size;
unsigned int read;
};
@@ -8282,6 +8309,8 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
{
struct ftrace_buffer_info *info = filp->private_data;
struct trace_iterator *iter = &info->iter;
+ void *trace_data;
+ int page_size;
ssize_t ret = 0;
ssize_t size;
@@ -8293,6 +8322,17 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
return -EBUSY;
#endif
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+
+ /* Make sure the spare matches the current sub buffer size */
+ if (info->spare) {
+ if (page_size != info->spare_size) {
+ ring_buffer_free_read_page(iter->array_buffer->buffer,
+ info->spare_cpu, info->spare);
+ info->spare = NULL;
+ }
+ }
+
if (!info->spare) {
info->spare = ring_buffer_alloc_read_page(iter->array_buffer->buffer,
iter->cpu_file);
@@ -8301,19 +8341,20 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->spare = NULL;
} else {
info->spare_cpu = iter->cpu_file;
+ info->spare_size = page_size;
}
}
if (!info->spare)
return ret;
/* Do we have previous read data to read? */
- if (info->read < PAGE_SIZE)
+ if (info->read < page_size)
goto read;
again:
trace_access_lock(iter->cpu_file);
ret = ring_buffer_read_page(iter->array_buffer->buffer,
- &info->spare,
+ info->spare,
count,
iter->cpu_file, 0);
trace_access_unlock(iter->cpu_file);
@@ -8334,11 +8375,11 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
info->read = 0;
read:
- size = PAGE_SIZE - info->read;
+ size = page_size - info->read;
if (size > count)
size = count;
-
- ret = copy_to_user(ubuf, info->spare + info->read, size);
+ trace_data = ring_buffer_read_page_data(info->spare);
+ ret = copy_to_user(ubuf, trace_data + info->read, size);
if (ret == size)
return -EFAULT;
@@ -8449,6 +8490,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
.spd_release = buffer_spd_release,
};
struct buffer_ref *ref;
+ int page_size;
int entries, i;
ssize_t ret = 0;
@@ -8457,13 +8499,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
return -EBUSY;
#endif
- if (*ppos & (PAGE_SIZE - 1))
+ page_size = ring_buffer_subbuf_size_get(iter->array_buffer->buffer);
+ if (*ppos & (page_size - 1))
return -EINVAL;
- if (len & (PAGE_SIZE - 1)) {
- if (len < PAGE_SIZE)
+ if (len & (page_size - 1)) {
+ if (len < page_size)
return -EINVAL;
- len &= PAGE_MASK;
+ len &= (~(page_size - 1));
}
if (splice_grow_spd(pipe, &spd))
@@ -8473,7 +8516,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
trace_access_lock(iter->cpu_file);
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
- for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= PAGE_SIZE) {
+ for (i = 0; i < spd.nr_pages_max && len && entries; i++, len -= page_size) {
struct page *page;
int r;
@@ -8494,7 +8537,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
}
ref->cpu = iter->cpu_file;
- r = ring_buffer_read_page(ref->buffer, &ref->page,
+ r = ring_buffer_read_page(ref->buffer, ref->page,
len, iter->cpu_file, 1);
if (r < 0) {
ring_buffer_free_read_page(ref->buffer, ref->cpu,
@@ -8503,14 +8546,14 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
break;
}
- page = virt_to_page(ref->page);
+ page = virt_to_page(ring_buffer_read_page_data(ref->page));
spd.pages[i] = page;
- spd.partial[i].len = PAGE_SIZE;
+ spd.partial[i].len = page_size;
spd.partial[i].offset = 0;
spd.partial[i].private = (unsigned long)ref;
spd.nr_pages++;
- *ppos += PAGE_SIZE;
+ *ppos += page_size;
entries = ring_buffer_entries_cpu(iter->array_buffer->buffer, iter->cpu_file);
}
@@ -9354,6 +9397,103 @@ static const struct file_operations buffer_percent_fops = {
.llseek = default_llseek,
};
+static ssize_t
+buffer_subbuf_size_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ size_t size;
+ char buf[64];
+ int order;
+ int r;
+
+ order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ size = (PAGE_SIZE << order) / 1024;
+
+ r = sprintf(buf, "%zd\n", size);
+
+ return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+buffer_subbuf_size_write(struct file *filp, const char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ struct trace_array *tr = filp->private_data;
+ unsigned long val;
+ int old_order;
+ int order;
+ int pages;
+ int ret;
+
+ ret = kstrtoul_from_user(ubuf, cnt, 10, &val);
+ if (ret)
+ return ret;
+
+ val *= 1024; /* value passed in is in KB */
+
+ pages = DIV_ROUND_UP(val, PAGE_SIZE);
+ order = fls(pages - 1);
+
+ /* limit between 1 and 128 system pages */
+ if (order < 0 || order > 7)
+ return -EINVAL;
+
+ /* Do not allow tracing while changing the order of the ring buffer */
+ tracing_stop_tr(tr);
+
+ old_order = ring_buffer_subbuf_order_get(tr->array_buffer.buffer);
+ if (old_order == order)
+ goto out;
+
+ ret = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, order);
+ if (ret)
+ goto out;
+
+#ifdef CONFIG_TRACER_MAX_TRACE
+
+ if (!tr->allocated_snapshot)
+ goto out_max;
+
+ ret = ring_buffer_subbuf_order_set(tr->max_buffer.buffer, order);
+ if (ret) {
+ /* Put back the old order */
+ cnt = ring_buffer_subbuf_order_set(tr->array_buffer.buffer, old_order);
+ if (WARN_ON_ONCE(cnt)) {
+ /*
+ * AARGH! We are left with different orders!
+ * The max buffer is our "snapshot" buffer.
+ * When a tracer needs a snapshot (one of the
+ * latency tracers), it swaps the max buffer
+ * with the saved snapshot. We succeeded in
+ * updating the order of the main buffer, but failed to
+ * update the order of the max buffer. Then, when we tried
+ * to reset the main buffer to its original order, that
+ * failed too. This is very unlikely to
+ * happen, but if it does, warn and kill all
+ * tracing.
+ */
+ tracing_disabled = 1;
+ }
+ goto out;
+ }
+ out_max:
+#endif
+ (*ppos)++;
+ out:
+ if (ret)
+ cnt = ret;
+ tracing_start_tr(tr);
+ return cnt;
+}
+
+static const struct file_operations buffer_subbuf_size_fops = {
+ .open = tracing_open_generic_tr,
+ .read = buffer_subbuf_size_read,
+ .write = buffer_subbuf_size_write,
+ .release = tracing_release_generic_tr,
+ .llseek = default_llseek,
+};
+
static struct dentry *trace_instance_dir;
static void
@@ -9504,7 +9644,8 @@ static int trace_array_create_dir(struct trace_array *tr)
return ret;
}
-static struct trace_array *trace_array_create(const char *name)
+static struct trace_array *
+trace_array_create_systems(const char *name, const char *systems)
{
struct trace_array *tr;
int ret;
@@ -9524,6 +9665,12 @@ static struct trace_array *trace_array_create(const char *name)
if (!zalloc_cpumask_var(&tr->pipe_cpumask, GFP_KERNEL))
goto out_free_tr;
+ if (systems) {
+ tr->system_names = kstrdup_const(systems, GFP_KERNEL);
+ if (!tr->system_names)
+ goto out_free_tr;
+ }
+
tr->trace_flags = global_trace.trace_flags & ~ZEROED_TRACE_FLAGS;
cpumask_copy(tr->tracing_cpumask, cpu_all_mask);
@@ -9570,12 +9717,18 @@ static struct trace_array *trace_array_create(const char *name)
free_trace_buffers(tr);
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
return ERR_PTR(ret);
}
+static struct trace_array *trace_array_create(const char *name)
+{
+ return trace_array_create_systems(name, NULL);
+}
+
static int instance_mkdir(const char *name)
{
struct trace_array *tr;
@@ -9601,6 +9754,7 @@ out_unlock:
/**
* trace_array_get_by_name - Create/Lookup a trace array, given its name.
* @name: The name of the trace array to be looked up/created.
+ * @systems: A list of systems to create event directories for (NULL for all)
*
* Returns pointer to trace array with given name.
* NULL, if it cannot be created.
@@ -9614,7 +9768,7 @@ out_unlock:
* trace_array_put() is called, user space can not delete it.
*
*/
-struct trace_array *trace_array_get_by_name(const char *name)
+struct trace_array *trace_array_get_by_name(const char *name, const char *systems)
{
struct trace_array *tr;
@@ -9626,7 +9780,7 @@ struct trace_array *trace_array_get_by_name(const char *name)
goto out_unlock;
}
- tr = trace_array_create(name);
+ tr = trace_array_create_systems(name, systems);
if (IS_ERR(tr))
tr = NULL;
@@ -9673,6 +9827,7 @@ static int __remove_instance(struct trace_array *tr)
free_cpumask_var(tr->pipe_cpumask);
free_cpumask_var(tr->tracing_cpumask);
+ kfree_const(tr->system_names);
kfree(tr->name);
kfree(tr);
@@ -9805,6 +9960,9 @@ init_tracer_tracefs(struct trace_array *tr, struct dentry *d_tracer)
trace_create_file("buffer_percent", TRACE_MODE_WRITE, d_tracer,
tr, &buffer_percent_fops);
+ trace_create_file("buffer_subbuf_size_kb", TRACE_MODE_WRITE, d_tracer,
+ tr, &buffer_subbuf_size_fops);
+
create_trace_options_dir(tr);
#ifdef CONFIG_TRACER_MAX_TRACE
@@ -10391,7 +10549,7 @@ __init static void enable_instances(void)
if (IS_ENABLED(CONFIG_TRACER_MAX_TRACE))
do_allocate_snapshot(tok);
- tr = trace_array_get_by_name(tok);
+ tr = trace_array_get_by_name(tok, NULL);
if (!tr) {
pr_warn("Failed to create instance buffer %s\n", curr_str);
continue;
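
The KB-to-order rounding done by buffer_subbuf_size_write() above is easy to misread in the diff, so here is a minimal user-space sketch of that rounding. It is not kernel code: SKETCH_PAGE_SIZE, fls_long() and the kb[] values are illustrative stand-ins for PAGE_SIZE, the kernel's fls() and whatever a user writes into buffer_subbuf_size_kb, and a 4K page size is assumed.

/*
 * User-space sketch of the KB -> sub-buffer order rounding used by
 * buffer_subbuf_size_write() above. Assumes 4K pages.
 */
#include <stdio.h>

#define SKETCH_PAGE_SIZE 4096UL

static int fls_long(unsigned long x)	/* index of highest set bit, 1-based */
{
	int bit = 0;

	while (x) {
		bit++;
		x >>= 1;
	}
	return bit;
}

int main(void)
{
	unsigned long kb[] = { 1, 4, 10, 32, 128 };
	int i;

	for (i = 0; i < 5; i++) {
		unsigned long bytes = kb[i] * 1024;
		/* DIV_ROUND_UP(bytes, PAGE_SIZE) */
		unsigned long pages = (bytes + SKETCH_PAGE_SIZE - 1) / SKETCH_PAGE_SIZE;
		int order = fls_long(pages - 1);

		/* e.g. 10 KB -> 3 pages -> order 2 -> 16 KB sub-buffers */
		printf("%3lu KB -> order %d -> %3lu KB sub-buffers\n",
		       kb[i], order, (SKETCH_PAGE_SIZE << order) / 1024);
	}
	return 0;
}
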
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0489e72c8169..00f873910c5d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -377,6 +377,7 @@ struct trace_array {
unsigned char trace_flags_index[TRACE_FLAGS_MAX_SIZE];
unsigned int flags;
raw_spinlock_t start_lock;
+ const char *system_names;
struct list_head err_log;
struct dentry *dir;
struct dentry *options;
@@ -615,6 +616,7 @@ void tracing_reset_all_online_cpus(void);
void tracing_reset_all_online_cpus_unlocked(void);
int tracing_open_generic(struct inode *inode, struct file *filp);
int tracing_open_generic_tr(struct inode *inode, struct file *filp);
+int tracing_release_generic_tr(struct inode *inode, struct file *file);
int tracing_open_file_tr(struct inode *inode, struct file *filp);
int tracing_release_file_tr(struct inode *inode, struct file *filp);
int tracing_single_release_file_tr(struct inode *inode, struct file *filp);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 7ccc7a8e155b..dbe29b4c6a7a 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -633,7 +633,7 @@ trace_boot_init_instances(struct xbc_node *node)
if (!p || *p == '\0')
continue;
- tr = trace_array_get_by_name(p);
+ tr = trace_array_get_by_name(p, NULL);
if (!tr) {
pr_err("Failed to get trace instance %s\n", p);
continue;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f29e815ca5b2..7c364b87352e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -1893,9 +1893,9 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
}
static ssize_t
-show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+show_header_page_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
{
- int (*func)(struct trace_seq *s) = filp->private_data;
+ struct trace_array *tr = filp->private_data;
struct trace_seq *s;
int r;
@@ -1908,7 +1908,31 @@ show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
trace_seq_init(s);
- func(s);
+ ring_buffer_print_page_header(tr->array_buffer.buffer, s);
+ r = simple_read_from_buffer(ubuf, cnt, ppos,
+ s->buffer, trace_seq_used(s));
+
+ kfree(s);
+
+ return r;
+}
+
+static ssize_t
+show_header_event_file(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+ struct trace_seq *s;
+ int r;
+
+ if (*ppos)
+ return 0;
+
+ s = kmalloc(sizeof(*s), GFP_KERNEL);
+ if (!s)
+ return -ENOMEM;
+
+ trace_seq_init(s);
+
+ ring_buffer_print_entry_header(s);
r = simple_read_from_buffer(ubuf, cnt, ppos,
s->buffer, trace_seq_used(s));
@@ -2165,10 +2189,18 @@ static const struct file_operations ftrace_tr_enable_fops = {
.release = subsystem_release,
};
-static const struct file_operations ftrace_show_header_fops = {
- .open = tracing_open_generic,
- .read = show_header,
+static const struct file_operations ftrace_show_header_page_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_page_file,
+ .llseek = default_llseek,
+ .release = tracing_release_generic_tr,
+};
+
+static const struct file_operations ftrace_show_header_event_fops = {
+ .open = tracing_open_generic_tr,
+ .read = show_header_event_file,
.llseek = default_llseek,
+ .release = tracing_release_generic_tr,
};
static int
@@ -2896,6 +2928,27 @@ void trace_event_eval_update(struct trace_eval_map **map, int len)
up_write(&trace_event_sem);
}
+static bool event_in_systems(struct trace_event_call *call,
+ const char *systems)
+{
+ const char *system;
+ const char *p;
+
+ if (!systems)
+ return true;
+
+ system = call->class->system;
+ p = strstr(systems, system);
+ if (!p)
+ return false;
+
+ if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
+ return false;
+
+ p += strlen(system);
+ return !*p || isspace(*p) || *p == ',';
+}
+
static struct trace_event_file *
trace_create_new_event(struct trace_event_call *call,
struct trace_array *tr)
@@ -2905,9 +2958,12 @@ trace_create_new_event(struct trace_event_call *call,
struct trace_event_file *file;
unsigned int first;
+ if (!event_in_systems(call, tr->system_names))
+ return NULL;
+
file = kmem_cache_alloc(file_cachep, GFP_TRACE);
if (!file)
- return NULL;
+ return ERR_PTR(-ENOMEM);
pid_list = rcu_dereference_protected(tr->filtered_pids,
lockdep_is_held(&event_mutex));
@@ -2972,8 +3028,17 @@ __trace_add_new_event(struct trace_event_call *call, struct trace_array *tr)
struct trace_event_file *file;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if the
+ * allocation failed, or NULL if the event is not part of
+ * tr->system_names. When the event is not part of
+ * tr->system_names, return zero, not an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
if (eventdir_initialized)
return event_create_dir(tr->event_dir, file);
@@ -3012,8 +3077,17 @@ __trace_early_add_new_event(struct trace_event_call *call,
int ret;
file = trace_create_new_event(call, tr);
+ /*
+ * trace_create_new_event() returns ERR_PTR(-ENOMEM) if the
+ * allocation failed, or NULL if the event is not part of
+ * tr->system_names. When the event is not part of
+ * tr->system_names, return zero, not an error.
+ */
if (!file)
- return -ENOMEM;
+ return 0;
+
+ if (IS_ERR(file))
+ return PTR_ERR(file);
ret = event_define_fields(call);
if (ret)
@@ -3752,17 +3826,16 @@ static int events_callback(const char *name, umode_t *mode, void **data,
return 1;
}
- if (strcmp(name, "header_page") == 0)
- *data = ring_buffer_print_page_header;
-
- else if (strcmp(name, "header_event") == 0)
- *data = ring_buffer_print_entry_header;
+ if (strcmp(name, "header_page") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_page_fops;
- else
+ } else if (strcmp(name, "header_event") == 0) {
+ *mode = TRACE_MODE_READ;
+ *fops = &ftrace_show_header_event_fops;
+ } else
return 0;
- *mode = TRACE_MODE_READ;
- *fops = &ftrace_show_header_fops;
return 1;
}
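
The whole-token matching that event_in_systems() above applies to the comma- or space-separated systems list can be exercised with a small user-space sketch. system_in_list() below is a hypothetical stand-in that mirrors that logic, not a kernel function, and the "sched,timer,kprobes" list is only an illustrative value.

/*
 * User-space sketch mirroring the matching rule of event_in_systems():
 * the event's system name must match a whole token in the list.
 */
#include <ctype.h>
#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool system_in_list(const char *system, const char *systems)
{
	const char *p;

	if (!systems)
		return true;		/* NULL means "all systems" */

	p = strstr(systems, system);
	if (!p)
		return false;

	/* must start at the list head or right after a separator */
	if (p != systems && !isspace(*(p - 1)) && *(p - 1) != ',')
		return false;

	/* and must end at a separator or at the end of the list */
	p += strlen(system);
	return !*p || isspace(*p) || *p == ',';
}

int main(void)
{
	const char *list = "sched,timer,kprobes";

	printf("sched -> %d\n", system_in_list("sched", list));	/* 1 */
	printf("timer -> %d\n", system_in_list("timer", list));	/* 1 */
	printf("time  -> %d\n", system_in_list("time", list));	/* 0: partial token */
	printf("irq   -> %d\n", system_in_list("irq", list));	/* 0 */
	return 0;
}
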
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 5ecf3c8bde20..6ece1308d36a 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -4805,36 +4805,35 @@ static int parse_actions(struct hist_trigger_data *hist_data)
int len;
for (i = 0; i < hist_data->attrs->n_actions; i++) {
+ enum handler_id hid = 0;
+ char *action_str;
+
str = hist_data->attrs->action_str[i];
- if ((len = str_has_prefix(str, "onmatch("))) {
- char *action_str = str + len;
+ if ((len = str_has_prefix(str, "onmatch(")))
+ hid = HANDLER_ONMATCH;
+ else if ((len = str_has_prefix(str, "onmax(")))
+ hid = HANDLER_ONMAX;
+ else if ((len = str_has_prefix(str, "onchange(")))
+ hid = HANDLER_ONCHANGE;
- data = onmatch_parse(tr, action_str);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onmax("))) {
- char *action_str = str + len;
+ action_str = str + len;
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONMAX);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else if ((len = str_has_prefix(str, "onchange("))) {
- char *action_str = str + len;
+ switch (hid) {
+ case HANDLER_ONMATCH:
+ data = onmatch_parse(tr, action_str);
+ break;
+ case HANDLER_ONMAX:
+ case HANDLER_ONCHANGE:
+ data = track_data_parse(hist_data, action_str, hid);
+ break;
+ default:
+ data = ERR_PTR(-EINVAL);
+ break;
+ }
- data = track_data_parse(hist_data, action_str,
- HANDLER_ONCHANGE);
- if (IS_ERR(data)) {
- ret = PTR_ERR(data);
- break;
- }
- } else {
- ret = -EINVAL;
+ if (IS_ERR(data)) {
+ ret = PTR_ERR(data);
break;
}
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
index 7be97229ddf8..c158d65a8a88 100644
--- a/kernel/trace/trace_seq.c
+++ b/kernel/trace/trace_seq.c
@@ -13,9 +13,6 @@
* trace_seq_init() more than once to reset the trace_seq to start
* from scratch.
*
- * The buffer size is currently PAGE_SIZE, although it may become dynamic
- * in the future.
- *
* A write to the buffer will either succeed or fail. That is, unlike
* sprintf() there will not be a partial write (well it may write into
* the buffer but it wont update the pointers). This allows users to
diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c
index 6aba02a31c96..d0ee9001c7b3 100644
--- a/samples/ftrace/sample-trace-array.c
+++ b/samples/ftrace/sample-trace-array.c
@@ -105,7 +105,7 @@ static int __init sample_trace_array_init(void)
* NOTE: This function increments the reference counter
* associated with the trace array - "tr".
*/
- tr = trace_array_get_by_name("sample-instance");
+ tr = trace_array_get_by_name("sample-instance", "sched,timer,kprobes");
if (!tr)
return -1;
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
new file mode 100644
index 000000000000..d44d09a33a74
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/ringbuffer_subbuf_size.tc
@@ -0,0 +1,95 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Change the ringbuffer sub-buffer size
+# requires: buffer_subbuf_size_kb
+# flags: instance
+
+get_buffer_data_size() {
+ sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_buffer_data_offset() {
+ sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_event_header_size() {
+ type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ total_bits=$((type_len+time_len+array_len))
+ total_bits=$((total_bits+7))
+ echo $((total_bits/8))
+}
+
+get_print_event_buf_offset() {
+ sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format
+}
+
+event_header_size=`get_event_header_size`
+print_header_size=`get_print_event_buf_offset`
+
+data_offset=`get_buffer_data_offset`
+
+marker_meta=$((event_header_size+print_header_size))
+
+make_str() {
+ cnt=$1
+ printf -- 'X%.0s' $(seq $cnt)
+}
+
+write_buffer() {
+ size=$1
+
+ str=`make_str $size`
+
+ # clear the buffer
+ echo > trace
+
+ # write the string into the marker
+ echo $str > trace_marker
+
+ echo $str
+}
+
+test_buffer() {
+ size_kb=$1
+ page_size=$((size_kb*1024))
+
+ size=`get_buffer_data_size`
+
+ # the size must be greater than or equal to page_size - data_offset
+ page_size=$((page_size-data_offset))
+ if [ $size -lt $page_size ]; then
+ exit fail
+ fi
+
+ # Now write a string the size of the data area; the meta data overhead will make it overflow
+
+ str=`write_buffer $size`
+
+ # Make sure the line was broken
+ new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace`
+
+ if [ "$new_str" = "$str" ]; then
+ exit fail;
+ fi
+
+ # Make sure the entire line can be found
+ new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace`
+
+ if [ "$new_str" != "$str" ]; then
+ exit fail;
+ fi
+}
+
+ORIG=`cat buffer_subbuf_size_kb`
+
+# Could test bigger sizes than 32K, but then creating the string
+# to write into the ring buffer takes too long
+for a in 4 8 16 32 ; do
+ echo $a > buffer_subbuf_size_kb
+ test_buffer $a
+done
+
+echo $ORIG > buffer_subbuf_size_kb
+
diff --git a/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc
new file mode 100644
index 000000000000..9aa0db2b84fc
--- /dev/null
+++ b/tools/testing/selftests/ftrace/test.d/00basic/trace_marker.tc
@@ -0,0 +1,82 @@
+#!/bin/sh
+# SPDX-License-Identifier: GPL-2.0
+# description: Basic tests on writing to trace_marker
+# requires: trace_marker
+# flags: instance
+
+get_buffer_data_size() {
+ sed -ne 's/^.*data.*size:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_buffer_data_offset() {
+ sed -ne 's/^.*data.*offset:\([0-9][0-9]*\).*/\1/p' events/header_page
+}
+
+get_event_header_size() {
+ type_len=`sed -ne 's/^.*type_len.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ time_len=`sed -ne 's/^.*time_delta.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ array_len=`sed -ne 's/^.*array.*:[^0-9]*\([0-9][0-9]*\).*/\1/p' events/header_event`
+ total_bits=$((type_len+time_len+array_len))
+ total_bits=$((total_bits+7))
+ echo $((total_bits/8))
+}
+
+get_print_event_buf_offset() {
+ sed -ne 's/^.*buf.*offset:\([0-9][0-9]*\).*/\1/p' events/ftrace/print/format
+}
+
+event_header_size=`get_event_header_size`
+print_header_size=`get_print_event_buf_offset`
+
+data_offset=`get_buffer_data_offset`
+
+marker_meta=$((event_header_size+print_header_size))
+
+make_str() {
+ cnt=$1
+ # subtract two for \n\0 as marker adds these
+ cnt=$((cnt-2))
+ printf -- 'X%.0s' $(seq $cnt)
+}
+
+write_buffer() {
+ size=$1
+
+ str=`make_str $size`
+
+ # clear the buffer
+ echo > trace
+
+ # write the string into the marker
+ echo -n $str > trace_marker
+
+ echo $str
+}
+
+test_buffer() {
+
+ size=`get_buffer_data_size`
+ oneline_size=$((size-marker_meta))
+ echo size = $size
+ echo meta size = $marker_meta
+
+ # Now write a string the size of the data area; the meta data overhead will make it overflow
+
+ str=`write_buffer $size`
+
+ # Make sure the line was broken
+ new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; exit}' trace`
+
+ if [ "$new_str" = "$str" ]; then
+ exit fail;
+ fi
+
+ # Make sure the entire line can be found
+ new_str=`awk ' /tracing_mark_write:/ { sub(/^.*tracing_mark_write: /,"");printf "%s", $0; }' trace`
+
+ if [ "$new_str" != "$str" ]; then
+ exit fail;
+ fi
+}
+
+test_buffer