See the scenario below.
It is quite typical in data warehouses, to access data via dimension tables.
This is an example for a date dimension.
-- row generator
create table rgen
select 1 as n union select 2 union select 3 union select 4 union
select 5 union select 6 union select 7 union select 8 union
select 9 union select 10;
-- fact table: 1 million rows
select @a:=0;
create table r1tst
select
@a:=@a + 1 as rkey,
floor(@a/10000) as rbatchid,
floor(rand()*10) + 1 as rtype,
adddate('2010-01-01', interval (@a/4000) day) as rdate,
(rand()*1000000/100) as rsum
from rgen a,rgen b,rgen c,rgen d,rgen e,rgen f;
alter table r1tst add index(rbatchid),
add index(rdate);
-- date dimension
select @d:=-1;
create table r_date_dim select
@d:=@d+1 as date_key,
adddate('2010-01-01', interval @d day) as date_dt,
DAY(adddate('2010-01-01', interval @d day)) as date_day,
DAYNAME(adddate('2010-01-01', interval @d day)) as date_day_name,
DAYOFMONTH(adddate('2010-01-01', interval @d day)) as date_day_of_month,
DAYOFWEEK(adddate('2010-01-01', interval @d day)) as date_day_of_week,
DAYOFYEAR(adddate('2010-01-01', interval @d day)) as date_day_of_year,
LAST_DAY(adddate('2010-01-01', interval @d day)) as date_last_day,
MONTH(adddate('2010-01-01', interval @d day)) as date_month,
MONTHNAME(adddate('2010-01-01', interval @d day)) as date_month_name,
QUARTER(adddate('2010-01-01', interval @d day)) as date_quarter,
TO_DAYS(adddate('2010-01-01', interval @d day)) as date_days,
WEEK(adddate('2010-01-01', interval @d day)) as date_week,
WEEKDAY(adddate('2010-01-01', interval @d day)) as date_week_day,
WEEKOFYEAR(adddate('2010-01-01', interval @d day)) as date_week_of_year,
YEAR(adddate('2010-01-01', interval @d day)) as date_year,
YEARWEEK(adddate('2010-01-01', interval @d day)) as year_week
from rgen a,rgen b, rgen c;
alter table r_date_dim add index (date_year),
add index (date_month), add index(date_dt),
add index (date_key);
-- query A:
mysql> select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=v2.date_dt and v2.date_year=2010 and v2.date_quarter=2;
+-----------------+
| sum(rsum) |
+-----------------+
| 1819518419.8031 |
+-----------------+
1 row in set (37.64 sec)
mysql> explain select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=v2.date_dt and v2.date_year=2010 and v2.date_quarter=2;
+--+------+---+----+-----------------+-------+----+--------+-------+-----------+
|id|select|tab|type|possible_keys |key | len| ref |rows |Extra |
+--+------+---+----+-----------------+-------+----+--------+-------+-----------+
| 1|SIMPLE|v1 |ALL |rdate |NULL |NULL|NULL |1000000| |
| 1|SIMPLE|v2 |ref |date_year,date_dt|date_dt|9 |v1.rdate| 1|Using where|
+--+------+---+----+-----------------+-------+----+--------+-------+-----------+
2 rows in set (0.03 sec)
-- query B:
mysql> select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=date(v2.date_dt) and v2.date_year=2010 and v2.date_quarter=2;
+-----------------+
| sum(rsum) |
+-----------------+
| 1819518419.8031 |
+-----------------+
1 row in set (5.41 sec)
mysql> explain select sum(rsum) from r1tst v1,r_date_dim v2
-> where v1.rdate=date(v2.date_dt) and v2.date_year=2010 and v2.date_quarter=2;
+--+------+---+----+---------+---------+----+-----+----+-----------+
|id|select|tab|type|possible |key | len| ref |rows|Extra |
+--+------+---+----+---------+---------+----+-----+----+-----------+
| 1|SIMPLE| v2|ref |date_year|date_year| 5 |const| 361|Using where|
| 1|SIMPLE| v1|ref |rdate |rdate | 9 | func|3984|Using where|
+--+------+---+----+---------+---------+----+-----+----+-----------+
2 rows in set (0.00 sec)
In theory it should be a really bad idea having a function around the the join criteria, as it prevents index usage.
Why is query B faster than query A?
(Find all correct answers)
a) There is no good index on the date-dimension, so the optimizer thinks A is the best plan.
b) In B, the date() in the join forces a certain join order.
c) Restricting on the fact table directly is more efficient than going via date dimension.
[ Basic Optimizations (10%) - Overview of Optimization Principles ]
a b c
Sometimes it is necessary to force a certain join order, even tho mostly the optimizer figures it out just fine.
In general MySQL will determine the best join order, but there are the following mechanisms to control it: